tremplin: Detect and recover from incomplete setup.

When we get interrupted during setup we can get containers broken in
various weird ways which we can't launch or recover from without
completely deleting the container. Detect when a container hasn't
completed setup and delete then recreate from scratch when that happens.

1. Create containers under a temporary name and rename once fully
downloaded.
2. Set metadata on the container once setup, if this metadata is missing
it's not fully set up so delete and recreate.
3. Only apply changes to containers created from when this commit lands
(tracked by another piece of metadata). All existing containers we assume
are set up correctly.

BUG=chromium:1000567
TEST=manual. Install Crostini from scratch, launch old, launch new,
interrupt setup.

Change-Id: I2b25e30b0747e3336fc9bf9d3a8453a5df9b4d17
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/tremplin/+/1986534
Reviewed-by: David Munro <davidmunro@google.com>
Reviewed-by: Stephen Barber <smbarber@chromium.org>
Commit-Queue: David Munro <davidmunro@google.com>
Tested-by: David Munro <davidmunro@google.com>
Auto-Submit: David Munro <davidmunro@google.com>
diff --git a/src/chromiumos/tremplin/tremplin.go b/src/chromiumos/tremplin/tremplin.go
index 5588643..1933dce 100644
--- a/src/chromiumos/tremplin/tremplin.go
+++ b/src/chromiumos/tremplin/tremplin.go
@@ -41,10 +41,14 @@
 const (
 	backupSnapshot      = "rootfs-backup"
 	importContainerName = "rootfs-import"
+	createContainerName = "crostini-inprogress-create"
 	suffixRecover       = "-crostini-recover"
 	suffixRemap         = "-crostini-remap"
 	shiftSnapshot       = "rootfs-shift"
 
+	tremplinCreatedKey = "createdByTremplin"
+	setupFinishedKey   = "tremplinSetupFinished"
+
 	lingerPath         = "/var/lib/systemd/linger"
 	primaryUserID      = 1000
 	chronosAccessID    = 1001
@@ -561,7 +565,12 @@
 
 	switch opAPI.StatusCode {
 	case api.Success:
-		req.Status = pb.ContainerStartProgress_STARTED
+		if err := s.setContainerMetadataKey(container.Name, setupFinishedKey); err != nil {
+			req.Status = pb.ContainerStartProgress_FAILED
+			req.FailureReason = fmt.Sprintf("Error marking setup finished on container: %v", err)
+		} else {
+			req.Status = pb.ContainerStartProgress_STARTED
+		}
 	case api.Cancelled:
 		req.Status = pb.ContainerStartProgress_CANCELLED
 	case api.Failure:
@@ -590,7 +599,7 @@
 		}
 
 		containersPost := api.ContainersPost{
-			Name: name,
+			Name: createContainerName,
 			Source: api.ContainerSource{
 				Type:        "image",
 				Fingerprint: fingerprint,
@@ -602,7 +611,7 @@
 			req.FailureReason = fmt.Sprintf("failed to create container from image: %v", err)
 			break
 		}
-		_, err = op.AddHandler(func(op api.Operation) { s.handleCreateOperation(op) })
+		_, err = op.AddHandler(func(op api.Operation) { s.handleCreateOperation(op, name) })
 		if err != nil {
 			log.Fatal("Failed to add create operation handler: ", err)
 		}
@@ -624,7 +633,28 @@
 	}
 }
 
-func (s *tremplinServer) handleCreateOperation(op api.Operation) {
+// setContainerMetadataKey sets the given key on the requested container
+// (i.e. isContainerMetadataKeySet will return true for it)
+func (s *tremplinServer) setContainerMetadataKey(containerName string, key string) error {
+	metadata, etag, err := s.lxd.GetContainerMetadata(containerName)
+	if err != nil {
+		return err
+	}
+	metadata.Properties[key] = "true"
+	return s.lxd.SetContainerMetadata(containerName, *metadata, etag)
+}
+
+// isContainerMetadataKeySet returns true/false if a given key is set/not set
+// on the specified container.
+func (s *tremplinServer) isContainerMetadataKeySet(containerName string, key string) (bool, error) {
+	metadata, _, err := s.lxd.GetContainerMetadata(containerName)
+	if err != nil {
+		return false, err
+	}
+	return metadata.Properties[key] == "true", nil
+}
+
+func (s *tremplinServer) handleCreateOperation(op api.Operation, finalName string) {
 	containers := op.Resources["containers"]
 
 	if len(containers) != 1 {
@@ -632,14 +662,8 @@
 		return
 	}
 
-	name, err := getContainerName(containers[0])
-	if err != nil {
-		log.Printf("Failed to get container name for operation: %v", err)
-		return
-	}
-
 	req := &pb.ContainerCreationProgress{
-		ContainerName: name,
+		ContainerName: finalName,
 	}
 
 	switch op.StatusCode {
@@ -649,7 +673,15 @@
 		// it can do yet.
 		return
 	case api.Success:
-		req.Status = pb.ContainerCreationProgress_CREATED
+		if err := s.setContainerMetadataKey(createContainerName, tremplinCreatedKey); err != nil {
+			req.Status = pb.ContainerCreationProgress_FAILED
+			req.FailureReason = fmt.Sprintf("Error marking container as being a Tremplin container: %v", err)
+		} else if err = s.renameContainer(createContainerName, finalName); err != nil {
+			req.Status = pb.ContainerCreationProgress_FAILED
+			req.FailureReason = fmt.Sprintf("Error renaming container to final name: %v", err)
+		} else {
+			req.Status = pb.ContainerCreationProgress_CREATED
+		}
 	case api.Running:
 		req.Status = pb.ContainerCreationProgress_DOWNLOADING
 		progress, ok := op.Metadata["download_progress"].(string)
@@ -671,13 +703,30 @@
 		req.FailureReason = fmt.Sprintf("unhandled create status: %s", op.Status)
 	}
 
-	_, err = s.listenerClient.UpdateCreateStatus(context.Background(), req)
+	_, err := s.listenerClient.UpdateCreateStatus(context.Background(), req)
 	if err != nil {
 		log.Printf("Could not update create status on host: %v", err)
 		return
 	}
 }
 
+func (s *tremplinServer) isContainerSetUp(containerName string) (bool, error) {
+	createdByTremplin, err := s.isContainerMetadataKeySet(containerName, tremplinCreatedKey)
+	if err != nil {
+		return false, err
+	}
+	if !createdByTremplin {
+		// If we weren't created by tremplin (or were created before we started
+		// recording this) assume it's set up correctly.
+		return true, nil
+	}
+	setupComplete, err := s.isContainerMetadataKeySet(containerName, setupFinishedKey)
+	if err != nil {
+		return false, err
+	}
+	return setupComplete, nil
+}
+
 // CreateContainer implements tremplin.CreateContainer.
 func (s *tremplinServer) CreateContainer(ctx context.Context, in *pb.CreateContainerRequest) (*pb.CreateContainerResponse, error) {
 	log.Printf("Received CreateContainer RPC: %s", in.ContainerName)
@@ -686,10 +735,22 @@
 
 	container, _, _ := s.getContainerWithRecovery(in.ContainerName)
 	if container != nil {
-		response.Status = pb.CreateContainerResponse_EXISTS
-		return response, nil
+		ok, err := s.isContainerSetUp(in.ContainerName)
+		if err == nil && !ok {
+			log.Printf("Found incomplete tremplin-managed container. Deleting and creating a new container from scratch.")
+			s.deleteContainer(in.ContainerName)
+		} else {
+			if err != nil {
+				log.Printf("Error: unable to check metadata for an existing container: %v. Skipped setup complete check.", err)
+			}
+			response.Status = pb.CreateContainerResponse_EXISTS
+			return response, nil
+		}
 	}
 
+	// If a previous CreateContainer was interrupted delete the incomplete container and start fresh.
+	s.deleteContainer(createContainerName)
+
 	// Import the image from tarballs.
 	if len(in.RootfsPath) > 0 && len(in.MetadataPath) > 0 {
 		rootfsReader, err := os.Open(in.RootfsPath)
@@ -766,7 +827,7 @@
 	}
 
 	containersPost := api.ContainersPost{
-		Name: in.ContainerName,
+		Name: createContainerName,
 		Source: api.ContainerSource{
 			Type:  "image",
 			Alias: alias.Name,
@@ -779,7 +840,7 @@
 		return response, nil
 	}
 
-	_, err = op.AddHandler(func(op api.Operation) { s.handleCreateOperation(op) })
+	_, err = op.AddHandler(func(op api.Operation) { s.handleCreateOperation(op, in.ContainerName) })
 	if err != nil {
 		log.Fatal("Failed to add create operation handler: ", err)
 	}