tremplin: Add a fallback way to start LXD

https://github.com/lxc/lxd/issues/8227 is now the second issue where the
backup.yaml ends up out-of-sync with the filesystem state, resulting in
an unbootable container. It has been fixed upstream, but we still need
to handle this case until we're back in sync with upstream.
Previously we manually updated the backup.yaml file; now that we've
seen two cases, let's use a more general approach.
We first try the lxd import way of loading container state, since that
has been the faster and more reliable approach; if that fails, we fall
back to loading from the dqlite database.
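
In other words, when the DB reset path is taken, the new flow in
start_lxd.go is roughly the following (a simplified sketch using the
names from the diff below, with logging and error handling trimmed):

  // Move the dqlite database aside rather than deleting it.
  os.RemoveAll(lxdDatabaseBackupPath)
  os.Rename(lxdDatabasePath, lxdDatabaseBackupPath)
  c, _ := s.lxdHelper.LaunchLxd()
  // Preferred path: rebuild container state from backup.yaml via lxd import.
  if err := recoverContainer(defaultContainerName); err != nil {
    // Fallback: stop LXD, restore the old database and relaunch from it.
    s.StopLxdIfRunning()
    os.RemoveAll(lxdDatabasePath)
    os.Rename(lxdDatabaseBackupPath, lxdDatabasePath)
    c, _ = s.lxdHelper.LaunchLxd()
  }
  // c is the lxd.ContainerServer connection used for the rest of setup.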

BUG=chromium:1155404
TEST=Deploy on a broken image, verify that it starts the container

Change-Id: I0a0cd438dccf5eedab6ef8c75933068f1fe0b2d7
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/tremplin/+/2586416
Reviewed-by: Nicholas Verne <nverne@chromium.org>
Reviewed-by: David Munro <davidmunro@google.com>
Commit-Queue: David Munro <davidmunro@google.com>
Tested-by: David Munro <davidmunro@google.com>
diff --git a/src/chromiumos/tremplin/lxd_helper.go b/src/chromiumos/tremplin/lxd_helper.go
index 5333dcb..2477da1 100644
--- a/src/chromiumos/tremplin/lxd_helper.go
+++ b/src/chromiumos/tremplin/lxd_helper.go
@@ -28,7 +28,6 @@
 var lxdCmd = []string{"/usr/sbin/lxd", "--group", "lxd", "--syslog"}
 
 type lxdHelper struct {
-	lxd            lxd.ContainerServer
 	process        *os.Process
 	mutex          sync.Mutex
 	shutdownSignal chan os.Signal
@@ -163,44 +162,48 @@
 	return nil, lastErr
 }
 
-// LaunchLxd with launch LXD and return a connection to the server, or an error
-// if launching failed. Will respawn LXD if it ever exits.
+// LaunchLxd will return a connection to the LXD server, launching it first if
+// required.
 func (l *lxdHelper) LaunchLxd() (lxd.ContainerServer, error) {
 	l.mutex.Lock()
 	defer l.mutex.Unlock()
 	if l.shutdownSignal == nil {
 		l.shutdownSignal = make(chan os.Signal, 1)
-
-		// Launch LXD, we allow 10 failures per 30 seconds (copied from maitre'd)
+		// Launch LXD; we allow 10 failures per 30 seconds (copied from maitred)
 		// before giving up.
 		go runWithRetry(l.runLxd, l.shutdownSignal, 30*time.Second, 10)
 	}
-	if l.lxd == nil {
-		// Wait for LXD to be ready.
-		c, err := lxdWaitready()
-		if err != nil {
-			// Timed out
-			return nil, fmt.Errorf("Timed out waiting for LXD to start. Last error was: %w", err)
-		}
-		log.Print("LXD running and ready")
-		l.lxd = c
+	// Wait for LXD to be ready.
+	c, err := lxdWaitready()
+	if err != nil {
+		// Timed out
+		return nil, fmt.Errorf("Timed out waiting for LXD to start. Last error was: %w", err)
 	}
-	return l.lxd, nil
+	log.Print("LXD running and ready")
+	return c, nil
 }
 
-// StopLxd signals LXD to shut down with the provided signal and stops
-// respawning it automatically. Returns as soon as the signal is sent instead of
-// blocking on LXD actually shutting down.
-func (l *lxdHelper) StopLxd() error {
-	signal := syscall.SIGPWR
+// StopLxd signals LXD to shut down and stops this helper from restarting it.
+// If keepContainersAlive is false, it gracefully shuts down LXD and all of its
+// containers. If true, it terminates LXD less gracefully but leaves any active
+// containers running. Returns as soon as the signal is sent instead of
+// blocking on LXD actually shutting down.
+func (l *lxdHelper) StopLxd(keepContainersAlive bool) error {
+	var signal syscall.Signal
+	if keepContainersAlive {
+		signal = syscall.SIGTERM
+	} else {
+		signal = syscall.SIGPWR
+	}
 	l.mutex.Lock()
 	defer l.mutex.Unlock()
 	log.Print("Telling LXD to shut down")
-	if l.shutdownSignal != nil {
+	if l.shutdownSignal == nil {
 		// This can happen if we shut down before we started LXD.
 		log.Print("No channel to signal LXD on (is it running?). Nothing to do.")
 		return nil
 	}
 	l.shutdownSignal <- signal
+	l.shutdownSignal = nil
 	return l.process.Signal(signal)
 }
diff --git a/src/chromiumos/tremplin/main.go b/src/chromiumos/tremplin/main.go
index 7f32355..d832de1 100644
--- a/src/chromiumos/tremplin/main.go
+++ b/src/chromiumos/tremplin/main.go
@@ -141,5 +141,5 @@
 	}
 
 	// Now signal LXD to shut down.
-	server.lxdHelper.StopLxd()
+	server.lxdHelper.StopLxd(false)
 }
diff --git a/src/chromiumos/tremplin/start_lxd.go b/src/chromiumos/tremplin/start_lxd.go
index 599b136..fd86846 100644
--- a/src/chromiumos/tremplin/start_lxd.go
+++ b/src/chromiumos/tremplin/start_lxd.go
@@ -36,6 +36,8 @@
 	lxdConfPath            = "/mnt/stateful/lxd_conf" // path for holding LXD client configuration
 	milestonePath          = "/run/cros_milestone"    // path to the file containing the Chrome OS milestone
 	ueventBufferSize       = 4096                     // largest allowed uevent message size
+	lxdDatabasePath        = "/mnt/stateful/lxd/database"
+	lxdDatabaseBackupPath  = "/mnt/stateful/lxd/database.old"
 )
 
 // Patterns of char devices in /dev that should be mapped into the container via the LXD device list.
@@ -529,7 +531,7 @@
 	// Stop LXD if it's already running (e.g. may be left over from a previous
 	// failed launch).
 	if err := s.StopLxdIfRunning(); err != nil {
-		log.Fatal("LXD is already running, but failed to stop: ", err)
+		return fmt.Errorf("LXD is already running, but failed to stop: %w", err)
 	}
 
 	if s.ueventSocket == -1 {
@@ -547,14 +549,36 @@
 	shouldReset := resetDB || s.shouldResetLxdDbBeforeLaunch()
 	if shouldReset {
 		log.Print("Resetting LXD DB prior to launch")
-		err := os.RemoveAll("/mnt/stateful/lxd/database")
-		if err != nil {
-			return fmt.Errorf("Unable to clear the LXD DB: %w", err)
+		// Move to a backup location.
+		_, err := os.Stat(lxdDatabasePath)
+		if os.IsNotExist(err) {
+			// The database folder doesn't exist, so there's nothing to move away.
+		} else if err != nil {
+			// Some other error happened, so we don't know whether the database
+			// folder exists or not; fail.
+			return fmt.Errorf("Unable to check if LXD DB exists: %w", err)
+		} else {
+			// Move the database aside. Delete any old backup copy first; we
+			// ignore errors since either it's fine (e.g. the backup doesn't
+			// exist) or the rename below will fail anyway.
+			os.RemoveAll(lxdDatabaseBackupPath)
+			err = os.Rename(lxdDatabasePath, lxdDatabaseBackupPath)
+			if err != nil {
+				return fmt.Errorf("Unable to clear the LXD DB: %w", err)
+			}
 		}
 	}
 
 	c, err := s.lxdHelper.LaunchLxd()
 	if err != nil {
+		// Unable to launch LXD.
+		if shouldReset {
+			// Restore the old database.
+			renameErr := os.Rename(lxdDatabaseBackupPath, lxdDatabasePath)
+			if renameErr != nil {
+				log.Print("Unable to restore the old LXD DB: ", renameErr)
+			}
+		}
 		return fmt.Errorf("Failed to connect to LXD daemon: %w", err)
 	}
 	if shouldReset {
@@ -564,12 +588,32 @@
 		// penguin (default name).
 		err := recoverContainer(defaultContainerName)
 		if err != nil {
-			// Uh oh. We deleted their existing state (e.g. list of containers)
-			// but weren't able to recover it. The good news is we won't
-			// accidentally overwrite it, initStoragePool, etc will fail since
-			// the storage pool, etc, already exists. The bad news is we can't
-			// start their container. Fail, and we'll try again next time.
-			return fmt.Errorf("Failed to lxd import container: %w", err)
+			// Our preferred approach failed, so try loading from the old
+			// database as a fallback: stop LXD, move the database back, and
+			// restart it.
+			log.Print("Unable to import container: ", err)
+			log.Print("Attempting fallback method of starting LXD")
+			importErr := fmt.Errorf("Failed to lxd import container: %w", err)
+			if err := s.StopLxdIfRunning(); err != nil {
+				log.Print("Failed to stop LXD: ", err)
+				return importErr
+			}
+			err = os.RemoveAll(lxdDatabasePath)
+			if err != nil {
+				log.Print("Unable to delete the empty LXD DB: ", err)
+				return importErr
+			}
+			err = os.Rename(lxdDatabaseBackupPath, lxdDatabasePath)
+			if err != nil {
+				log.Print("Unable to restore the old LXD DB: ", err)
+				return importErr
+			}
+			c, err = s.lxdHelper.LaunchLxd()
+			if err != nil {
+				log.Print("Failed to connect to LXD daemon: ", err)
+				return importErr
+			}
+			log.Print("LXD started via fallback method")
 		}
 	}
 
diff --git a/src/chromiumos/tremplin/stop_lxd.go b/src/chromiumos/tremplin/stop_lxd.go
index 9aaf2dd..6fac629 100644
--- a/src/chromiumos/tremplin/stop_lxd.go
+++ b/src/chromiumos/tremplin/stop_lxd.go
@@ -61,34 +61,66 @@
 	return pids
 }
 
+// waitForLxdToExit blocks until there are no running LXD processes, waiting
+// up to the specified timeout. Returns nil if LXD was observed to exit, or a
+// non-nil error if LXD is still running after the timeout or another error
+// occurred.
+func waitForLxdToExit(timeout time.Duration) error {
+	const interval = 500 * time.Millisecond
+	end := time.Now().Add(timeout)
+
+	for time.Now().Before(end) {
+		if len(findLxdProcesses()) == 0 {
+			return nil
+		}
+		time.Sleep(interval)
+	}
+	return fmt.Errorf("Timed out waiting for LXD to exit")
+}
+
 // StopLxdIfRunning searches for any running LXD processes and
 // terminates them. This is intended to clean up any LXD processes
 // that might hang around if tremplin crashes and restarts. This is
 // best-effort, since it's difficult to reliably find and stop
 // processes with the interface we have.
 func (s *tremplinServer) StopLxdIfRunning() error {
-	// Bail out early in the usual case that we don't need to do anything.
-	pids := findLxdProcesses()
-	if len(pids) == 0 {
+	s.lxdHelper.StopLxd(true)
+	s.lxd = nil
+
+	if err := waitForLxdToExit(2 * time.Second); err == nil {
+		// All the LXD instances have stopped.
 		return nil
 	}
 
 	// Give LXD a chance to exit cleanly, and then SIGKILL it.
 	// This will leave the containers running and LXD will
 	// reconnect to them when it restarts.
+	pids := findLxdProcesses()
 	for _, pid := range pids {
 		proc, _ := os.FindProcess(pid)
 		if err := proc.Signal(syscall.SIGTERM); err != nil {
-			return fmt.Errorf("Failed to request LXD shutdown: %w", err)
+			// If the process already exited at any point between listing
+			// processes and here, the signal will fail with a private error
+			// (stopping us from using errors.Is to check). But we try again
+			// harder later, so just ignore errors here.
+			fmt.Print("Failed to request LXD shutdown: ", err)
 		}
 	}
-	time.Sleep(5 * time.Second)
-	for _, pid := range findLxdProcesses() {
+	if err := waitForLxdToExit(5 * time.Second); err == nil {
+		return nil
+	}
+	for _, pid := range pids {
 		proc, _ := os.FindProcess(pid)
 		if err := proc.Kill(); err != nil {
-			return fmt.Errorf("Failed to kill LXD: %w", err)
+			// If the process already exited at any point between listing
+			// processes and here (including in response to the previous
+			// SIGTERM), the signal will fail with a private error (stopping us
+			// from using errors.Is to test for it). But in practice the only
+			// ways kill fails (since we're running as root) involve the
+			// process not existing, so either way we're happy.
+			fmt.Print("Failed to force LXD shutdown: ", err)
 		}
 	}
-
-	return nil
+	s.lxd = nil
+	return waitForLxdToExit(5 * time.Second)
 }