tremplin: Better handling of failure modes

We assume that LXD, the uevent socket and the audit rule either all
exist or all don't. This is left over from when failing to create any
of them killed Tremplin. Now that this is no longer the case, a failure
can leave e.g. the uevent socket created but LXD not started, which
causes failures when retrying to launch LXD.

BUG=chromium:1155437
TEST=start tremplin and fail to start LXD, then try again successfully

Change-Id: I1967e1dc7bd336815e742bd35a22c95650d02c75
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/tremplin/+/2592287
Reviewed-by: Fergus Dall <sidereal@google.com>
Reviewed-by: David Munro <davidmunro@google.com>
Commit-Queue: David Munro <davidmunro@google.com>
Tested-by: David Munro <davidmunro@google.com>
diff --git a/src/chromiumos/tremplin/audit.go b/src/chromiumos/tremplin/audit.go
index 7d61c07..91b144a 100644
--- a/src/chromiumos/tremplin/audit.go
+++ b/src/chromiumos/tremplin/audit.go
@@ -26,6 +26,9 @@
 // UpdatePorts queries for listening ports in all running containers and sends
 // those ports to the host.
 func (s *tremplinServer) UpdatePorts() error {
+	if s.lxd == nil {
+		return nil
+	}
 	containers, err := s.lxd.GetContainers()
 	if err != nil {
 		return fmt.Errorf("failed to get container list: %v", err)
@@ -79,29 +82,34 @@
 	return nil
 }
 
-func startAuditListener(s *tremplinServer) error {
-	ac, err := libaudit.NewAuditClient(nil)
-	if err != nil {
-		return fmt.Errorf("failed to create audit client: %v", err)
+func (s *tremplinServer) startAuditListener() error {
+	if s.auditClient == nil {
+		var err error
+		s.auditClient, err = libaudit.NewAuditClient(nil)
+		if err != nil {
+			return fmt.Errorf("failed to create audit client: %v", err)
+		}
+	} else {
+		log.Print("Found an existing audit client so reusing. Did a previous launch fail?")
 	}
 
-	status, err := ac.GetStatus()
+	status, err := s.auditClient.GetStatus()
 	if err != nil {
 		return fmt.Errorf("failed to get audit status: %v", err)
 	}
 
 	if status.Enabled == 0 {
 		log.Print("Enabling kernel audit subsystem")
-		if err = ac.SetEnabled(true, libaudit.WaitForReply); err != nil {
+		if err = s.auditClient.SetEnabled(true, libaudit.WaitForReply); err != nil {
 			return fmt.Errorf("failed to enable auditing: %v", err)
 		}
 	}
 
-	if err := ac.SetPID(libaudit.WaitForReply); err != nil {
+	if err := s.auditClient.SetPID(libaudit.WaitForReply); err != nil {
 		return fmt.Errorf("failed to set tremplin as audit daemon: %v", err)
 	}
 
-	if err := ac.SetFailure(libaudit.LogOnFailure, libaudit.WaitForReply); err != nil {
+	if err := s.auditClient.SetFailure(libaudit.LogOnFailure, libaudit.WaitForReply); err != nil {
 		return fmt.Errorf("failed to set audit to log on failure: %v", err)
 	}
 
@@ -128,7 +136,7 @@
 			return fmt.Errorf("failed to compile listen rule: %v", err)
 		}
 
-		if err := ac.AddRule(compiledRule); err != nil && err.Error() != errAuditRuleExists {
+		if err := s.auditClient.AddRule(compiledRule); err != nil && err.Error() != errAuditRuleExists {
 			return fmt.Errorf("failed to add listen rule: %v", err)
 		}
 
@@ -137,7 +145,7 @@
 	go func() {
 		// Listen for audit events forever.
 		for {
-			rawMsg, err := ac.Receive(false)
+			rawMsg, err := s.auditClient.Receive(false)
 			if err != nil {
 				log.Printf("Failed to receive audit message: %v", err)
 				break
diff --git a/src/chromiumos/tremplin/main.go b/src/chromiumos/tremplin/main.go
index 4938138..7f32355 100644
--- a/src/chromiumos/tremplin/main.go
+++ b/src/chromiumos/tremplin/main.go
@@ -72,6 +72,8 @@
 		upgradeStatus:               *NewTransactionMap(),
 		upgradeClientUpdateInterval: 5 * time.Second,
 		features:                    features,
+		ueventSocket:                -1,
+		auditClient:                 nil,
 	}
 
 	if !features.IsStartLxdEnabled() {
diff --git a/src/chromiumos/tremplin/start_lxd.go b/src/chromiumos/tremplin/start_lxd.go
index b682bcc..599b136 100644
--- a/src/chromiumos/tremplin/start_lxd.go
+++ b/src/chromiumos/tremplin/start_lxd.go
@@ -226,12 +226,12 @@
 	}
 }
 
-func ueventListen(c lxd.ContainerServer, ueventSocket int) error {
+func (s *tremplinServer) ueventListen() error {
 	log.Print("Listening for device updates via uevent")
 
 	for {
 		ueventBytes := make([]byte, ueventBufferSize)
-		recvLen, _, err := syscall.Recvfrom(ueventSocket, ueventBytes, 0)
+		recvLen, _, err := syscall.Recvfrom(s.ueventSocket, ueventBytes, 0)
 		if err != nil {
 			log.Fatal("Failed to read uevent: ", err)
 		}
@@ -261,7 +261,7 @@
 			continue
 		}
 
-		profile, etag, err := c.GetProfile(defaultProfileName)
+		profile, etag, err := s.lxd.GetProfile(defaultProfileName)
 		if err != nil {
 			log.Print("GetProfile failed: ", err)
 			continue
@@ -275,7 +275,7 @@
 			removeDevice(devName, profilePut.Devices)
 		}
 
-		err = c.UpdateProfile(defaultProfileName, profilePut, etag)
+		err = s.lxd.UpdateProfile(defaultProfileName, profilePut, etag)
 		if err != nil {
 			log.Print("UpdateProfile failed: ", err)
 		}
@@ -412,13 +412,12 @@
 	return nil
 }
 
-func (s *tremplinServer) initialSetup() error {
+func (s *tremplinServer) initialSetup(c lxd.ContainerServer) error {
 	// Create the milestone file to bind-mount into containers.
 	// This must be done before initializing the profile as LXD now checks
 	// for the existence of storage volumes when the profile is set rather
 	// then when the container is started.
 	milestone := s.milestone
-	c := s.lxd
 
 	if err := ioutil.WriteFile(milestonePath, []byte(strconv.Itoa(milestone)), 0644); err != nil {
 		return fmt.Errorf("could not write milestone file: %v", err)
@@ -533,9 +532,14 @@
 		log.Fatal("LXD is already running, but failed to stop: ", err)
 	}
 
-	ueventSocket, err := createUeventSocket()
-	if err != nil {
-		return fmt.Errorf("Failed to open uevent netlink connection: %w", err)
+	if s.ueventSocket == -1 {
+		var err error
+		s.ueventSocket, err = createUeventSocket()
+		if err != nil {
+			return fmt.Errorf("Failed to open uevent netlink connection: %w", err)
+		}
+	} else {
+		log.Print("Found an existing uevent socket so reusing. Did a previous launch fail?")
 	}
 	// Let the OS close the uevent socket, we keep it around for the entirety of
 	// tremplin's lifetime.
@@ -568,18 +572,18 @@
 			return fmt.Errorf("Failed to lxd import container: %w", err)
 		}
 	}
-	s.lxd = c
 
-	if err := s.initialSetup(); err != nil {
+	if err := s.initialSetup(c); err != nil {
 		return fmt.Errorf("Failed initialSetup: %w", err)
 	}
 
-	if err := startAuditListener(s); err != nil {
+	if err := s.startAuditListener(); err != nil {
 		return fmt.Errorf("Failed to start audit listener: %w", err)
 	}
 
 	// Listen for device updates.
-	go ueventListen(s.lxd, ueventSocket)
+	s.lxd = c
+	go s.ueventListen()
 	return nil
 }
 
diff --git a/src/chromiumos/tremplin/tremplin.go b/src/chromiumos/tremplin/tremplin.go
index a926196..d6a8647 100644
--- a/src/chromiumos/tremplin/tremplin.go
+++ b/src/chromiumos/tremplin/tremplin.go
@@ -27,6 +27,7 @@
 
 	pb "chromiumos/vm_tools/tremplin_proto"
 
+	"github.com/elastic/go-libaudit"
 	lxd "github.com/lxc/lxd/client"
 	"github.com/lxc/lxd/shared"
 	"github.com/lxc/lxd/shared/api"
@@ -173,6 +174,8 @@
 	upgradeClientUpdateInterval time.Duration
 	lxdHelper                   lxdHelper
 	features                    Features
+	ueventSocket                int
+	auditClient                 *libaudit.AuditClient
 }
 
 // execProgramAsync starts running a program in a container.