blob: e0c731c093b7d5ef2214073bb765ac25e3c9bf31 [file] [log] [blame]
// Copyright 2020 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
package main
import (
"fmt"
"log"
"os"
"os/exec"
"strings"
"sync"
"syscall"
"time"
lxd "github.com/lxc/lxd/client"
)
// lxdHelper owns the LXD daemon process started by this program together with
// the client connection used to talk to it.
type lxdHelper struct {
// lxd is the connection to the LXD server; LaunchLxd stores it once LXD
// reports ready and returns the stored value on later calls.
lxd lxd.ContainerServer
// process is the most recently started LXD process. It is written by runLxd
// and signalled by StopLxd.
process *os.Process
// mutex is held for the duration of LaunchLxd and StopLxd.
mutex sync.Mutex
// shutdownSignal is created by LaunchLxd and handed to runWithRetry, which
// stops respawning LXD once a value can be received from it. A nil channel
// means LaunchLxd has not been called yet.
shutdownSignal chan os.Signal
}
// runWithRetry calls cmd in a loop, restarting it every time it returns.
//
// The loop ends in one of two ways:
//   - a value can be received from shutdownSignal after cmd returns (a signal
//     was sent on it, or it was closed), meaning the exit was requested; or
//   - cmd has already failed numFailures times within the last failureWindow
//     when it fails again, in which case the function gives up.
//
// Failures older than failureWindow are forgotten before the count is checked.
func runWithRetry(cmd func() error, shutdownSignal chan os.Signal, failureWindow time.Duration, numFailures int) {
	var recent []time.Time
	for {
		exitErr := cmd()

		// Getting here means LXD has stopped running or never started.
		// Check, without blocking, whether that was requested.
		select {
		case <-shutdownSignal:
			log.Print("Shutting down LXD")
			return
		default:
		}

		// LXD should only terminate when we're shutting down, so this exit
		// counts as a failure.
		log.Print("LXD died unexpectedly with error: ", exitErr)

		// Drop the leading failures that have aged out of the window.
		stale := 0
		for stale < len(recent) && time.Since(recent[stale]) > failureWindow {
			stale++
		}
		recent = recent[stale:]

		if len(recent) >= numFailures {
			// We can't do anything with no LXD, so give up. Tremplin keeps
			// running so it can report failures when the user tries to do
			// anything, which is the best we can offer at the moment.
			log.Print("LXD keeps dying, giving up. All future calls to Tremplin will fail until VM restart.")
			return
		}
		recent = append(recent, time.Now())
	}
}
// runLxd starts the LXD daemon and blocks until it exits, returning the
// daemon's exit error. If the daemon cannot be started the start error is
// returned immediately. On a successful start l.process is set to the new
// process before waiting.
func (l *lxdHelper) runLxd() error {
	argv := []string{"/usr/sbin/lxd", "--group", "lxd", "--syslog"}
	log.Printf("Running %q", strings.Join(argv, " "))

	daemon := exec.Command(argv[0], argv[1:]...)
	if startErr := daemon.Start(); startErr != nil {
		return startErr
	}

	log.Print("LXD started")
	l.process = daemon.Process
	return daemon.Wait()
}
// lxdWaitready polls LXD until it is ready to handle requests and returns a
// connection to it.
//
// Each attempt first connects to the LXD unix socket and then queries
// /internal/ready. A failed attempt is followed by a 500ms pause; every tenth
// attempt that fails is logged. If LXD is still not ready after two minutes
// the error from the last attempt is returned.
func (l *lxdHelper) lxdWaitready() (lxd.ContainerServer, error) {
	const (
		lxdReadyTimeout = 2 * time.Minute
		lxdInterval     = 500 * time.Millisecond
	)

	var lastErr error
	deadline := time.Now().Add(lxdReadyTimeout)
	for attempt := 1; time.Now().Before(deadline); attempt++ {
		// Step 1: there must be a socket with something listening on it.
		notReadyMsg := "ConnectLXDUnix not ready yet, retrying. Error was: "
		server, err := lxd.ConnectLXDUnix("", nil)
		if err == nil {
			// Step 2: the server itself must say it's ready.
			notReadyMsg = "internal/ready not ready yet, retrying. Error was: "
			_, _, err = server.RawQuery("GET", "/internal/ready", nil, "")
		}
		if err == nil {
			log.Print("LXD launched")
			return server, nil
		}

		// Only log every tenth failure to keep the log readable.
		if attempt%10 == 0 {
			log.Print(notReadyMsg, err)
		}
		lastErr = err
		time.Sleep(lxdInterval)
	}
	return nil, lastErr
}
// LaunchLxd will launch LXD and return a connection to the server, or an
// error if LXD did not become ready in time.
//
// The first call starts a background goroutine that runs LXD and respawns it
// whenever it exits. Once a connection has been established it is cached and
// returned by every later call. The helper's mutex is held for the whole
// call, including the wait for LXD to become ready.
func (l *lxdHelper) LaunchLxd() (lxd.ContainerServer, error) {
	l.mutex.Lock()
	defer l.mutex.Unlock()

	// A nil channel means the supervisor goroutine has not been started yet.
	if l.shutdownSignal == nil {
		l.shutdownSignal = make(chan os.Signal, 1)
		// We allow 10 failures per 30 seconds (copied from maitre'd) before
		// giving up on respawning LXD.
		go runWithRetry(l.runLxd, l.shutdownSignal, 30*time.Second, 10)
	}

	// Reuse the connection from an earlier successful launch.
	if l.lxd != nil {
		return l.lxd, nil
	}

	server, err := l.lxdWaitready()
	if err != nil {
		// Timed out
		return nil, fmt.Errorf("Timed out waiting for LXD to start. Last error was: %w", err)
	}
	log.Print("LXD running and ready")
	l.lxd = server
	return server, nil
}
// StopLxd asks LXD to shut down by sending it SIGPWR and stops it from being
// respawned automatically. It returns as soon as the signal has been sent
// instead of blocking on LXD actually shutting down.
//
// It returns nil without doing anything if LaunchLxd was never called. If the
// respawn loop exists but no LXD process has been recorded yet, the shutdown
// request is still queued and nil is returned. Otherwise the result of
// signalling the process is returned.
func (l *lxdHelper) StopLxd() error {
	signal := syscall.SIGPWR
	l.mutex.Lock()
	defer l.mutex.Unlock()
	log.Print("Telling LXD to shut down")
	if l.shutdownSignal == nil {
		// This can happen if we shut down before we started LXD. Sending on
		// the nil channel would block forever while holding the mutex.
		log.Print("No channel to signal LXD on (is it running?). Nothing to do.")
		return nil
	}
	// Non-blocking send: the channel has a single slot, so if an earlier
	// StopLxd call already queued a shutdown request there is nothing to add
	// and we must not block with the mutex held.
	select {
	case l.shutdownSignal <- signal:
	default:
	}
	if l.process == nil {
		// No LXD process has been started successfully, so there is nothing
		// to signal; the queued request still stops the respawn loop.
		log.Print("No LXD process to signal. Nothing more to do.")
		return nil
	}
	return l.process.Signal(signal)
}