| // +build linux |
| |
| package libcontainer |
| |
| import ( |
| "encoding/json" |
| "errors" |
| "fmt" |
| "io" |
| "os" |
| "os/exec" |
| "path/filepath" |
| "strconv" |
| "syscall" // only for Signal |
| |
| "github.com/opencontainers/runc/libcontainer/cgroups" |
| "github.com/opencontainers/runc/libcontainer/configs" |
| "github.com/opencontainers/runc/libcontainer/system" |
| "github.com/opencontainers/runc/libcontainer/utils" |
| |
| "golang.org/x/sys/unix" |
| ) |
| |
// parentProcess is the runtime-side handle on a process started for a
// container. In this file it is implemented by setnsProcess and initProcess.
type parentProcess interface {
	// start begins execution of the process.
	start() error

	// wait blocks until the process has exited and returns its final state.
	wait() (*os.ProcessState, error)

	// terminate sends SIGKILL to the process and waits for it to exit.
	terminate() error

	// signal delivers the given signal to the process.
	signal(os.Signal) error

	// pid returns the pid of the running process.
	pid() int

	// startTime returns the start time of the process.
	startTime() (uint64, error)

	// externalDescriptors returns the descriptor names last stored with
	// setExternalDescriptors.
	externalDescriptors() []string

	// setExternalDescriptors stores the descriptor names reported by
	// externalDescriptors.
	setExternalDescriptors(fds []string)
}
| |
| type setnsProcess struct { |
| cmd *exec.Cmd |
| parentPipe *os.File |
| childPipe *os.File |
| cgroupPaths map[string]string |
| config *initConfig |
| fds []string |
| process *Process |
| bootstrapData io.Reader |
| } |
| |
| func (p *setnsProcess) startTime() (uint64, error) { |
| stat, err := system.Stat(p.pid()) |
| return stat.StartTime, err |
| } |
| |
| func (p *setnsProcess) signal(sig os.Signal) error { |
| s, ok := sig.(syscall.Signal) |
| if !ok { |
| return errors.New("os: unsupported signal type") |
| } |
| return unix.Kill(p.pid(), s) |
| } |
| |
| func (p *setnsProcess) start() (err error) { |
| defer p.parentPipe.Close() |
| err = p.cmd.Start() |
| p.childPipe.Close() |
| if err != nil { |
| return newSystemErrorWithCause(err, "starting setns process") |
| } |
| if p.bootstrapData != nil { |
| if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { |
| return newSystemErrorWithCause(err, "copying bootstrap data to pipe") |
| } |
| } |
| if err = p.execSetns(); err != nil { |
| return newSystemErrorWithCause(err, "executing setns process") |
| } |
| // We can't join cgroups if we're in a rootless container. |
| if !p.config.Rootless && len(p.cgroupPaths) > 0 { |
| if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil { |
| return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid()) |
| } |
| } |
| // set rlimits, this has to be done here because we lose permissions |
| // to raise the limits once we enter a user-namespace |
| if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { |
| return newSystemErrorWithCause(err, "setting rlimits for process") |
| } |
| if err := utils.WriteJSON(p.parentPipe, p.config); err != nil { |
| return newSystemErrorWithCause(err, "writing config to pipe") |
| } |
| |
| ierr := parseSync(p.parentPipe, func(sync *syncT) error { |
| switch sync.Type { |
| case procReady: |
| // This shouldn't happen. |
| panic("unexpected procReady in setns") |
| case procHooks: |
| // This shouldn't happen. |
| panic("unexpected procHooks in setns") |
| default: |
| return newSystemError(fmt.Errorf("invalid JSON payload from child")) |
| } |
| }) |
| |
| if err := unix.Shutdown(int(p.parentPipe.Fd()), unix.SHUT_WR); err != nil { |
| return newSystemErrorWithCause(err, "calling shutdown on init pipe") |
| } |
| // Must be done after Shutdown so the child will exit and we can wait for it. |
| if ierr != nil { |
| p.wait() |
| return ierr |
| } |
| return nil |
| } |
| |
| // execSetns runs the process that executes C code to perform the setns calls |
| // because setns support requires the C process to fork off a child and perform the setns |
| // before the go runtime boots, we wait on the process to die and receive the child's pid |
| // over the provided pipe. |
| func (p *setnsProcess) execSetns() error { |
| status, err := p.cmd.Process.Wait() |
| if err != nil { |
| p.cmd.Wait() |
| return newSystemErrorWithCause(err, "waiting on setns process to finish") |
| } |
| if !status.Success() { |
| p.cmd.Wait() |
| return newSystemError(&exec.ExitError{ProcessState: status}) |
| } |
| var pid *pid |
| if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil { |
| p.cmd.Wait() |
| return newSystemErrorWithCause(err, "reading pid from init pipe") |
| } |
| |
| // Clean up the zombie parent process |
| firstChildProcess, err := os.FindProcess(pid.PidFirstChild) |
| if err != nil { |
| return err |
| } |
| |
| // Ignore the error in case the child has already been reaped for any reason |
| _, _ = firstChildProcess.Wait() |
| |
| process, err := os.FindProcess(pid.Pid) |
| if err != nil { |
| return err |
| } |
| p.cmd.Process = process |
| p.process.ops = p |
| return nil |
| } |
| |
| // terminate sends a SIGKILL to the forked process for the setns routine then waits to |
| // avoid the process becoming a zombie. |
| func (p *setnsProcess) terminate() error { |
| if p.cmd.Process == nil { |
| return nil |
| } |
| err := p.cmd.Process.Kill() |
| if _, werr := p.wait(); err == nil { |
| err = werr |
| } |
| return err |
| } |
| |
| func (p *setnsProcess) wait() (*os.ProcessState, error) { |
| err := p.cmd.Wait() |
| |
| // Return actual ProcessState even on Wait error |
| return p.cmd.ProcessState, err |
| } |
| |
| func (p *setnsProcess) pid() int { |
| return p.cmd.Process.Pid |
| } |
| |
| func (p *setnsProcess) externalDescriptors() []string { |
| return p.fds |
| } |
| |
| func (p *setnsProcess) setExternalDescriptors(newFds []string) { |
| p.fds = newFds |
| } |
| |
| type initProcess struct { |
| cmd *exec.Cmd |
| parentPipe *os.File |
| childPipe *os.File |
| config *initConfig |
| manager cgroups.Manager |
| container *linuxContainer |
| fds []string |
| process *Process |
| bootstrapData io.Reader |
| sharePidns bool |
| rootDir *os.File |
| } |
| |
| func (p *initProcess) pid() int { |
| return p.cmd.Process.Pid |
| } |
| |
| func (p *initProcess) externalDescriptors() []string { |
| return p.fds |
| } |
| |
| // execSetns runs the process that executes C code to perform the setns calls |
| // because setns support requires the C process to fork off a child and perform the setns |
| // before the go runtime boots, we wait on the process to die and receive the child's pid |
| // over the provided pipe. |
| // This is called by initProcess.start function |
| func (p *initProcess) execSetns() error { |
| status, err := p.cmd.Process.Wait() |
| if err != nil { |
| p.cmd.Wait() |
| return err |
| } |
| if !status.Success() { |
| p.cmd.Wait() |
| return &exec.ExitError{ProcessState: status} |
| } |
| var pid *pid |
| if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil { |
| p.cmd.Wait() |
| return err |
| } |
| |
| // Clean up the zombie parent process |
| firstChildProcess, err := os.FindProcess(pid.PidFirstChild) |
| if err != nil { |
| return err |
| } |
| |
| // Ignore the error in case the child has already been reaped for any reason |
| _, _ = firstChildProcess.Wait() |
| |
| process, err := os.FindProcess(pid.Pid) |
| if err != nil { |
| return err |
| } |
| p.cmd.Process = process |
| p.process.ops = p |
| return nil |
| } |
| |
| func (p *initProcess) start() error { |
| defer p.parentPipe.Close() |
| err := p.cmd.Start() |
| p.process.ops = p |
| p.childPipe.Close() |
| p.rootDir.Close() |
| if err != nil { |
| p.process.ops = nil |
| return newSystemErrorWithCause(err, "starting init process command") |
| } |
| if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { |
| return newSystemErrorWithCause(err, "copying bootstrap data to pipe") |
| } |
| if err := p.execSetns(); err != nil { |
| return newSystemErrorWithCause(err, "running exec setns process for init") |
| } |
| // Save the standard descriptor names before the container process |
| // can potentially move them (e.g., via dup2()). If we don't do this now, |
| // we won't know at checkpoint time which file descriptor to look up. |
| fds, err := getPipeFds(p.pid()) |
| if err != nil { |
| return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid()) |
| } |
| p.setExternalDescriptors(fds) |
| // Do this before syncing with child so that no children can escape the |
| // cgroup. We don't need to worry about not doing this and not being root |
| // because we'd be using the rootless cgroup manager in that case. |
| if err := p.manager.Apply(p.pid()); err != nil { |
| return newSystemErrorWithCause(err, "applying cgroup configuration for process") |
| } |
| defer func() { |
| if err != nil { |
| // TODO: should not be the responsibility to call here |
| p.manager.Destroy() |
| } |
| }() |
| if err := p.createNetworkInterfaces(); err != nil { |
| return newSystemErrorWithCause(err, "creating network interfaces") |
| } |
| if err := p.sendConfig(); err != nil { |
| return newSystemErrorWithCause(err, "sending config to init process") |
| } |
| var ( |
| sentRun bool |
| sentResume bool |
| ) |
| |
| ierr := parseSync(p.parentPipe, func(sync *syncT) error { |
| switch sync.Type { |
| case procReady: |
| // set rlimits, this has to be done here because we lose permissions |
| // to raise the limits once we enter a user-namespace |
| if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { |
| return newSystemErrorWithCause(err, "setting rlimits for ready process") |
| } |
| // call prestart hooks |
| if !p.config.Config.Namespaces.Contains(configs.NEWNS) { |
| // Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions. |
| if err := p.manager.Set(p.config.Config); err != nil { |
| return newSystemErrorWithCause(err, "setting cgroup config for ready process") |
| } |
| |
| if p.config.Config.Hooks != nil { |
| s := configs.HookState{ |
| Version: p.container.config.Version, |
| ID: p.container.id, |
| Pid: p.pid(), |
| Bundle: utils.SearchLabels(p.config.Config.Labels, "bundle"), |
| } |
| for i, hook := range p.config.Config.Hooks.Prestart { |
| if err := hook.Run(s); err != nil { |
| return newSystemErrorWithCausef(err, "running prestart hook %d", i) |
| } |
| } |
| } |
| } |
| // Sync with child. |
| if err := writeSync(p.parentPipe, procRun); err != nil { |
| return newSystemErrorWithCause(err, "writing syncT 'run'") |
| } |
| sentRun = true |
| case procHooks: |
| // Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions. |
| if err := p.manager.Set(p.config.Config); err != nil { |
| return newSystemErrorWithCause(err, "setting cgroup config for procHooks process") |
| } |
| if p.config.Config.Hooks != nil { |
| s := configs.HookState{ |
| Version: p.container.config.Version, |
| ID: p.container.id, |
| Pid: p.pid(), |
| Bundle: utils.SearchLabels(p.config.Config.Labels, "bundle"), |
| } |
| for i, hook := range p.config.Config.Hooks.Prestart { |
| if err := hook.Run(s); err != nil { |
| return newSystemErrorWithCausef(err, "running prestart hook %d", i) |
| } |
| } |
| } |
| // Sync with child. |
| if err := writeSync(p.parentPipe, procResume); err != nil { |
| return newSystemErrorWithCause(err, "writing syncT 'resume'") |
| } |
| sentResume = true |
| default: |
| return newSystemError(fmt.Errorf("invalid JSON payload from child")) |
| } |
| |
| return nil |
| }) |
| |
| if !sentRun { |
| return newSystemErrorWithCause(ierr, "container init") |
| } |
| if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume { |
| return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process")) |
| } |
| if err := unix.Shutdown(int(p.parentPipe.Fd()), unix.SHUT_WR); err != nil { |
| return newSystemErrorWithCause(err, "shutting down init pipe") |
| } |
| |
| // Must be done after Shutdown so the child will exit and we can wait for it. |
| if ierr != nil { |
| p.wait() |
| return ierr |
| } |
| return nil |
| } |
| |
| func (p *initProcess) wait() (*os.ProcessState, error) { |
| err := p.cmd.Wait() |
| if err != nil { |
| return p.cmd.ProcessState, err |
| } |
| // we should kill all processes in cgroup when init is died if we use host PID namespace |
| if p.sharePidns { |
| signalAllProcesses(p.manager, unix.SIGKILL) |
| } |
| return p.cmd.ProcessState, nil |
| } |
| |
| func (p *initProcess) terminate() error { |
| if p.cmd.Process == nil { |
| return nil |
| } |
| err := p.cmd.Process.Kill() |
| if _, werr := p.wait(); err == nil { |
| err = werr |
| } |
| return err |
| } |
| |
| func (p *initProcess) startTime() (uint64, error) { |
| stat, err := system.Stat(p.pid()) |
| return stat.StartTime, err |
| } |
| |
| func (p *initProcess) sendConfig() error { |
| // send the config to the container's init process, we don't use JSON Encode |
| // here because there might be a problem in JSON decoder in some cases, see: |
| // https://github.com/docker/docker/issues/14203#issuecomment-174177790 |
| return utils.WriteJSON(p.parentPipe, p.config) |
| } |
| |
| func (p *initProcess) createNetworkInterfaces() error { |
| for _, config := range p.config.Config.Networks { |
| strategy, err := getStrategy(config.Type) |
| if err != nil { |
| return err |
| } |
| n := &network{ |
| Network: *config, |
| } |
| if err := strategy.create(n, p.pid()); err != nil { |
| return err |
| } |
| p.config.Networks = append(p.config.Networks, n) |
| } |
| return nil |
| } |
| |
| func (p *initProcess) signal(sig os.Signal) error { |
| s, ok := sig.(syscall.Signal) |
| if !ok { |
| return errors.New("os: unsupported signal type") |
| } |
| return unix.Kill(p.pid(), s) |
| } |
| |
| func (p *initProcess) setExternalDescriptors(newFds []string) { |
| p.fds = newFds |
| } |
| |
// getPipeFds resolves the symlink targets of descriptors 0, 1 and 2 under
// /proc/<pid>/fd. The result always has three entries, indexed by descriptor
// number. An entry is left empty when its link cannot be read because of a
// permission error; any other read error aborts the lookup and is returned
// together with the entries collected so far.
func getPipeFds(pid int) ([]string, error) {
	const stdioCount = 3

	var (
		fdDir   = filepath.Join("/proc", strconv.Itoa(pid), "/fd")
		targets = make([]string, stdioCount)
	)
	for fd := range targets {
		// XXX: This breaks if the path is not a valid symlink (which can
		// happen in certain particularly unlucky mount namespace setups).
		link, err := os.Readlink(filepath.Join(fdDir, strconv.Itoa(fd)))
		switch {
		case err == nil:
			targets[fd] = link
		case os.IsPermission(err):
			// Permission errors are ignored, for rootless containers and
			// other non-dumpable processes: if the link for a particular
			// descriptor cannot be read, there is not much we can do.
		default:
			return targets, err
		}
	}
	return targets, nil
}
| |
| // InitializeIO creates pipes for use with the process's stdio and returns the |
| // opposite side for each. Do not use this if you want to have a pseudoterminal |
| // set up for you by libcontainer (TODO: fix that too). |
| // TODO: This is mostly unnecessary, and should be handled by clients. |
| func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) { |
| var fds []uintptr |
| i = &IO{} |
| // cleanup in case of an error |
| defer func() { |
| if err != nil { |
| for _, fd := range fds { |
| unix.Close(int(fd)) |
| } |
| } |
| }() |
| // STDIN |
| r, w, err := os.Pipe() |
| if err != nil { |
| return nil, err |
| } |
| fds = append(fds, r.Fd(), w.Fd()) |
| p.Stdin, i.Stdin = r, w |
| // STDOUT |
| if r, w, err = os.Pipe(); err != nil { |
| return nil, err |
| } |
| fds = append(fds, r.Fd(), w.Fd()) |
| p.Stdout, i.Stdout = w, r |
| // STDERR |
| if r, w, err = os.Pipe(); err != nil { |
| return nil, err |
| } |
| fds = append(fds, r.Fd(), w.Fd()) |
| p.Stderr, i.Stderr = w, r |
| // change ownership of the pipes incase we are in a user namespace |
| for _, fd := range fds { |
| if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil { |
| return nil, err |
| } |
| } |
| return i, nil |
| } |