Merge pull request #336 from hqhq/hq_parent_cgroup_systemd

systemd: support cgroup parent with specified slice
diff --git a/README.md b/README.md
index c15d97c..9996580 100644
--- a/README.md
+++ b/README.md
@@ -5,9 +5,8 @@
 ## State of the project
 
 Currently `runc` is an implementation of the OCI specification.  We are currently sprinting
-to have a v1 of the spec out within a quick timeframe of a few weeks, ~July 2015,
-so the `runc` config format will be constantly changing until
-the spec is finalized.  However, we encourage you to try out the tool and give feedback.
+to have a v1 of the spec out. So the `runc` config format will be constantly changing until
+the spec is finalized. However, we encourage you to try out the tool and give feedback.
 
 ### OCF
 
diff --git a/libcontainer/console_freebsd.go b/libcontainer/console_freebsd.go
index 4d20b8d..3c89eda 100644
--- a/libcontainer/console_freebsd.go
+++ b/libcontainer/console_freebsd.go
@@ -6,8 +6,8 @@
 	"errors"
 )
 
-// newConsole returns an initalized console that can be used within a container by copying bytes
+// NewConsole returns an initialized console that can be used within a container by copying bytes
 // from the master side to the slave that is attached as the tty for the container's init process.
-func newConsole(uid, gid int) (Console, error) {
+func NewConsole(uid, gid int) (Console, error) {
 	return nil, errors.New("libcontainer console is not supported on FreeBSD")
 }
diff --git a/libcontainer/console_linux.go b/libcontainer/console_linux.go
index f345f57..7af771b 100644
--- a/libcontainer/console_linux.go
+++ b/libcontainer/console_linux.go
@@ -10,9 +10,9 @@
 	"github.com/opencontainers/runc/libcontainer/label"
 )
 
-// newConsole returns an initalized console that can be used within a container by copying bytes
+// NewConsole returns an initialized console that can be used within a container by copying bytes
 // from the master side to the slave that is attached as the tty for the container's init process.
-func newConsole(uid, gid int) (Console, error) {
+func NewConsole(uid, gid int) (Console, error) {
 	master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0)
 	if err != nil {
 		return nil, err
diff --git a/libcontainer/console_windows.go b/libcontainer/console_windows.go
index 80c7463..a68c02f 100644
--- a/libcontainer/console_windows.go
+++ b/libcontainer/console_windows.go
@@ -1,7 +1,7 @@
 package libcontainer
 
-// newConsole returns an initalized console that can be used within a container
-func newConsole(uid, gid int) (Console, error) {
+// NewConsole returns an initialized console that can be used within a container
+func NewConsole(uid, gid int) (Console, error) {
 	return &windowsConsole{}, nil
 }
 
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index 912673a..82476ed 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -3,8 +3,10 @@
 package libcontainer
 
 import (
+	"bytes"
 	"encoding/json"
 	"fmt"
+	"io"
 	"io/ioutil"
 	"os"
 	"os/exec"
@@ -19,6 +21,7 @@
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/criurpc"
+	"github.com/vishvananda/netlink/nl"
 )
 
 const stdioFdCount = 3
@@ -218,7 +221,7 @@
 		return nil, newSystemError(err)
 	}
 	if !doInit {
-		return c.newSetnsProcess(p, cmd, parentPipe, childPipe), nil
+		return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
 	}
 	return c.newInitProcess(p, cmd, parentPipe, childPipe)
 }
@@ -273,23 +276,24 @@
 	}, nil
 }
 
-func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) *setnsProcess {
-	cmd.Env = append(cmd.Env,
-		fmt.Sprintf("_LIBCONTAINER_INITPID=%d", c.initProcess.pid()),
-		"_LIBCONTAINER_INITTYPE=setns",
-	)
-	if p.consolePath != "" {
-		cmd.Env = append(cmd.Env, "_LIBCONTAINER_CONSOLE_PATH="+p.consolePath)
+func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
+	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE=setns")
+	// For a setns process we don't have to set cloneflags, as the process namespaces
+	// will only be set via the setns syscall.
+	data, err := c.bootstrapData(0, c.initProcess.pid(), p.consolePath)
+	if err != nil {
+		return nil, err
 	}
 	// TODO: set on container for process management
 	return &setnsProcess{
-		cmd:         cmd,
-		cgroupPaths: c.cgroupManager.GetPaths(),
-		childPipe:   childPipe,
-		parentPipe:  parentPipe,
-		config:      c.newInitConfig(p),
-		process:     p,
-	}
+		cmd:           cmd,
+		cgroupPaths:   c.cgroupManager.GetPaths(),
+		childPipe:     childPipe,
+		parentPipe:    parentPipe,
+		config:        c.newInitConfig(p),
+		process:       p,
+		bootstrapData: data,
+	}, nil
 }
 
 func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
@@ -1021,3 +1025,25 @@
 	}
 	return state, nil
 }
+
+// bootstrapData encodes the given pid and, when non-empty, consolePath as attributes
+// of a netlink-formatted message and returns the serialized bytes as an io.Reader.
+// A consumer can write the data to a bootstrap program, such as one using the nsenter
+// package. cloneFlags is not used yet, and the returned error is currently always nil.
+func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath string) (io.Reader, error) {
+	// create the netlink message
+	r := nl.NewNetlinkRequest(int(InitMsg), 0)
+	// write pid
+	r.AddData(&Int32msg{
+		Type:  PidAttr,
+		Value: uint32(pid),
+	})
+	// write console path
+	if consolePath != "" {
+		r.AddData(&Bytemsg{
+			Type:  ConsolePathAttr,
+			Value: []byte(consolePath),
+		})
+	}
+	return bytes.NewReader(r.Serialize()), nil
+}
diff --git a/libcontainer/error.go b/libcontainer/error.go
index 6c26662..aa59d2a 100644
--- a/libcontainer/error.go
+++ b/libcontainer/error.go
@@ -22,6 +22,7 @@
 
 	// Common errors
 	ConfigInvalid
+	ConsoleExists
 	SystemError
 )
 
@@ -43,6 +44,8 @@
 		return "Container is not stopped"
 	case ContainerNotRunning:
 		return "Container is not running"
+	case ConsoleExists:
+		return "Console exists for process"
 	default:
 		return "Unknown error"
 	}
diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go
new file mode 100644
index 0000000..0e95e3b
--- /dev/null
+++ b/libcontainer/message_linux.go
@@ -0,0 +1,60 @@
+// +build linux
+
+package libcontainer
+
+import (
+	"syscall"
+
+	"github.com/vishvananda/netlink/nl"
+)
+
+// List of known message and attribute types we want to send to the bootstrap program.
+// The numbers are arbitrarily chosen so as not to conflict with known netlink types.
+const (
+	InitMsg         uint16 = 62000
+	PidAttr         uint16 = 27281
+	ConsolePathAttr uint16 = 27282
+)
+
+// Int32msg has the following representation
+// | nlattr len | nlattr type |
+// | uint32 value             |
+type Int32msg struct {
+	Type  uint16
+	Value uint32
+}
+
+func (msg *Int32msg) Serialize() []byte {
+	buf := make([]byte, msg.Len())
+	native := nl.NativeEndian()
+	native.PutUint16(buf[0:2], uint16(msg.Len()))
+	native.PutUint16(buf[2:4], msg.Type)
+	native.PutUint32(buf[4:8], msg.Value)
+	return buf
+}
+
+func (msg *Int32msg) Len() int {
+	return syscall.NLA_HDRLEN + 4
+}
+
+// Bytemsg has the following representation
+// | nlattr len | nlattr type |
+// | value              | pad |
+type Bytemsg struct {
+	Type  uint16
+	Value []byte
+}
+
+func (msg *Bytemsg) Serialize() []byte {
+	l := msg.Len()
+	buf := make([]byte, (l+syscall.NLA_ALIGNTO-1) & ^(syscall.NLA_ALIGNTO-1))
+	native := nl.NativeEndian()
+	native.PutUint16(buf[0:2], uint16(l))
+	native.PutUint16(buf[2:4], msg.Type)
+	copy(buf[4:], msg.Value)
+	return buf
+}
+
+func (msg *Bytemsg) Len() int {
+	return syscall.NLA_HDRLEN + len(msg.Value) + 1 // null-terminated
+}
diff --git a/libcontainer/nsenter/nsenter_test.go b/libcontainer/nsenter/nsenter_test.go
index db27b8a..976ae6b 100644
--- a/libcontainer/nsenter/nsenter_test.go
+++ b/libcontainer/nsenter/nsenter_test.go
@@ -1,12 +1,17 @@
 package nsenter
 
 import (
+	"bytes"
 	"encoding/json"
-	"fmt"
+	"io"
 	"os"
 	"os/exec"
 	"strings"
+	"syscall"
 	"testing"
+
+	"github.com/opencontainers/runc/libcontainer"
+	"github.com/vishvananda/netlink/nl"
 )
 
 type pid struct {
@@ -15,7 +20,7 @@
 
 func TestNsenterAlivePid(t *testing.T) {
 	args := []string{"nsenter-exec"}
-	r, w, err := os.Pipe()
+	parent, child, err := newPipe()
 	if err != nil {
 		t.Fatalf("failed to create pipe %v", err)
 	}
@@ -23,16 +28,22 @@
 	cmd := &exec.Cmd{
 		Path:       os.Args[0],
 		Args:       args,
-		ExtraFiles: []*os.File{w},
-		Env:        []string{fmt.Sprintf("_LIBCONTAINER_INITPID=%d", os.Getpid()), "_LIBCONTAINER_INITPIPE=3"},
+		ExtraFiles: []*os.File{child},
+		Env:        []string{"_LIBCONTAINER_INITTYPE=setns", "_LIBCONTAINER_INITPIPE=3"},
 	}
 
 	if err := cmd.Start(); err != nil {
 		t.Fatalf("nsenter failed to start %v", err)
 	}
-	w.Close()
-
-	decoder := json.NewDecoder(r)
+	r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0)
+	r.AddData(&libcontainer.Int32msg{
+		Type:  libcontainer.PidAttr,
+		Value: uint32(os.Getpid()),
+	})
+	if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil {
+		t.Fatal(err)
+	}
+	decoder := json.NewDecoder(parent)
 	var pid *pid
 
 	if err := decoder.Decode(&pid); err != nil {
@@ -51,34 +62,67 @@
 
 func TestNsenterInvalidPid(t *testing.T) {
 	args := []string{"nsenter-exec"}
-
-	cmd := &exec.Cmd{
-		Path: os.Args[0],
-		Args: args,
-		Env:  []string{"_LIBCONTAINER_INITPID=-1"},
+	parent, child, err := newPipe()
+	if err != nil {
+		t.Fatalf("failed to create pipe %v", err)
 	}
 
-	err := cmd.Run()
-	if err == nil {
+	cmd := &exec.Cmd{
+		Path:       os.Args[0],
+		Args:       args,
+		ExtraFiles: []*os.File{child},
+		Env:        []string{"_LIBCONTAINER_INITTYPE=setns", "_LIBCONTAINER_INITPIPE=3"},
+	}
+
+	if err := cmd.Start(); err != nil {
+		t.Fatalf("nsenter failed to start %v", err)
+	}
+	r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0)
+	r.AddData(&libcontainer.Int32msg{
+		Type:  libcontainer.PidAttr,
+		Value: 0,
+	})
+	if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := cmd.Wait(); err == nil {
 		t.Fatal("nsenter exits with a zero exit status")
 	}
 }
 
 func TestNsenterDeadPid(t *testing.T) {
-	dead_cmd := exec.Command("true")
-	if err := dead_cmd.Run(); err != nil {
+	deadCmd := exec.Command("true")
+	if err := deadCmd.Run(); err != nil {
 		t.Fatal(err)
 	}
 	args := []string{"nsenter-exec"}
-
-	cmd := &exec.Cmd{
-		Path: os.Args[0],
-		Args: args,
-		Env:  []string{fmt.Sprintf("_LIBCONTAINER_INITPID=%d", dead_cmd.Process.Pid)},
+	parent, child, err := newPipe()
+	if err != nil {
+		t.Fatalf("failed to create pipe %v", err)
 	}
 
-	err := cmd.Run()
-	if err == nil {
+	cmd := &exec.Cmd{
+		Path:       os.Args[0],
+		Args:       args,
+		ExtraFiles: []*os.File{child},
+		Env:        []string{"_LIBCONTAINER_INITTYPE=setns", "_LIBCONTAINER_INITPIPE=3"},
+	}
+
+	if err := cmd.Start(); err != nil {
+		t.Fatalf("nsenter failed to start %v", err)
+	}
+
+	r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0)
+	r.AddData(&libcontainer.Int32msg{
+		Type:  libcontainer.PidAttr,
+		Value: uint32(deadCmd.Process.Pid),
+	})
+	if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := cmd.Wait(); err == nil {
 		t.Fatal("nsenter exits with a zero exit status")
 	}
 }
@@ -89,3 +133,11 @@
 	}
 	return
 }
+
+func newPipe() (parent *os.File, child *os.File, err error) {
+	fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0)
+	if err != nil {
+		return nil, nil, err
+	}
+	return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil
+}
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index 01450a9..27e6e53 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -17,6 +17,11 @@
 #include <sched.h>
 #include <signal.h>
 
+#include <linux/netlink.h>
+#include <linux/types.h>
+#include <stdint.h>
+#include <sys/socket.h>
+
 /* All arguments should be above stack, because it grows down */
 struct clone_arg {
 	/*
@@ -63,24 +68,33 @@
 	return child;
 }
 
+static uint32_t readint32(char *buf)
+{
+	return *(uint32_t *) buf;
+}
+
+// list of known message types we want to send to bootstrap program
+// These are defined in libcontainer/message_linux.go
+#define INIT_MSG 62000
+#define PID_ATTR 27281
+#define CONSOLE_PATH_ATTR 27282
+
 void nsexec()
 {
 	char *namespaces[] = { "ipc", "uts", "net", "pid", "mnt", "user" };
 	const int num = sizeof(namespaces) / sizeof(char *);
 	jmp_buf env;
 	char buf[PATH_MAX], *val;
-	int i, tfd, self_tfd, child, len, pipenum, consolefd = -1;
-	pid_t pid;
-	char *console;
+	int i, tfd, self_tfd, child, n, len, pipenum, consolefd = -1;
+	pid_t pid = 0;
 
-	val = getenv("_LIBCONTAINER_INITPID");
-	if (val == NULL)
+	// if we don't have INITTYPE or this is the init process, skip the bootstrap process
+	val = getenv("_LIBCONTAINER_INITTYPE");
+	if (val == NULL || strcmp(val, "standard") == 0) {
 		return;
-
-	pid = atoi(val);
-	snprintf(buf, sizeof(buf), "%d", pid);
-	if (strcmp(val, buf)) {
-		pr_perror("Unable to parse _LIBCONTAINER_INITPID");
+	}
+	if (strcmp(val, "setns") != 0) {
+		pr_perror("Invalid inittype %s", val);
 		exit(1);
 	}
 
@@ -89,7 +103,6 @@
 		pr_perror("Child pipe not found");
 		exit(1);
 	}
-
 	pipenum = atoi(val);
 	snprintf(buf, sizeof(buf), "%d", pipenum);
 	if (strcmp(val, buf)) {
@@ -97,13 +110,56 @@
 		exit(1);
 	}
 
-	console = getenv("_LIBCONTAINER_CONSOLE_PATH");
-	if (console != NULL) {
-		consolefd = open(console, O_RDWR);
-		if (consolefd < 0) {
-			pr_perror("Failed to open console %s", console);
-			exit(1);
+	char nlbuf[NLMSG_HDRLEN];
+	struct nlmsghdr *nh;
+	if ((n = read(pipenum, nlbuf, NLMSG_HDRLEN)) != NLMSG_HDRLEN) {
+		pr_perror("Failed to read netlink header, got %d", n);
+		exit(1);
+	}
+
+	nh = (struct nlmsghdr *)nlbuf;
+	if (nh->nlmsg_type == NLMSG_ERROR) {
+		pr_perror("Invalid netlink header message");
+		exit(1);
+	}
+	if (nh->nlmsg_type != INIT_MSG) {
+		pr_perror("Unexpected netlink message type %d", nh->nlmsg_type);
+		exit(1);
+	}
+	// read the netlink payload
+	len = NLMSG_PAYLOAD(nh, 0);
+	char data[len];
+	if ((n = read(pipenum, data, len)) != len) {
+		pr_perror("Failed to read netlink payload, got %d", n);
+		exit(1);
+	}
+
+	int start = 0;
+	struct nlattr *attr;
+	while (start < len) {
+		int payload_len;
+		attr = (struct nlattr *)((void *)data + start);
+		start += NLA_HDRLEN;
+		payload_len = attr->nla_len - NLA_HDRLEN;
+		switch (attr->nla_type) {
+		case PID_ATTR:
+			pid = (pid_t) readint32(data + start);
+			break;
+		case CONSOLE_PATH_ATTR:
+			consolefd = open((char *)data + start, O_RDWR);
+			if (consolefd < 0) {
+				pr_perror("Failed to open console %s", (char *)data + start);
+				exit(1);
+			}
+			break;
 		}
+		start += NLA_ALIGN(payload_len);
+	}
+
+	// a pid attribute is required to be passed
+	if (pid == 0) {
+		pr_perror("missing pid");
+		exit(1);
 	}
 
 	/* Check that the specified process exists */
@@ -133,15 +189,13 @@
 		}
 
 		/* Skip namespaces we're already part of */
-		if (fstatat(self_tfd, namespaces[i], &self_st, 0) != -1 &&
-		    st.st_ino == self_st.st_ino) {
+		if (fstatat(self_tfd, namespaces[i], &self_st, 0) != -1 && st.st_ino == self_st.st_ino) {
 			continue;
 		}
 
 		fd = openat(tfd, namespaces[i], O_RDONLY);
 		if (fd == -1) {
-			pr_perror("Failed to open ns file %s for ns %s", buf,
-				  namespaces[i]);
+			pr_perror("Failed to open ns file %s for ns %s", buf, namespaces[i]);
 			exit(1);
 		}
 		// Set the namespace.
diff --git a/libcontainer/process.go b/libcontainer/process.go
index 7902d08..e96dc0d 100644
--- a/libcontainer/process.go
+++ b/libcontainer/process.go
@@ -80,10 +80,19 @@
 
 // NewConsole creates new console for process and returns it
 func (p *Process) NewConsole(rootuid int) (Console, error) {
-	console, err := newConsole(rootuid, rootuid)
+	console, err := NewConsole(rootuid, rootuid)
 	if err != nil {
 		return nil, err
 	}
 	p.consolePath = console.Path()
 	return console, nil
 }
+
+// ConsoleFromPath sets the process's console with the path provided
+func (p *Process) ConsoleFromPath(path string) error {
+	if p.consolePath != "" {
+		return newGenericError(fmt.Errorf("console path already exists for process"), ConsoleExists)
+	}
+	p.consolePath = path
+	return nil
+}
diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
index 4d17cbc..f27b6cf 100644
--- a/libcontainer/process_linux.go
+++ b/libcontainer/process_linux.go
@@ -41,13 +41,14 @@
 }
 
 type setnsProcess struct {
-	cmd         *exec.Cmd
-	parentPipe  *os.File
-	childPipe   *os.File
-	cgroupPaths map[string]string
-	config      *initConfig
-	fds         []string
-	process     *Process
+	cmd           *exec.Cmd
+	parentPipe    *os.File
+	childPipe     *os.File
+	cgroupPaths   map[string]string
+	config        *initConfig
+	fds           []string
+	process       *Process
+	bootstrapData io.Reader
 }
 
 func (p *setnsProcess) startTime() (string, error) {
@@ -64,6 +65,16 @@
 
 func (p *setnsProcess) start() (err error) {
 	defer p.parentPipe.Close()
+	err = p.cmd.Start()
+	p.childPipe.Close()
+	if err != nil {
+		return newSystemError(err)
+	}
+	if p.bootstrapData != nil {
+		if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
+			return newSystemError(err)
+		}
+	}
 	if err = p.execSetns(); err != nil {
 		return newSystemError(err)
 	}
@@ -96,11 +107,6 @@
 // before the go runtime boots, we wait on the process to die and receive the child's pid
 // over the provided pipe.
 func (p *setnsProcess) execSetns() error {
-	err := p.cmd.Start()
-	p.childPipe.Close()
-	if err != nil {
-		return newSystemError(err)
-	}
 	status, err := p.cmd.Process.Wait()
 	if err != nil {
 		p.cmd.Wait()