blob: 52f171bc8fa9a209133370eb6814e6801826c0eb [file] [log] [blame]
// +build linux
package main
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"runtime"
"strings"
"syscall"
"github.com/Sirupsen/logrus"
"github.com/codegangsta/cli"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/specs"
)
var specCommand = cli.Command{
Name: "spec",
Usage: "create a new specification file",
Action: func(context *cli.Context) {
spec := specs.LinuxSpec{
Spec: specs.Spec{
Version: specs.Version,
Platform: specs.Platform{
OS: runtime.GOOS,
Arch: runtime.GOARCH,
},
Root: specs.Root{
Path: "rootfs",
Readonly: true,
},
Process: specs.Process{
Terminal: true,
User: specs.User{},
Args: []string{
"sh",
},
Env: []string{
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"TERM=xterm",
},
},
Hostname: "shell",
Mounts: []specs.Mount{
{
Type: "proc",
Source: "proc",
Destination: "/proc",
Options: "",
},
{
Type: "tmpfs",
Source: "tmpfs",
Destination: "/dev",
Options: "nosuid,strictatime,mode=755,size=65536k",
},
{
Type: "devpts",
Source: "devpts",
Destination: "/dev/pts",
Options: "nosuid,noexec,newinstance,ptmxmode=0666,mode=0620,gid=5",
},
{
Type: "tmpfs",
Source: "shm",
Destination: "/dev/shm",
Options: "nosuid,noexec,nodev,mode=1777,size=65536k",
},
{
Type: "mqueue",
Source: "mqueue",
Destination: "/dev/mqueue",
Options: "nosuid,noexec,nodev",
},
{
Type: "sysfs",
Source: "sysfs",
Destination: "/sys",
Options: "nosuid,noexec,nodev",
},
{
Type: "cgroup",
Source: "cgroup",
Destination: "/sys/fs/cgroup",
Options: "nosuid,noexec,nodev,relatime,ro",
},
},
},
Linux: specs.Linux{
Namespaces: []specs.Namespace{
{
Type: "pid",
},
{
Type: "network",
},
{
Type: "ipc",
},
{
Type: "uts",
},
{
Type: "mount",
},
},
Capabilities: []string{
"AUDIT_WRITE",
"KILL",
"NET_BIND_SERVICE",
},
Rlimits: []specs.Rlimit{
{
Type: syscall.RLIMIT_NOFILE,
Hard: uint64(1024),
Soft: uint64(1024),
},
},
Devices: []specs.Device{
{
Type: 'c',
Path: "/dev/null",
Major: 1,
Minor: 3,
Permissions: "rwm",
FileMode: 0666,
UID: 0,
GID: 0,
},
{
Type: 'c',
Path: "/dev/random",
Major: 1,
Minor: 8,
Permissions: "rwm",
FileMode: 0666,
UID: 0,
GID: 0,
},
{
Type: 'c',
Path: "/dev/full",
Major: 1,
Minor: 7,
Permissions: "rwm",
FileMode: 0666,
UID: 0,
GID: 0,
},
{
Type: 'c',
Path: "/dev/tty",
Major: 5,
Minor: 0,
Permissions: "rwm",
FileMode: 0666,
UID: 0,
GID: 0,
},
{
Type: 'c',
Path: "/dev/zero",
Major: 1,
Minor: 5,
Permissions: "rwm",
FileMode: 0666,
UID: 0,
GID: 0,
},
{
Type: 'c',
Path: "/dev/urandom",
Major: 1,
Minor: 9,
Permissions: "rwm",
FileMode: 0666,
UID: 0,
GID: 0,
},
},
Resources: specs.Resources{
Memory: specs.Memory{
Swappiness: -1,
},
},
Seccomp: specs.Seccomp{
DefaultAction: "SCMP_ACT_ALLOW",
Syscalls: []*specs.Syscall{},
},
},
}
data, err := json.MarshalIndent(&spec, "", "\t")
if err != nil {
logrus.Fatal(err)
}
fmt.Printf("%s", data)
},
}
var namespaceMapping = map[string]configs.NamespaceType{
"pid": configs.NEWPID,
"network": configs.NEWNET,
"mount": configs.NEWNS,
"user": configs.NEWUSER,
"ipc": configs.NEWIPC,
"uts": configs.NEWUTS,
}
// loadSpec loads the specification from the provided path.
// If the path is empty then the default path will be "config.json"
func loadSpec(path string) (*specs.LinuxSpec, error) {
if path == "" {
path = "config.json"
}
f, err := os.Open(path)
if err != nil {
if os.IsNotExist(err) {
return nil, fmt.Errorf("JSON specification file for %s not found", path)
}
return nil, err
}
defer f.Close()
var s *specs.LinuxSpec
if err := json.NewDecoder(f).Decode(&s); err != nil {
return nil, err
}
return s, checkSpecVersion(s)
}
// checkSpecVersion makes sure that the spec version matches runc's while we are in the initial
// development period. It is better to hard fail than have missing fields or options in the spec.
func checkSpecVersion(s *specs.LinuxSpec) error {
if s.Version != specs.Version {
return fmt.Errorf("spec version is not compatible with implemented version %q: spec %q", specs.Version, s.Version)
}
return nil
}
func createLibcontainerConfig(cgroupName string, spec *specs.LinuxSpec) (*configs.Config, error) {
cwd, err := os.Getwd()
if err != nil {
return nil, err
}
rootfsPath := spec.Root.Path
if !filepath.IsAbs(rootfsPath) {
rootfsPath = filepath.Join(cwd, rootfsPath)
}
config := &configs.Config{
Rootfs: rootfsPath,
Capabilities: spec.Linux.Capabilities,
Readonlyfs: spec.Root.Readonly,
Hostname: spec.Hostname,
Privatefs: true,
}
for _, ns := range spec.Linux.Namespaces {
t, exists := namespaceMapping[ns.Type]
if !exists {
return nil, fmt.Errorf("namespace %q does not exist", ns)
}
config.Namespaces.Add(t, ns.Path)
}
if config.Namespaces.Contains(configs.NEWNET) {
config.Networks = []*configs.Network{
{
Type: "loopback",
},
}
}
for _, m := range spec.Mounts {
config.Mounts = append(config.Mounts, createLibcontainerMount(cwd, m))
}
if err := createDevices(spec, config); err != nil {
return nil, err
}
if err := setupUserNamespace(spec, config); err != nil {
return nil, err
}
for _, rlimit := range spec.Linux.Rlimits {
config.Rlimits = append(config.Rlimits, createLibContainerRlimit(rlimit))
}
c, err := createCgroupConfig(cgroupName, spec, config.Devices)
if err != nil {
return nil, err
}
config.Cgroups = c
if config.Readonlyfs {
setReadonly(config)
config.MaskPaths = []string{
"/proc/kcore",
}
config.ReadonlyPaths = []string{
"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
}
}
seccomp, err := setupSeccomp(&spec.Linux.Seccomp)
if err != nil {
return nil, err
}
config.Seccomp = seccomp
config.Sysctl = spec.Linux.Sysctl
config.ProcessLabel = spec.Linux.SelinuxProcessLabel
config.AppArmorProfile = spec.Linux.ApparmorProfile
return config, nil
}
func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount {
flags, data := parseMountOptions(m.Options)
source := m.Source
if m.Type == "bind" {
if !filepath.IsAbs(source) {
source = filepath.Join(cwd, m.Source)
}
}
return &configs.Mount{
Device: m.Type,
Source: source,
Destination: m.Destination,
Data: data,
Flags: flags,
}
}
func createCgroupConfig(name string, spec *specs.LinuxSpec, devices []*configs.Device) (*configs.Cgroup, error) {
myCgroupPath, err := cgroups.GetThisCgroupDir("devices")
if err != nil {
return nil, err
}
c := &configs.Cgroup{
Name: name,
Parent: myCgroupPath,
AllowedDevices: append(devices, allowedDevices...),
}
r := spec.Linux.Resources
c.Memory = r.Memory.Limit
c.MemoryReservation = r.Memory.Reservation
c.MemorySwap = r.Memory.Swap
c.KernelMemory = r.Memory.Kernel
c.MemorySwappiness = r.Memory.Swappiness
c.CpuShares = r.CPU.Shares
c.CpuQuota = r.CPU.Quota
c.CpuPeriod = r.CPU.Period
c.CpuRtRuntime = r.CPU.RealtimeRuntime
c.CpuRtPeriod = r.CPU.RealtimePeriod
c.CpusetCpus = r.CPU.Cpus
c.CpusetMems = r.CPU.Mems
c.BlkioThrottleReadBpsDevice = r.BlockIO.ThrottleReadBpsDevice
c.BlkioThrottleWriteBpsDevice = r.BlockIO.ThrottleWriteBpsDevice
c.BlkioThrottleReadIOpsDevice = r.BlockIO.ThrottleReadIOpsDevice
c.BlkioThrottleWriteIOpsDevice = r.BlockIO.ThrottleWriteIOpsDevice
c.BlkioWeight = r.BlockIO.Weight
c.BlkioWeightDevice = r.BlockIO.WeightDevice
for _, l := range r.HugepageLimits {
c.HugetlbLimit = append(c.HugetlbLimit, &configs.HugepageLimit{
Pagesize: l.Pagesize,
Limit: l.Limit,
})
}
c.OomKillDisable = r.DisableOOMKiller
c.NetClsClassid = r.Network.ClassID
for _, m := range r.Network.Priorities {
c.NetPrioIfpriomap = append(c.NetPrioIfpriomap, &configs.IfPrioMap{
Interface: m.Name,
Priority: m.Priority,
})
}
return c, nil
}
func createDevices(spec *specs.LinuxSpec, config *configs.Config) error {
for _, d := range spec.Linux.Devices {
device := &configs.Device{
Type: d.Type,
Path: d.Path,
Major: d.Major,
Minor: d.Minor,
Permissions: d.Permissions,
FileMode: d.FileMode,
Uid: d.UID,
Gid: d.GID,
}
config.Devices = append(config.Devices, device)
}
return nil
}
func setReadonly(config *configs.Config) {
for _, m := range config.Mounts {
if m.Device == "sysfs" {
m.Flags |= syscall.MS_RDONLY
}
}
}
func setupUserNamespace(spec *specs.LinuxSpec, config *configs.Config) error {
if len(spec.Linux.UIDMappings) == 0 {
return nil
}
config.Namespaces.Add(configs.NEWUSER, "")
create := func(m specs.IDMapping) configs.IDMap {
return configs.IDMap{
HostID: int(m.HostID),
ContainerID: int(m.ContainerID),
Size: int(m.Size),
}
}
for _, m := range spec.Linux.UIDMappings {
config.UidMappings = append(config.UidMappings, create(m))
}
for _, m := range spec.Linux.GIDMappings {
config.GidMappings = append(config.GidMappings, create(m))
}
rootUID, err := config.HostUID()
if err != nil {
return err
}
rootGID, err := config.HostGID()
if err != nil {
return err
}
for _, node := range config.Devices {
node.Uid = uint32(rootUID)
node.Gid = uint32(rootGID)
}
return nil
}
func createLibContainerRlimit(rlimit specs.Rlimit) configs.Rlimit {
return configs.Rlimit{
Type: int(rlimit.Type),
Hard: uint64(rlimit.Hard),
Soft: uint64(rlimit.Soft),
}
}
// parseMountOptions parses the string and returns the flags and any mount data that
// it contains.
func parseMountOptions(options string) (int, string) {
var (
flag int
data []string
)
flags := map[string]struct {
clear bool
flag int
}{
"async": {true, syscall.MS_SYNCHRONOUS},
"atime": {true, syscall.MS_NOATIME},
"bind": {false, syscall.MS_BIND},
"defaults": {false, 0},
"dev": {true, syscall.MS_NODEV},
"diratime": {true, syscall.MS_NODIRATIME},
"dirsync": {false, syscall.MS_DIRSYNC},
"exec": {true, syscall.MS_NOEXEC},
"mand": {false, syscall.MS_MANDLOCK},
"noatime": {false, syscall.MS_NOATIME},
"nodev": {false, syscall.MS_NODEV},
"nodiratime": {false, syscall.MS_NODIRATIME},
"noexec": {false, syscall.MS_NOEXEC},
"nomand": {true, syscall.MS_MANDLOCK},
"norelatime": {true, syscall.MS_RELATIME},
"nostrictatime": {true, syscall.MS_STRICTATIME},
"nosuid": {false, syscall.MS_NOSUID},
"private": {false, syscall.MS_PRIVATE},
"rbind": {false, syscall.MS_BIND | syscall.MS_REC},
"relatime": {false, syscall.MS_RELATIME},
"remount": {false, syscall.MS_REMOUNT},
"ro": {false, syscall.MS_RDONLY},
"rprivate": {false, syscall.MS_PRIVATE | syscall.MS_REC},
"rshared": {false, syscall.MS_SHARED | syscall.MS_REC},
"rslave": {false, syscall.MS_SLAVE | syscall.MS_REC},
"runbindable": {false, syscall.MS_UNBINDABLE | syscall.MS_REC},
"rw": {true, syscall.MS_RDONLY},
"shared": {false, syscall.MS_SHARED},
"slave": {false, syscall.MS_SLAVE},
"strictatime": {false, syscall.MS_STRICTATIME},
"suid": {true, syscall.MS_NOSUID},
"sync": {false, syscall.MS_SYNCHRONOUS},
"unbindable": {false, syscall.MS_UNBINDABLE},
}
for _, o := range strings.Split(options, ",") {
// If the option does not exist in the flags table or the flag
// is not supported on the platform,
// then it is a data value for a specific fs type
if f, exists := flags[o]; exists && f.flag != 0 {
if f.clear {
flag &= ^f.flag
} else {
flag |= f.flag
}
} else {
data = append(data, o)
}
}
return flag, strings.Join(data, ",")
}
func setupSeccomp(config *specs.Seccomp) (*configs.Seccomp, error) {
if config == nil {
return nil, nil
}
// No default action specified, no syscalls listed, assume seccomp disabled
if config.DefaultAction == "" && len(config.Syscalls) == 0 {
return nil, nil
}
newConfig := new(configs.Seccomp)
newConfig.Syscalls = []*configs.Syscall{}
// Convert default action from string representation
newDefaultAction, err := seccomp.ConvertStringToAction(string(config.DefaultAction))
if err != nil {
return nil, err
}
newConfig.DefaultAction = newDefaultAction
// Loop through all syscall blocks and convert them to libcontainer format
for _, call := range config.Syscalls {
newAction, err := seccomp.ConvertStringToAction(string(call.Action))
if err != nil {
return nil, err
}
newCall := configs.Syscall{
Name: call.Name,
Action: newAction,
Args: []*configs.Arg{},
}
// Loop through all the arguments of the syscall and convert them
for _, arg := range call.Args {
newOp, err := seccomp.ConvertStringToOperator(string(arg.Op))
if err != nil {
return nil, err
}
newArg := configs.Arg{
Index: arg.Index,
Value: arg.Value,
ValueTwo: arg.ValueTwo,
Op: newOp,
}
newCall.Args = append(newCall.Args, &newArg)
}
newConfig.Syscalls = append(newConfig.Syscalls, &newCall)
}
return newConfig, nil
}