diff --git a/Godeps/Godeps.json b/Godeps/Godeps.json index 64384cff0ac..6e79a6eb051 100644 --- a/Godeps/Godeps.json +++ b/Godeps/Godeps.json @@ -1643,83 +1643,83 @@ }, { "ImportPath": "github.com/opencontainers/runc/libcontainer", - "Comment": "v0.1.1", - "Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" + "Comment": "v1.0.0-rc1-100-g142df38", + "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/apparmor", - "Comment": "v0.1.1", - "Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" + "Comment": "v1.0.0-rc1-100-g142df38", + "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups", - "Comment": "v0.1.1", - "Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" + "Comment": "v1.0.0-rc1-100-g142df38", + "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/fs", - "Comment": "v0.1.1", - "Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" + "Comment": "v1.0.0-rc1-100-g142df38", + "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/systemd", - "Comment": "v0.1.1", - "Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" + "Comment": "v1.0.0-rc1-100-g142df38", + "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/configs", - "Comment": "v0.1.1", - "Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" + "Comment": "v1.0.0-rc1-100-g142df38", + "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/configs/validate", - "Comment": "v0.1.1", - "Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" + "Comment": "v1.0.0-rc1-100-g142df38", + "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/criurpc", - "Comment": "v0.1.1", - "Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" + "Comment": "v1.0.0-rc1-100-g142df38", + "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/keys", - "Comment": "v0.1.1", - "Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" + "Comment": "v1.0.0-rc1-100-g142df38", + "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/label", - "Comment": "v0.1.1", - "Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" + "Comment": "v1.0.0-rc1-100-g142df38", + "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/seccomp", - "Comment": "v0.1.1", - "Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" + "Comment": "v1.0.0-rc1-100-g142df38", + "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/selinux", - "Comment": "v0.1.1", - "Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" + "Comment": "v1.0.0-rc1-100-g142df38", + "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/stacktrace", - "Comment": "v0.1.1", - "Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" + "Comment": "v1.0.0-rc1-100-g142df38", + "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/system", - "Comment": "v0.1.1", - "Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" + "Comment": "v1.0.0-rc1-100-g142df38", + "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/user", - "Comment": "v0.1.1", - "Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" + "Comment": "v1.0.0-rc1-100-g142df38", + "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/utils", - "Comment": "v0.1.1", - "Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" + "Comment": "v1.0.0-rc1-100-g142df38", + "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c" }, { "ImportPath": "github.com/pborman/uuid", diff --git a/pkg/kubelet/cm/container_manager_linux.go b/pkg/kubelet/cm/container_manager_linux.go index 7e80c09aa4c..af5e0a942a5 100644 --- a/pkg/kubelet/cm/container_manager_linux.go +++ b/pkg/kubelet/cm/container_manager_linux.go @@ -188,12 +188,13 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I // Create a cgroup container manager. func createManager(containerName string) *fs.Manager { + allowAllDevices := true return &fs.Manager{ Cgroups: &configs.Cgroup{ Parent: "/", Name: containerName, Resources: &configs.Resources{ - AllowAllDevices: true, + AllowAllDevices: &allowAllDevices, }, }, } @@ -319,7 +320,7 @@ func (cm *containerManagerImpl) setupNode() error { } glog.V(2).Infof("Configure resource-only container %s with memory limit: %d", cm.RuntimeCgroupsName, memoryLimit) - + allowAllDevices := true dockerContainer := &fs.Manager{ Cgroups: &configs.Cgroup{ Parent: "/", @@ -327,7 +328,7 @@ func (cm *containerManagerImpl) setupNode() error { Resources: &configs.Resources{ Memory: memoryLimit, MemorySwap: -1, - AllowAllDevices: true, + AllowAllDevices: &allowAllDevices, }, }, } @@ -370,12 +371,13 @@ func (cm *containerManagerImpl) setupNode() error { if cm.KubeletCgroupsName != "" { cont := newSystemCgroups(cm.KubeletCgroupsName) + allowAllDevices := true manager := fs.Manager{ Cgroups: &configs.Cgroup{ Parent: "/", Name: cm.KubeletCgroupsName, Resources: &configs.Resources{ - AllowAllDevices: true, + AllowAllDevices: &allowAllDevices, }, }, } diff --git a/pkg/util/resourcecontainer/resource_container_linux.go b/pkg/util/resourcecontainer/resource_container_linux.go index f7c2046a097..86477c5aa2e 100644 --- a/pkg/util/resourcecontainer/resource_container_linux.go +++ b/pkg/util/resourcecontainer/resource_container_linux.go @@ -30,12 +30,13 @@ import ( // // containerName must be an absolute container name. func RunInResourceContainer(containerName string) error { + allowAllDevices := true manager := fs.Manager{ Cgroups: &configs.Cgroup{ Parent: "/", Name: containerName, Resources: &configs.Resources{ - AllowAllDevices: true, + AllowAllDevices: &allowAllDevices, }, }, } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/README.md b/vendor/github.com/opencontainers/runc/libcontainer/README.md index 614969462f8..457b132e6f7 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/README.md +++ b/vendor/github.com/opencontainers/runc/libcontainer/README.md @@ -77,7 +77,7 @@ config := &configs.Config{ Parent: "system", Resources: &configs.Resources{ MemorySwappiness: nil, - AllowAllDevices: false, + AllowAllDevices: nil, AllowedDevices: configs.DefaultAllowedDevices, }, }, @@ -186,8 +186,8 @@ process := &libcontainer.Process{ err := container.Start(process) if err != nil { - logrus.Fatal(err) container.Destroy() + logrus.Fatal(err) return } @@ -219,6 +219,9 @@ container.Resume() // send signal to container's init process. container.Signal(signal) + +// update container resource constraints. +container.Set(config) ``` diff --git a/vendor/github.com/opencontainers/runc/libcontainer/SPEC.md b/vendor/github.com/opencontainers/runc/libcontainer/SPEC.md index 221545c01dc..32578f01a30 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/SPEC.md +++ b/vendor/github.com/opencontainers/runc/libcontainer/SPEC.md @@ -90,7 +90,7 @@ in tmpfs. After `/dev/null` has been setup we check for any external links between the container's io, STDIN, STDOUT, STDERR. If the container's io is pointing -to `/dev/null` outside the container we close and `dup2` the the `/dev/null` +to `/dev/null` outside the container we close and `dup2` the `/dev/null` that is local to the container's rootfs. @@ -297,7 +297,7 @@ a container. | -------------- | ------------------------------------------------------------------ | | Get processes | Return all the pids for processes running inside a container | | Get Stats | Return resource statistics for the container as a whole | -| Wait | Wait waits on the container's init process ( pid 1 ) | +| Wait | Waits on the container's init process ( pid 1 ) | | Wait Process | Wait on any of the container's processes returning the exit status | | Destroy | Kill the container's init process and remove any filesystem state | | Signal | Send a signal to the container's init process | diff --git a/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go b/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go index 22c17f5272c..82ed1a68a69 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go @@ -7,6 +7,7 @@ package apparmor // #include import "C" import ( + "fmt" "io/ioutil" "os" "unsafe" @@ -32,7 +33,7 @@ func ApplyProfile(name string) error { cName := C.CString(name) defer C.free(unsafe.Pointer(cName)) if _, err := C.aa_change_onexec(cName); err != nil { - return err + return fmt.Errorf("apparmor failed to apply profile: %s", err) } return nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go index 114f002ec84..ed46561a251 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go @@ -9,7 +9,6 @@ import ( "io/ioutil" "os" "path/filepath" - "strconv" "sync" "github.com/opencontainers/runc/libcontainer/cgroups" @@ -33,7 +32,6 @@ var ( &FreezerGroup{}, &NameGroup{GroupName: "name=systemd", Join: true}, } - CgroupProcesses = "cgroup.procs" HugePageSizes, _ = cgroups.GetHugePageSize() ) @@ -142,7 +140,9 @@ func (m *Manager) Apply(pid int) (err error) { // created then join consists of writing the process pids to cgroup.procs p, err := d.path(sys.Name()) if err != nil { - if cgroups.IsNotFound(err) { + // The non-presence of the devices subsystem is + // considered fatal for security reasons. + if cgroups.IsNotFound(err) && sys.Name() != "devices" { continue } return err @@ -190,6 +190,11 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) { } func (m *Manager) Set(container *configs.Config) error { + // If Paths are set, then we are just joining cgroups paths + // and there is no need to set any values. + if m.Cgroups.Paths != nil { + return nil + } for _, sys := range subsystems { // Generate fake cgroup data. d, err := getCgroupData(container.Cgroups, -1) @@ -339,7 +344,7 @@ func (raw *cgroupData) join(subsystem string) (string, error) { if err := os.MkdirAll(path, 0755); err != nil { return "", err } - if err := writeFile(path, CgroupProcesses, strconv.Itoa(raw.pid)); err != nil { + if err := cgroups.WriteCgroupProc(path, raw.pid); err != nil { return "", err } return path, nil @@ -349,7 +354,7 @@ func writeFile(dir, file, data string) error { // Normally dir should not be empty, one case is that cgroup subsystem // is not mounted, we will get empty dir, and we want it fail here. if dir == "" { - return fmt.Errorf("no such directory for %s.", file) + return fmt.Errorf("no such directory for %s", file) } if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700); err != nil { return fmt.Errorf("failed to write %v to %v: %v", data, file, err) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go index cbe62bd983a..29265c70ee5 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go @@ -8,7 +8,6 @@ import ( "io/ioutil" "os" "path/filepath" - "strconv" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" @@ -67,7 +66,7 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro } // because we are not using d.join we need to place the pid into the procs file // unlike the other subsystems - if err := writeFile(dir, "cgroup.procs", strconv.Itoa(pid)); err != nil { + if err := cgroups.WriteCgroupProc(dir, pid); err != nil { return err } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go index 5f783310947..0ac5b4ed700 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go @@ -43,21 +43,23 @@ func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error { } return nil } - if !cgroup.Resources.AllowAllDevices { - if err := writeFile(path, "devices.deny", "a"); err != nil { - return err - } - - for _, dev := range cgroup.Resources.AllowedDevices { - if err := writeFile(path, "devices.allow", dev.CgroupString()); err != nil { + if cgroup.Resources.AllowAllDevices != nil { + if *cgroup.Resources.AllowAllDevices == false { + if err := writeFile(path, "devices.deny", "a"); err != nil { return err } - } - return nil - } - if err := writeFile(path, "devices.allow", "a"); err != nil { - return err + for _, dev := range cgroup.Resources.AllowedDevices { + if err := writeFile(path, "devices.allow", dev.CgroupString()); err != nil { + return err + } + } + return nil + } + + if err := writeFile(path, "devices.allow", "a"); err != nil { + return err + } } for _, dev := range cgroup.Resources.DeniedDevices { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go index 6b4a9eac462..6c0fd02281c 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go @@ -5,15 +5,21 @@ package fs import ( "bufio" "fmt" + "io/ioutil" "os" "path/filepath" "strconv" "strings" + "syscall" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" ) +const ( + cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes" +) + type MemoryGroup struct { } @@ -32,13 +38,10 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) { return err } } - // We have to set kernel memory here, as we can't change it once - // processes have been attached. - if err := s.SetKernelMemory(path, d.config); err != nil { + if err := EnableKernelMemoryAccounting(path); err != nil { return err } } - defer func() { if err != nil { os.RemoveAll(path) @@ -54,13 +57,43 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) { return nil } -func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error { - // This has to be done separately because it has special constraints (it - // can't be done after there are processes attached to the cgroup). - if cgroup.Resources.KernelMemory > 0 { - if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil { - return err +func EnableKernelMemoryAccounting(path string) error { + // Check if kernel memory is enabled + // We have to limit the kernel memory here as it won't be accounted at all + // until a limit is set on the cgroup and limit cannot be set once the + // cgroup has children, or if there are already tasks in the cgroup. + kernelMemoryLimit := int64(1) + if err := setKernelMemory(path, kernelMemoryLimit); err != nil { + return err + } + kernelMemoryLimit = int64(-1) + if err := setKernelMemory(path, kernelMemoryLimit); err != nil { + return err + } + return nil +} + +func setKernelMemory(path string, kernelMemoryLimit int64) error { + if path == "" { + return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit) + } + if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) { + // kernel memory is not enabled on the system so we should do nothing + return nil + } + if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil { + // Check if the error number returned by the syscall is "EBUSY" + // The EBUSY signal is returned on attempts to write to the + // memory.kmem.limit_in_bytes file if the cgroup has children or + // once tasks have been attached to the cgroup + if pathErr, ok := err.(*os.PathError); ok { + if errNo, ok := pathErr.Err.(syscall.Errno); ok { + if errNo == syscall.EBUSY { + return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit) + } + } } + return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err) } return nil } @@ -113,11 +146,18 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error { return err } + if cgroup.Resources.KernelMemory != 0 { + if err := setKernelMemory(path, cgroup.Resources.KernelMemory); err != nil { + return err + } + } + if cgroup.Resources.MemoryReservation != 0 { if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil { return err } } + if cgroup.Resources.KernelMemoryTCP != 0 { if err := writeFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil { return err diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go index 8a4054ba877..8e74b645eac 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go @@ -3,6 +3,8 @@ package fs import ( + "strconv" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" ) @@ -23,8 +25,8 @@ func (s *NetClsGroup) Apply(d *cgroupData) error { } func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error { - if cgroup.Resources.NetClsClassid != "" { - if err := writeFile(path, "net_cls.classid", cgroup.Resources.NetClsClassid); err != nil { + if cgroup.Resources.NetClsClassid != 0 { + if err := writeFile(path, "net_cls.classid", strconv.FormatUint(uint64(cgroup.Resources.NetClsClassid), 10)); err != nil { return err } } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/utils.go index 852b18391d0..5ff0a161504 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/utils.go @@ -12,7 +12,6 @@ import ( ) var ( - ErrNotSupportStat = errors.New("stats are not supported for subsystem") ErrNotValidFormat = errors.New("line is not a valid key value format") ) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go index 797a923c388..b483f1bf983 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go @@ -11,6 +11,7 @@ type ThrottlingData struct { ThrottledTime uint64 `json:"throttled_time,omitempty"` } +// CpuUsage denotes the usage of a CPU. // All CPU stats are aggregate since container inception. type CpuUsage struct { // Total CPU time consumed. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go index b6158095603..22ff711779b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go @@ -74,6 +74,7 @@ var ( theConn *systemdDbus.Conn hasStartTransientUnit bool hasTransientDefaultDependencies bool + hasDelegate bool ) func newProp(name string, units interface{}) systemdDbus.Property { @@ -146,6 +147,20 @@ func UseSystemd() bool { // Not critical because of the stop unit logic above. theConn.StopUnit(scope, "replace", nil) + + // Assume StartTransientUnit on a scope allows Delegate + hasDelegate = true + dl := newProp("Delegate", true) + if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{dl}, nil); err != nil { + if dbusError, ok := err.(dbus.Error); ok { + if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") { + hasDelegate = false + } + } + } + + // Not critical because of the stop unit logic above. + theConn.StopUnit(scope, "replace", nil) } return hasStartTransientUnit } @@ -183,10 +198,13 @@ func (m *Manager) Apply(pid int) error { systemdDbus.PropSlice(slice), systemdDbus.PropDescription("docker container "+c.Name), newProp("PIDs", []uint32{uint32(pid)}), - // This is only supported on systemd versions 218 and above. - newProp("Delegate", true), ) + if hasDelegate { + // This is only supported on systemd versions 218 and above. + properties = append(properties, newProp("Delegate", true)) + } + // Always enable accounting, this gets us the same behaviour as the fs implementation, // plus the kernel has some problems with joining the memory cgroup at a later time. properties = append(properties, @@ -214,11 +232,9 @@ func (m *Manager) Apply(pid int) error { newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight))) } - // We need to set kernel memory before processes join cgroup because - // kmem.limit_in_bytes can only be set when the cgroup is empty. - // And swap memory limit needs to be set after memory limit, only - // memory limit is handled by systemd, so it's kind of ugly here. - if c.Resources.KernelMemory > 0 { + // We have to set kernel memory here, as we can't change it once + // processes have been attached to the cgroup. + if c.Resources.KernelMemory != 0 { if err := setKernelMemory(c); err != nil { return err } @@ -273,7 +289,7 @@ func writeFile(dir, file, data string) error { // Normally dir should not be empty, one case is that cgroup subsystem // is not mounted, we will get empty dir, and we want it fail here. if dir == "" { - return fmt.Errorf("no such directory for %s.", file) + return fmt.Errorf("no such directory for %s", file) } return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) } @@ -372,6 +388,8 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { if err != nil { return "", err } + // if pid 1 is systemd 226 or later, it will be in init.scope, not the root + initPath = strings.TrimSuffix(filepath.Clean(initPath), "init.scope") slice := "system.slice" if c.Parent != "" { @@ -439,6 +457,11 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) { } func (m *Manager) Set(container *configs.Config) error { + // If Paths are set, then we are just joining cgroups paths + // and there is no need to set any values. + if m.Cgroups.Paths != nil { + return nil + } for _, sys := range subsystems { // Get the subsystem path, but don't error out for not found cgroups. path, err := getSubsystemPath(container.Cgroups, sys.Name()) @@ -472,8 +495,5 @@ func setKernelMemory(c *configs.Cgroup) error { if err := os.MkdirAll(path, 0755); err != nil { return err } - - // This doesn't get called by manager.Set, so we need to do it here. - s := &fs.MemoryGroup{} - return s.SetKernelMemory(path, c) + return fs.EnableKernelMemoryAccounting(path) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go index 235273299fa..d43270032c9 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go @@ -16,13 +16,19 @@ import ( "github.com/docker/go-units" ) -const cgroupNamePrefix = "name=" +const ( + cgroupNamePrefix = "name=" + CgroupProcesses = "cgroup.procs" +) -// https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt +// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt func FindCgroupMountpoint(subsystem string) (string, error) { // We are not using mount.GetMounts() because it's super-inefficient, // parsing it directly sped up x10 times because of not using Sscanf. // It was one of two major performance drawbacks in container start. + if !isSubsystemAvailable(subsystem) { + return "", NewNotFoundError(subsystem) + } f, err := os.Open("/proc/self/mountinfo") if err != nil { return "", err @@ -47,6 +53,9 @@ func FindCgroupMountpoint(subsystem string) (string, error) { } func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) { + if !isSubsystemAvailable(subsystem) { + return "", "", NewNotFoundError(subsystem) + } f, err := os.Open("/proc/self/mountinfo") if err != nil { return "", "", err @@ -70,6 +79,15 @@ func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) { return "", "", NewNotFoundError(subsystem) } +func isSubsystemAvailable(subsystem string) bool { + cgroups, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return false + } + _, avail := cgroups[subsystem] + return avail +} + func FindCgroupMountpointDir() (string, error) { f, err := os.Open("/proc/self/mountinfo") if err != nil { @@ -124,7 +142,8 @@ func (m Mount) GetThisCgroupDir(cgroups map[string]string) (string, error) { func getCgroupMountsHelper(ss map[string]bool, mi io.Reader) ([]Mount, error) { res := make([]Mount, 0, len(ss)) scanner := bufio.NewScanner(mi) - for scanner.Scan() { + numFound := 0 + for scanner.Scan() && numFound < len(ss) { txt := scanner.Text() sepIdx := strings.Index(txt, " - ") if sepIdx == -1 { @@ -139,12 +158,15 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader) ([]Mount, error) { Root: fields[3], } for _, opt := range strings.Split(fields[len(fields)-1], ",") { + if !ss[opt] { + continue + } if strings.HasPrefix(opt, cgroupNamePrefix) { m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):]) - } - if ss[opt] { + } else { m.Subsystems = append(m.Subsystems, opt) } + numFound++ } res = append(res, m) } @@ -161,19 +183,19 @@ func GetCgroupMounts() ([]Mount, error) { } defer f.Close() - all, err := GetAllSubsystems() + all, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { return nil, err } allMap := make(map[string]bool) - for _, s := range all { + for s := range all { allMap[s] = true } return getCgroupMountsHelper(allMap, f) } -// Returns all the cgroup subsystems supported by the kernel +// GetAllSubsystems returns all the cgroup subsystems supported by the kernel func GetAllSubsystems() ([]string, error) { f, err := os.Open("/proc/cgroups") if err != nil { @@ -199,7 +221,7 @@ func GetAllSubsystems() ([]string, error) { return subsystems, nil } -// Returns the relative path to the cgroup docker is running in. +// GetThisCgroupDir returns the relative path to the cgroup docker is running in. func GetThisCgroupDir(subsystem string) (string, error) { cgroups, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { @@ -220,7 +242,7 @@ func GetInitCgroupDir(subsystem string) (string, error) { } func readProcsFile(dir string) ([]int, error) { - f, err := os.Open(filepath.Join(dir, "cgroup.procs")) + f, err := os.Open(filepath.Join(dir, CgroupProcesses)) if err != nil { return nil, err } @@ -243,6 +265,8 @@ func readProcsFile(dir string) ([]int, error) { return out, nil } +// ParseCgroupFile parses the given cgroup file, typically from +// /proc//cgroup, into a map of subgroups to cgroup names. func ParseCgroupFile(path string) (map[string]string, error) { f, err := os.Open(path) if err != nil { @@ -250,7 +274,12 @@ func ParseCgroupFile(path string) (map[string]string, error) { } defer f.Close() - s := bufio.NewScanner(f) + return parseCgroupFromReader(f) +} + +// helper function for ParseCgroupFile to make testing easier +func parseCgroupFromReader(r io.Reader) (map[string]string, error) { + s := bufio.NewScanner(r) cgroups := make(map[string]string) for s.Scan() { @@ -259,7 +288,16 @@ func ParseCgroupFile(path string) (map[string]string, error) { } text := s.Text() - parts := strings.Split(text, ":") + // from cgroups(7): + // /proc/[pid]/cgroup + // ... + // For each cgroup hierarchy ... there is one entry + // containing three colon-separated fields of the form: + // hierarchy-ID:subsystem-list:cgroup-path + parts := strings.SplitN(text, ":", 3) + if len(parts) < 3 { + return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text) + } for _, subs := range strings.Split(parts[1], ",") { cgroups[subs] = parts[2] @@ -291,8 +329,7 @@ func PathExists(path string) bool { func EnterPid(cgroupPaths map[string]string, pid int) error { for _, path := range cgroupPaths { if PathExists(path) { - if err := ioutil.WriteFile(filepath.Join(path, "cgroup.procs"), - []byte(strconv.Itoa(pid)), 0700); err != nil { + if err := WriteCgroupProc(path, pid); err != nil { return err } } @@ -361,7 +398,7 @@ func GetAllPids(path string) ([]int, error) { // collect pids from all sub-cgroups err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error { dir, file := filepath.Split(p) - if file != "cgroup.procs" { + if file != CgroupProcesses { return nil } if iErr != nil { @@ -376,3 +413,20 @@ func GetAllPids(path string) ([]int, error) { }) return pids, err } + +// WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file +func WriteCgroupProc(dir string, pid int) error { + // Normally dir should not be empty, one case is that cgroup subsystem + // is not mounted, we will get empty dir, and we want it fail here. + if dir == "" { + return fmt.Errorf("no such directory for %s", CgroupProcesses) + } + + // Dont attach any pid to the cgroup if -1 is specified as a pid + if pid != -1 { + if err := ioutil.WriteFile(filepath.Join(dir, CgroupProcesses), []byte(strconv.Itoa(pid)), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err) + } + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go index f2eff91cf45..bd6f69b82f4 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go @@ -36,7 +36,7 @@ type Cgroup struct { type Resources struct { // If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list. // Deprecated - AllowAllDevices bool `json:"allow_all_devices,omitempty"` + AllowAllDevices *bool `json:"allow_all_devices,omitempty"` // Deprecated AllowedDevices []*Device `json:"allowed_devices,omitempty"` // Deprecated @@ -69,10 +69,10 @@ type Resources struct { CpuPeriod int64 `json:"cpu_period"` // How many time CPU will use in realtime scheduling (in usecs). - CpuRtRuntime int64 `json:"cpu_quota"` + CpuRtRuntime int64 `json:"cpu_rt_quota"` // CPU period to be used for realtime scheduling (in usecs). - CpuRtPeriod int64 `json:"cpu_period"` + CpuRtPeriod int64 `json:"cpu_rt_period"` // CPU to use CpusetCpus string `json:"cpuset_cpus"` @@ -120,5 +120,5 @@ type Resources struct { NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"` // Set class identifier for container's network packets - NetClsClassid string `json:"net_cls_classid"` + NetClsClassid uint32 `json:"net_cls_classid"` } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go index 1221ce272e0..3c38191b354 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go @@ -33,7 +33,7 @@ type Seccomp struct { Syscalls []*Syscall `json:"syscalls"` } -// An action to be taken upon rule match in Seccomp +// Action is taken upon rule match in Seccomp type Action int const ( @@ -44,7 +44,7 @@ const ( Trace ) -// A comparison operator to be used when matching syscall arguments in Seccomp +// Operator is a comparison operator to be used when matching syscall arguments in Seccomp type Operator int const ( @@ -57,7 +57,7 @@ const ( MaskEqualTo ) -// A rule to match a specific syscall argument in Seccomp +// Arg is a rule to match a specific syscall argument in Seccomp type Arg struct { Index uint `json:"index"` Value uint64 `json:"value"` @@ -65,7 +65,7 @@ type Arg struct { Op Operator `json:"op"` } -// An rule to match a syscall in Seccomp +// Syscall is a rule to match a syscall in Seccomp type Syscall struct { Name string `json:"name"` Action Action `json:"action"` @@ -148,10 +148,6 @@ type Config struct { // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/ OomScoreAdj int `json:"oom_score_adj"` - // AdditionalGroups specifies the gids that should be added to supplementary groups - // in addition to those that the user belongs to. - AdditionalGroups []string `json:"additional_groups"` - // UidMappings is an array of User ID mappings for User Namespaces UidMappings []IDMap `json:"uid_mappings"` @@ -187,6 +183,10 @@ type Config struct { // Labels are user defined metadata that is stored in the config and populated on the state Labels []string `json:"labels"` + + // NoNewKeyring will not allocated a new session keyring for the container. It will use the + // callers keyring in this case. + NoNewKeyring bool `json:"no_new_keyring"` } type Hooks struct { @@ -261,7 +261,7 @@ type Hook interface { Run(HookState) error } -// NewFunctionHooks will call the provided function when the hook is run. +// NewFunctionHook will call the provided function when the hook is run. func NewFunctionHook(f func(HookState) error) FuncHook { return FuncHook{ run: f, @@ -284,7 +284,7 @@ type Command struct { Timeout *time.Duration `json:"timeout"` } -// NewCommandHooks will execute the provided command when the hook is run. +// NewCommandHook will execute the provided command when the hook is run. func NewCommandHook(cmd Command) CommandHook { return CommandHook{ Command: cmd, diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config_unix.go index c447f3ef29f..a60554a7b96 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config_unix.go @@ -4,7 +4,7 @@ package configs import "fmt" -// Gets the root uid for the process on host which could be non-zero +// HostUID gets the root uid for the process on host which could be non-zero // when user namespaces are enabled. func (c Config) HostUID() (int, error) { if c.Namespaces.Contains(NEWUSER) { @@ -21,7 +21,7 @@ func (c Config) HostUID() (int, error) { return 0, nil } -// Gets the root gid for the process on host which could be non-zero +// HostGID gets the root gid for the process on host which could be non-zero // when user namespaces are enabled. func (c Config) HostGID() (int, error) { if c.Namespaces.Contains(NEWUSER) { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go index e45299264c8..ba1f437f3bb 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go @@ -3,7 +3,7 @@ package configs var ( - // These are devices that are to be both allowed and created. + // DefaultSimpleDevices are devices that are to be both allowed and created. DefaultSimpleDevices = []*Device{ // /dev/null and zero { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go index bd6964c56e8..448cde27a27 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go @@ -7,6 +7,7 @@ import ( "strings" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/selinux" ) type Validator interface { @@ -80,6 +81,10 @@ func (v *ConfigValidator) security(config *configs.Config) error { !config.Namespaces.Contains(configs.NEWNS) { return fmt.Errorf("unable to restrict sys entries without a private MNT namespace") } + if config.ProcessLabel != "" && !selinux.SelinuxEnabled() { + return fmt.Errorf("selinux label is specified in config, but selinux is disabled or not supported") + } + return nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/console_solaris.go b/vendor/github.com/opencontainers/runc/libcontainer/console_solaris.go new file mode 100644 index 00000000000..9e89f505321 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/console_solaris.go @@ -0,0 +1,11 @@ +package libcontainer + +import ( + "errors" +) + +// NewConsole returns an initalized console that can be used within a container by copying bytes +// from the master side to the slave that is attached as the tty for the container's init process. +func NewConsole(uid, gid int) (Console, error) { + return nil, errors.New("libcontainer console is not supported on Solaris") +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container.go b/vendor/github.com/opencontainers/runc/libcontainer/container.go index 32daa97675a..1a71179c96d 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/container.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/container.go @@ -1,4 +1,4 @@ -// Libcontainer provides a native Go implementation for creating containers +// Package libcontainer provides a native Go implementation for creating containers // with namespaces, cgroups, capabilities, and filesystem access controls. // It allows you to manage the lifecycle of the container performing additional operations // after the container is created. @@ -11,24 +11,20 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" ) -// The status of a container. +// Status is the status of a container. type Status int const ( - // The container exists but has not been run yet + // Created is the status that denotes the container exists but has not been run yet. Created Status = iota - - // The container exists and is running. + // Running is the status that denotes the container exists and is running. Running - - // The container exists, it is in the process of being paused. + // Pausing is the status that denotes the container exists, it is in the process of being paused. Pausing - - // The container exists, but all its processes are paused. + // Paused is the status that denotes the container exists, but all its processes are paused. Paused - - // The container does not exist. - Destroyed + // Stopped is the status that denotes the container does not have a created or running process. + Stopped ) func (s Status) String() string { @@ -41,8 +37,8 @@ func (s Status) String() string { return "pausing" case Paused: return "paused" - case Destroyed: - return "destroyed" + case Stopped: + return "stopped" default: return "unknown" } @@ -67,7 +63,7 @@ type BaseState struct { Config configs.Config `json:"config"` } -// A libcontainer container object. +// BaseContainer is a libcontainer container object. // // Each container is thread-safe within the same process. Since a container can // be destroyed by a separate process, any function may return that the container @@ -80,13 +76,13 @@ type BaseContainer interface { // // errors: // ContainerDestroyed - Container no longer exists, - // Systemerror - System error. + // SystemError - System error. Status() (Status, error) // State returns the current container's state information. // // errors: - // Systemerror - System error. + // SystemError - System error. State() (*State, error) // Returns the current config of the container. @@ -96,7 +92,7 @@ type BaseContainer interface { // // errors: // ContainerDestroyed - Container no longer exists, - // Systemerror - System error. + // SystemError - System error. // // Some of the returned PIDs may no longer refer to processes in the Container, unless // the Container state is PAUSED in which case every PID in the slice is valid. @@ -106,7 +102,7 @@ type BaseContainer interface { // // errors: // ContainerDestroyed - Container no longer exists, - // Systemerror - System error. + // SystemError - System error. Stats() (*Stats, error) // Set resources of container as configured @@ -114,7 +110,7 @@ type BaseContainer interface { // We can use this to change resources when containers are running. // // errors: - // Systemerror - System error. + // SystemError - System error. Set(config configs.Config) error // Start a process inside the container. Returns error if process fails to @@ -124,21 +120,38 @@ type BaseContainer interface { // ContainerDestroyed - Container no longer exists, // ConfigInvalid - config is invalid, // ContainerPaused - Container is paused, - // Systemerror - System error. + // SystemError - System error. Start(process *Process) (err error) + // Run immediatly starts the process inside the conatiner. Returns error if process + // fails to start. It does not block waiting for the exec fifo after start returns but + // opens the fifo after start returns. + // + // errors: + // ContainerDestroyed - Container no longer exists, + // ConfigInvalid - config is invalid, + // ContainerPaused - Container is paused, + // SystemError - System error. + Run(process *Process) (err error) + // Destroys the container after killing all running processes. // // Any event registrations are removed before the container is destroyed. // No error is returned if the container is already destroyed. // // errors: - // Systemerror - System error. + // SystemError - System error. Destroy() error // Signal sends the provided signal code to the container's initial process. // // errors: - // Systemerror - System error. + // SystemError - System error. Signal(s os.Signal) error + + // Exec signals the container to exec the users process at the end of the init. + // + // errors: + // SystemError - System error. + Exec() error } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go index 2ae50c465c5..70cbc6359f3 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go @@ -22,6 +22,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/criurpc" + "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/utils" "github.com/syndtr/gocapability/capability" "github.com/vishvananda/netlink/nl" @@ -30,18 +31,19 @@ import ( const stdioFdCount = 3 type linuxContainer struct { - id string - root string - config *configs.Config - cgroupManager cgroups.Manager - initPath string - initArgs []string - initProcess parentProcess - criuPath string - m sync.Mutex - criuVersion int - state containerState - created time.Time + id string + root string + config *configs.Config + cgroupManager cgroups.Manager + initPath string + initArgs []string + initProcess parentProcess + initProcessStartTime string + criuPath string + m sync.Mutex + criuVersion int + state containerState + created time.Time } // State represents a running container's state @@ -62,7 +64,7 @@ type State struct { ExternalDescriptors []string `json:"external_descriptors,omitempty"` } -// A libcontainer container object. +// Container is a libcontainer container object. // // Each container is thread-safe within the same process. Since a container can // be destroyed by a separate process, any function may return that the container @@ -84,7 +86,7 @@ type Container interface { // Systemerror - System error. Restore(process *Process, criuOpts *CriuOpts) error - // If the Container state is RUNNING or PAUSING, sets the Container state to PAUSING and pauses + // If the Container state is RUNNING, sets the Container state to PAUSING and pauses // the execution of any user processes. Asynchronously, when the container finished being paused the // state is changed to PAUSED. // If the Container state is PAUSED, do nothing. @@ -141,7 +143,7 @@ func (c *linuxContainer) State() (*State, error) { func (c *linuxContainer) Processes() ([]int, error) { pids, err := c.cgroupManager.GetAllPids() if err != nil { - return nil, newSystemError(err) + return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups") } return pids, nil } @@ -152,14 +154,14 @@ func (c *linuxContainer) Stats() (*Stats, error) { stats = &Stats{} ) if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil { - return stats, newSystemError(err) + return stats, newSystemErrorWithCause(err, "getting container stats from cgroups") } for _, iface := range c.config.Networks { switch iface.Type { case "veth": istats, err := getNetworkInterfaceStats(iface.HostInterfaceName) if err != nil { - return stats, newSystemError(err) + return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName) } stats.Interfaces = append(stats.Interfaces, istats) } @@ -170,6 +172,13 @@ func (c *linuxContainer) Stats() (*Stats, error) { func (c *linuxContainer) Set(config configs.Config) error { c.m.Lock() defer c.m.Unlock() + status, err := c.currentStatus() + if err != nil { + return err + } + if status == Stopped { + return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning) + } c.config = &config return c.cgroupManager.Set(c.config) } @@ -181,28 +190,76 @@ func (c *linuxContainer) Start(process *Process) error { if err != nil { return err } - doInit := status == Destroyed - parent, err := c.newParentProcess(process, doInit) + return c.start(process, status == Stopped) +} + +func (c *linuxContainer) Run(process *Process) error { + c.m.Lock() + defer c.m.Unlock() + status, err := c.currentStatus() if err != nil { - return newSystemError(err) + return err + } + if err := c.start(process, status == Stopped); err != nil { + return err + } + if status == Stopped { + return c.exec() + } + return nil +} + +func (c *linuxContainer) Exec() error { + c.m.Lock() + defer c.m.Unlock() + return c.exec() +} + +func (c *linuxContainer) exec() error { + path := filepath.Join(c.root, execFifoFilename) + f, err := os.OpenFile(path, os.O_RDONLY, 0) + if err != nil { + return newSystemErrorWithCause(err, "open exec fifo for reading") + } + defer f.Close() + data, err := ioutil.ReadAll(f) + if err != nil { + return err + } + if len(data) > 0 { + os.Remove(path) + return nil + } + return fmt.Errorf("cannot start an already running container") +} + +func (c *linuxContainer) start(process *Process, isInit bool) error { + parent, err := c.newParentProcess(process, isInit) + if err != nil { + return newSystemErrorWithCause(err, "creating new parent process") } if err := parent.start(); err != nil { // terminate the process to ensure that it properly is reaped. if err := parent.terminate(); err != nil { logrus.Warn(err) } - return newSystemError(err) + return newSystemErrorWithCause(err, "starting container process") } // generate a timestamp indicating when the container was started c.created = time.Now().UTC() - c.state = &runningState{ c: c, } - if doInit { - if err := c.updateState(parent); err != nil { + if isInit { + c.state = &createdState{ + c: c, + } + state, err := c.updateState(parent) + if err != nil { return err } + c.initProcessStartTime = state.InitProcessStartTime + if c.config.Hooks != nil { s := configs.HookState{ Version: c.config.Version, @@ -211,12 +268,12 @@ func (c *linuxContainer) Start(process *Process) error { Root: c.config.Rootfs, BundlePath: utils.SearchLabels(c.config.Labels, "bundle"), } - for _, hook := range c.config.Hooks.Poststart { + for i, hook := range c.config.Hooks.Poststart { if err := hook.Run(s); err != nil { if err := parent.terminate(); err != nil { logrus.Warn(err) } - return newSystemError(err) + return newSystemErrorWithCausef(err, "running poststart hook %d", i) } } } @@ -226,7 +283,7 @@ func (c *linuxContainer) Start(process *Process) error { func (c *linuxContainer) Signal(s os.Signal) error { if err := c.initProcess.signal(s); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "signaling init process") } return nil } @@ -234,19 +291,23 @@ func (c *linuxContainer) Signal(s os.Signal) error { func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) { parentPipe, childPipe, err := newPipe() if err != nil { - return nil, newSystemError(err) + return nil, newSystemErrorWithCause(err, "creating new init pipe") } - cmd, err := c.commandTemplate(p, childPipe) + rootDir, err := os.Open(c.root) if err != nil { - return nil, newSystemError(err) + return nil, err + } + cmd, err := c.commandTemplate(p, childPipe, rootDir) + if err != nil { + return nil, newSystemErrorWithCause(err, "creating new command template") } if !doInit { - return c.newSetnsProcess(p, cmd, parentPipe, childPipe) + return c.newSetnsProcess(p, cmd, parentPipe, childPipe, rootDir) } - return c.newInitProcess(p, cmd, parentPipe, childPipe) + return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir) } -func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) { +func (c *linuxContainer) commandTemplate(p *Process, childPipe, rootDir *os.File) (*exec.Cmd, error) { cmd := &exec.Cmd{ Path: c.initPath, Args: c.initArgs, @@ -258,8 +319,10 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. if cmd.SysProcAttr == nil { cmd.SysProcAttr = &syscall.SysProcAttr{} } - cmd.ExtraFiles = append(p.ExtraFiles, childPipe) - cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) + cmd.ExtraFiles = append(p.ExtraFiles, childPipe, rootDir) + cmd.Env = append(cmd.Env, + fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-2), + fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) // NOTE: when running a container with no PID namespace and the parent process spawning the container is // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason // even with the parent still running. @@ -269,7 +332,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. return cmd, nil } -func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) { +func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) nsMaps := make(map[configs.NamespaceType]string) for _, ns := range c.config.Namespaces { @@ -292,14 +355,15 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c process: p, bootstrapData: data, sharePidns: sharePidns, + rootDir: rootDir, }, nil } -func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) { +func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*setnsProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) state, err := c.currentState() if err != nil { - return nil, newSystemError(err) + return nil, newSystemErrorWithCause(err, "getting container's current state") } // for setns process, we dont have to set cloneflags as the process namespaces // will only be set via setns syscall @@ -316,6 +380,7 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, config: c.newInitConfig(p), process: p, bootstrapData: data, + rootDir: rootDir, }, nil } @@ -325,6 +390,7 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig { Args: process.Args, Env: process.Env, User: process.User, + AdditionalGroups: process.AdditionalGroups, Cwd: process.Cwd, Console: process.consolePath, Capabilities: process.Capabilities, @@ -334,6 +400,7 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig { AppArmorProfile: c.config.AppArmorProfile, ProcessLabel: c.config.ProcessLabel, Rlimits: c.config.Rlimits, + ExecFifoPath: filepath.Join(c.root, execFifoFilename), } if process.NoNewPrivileges != nil { cfg.NoNewPrivileges = *process.NoNewPrivileges @@ -371,15 +438,16 @@ func (c *linuxContainer) Pause() error { if err != nil { return err } - if status != Running { - return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning) + switch status { + case Running, Created: + if err := c.cgroupManager.Freeze(configs.Frozen); err != nil { + return err + } + return c.state.transition(&pausedState{ + c: c, + }) } - if err := c.cgroupManager.Freeze(configs.Frozen); err != nil { - return err - } - return c.state.transition(&pausedState{ - c: c, - }) + return newGenericError(fmt.Errorf("container not running: %s", status), ContainerNotRunning) } func (c *linuxContainer) Resume() error { @@ -408,13 +476,13 @@ func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struc return notifyMemoryPressure(c.cgroupManager.GetPaths(), level) } -// check Criu version greater than or equal to min_version -func (c *linuxContainer) checkCriuVersion(min_version string) error { +// checkCriuVersion checks Criu version greater than or equal to minVersion +func (c *linuxContainer) checkCriuVersion(minVersion string) error { var x, y, z, versionReq int - _, err := fmt.Sscanf(min_version, "%d.%d.%d\n", &x, &y, &z) // 1.5.2 + _, err := fmt.Sscanf(minVersion, "%d.%d.%d\n", &x, &y, &z) // 1.5.2 if err != nil { - _, err = fmt.Sscanf(min_version, "Version: %d.%d\n", &x, &y) // 1.6 + _, err = fmt.Sscanf(minVersion, "Version: %d.%d\n", &x, &y) // 1.6 } versionReq = x*10000 + y*100 + z @@ -459,7 +527,7 @@ func (c *linuxContainer) checkCriuVersion(min_version string) error { c.criuVersion = x*10000 + y*100 + z if c.criuVersion < versionReq { - return fmt.Errorf("CRIU version must be %s or higher", min_version) + return fmt.Errorf("CRIU version must be %s or higher", minVersion) } return nil @@ -607,6 +675,27 @@ func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mo req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) } +func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) { + for _, iface := range c.config.Networks { + switch iface.Type { + case "veth": + veth := new(criurpc.CriuVethPair) + veth.IfOut = proto.String(iface.HostInterfaceName) + veth.IfIn = proto.String(iface.Name) + req.Opts.Veths = append(req.Opts.Veths, veth) + break + case "loopback": + break + } + } + for _, i := range criuOpts.VethPairs { + veth := new(criurpc.CriuVethPair) + veth.IfOut = proto.String(i.HostInterfaceName) + veth.IfIn = proto.String(i.ContainerInterfaceName) + req.Opts.Veths = append(req.Opts.Veths, veth) + } +} + func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { c.m.Lock() defer c.m.Unlock() @@ -690,23 +779,9 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { break } } - for _, iface := range c.config.Networks { - switch iface.Type { - case "veth": - veth := new(criurpc.CriuVethPair) - veth.IfOut = proto.String(iface.HostInterfaceName) - veth.IfIn = proto.String(iface.Name) - req.Opts.Veths = append(req.Opts.Veths, veth) - break - case "loopback": - break - } - } - for _, i := range criuOpts.VethPairs { - veth := new(criurpc.CriuVethPair) - veth.IfOut = proto.String(i.HostInterfaceName) - veth.IfIn = proto.String(i.ContainerInterfaceName) - req.Opts.Veths = append(req.Opts.Veths, veth) + + if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 { + c.restoreNetwork(req, criuOpts) } // append optional manage cgroups mode @@ -955,9 +1030,9 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc Pid: int(notify.GetPid()), Root: c.config.Rootfs, } - for _, hook := range c.config.Hooks.Prestart { + for i, hook := range c.config.Hooks.Prestart { if err := hook.Run(s); err != nil { - return newSystemError(err) + return newSystemErrorWithCausef(err, "running prestart hook %d", i) } } } @@ -974,7 +1049,7 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc }); err != nil { return err } - if err := c.updateState(r); err != nil { + if _, err := c.updateState(r); err != nil { return err } if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil { @@ -986,13 +1061,17 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc return nil } -func (c *linuxContainer) updateState(process parentProcess) error { +func (c *linuxContainer) updateState(process parentProcess) (*State, error) { c.initProcess = process state, err := c.currentState() if err != nil { - return err + return nil, err } - return c.saveState(state) + err = c.saveState(state) + if err != nil { + return nil, err + } + return state, nil } func (c *linuxContainer) saveState(s *State) error { @@ -1027,37 +1106,75 @@ func (c *linuxContainer) refreshState() error { if paused { return c.state.transition(&pausedState{c: c}) } - running, err := c.isRunning() + t, err := c.runType() if err != nil { return err } - if running { + switch t { + case Created: + return c.state.transition(&createdState{c: c}) + case Running: return c.state.transition(&runningState{c: c}) } return c.state.transition(&stoppedState{c: c}) } -func (c *linuxContainer) isRunning() (bool, error) { - if c.initProcess == nil { +// doesInitProcessExist checks if the init process is still the same process +// as the initial one, it could happen that the original process has exited +// and a new process has been created with the same pid, in this case, the +// container would already be stopped. +func (c *linuxContainer) doesInitProcessExist(initPid int) (bool, error) { + startTime, err := system.GetProcessStartTime(initPid) + if err != nil { + return false, newSystemErrorWithCausef(err, "getting init process %d start time", initPid) + } + if c.initProcessStartTime != startTime { return false, nil } - // return Running if the init process is alive - if err := syscall.Kill(c.initProcess.pid(), 0); err != nil { - if err == syscall.ESRCH { - return false, nil - } - return false, newSystemError(err) - } return true, nil } +func (c *linuxContainer) runType() (Status, error) { + if c.initProcess == nil { + return Stopped, nil + } + pid := c.initProcess.pid() + // return Running if the init process is alive + if err := syscall.Kill(pid, 0); err != nil { + if err == syscall.ESRCH { + // It means the process does not exist anymore, could happen when the + // process exited just when we call the function, we should not return + // error in this case. + return Stopped, nil + } + return Stopped, newSystemErrorWithCausef(err, "sending signal 0 to pid %d", pid) + } + // check if the process is still the original init process. + exist, err := c.doesInitProcessExist(pid) + if !exist || err != nil { + return Stopped, err + } + // check if the process that is running is the init process or the user's process. + // this is the difference between the container Running and Created. + environ, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/environ", pid)) + if err != nil { + return Stopped, newSystemErrorWithCausef(err, "reading /proc/%d/environ", pid) + } + check := []byte("_LIBCONTAINER") + if bytes.Contains(environ, check) { + return Created, nil + } + return Running, nil +} + func (c *linuxContainer) isPaused() (bool, error) { data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state")) if err != nil { + // If freezer cgroup is not mounted, the container would just be not paused. if os.IsNotExist(err) { return false, nil } - return false, newSystemError(err) + return false, newSystemErrorWithCause(err, "checking if container is paused") } return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil } @@ -1125,7 +1242,7 @@ func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceTyp } // only set to join this namespace if it exists if _, err := os.Lstat(p); err != nil { - return nil, newSystemError(err) + return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p) } // do not allow namespace path with comma as we use it to separate // the namespace paths diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container_solaris.go b/vendor/github.com/opencontainers/runc/libcontainer/container_solaris.go new file mode 100644 index 00000000000..bb84ff7402b --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/container_solaris.go @@ -0,0 +1,20 @@ +package libcontainer + +// State represents a running container's state +type State struct { + BaseState + + // Platform specific fields below here +} + +// A libcontainer container object. +// +// Each container is thread-safe within the same process. Since a container can +// be destroyed by a separate process, any function may return that the container +// was not found. +type Container interface { + BaseContainer + + // Methods below here are platform specific + +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_unix.go index 133238583b5..b163fbbb030 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_unix.go @@ -3,13 +3,13 @@ package libcontainer // cgroup restoring strategy provided by criu -type cg_mode uint32 +type cgMode uint32 const ( - CRIU_CG_MODE_SOFT cg_mode = 3 + iota // restore cgroup properties if only dir created by criu - CRIU_CG_MODE_FULL // always restore all cgroups and their properties - CRIU_CG_MODE_STRICT // restore all, requiring them to not present in the system - CRIU_CG_MODE_DEFAULT // the same as CRIU_CG_MODE_SOFT + CRIU_CG_MODE_SOFT cgMode = 3 + iota // restore cgroup properties if only dir created by criu + CRIU_CG_MODE_FULL // always restore all cgroups and their properties + CRIU_CG_MODE_STRICT // restore all, requiring them to not present in the system + CRIU_CG_MODE_DEFAULT // the same as CRIU_CG_MODE_SOFT ) type CriuPageServerInfo struct { @@ -32,6 +32,6 @@ type CriuOpts struct { FileLocks bool // handle file locks, for safety PageServer CriuPageServerInfo // allow to dump to criu page server VethPairs []VethPairName // pass the veth to criu when restore - ManageCgroupsMode cg_mode // dump or restore cgroup mode + ManageCgroupsMode cgMode // dump or restore cgroup mode EmptyNs uint32 // don't c/r properties for namespace from this mask } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/error.go b/vendor/github.com/opencontainers/runc/libcontainer/error.go index b50aaae84ed..b06392700e9 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/error.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/error.go @@ -2,7 +2,7 @@ package libcontainer import "io" -// API error code type. +// ErrorCode is the API error code type. type ErrorCode int // API error codes. @@ -56,7 +56,7 @@ func (c ErrorCode) String() string { } } -// API Error type. +// Error is the API error type. type Error interface { error diff --git a/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go index e67b001fb0d..6cce46e0dd6 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go @@ -23,11 +23,12 @@ import ( ) const ( - stateFilename = "state.json" + stateFilename = "state.json" + execFifoFilename = "exec.fifo" ) var ( - idRegex = regexp.MustCompile(`^[\w-\.]+$`) + idRegex = regexp.MustCompile(`^[\w+-\.]+$`) maxIdLen = 1024 ) @@ -102,6 +103,15 @@ func TmpfsRoot(l *LinuxFactory) error { return nil } +// CriuPath returns an option func to configure a LinuxFactory with the +// provided criupath +func CriuPath(criupath string) func(*LinuxFactory) error { + return func(l *LinuxFactory) error { + l.CriuPath = criupath + return nil + } +} + // New returns a linux based container factory based in the root directory and // configures the factory with the provided option funcs. func New(root string, options ...func(*LinuxFactory) error) (Factory, error) { @@ -158,13 +168,34 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err if err := l.Validator.Validate(config); err != nil { return nil, newGenericError(err, ConfigInvalid) } + uid, err := config.HostUID() + if err != nil { + return nil, newGenericError(err, SystemError) + } + gid, err := config.HostGID() + if err != nil { + return nil, newGenericError(err, SystemError) + } containerRoot := filepath.Join(l.Root, id) if _, err := os.Stat(containerRoot); err == nil { return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse) } else if !os.IsNotExist(err) { return nil, newGenericError(err, SystemError) } - if err := os.MkdirAll(containerRoot, 0700); err != nil { + if err := os.MkdirAll(containerRoot, 0711); err != nil { + return nil, newGenericError(err, SystemError) + } + if err := os.Chown(containerRoot, uid, gid); err != nil { + return nil, newGenericError(err, SystemError) + } + fifoName := filepath.Join(containerRoot, execFifoFilename) + oldMask := syscall.Umask(0000) + if err := syscall.Mkfifo(fifoName, 0622); err != nil { + syscall.Umask(oldMask) + return nil, newGenericError(err, SystemError) + } + syscall.Umask(oldMask) + if err := os.Chown(fifoName, uid, gid); err != nil { return nil, newGenericError(err, SystemError) } c := &linuxContainer{ @@ -195,17 +226,18 @@ func (l *LinuxFactory) Load(id string) (Container, error) { fds: state.ExternalDescriptors, } c := &linuxContainer{ - initProcess: r, - id: id, - config: &state.Config, - initPath: l.InitPath, - initArgs: l.InitArgs, - criuPath: l.CriuPath, - cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths), - root: containerRoot, - created: state.Created, + initProcess: r, + initProcessStartTime: state.InitProcessStartTime, + id: id, + config: &state.Config, + initPath: l.InitPath, + initArgs: l.InitArgs, + criuPath: l.CriuPath, + cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths), + root: containerRoot, + created: state.Created, } - c.state = &createdState{c: c, s: Created} + c.state = &loadedState{c: c} if err := c.refreshState(); err != nil { return nil, err } @@ -219,10 +251,18 @@ func (l *LinuxFactory) Type() string { // StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state // This is a low level implementation detail of the reexec and should not be consumed externally func (l *LinuxFactory) StartInitialization() (err error) { - fdStr := os.Getenv("_LIBCONTAINER_INITPIPE") - pipefd, err := strconv.Atoi(fdStr) - if err != nil { - return fmt.Errorf("error converting env var _LIBCONTAINER_INITPIPE(%q) to an int: %s", fdStr, err) + var pipefd, rootfd int + for k, v := range map[string]*int{ + "_LIBCONTAINER_INITPIPE": &pipefd, + "_LIBCONTAINER_STATEDIR": &rootfd, + } { + s := os.Getenv(k) + + i, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("unable to convert %s=%s to int", k, s) + } + *v = i } var ( pipe = os.NewFile(uintptr(pipefd), "pipe") @@ -231,6 +271,7 @@ func (l *LinuxFactory) StartInitialization() (err error) { // clear the current process's environment to clean any libcontainer // specific env vars. os.Clearenv() + var i initer defer func() { // We have an error during the initialization of the container's init, @@ -239,24 +280,22 @@ func (l *LinuxFactory) StartInitialization() (err error) { // this defer function will never be called. if _, ok := i.(*linuxStandardInit); ok { // Synchronisation only necessary for standard init. - if err := utils.WriteJSON(pipe, syncT{procError}); err != nil { + if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil { panic(err) } } - if err := utils.WriteJSON(pipe, newSystemError(err)); err != nil { + if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil { panic(err) } // ensure that this pipe is always closed pipe.Close() }() - defer func() { if e := recover(); e != nil { err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack())) } }() - - i, err = newContainerInit(it, pipe) + i, err = newContainerInit(it, pipe, rootfd) if err != nil { return err } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/generic_error.go b/vendor/github.com/opencontainers/runc/libcontainer/generic_error.go index 3ed33da6d17..9c3d32492ba 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/generic_error.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/generic_error.go @@ -1,6 +1,7 @@ package libcontainer import ( + "fmt" "io" "text/template" "time" @@ -51,6 +52,21 @@ func newGenericError(err error, c ErrorCode) Error { } func newSystemError(err error) Error { + return createSystemError(err, "") +} + +func newSystemErrorWithCausef(err error, cause string, v ...interface{}) Error { + return createSystemError(err, fmt.Sprintf(cause, v...)) +} + +func newSystemErrorWithCause(err error, cause string) Error { + return createSystemError(err, cause) +} + +// createSystemError creates the specified error with the correct number of +// stack frames skipped. This is only to be called by the other functions for +// formatting the error. +func createSystemError(err error, cause string) Error { if le, ok := err.(Error); ok { return le } @@ -58,7 +74,8 @@ func newSystemError(err error) Error { Timestamp: time.Now(), Err: err, ECode: SystemError, - Stack: stacktrace.Capture(1), + Cause: cause, + Stack: stacktrace.Capture(2), } if err != nil { gerr.Message = err.Error() @@ -70,12 +87,17 @@ type genericError struct { Timestamp time.Time ECode ErrorCode Err error `json:"-"` + Cause string Message string Stack stacktrace.Stacktrace } func (e *genericError) Error() string { - return e.Message + if e.Cause == "" { + return e.Message + } + frame := e.Stack.Frames[0] + return fmt.Sprintf("%s:%d: %s caused %q", frame.File, frame.Line, e.Cause, e.Message) } func (e *genericError) Code() ErrorCode { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go index 0bde656e292..01ff0d133df 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go @@ -52,19 +52,21 @@ type initConfig struct { AppArmorProfile string `json:"apparmor_profile"` NoNewPrivileges bool `json:"no_new_privileges"` User string `json:"user"` + AdditionalGroups []string `json:"additional_groups"` Config *configs.Config `json:"config"` Console string `json:"console"` Networks []*network `json:"network"` PassedFilesCount int `json:"passed_files_count"` ContainerId string `json:"containerid"` Rlimits []configs.Rlimit `json:"rlimits"` + ExecFifoPath string `json:"start_pipe_path"` } type initer interface { Init() error } -func newContainerInit(t initType, pipe *os.File) (initer, error) { +func newContainerInit(t initType, pipe *os.File, stateDirFD int) (initer, error) { var config *initConfig if err := json.NewDecoder(pipe).Decode(&config); err != nil { return nil, err @@ -79,9 +81,10 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) { }, nil case initStandard: return &linuxStandardInit{ - pipe: pipe, - parentPid: syscall.Getppid(), - config: config, + pipe: pipe, + parentPid: syscall.Getppid(), + config: config, + stateDirFD: stateDirFD, }, nil } return nil, fmt.Errorf("unknown init type %q", t) @@ -211,8 +214,8 @@ func setupUser(config *initConfig) error { } var addGroups []int - if len(config.Config.AdditionalGroups) > 0 { - addGroups, err = user.GetAdditionalGroupsPath(config.Config.AdditionalGroups, groupPath) + if len(config.AdditionalGroups) > 0 { + addGroups, err = user.GetAdditionalGroupsPath(config.AdditionalGroups, groupPath) if err != nil { return err } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go b/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go index c37ca213303..8c90e56a6a4 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go @@ -1,12 +1,12 @@ // +build linux -package keyctl +package keys import ( "fmt" - "syscall" - "strings" "strconv" + "strings" + "syscall" "unsafe" ) @@ -17,7 +17,7 @@ const KEYCTL_DESCRIBE = 6 type KeySerial uint32 func JoinSessionKeyring(name string) (KeySerial, error) { - var _name *byte = nil + var _name *byte var err error if len(name) > 0 { @@ -34,7 +34,7 @@ func JoinSessionKeyring(name string) (KeySerial, error) { return KeySerial(sessKeyId), nil } -// modify permissions on a keyring by reading the current permissions, +// ModKeyringPerm modifies permissions on a keyring by reading the current permissions, // anding the bits with the given mask (clearing permissions) and setting // additional permission bits func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error { @@ -64,4 +64,3 @@ func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error { return nil } - diff --git a/vendor/github.com/opencontainers/runc/libcontainer/label/label_selinux.go b/vendor/github.com/opencontainers/runc/libcontainer/label/label_selinux.go index d443df4f7bd..4493bda7741 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/label/label_selinux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/label/label_selinux.go @@ -107,7 +107,7 @@ func SetFileLabel(path string, fileLabel string) error { return nil } -// Tell the kernel the label for all files to be created +// SetFileCreateLabel tells the kernel the label for all files to be created func SetFileCreateLabel(fileLabel string) error { if selinux.SelinuxEnabled() { return selinux.Setfscreatecon(fileLabel) @@ -115,7 +115,7 @@ func SetFileCreateLabel(fileLabel string) error { return nil } -// Change the label of path to the filelabel string. +// Relabel changes the label of path to the filelabel string. // It changes the MCS label to s0 if shared is true. // This will allow all containers to share the content. func Relabel(path string, fileLabel string, shared bool) error { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go index 166301338c1..400bd362563 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go @@ -27,7 +27,8 @@ type Int32msg struct { Value uint32 } -// int32msg has the following representation +// Serialize serializes the message. +// Int32msg has the following representation // | nlattr len | nlattr type | // | uint32 value | func (msg *Int32msg) Serialize() []byte { @@ -43,7 +44,7 @@ func (msg *Int32msg) Len() int { return syscall_NLA_HDRLEN + 4 } -// bytemsg has the following representation +// Bytemsg has the following representation // | nlattr len | nlattr type | // | value | pad | type Bytemsg struct { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/process.go b/vendor/github.com/opencontainers/runc/libcontainer/process.go index 91e8ef56f7e..334add57495 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/process.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/process.go @@ -28,6 +28,10 @@ type Process struct { // local to the container's user and group configuration. User string + // AdditionalGroups specifies the gids that should be added to supplementary groups + // in addition to those that the user belongs to. + AdditionalGroups []string + // Cwd will change the processes current working directory inside the container's rootfs. Cwd string @@ -102,8 +106,8 @@ type IO struct { } // NewConsole creates new console for process and returns it -func (p *Process) NewConsole(rootuid int) (Console, error) { - console, err := NewConsole(rootuid, rootuid) +func (p *Process) NewConsole(rootuid, rootgid int) (Console, error) { + console, err := NewConsole(rootuid, rootgid) if err != nil { return nil, err } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go index 1a2ee0bcd28..33db39239d1 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go @@ -51,6 +51,7 @@ type setnsProcess struct { fds []string process *Process bootstrapData io.Reader + rootDir *os.File } func (p *setnsProcess) startTime() (string, error) { @@ -69,48 +70,49 @@ func (p *setnsProcess) start() (err error) { defer p.parentPipe.Close() err = p.cmd.Start() p.childPipe.Close() + p.rootDir.Close() if err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "starting setns process") } if p.bootstrapData != nil { if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "copying bootstrap data to pipe") } } if err = p.execSetns(); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "executing setns process") } if len(p.cgroupPaths) > 0 { if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil { - return newSystemError(err) + return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid()) } } // set oom_score_adj if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "setting oom score") } // set rlimits, this has to be done here because we lose permissions // to raise the limits once we enter a user-namespace if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "setting rlimits for process") } if err := utils.WriteJSON(p.parentPipe, p.config); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "writing config to pipe") } if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "calling shutdown on init pipe") } // wait for the child process to fully complete and receive an error message // if one was encoutered var ierr *genericError if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { - return newSystemError(err) + return newSystemErrorWithCause(err, "decoding init error from pipe") } // Must be done after Shutdown so the child will exit and we can wait for it. if ierr != nil { p.wait() - return newSystemError(ierr) + return ierr } return nil } @@ -123,7 +125,7 @@ func (p *setnsProcess) execSetns() error { status, err := p.cmd.Process.Wait() if err != nil { p.cmd.Wait() - return newSystemError(err) + return newSystemErrorWithCause(err, "waiting on setns process to finish") } if !status.Success() { p.cmd.Wait() @@ -132,7 +134,7 @@ func (p *setnsProcess) execSetns() error { var pid *pid if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil { p.cmd.Wait() - return newSystemError(err) + return newSystemErrorWithCause(err, "reading pid from init pipe") } process, err := os.FindProcess(pid.Pid) if err != nil { @@ -186,6 +188,7 @@ type initProcess struct { process *Process bootstrapData io.Reader sharePidns bool + rootDir *os.File } func (p *initProcess) pid() int { @@ -221,6 +224,7 @@ func (p *initProcess) execSetns() error { return err } p.cmd.Process = process + p.process.ops = p return nil } @@ -229,28 +233,29 @@ func (p *initProcess) start() error { err := p.cmd.Start() p.process.ops = p p.childPipe.Close() + p.rootDir.Close() if err != nil { p.process.ops = nil - return newSystemError(err) + return newSystemErrorWithCause(err, "starting init process command") } if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { return err } if err := p.execSetns(); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "running exec setns process for init") } // Save the standard descriptor names before the container process // can potentially move them (e.g., via dup2()). If we don't do this now, // we won't know at checkpoint time which file descriptor to look up. fds, err := getPipeFds(p.pid()) if err != nil { - return newSystemError(err) + return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid()) } p.setExternalDescriptors(fds) // Do this before syncing with child so that no children // can escape the cgroup if err := p.manager.Apply(p.pid()); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "applying cgroup configuration for process") } defer func() { if err != nil { @@ -259,10 +264,10 @@ func (p *initProcess) start() error { } }() if err := p.createNetworkInterfaces(); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "creating nework interfaces") } if err := p.sendConfig(); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "sending config to init process") } var ( procSync syncT @@ -278,21 +283,21 @@ loop: if err == io.EOF { break loop } - return newSystemError(err) + return newSystemErrorWithCause(err, "decoding sync type from init pipe") } switch procSync.Type { case procReady: if err := p.manager.Set(p.config.Config); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "setting cgroup config for ready process") } // set oom_score_adj if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "setting oom score for ready process") } // set rlimits, this has to be done here because we lose permissions // to raise the limits once we enter a user-namespace if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "setting rlimits for ready process") } // call prestart hooks if !p.config.Config.Namespaces.Contains(configs.NEWNS) { @@ -303,16 +308,16 @@ loop: Pid: p.pid(), Root: p.config.Config.Rootfs, } - for _, hook := range p.config.Config.Hooks.Prestart { + for i, hook := range p.config.Config.Hooks.Prestart { if err := hook.Run(s); err != nil { - return newSystemError(err) + return newSystemErrorWithCausef(err, "running prestart hook %d", i) } } } } // Sync with child. if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "reading syncT run type") } sentRun = true case procHooks: @@ -324,22 +329,22 @@ loop: Root: p.config.Config.Rootfs, BundlePath: utils.SearchLabels(p.config.Config.Labels, "bundle"), } - for _, hook := range p.config.Config.Hooks.Prestart { + for i, hook := range p.config.Config.Hooks.Prestart { if err := hook.Run(s); err != nil { - return newSystemError(err) + return newSystemErrorWithCausef(err, "running prestart hook %d", i) } } } // Sync with child. if err := utils.WriteJSON(p.parentPipe, syncT{procResume}); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "reading syncT resume type") } sentResume = true case procError: // wait for the child process to fully complete and receive an error message // if one was encoutered if err := dec.Decode(&ierr); err != nil && err != io.EOF { - return newSystemError(err) + return newSystemErrorWithCause(err, "decoding proc error from init") } if ierr != nil { break loop @@ -347,22 +352,22 @@ loop: // Programmer error. panic("No error following JSON procError payload.") default: - return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child")) + return newSystemError(fmt.Errorf("invalid JSON payload from child")) } } if !sentRun { - return newSystemError(fmt.Errorf("could not synchronise with container process: %v", ierr)) + return newSystemErrorWithCause(ierr, "container init failed") } if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume { return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process")) } if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "shutting down init pipe") } // Must be done after Shutdown so the child will exit and we can wait for it. if ierr != nil { p.wait() - return newSystemError(ierr) + return ierr } return nil } @@ -447,7 +452,7 @@ func getPipeFds(pid int) ([]string, error) { // InitializeIO creates pipes for use with the process's STDIO // and returns the opposite side for each -func (p *Process) InitializeIO(rootuid int) (i *IO, err error) { +func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) { var fds []uintptr i = &IO{} // cleanup in case of an error @@ -479,7 +484,7 @@ func (p *Process) InitializeIO(rootuid int) (i *IO, err error) { p.Stderr, i.Stderr = w, r // change ownership of the pipes incase we are in a user namespace for _, fd := range fds { - if err := syscall.Fchown(int(fd), rootuid, rootuid); err != nil { + if err := syscall.Fchown(int(fd), rootuid, rootgid); err != nil { return nil, err } } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go index 4aa4cbd5ef2..943b2fc0998 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go @@ -25,10 +25,10 @@ import ( const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV -// setupDev returns true if /dev needs to be set up. +// needsSetupDev returns true if /dev needs to be set up. func needsSetupDev(config *configs.Config) bool { for _, m := range config.Mounts { - if m.Device == "bind" && (m.Destination == "/dev" || m.Destination == "/dev/") { + if m.Device == "bind" && libcontainerUtils.CleanPath(m.Destination) == "/dev" { return false } } @@ -39,35 +39,35 @@ func needsSetupDev(config *configs.Config) bool { // new mount namespace. func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWriter) (err error) { if err := prepareRoot(config); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "preparing rootfs") } setupDev := needsSetupDev(config) for _, m := range config.Mounts { for _, precmd := range m.PremountCmds { if err := mountCmd(precmd); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "running premount command") } } if err := mountToRootfs(m, config.Rootfs, config.MountLabel); err != nil { - return newSystemError(err) + return newSystemErrorWithCausef(err, "mounting %q to rootfs %q", m.Destination, config.Rootfs) } for _, postcmd := range m.PostmountCmds { if err := mountCmd(postcmd); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "running postmount command") } } } if setupDev { if err := createDevices(config); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "creating device nodes") } if err := setupPtmx(config, console); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "setting up ptmx") } if err := setupDevSymlinks(config.Rootfs); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "setting up /dev symlinks") } } // Signal the parent to run the pre-start hooks. @@ -78,7 +78,7 @@ func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWrit return err } if err := syscall.Chdir(config.Rootfs); err != nil { - return newSystemError(err) + return newSystemErrorWithCausef(err, "changing dir to %q", config.Rootfs) } if config.NoPivotRoot { err = msMoveRoot(config.Rootfs) @@ -86,19 +86,19 @@ func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWrit err = pivotRoot(config.Rootfs, config.PivotDir) } if err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "jailing process inside rootfs") } if setupDev { if err := reOpenDevNull(); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "reopening /dev/null inside container") } } // remount dev as ro if specifed for _, m := range config.Mounts { - if m.Destination == "/dev" { + if libcontainerUtils.CleanPath(m.Destination) == "/dev" { if m.Flags&syscall.MS_RDONLY != 0 { if err := remountReadonly(m.Destination); err != nil { - return newSystemError(err) + return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination) } } break @@ -107,7 +107,7 @@ func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWrit // set rootfs ( / ) as readonly if config.Readonlyfs { if err := setReadonly(); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "setting rootfs as readonly") } } syscall.Umask(0022) @@ -115,14 +115,12 @@ func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWrit } func mountCmd(cmd configs.Command) error { - command := exec.Command(cmd.Path, cmd.Args[:]...) command.Env = cmd.Env command.Dir = cmd.Dir if out, err := command.CombinedOutput(); err != nil { return fmt.Errorf("%#v failed: %s: %v", cmd, string(out), err) } - return nil } @@ -240,34 +238,23 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error { return err } } - // create symlinks for merged cgroups - cwd, err := os.Getwd() - if err != nil { - return err - } - if err := os.Chdir(filepath.Join(rootfs, m.Destination)); err != nil { - return err - } for _, mc := range merged { for _, ss := range strings.Split(mc, ",") { - if err := os.Symlink(mc, ss); err != nil { - // if cgroup already exists, then okay(it could have been created before) - if os.IsExist(err) { - continue - } - os.Chdir(cwd) + // symlink(2) is very dumb, it will just shove the path into + // the link and doesn't do any checks or relative path + // conversion. Also, don't error out if the cgroup already exists. + if err := os.Symlink(mc, filepath.Join(rootfs, m.Destination, ss)); err != nil && !os.IsExist(err) { return err } } } - if err := os.Chdir(cwd); err != nil { - return err - } if m.Flags&syscall.MS_RDONLY != 0 { // remount cgroup root as readonly mcgrouproot := &configs.Mount{ + Source: m.Destination, + Device: "bind", Destination: m.Destination, - Flags: defaultMountFlags | syscall.MS_RDONLY, + Flags: defaultMountFlags | syscall.MS_RDONLY | syscall.MS_BIND, } if err := remount(mcgrouproot, rootfs); err != nil { return err @@ -515,10 +502,10 @@ func getParentMount(rootfs string) (string, string, error) { } // Make parent mount private if it was shared -func rootfsParentMountPrivate(config *configs.Config) error { +func rootfsParentMountPrivate(rootfs string) error { sharedMount := false - parentMount, optionalOpts, err := getParentMount(config.Rootfs) + parentMount, optionalOpts, err := getParentMount(rootfs) if err != nil { return err } @@ -550,9 +537,10 @@ func prepareRoot(config *configs.Config) error { if err := syscall.Mount("", "/", "", uintptr(flag), ""); err != nil { return err } - - if err := rootfsParentMountPrivate(config); err != nil { - return err + if config.NoPivotRoot { + if err := rootfsParentMountPrivate(config.Rootfs); err != nil { + return err + } } return syscall.Mount(config.Rootfs, config.Rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, "") @@ -595,7 +583,14 @@ func pivotRoot(rootfs, pivotBaseDir string) (err error) { } }() if err := syscall.PivotRoot(rootfs, pivotDir); err != nil { - return fmt.Errorf("pivot_root %s", err) + // Make the parent mount private + if err := rootfsParentMountPrivate(rootfs); err != nil { + return err + } + // Try again + if err := syscall.PivotRoot(rootfs, pivotDir); err != nil { + return fmt.Errorf("pivot_root %s", err) + } } if err := syscall.Chdir("/"); err != nil { return fmt.Errorf("chdir / %s", err) @@ -705,7 +700,7 @@ func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error { data = label.FormatMountLabel(m.Data, mountLabel) flags = m.Flags ) - if dest == "/dev" { + if libcontainerUtils.CleanPath(dest) == "/dev" { flags &= ^syscall.MS_RDONLY } if !strings.HasPrefix(dest, rootfs) { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go index 3b9a7595f2b..ded5a6bbc8b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go @@ -36,6 +36,11 @@ var archs = map[string]string{ "SCMP_ARCH_MIPSEL": "mipsel", "SCMP_ARCH_MIPSEL64": "mipsel64", "SCMP_ARCH_MIPSEL64N32": "mipsel64n32", + "SCMP_ARCH_PPC": "ppc", + "SCMP_ARCH_PPC64": "ppc64", + "SCMP_ARCH_PPC64LE": "ppc64le", + "SCMP_ARCH_S390": "s390", + "SCMP_ARCH_S390X": "s390x", } // ConvertStringToOperator converts a string into a Seccomp comparison operator. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go index 888483e7687..44df1ad4c26 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go @@ -10,7 +10,7 @@ import ( var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported") -// Seccomp not supported, do nothing +// InitSeccomp does nothing because seccomp is not supported. func InitSeccomp(config *configs.Seccomp) error { if config != nil { return ErrSeccompNotEnabled diff --git a/vendor/github.com/opencontainers/runc/libcontainer/selinux/selinux.go b/vendor/github.com/opencontainers/runc/libcontainer/selinux/selinux.go index 255080c646e..2a18e2ad898 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/selinux/selinux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/selinux/selinux.go @@ -16,7 +16,6 @@ import ( "sync" "syscall" - "github.com/docker/docker/pkg/mount" "github.com/opencontainers/runc/libcontainer/system" ) @@ -60,16 +59,31 @@ func getSelinuxMountPoint() string { } selinuxfs = "" - mounts, err := mount.GetMounts() + f, err := os.Open("/proc/self/mountinfo") if err != nil { return selinuxfs } - for _, mount := range mounts { - if mount.Fstype == "selinuxfs" { - selinuxfs = mount.Mountpoint - break + defer f.Close() + + scanner := bufio.NewScanner(f) + for scanner.Scan() { + txt := scanner.Text() + // Safe as mountinfo encodes mountpoints with spaces as \040. + sepIdx := strings.Index(txt, " - ") + if sepIdx == -1 { + continue } + if !strings.Contains(txt[sepIdx:], "selinuxfs") { + continue + } + fields := strings.Split(txt, " ") + if len(fields) < 5 { + continue + } + selinuxfs = fields[4] + break } + if selinuxfs != "" { var buf syscall.Statfs_t syscall.Statfs(selinuxfs, &buf) @@ -297,7 +311,7 @@ func IntToMcs(id int, catRange uint32) string { for ORD > TIER { ORD = ORD - TIER - TIER -= 1 + TIER-- } TIER = SETSIZE - TIER ORD = ORD + TIER @@ -438,7 +452,7 @@ func badPrefix(fpath string) error { return nil } -// Change the fpath file object to the SELinux label scon. +// Chcon changes the fpath file object to the SELinux label scon. // If the fpath is a directory and recurse is true Chcon will walk the // directory tree setting the label func Chcon(fpath string, scon string, recurse bool) error { @@ -472,14 +486,14 @@ func DupSecOpt(src string) []string { con["level"] == "" { return nil } - return []string{"label:user:" + con["user"], - "label:role:" + con["role"], - "label:type:" + con["type"], - "label:level:" + con["level"]} + return []string{"label=user:" + con["user"], + "label=role:" + con["role"], + "label=type:" + con["type"], + "label=level:" + con["level"]} } // DisableSecOpt returns a security opt that can be used to disabling SELinux // labeling support for future container processes func DisableSecOpt() []string { - return []string{"label:disable"} + return []string{"label=disable"} } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go index b1a198fd13b..2a8f3452817 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go @@ -24,9 +24,11 @@ func (l *linuxSetnsInit) getSessionRingName() string { } func (l *linuxSetnsInit) Init() error { - // do not inherit the parent's session keyring - if _, err := keyctl.JoinSessionKeyring(l.getSessionRingName()); err != nil { - return err + if !l.config.Config.NoNewKeyring { + // do not inherit the parent's session keyring + if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil { + return err + } } if l.config.NoNewPrivileges { if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { @@ -44,10 +46,8 @@ func (l *linuxSetnsInit) Init() error { if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { return err } - if l.config.ProcessLabel != "" { - if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { - return err - } + if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { + return err } return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/stacktrace/capture.go b/vendor/github.com/opencontainers/runc/libcontainer/stacktrace/capture.go index 5ee6e37a3a4..0bbe1495040 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/stacktrace/capture.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/stacktrace/capture.go @@ -2,14 +2,14 @@ package stacktrace import "runtime" -// Caputure captures a stacktrace for the current calling go program +// Capture captures a stacktrace for the current calling go program // // skip is the number of frames to skip func Capture(userSkip int) Stacktrace { var ( skip = userSkip + 1 // add one for our own function frames []Frame - prevPc uintptr = 0 + prevPc uintptr ) for i := skip; ; i++ { pc, file, line, ok := runtime.Caller(i) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go index 59bd370004a..87515e1e118 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go @@ -6,6 +6,7 @@ import ( "fmt" "io" "os" + "os/exec" "syscall" "github.com/opencontainers/runc/libcontainer/apparmor" @@ -17,9 +18,10 @@ import ( ) type linuxStandardInit struct { - pipe io.ReadWriter - parentPid int - config *initConfig + pipe io.ReadWriteCloser + parentPid int + stateDirFD int + config *initConfig } func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) { @@ -43,16 +45,18 @@ func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) { const PR_SET_NO_NEW_PRIVS = 0x26 func (l *linuxStandardInit) Init() error { - ringname, keepperms, newperms := l.getSessionRingParams() + if !l.config.Config.NoNewKeyring { + ringname, keepperms, newperms := l.getSessionRingParams() - // do not inherit the parent's session keyring - sessKeyId, err := keyctl.JoinSessionKeyring(ringname) - if err != nil { - return err - } - // make session keyring searcheable - if err := keyctl.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil { - return err + // do not inherit the parent's session keyring + sessKeyId, err := keys.JoinSessionKeyring(ringname) + if err != nil { + return err + } + // make session keyring searcheable + if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil { + return err + } } var console *linuxConsole @@ -123,7 +127,10 @@ func (l *linuxStandardInit) Init() error { if err := syncParentReady(l.pipe); err != nil { return err } - if l.config.Config.Seccomp != nil { + // Without NoNewPrivileges seccomp is a privileged operation, so we need to + // do this before dropping capabilities; otherwise do it as late as possible + // just before execve so as few syscalls take place after it as possible. + if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges { if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { return err } @@ -137,11 +144,35 @@ func (l *linuxStandardInit) Init() error { return err } // compare the parent from the inital start of the init process and make sure that it did not change. - // if the parent changes that means it died and we were reparened to something else so we should + // if the parent changes that means it died and we were reparented to something else so we should // just kill ourself and not cause problems for someone else. if syscall.Getppid() != l.parentPid { return syscall.Kill(syscall.Getpid(), syscall.SIGKILL) } - - return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) + // check for the arg before waiting to make sure it exists and it is returned + // as a create time error. + name, err := exec.LookPath(l.config.Args[0]) + if err != nil { + return err + } + // close the pipe to signal that we have completed our init. + l.pipe.Close() + // wait for the fifo to be opened on the other side before + // exec'ing the users process. + fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0) + if err != nil { + return newSystemErrorWithCause(err, "openat exec fifo") + } + if _, err := syscall.Write(fd, []byte("0")); err != nil { + return newSystemErrorWithCause(err, "write 0 exec fifo") + } + if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges { + if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { + return newSystemErrorWithCause(err, "init seccomp") + } + } + if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil { + return newSystemErrorWithCause(err, "exec user process") + } + return nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go index d2618f69b25..266282404c7 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go @@ -6,6 +6,7 @@ import ( "fmt" "os" "path/filepath" + "syscall" "github.com/Sirupsen/logrus" "github.com/opencontainers/runc/libcontainer/configs" @@ -77,7 +78,7 @@ type stoppedState struct { } func (b *stoppedState) status() Status { - return Destroyed + return Stopped } func (b *stoppedState) transition(s containerState) error { @@ -110,11 +111,11 @@ func (r *runningState) status() Status { func (r *runningState) transition(s containerState) error { switch s.(type) { case *stoppedState: - running, err := r.c.isRunning() + t, err := r.c.runType() if err != nil { return err } - if running { + if t == Running { return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped) } r.c.state = s @@ -129,16 +130,40 @@ func (r *runningState) transition(s containerState) error { } func (r *runningState) destroy() error { - running, err := r.c.isRunning() + t, err := r.c.runType() if err != nil { return err } - if running { + if t == Running { return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped) } return destroy(r.c) } +type createdState struct { + c *linuxContainer +} + +func (i *createdState) status() Status { + return Created +} + +func (i *createdState) transition(s containerState) error { + switch s.(type) { + case *runningState, *pausedState, *stoppedState: + i.c.state = s + return nil + case *createdState: + return nil + } + return newStateTransitionError(i, s) +} + +func (i *createdState) destroy() error { + i.c.initProcess.signal(syscall.SIGKILL) + return destroy(i.c) +} + // pausedState represents a container that is currently pause. It cannot be destroyed in a // paused state and must transition back to running first. type pausedState struct { @@ -161,11 +186,11 @@ func (p *pausedState) transition(s containerState) error { } func (p *pausedState) destroy() error { - isRunning, err := p.c.isRunning() + t, err := p.c.runType() if err != nil { return err } - if !isRunning { + if t != Running && t != Created { if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil { return err } @@ -175,7 +200,7 @@ func (p *pausedState) destroy() error { } // restoredState is the same as the running state but also has accociated checkpoint -// information that maybe need destroyed when the container is stopped and destory is called. +// information that maybe need destroyed when the container is stopped and destroy is called. type restoredState struct { imageDir string c *linuxContainer @@ -204,23 +229,23 @@ func (r *restoredState) destroy() error { return destroy(r.c) } -// createdState is used whenever a container is restored, loaded, or setting additional +// loadedState is used whenever a container is restored, loaded, or setting additional // processes inside and it should not be destroyed when it is exiting. -type createdState struct { +type loadedState struct { c *linuxContainer s Status } -func (n *createdState) status() Status { +func (n *loadedState) status() Status { return n.s } -func (n *createdState) transition(s containerState) error { +func (n *loadedState) transition(s containerState) error { n.c.state = s return nil } -func (n *createdState) destroy() error { +func (n *loadedState) destroy() error { if err := n.c.refreshState(); err != nil { return err } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/stats_solaris.go b/vendor/github.com/opencontainers/runc/libcontainer/stats_solaris.go new file mode 100644 index 00000000000..da78c1c2e15 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/stats_solaris.go @@ -0,0 +1,7 @@ +package libcontainer + +// Solaris - TODO + +type Stats struct { + Interfaces []*NetworkInterface +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go index 8b199d92ed5..1afc52b4bdd 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go @@ -100,17 +100,12 @@ func Setctty() error { return nil } -/* - * Detect whether we are currently running in a user namespace. - * Copied from github.com/lxc/lxd/shared/util.go - */ +// RunningInUserNS detects whether we are currently running in a user namespace. +// Copied from github.com/lxc/lxd/shared/util.go func RunningInUserNS() bool { file, err := os.Open("/proc/self/uid_map") if err != nil { - /* - * This kernel-provided file only exists if user namespaces are - * supported - */ + // This kernel-provided file only exists if user namespaces are supported return false } defer file.Close() diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go index 9e748a6d6f0..3466bfcea68 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go @@ -100,3 +100,22 @@ func SearchLabels(labels []string, query string) string { } return "" } + +// Annotations returns the bundle path and user defined annotations from the +// libcontianer state. We need to remove the bundle because that is a label +// added by libcontainer. +func Annotations(labels []string) (bundle string, userAnnotations map[string]string) { + userAnnotations = make(map[string]string) + for _, l := range labels { + parts := strings.SplitN(l, "=", 2) + if len(parts) < 2 { + continue + } + if parts[0] == "bundle" { + bundle = parts[1] + } else { + userAnnotations[parts[0]] = parts[1] + } + } + return +}