Merge pull request #29928 from dubstack/bump-libcontainer

Automatic merge from submit-queue

Bump Libcontainer to latest head

@Random-Liu or @yujuhong, could one of you please do a quick review?

I updated libcontainer in a previous PR, but #29492 reverted those changes. This is needed for #27204.

Signed-off-by: Buddha Prakash <buddhap@google.com>
This commit is contained in:
Kubernetes Submit Queue 2016-08-04 15:12:13 -07:00 committed by GitHub
commit d10e47b891
45 changed files with 874 additions and 415 deletions

64
Godeps/Godeps.json generated
View File

@ -1643,83 +1643,83 @@
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer", "ImportPath": "github.com/opencontainers/runc/libcontainer",
"Comment": "v0.1.1", "Comment": "v1.0.0-rc1-100-g142df38",
"Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/apparmor", "ImportPath": "github.com/opencontainers/runc/libcontainer/apparmor",
"Comment": "v0.1.1", "Comment": "v1.0.0-rc1-100-g142df38",
"Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups", "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups",
"Comment": "v0.1.1", "Comment": "v1.0.0-rc1-100-g142df38",
"Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/fs", "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/fs",
"Comment": "v0.1.1", "Comment": "v1.0.0-rc1-100-g142df38",
"Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/systemd", "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/systemd",
"Comment": "v0.1.1", "Comment": "v1.0.0-rc1-100-g142df38",
"Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/configs", "ImportPath": "github.com/opencontainers/runc/libcontainer/configs",
"Comment": "v0.1.1", "Comment": "v1.0.0-rc1-100-g142df38",
"Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/configs/validate", "ImportPath": "github.com/opencontainers/runc/libcontainer/configs/validate",
"Comment": "v0.1.1", "Comment": "v1.0.0-rc1-100-g142df38",
"Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/criurpc", "ImportPath": "github.com/opencontainers/runc/libcontainer/criurpc",
"Comment": "v0.1.1", "Comment": "v1.0.0-rc1-100-g142df38",
"Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/keys", "ImportPath": "github.com/opencontainers/runc/libcontainer/keys",
"Comment": "v0.1.1", "Comment": "v1.0.0-rc1-100-g142df38",
"Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/label", "ImportPath": "github.com/opencontainers/runc/libcontainer/label",
"Comment": "v0.1.1", "Comment": "v1.0.0-rc1-100-g142df38",
"Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/seccomp", "ImportPath": "github.com/opencontainers/runc/libcontainer/seccomp",
"Comment": "v0.1.1", "Comment": "v1.0.0-rc1-100-g142df38",
"Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/selinux", "ImportPath": "github.com/opencontainers/runc/libcontainer/selinux",
"Comment": "v0.1.1", "Comment": "v1.0.0-rc1-100-g142df38",
"Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/stacktrace", "ImportPath": "github.com/opencontainers/runc/libcontainer/stacktrace",
"Comment": "v0.1.1", "Comment": "v1.0.0-rc1-100-g142df38",
"Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/system", "ImportPath": "github.com/opencontainers/runc/libcontainer/system",
"Comment": "v0.1.1", "Comment": "v1.0.0-rc1-100-g142df38",
"Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/user", "ImportPath": "github.com/opencontainers/runc/libcontainer/user",
"Comment": "v0.1.1", "Comment": "v1.0.0-rc1-100-g142df38",
"Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/utils", "ImportPath": "github.com/opencontainers/runc/libcontainer/utils",
"Comment": "v0.1.1", "Comment": "v1.0.0-rc1-100-g142df38",
"Rev": "baf6536d6259209c3edfa2b22237af82942d3dfa" "Rev": "142df3836b740af53dc6da59eed8dbc92f62917c"
}, },
{ {
"ImportPath": "github.com/pborman/uuid", "ImportPath": "github.com/pborman/uuid",

View File

@ -188,12 +188,13 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
// Create a cgroup container manager. // Create a cgroup container manager.
func createManager(containerName string) *fs.Manager { func createManager(containerName string) *fs.Manager {
allowAllDevices := true
return &fs.Manager{ return &fs.Manager{
Cgroups: &configs.Cgroup{ Cgroups: &configs.Cgroup{
Parent: "/", Parent: "/",
Name: containerName, Name: containerName,
Resources: &configs.Resources{ Resources: &configs.Resources{
AllowAllDevices: true, AllowAllDevices: &allowAllDevices,
}, },
}, },
} }
@ -319,7 +320,7 @@ func (cm *containerManagerImpl) setupNode() error {
} }
glog.V(2).Infof("Configure resource-only container %s with memory limit: %d", cm.RuntimeCgroupsName, memoryLimit) glog.V(2).Infof("Configure resource-only container %s with memory limit: %d", cm.RuntimeCgroupsName, memoryLimit)
allowAllDevices := true
dockerContainer := &fs.Manager{ dockerContainer := &fs.Manager{
Cgroups: &configs.Cgroup{ Cgroups: &configs.Cgroup{
Parent: "/", Parent: "/",
@ -327,7 +328,7 @@ func (cm *containerManagerImpl) setupNode() error {
Resources: &configs.Resources{ Resources: &configs.Resources{
Memory: memoryLimit, Memory: memoryLimit,
MemorySwap: -1, MemorySwap: -1,
AllowAllDevices: true, AllowAllDevices: &allowAllDevices,
}, },
}, },
} }
@ -370,12 +371,13 @@ func (cm *containerManagerImpl) setupNode() error {
if cm.KubeletCgroupsName != "" { if cm.KubeletCgroupsName != "" {
cont := newSystemCgroups(cm.KubeletCgroupsName) cont := newSystemCgroups(cm.KubeletCgroupsName)
allowAllDevices := true
manager := fs.Manager{ manager := fs.Manager{
Cgroups: &configs.Cgroup{ Cgroups: &configs.Cgroup{
Parent: "/", Parent: "/",
Name: cm.KubeletCgroupsName, Name: cm.KubeletCgroupsName,
Resources: &configs.Resources{ Resources: &configs.Resources{
AllowAllDevices: true, AllowAllDevices: &allowAllDevices,
}, },
}, },
} }

View File

@ -30,12 +30,13 @@ import (
// //
// containerName must be an absolute container name. // containerName must be an absolute container name.
func RunInResourceContainer(containerName string) error { func RunInResourceContainer(containerName string) error {
allowAllDevices := true
manager := fs.Manager{ manager := fs.Manager{
Cgroups: &configs.Cgroup{ Cgroups: &configs.Cgroup{
Parent: "/", Parent: "/",
Name: containerName, Name: containerName,
Resources: &configs.Resources{ Resources: &configs.Resources{
AllowAllDevices: true, AllowAllDevices: &allowAllDevices,
}, },
}, },
} }

View File

@ -77,7 +77,7 @@ config := &configs.Config{
Parent: "system", Parent: "system",
Resources: &configs.Resources{ Resources: &configs.Resources{
MemorySwappiness: nil, MemorySwappiness: nil,
AllowAllDevices: false, AllowAllDevices: nil,
AllowedDevices: configs.DefaultAllowedDevices, AllowedDevices: configs.DefaultAllowedDevices,
}, },
}, },
@ -186,8 +186,8 @@ process := &libcontainer.Process{
err := container.Start(process) err := container.Start(process)
if err != nil { if err != nil {
logrus.Fatal(err)
container.Destroy() container.Destroy()
logrus.Fatal(err)
return return
} }
@ -219,6 +219,9 @@ container.Resume()
// send signal to container's init process. // send signal to container's init process.
container.Signal(signal) container.Signal(signal)
// update container resource constraints.
container.Set(config)
``` ```

View File

@ -90,7 +90,7 @@ in tmpfs.
After `/dev/null` has been setup we check for any external links between After `/dev/null` has been setup we check for any external links between
the container's io, STDIN, STDOUT, STDERR. If the container's io is pointing the container's io, STDIN, STDOUT, STDERR. If the container's io is pointing
to `/dev/null` outside the container we close and `dup2` the the `/dev/null` to `/dev/null` outside the container we close and `dup2` the `/dev/null`
that is local to the container's rootfs. that is local to the container's rootfs.
@ -297,7 +297,7 @@ a container.
| -------------- | ------------------------------------------------------------------ | | -------------- | ------------------------------------------------------------------ |
| Get processes | Return all the pids for processes running inside a container | | Get processes | Return all the pids for processes running inside a container |
| Get Stats | Return resource statistics for the container as a whole | | Get Stats | Return resource statistics for the container as a whole |
| Wait | Wait waits on the container's init process ( pid 1 ) | | Wait | Waits on the container's init process ( pid 1 ) |
| Wait Process | Wait on any of the container's processes returning the exit status | | Wait Process | Wait on any of the container's processes returning the exit status |
| Destroy | Kill the container's init process and remove any filesystem state | | Destroy | Kill the container's init process and remove any filesystem state |
| Signal | Send a signal to the container's init process | | Signal | Send a signal to the container's init process |

View File

@ -7,6 +7,7 @@ package apparmor
// #include <stdlib.h> // #include <stdlib.h>
import "C" import "C"
import ( import (
"fmt"
"io/ioutil" "io/ioutil"
"os" "os"
"unsafe" "unsafe"
@ -32,7 +33,7 @@ func ApplyProfile(name string) error {
cName := C.CString(name) cName := C.CString(name)
defer C.free(unsafe.Pointer(cName)) defer C.free(unsafe.Pointer(cName))
if _, err := C.aa_change_onexec(cName); err != nil { if _, err := C.aa_change_onexec(cName); err != nil {
return err return fmt.Errorf("apparmor failed to apply profile: %s", err)
} }
return nil return nil
} }

View File

@ -9,7 +9,6 @@ import (
"io/ioutil" "io/ioutil"
"os" "os"
"path/filepath" "path/filepath"
"strconv"
"sync" "sync"
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
@ -33,7 +32,6 @@ var (
&FreezerGroup{}, &FreezerGroup{},
&NameGroup{GroupName: "name=systemd", Join: true}, &NameGroup{GroupName: "name=systemd", Join: true},
} }
CgroupProcesses = "cgroup.procs"
HugePageSizes, _ = cgroups.GetHugePageSize() HugePageSizes, _ = cgroups.GetHugePageSize()
) )
@ -142,7 +140,9 @@ func (m *Manager) Apply(pid int) (err error) {
// created then join consists of writing the process pids to cgroup.procs // created then join consists of writing the process pids to cgroup.procs
p, err := d.path(sys.Name()) p, err := d.path(sys.Name())
if err != nil { if err != nil {
if cgroups.IsNotFound(err) { // The non-presence of the devices subsystem is
// considered fatal for security reasons.
if cgroups.IsNotFound(err) && sys.Name() != "devices" {
continue continue
} }
return err return err
@ -190,6 +190,11 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
} }
func (m *Manager) Set(container *configs.Config) error { func (m *Manager) Set(container *configs.Config) error {
// If Paths are set, then we are just joining cgroups paths
// and there is no need to set any values.
if m.Cgroups.Paths != nil {
return nil
}
for _, sys := range subsystems { for _, sys := range subsystems {
// Generate fake cgroup data. // Generate fake cgroup data.
d, err := getCgroupData(container.Cgroups, -1) d, err := getCgroupData(container.Cgroups, -1)
@ -339,7 +344,7 @@ func (raw *cgroupData) join(subsystem string) (string, error) {
if err := os.MkdirAll(path, 0755); err != nil { if err := os.MkdirAll(path, 0755); err != nil {
return "", err return "", err
} }
if err := writeFile(path, CgroupProcesses, strconv.Itoa(raw.pid)); err != nil { if err := cgroups.WriteCgroupProc(path, raw.pid); err != nil {
return "", err return "", err
} }
return path, nil return path, nil
@ -349,7 +354,7 @@ func writeFile(dir, file, data string) error {
// Normally dir should not be empty, one case is that cgroup subsystem // Normally dir should not be empty, one case is that cgroup subsystem
// is not mounted, we will get empty dir, and we want it fail here. // is not mounted, we will get empty dir, and we want it fail here.
if dir == "" { if dir == "" {
return fmt.Errorf("no such directory for %s.", file) return fmt.Errorf("no such directory for %s", file)
} }
if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700); err != nil { if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700); err != nil {
return fmt.Errorf("failed to write %v to %v: %v", data, file, err) return fmt.Errorf("failed to write %v to %v: %v", data, file, err)

View File

@ -8,7 +8,6 @@ import (
"io/ioutil" "io/ioutil"
"os" "os"
"path/filepath" "path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
@ -67,7 +66,7 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro
} }
// because we are not using d.join we need to place the pid into the procs file // because we are not using d.join we need to place the pid into the procs file
// unlike the other subsystems // unlike the other subsystems
if err := writeFile(dir, "cgroup.procs", strconv.Itoa(pid)); err != nil { if err := cgroups.WriteCgroupProc(dir, pid); err != nil {
return err return err
} }

View File

@ -43,7 +43,8 @@ func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
} }
return nil return nil
} }
if !cgroup.Resources.AllowAllDevices { if cgroup.Resources.AllowAllDevices != nil {
if *cgroup.Resources.AllowAllDevices == false {
if err := writeFile(path, "devices.deny", "a"); err != nil { if err := writeFile(path, "devices.deny", "a"); err != nil {
return err return err
} }
@ -59,6 +60,7 @@ func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
if err := writeFile(path, "devices.allow", "a"); err != nil { if err := writeFile(path, "devices.allow", "a"); err != nil {
return err return err
} }
}
for _, dev := range cgroup.Resources.DeniedDevices { for _, dev := range cgroup.Resources.DeniedDevices {
if err := writeFile(path, "devices.deny", dev.CgroupString()); err != nil { if err := writeFile(path, "devices.deny", dev.CgroupString()); err != nil {

View File

@ -5,15 +5,21 @@ package fs
import ( import (
"bufio" "bufio"
"fmt" "fmt"
"io/ioutil"
"os" "os"
"path/filepath" "path/filepath"
"strconv" "strconv"
"strings" "strings"
"syscall"
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
) )
const (
cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes"
)
type MemoryGroup struct { type MemoryGroup struct {
} }
@ -32,13 +38,10 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
return err return err
} }
} }
// We have to set kernel memory here, as we can't change it once if err := EnableKernelMemoryAccounting(path); err != nil {
// processes have been attached.
if err := s.SetKernelMemory(path, d.config); err != nil {
return err return err
} }
} }
defer func() { defer func() {
if err != nil { if err != nil {
os.RemoveAll(path) os.RemoveAll(path)
@ -54,13 +57,43 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
return nil return nil
} }
func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error { func EnableKernelMemoryAccounting(path string) error {
// This has to be done separately because it has special constraints (it // Check if kernel memory is enabled
// can't be done after there are processes attached to the cgroup). // We have to limit the kernel memory here as it won't be accounted at all
if cgroup.Resources.KernelMemory > 0 { // until a limit is set on the cgroup and limit cannot be set once the
if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil { // cgroup has children, or if there are already tasks in the cgroup.
kernelMemoryLimit := int64(1)
if err := setKernelMemory(path, kernelMemoryLimit); err != nil {
return err return err
} }
kernelMemoryLimit = int64(-1)
if err := setKernelMemory(path, kernelMemoryLimit); err != nil {
return err
}
return nil
}
func setKernelMemory(path string, kernelMemoryLimit int64) error {
if path == "" {
return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit)
}
if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) {
// kernel memory is not enabled on the system so we should do nothing
return nil
}
if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil {
// Check if the error number returned by the syscall is "EBUSY"
// The EBUSY signal is returned on attempts to write to the
// memory.kmem.limit_in_bytes file if the cgroup has children or
// once tasks have been attached to the cgroup
if pathErr, ok := err.(*os.PathError); ok {
if errNo, ok := pathErr.Err.(syscall.Errno); ok {
if errNo == syscall.EBUSY {
return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit)
}
}
}
return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err)
} }
return nil return nil
} }
@ -113,11 +146,18 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
return err return err
} }
if cgroup.Resources.KernelMemory != 0 {
if err := setKernelMemory(path, cgroup.Resources.KernelMemory); err != nil {
return err
}
}
if cgroup.Resources.MemoryReservation != 0 { if cgroup.Resources.MemoryReservation != 0 {
if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil { if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil {
return err return err
} }
} }
if cgroup.Resources.KernelMemoryTCP != 0 { if cgroup.Resources.KernelMemoryTCP != 0 {
if err := writeFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil { if err := writeFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil {
return err return err

View File

@ -3,6 +3,8 @@
package fs package fs
import ( import (
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
) )
@ -23,8 +25,8 @@ func (s *NetClsGroup) Apply(d *cgroupData) error {
} }
func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error { func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.NetClsClassid != "" { if cgroup.Resources.NetClsClassid != 0 {
if err := writeFile(path, "net_cls.classid", cgroup.Resources.NetClsClassid); err != nil { if err := writeFile(path, "net_cls.classid", strconv.FormatUint(uint64(cgroup.Resources.NetClsClassid), 10)); err != nil {
return err return err
} }
} }

View File

@ -12,7 +12,6 @@ import (
) )
var ( var (
ErrNotSupportStat = errors.New("stats are not supported for subsystem")
ErrNotValidFormat = errors.New("line is not a valid key value format") ErrNotValidFormat = errors.New("line is not a valid key value format")
) )

View File

@ -11,6 +11,7 @@ type ThrottlingData struct {
ThrottledTime uint64 `json:"throttled_time,omitempty"` ThrottledTime uint64 `json:"throttled_time,omitempty"`
} }
// CpuUsage denotes the usage of a CPU.
// All CPU stats are aggregate since container inception. // All CPU stats are aggregate since container inception.
type CpuUsage struct { type CpuUsage struct {
// Total CPU time consumed. // Total CPU time consumed.

View File

@ -74,6 +74,7 @@ var (
theConn *systemdDbus.Conn theConn *systemdDbus.Conn
hasStartTransientUnit bool hasStartTransientUnit bool
hasTransientDefaultDependencies bool hasTransientDefaultDependencies bool
hasDelegate bool
) )
func newProp(name string, units interface{}) systemdDbus.Property { func newProp(name string, units interface{}) systemdDbus.Property {
@ -146,6 +147,20 @@ func UseSystemd() bool {
// Not critical because of the stop unit logic above. // Not critical because of the stop unit logic above.
theConn.StopUnit(scope, "replace", nil) theConn.StopUnit(scope, "replace", nil)
// Assume StartTransientUnit on a scope allows Delegate
hasDelegate = true
dl := newProp("Delegate", true)
if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{dl}, nil); err != nil {
if dbusError, ok := err.(dbus.Error); ok {
if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") {
hasDelegate = false
}
}
}
// Not critical because of the stop unit logic above.
theConn.StopUnit(scope, "replace", nil)
} }
return hasStartTransientUnit return hasStartTransientUnit
} }
@ -183,10 +198,13 @@ func (m *Manager) Apply(pid int) error {
systemdDbus.PropSlice(slice), systemdDbus.PropSlice(slice),
systemdDbus.PropDescription("docker container "+c.Name), systemdDbus.PropDescription("docker container "+c.Name),
newProp("PIDs", []uint32{uint32(pid)}), newProp("PIDs", []uint32{uint32(pid)}),
// This is only supported on systemd versions 218 and above.
newProp("Delegate", true),
) )
if hasDelegate {
// This is only supported on systemd versions 218 and above.
properties = append(properties, newProp("Delegate", true))
}
// Always enable accounting, this gets us the same behaviour as the fs implementation, // Always enable accounting, this gets us the same behaviour as the fs implementation,
// plus the kernel has some problems with joining the memory cgroup at a later time. // plus the kernel has some problems with joining the memory cgroup at a later time.
properties = append(properties, properties = append(properties,
@ -214,11 +232,9 @@ func (m *Manager) Apply(pid int) error {
newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight))) newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
} }
// We need to set kernel memory before processes join cgroup because // We have to set kernel memory here, as we can't change it once
// kmem.limit_in_bytes can only be set when the cgroup is empty. // processes have been attached to the cgroup.
// And swap memory limit needs to be set after memory limit, only if c.Resources.KernelMemory != 0 {
// memory limit is handled by systemd, so it's kind of ugly here.
if c.Resources.KernelMemory > 0 {
if err := setKernelMemory(c); err != nil { if err := setKernelMemory(c); err != nil {
return err return err
} }
@ -273,7 +289,7 @@ func writeFile(dir, file, data string) error {
// Normally dir should not be empty, one case is that cgroup subsystem // Normally dir should not be empty, one case is that cgroup subsystem
// is not mounted, we will get empty dir, and we want it fail here. // is not mounted, we will get empty dir, and we want it fail here.
if dir == "" { if dir == "" {
return fmt.Errorf("no such directory for %s.", file) return fmt.Errorf("no such directory for %s", file)
} }
return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700)
} }
@ -372,6 +388,8 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
if err != nil { if err != nil {
return "", err return "", err
} }
// if pid 1 is systemd 226 or later, it will be in init.scope, not the root
initPath = strings.TrimSuffix(filepath.Clean(initPath), "init.scope")
slice := "system.slice" slice := "system.slice"
if c.Parent != "" { if c.Parent != "" {
@ -439,6 +457,11 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
} }
func (m *Manager) Set(container *configs.Config) error { func (m *Manager) Set(container *configs.Config) error {
// If Paths are set, then we are just joining cgroups paths
// and there is no need to set any values.
if m.Cgroups.Paths != nil {
return nil
}
for _, sys := range subsystems { for _, sys := range subsystems {
// Get the subsystem path, but don't error out for not found cgroups. // Get the subsystem path, but don't error out for not found cgroups.
path, err := getSubsystemPath(container.Cgroups, sys.Name()) path, err := getSubsystemPath(container.Cgroups, sys.Name())
@ -472,8 +495,5 @@ func setKernelMemory(c *configs.Cgroup) error {
if err := os.MkdirAll(path, 0755); err != nil { if err := os.MkdirAll(path, 0755); err != nil {
return err return err
} }
return fs.EnableKernelMemoryAccounting(path)
// This doesn't get called by manager.Set, so we need to do it here.
s := &fs.MemoryGroup{}
return s.SetKernelMemory(path, c)
} }

View File

@ -16,13 +16,19 @@ import (
"github.com/docker/go-units" "github.com/docker/go-units"
) )
const cgroupNamePrefix = "name=" const (
cgroupNamePrefix = "name="
CgroupProcesses = "cgroup.procs"
)
// https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt // https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
func FindCgroupMountpoint(subsystem string) (string, error) { func FindCgroupMountpoint(subsystem string) (string, error) {
// We are not using mount.GetMounts() because it's super-inefficient, // We are not using mount.GetMounts() because it's super-inefficient,
// parsing it directly sped up x10 times because of not using Sscanf. // parsing it directly sped up x10 times because of not using Sscanf.
// It was one of two major performance drawbacks in container start. // It was one of two major performance drawbacks in container start.
if !isSubsystemAvailable(subsystem) {
return "", NewNotFoundError(subsystem)
}
f, err := os.Open("/proc/self/mountinfo") f, err := os.Open("/proc/self/mountinfo")
if err != nil { if err != nil {
return "", err return "", err
@ -47,6 +53,9 @@ func FindCgroupMountpoint(subsystem string) (string, error) {
} }
func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) { func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) {
if !isSubsystemAvailable(subsystem) {
return "", "", NewNotFoundError(subsystem)
}
f, err := os.Open("/proc/self/mountinfo") f, err := os.Open("/proc/self/mountinfo")
if err != nil { if err != nil {
return "", "", err return "", "", err
@ -70,6 +79,15 @@ func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) {
return "", "", NewNotFoundError(subsystem) return "", "", NewNotFoundError(subsystem)
} }
// isSubsystemAvailable reports whether subsystem appears as a key in the map
// that ParseCgroupFile builds from /proc/self/cgroup. If that file cannot be
// parsed, the subsystem is reported as unavailable.
func isSubsystemAvailable(subsystem string) bool {
	known, parseErr := ParseCgroupFile("/proc/self/cgroup")
	if parseErr != nil {
		return false
	}
	if _, present := known[subsystem]; present {
		return true
	}
	return false
}
func FindCgroupMountpointDir() (string, error) { func FindCgroupMountpointDir() (string, error) {
f, err := os.Open("/proc/self/mountinfo") f, err := os.Open("/proc/self/mountinfo")
if err != nil { if err != nil {
@ -124,7 +142,8 @@ func (m Mount) GetThisCgroupDir(cgroups map[string]string) (string, error) {
func getCgroupMountsHelper(ss map[string]bool, mi io.Reader) ([]Mount, error) { func getCgroupMountsHelper(ss map[string]bool, mi io.Reader) ([]Mount, error) {
res := make([]Mount, 0, len(ss)) res := make([]Mount, 0, len(ss))
scanner := bufio.NewScanner(mi) scanner := bufio.NewScanner(mi)
for scanner.Scan() { numFound := 0
for scanner.Scan() && numFound < len(ss) {
txt := scanner.Text() txt := scanner.Text()
sepIdx := strings.Index(txt, " - ") sepIdx := strings.Index(txt, " - ")
if sepIdx == -1 { if sepIdx == -1 {
@ -139,12 +158,15 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader) ([]Mount, error) {
Root: fields[3], Root: fields[3],
} }
for _, opt := range strings.Split(fields[len(fields)-1], ",") { for _, opt := range strings.Split(fields[len(fields)-1], ",") {
if !ss[opt] {
continue
}
if strings.HasPrefix(opt, cgroupNamePrefix) { if strings.HasPrefix(opt, cgroupNamePrefix) {
m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):]) m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
} } else {
if ss[opt] {
m.Subsystems = append(m.Subsystems, opt) m.Subsystems = append(m.Subsystems, opt)
} }
numFound++
} }
res = append(res, m) res = append(res, m)
} }
@ -161,19 +183,19 @@ func GetCgroupMounts() ([]Mount, error) {
} }
defer f.Close() defer f.Close()
all, err := GetAllSubsystems() all, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil { if err != nil {
return nil, err return nil, err
} }
allMap := make(map[string]bool) allMap := make(map[string]bool)
for _, s := range all { for s := range all {
allMap[s] = true allMap[s] = true
} }
return getCgroupMountsHelper(allMap, f) return getCgroupMountsHelper(allMap, f)
} }
// Returns all the cgroup subsystems supported by the kernel // GetAllSubsystems returns all the cgroup subsystems supported by the kernel
func GetAllSubsystems() ([]string, error) { func GetAllSubsystems() ([]string, error) {
f, err := os.Open("/proc/cgroups") f, err := os.Open("/proc/cgroups")
if err != nil { if err != nil {
@ -199,7 +221,7 @@ func GetAllSubsystems() ([]string, error) {
return subsystems, nil return subsystems, nil
} }
// Returns the relative path to the cgroup docker is running in. // GetThisCgroupDir returns the relative path to the cgroup docker is running in.
func GetThisCgroupDir(subsystem string) (string, error) { func GetThisCgroupDir(subsystem string) (string, error) {
cgroups, err := ParseCgroupFile("/proc/self/cgroup") cgroups, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil { if err != nil {
@ -220,7 +242,7 @@ func GetInitCgroupDir(subsystem string) (string, error) {
} }
func readProcsFile(dir string) ([]int, error) { func readProcsFile(dir string) ([]int, error) {
f, err := os.Open(filepath.Join(dir, "cgroup.procs")) f, err := os.Open(filepath.Join(dir, CgroupProcesses))
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -243,6 +265,8 @@ func readProcsFile(dir string) ([]int, error) {
return out, nil return out, nil
} }
// ParseCgroupFile parses the given cgroup file, typically from
// /proc/<pid>/cgroup, into a map of subgroups to cgroup names.
func ParseCgroupFile(path string) (map[string]string, error) { func ParseCgroupFile(path string) (map[string]string, error) {
f, err := os.Open(path) f, err := os.Open(path)
if err != nil { if err != nil {
@ -250,7 +274,12 @@ func ParseCgroupFile(path string) (map[string]string, error) {
} }
defer f.Close() defer f.Close()
s := bufio.NewScanner(f) return parseCgroupFromReader(f)
}
// helper function for ParseCgroupFile to make testing easier
func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
s := bufio.NewScanner(r)
cgroups := make(map[string]string) cgroups := make(map[string]string)
for s.Scan() { for s.Scan() {
@ -259,7 +288,16 @@ func ParseCgroupFile(path string) (map[string]string, error) {
} }
text := s.Text() text := s.Text()
parts := strings.Split(text, ":") // from cgroups(7):
// /proc/[pid]/cgroup
// ...
// For each cgroup hierarchy ... there is one entry
// containing three colon-separated fields of the form:
// hierarchy-ID:subsystem-list:cgroup-path
parts := strings.SplitN(text, ":", 3)
if len(parts) < 3 {
return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text)
}
for _, subs := range strings.Split(parts[1], ",") { for _, subs := range strings.Split(parts[1], ",") {
cgroups[subs] = parts[2] cgroups[subs] = parts[2]
@ -291,8 +329,7 @@ func PathExists(path string) bool {
func EnterPid(cgroupPaths map[string]string, pid int) error { func EnterPid(cgroupPaths map[string]string, pid int) error {
for _, path := range cgroupPaths { for _, path := range cgroupPaths {
if PathExists(path) { if PathExists(path) {
if err := ioutil.WriteFile(filepath.Join(path, "cgroup.procs"), if err := WriteCgroupProc(path, pid); err != nil {
[]byte(strconv.Itoa(pid)), 0700); err != nil {
return err return err
} }
} }
@ -361,7 +398,7 @@ func GetAllPids(path string) ([]int, error) {
// collect pids from all sub-cgroups // collect pids from all sub-cgroups
err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error { err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error {
dir, file := filepath.Split(p) dir, file := filepath.Split(p)
if file != "cgroup.procs" { if file != CgroupProcesses {
return nil return nil
} }
if iErr != nil { if iErr != nil {
@ -376,3 +413,20 @@ func GetAllPids(path string) ([]int, error) {
}) })
return pids, err return pids, err
} }
// WriteCgroupProc attaches pid to the cgroup rooted at dir by writing the
// pid, formatted as a decimal string, into that directory's CgroupProcesses
// file.
//
// An empty dir is rejected with an error: it is what callers end up with
// when the cgroup subsystem is not mounted, and that case must fail here.
// A pid of -1 means "attach nothing"; the call then succeeds without
// touching the filesystem.
func WriteCgroupProc(dir string, pid int) error {
	if dir == "" {
		return fmt.Errorf("no such directory for %s", CgroupProcesses)
	}
	if pid == -1 {
		// Nothing to attach.
		return nil
	}

	procsFile := filepath.Join(dir, CgroupProcesses)
	payload := []byte(strconv.Itoa(pid))
	if err := ioutil.WriteFile(procsFile, payload, 0700); err != nil {
		return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
	}
	return nil
}

View File

@ -36,7 +36,7 @@ type Cgroup struct {
type Resources struct { type Resources struct {
// If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list. // If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list.
// Deprecated // Deprecated
AllowAllDevices bool `json:"allow_all_devices,omitempty"` AllowAllDevices *bool `json:"allow_all_devices,omitempty"`
// Deprecated // Deprecated
AllowedDevices []*Device `json:"allowed_devices,omitempty"` AllowedDevices []*Device `json:"allowed_devices,omitempty"`
// Deprecated // Deprecated
@ -69,10 +69,10 @@ type Resources struct {
CpuPeriod int64 `json:"cpu_period"` CpuPeriod int64 `json:"cpu_period"`
// How many time CPU will use in realtime scheduling (in usecs). // How many time CPU will use in realtime scheduling (in usecs).
CpuRtRuntime int64 `json:"cpu_quota"` CpuRtRuntime int64 `json:"cpu_rt_quota"`
// CPU period to be used for realtime scheduling (in usecs). // CPU period to be used for realtime scheduling (in usecs).
CpuRtPeriod int64 `json:"cpu_period"` CpuRtPeriod int64 `json:"cpu_rt_period"`
// CPU to use // CPU to use
CpusetCpus string `json:"cpuset_cpus"` CpusetCpus string `json:"cpuset_cpus"`
@ -120,5 +120,5 @@ type Resources struct {
NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"` NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"`
// Set class identifier for container's network packets // Set class identifier for container's network packets
NetClsClassid string `json:"net_cls_classid"` NetClsClassid uint32 `json:"net_cls_classid"`
} }

View File

@ -33,7 +33,7 @@ type Seccomp struct {
Syscalls []*Syscall `json:"syscalls"` Syscalls []*Syscall `json:"syscalls"`
} }
// An action to be taken upon rule match in Seccomp // Action is taken upon rule match in Seccomp
type Action int type Action int
const ( const (
@ -44,7 +44,7 @@ const (
Trace Trace
) )
// A comparison operator to be used when matching syscall arguments in Seccomp // Operator is a comparison operator to be used when matching syscall arguments in Seccomp
type Operator int type Operator int
const ( const (
@ -57,7 +57,7 @@ const (
MaskEqualTo MaskEqualTo
) )
// A rule to match a specific syscall argument in Seccomp // Arg is a rule to match a specific syscall argument in Seccomp
type Arg struct { type Arg struct {
Index uint `json:"index"` Index uint `json:"index"`
Value uint64 `json:"value"` Value uint64 `json:"value"`
@ -65,7 +65,7 @@ type Arg struct {
Op Operator `json:"op"` Op Operator `json:"op"`
} }
// An rule to match a syscall in Seccomp // Syscall is a rule to match a syscall in Seccomp
type Syscall struct { type Syscall struct {
Name string `json:"name"` Name string `json:"name"`
Action Action `json:"action"` Action Action `json:"action"`
@ -148,10 +148,6 @@ type Config struct {
// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/ // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
OomScoreAdj int `json:"oom_score_adj"` OomScoreAdj int `json:"oom_score_adj"`
// AdditionalGroups specifies the gids that should be added to supplementary groups
// in addition to those that the user belongs to.
AdditionalGroups []string `json:"additional_groups"`
// UidMappings is an array of User ID mappings for User Namespaces // UidMappings is an array of User ID mappings for User Namespaces
UidMappings []IDMap `json:"uid_mappings"` UidMappings []IDMap `json:"uid_mappings"`
@ -187,6 +183,10 @@ type Config struct {
// Labels are user defined metadata that is stored in the config and populated on the state // Labels are user defined metadata that is stored in the config and populated on the state
Labels []string `json:"labels"` Labels []string `json:"labels"`
// NoNewKeyring will not allocate a new session keyring for the container. It will use the
// caller's keyring in this case.
NoNewKeyring bool `json:"no_new_keyring"`
} }
type Hooks struct { type Hooks struct {
@ -261,7 +261,7 @@ type Hook interface {
Run(HookState) error Run(HookState) error
} }
// NewFunctionHooks will call the provided function when the hook is run. // NewFunctionHook will call the provided function when the hook is run.
func NewFunctionHook(f func(HookState) error) FuncHook { func NewFunctionHook(f func(HookState) error) FuncHook {
return FuncHook{ return FuncHook{
run: f, run: f,
@ -284,7 +284,7 @@ type Command struct {
Timeout *time.Duration `json:"timeout"` Timeout *time.Duration `json:"timeout"`
} }
// NewCommandHooks will execute the provided command when the hook is run. // NewCommandHook will execute the provided command when the hook is run.
func NewCommandHook(cmd Command) CommandHook { func NewCommandHook(cmd Command) CommandHook {
return CommandHook{ return CommandHook{
Command: cmd, Command: cmd,

View File

@ -4,7 +4,7 @@ package configs
import "fmt" import "fmt"
// Gets the root uid for the process on host which could be non-zero // HostUID gets the root uid for the process on host which could be non-zero
// when user namespaces are enabled. // when user namespaces are enabled.
func (c Config) HostUID() (int, error) { func (c Config) HostUID() (int, error) {
if c.Namespaces.Contains(NEWUSER) { if c.Namespaces.Contains(NEWUSER) {
@ -21,7 +21,7 @@ func (c Config) HostUID() (int, error) {
return 0, nil return 0, nil
} }
// Gets the root gid for the process on host which could be non-zero // HostGID gets the root gid for the process on host which could be non-zero
// when user namespaces are enabled. // when user namespaces are enabled.
func (c Config) HostGID() (int, error) { func (c Config) HostGID() (int, error) {
if c.Namespaces.Contains(NEWUSER) { if c.Namespaces.Contains(NEWUSER) {

View File

@ -3,7 +3,7 @@
package configs package configs
var ( var (
// These are devices that are to be both allowed and created. // DefaultSimpleDevices are devices that are to be both allowed and created.
DefaultSimpleDevices = []*Device{ DefaultSimpleDevices = []*Device{
// /dev/null and zero // /dev/null and zero
{ {

View File

@ -7,6 +7,7 @@ import (
"strings" "strings"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/selinux"
) )
type Validator interface { type Validator interface {
@ -80,6 +81,10 @@ func (v *ConfigValidator) security(config *configs.Config) error {
!config.Namespaces.Contains(configs.NEWNS) { !config.Namespaces.Contains(configs.NEWNS) {
return fmt.Errorf("unable to restrict sys entries without a private MNT namespace") return fmt.Errorf("unable to restrict sys entries without a private MNT namespace")
} }
if config.ProcessLabel != "" && !selinux.SelinuxEnabled() {
return fmt.Errorf("selinux label is specified in config, but selinux is disabled or not supported")
}
return nil return nil
} }

View File

@ -0,0 +1,11 @@
package libcontainer
import (
"errors"
)
// NewConsole is the Solaris stand-in for console creation. It never
// allocates a console: uid and gid are ignored and the call always fails
// with an error stating that the libcontainer console is not supported on
// Solaris.
func NewConsole(uid, gid int) (Console, error) {
	unsupported := errors.New("libcontainer console is not supported on Solaris")
	return nil, unsupported
}

View File

@ -1,4 +1,4 @@
// Libcontainer provides a native Go implementation for creating containers // Package libcontainer provides a native Go implementation for creating containers
// with namespaces, cgroups, capabilities, and filesystem access controls. // with namespaces, cgroups, capabilities, and filesystem access controls.
// It allows you to manage the lifecycle of the container performing additional operations // It allows you to manage the lifecycle of the container performing additional operations
// after the container is created. // after the container is created.
@ -11,24 +11,20 @@ import (
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
) )
// The status of a container. // Status is the status of a container.
type Status int type Status int
const ( const (
// The container exists but has not been run yet // Created is the status that denotes the container exists but has not been run yet.
Created Status = iota Created Status = iota
// Running is the status that denotes the container exists and is running.
// The container exists and is running.
Running Running
// Pausing is the status that denotes the container exists, it is in the process of being paused.
// The container exists, it is in the process of being paused.
Pausing Pausing
// Paused is the status that denotes the container exists, but all its processes are paused.
// The container exists, but all its processes are paused.
Paused Paused
// Stopped is the status that denotes the container does not have a created or running process.
// The container does not exist. Stopped
Destroyed
) )
func (s Status) String() string { func (s Status) String() string {
@ -41,8 +37,8 @@ func (s Status) String() string {
return "pausing" return "pausing"
case Paused: case Paused:
return "paused" return "paused"
case Destroyed: case Stopped:
return "destroyed" return "stopped"
default: default:
return "unknown" return "unknown"
} }
@ -67,7 +63,7 @@ type BaseState struct {
Config configs.Config `json:"config"` Config configs.Config `json:"config"`
} }
// A libcontainer container object. // BaseContainer is a libcontainer container object.
// //
// Each container is thread-safe within the same process. Since a container can // Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container // be destroyed by a separate process, any function may return that the container
@ -80,13 +76,13 @@ type BaseContainer interface {
// //
// errors: // errors:
// ContainerDestroyed - Container no longer exists, // ContainerDestroyed - Container no longer exists,
// Systemerror - System error. // SystemError - System error.
Status() (Status, error) Status() (Status, error)
// State returns the current container's state information. // State returns the current container's state information.
// //
// errors: // errors:
// Systemerror - System error. // SystemError - System error.
State() (*State, error) State() (*State, error)
// Returns the current config of the container. // Returns the current config of the container.
@ -96,7 +92,7 @@ type BaseContainer interface {
// //
// errors: // errors:
// ContainerDestroyed - Container no longer exists, // ContainerDestroyed - Container no longer exists,
// Systemerror - System error. // SystemError - System error.
// //
// Some of the returned PIDs may no longer refer to processes in the Container, unless // Some of the returned PIDs may no longer refer to processes in the Container, unless
// the Container state is PAUSED in which case every PID in the slice is valid. // the Container state is PAUSED in which case every PID in the slice is valid.
@ -106,7 +102,7 @@ type BaseContainer interface {
// //
// errors: // errors:
// ContainerDestroyed - Container no longer exists, // ContainerDestroyed - Container no longer exists,
// Systemerror - System error. // SystemError - System error.
Stats() (*Stats, error) Stats() (*Stats, error)
// Set resources of container as configured // Set resources of container as configured
@ -114,7 +110,7 @@ type BaseContainer interface {
// We can use this to change resources when containers are running. // We can use this to change resources when containers are running.
// //
// errors: // errors:
// Systemerror - System error. // SystemError - System error.
Set(config configs.Config) error Set(config configs.Config) error
// Start a process inside the container. Returns error if process fails to // Start a process inside the container. Returns error if process fails to
@ -124,21 +120,38 @@ type BaseContainer interface {
// ContainerDestroyed - Container no longer exists, // ContainerDestroyed - Container no longer exists,
// ConfigInvalid - config is invalid, // ConfigInvalid - config is invalid,
// ContainerPaused - Container is paused, // ContainerPaused - Container is paused,
// Systemerror - System error. // SystemError - System error.
Start(process *Process) (err error) Start(process *Process) (err error)
// Run immediately starts the process inside the container. Returns error if process
// fails to start. It does not block waiting for the exec fifo after start returns but
// opens the fifo after start returns.
//
// errors:
// ContainerDestroyed - Container no longer exists,
// ConfigInvalid - config is invalid,
// ContainerPaused - Container is paused,
// SystemError - System error.
Run(process *Process) (err error)
// Destroys the container after killing all running processes. // Destroys the container after killing all running processes.
// //
// Any event registrations are removed before the container is destroyed. // Any event registrations are removed before the container is destroyed.
// No error is returned if the container is already destroyed. // No error is returned if the container is already destroyed.
// //
// errors: // errors:
// Systemerror - System error. // SystemError - System error.
Destroy() error Destroy() error
// Signal sends the provided signal code to the container's initial process. // Signal sends the provided signal code to the container's initial process.
// //
// errors: // errors:
// Systemerror - System error. // SystemError - System error.
Signal(s os.Signal) error Signal(s os.Signal) error
// Exec signals the container to exec the users process at the end of the init.
//
// errors:
// SystemError - System error.
Exec() error
} }

View File

@ -22,6 +22,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/criurpc" "github.com/opencontainers/runc/libcontainer/criurpc"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/runc/libcontainer/utils"
"github.com/syndtr/gocapability/capability" "github.com/syndtr/gocapability/capability"
"github.com/vishvananda/netlink/nl" "github.com/vishvananda/netlink/nl"
@ -37,6 +38,7 @@ type linuxContainer struct {
initPath string initPath string
initArgs []string initArgs []string
initProcess parentProcess initProcess parentProcess
initProcessStartTime string
criuPath string criuPath string
m sync.Mutex m sync.Mutex
criuVersion int criuVersion int
@ -62,7 +64,7 @@ type State struct {
ExternalDescriptors []string `json:"external_descriptors,omitempty"` ExternalDescriptors []string `json:"external_descriptors,omitempty"`
} }
// A libcontainer container object. // Container is a libcontainer container object.
// //
// Each container is thread-safe within the same process. Since a container can // Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container // be destroyed by a separate process, any function may return that the container
@ -84,7 +86,7 @@ type Container interface {
// Systemerror - System error. // Systemerror - System error.
Restore(process *Process, criuOpts *CriuOpts) error Restore(process *Process, criuOpts *CriuOpts) error
// If the Container state is RUNNING or PAUSING, sets the Container state to PAUSING and pauses // If the Container state is RUNNING, sets the Container state to PAUSING and pauses
// the execution of any user processes. Asynchronously, when the container finished being paused the // the execution of any user processes. Asynchronously, when the container finished being paused the
// state is changed to PAUSED. // state is changed to PAUSED.
// If the Container state is PAUSED, do nothing. // If the Container state is PAUSED, do nothing.
@ -141,7 +143,7 @@ func (c *linuxContainer) State() (*State, error) {
func (c *linuxContainer) Processes() ([]int, error) { func (c *linuxContainer) Processes() ([]int, error) {
pids, err := c.cgroupManager.GetAllPids() pids, err := c.cgroupManager.GetAllPids()
if err != nil { if err != nil {
return nil, newSystemError(err) return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups")
} }
return pids, nil return pids, nil
} }
@ -152,14 +154,14 @@ func (c *linuxContainer) Stats() (*Stats, error) {
stats = &Stats{} stats = &Stats{}
) )
if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil { if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
return stats, newSystemError(err) return stats, newSystemErrorWithCause(err, "getting container stats from cgroups")
} }
for _, iface := range c.config.Networks { for _, iface := range c.config.Networks {
switch iface.Type { switch iface.Type {
case "veth": case "veth":
istats, err := getNetworkInterfaceStats(iface.HostInterfaceName) istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
if err != nil { if err != nil {
return stats, newSystemError(err) return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName)
} }
stats.Interfaces = append(stats.Interfaces, istats) stats.Interfaces = append(stats.Interfaces, istats)
} }
@ -170,6 +172,13 @@ func (c *linuxContainer) Stats() (*Stats, error) {
func (c *linuxContainer) Set(config configs.Config) error { func (c *linuxContainer) Set(config configs.Config) error {
c.m.Lock() c.m.Lock()
defer c.m.Unlock() defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return err
}
if status == Stopped {
return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
}
c.config = &config c.config = &config
return c.cgroupManager.Set(c.config) return c.cgroupManager.Set(c.config)
} }
@ -181,28 +190,76 @@ func (c *linuxContainer) Start(process *Process) error {
if err != nil { if err != nil {
return err return err
} }
doInit := status == Destroyed return c.start(process, status == Stopped)
parent, err := c.newParentProcess(process, doInit) }
func (c *linuxContainer) Run(process *Process) error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil { if err != nil {
return newSystemError(err) return err
}
if err := c.start(process, status == Stopped); err != nil {
return err
}
if status == Stopped {
return c.exec()
}
return nil
}
// Exec takes the container mutex c.m and delegates to c.exec, returning
// whatever error c.exec reports. The unlock is deferred so the mutex is
// released on every exit path of c.exec.
func (c *linuxContainer) Exec() error {
c.m.Lock()
defer c.m.Unlock()
return c.exec()
}
// exec opens the exec fifo located under the container root for reading and
// reads it until EOF.
//
// If at least one byte was read, the fifo file is removed and nil is
// returned. If the read yields no data, the call fails with "cannot start
// an already running container". A failure to open the fifo is wrapped as a
// system error; a read failure is returned as-is.
func (c *linuxContainer) exec() error {
	fifoPath := filepath.Join(c.root, execFifoFilename)
	fifo, openErr := os.OpenFile(fifoPath, os.O_RDONLY, 0)
	if openErr != nil {
		return newSystemErrorWithCause(openErr, "open exec fifo for reading")
	}
	defer fifo.Close()

	payload, readErr := ioutil.ReadAll(fifo)
	switch {
	case readErr != nil:
		return readErr
	case len(payload) == 0:
		return fmt.Errorf("cannot start an already running container")
	}

	// Data was read: remove the fifo file. The result of the removal is
	// not checked.
	os.Remove(fifoPath)
	return nil
}
func (c *linuxContainer) start(process *Process, isInit bool) error {
parent, err := c.newParentProcess(process, isInit)
if err != nil {
return newSystemErrorWithCause(err, "creating new parent process")
} }
if err := parent.start(); err != nil { if err := parent.start(); err != nil {
// terminate the process to ensure that it properly is reaped. // terminate the process to ensure that it properly is reaped.
if err := parent.terminate(); err != nil { if err := parent.terminate(); err != nil {
logrus.Warn(err) logrus.Warn(err)
} }
return newSystemError(err) return newSystemErrorWithCause(err, "starting container process")
} }
// generate a timestamp indicating when the container was started // generate a timestamp indicating when the container was started
c.created = time.Now().UTC() c.created = time.Now().UTC()
c.state = &runningState{ c.state = &runningState{
c: c, c: c,
} }
if doInit { if isInit {
if err := c.updateState(parent); err != nil { c.state = &createdState{
c: c,
}
state, err := c.updateState(parent)
if err != nil {
return err return err
} }
c.initProcessStartTime = state.InitProcessStartTime
if c.config.Hooks != nil { if c.config.Hooks != nil {
s := configs.HookState{ s := configs.HookState{
Version: c.config.Version, Version: c.config.Version,
@ -211,12 +268,12 @@ func (c *linuxContainer) Start(process *Process) error {
Root: c.config.Rootfs, Root: c.config.Rootfs,
BundlePath: utils.SearchLabels(c.config.Labels, "bundle"), BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
} }
for _, hook := range c.config.Hooks.Poststart { for i, hook := range c.config.Hooks.Poststart {
if err := hook.Run(s); err != nil { if err := hook.Run(s); err != nil {
if err := parent.terminate(); err != nil { if err := parent.terminate(); err != nil {
logrus.Warn(err) logrus.Warn(err)
} }
return newSystemError(err) return newSystemErrorWithCausef(err, "running poststart hook %d", i)
} }
} }
} }
@ -226,7 +283,7 @@ func (c *linuxContainer) Start(process *Process) error {
func (c *linuxContainer) Signal(s os.Signal) error { func (c *linuxContainer) Signal(s os.Signal) error {
if err := c.initProcess.signal(s); err != nil { if err := c.initProcess.signal(s); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "signaling init process")
} }
return nil return nil
} }
@ -234,19 +291,23 @@ func (c *linuxContainer) Signal(s os.Signal) error {
func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) { func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) {
parentPipe, childPipe, err := newPipe() parentPipe, childPipe, err := newPipe()
if err != nil { if err != nil {
return nil, newSystemError(err) return nil, newSystemErrorWithCause(err, "creating new init pipe")
} }
cmd, err := c.commandTemplate(p, childPipe) rootDir, err := os.Open(c.root)
if err != nil { if err != nil {
return nil, newSystemError(err) return nil, err
}
cmd, err := c.commandTemplate(p, childPipe, rootDir)
if err != nil {
return nil, newSystemErrorWithCause(err, "creating new command template")
} }
if !doInit { if !doInit {
return c.newSetnsProcess(p, cmd, parentPipe, childPipe) return c.newSetnsProcess(p, cmd, parentPipe, childPipe, rootDir)
} }
return c.newInitProcess(p, cmd, parentPipe, childPipe) return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
} }
func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) { func (c *linuxContainer) commandTemplate(p *Process, childPipe, rootDir *os.File) (*exec.Cmd, error) {
cmd := &exec.Cmd{ cmd := &exec.Cmd{
Path: c.initPath, Path: c.initPath,
Args: c.initArgs, Args: c.initArgs,
@ -258,8 +319,10 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
if cmd.SysProcAttr == nil { if cmd.SysProcAttr == nil {
cmd.SysProcAttr = &syscall.SysProcAttr{} cmd.SysProcAttr = &syscall.SysProcAttr{}
} }
cmd.ExtraFiles = append(p.ExtraFiles, childPipe) cmd.ExtraFiles = append(p.ExtraFiles, childPipe, rootDir)
cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) cmd.Env = append(cmd.Env,
fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-2),
fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
// NOTE: when running a container with no PID namespace and the parent process spawning the container is // NOTE: when running a container with no PID namespace and the parent process spawning the container is
// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
// even with the parent still running. // even with the parent still running.
@ -269,7 +332,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
return cmd, nil return cmd, nil
} }
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) { func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
nsMaps := make(map[configs.NamespaceType]string) nsMaps := make(map[configs.NamespaceType]string)
for _, ns := range c.config.Namespaces { for _, ns := range c.config.Namespaces {
@ -292,14 +355,15 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
process: p, process: p,
bootstrapData: data, bootstrapData: data,
sharePidns: sharePidns, sharePidns: sharePidns,
rootDir: rootDir,
}, nil }, nil
} }
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) { func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*setnsProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
state, err := c.currentState() state, err := c.currentState()
if err != nil { if err != nil {
return nil, newSystemError(err) return nil, newSystemErrorWithCause(err, "getting container's current state")
} }
// for setns process, we dont have to set cloneflags as the process namespaces // for setns process, we dont have to set cloneflags as the process namespaces
// will only be set via setns syscall // will only be set via setns syscall
@ -316,6 +380,7 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe,
config: c.newInitConfig(p), config: c.newInitConfig(p),
process: p, process: p,
bootstrapData: data, bootstrapData: data,
rootDir: rootDir,
}, nil }, nil
} }
@ -325,6 +390,7 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
Args: process.Args, Args: process.Args,
Env: process.Env, Env: process.Env,
User: process.User, User: process.User,
AdditionalGroups: process.AdditionalGroups,
Cwd: process.Cwd, Cwd: process.Cwd,
Console: process.consolePath, Console: process.consolePath,
Capabilities: process.Capabilities, Capabilities: process.Capabilities,
@ -334,6 +400,7 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
AppArmorProfile: c.config.AppArmorProfile, AppArmorProfile: c.config.AppArmorProfile,
ProcessLabel: c.config.ProcessLabel, ProcessLabel: c.config.ProcessLabel,
Rlimits: c.config.Rlimits, Rlimits: c.config.Rlimits,
ExecFifoPath: filepath.Join(c.root, execFifoFilename),
} }
if process.NoNewPrivileges != nil { if process.NoNewPrivileges != nil {
cfg.NoNewPrivileges = *process.NoNewPrivileges cfg.NoNewPrivileges = *process.NoNewPrivileges
@ -371,9 +438,8 @@ func (c *linuxContainer) Pause() error {
if err != nil { if err != nil {
return err return err
} }
if status != Running { switch status {
return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning) case Running, Created:
}
if err := c.cgroupManager.Freeze(configs.Frozen); err != nil { if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
return err return err
} }
@ -381,6 +447,8 @@ func (c *linuxContainer) Pause() error {
c: c, c: c,
}) })
} }
return newGenericError(fmt.Errorf("container not running: %s", status), ContainerNotRunning)
}
func (c *linuxContainer) Resume() error { func (c *linuxContainer) Resume() error {
c.m.Lock() c.m.Lock()
@ -408,13 +476,13 @@ func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struc
return notifyMemoryPressure(c.cgroupManager.GetPaths(), level) return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
} }
// check Criu version greater than or equal to min_version // checkCriuVersion checks Criu version greater than or equal to minVersion
func (c *linuxContainer) checkCriuVersion(min_version string) error { func (c *linuxContainer) checkCriuVersion(minVersion string) error {
var x, y, z, versionReq int var x, y, z, versionReq int
_, err := fmt.Sscanf(min_version, "%d.%d.%d\n", &x, &y, &z) // 1.5.2 _, err := fmt.Sscanf(minVersion, "%d.%d.%d\n", &x, &y, &z) // 1.5.2
if err != nil { if err != nil {
_, err = fmt.Sscanf(min_version, "Version: %d.%d\n", &x, &y) // 1.6 _, err = fmt.Sscanf(minVersion, "Version: %d.%d\n", &x, &y) // 1.6
} }
versionReq = x*10000 + y*100 + z versionReq = x*10000 + y*100 + z
@ -459,7 +527,7 @@ func (c *linuxContainer) checkCriuVersion(min_version string) error {
c.criuVersion = x*10000 + y*100 + z c.criuVersion = x*10000 + y*100 + z
if c.criuVersion < versionReq { if c.criuVersion < versionReq {
return fmt.Errorf("CRIU version must be %s or higher", min_version) return fmt.Errorf("CRIU version must be %s or higher", minVersion)
} }
return nil return nil
@ -607,6 +675,27 @@ func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mo
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
} }
// restoreNetwork appends one CriuVethPair to req.Opts.Veths for every veth
// network in the container's configuration, followed by one for every entry
// in criuOpts.VethPairs.
func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
	for _, network := range c.config.Networks {
		// Only "veth" networks contribute a pair; "loopback" and every other
		// network type add nothing to the request.
		if network.Type != "veth" {
			continue
		}
		req.Opts.Veths = append(req.Opts.Veths, &criurpc.CriuVethPair{
			IfOut: proto.String(network.HostInterfaceName),
			IfIn:  proto.String(network.Name),
		})
	}
	// Pairs named explicitly by the caller are appended after the configured ones.
	for _, pair := range criuOpts.VethPairs {
		req.Opts.Veths = append(req.Opts.Veths, &criurpc.CriuVethPair{
			IfOut: proto.String(pair.HostInterfaceName),
			IfIn:  proto.String(pair.ContainerInterfaceName),
		})
	}
}
func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
c.m.Lock() c.m.Lock()
defer c.m.Unlock() defer c.m.Unlock()
@ -690,23 +779,9 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
break break
} }
} }
for _, iface := range c.config.Networks {
switch iface.Type { if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 {
case "veth": c.restoreNetwork(req, criuOpts)
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(iface.HostInterfaceName)
veth.IfIn = proto.String(iface.Name)
req.Opts.Veths = append(req.Opts.Veths, veth)
break
case "loopback":
break
}
}
for _, i := range criuOpts.VethPairs {
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(i.HostInterfaceName)
veth.IfIn = proto.String(i.ContainerInterfaceName)
req.Opts.Veths = append(req.Opts.Veths, veth)
} }
// append optional manage cgroups mode // append optional manage cgroups mode
@ -955,9 +1030,9 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
Pid: int(notify.GetPid()), Pid: int(notify.GetPid()),
Root: c.config.Rootfs, Root: c.config.Rootfs,
} }
for _, hook := range c.config.Hooks.Prestart { for i, hook := range c.config.Hooks.Prestart {
if err := hook.Run(s); err != nil { if err := hook.Run(s); err != nil {
return newSystemError(err) return newSystemErrorWithCausef(err, "running prestart hook %d", i)
} }
} }
} }
@ -974,7 +1049,7 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
}); err != nil { }); err != nil {
return err return err
} }
if err := c.updateState(r); err != nil { if _, err := c.updateState(r); err != nil {
return err return err
} }
if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil { if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
@ -986,13 +1061,17 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
return nil return nil
} }
func (c *linuxContainer) updateState(process parentProcess) error { func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
c.initProcess = process c.initProcess = process
state, err := c.currentState() state, err := c.currentState()
if err != nil { if err != nil {
return err return nil, err
} }
return c.saveState(state) err = c.saveState(state)
if err != nil {
return nil, err
}
return state, nil
} }
func (c *linuxContainer) saveState(s *State) error { func (c *linuxContainer) saveState(s *State) error {
@ -1027,37 +1106,75 @@ func (c *linuxContainer) refreshState() error {
if paused { if paused {
return c.state.transition(&pausedState{c: c}) return c.state.transition(&pausedState{c: c})
} }
running, err := c.isRunning() t, err := c.runType()
if err != nil { if err != nil {
return err return err
} }
if running { switch t {
case Created:
return c.state.transition(&createdState{c: c})
case Running:
return c.state.transition(&runningState{c: c}) return c.state.transition(&runningState{c: c})
} }
return c.state.transition(&stoppedState{c: c}) return c.state.transition(&stoppedState{c: c})
} }
func (c *linuxContainer) isRunning() (bool, error) { // doesInitProcessExist checks if the init process is still the same process
if c.initProcess == nil { // as the initial one, it could happen that the original process has exited
return false, nil // and a new process has been created with the same pid, in this case, the
// container would already be stopped.
func (c *linuxContainer) doesInitProcessExist(initPid int) (bool, error) {
startTime, err := system.GetProcessStartTime(initPid)
if err != nil {
return false, newSystemErrorWithCausef(err, "getting init process %d start time", initPid)
} }
// return Running if the init process is alive if c.initProcessStartTime != startTime {
if err := syscall.Kill(c.initProcess.pid(), 0); err != nil {
if err == syscall.ESRCH {
return false, nil return false, nil
} }
return false, newSystemError(err)
}
return true, nil return true, nil
} }
// runType derives the container status from the state of its init process.
//
// It returns Stopped when there is no init process, when the init pid no
// longer exists, or when doesInitProcessExist reports that the pid is no
// longer the original init process. Otherwise it returns Created if the
// process environment still contains "_LIBCONTAINER", and Running if it does
// not. Any error is returned together with Stopped.
func (c *linuxContainer) runType() (Status, error) {
	if c.initProcess == nil {
		return Stopped, nil
	}
	initPid := c.initProcess.pid()

	// Signal 0 delivers nothing; it only reports whether the pid can be signalled.
	switch err := syscall.Kill(initPid, 0); err {
	case nil:
		// The pid exists; keep checking below.
	case syscall.ESRCH:
		// The process is already gone, possibly having exited just as we
		// were called. That is a stopped container, not an error.
		return Stopped, nil
	default:
		return Stopped, newSystemErrorWithCausef(err, "sending signal 0 to pid %d", initPid)
	}

	// Make sure the pid still refers to the original init process.
	original, err := c.doesInitProcessExist(initPid)
	if err != nil {
		return Stopped, err
	}
	if !original {
		return Stopped, nil
	}

	// The environment tells the init process apart from the user's process:
	// this is the difference between a Created and a Running container.
	procEnviron := fmt.Sprintf("/proc/%d/environ", initPid)
	data, err := ioutil.ReadFile(procEnviron)
	if err != nil {
		return Stopped, newSystemErrorWithCausef(err, "reading /proc/%d/environ", initPid)
	}
	if bytes.Contains(data, []byte("_LIBCONTAINER")) {
		return Created, nil
	}
	return Running, nil
}
func (c *linuxContainer) isPaused() (bool, error) { func (c *linuxContainer) isPaused() (bool, error) {
data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state")) data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state"))
if err != nil { if err != nil {
// If freezer cgroup is not mounted, the container would just be not paused.
if os.IsNotExist(err) { if os.IsNotExist(err) {
return false, nil return false, nil
} }
return false, newSystemError(err) return false, newSystemErrorWithCause(err, "checking if container is paused")
} }
return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
} }
@ -1125,7 +1242,7 @@ func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceTyp
} }
// only set to join this namespace if it exists // only set to join this namespace if it exists
if _, err := os.Lstat(p); err != nil { if _, err := os.Lstat(p); err != nil {
return nil, newSystemError(err) return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p)
} }
// do not allow namespace path with comma as we use it to separate // do not allow namespace path with comma as we use it to separate
// the namespace paths // the namespace paths

View File

@ -0,0 +1,20 @@
package libcontainer
// State represents a running container's state.
//
// It embeds BaseState; platform-specific fields are meant to be declared
// after the embedded BaseState.
type State struct {
BaseState
// Platform specific fields below here
}
// Container is a libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
// was not found.
//
// It embeds BaseContainer; platform-specific methods are meant to be declared
// after the embedded BaseContainer.
type Container interface {
BaseContainer
// Methods below here are platform specific
}

View File

@ -3,10 +3,10 @@
package libcontainer package libcontainer
// cgroup restoring strategy provided by criu // cgroup restoring strategy provided by criu
type cg_mode uint32 type cgMode uint32
const ( const (
CRIU_CG_MODE_SOFT cg_mode = 3 + iota // restore cgroup properties if only dir created by criu CRIU_CG_MODE_SOFT cgMode = 3 + iota // restore cgroup properties if only dir created by criu
CRIU_CG_MODE_FULL // always restore all cgroups and their properties CRIU_CG_MODE_FULL // always restore all cgroups and their properties
CRIU_CG_MODE_STRICT // restore all, requiring them to not present in the system CRIU_CG_MODE_STRICT // restore all, requiring them to not present in the system
CRIU_CG_MODE_DEFAULT // the same as CRIU_CG_MODE_SOFT CRIU_CG_MODE_DEFAULT // the same as CRIU_CG_MODE_SOFT
@ -32,6 +32,6 @@ type CriuOpts struct {
FileLocks bool // handle file locks, for safety FileLocks bool // handle file locks, for safety
PageServer CriuPageServerInfo // allow to dump to criu page server PageServer CriuPageServerInfo // allow to dump to criu page server
VethPairs []VethPairName // pass the veth to criu when restore VethPairs []VethPairName // pass the veth to criu when restore
ManageCgroupsMode cg_mode // dump or restore cgroup mode ManageCgroupsMode cgMode // dump or restore cgroup mode
EmptyNs uint32 // don't c/r properties for namespace from this mask EmptyNs uint32 // don't c/r properties for namespace from this mask
} }

View File

@ -2,7 +2,7 @@ package libcontainer
import "io" import "io"
// API error code type. // ErrorCode is the API error code type.
type ErrorCode int type ErrorCode int
// API error codes. // API error codes.
@ -56,7 +56,7 @@ func (c ErrorCode) String() string {
} }
} }
// API Error type. // Error is the API error type.
type Error interface { type Error interface {
error error

View File

@ -24,10 +24,11 @@ import (
const ( const (
stateFilename = "state.json" stateFilename = "state.json"
execFifoFilename = "exec.fifo"
) )
var ( var (
idRegex = regexp.MustCompile(`^[\w-\.]+$`) idRegex = regexp.MustCompile(`^[\w+-\.]+$`)
maxIdLen = 1024 maxIdLen = 1024
) )
@ -102,6 +103,15 @@ func TmpfsRoot(l *LinuxFactory) error {
return nil return nil
} }
// CriuPath builds a LinuxFactory option func that stores criupath in the
// factory's CriuPath field. The returned option always reports a nil error.
func CriuPath(criupath string) func(*LinuxFactory) error {
	setCriuPath := func(factory *LinuxFactory) error {
		factory.CriuPath = criupath
		return nil
	}
	return setCriuPath
}
// New returns a linux based container factory based in the root directory and // New returns a linux based container factory based in the root directory and
// configures the factory with the provided option funcs. // configures the factory with the provided option funcs.
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) { func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
@ -158,13 +168,34 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
if err := l.Validator.Validate(config); err != nil { if err := l.Validator.Validate(config); err != nil {
return nil, newGenericError(err, ConfigInvalid) return nil, newGenericError(err, ConfigInvalid)
} }
uid, err := config.HostUID()
if err != nil {
return nil, newGenericError(err, SystemError)
}
gid, err := config.HostGID()
if err != nil {
return nil, newGenericError(err, SystemError)
}
containerRoot := filepath.Join(l.Root, id) containerRoot := filepath.Join(l.Root, id)
if _, err := os.Stat(containerRoot); err == nil { if _, err := os.Stat(containerRoot); err == nil {
return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse) return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
} else if !os.IsNotExist(err) { } else if !os.IsNotExist(err) {
return nil, newGenericError(err, SystemError) return nil, newGenericError(err, SystemError)
} }
if err := os.MkdirAll(containerRoot, 0700); err != nil { if err := os.MkdirAll(containerRoot, 0711); err != nil {
return nil, newGenericError(err, SystemError)
}
if err := os.Chown(containerRoot, uid, gid); err != nil {
return nil, newGenericError(err, SystemError)
}
fifoName := filepath.Join(containerRoot, execFifoFilename)
oldMask := syscall.Umask(0000)
if err := syscall.Mkfifo(fifoName, 0622); err != nil {
syscall.Umask(oldMask)
return nil, newGenericError(err, SystemError)
}
syscall.Umask(oldMask)
if err := os.Chown(fifoName, uid, gid); err != nil {
return nil, newGenericError(err, SystemError) return nil, newGenericError(err, SystemError)
} }
c := &linuxContainer{ c := &linuxContainer{
@ -196,6 +227,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
} }
c := &linuxContainer{ c := &linuxContainer{
initProcess: r, initProcess: r,
initProcessStartTime: state.InitProcessStartTime,
id: id, id: id,
config: &state.Config, config: &state.Config,
initPath: l.InitPath, initPath: l.InitPath,
@ -205,7 +237,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
root: containerRoot, root: containerRoot,
created: state.Created, created: state.Created,
} }
c.state = &createdState{c: c, s: Created} c.state = &loadedState{c: c}
if err := c.refreshState(); err != nil { if err := c.refreshState(); err != nil {
return nil, err return nil, err
} }
@ -219,10 +251,18 @@ func (l *LinuxFactory) Type() string {
// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state // StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
// This is a low level implementation detail of the reexec and should not be consumed externally // This is a low level implementation detail of the reexec and should not be consumed externally
func (l *LinuxFactory) StartInitialization() (err error) { func (l *LinuxFactory) StartInitialization() (err error) {
fdStr := os.Getenv("_LIBCONTAINER_INITPIPE") var pipefd, rootfd int
pipefd, err := strconv.Atoi(fdStr) for k, v := range map[string]*int{
"_LIBCONTAINER_INITPIPE": &pipefd,
"_LIBCONTAINER_STATEDIR": &rootfd,
} {
s := os.Getenv(k)
i, err := strconv.Atoi(s)
if err != nil { if err != nil {
return fmt.Errorf("error converting env var _LIBCONTAINER_INITPIPE(%q) to an int: %s", fdStr, err) return fmt.Errorf("unable to convert %s=%s to int", k, s)
}
*v = i
} }
var ( var (
pipe = os.NewFile(uintptr(pipefd), "pipe") pipe = os.NewFile(uintptr(pipefd), "pipe")
@ -231,6 +271,7 @@ func (l *LinuxFactory) StartInitialization() (err error) {
// clear the current process's environment to clean any libcontainer // clear the current process's environment to clean any libcontainer
// specific env vars. // specific env vars.
os.Clearenv() os.Clearenv()
var i initer var i initer
defer func() { defer func() {
// We have an error during the initialization of the container's init, // We have an error during the initialization of the container's init,
@ -239,24 +280,22 @@ func (l *LinuxFactory) StartInitialization() (err error) {
// this defer function will never be called. // this defer function will never be called.
if _, ok := i.(*linuxStandardInit); ok { if _, ok := i.(*linuxStandardInit); ok {
// Synchronisation only necessary for standard init. // Synchronisation only necessary for standard init.
if err := utils.WriteJSON(pipe, syncT{procError}); err != nil { if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
panic(err) panic(err)
} }
} }
if err := utils.WriteJSON(pipe, newSystemError(err)); err != nil { if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil {
panic(err) panic(err)
} }
// ensure that this pipe is always closed // ensure that this pipe is always closed
pipe.Close() pipe.Close()
}() }()
defer func() { defer func() {
if e := recover(); e != nil { if e := recover(); e != nil {
err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack())) err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack()))
} }
}() }()
i, err = newContainerInit(it, pipe, rootfd)
i, err = newContainerInit(it, pipe)
if err != nil { if err != nil {
return err return err
} }

View File

@ -1,6 +1,7 @@
package libcontainer package libcontainer
import ( import (
"fmt"
"io" "io"
"text/template" "text/template"
"time" "time"
@ -51,6 +52,21 @@ func newGenericError(err error, c ErrorCode) Error {
} }
func newSystemError(err error) Error { func newSystemError(err error) Error {
return createSystemError(err, "")
}
// newSystemErrorWithCausef builds an Error from err through createSystemError,
// using the fmt.Sprintf expansion of cause and v as the cause text.
//
// NOTE(review): createSystemError's comment says its stack-frame skip count
// assumes it is called directly from these wrappers, so keep this a direct call.
func newSystemErrorWithCausef(err error, cause string, v ...interface{}) Error {
return createSystemError(err, fmt.Sprintf(cause, v...))
}
// newSystemErrorWithCause builds an Error from err through createSystemError,
// passing cause through unchanged as the cause text.
//
// NOTE(review): createSystemError's comment says its stack-frame skip count
// assumes it is called directly from these wrappers, so keep this a direct call.
func newSystemErrorWithCause(err error, cause string) Error {
return createSystemError(err, cause)
}
// createSystemError creates the specified error with the correct number of
// stack frames skipped. This is only to be called by the other functions for
// formatting the error.
func createSystemError(err error, cause string) Error {
if le, ok := err.(Error); ok { if le, ok := err.(Error); ok {
return le return le
} }
@ -58,7 +74,8 @@ func newSystemError(err error) Error {
Timestamp: time.Now(), Timestamp: time.Now(),
Err: err, Err: err,
ECode: SystemError, ECode: SystemError,
Stack: stacktrace.Capture(1), Cause: cause,
Stack: stacktrace.Capture(2),
} }
if err != nil { if err != nil {
gerr.Message = err.Error() gerr.Message = err.Error()
@ -70,13 +87,18 @@ type genericError struct {
Timestamp time.Time Timestamp time.Time
ECode ErrorCode ECode ErrorCode
Err error `json:"-"` Err error `json:"-"`
Cause string
Message string Message string
Stack stacktrace.Stacktrace Stack stacktrace.Stacktrace
} }
func (e *genericError) Error() string { func (e *genericError) Error() string {
if e.Cause == "" {
return e.Message return e.Message
} }
frame := e.Stack.Frames[0]
return fmt.Sprintf("%s:%d: %s caused %q", frame.File, frame.Line, e.Cause, e.Message)
}
func (e *genericError) Code() ErrorCode { func (e *genericError) Code() ErrorCode {
return e.ECode return e.ECode

View File

@ -52,19 +52,21 @@ type initConfig struct {
AppArmorProfile string `json:"apparmor_profile"` AppArmorProfile string `json:"apparmor_profile"`
NoNewPrivileges bool `json:"no_new_privileges"` NoNewPrivileges bool `json:"no_new_privileges"`
User string `json:"user"` User string `json:"user"`
AdditionalGroups []string `json:"additional_groups"`
Config *configs.Config `json:"config"` Config *configs.Config `json:"config"`
Console string `json:"console"` Console string `json:"console"`
Networks []*network `json:"network"` Networks []*network `json:"network"`
PassedFilesCount int `json:"passed_files_count"` PassedFilesCount int `json:"passed_files_count"`
ContainerId string `json:"containerid"` ContainerId string `json:"containerid"`
Rlimits []configs.Rlimit `json:"rlimits"` Rlimits []configs.Rlimit `json:"rlimits"`
ExecFifoPath string `json:"start_pipe_path"`
} }
type initer interface { type initer interface {
Init() error Init() error
} }
func newContainerInit(t initType, pipe *os.File) (initer, error) { func newContainerInit(t initType, pipe *os.File, stateDirFD int) (initer, error) {
var config *initConfig var config *initConfig
if err := json.NewDecoder(pipe).Decode(&config); err != nil { if err := json.NewDecoder(pipe).Decode(&config); err != nil {
return nil, err return nil, err
@ -82,6 +84,7 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) {
pipe: pipe, pipe: pipe,
parentPid: syscall.Getppid(), parentPid: syscall.Getppid(),
config: config, config: config,
stateDirFD: stateDirFD,
}, nil }, nil
} }
return nil, fmt.Errorf("unknown init type %q", t) return nil, fmt.Errorf("unknown init type %q", t)
@ -211,8 +214,8 @@ func setupUser(config *initConfig) error {
} }
var addGroups []int var addGroups []int
if len(config.Config.AdditionalGroups) > 0 { if len(config.AdditionalGroups) > 0 {
addGroups, err = user.GetAdditionalGroupsPath(config.Config.AdditionalGroups, groupPath) addGroups, err = user.GetAdditionalGroupsPath(config.AdditionalGroups, groupPath)
if err != nil { if err != nil {
return err return err
} }

View File

@ -1,12 +1,12 @@
// +build linux // +build linux
package keyctl package keys
import ( import (
"fmt" "fmt"
"syscall"
"strings"
"strconv" "strconv"
"strings"
"syscall"
"unsafe" "unsafe"
) )
@ -17,7 +17,7 @@ const KEYCTL_DESCRIBE = 6
type KeySerial uint32 type KeySerial uint32
func JoinSessionKeyring(name string) (KeySerial, error) { func JoinSessionKeyring(name string) (KeySerial, error) {
var _name *byte = nil var _name *byte
var err error var err error
if len(name) > 0 { if len(name) > 0 {
@ -34,7 +34,7 @@ func JoinSessionKeyring(name string) (KeySerial, error) {
return KeySerial(sessKeyId), nil return KeySerial(sessKeyId), nil
} }
// modify permissions on a keyring by reading the current permissions, // ModKeyringPerm modifies permissions on a keyring by reading the current permissions,
// anding the bits with the given mask (clearing permissions) and setting // anding the bits with the given mask (clearing permissions) and setting
// additional permission bits // additional permission bits
func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error { func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error {
@ -64,4 +64,3 @@ func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error {
return nil return nil
} }

View File

@ -107,7 +107,7 @@ func SetFileLabel(path string, fileLabel string) error {
return nil return nil
} }
// Tell the kernel the label for all files to be created // SetFileCreateLabel tells the kernel the label for all files to be created
func SetFileCreateLabel(fileLabel string) error { func SetFileCreateLabel(fileLabel string) error {
if selinux.SelinuxEnabled() { if selinux.SelinuxEnabled() {
return selinux.Setfscreatecon(fileLabel) return selinux.Setfscreatecon(fileLabel)
@ -115,7 +115,7 @@ func SetFileCreateLabel(fileLabel string) error {
return nil return nil
} }
// Change the label of path to the filelabel string. // Relabel changes the label of path to the filelabel string.
// It changes the MCS label to s0 if shared is true. // It changes the MCS label to s0 if shared is true.
// This will allow all containers to share the content. // This will allow all containers to share the content.
func Relabel(path string, fileLabel string, shared bool) error { func Relabel(path string, fileLabel string, shared bool) error {

View File

@ -27,7 +27,8 @@ type Int32msg struct {
Value uint32 Value uint32
} }
// int32msg has the following representation // Serialize serializes the message.
// Int32msg has the following representation
// | nlattr len | nlattr type | // | nlattr len | nlattr type |
// | uint32 value | // | uint32 value |
func (msg *Int32msg) Serialize() []byte { func (msg *Int32msg) Serialize() []byte {
@ -43,7 +44,7 @@ func (msg *Int32msg) Len() int {
return syscall_NLA_HDRLEN + 4 return syscall_NLA_HDRLEN + 4
} }
// bytemsg has the following representation // Bytemsg has the following representation
// | nlattr len | nlattr type | // | nlattr len | nlattr type |
// | value | pad | // | value | pad |
type Bytemsg struct { type Bytemsg struct {

View File

@ -28,6 +28,10 @@ type Process struct {
// local to the container's user and group configuration. // local to the container's user and group configuration.
User string User string
// AdditionalGroups specifies the gids that should be added to supplementary groups
// in addition to those that the user belongs to.
AdditionalGroups []string
// Cwd will change the processes current working directory inside the container's rootfs. // Cwd will change the processes current working directory inside the container's rootfs.
Cwd string Cwd string
@ -102,8 +106,8 @@ type IO struct {
} }
// NewConsole creates new console for process and returns it // NewConsole creates new console for process and returns it
func (p *Process) NewConsole(rootuid int) (Console, error) { func (p *Process) NewConsole(rootuid, rootgid int) (Console, error) {
console, err := NewConsole(rootuid, rootuid) console, err := NewConsole(rootuid, rootgid)
if err != nil { if err != nil {
return nil, err return nil, err
} }

View File

@ -51,6 +51,7 @@ type setnsProcess struct {
fds []string fds []string
process *Process process *Process
bootstrapData io.Reader bootstrapData io.Reader
rootDir *os.File
} }
func (p *setnsProcess) startTime() (string, error) { func (p *setnsProcess) startTime() (string, error) {
@ -69,48 +70,49 @@ func (p *setnsProcess) start() (err error) {
defer p.parentPipe.Close() defer p.parentPipe.Close()
err = p.cmd.Start() err = p.cmd.Start()
p.childPipe.Close() p.childPipe.Close()
p.rootDir.Close()
if err != nil { if err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "starting setns process")
} }
if p.bootstrapData != nil { if p.bootstrapData != nil {
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
} }
} }
if err = p.execSetns(); err != nil { if err = p.execSetns(); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "executing setns process")
} }
if len(p.cgroupPaths) > 0 { if len(p.cgroupPaths) > 0 {
if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil { if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil {
return newSystemError(err) return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
} }
} }
// set oom_score_adj // set oom_score_adj
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil { if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "setting oom score")
} }
// set rlimits, this has to be done here because we lose permissions // set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace // to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "setting rlimits for process")
} }
if err := utils.WriteJSON(p.parentPipe, p.config); err != nil { if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "writing config to pipe")
} }
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "calling shutdown on init pipe")
} }
// wait for the child process to fully complete and receive an error message // wait for the child process to fully complete and receive an error message
// if one was encoutered // if one was encoutered
var ierr *genericError var ierr *genericError
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
return newSystemError(err) return newSystemErrorWithCause(err, "decoding init error from pipe")
} }
// Must be done after Shutdown so the child will exit and we can wait for it. // Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil { if ierr != nil {
p.wait() p.wait()
return newSystemError(ierr) return ierr
} }
return nil return nil
} }
@ -123,7 +125,7 @@ func (p *setnsProcess) execSetns() error {
status, err := p.cmd.Process.Wait() status, err := p.cmd.Process.Wait()
if err != nil { if err != nil {
p.cmd.Wait() p.cmd.Wait()
return newSystemError(err) return newSystemErrorWithCause(err, "waiting on setns process to finish")
} }
if !status.Success() { if !status.Success() {
p.cmd.Wait() p.cmd.Wait()
@ -132,7 +134,7 @@ func (p *setnsProcess) execSetns() error {
var pid *pid var pid *pid
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil { if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
p.cmd.Wait() p.cmd.Wait()
return newSystemError(err) return newSystemErrorWithCause(err, "reading pid from init pipe")
} }
process, err := os.FindProcess(pid.Pid) process, err := os.FindProcess(pid.Pid)
if err != nil { if err != nil {
@ -186,6 +188,7 @@ type initProcess struct {
process *Process process *Process
bootstrapData io.Reader bootstrapData io.Reader
sharePidns bool sharePidns bool
rootDir *os.File
} }
func (p *initProcess) pid() int { func (p *initProcess) pid() int {
@ -221,6 +224,7 @@ func (p *initProcess) execSetns() error {
return err return err
} }
p.cmd.Process = process p.cmd.Process = process
p.process.ops = p
return nil return nil
} }
@ -229,28 +233,29 @@ func (p *initProcess) start() error {
err := p.cmd.Start() err := p.cmd.Start()
p.process.ops = p p.process.ops = p
p.childPipe.Close() p.childPipe.Close()
p.rootDir.Close()
if err != nil { if err != nil {
p.process.ops = nil p.process.ops = nil
return newSystemError(err) return newSystemErrorWithCause(err, "starting init process command")
} }
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return err return err
} }
if err := p.execSetns(); err != nil { if err := p.execSetns(); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "running exec setns process for init")
} }
// Save the standard descriptor names before the container process // Save the standard descriptor names before the container process
// can potentially move them (e.g., via dup2()). If we don't do this now, // can potentially move them (e.g., via dup2()). If we don't do this now,
// we won't know at checkpoint time which file descriptor to look up. // we won't know at checkpoint time which file descriptor to look up.
fds, err := getPipeFds(p.pid()) fds, err := getPipeFds(p.pid())
if err != nil { if err != nil {
return newSystemError(err) return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
} }
p.setExternalDescriptors(fds) p.setExternalDescriptors(fds)
// Do this before syncing with child so that no children // Do this before syncing with child so that no children
// can escape the cgroup // can escape the cgroup
if err := p.manager.Apply(p.pid()); err != nil { if err := p.manager.Apply(p.pid()); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "applying cgroup configuration for process")
} }
defer func() { defer func() {
if err != nil { if err != nil {
@ -259,10 +264,10 @@ func (p *initProcess) start() error {
} }
}() }()
if err := p.createNetworkInterfaces(); err != nil { if err := p.createNetworkInterfaces(); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "creating nework interfaces")
} }
if err := p.sendConfig(); err != nil { if err := p.sendConfig(); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "sending config to init process")
} }
var ( var (
procSync syncT procSync syncT
@ -278,21 +283,21 @@ loop:
if err == io.EOF { if err == io.EOF {
break loop break loop
} }
return newSystemError(err) return newSystemErrorWithCause(err, "decoding sync type from init pipe")
} }
switch procSync.Type { switch procSync.Type {
case procReady: case procReady:
if err := p.manager.Set(p.config.Config); err != nil { if err := p.manager.Set(p.config.Config); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "setting cgroup config for ready process")
} }
// set oom_score_adj // set oom_score_adj
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil { if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "setting oom score for ready process")
} }
// set rlimits, this has to be done here because we lose permissions // set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace // to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "setting rlimits for ready process")
} }
// call prestart hooks // call prestart hooks
if !p.config.Config.Namespaces.Contains(configs.NEWNS) { if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
@ -303,16 +308,16 @@ loop:
Pid: p.pid(), Pid: p.pid(),
Root: p.config.Config.Rootfs, Root: p.config.Config.Rootfs,
} }
for _, hook := range p.config.Config.Hooks.Prestart { for i, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil { if err := hook.Run(s); err != nil {
return newSystemError(err) return newSystemErrorWithCausef(err, "running prestart hook %d", i)
} }
} }
} }
} }
// Sync with child. // Sync with child.
if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil { if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "reading syncT run type")
} }
sentRun = true sentRun = true
case procHooks: case procHooks:
@ -324,22 +329,22 @@ loop:
Root: p.config.Config.Rootfs, Root: p.config.Config.Rootfs,
BundlePath: utils.SearchLabels(p.config.Config.Labels, "bundle"), BundlePath: utils.SearchLabels(p.config.Config.Labels, "bundle"),
} }
for _, hook := range p.config.Config.Hooks.Prestart { for i, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil { if err := hook.Run(s); err != nil {
return newSystemError(err) return newSystemErrorWithCausef(err, "running prestart hook %d", i)
} }
} }
} }
// Sync with child. // Sync with child.
if err := utils.WriteJSON(p.parentPipe, syncT{procResume}); err != nil { if err := utils.WriteJSON(p.parentPipe, syncT{procResume}); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "reading syncT resume type")
} }
sentResume = true sentResume = true
case procError: case procError:
// wait for the child process to fully complete and receive an error message // wait for the child process to fully complete and receive an error message
// if one was encoutered // if one was encoutered
if err := dec.Decode(&ierr); err != nil && err != io.EOF { if err := dec.Decode(&ierr); err != nil && err != io.EOF {
return newSystemError(err) return newSystemErrorWithCause(err, "decoding proc error from init")
} }
if ierr != nil { if ierr != nil {
break loop break loop
@ -347,22 +352,22 @@ loop:
// Programmer error. // Programmer error.
panic("No error following JSON procError payload.") panic("No error following JSON procError payload.")
default: default:
return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child")) return newSystemError(fmt.Errorf("invalid JSON payload from child"))
} }
} }
if !sentRun { if !sentRun {
return newSystemError(fmt.Errorf("could not synchronise with container process: %v", ierr)) return newSystemErrorWithCause(ierr, "container init failed")
} }
if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume { if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process")) return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process"))
} }
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "shutting down init pipe")
} }
// Must be done after Shutdown so the child will exit and we can wait for it. // Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil { if ierr != nil {
p.wait() p.wait()
return newSystemError(ierr) return ierr
} }
return nil return nil
} }
@ -447,7 +452,7 @@ func getPipeFds(pid int) ([]string, error) {
// InitializeIO creates pipes for use with the process's STDIO // InitializeIO creates pipes for use with the process's STDIO
// and returns the opposite side for each // and returns the opposite side for each
func (p *Process) InitializeIO(rootuid int) (i *IO, err error) { func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
var fds []uintptr var fds []uintptr
i = &IO{} i = &IO{}
// cleanup in case of an error // cleanup in case of an error
@ -479,7 +484,7 @@ func (p *Process) InitializeIO(rootuid int) (i *IO, err error) {
p.Stderr, i.Stderr = w, r p.Stderr, i.Stderr = w, r
// change ownership of the pipes incase we are in a user namespace // change ownership of the pipes incase we are in a user namespace
for _, fd := range fds { for _, fd := range fds {
if err := syscall.Fchown(int(fd), rootuid, rootuid); err != nil { if err := syscall.Fchown(int(fd), rootuid, rootgid); err != nil {
return nil, err return nil, err
} }
} }

View File

@ -25,10 +25,10 @@ import (
const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
// setupDev returns true if /dev needs to be set up. // needsSetupDev returns true if /dev needs to be set up.
func needsSetupDev(config *configs.Config) bool { func needsSetupDev(config *configs.Config) bool {
for _, m := range config.Mounts { for _, m := range config.Mounts {
if m.Device == "bind" && (m.Destination == "/dev" || m.Destination == "/dev/") { if m.Device == "bind" && libcontainerUtils.CleanPath(m.Destination) == "/dev" {
return false return false
} }
} }
@ -39,35 +39,35 @@ func needsSetupDev(config *configs.Config) bool {
// new mount namespace. // new mount namespace.
func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWriter) (err error) { func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWriter) (err error) {
if err := prepareRoot(config); err != nil { if err := prepareRoot(config); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "preparing rootfs")
} }
setupDev := needsSetupDev(config) setupDev := needsSetupDev(config)
for _, m := range config.Mounts { for _, m := range config.Mounts {
for _, precmd := range m.PremountCmds { for _, precmd := range m.PremountCmds {
if err := mountCmd(precmd); err != nil { if err := mountCmd(precmd); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "running premount command")
} }
} }
if err := mountToRootfs(m, config.Rootfs, config.MountLabel); err != nil { if err := mountToRootfs(m, config.Rootfs, config.MountLabel); err != nil {
return newSystemError(err) return newSystemErrorWithCausef(err, "mounting %q to rootfs %q", m.Destination, config.Rootfs)
} }
for _, postcmd := range m.PostmountCmds { for _, postcmd := range m.PostmountCmds {
if err := mountCmd(postcmd); err != nil { if err := mountCmd(postcmd); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "running postmount command")
} }
} }
} }
if setupDev { if setupDev {
if err := createDevices(config); err != nil { if err := createDevices(config); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "creating device nodes")
} }
if err := setupPtmx(config, console); err != nil { if err := setupPtmx(config, console); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "setting up ptmx")
} }
if err := setupDevSymlinks(config.Rootfs); err != nil { if err := setupDevSymlinks(config.Rootfs); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "setting up /dev symlinks")
} }
} }
// Signal the parent to run the pre-start hooks. // Signal the parent to run the pre-start hooks.
@ -78,7 +78,7 @@ func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWrit
return err return err
} }
if err := syscall.Chdir(config.Rootfs); err != nil { if err := syscall.Chdir(config.Rootfs); err != nil {
return newSystemError(err) return newSystemErrorWithCausef(err, "changing dir to %q", config.Rootfs)
} }
if config.NoPivotRoot { if config.NoPivotRoot {
err = msMoveRoot(config.Rootfs) err = msMoveRoot(config.Rootfs)
@ -86,19 +86,19 @@ func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWrit
err = pivotRoot(config.Rootfs, config.PivotDir) err = pivotRoot(config.Rootfs, config.PivotDir)
} }
if err != nil { if err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "jailing process inside rootfs")
} }
if setupDev { if setupDev {
if err := reOpenDevNull(); err != nil { if err := reOpenDevNull(); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "reopening /dev/null inside container")
} }
} }
// remount dev as ro if specifed // remount dev as ro if specifed
for _, m := range config.Mounts { for _, m := range config.Mounts {
if m.Destination == "/dev" { if libcontainerUtils.CleanPath(m.Destination) == "/dev" {
if m.Flags&syscall.MS_RDONLY != 0 { if m.Flags&syscall.MS_RDONLY != 0 {
if err := remountReadonly(m.Destination); err != nil { if err := remountReadonly(m.Destination); err != nil {
return newSystemError(err) return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination)
} }
} }
break break
@ -107,7 +107,7 @@ func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWrit
// set rootfs ( / ) as readonly // set rootfs ( / ) as readonly
if config.Readonlyfs { if config.Readonlyfs {
if err := setReadonly(); err != nil { if err := setReadonly(); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "setting rootfs as readonly")
} }
} }
syscall.Umask(0022) syscall.Umask(0022)
@ -115,14 +115,12 @@ func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWrit
} }
func mountCmd(cmd configs.Command) error { func mountCmd(cmd configs.Command) error {
command := exec.Command(cmd.Path, cmd.Args[:]...) command := exec.Command(cmd.Path, cmd.Args[:]...)
command.Env = cmd.Env command.Env = cmd.Env
command.Dir = cmd.Dir command.Dir = cmd.Dir
if out, err := command.CombinedOutput(); err != nil { if out, err := command.CombinedOutput(); err != nil {
return fmt.Errorf("%#v failed: %s: %v", cmd, string(out), err) return fmt.Errorf("%#v failed: %s: %v", cmd, string(out), err)
} }
return nil return nil
} }
@ -240,34 +238,23 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
return err return err
} }
} }
// create symlinks for merged cgroups
cwd, err := os.Getwd()
if err != nil {
return err
}
if err := os.Chdir(filepath.Join(rootfs, m.Destination)); err != nil {
return err
}
for _, mc := range merged { for _, mc := range merged {
for _, ss := range strings.Split(mc, ",") { for _, ss := range strings.Split(mc, ",") {
if err := os.Symlink(mc, ss); err != nil { // symlink(2) is very dumb, it will just shove the path into
// if cgroup already exists, then okay(it could have been created before) // the link and doesn't do any checks or relative path
if os.IsExist(err) { // conversion. Also, don't error out if the cgroup already exists.
continue if err := os.Symlink(mc, filepath.Join(rootfs, m.Destination, ss)); err != nil && !os.IsExist(err) {
}
os.Chdir(cwd)
return err return err
} }
} }
} }
if err := os.Chdir(cwd); err != nil {
return err
}
if m.Flags&syscall.MS_RDONLY != 0 { if m.Flags&syscall.MS_RDONLY != 0 {
// remount cgroup root as readonly // remount cgroup root as readonly
mcgrouproot := &configs.Mount{ mcgrouproot := &configs.Mount{
Source: m.Destination,
Device: "bind",
Destination: m.Destination, Destination: m.Destination,
Flags: defaultMountFlags | syscall.MS_RDONLY, Flags: defaultMountFlags | syscall.MS_RDONLY | syscall.MS_BIND,
} }
if err := remount(mcgrouproot, rootfs); err != nil { if err := remount(mcgrouproot, rootfs); err != nil {
return err return err
@ -515,10 +502,10 @@ func getParentMount(rootfs string) (string, string, error) {
} }
// Make parent mount private if it was shared // Make parent mount private if it was shared
func rootfsParentMountPrivate(config *configs.Config) error { func rootfsParentMountPrivate(rootfs string) error {
sharedMount := false sharedMount := false
parentMount, optionalOpts, err := getParentMount(config.Rootfs) parentMount, optionalOpts, err := getParentMount(rootfs)
if err != nil { if err != nil {
return err return err
} }
@ -550,10 +537,11 @@ func prepareRoot(config *configs.Config) error {
if err := syscall.Mount("", "/", "", uintptr(flag), ""); err != nil { if err := syscall.Mount("", "/", "", uintptr(flag), ""); err != nil {
return err return err
} }
if config.NoPivotRoot {
if err := rootfsParentMountPrivate(config); err != nil { if err := rootfsParentMountPrivate(config.Rootfs); err != nil {
return err return err
} }
}
return syscall.Mount(config.Rootfs, config.Rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, "") return syscall.Mount(config.Rootfs, config.Rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, "")
} }
@ -594,9 +582,16 @@ func pivotRoot(rootfs, pivotBaseDir string) (err error) {
err = errVal err = errVal
} }
}() }()
if err := syscall.PivotRoot(rootfs, pivotDir); err != nil {
// Make the parent mount private
if err := rootfsParentMountPrivate(rootfs); err != nil {
return err
}
// Try again
if err := syscall.PivotRoot(rootfs, pivotDir); err != nil { if err := syscall.PivotRoot(rootfs, pivotDir); err != nil {
return fmt.Errorf("pivot_root %s", err) return fmt.Errorf("pivot_root %s", err)
} }
}
if err := syscall.Chdir("/"); err != nil { if err := syscall.Chdir("/"); err != nil {
return fmt.Errorf("chdir / %s", err) return fmt.Errorf("chdir / %s", err)
} }
@ -705,7 +700,7 @@ func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
data = label.FormatMountLabel(m.Data, mountLabel) data = label.FormatMountLabel(m.Data, mountLabel)
flags = m.Flags flags = m.Flags
) )
if dest == "/dev" { if libcontainerUtils.CleanPath(dest) == "/dev" {
flags &= ^syscall.MS_RDONLY flags &= ^syscall.MS_RDONLY
} }
if !strings.HasPrefix(dest, rootfs) { if !strings.HasPrefix(dest, rootfs) {

View File

@ -36,6 +36,11 @@ var archs = map[string]string{
"SCMP_ARCH_MIPSEL": "mipsel", "SCMP_ARCH_MIPSEL": "mipsel",
"SCMP_ARCH_MIPSEL64": "mipsel64", "SCMP_ARCH_MIPSEL64": "mipsel64",
"SCMP_ARCH_MIPSEL64N32": "mipsel64n32", "SCMP_ARCH_MIPSEL64N32": "mipsel64n32",
"SCMP_ARCH_PPC": "ppc",
"SCMP_ARCH_PPC64": "ppc64",
"SCMP_ARCH_PPC64LE": "ppc64le",
"SCMP_ARCH_S390": "s390",
"SCMP_ARCH_S390X": "s390x",
} }
// ConvertStringToOperator converts a string into a Seccomp comparison operator. // ConvertStringToOperator converts a string into a Seccomp comparison operator.

View File

@ -10,7 +10,7 @@ import (
var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported") var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported")
// Seccomp not supported, do nothing // InitSeccomp does nothing because seccomp is not supported.
func InitSeccomp(config *configs.Seccomp) error { func InitSeccomp(config *configs.Seccomp) error {
if config != nil { if config != nil {
return ErrSeccompNotEnabled return ErrSeccompNotEnabled

View File

@ -16,7 +16,6 @@ import (
"sync" "sync"
"syscall" "syscall"
"github.com/docker/docker/pkg/mount"
"github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/system"
) )
@ -60,16 +59,31 @@ func getSelinuxMountPoint() string {
} }
selinuxfs = "" selinuxfs = ""
mounts, err := mount.GetMounts() f, err := os.Open("/proc/self/mountinfo")
if err != nil { if err != nil {
return selinuxfs return selinuxfs
} }
for _, mount := range mounts { defer f.Close()
if mount.Fstype == "selinuxfs" {
selinuxfs = mount.Mountpoint scanner := bufio.NewScanner(f)
for scanner.Scan() {
txt := scanner.Text()
// Safe as mountinfo encodes mountpoints with spaces as \040.
sepIdx := strings.Index(txt, " - ")
if sepIdx == -1 {
continue
}
if !strings.Contains(txt[sepIdx:], "selinuxfs") {
continue
}
fields := strings.Split(txt, " ")
if len(fields) < 5 {
continue
}
selinuxfs = fields[4]
break break
} }
}
if selinuxfs != "" { if selinuxfs != "" {
var buf syscall.Statfs_t var buf syscall.Statfs_t
syscall.Statfs(selinuxfs, &buf) syscall.Statfs(selinuxfs, &buf)
@ -297,7 +311,7 @@ func IntToMcs(id int, catRange uint32) string {
for ORD > TIER { for ORD > TIER {
ORD = ORD - TIER ORD = ORD - TIER
TIER -= 1 TIER--
} }
TIER = SETSIZE - TIER TIER = SETSIZE - TIER
ORD = ORD + TIER ORD = ORD + TIER
@ -438,7 +452,7 @@ func badPrefix(fpath string) error {
return nil return nil
} }
// Change the fpath file object to the SELinux label scon. // Chcon changes the fpath file object to the SELinux label scon.
// If the fpath is a directory and recurse is true Chcon will walk the // If the fpath is a directory and recurse is true Chcon will walk the
// directory tree setting the label // directory tree setting the label
func Chcon(fpath string, scon string, recurse bool) error { func Chcon(fpath string, scon string, recurse bool) error {
@ -472,14 +486,14 @@ func DupSecOpt(src string) []string {
con["level"] == "" { con["level"] == "" {
return nil return nil
} }
return []string{"label:user:" + con["user"], return []string{"label=user:" + con["user"],
"label:role:" + con["role"], "label=role:" + con["role"],
"label:type:" + con["type"], "label=type:" + con["type"],
"label:level:" + con["level"]} "label=level:" + con["level"]}
} }
// DisableSecOpt returns a security opt that can be used to disabling SELinux // DisableSecOpt returns a security opt that can be used to disabling SELinux
// labeling support for future container processes // labeling support for future container processes
func DisableSecOpt() []string { func DisableSecOpt() []string {
return []string{"label:disable"} return []string{"label=disable"}
} }

View File

@ -24,10 +24,12 @@ func (l *linuxSetnsInit) getSessionRingName() string {
} }
func (l *linuxSetnsInit) Init() error { func (l *linuxSetnsInit) Init() error {
if !l.config.Config.NoNewKeyring {
// do not inherit the parent's session keyring // do not inherit the parent's session keyring
if _, err := keyctl.JoinSessionKeyring(l.getSessionRingName()); err != nil { if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil {
return err return err
} }
}
if l.config.NoNewPrivileges { if l.config.NoNewPrivileges {
if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err return err
@ -44,10 +46,8 @@ func (l *linuxSetnsInit) Init() error {
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err return err
} }
if l.config.ProcessLabel != "" {
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err return err
} }
}
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
} }

View File

@ -2,14 +2,14 @@ package stacktrace
import "runtime" import "runtime"
// Caputure captures a stacktrace for the current calling go program // Capture captures a stacktrace for the current calling go program
// //
// skip is the number of frames to skip // skip is the number of frames to skip
func Capture(userSkip int) Stacktrace { func Capture(userSkip int) Stacktrace {
var ( var (
skip = userSkip + 1 // add one for our own function skip = userSkip + 1 // add one for our own function
frames []Frame frames []Frame
prevPc uintptr = 0 prevPc uintptr
) )
for i := skip; ; i++ { for i := skip; ; i++ {
pc, file, line, ok := runtime.Caller(i) pc, file, line, ok := runtime.Caller(i)

View File

@ -6,6 +6,7 @@ import (
"fmt" "fmt"
"io" "io"
"os" "os"
"os/exec"
"syscall" "syscall"
"github.com/opencontainers/runc/libcontainer/apparmor" "github.com/opencontainers/runc/libcontainer/apparmor"
@ -17,8 +18,9 @@ import (
) )
type linuxStandardInit struct { type linuxStandardInit struct {
pipe io.ReadWriter pipe io.ReadWriteCloser
parentPid int parentPid int
stateDirFD int
config *initConfig config *initConfig
} }
@ -43,17 +45,19 @@ func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
const PR_SET_NO_NEW_PRIVS = 0x26 const PR_SET_NO_NEW_PRIVS = 0x26
func (l *linuxStandardInit) Init() error { func (l *linuxStandardInit) Init() error {
if !l.config.Config.NoNewKeyring {
ringname, keepperms, newperms := l.getSessionRingParams() ringname, keepperms, newperms := l.getSessionRingParams()
// do not inherit the parent's session keyring // do not inherit the parent's session keyring
sessKeyId, err := keyctl.JoinSessionKeyring(ringname) sessKeyId, err := keys.JoinSessionKeyring(ringname)
if err != nil { if err != nil {
return err return err
} }
// make session keyring searcheable // make session keyring searcheable
if err := keyctl.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil { if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
return err return err
} }
}
var console *linuxConsole var console *linuxConsole
if l.config.Console != "" { if l.config.Console != "" {
@ -123,7 +127,10 @@ func (l *linuxStandardInit) Init() error {
if err := syncParentReady(l.pipe); err != nil { if err := syncParentReady(l.pipe); err != nil {
return err return err
} }
if l.config.Config.Seccomp != nil { // Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return err return err
} }
@ -137,11 +144,35 @@ func (l *linuxStandardInit) Init() error {
return err return err
} }
// compare the parent from the inital start of the init process and make sure that it did not change. // compare the parent from the inital start of the init process and make sure that it did not change.
// if the parent changes that means it died and we were reparened to something else so we should // if the parent changes that means it died and we were reparented to something else so we should
// just kill ourself and not cause problems for someone else. // just kill ourself and not cause problems for someone else.
if syscall.Getppid() != l.parentPid { if syscall.Getppid() != l.parentPid {
return syscall.Kill(syscall.Getpid(), syscall.SIGKILL) return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
} }
// check for the arg before waiting to make sure it exists and it is returned
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) // as a create time error.
name, err := exec.LookPath(l.config.Args[0])
if err != nil {
return err
}
// close the pipe to signal that we have completed our init.
l.pipe.Close()
// wait for the fifo to be opened on the other side before
// exec'ing the users process.
fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0)
if err != nil {
return newSystemErrorWithCause(err, "openat exec fifo")
}
if _, err := syscall.Write(fd, []byte("0")); err != nil {
return newSystemErrorWithCause(err, "write 0 exec fifo")
}
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return newSystemErrorWithCause(err, "init seccomp")
}
}
if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
return newSystemErrorWithCause(err, "exec user process")
}
return nil
} }

View File

@ -6,6 +6,7 @@ import (
"fmt" "fmt"
"os" "os"
"path/filepath" "path/filepath"
"syscall"
"github.com/Sirupsen/logrus" "github.com/Sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
@ -77,7 +78,7 @@ type stoppedState struct {
} }
func (b *stoppedState) status() Status { func (b *stoppedState) status() Status {
return Destroyed return Stopped
} }
func (b *stoppedState) transition(s containerState) error { func (b *stoppedState) transition(s containerState) error {
@ -110,11 +111,11 @@ func (r *runningState) status() Status {
func (r *runningState) transition(s containerState) error { func (r *runningState) transition(s containerState) error {
switch s.(type) { switch s.(type) {
case *stoppedState: case *stoppedState:
running, err := r.c.isRunning() t, err := r.c.runType()
if err != nil { if err != nil {
return err return err
} }
if running { if t == Running {
return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped) return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped)
} }
r.c.state = s r.c.state = s
@ -129,16 +130,40 @@ func (r *runningState) transition(s containerState) error {
} }
func (r *runningState) destroy() error { func (r *runningState) destroy() error {
running, err := r.c.isRunning() t, err := r.c.runType()
if err != nil { if err != nil {
return err return err
} }
if running { if t == Running {
return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped) return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
} }
return destroy(r.c) return destroy(r.c)
} }
// createdState is the container state whose status method reports Created.
type createdState struct {
// c is the container this state is attached to.
c *linuxContainer
}
// status always reports Created for a container in this state.
func (i *createdState) status() Status {
return Created
}
// transition moves the container out of the created state. Switching to a
// running, paused or stopped state replaces the container's current state;
// asking for the created state again is a no-op. Any other target state is
// rejected with a state transition error.
func (i *createdState) transition(s containerState) error {
	switch s.(type) {
	case *createdState:
		// Already in the requested state; nothing to update.
		return nil
	case *runningState, *pausedState, *stoppedState:
		i.c.state = s
		return nil
	default:
		return newStateTransitionError(i, s)
	}
}
// destroy sends SIGKILL to the container's init process and then hands the
// container to the package-level destroy function, returning its result.
func (i *createdState) destroy() error {
// The result of signal is not checked.
// NOTE(review): presumably best-effort because the init process may already be gone — confirm.
i.c.initProcess.signal(syscall.SIGKILL)
return destroy(i.c)
}
// pausedState represents a container that is currently pause. It cannot be destroyed in a // pausedState represents a container that is currently pause. It cannot be destroyed in a
// paused state and must transition back to running first. // paused state and must transition back to running first.
type pausedState struct { type pausedState struct {
@ -161,11 +186,11 @@ func (p *pausedState) transition(s containerState) error {
} }
func (p *pausedState) destroy() error { func (p *pausedState) destroy() error {
isRunning, err := p.c.isRunning() t, err := p.c.runType()
if err != nil { if err != nil {
return err return err
} }
if !isRunning { if t != Running && t != Created {
if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil { if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
return err return err
} }
@ -175,7 +200,7 @@ func (p *pausedState) destroy() error {
} }
// restoredState is the same as the running state but also has accociated checkpoint // restoredState is the same as the running state but also has accociated checkpoint
// information that maybe need destroyed when the container is stopped and destory is called. // information that maybe need destroyed when the container is stopped and destroy is called.
type restoredState struct { type restoredState struct {
imageDir string imageDir string
c *linuxContainer c *linuxContainer
@ -204,23 +229,23 @@ func (r *restoredState) destroy() error {
return destroy(r.c) return destroy(r.c)
} }
// createdState is used whenever a container is restored, loaded, or setting additional // loadedState is used whenever a container is restored, loaded, or setting additional
// processes inside and it should not be destroyed when it is exiting. // processes inside and it should not be destroyed when it is exiting.
type createdState struct { type loadedState struct {
c *linuxContainer c *linuxContainer
s Status s Status
} }
func (n *createdState) status() Status { func (n *loadedState) status() Status {
return n.s return n.s
} }
func (n *createdState) transition(s containerState) error { func (n *loadedState) transition(s containerState) error {
n.c.state = s n.c.state = s
return nil return nil
} }
func (n *createdState) destroy() error { func (n *loadedState) destroy() error {
if err := n.c.refreshState(); err != nil { if err := n.c.refreshState(); err != nil {
return err return err
} }

View File

@ -0,0 +1,7 @@
package libcontainer
// Stats carries a slice of NetworkInterface pointers and nothing else so far.
// NOTE(review): presumably a placeholder to be filled in for Solaris — confirm.
//
// Solaris - TODO
type Stats struct {
Interfaces []*NetworkInterface
}

View File

@ -100,17 +100,12 @@ func Setctty() error {
return nil return nil
} }
/* // RunningInUserNS detects whether we are currently running in a user namespace.
* Detect whether we are currently running in a user namespace. // Copied from github.com/lxc/lxd/shared/util.go
* Copied from github.com/lxc/lxd/shared/util.go
*/
func RunningInUserNS() bool { func RunningInUserNS() bool {
file, err := os.Open("/proc/self/uid_map") file, err := os.Open("/proc/self/uid_map")
if err != nil { if err != nil {
/* // This kernel-provided file only exists if user namespaces are supported
* This kernel-provided file only exists if user namespaces are
* supported
*/
return false return false
} }
defer file.Close() defer file.Close()

View File

@ -100,3 +100,22 @@ func SearchLabels(labels []string, query string) string {
} }
return "" return ""
} }
// Annotations splits the labels stored in the libcontainer state into the
// bundle path and the user defined annotations. Each label is expected to be
// a "key=value" string, split at the first "="; labels without an "=" are
// skipped. The "bundle" label is added by libcontainer itself, so it is
// returned separately and kept out of the annotation map.
func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
	userAnnotations = make(map[string]string)
	for _, label := range labels {
		kv := strings.SplitN(label, "=", 2)
		if len(kv) < 2 {
			// No separator: not a key/value label.
			continue
		}
		switch key, value := kv[0], kv[1]; key {
		case "bundle":
			bundle = value
		default:
			userAnnotations[key] = value
		}
	}
	return bundle, userAnnotations
}