diff --git a/go.mod b/go.mod index d64fa40723a..a4c37068a91 100644 --- a/go.mod +++ b/go.mod @@ -97,7 +97,7 @@ require ( github.com/onsi/gomega v1.7.0 github.com/opencontainers/go-digest v1.0.0-rc1 github.com/opencontainers/image-spec v1.0.1 // indirect - github.com/opencontainers/runc v1.0.0-rc2.0.20190611121236-6cc515888830 + github.com/opencontainers/runc v1.0.0-rc9 github.com/opencontainers/runtime-spec v1.0.0 // indirect github.com/opencontainers/selinux v1.2.2 github.com/pborman/uuid v1.2.0 @@ -367,7 +367,7 @@ replace ( github.com/onsi/gomega => github.com/onsi/gomega v1.7.0 github.com/opencontainers/go-digest => github.com/opencontainers/go-digest v1.0.0-rc1 github.com/opencontainers/image-spec => github.com/opencontainers/image-spec v1.0.1 - github.com/opencontainers/runc => github.com/opencontainers/runc v1.0.0-rc2.0.20190611121236-6cc515888830 + github.com/opencontainers/runc => github.com/opencontainers/runc v1.0.0-rc9 github.com/opencontainers/runtime-spec => github.com/opencontainers/runtime-spec v1.0.0 github.com/opencontainers/selinux => github.com/opencontainers/selinux v1.2.2 github.com/pborman/uuid => github.com/pborman/uuid v1.2.0 diff --git a/go.sum b/go.sum index c54c328e471..6758f1d5211 100644 --- a/go.sum +++ b/go.sum @@ -345,8 +345,8 @@ github.com/opencontainers/go-digest v1.0.0-rc1 h1:WzifXhOVOEOuFYOJAW6aQqW0TooG2i github.com/opencontainers/go-digest v1.0.0-rc1/go.mod h1:cMLVZDEM3+U2I4VmLI6N8jQYUd2OVphdqWwCJHrFt2s= github.com/opencontainers/image-spec v1.0.1 h1:JMemWkRwHx4Zj+fVxWoMCFm/8sYGGrUVojFA6h/TRcI= github.com/opencontainers/image-spec v1.0.1/go.mod h1:BtxoFyWECRxE4U/7sNtV5W15zMzWCbyJoFRP3s7yZA0= -github.com/opencontainers/runc v1.0.0-rc2.0.20190611121236-6cc515888830 h1:yvQ/2Pupw60ON8TYEIGGTAI77yZsWYkiOeHFZWkwlCk= -github.com/opencontainers/runc v1.0.0-rc2.0.20190611121236-6cc515888830/go.mod h1:qT5XzbpPznkRYVz/mWwUaVBUv2rmF59PVA73FjuZG0U= +github.com/opencontainers/runc v1.0.0-rc9 h1:/k06BMULKF5hidyoZymkoDCzdJzltZpz/UU4LguQVtc= +github.com/opencontainers/runc v1.0.0-rc9/go.mod h1:qT5XzbpPznkRYVz/mWwUaVBUv2rmF59PVA73FjuZG0U= github.com/opencontainers/runtime-spec v1.0.0 h1:O6L965K88AilqnxeYPks/75HLpp4IG+FjeSCI3cVdRg= github.com/opencontainers/runtime-spec v1.0.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/selinux v1.2.2 h1:Kx9J6eDG5/24A6DtUquGSpJQ+m2MUTahn4FtGEe8bFg= diff --git a/vendor/github.com/opencontainers/runc/libcontainer/README.md b/vendor/github.com/opencontainers/runc/libcontainer/README.md index 1d7fa04c082..a791ca2d249 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/README.md +++ b/vendor/github.com/opencontainers/runc/libcontainer/README.md @@ -261,6 +261,7 @@ process := &libcontainer.Process{ Stdin: os.Stdin, Stdout: os.Stdout, Stderr: os.Stderr, + Init: true, } err := container.Run(process) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go b/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go index 7fff0627fa1..debfc1e489e 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go @@ -6,6 +6,8 @@ import ( "fmt" "io/ioutil" "os" + + "github.com/opencontainers/runc/libcontainer/utils" ) // IsEnabled returns true if apparmor is enabled for the host. @@ -19,7 +21,7 @@ func IsEnabled() bool { return false } -func setprocattr(attr, value string) error { +func setProcAttr(attr, value string) error { // Under AppArmor you can only change your own attr, so use /proc/self/ // instead of /proc// like libapparmor does path := fmt.Sprintf("/proc/self/attr/%s", attr) @@ -30,6 +32,10 @@ func setprocattr(attr, value string) error { } defer f.Close() + if err := utils.EnsureProcHandle(f); err != nil { + return err + } + _, err = fmt.Fprintf(f, "%s", value) return err } @@ -37,7 +43,7 @@ func setprocattr(attr, value string) error { // changeOnExec reimplements aa_change_onexec from libapparmor in Go func changeOnExec(name string) error { value := "exec " + name - if err := setprocattr("exec", value); err != nil { + if err := setProcAttr("exec", value); err != nil { return fmt.Errorf("apparmor failed to apply profile: %s", err) } return nil diff --git a/vendor/github.com/opencontainers/runc/libcontainer/capabilities_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/capabilities_linux.go index 7c66f572580..9daef29e480 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/capabilities_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/capabilities_linux.go @@ -71,7 +71,11 @@ func newContainerCapList(capConfig *configs.Capabilities) (*containerCapabilitie } ambient = append(ambient, v) } - pid, err := capability.NewPid(0) + pid, err := capability.NewPid2(0) + if err != nil { + return nil, err + } + err = pid.Load() if err != nil { return nil, err } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/BUILD b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/BUILD index 8626e7a2360..9b113a196f9 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/BUILD +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/BUILD @@ -6,19 +6,25 @@ go_library( "apply_raw.go", "blkio.go", "cpu.go", + "cpu_v2.go", "cpuacct.go", "cpuset.go", + "cpuset_v2.go", "devices.go", "freezer.go", + "freezer_v2.go", "fs_unsupported.go", "hugetlb.go", + "io_v2.go", "kmem.go", "memory.go", + "memory_v2.go", "name.go", "net_cls.go", "net_prio.go", "perf_event.go", "pids.go", + "pids_v2.go", "utils.go", ], importmap = "k8s.io/kubernetes/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs", diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go index f672ba27377..512fd700104 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go @@ -18,7 +18,7 @@ import ( ) var ( - subsystems = subsystemSet{ + subsystemsLegacy = subsystemSet{ &CpusetGroup{}, &DevicesGroup{}, &MemoryGroup{}, @@ -33,6 +33,14 @@ var ( &FreezerGroup{}, &NameGroup{GroupName: "name=systemd", Join: true}, } + subsystemsUnified = subsystemSet{ + &CpusetGroupV2{}, + &FreezerGroupV2{}, + &CpuGroupV2{}, + &MemoryGroupV2{}, + &IOGroupV2{}, + &PidsGroupV2{}, + } HugePageSizes, _ = cgroups.GetHugePageSize() ) @@ -129,6 +137,13 @@ func isIgnorableError(rootless bool, err error) bool { return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES } +func (m *Manager) getSubsystems() subsystemSet { + if cgroups.IsCgroup2UnifiedMode() { + return subsystemsUnified + } + return subsystemsLegacy +} + func (m *Manager) Apply(pid int) (err error) { if m.Cgroups == nil { return nil @@ -158,7 +173,7 @@ func (m *Manager) Apply(pid int) (err error) { return cgroups.EnterPid(m.Paths, pid) } - for _, sys := range subsystems { + for _, sys := range m.getSubsystems() { // TODO: Apply should, ideally, be reentrant or be broken up into a separate // create and join phase so that the cgroup hierarchy for a container can be // created then join consists of writing the process pids to cgroup.procs @@ -214,7 +229,7 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) { defer m.mu.Unlock() stats := cgroups.NewStats() for name, path := range m.Paths { - sys, err := subsystems.Get(name) + sys, err := m.getSubsystems().Get(name) if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { continue } @@ -226,14 +241,18 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) { } func (m *Manager) Set(container *configs.Config) error { + if container.Cgroups == nil { + return nil + } + // If Paths are set, then we are just joining cgroups paths // and there is no need to set any values. - if m.Cgroups.Paths != nil { + if m.Cgroups != nil && m.Cgroups.Paths != nil { return nil } paths := m.GetPaths() - for _, sys := range subsystems { + for _, sys := range m.getSubsystems() { path := paths[sys.Name()] if err := sys.Set(path, container.Cgroups); err != nil { if m.Rootless && sys.Name() == "devices" { @@ -262,11 +281,15 @@ func (m *Manager) Set(container *configs.Config) error { // Freeze toggles the container's freezer cgroup depending on the state // provided func (m *Manager) Freeze(state configs.FreezerState) error { + if m.Cgroups == nil { + return errors.New("cannot toggle freezer: cgroups not configured for container") + } + paths := m.GetPaths() dir := paths["freezer"] prevState := m.Cgroups.Resources.Freezer m.Cgroups.Resources.Freezer = state - freezer, err := subsystems.Get("freezer") + freezer, err := m.getSubsystems().Get("freezer") if err != nil { return err } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu_v2.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu_v2.go new file mode 100644 index 00000000000..245071ae504 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu_v2.go @@ -0,0 +1,92 @@ +// +build linux + +package fs + +import ( + "bufio" + "os" + "path/filepath" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type CpuGroupV2 struct { +} + +func (s *CpuGroupV2) Name() string { + return "cpu" +} + +func (s *CpuGroupV2) Apply(d *cgroupData) error { + // We always want to join the cpu group, to allow fair cpu scheduling + // on a container basis + path, err := d.path("cpu") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return s.ApplyDir(path, d.config, d.pid) +} + +func (s *CpuGroupV2) ApplyDir(path string, cgroup *configs.Cgroup, pid int) error { + // This might happen if we have no cpu cgroup mounted. + // Just do nothing and don't fail. + if path == "" { + return nil + } + if err := os.MkdirAll(path, 0755); err != nil { + return err + } + return cgroups.WriteCgroupProc(path, pid) +} + +func (s *CpuGroupV2) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.CpuWeight != 0 { + if err := writeFile(path, "cpu.weight", strconv.FormatUint(cgroup.Resources.CpuWeight, 10)); err != nil { + return err + } + } + + if cgroup.Resources.CpuMax != "" { + if err := writeFile(path, "cpu.max", cgroup.Resources.CpuMax); err != nil { + return err + } + } + + return nil +} + +func (s *CpuGroupV2) Remove(d *cgroupData) error { + return removePath(d.path("cpu")) +} + +func (s *CpuGroupV2) GetStats(path string, stats *cgroups.Stats) error { + f, err := os.Open(filepath.Join(path, "cpu.stat")) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + t, v, err := getCgroupParamKeyValue(sc.Text()) + if err != nil { + return err + } + switch t { + case "usage_usec": + stats.CpuStats.CpuUsage.TotalUsage = v * 1000 + + case "user_usec": + stats.CpuStats.CpuUsage.UsageInUsermode = v * 1000 + + case "system_usec": + stats.CpuStats.CpuUsage.UsageInKernelmode = v * 1000 + } + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset_v2.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset_v2.go new file mode 100644 index 00000000000..fa88cfb7d6e --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset_v2.go @@ -0,0 +1,159 @@ +// +build linux + +package fs + +import ( + "bytes" + "fmt" + "io/ioutil" + "os" + "path/filepath" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" +) + +type CpusetGroupV2 struct { +} + +func (s *CpusetGroupV2) Name() string { + return "cpuset" +} + +func (s *CpusetGroupV2) Apply(d *cgroupData) error { + dir, err := d.path("cpuset") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return s.ApplyDir(dir, d.config, d.pid) +} + +func (s *CpusetGroupV2) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.CpusetCpus != "" { + if err := writeFile(path, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil { + return err + } + } + if cgroup.Resources.CpusetMems != "" { + if err := writeFile(path, "cpuset.mems", cgroup.Resources.CpusetMems); err != nil { + return err + } + } + return nil +} + +func (s *CpusetGroupV2) Remove(d *cgroupData) error { + return removePath(d.path("cpuset")) +} + +func (s *CpusetGroupV2) GetStats(path string, stats *cgroups.Stats) error { + return nil +} + +func (s *CpusetGroupV2) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) error { + // This might happen if we have no cpuset cgroup mounted. + // Just do nothing and don't fail. + if dir == "" { + return nil + } + mountInfo, err := ioutil.ReadFile("/proc/self/mountinfo") + if err != nil { + return err + } + root := filepath.Dir(cgroups.GetClosestMountpointAncestor(dir, string(mountInfo))) + // 'ensureParent' start with parent because we don't want to + // explicitly inherit from parent, it could conflict with + // 'cpuset.cpu_exclusive'. + if err := s.ensureParent(filepath.Dir(dir), root); err != nil { + return err + } + if err := os.MkdirAll(dir, 0755); err != nil { + return err + } + // We didn't inherit cpuset configs from parent, but we have + // to ensure cpuset configs are set before moving task into the + // cgroup. + // The logic is, if user specified cpuset configs, use these + // specified configs, otherwise, inherit from parent. This makes + // cpuset configs work correctly with 'cpuset.cpu_exclusive', and + // keep backward compatibility. + if err := s.ensureCpusAndMems(dir, cgroup); err != nil { + return err + } + + // because we are not using d.join we need to place the pid into the procs file + // unlike the other subsystems + return cgroups.WriteCgroupProc(dir, pid) +} + +func (s *CpusetGroupV2) getSubsystemSettings(parent string) (cpus []byte, mems []byte, err error) { + if cpus, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.cpus.effective")); err != nil { + return + } + if mems, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.mems.effective")); err != nil { + return + } + return cpus, mems, nil +} + +// ensureParent makes sure that the parent directory of current is created +// and populated with the proper cpus and mems files copied from +// it's parent. +func (s *CpusetGroupV2) ensureParent(current, root string) error { + parent := filepath.Dir(current) + if libcontainerUtils.CleanPath(parent) == root { + return nil + } + // Avoid infinite recursion. + if parent == current { + return fmt.Errorf("cpuset: cgroup parent path outside cgroup root") + } + if err := s.ensureParent(parent, root); err != nil { + return err + } + if err := os.MkdirAll(current, 0755); err != nil { + return err + } + return s.copyIfNeeded(current, parent) +} + +// copyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent +// directory to the current directory if the file's contents are 0 +func (s *CpusetGroupV2) copyIfNeeded(current, parent string) error { + var ( + err error + currentCpus, currentMems []byte + parentCpus, parentMems []byte + ) + + if currentCpus, currentMems, err = s.getSubsystemSettings(current); err != nil { + return err + } + if parentCpus, parentMems, err = s.getSubsystemSettings(parent); err != nil { + return err + } + + if s.isEmpty(currentCpus) { + if err := writeFile(current, "cpuset.cpus", string(parentCpus)); err != nil { + return err + } + } + if s.isEmpty(currentMems) { + if err := writeFile(current, "cpuset.mems", string(parentMems)); err != nil { + return err + } + } + return nil +} + +func (s *CpusetGroupV2) isEmpty(b []byte) bool { + return len(bytes.Trim(b, "\n")) == 0 +} + +func (s *CpusetGroupV2) ensureCpusAndMems(path string, cgroup *configs.Cgroup) error { + if err := s.Set(path, cgroup); err != nil { + return err + } + return s.copyIfNeeded(path, filepath.Dir(path)) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer_v2.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer_v2.go new file mode 100644 index 00000000000..186de9ab41f --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer_v2.go @@ -0,0 +1,74 @@ +// +build linux + +package fs + +import ( + "fmt" + "strings" + "time" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type FreezerGroupV2 struct { +} + +func (s *FreezerGroupV2) Name() string { + return "freezer" +} + +func (s *FreezerGroupV2) Apply(d *cgroupData) error { + _, err := d.join("freezer") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *FreezerGroupV2) Set(path string, cgroup *configs.Cgroup) error { + var desiredState string + filename := "cgroup.freeze" + if cgroup.Resources.Freezer == configs.Frozen { + desiredState = "1" + } else { + desiredState = "0" + } + + switch cgroup.Resources.Freezer { + case configs.Frozen, configs.Thawed: + for { + // In case this loop does not exit because it doesn't get the expected + // state, let's write again this state, hoping it's going to be properly + // set this time. Otherwise, this loop could run infinitely, waiting for + // a state change that would never happen. + if err := writeFile(path, filename, desiredState); err != nil { + return err + } + + state, err := readFile(path, filename) + if err != nil { + return err + } + if strings.TrimSpace(state) == desiredState { + break + } + + time.Sleep(1 * time.Millisecond) + } + case configs.Undefined: + return nil + default: + return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Resources.Freezer)) + } + + return nil +} + +func (s *FreezerGroupV2) Remove(d *cgroupData) error { + return removePath(d.path("freezer")) +} + +func (s *FreezerGroupV2) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/io_v2.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/io_v2.go new file mode 100644 index 00000000000..84b1829451a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/io_v2.go @@ -0,0 +1,191 @@ +// +build linux + +package fs + +import ( + "bufio" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type IOGroupV2 struct { +} + +func (s *IOGroupV2) Name() string { + return "blkio" +} + +func (s *IOGroupV2) Apply(d *cgroupData) error { + _, err := d.join("blkio") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *IOGroupV2) Set(path string, cgroup *configs.Cgroup) error { + cgroupsv2 := cgroups.IsCgroup2UnifiedMode() + + if cgroup.Resources.BlkioWeight != 0 { + filename := "blkio.weight" + if cgroupsv2 { + filename = "io.bfq.weight" + } + if err := writeFile(path, filename, strconv.FormatUint(uint64(cgroup.Resources.BlkioWeight), 10)); err != nil { + return err + } + } + + if cgroup.Resources.BlkioLeafWeight != 0 { + if err := writeFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioLeafWeight), 10)); err != nil { + return err + } + } + for _, wd := range cgroup.Resources.BlkioWeightDevice { + if err := writeFile(path, "blkio.weight_device", wd.WeightString()); err != nil { + return err + } + if err := writeFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil { + return err + } + } + for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice { + if cgroupsv2 { + if err := writeFile(path, "io.max", td.StringName("rbps")); err != nil { + return err + } + } else { + if err := writeFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil { + return err + } + } + } + for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice { + if cgroupsv2 { + if err := writeFile(path, "io.max", td.StringName("wbps")); err != nil { + return err + } + } else { + if err := writeFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil { + return err + } + } + } + for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice { + if cgroupsv2 { + if err := writeFile(path, "io.max", td.StringName("riops")); err != nil { + return err + } + } else { + if err := writeFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil { + return err + } + } + } + for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice { + if cgroupsv2 { + if err := writeFile(path, "io.max", td.StringName("wiops")); err != nil { + return err + } + } else { + if err := writeFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil { + return err + } + } + } + + return nil +} + +func (s *IOGroupV2) Remove(d *cgroupData) error { + return removePath(d.path("blkio")) +} + +func readCgroup2MapFile(path string, name string) (map[string][]string, error) { + ret := map[string][]string{} + p := filepath.Join("/sys/fs/cgroup", path, name) + f, err := os.Open(p) + if err != nil { + if os.IsNotExist(err) { + return ret, nil + } + return nil, err + } + defer f.Close() + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := scanner.Text() + parts := strings.Fields(line) + if len(parts) < 2 { + continue + } + ret[parts[0]] = parts[1:] + } + if err := scanner.Err(); err != nil { + return nil, err + } + return ret, nil +} + +func (s *IOGroupV2) getCgroupV2Stats(path string, stats *cgroups.Stats) error { + // more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt + var ioServiceBytesRecursive []cgroups.BlkioStatEntry + values, err := readCgroup2MapFile(path, "io.stat") + if err != nil { + return err + } + for k, v := range values { + d := strings.Split(k, ":") + if len(d) != 2 { + continue + } + minor, err := strconv.ParseUint(d[0], 10, 0) + if err != nil { + return err + } + major, err := strconv.ParseUint(d[1], 10, 0) + if err != nil { + return err + } + + for _, item := range v { + d := strings.Split(item, "=") + if len(d) != 2 { + continue + } + op := d[0] + + // Accommodate the cgroup v1 naming + switch op { + case "rbytes": + op = "read" + case "wbytes": + op = "write" + } + + value, err := strconv.ParseUint(d[1], 10, 0) + if err != nil { + return err + } + + entry := cgroups.BlkioStatEntry{ + Op: op, + Major: major, + Minor: minor, + Value: value, + } + ioServiceBytesRecursive = append(ioServiceBytesRecursive, entry) + } + } + stats.BlkioStats = cgroups.BlkioStats{IoServiceBytesRecursive: ioServiceBytesRecursive} + return nil +} + +func (s *IOGroupV2) GetStats(path string, stats *cgroups.Stats) error { + return s.getCgroupV2Stats(path, stats) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory_v2.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory_v2.go new file mode 100644 index 00000000000..2ad997bcc29 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory_v2.go @@ -0,0 +1,164 @@ +// +build linux + +package fs + +import ( + "bufio" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type MemoryGroupV2 struct { +} + +func (s *MemoryGroupV2) Name() string { + return "memory" +} + +func (s *MemoryGroupV2) Apply(d *cgroupData) (err error) { + path, err := d.path("memory") + if err != nil && !cgroups.IsNotFound(err) { + return err + } else if path == "" { + return nil + } + if memoryAssigned(d.config) { + if _, err := os.Stat(path); os.IsNotExist(err) { + if err := os.MkdirAll(path, 0755); err != nil { + return err + } + // Only enable kernel memory accouting when this cgroup + // is created by libcontainer, otherwise we might get + // error when people use `cgroupsPath` to join an existed + // cgroup whose kernel memory is not initialized. + if err := EnableKernelMemoryAccounting(path); err != nil { + return err + } + } + } + defer func() { + if err != nil { + os.RemoveAll(path) + } + }() + + // We need to join memory cgroup after set memory limits, because + // kmem.limit_in_bytes can only be set when the cgroup is empty. + _, err = d.join("memory") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func setMemoryAndSwapCgroups(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.MemorySwap != 0 { + if err := writeFile(path, "memory.swap.max", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { + return err + } + } + if cgroup.Resources.Memory != 0 { + if err := writeFile(path, "memory.max", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { + return err + } + } + return nil +} + +func (s *MemoryGroupV2) Set(path string, cgroup *configs.Cgroup) error { + + if err := setMemoryAndSwapCgroups(path, cgroup); err != nil { + return err + } + + if cgroup.Resources.KernelMemory != 0 { + if err := setKernelMemory(path, cgroup.Resources.KernelMemory); err != nil { + return err + } + } + + if cgroup.Resources.MemoryReservation != 0 { + if err := writeFile(path, "memory.high", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil { + return err + } + } + + return nil +} + +func (s *MemoryGroupV2) Remove(d *cgroupData) error { + return removePath(d.path("memory")) +} + +func (s *MemoryGroupV2) GetStats(path string, stats *cgroups.Stats) error { + // Set stats from memory.stat. + statsFile, err := os.Open(filepath.Join(path, "memory.stat")) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer statsFile.Close() + + sc := bufio.NewScanner(statsFile) + for sc.Scan() { + t, v, err := getCgroupParamKeyValue(sc.Text()) + if err != nil { + return fmt.Errorf("failed to parse memory.stat (%q) - %v", sc.Text(), err) + } + stats.MemoryStats.Stats[t] = v + } + stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"] + + memoryUsage, err := getMemoryDataV2(path, "") + if err != nil { + return err + } + stats.MemoryStats.Usage = memoryUsage + swapUsage, err := getMemoryDataV2(path, "swap") + if err != nil { + return err + } + stats.MemoryStats.SwapUsage = swapUsage + + stats.MemoryStats.UseHierarchy = true + return nil +} + +func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) { + memoryData := cgroups.MemoryData{} + + moduleName := "memory" + if name != "" { + moduleName = strings.Join([]string{"memory", name}, ".") + } + usage := strings.Join([]string{moduleName, "current"}, ".") + limit := strings.Join([]string{moduleName, "max"}, ".") + + value, err := getCgroupParamUint(path, usage) + if err != nil { + if moduleName != "memory" && os.IsNotExist(err) { + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", usage, err) + } + memoryData.Usage = value + + value, err = getCgroupParamUint(path, limit) + if err != nil { + if moduleName != "memory" && os.IsNotExist(err) { + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err) + } + memoryData.Limit = value + + return memoryData, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids_v2.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids_v2.go new file mode 100644 index 00000000000..3413a2a0d18 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids_v2.go @@ -0,0 +1,107 @@ +// +build linux + +package fs + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "golang.org/x/sys/unix" +) + +type PidsGroupV2 struct { +} + +func (s *PidsGroupV2) Name() string { + return "pids" +} + +func (s *PidsGroupV2) Apply(d *cgroupData) error { + _, err := d.join("pids") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *PidsGroupV2) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.PidsLimit != 0 { + // "max" is the fallback value. + limit := "max" + + if cgroup.Resources.PidsLimit > 0 { + limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10) + } + + if err := writeFile(path, "pids.max", limit); err != nil { + return err + } + } + + return nil +} + +func (s *PidsGroupV2) Remove(d *cgroupData) error { + return removePath(d.path("pids")) +} + +func isNOTSUP(err error) bool { + switch err := err.(type) { + case *os.PathError: + return err.Err == unix.ENOTSUP + default: + return false + } +} + +func (s *PidsGroupV2) GetStats(path string, stats *cgroups.Stats) error { + current, err := getCgroupParamUint(path, "pids.current") + if os.IsNotExist(err) { + // if the controller is not enabled, let's read the list + // PIDs (or threads if cgroup.threads is enabled) + contents, err := ioutil.ReadFile(filepath.Join(path, "cgroup.procs")) + if err != nil && isNOTSUP(err) { + contents, err = ioutil.ReadFile(filepath.Join(path, "cgroup.threads")) + } + if err != nil { + return err + } + pids := make(map[string]string) + for _, i := range strings.Split(string(contents), "\n") { + if i != "" { + pids[i] = i + } + } + stats.PidsStats.Current = uint64(len(pids)) + stats.PidsStats.Limit = 0 + return nil + + } + if err != nil { + return fmt.Errorf("failed to parse pids.current - %s", err) + } + + maxString, err := getCgroupParamString(path, "pids.max") + if err != nil { + return fmt.Errorf("failed to parse pids.max - %s", err) + } + + // Default if pids.max == "max" is 0 -- which represents "no limit". + var max uint64 + if maxString != "max" { + max, err = parseUint(maxString, 10, 64) + if err != nil { + return fmt.Errorf("failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q", maxString, filepath.Join(path, "pids.max")) + } + } + + stats.PidsStats.Current = current + stats.PidsStats.Limit = max + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/utils.go index 5ff0a161504..30922777f53 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/utils.go @@ -6,6 +6,7 @@ import ( "errors" "fmt" "io/ioutil" + "math" "path/filepath" "strconv" "strings" @@ -59,8 +60,12 @@ func getCgroupParamUint(cgroupPath, cgroupFile string) (uint64, error) { if err != nil { return 0, err } + trimmed := strings.TrimSpace(string(contents)) + if trimmed == "max" { + return math.MaxUint64, nil + } - res, err := parseUint(strings.TrimSpace(string(contents)), 10, 64) + res, err := parseUint(trimmed, 10, 64) if err != nil { return res, fmt.Errorf("unable to parse %q as a uint from Cgroup file %q", string(contents), fileName) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/BUILD b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/BUILD index 60c142d474f..3e021bbcca5 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/BUILD +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/BUILD @@ -5,6 +5,7 @@ go_library( srcs = [ "apply_nosystemd.go", "apply_systemd.go", + "unified_hierarchy.go", ], importmap = "k8s.io/kubernetes/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd", importpath = "github.com/opencontainers/runc/libcontainer/cgroups/systemd", @@ -28,7 +29,6 @@ go_library( ], "@io_bazel_rules_go//go/platform:linux": [ "//vendor/github.com/coreos/go-systemd/dbus:go_default_library", - "//vendor/github.com/coreos/go-systemd/util:go_default_library", "//vendor/github.com/godbus/dbus:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs:go_default_library", diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go index 3bf723bf964..f274a49a3c1 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go @@ -14,7 +14,6 @@ import ( "time" systemdDbus "github.com/coreos/go-systemd/dbus" - systemdUtil "github.com/coreos/go-systemd/util" "github.com/godbus/dbus" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs" @@ -22,7 +21,7 @@ import ( "github.com/sirupsen/logrus" ) -type Manager struct { +type LegacyManager struct { mu sync.Mutex Cgroups *configs.Cgroup Paths map[string]string @@ -50,7 +49,7 @@ func (s subsystemSet) Get(name string) (subsystem, error) { return nil, errSubsystemDoesNotExist } -var subsystems = subsystemSet{ +var legacySubsystems = subsystemSet{ &fs.CpusetGroup{}, &fs.DevicesGroup{}, &fs.MemoryGroup{}, @@ -72,11 +71,8 @@ const ( ) var ( - connLock sync.Mutex - theConn *systemdDbus.Conn - hasStartTransientUnit bool - hasStartTransientSliceUnit bool - hasDelegateSlice bool + connLock sync.Mutex + theConn *systemdDbus.Conn ) func newProp(name string, units interface{}) systemdDbus.Property { @@ -86,8 +82,23 @@ func newProp(name string, units interface{}) systemdDbus.Property { } } +// NOTE: This function comes from package github.com/coreos/go-systemd/util +// It was borrowed here to avoid a dependency on cgo. +// +// IsRunningSystemd checks whether the host was booted with systemd as its init +// system. This functions similarly to systemd's `sd_booted(3)`: internally, it +// checks whether /run/systemd/system/ exists and is a directory. +// http://www.freedesktop.org/software/systemd/man/sd_booted.html +func isRunningSystemd() bool { + fi, err := os.Lstat("/run/systemd/system") + if err != nil { + return false + } + return fi.IsDir() +} + func UseSystemd() bool { - if !systemdUtil.IsRunningSystemd() { + if !isRunningSystemd() { return false } @@ -100,82 +111,31 @@ func UseSystemd() bool { if err != nil { return false } - - // Assume we have StartTransientUnit - hasStartTransientUnit = true - - // But if we get UnknownMethod error we don't - if _, err := theConn.StartTransientUnit("test.scope", "invalid", nil, nil); err != nil { - if dbusError, ok := err.(dbus.Error); ok { - if dbusError.Name == "org.freedesktop.DBus.Error.UnknownMethod" { - hasStartTransientUnit = false - return hasStartTransientUnit - } - } - } - - // Assume we have the ability to start a transient unit as a slice - // This was broken until systemd v229, but has been back-ported on RHEL environments >= 219 - // For details, see: https://bugzilla.redhat.com/show_bug.cgi?id=1370299 - hasStartTransientSliceUnit = true - - // To ensure simple clean-up, we create a slice off the root with no hierarchy - slice := fmt.Sprintf("libcontainer_%d_systemd_test_default.slice", os.Getpid()) - if _, err := theConn.StartTransientUnit(slice, "replace", nil, nil); err != nil { - if _, ok := err.(dbus.Error); ok { - hasStartTransientSliceUnit = false - } - } - - for i := 0; i <= testSliceWait; i++ { - if _, err := theConn.StopUnit(slice, "replace", nil); err != nil { - if dbusError, ok := err.(dbus.Error); ok { - if strings.Contains(dbusError.Name, "org.freedesktop.systemd1.NoSuchUnit") { - hasStartTransientSliceUnit = false - break - } - } - } else { - break - } - time.Sleep(time.Millisecond) - } - - // Not critical because of the stop unit logic above. - theConn.StopUnit(slice, "replace", nil) - - // Assume StartTransientUnit on a slice allows Delegate - hasDelegateSlice = true - dlSlice := newProp("Delegate", true) - if _, err := theConn.StartTransientUnit(slice, "replace", []systemdDbus.Property{dlSlice}, nil); err != nil { - if dbusError, ok := err.(dbus.Error); ok { - // Starting with systemd v237, Delegate is not even a property of slices anymore, - // so the D-Bus call fails with "InvalidArgs" error. - if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") || strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.InvalidArgs") { - hasDelegateSlice = false - } - } - } - - // Not critical because of the stop unit logic above. - theConn.StopUnit(slice, "replace", nil) } - return hasStartTransientUnit + return true } func NewSystemdCgroupsManager() (func(config *configs.Cgroup, paths map[string]string) cgroups.Manager, error) { - if !systemdUtil.IsRunningSystemd() { + if !isRunningSystemd() { return nil, fmt.Errorf("systemd not running on this host, can't use systemd as a cgroups.Manager") } + if cgroups.IsCgroup2UnifiedMode() { + return func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + return &UnifiedManager{ + Cgroups: config, + Paths: paths, + } + }, nil + } return func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { - return &Manager{ + return &LegacyManager{ Cgroups: config, Paths: paths, } }, nil } -func (m *Manager) Apply(pid int) error { +func (m *LegacyManager) Apply(pid int) error { var ( c = m.Cgroups unitName = getUnitName(c) @@ -208,10 +168,6 @@ func (m *Manager) Apply(pid int) error { // if we create a slice, the parent is defined via a Wants= if strings.HasSuffix(unitName, ".slice") { - // This was broken until systemd v229, but has been back-ported on RHEL environments >= 219 - if !hasStartTransientSliceUnit { - return fmt.Errorf("systemd version does not support ability to start a slice as transient unit") - } properties = append(properties, systemdDbus.PropWants(slice)) } else { // otherwise, we use Slice= @@ -224,12 +180,7 @@ func (m *Manager) Apply(pid int) error { } // Check if we can delegate. This is only supported on systemd versions 218 and above. - if strings.HasSuffix(unitName, ".slice") { - if hasDelegateSlice { - // systemd 237 and above no longer allows delegation on a slice - properties = append(properties, newProp("Delegate", true)) - } - } else { + if !strings.HasSuffix(unitName, ".slice") { // Assume scopes always support delegation. properties = append(properties, newProp("Delegate", true)) } @@ -310,7 +261,7 @@ func (m *Manager) Apply(pid int) error { } paths := make(map[string]string) - for _, s := range subsystems { + for _, s := range legacySubsystems { subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name()) if err != nil { // Don't fail if a cgroup hierarchy was not found, just skip this subsystem @@ -325,7 +276,7 @@ func (m *Manager) Apply(pid int) error { return nil } -func (m *Manager) Destroy() error { +func (m *LegacyManager) Destroy() error { if m.Cgroups.Paths != nil { return nil } @@ -339,7 +290,7 @@ func (m *Manager) Destroy() error { return nil } -func (m *Manager) GetPaths() map[string]string { +func (m *LegacyManager) GetPaths() map[string]string { m.mu.Lock() paths := m.Paths m.mu.Unlock() @@ -351,6 +302,7 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) { if err != nil { return "", err } + if err := os.MkdirAll(path, 0755); err != nil { return "", err } @@ -361,7 +313,7 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) { } func joinCgroups(c *configs.Cgroup, pid int) error { - for _, sys := range subsystems { + for _, sys := range legacySubsystems { name := sys.Name() switch name { case "name=systemd": @@ -456,14 +408,14 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil } -func (m *Manager) Freeze(state configs.FreezerState) error { +func (m *LegacyManager) Freeze(state configs.FreezerState) error { path, err := getSubsystemPath(m.Cgroups, "freezer") if err != nil { return err } prevState := m.Cgroups.Resources.Freezer m.Cgroups.Resources.Freezer = state - freezer, err := subsystems.Get("freezer") + freezer, err := legacySubsystems.Get("freezer") if err != nil { return err } @@ -475,7 +427,7 @@ func (m *Manager) Freeze(state configs.FreezerState) error { return nil } -func (m *Manager) GetPids() ([]int, error) { +func (m *LegacyManager) GetPids() ([]int, error) { path, err := getSubsystemPath(m.Cgroups, "devices") if err != nil { return nil, err @@ -483,7 +435,7 @@ func (m *Manager) GetPids() ([]int, error) { return cgroups.GetPids(path) } -func (m *Manager) GetAllPids() ([]int, error) { +func (m *LegacyManager) GetAllPids() ([]int, error) { path, err := getSubsystemPath(m.Cgroups, "devices") if err != nil { return nil, err @@ -491,12 +443,12 @@ func (m *Manager) GetAllPids() ([]int, error) { return cgroups.GetAllPids(path) } -func (m *Manager) GetStats() (*cgroups.Stats, error) { +func (m *LegacyManager) GetStats() (*cgroups.Stats, error) { m.mu.Lock() defer m.mu.Unlock() stats := cgroups.NewStats() for name, path := range m.Paths { - sys, err := subsystems.Get(name) + sys, err := legacySubsystems.Get(name) if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { continue } @@ -508,13 +460,13 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) { return stats, nil } -func (m *Manager) Set(container *configs.Config) error { +func (m *LegacyManager) Set(container *configs.Config) error { // If Paths are set, then we are just joining cgroups paths // and there is no need to set any values. if m.Cgroups.Paths != nil { return nil } - for _, sys := range subsystems { + for _, sys := range legacySubsystems { // Get the subsystem path, but don't error out for not found cgroups. path, err := getSubsystemPath(container.Cgroups, sys.Name()) if err != nil && !cgroups.IsNotFound(err) { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/unified_hierarchy.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/unified_hierarchy.go new file mode 100644 index 00000000000..d394e3a5a8f --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/unified_hierarchy.go @@ -0,0 +1,329 @@ +// +build linux,!static_build + +package systemd + +import ( + "fmt" + "io/ioutil" + "math" + "os" + "path/filepath" + "strings" + "sync" + "time" + + systemdDbus "github.com/coreos/go-systemd/dbus" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/sirupsen/logrus" +) + +type UnifiedManager struct { + mu sync.Mutex + Cgroups *configs.Cgroup + Paths map[string]string +} + +var unifiedSubsystems = subsystemSet{ + &fs.CpusetGroupV2{}, + &fs.FreezerGroupV2{}, + &fs.CpuGroupV2{}, + &fs.MemoryGroupV2{}, + &fs.IOGroupV2{}, + &fs.PidsGroupV2{}, +} + +func (m *UnifiedManager) Apply(pid int) error { + var ( + c = m.Cgroups + unitName = getUnitName(c) + slice = "system.slice" + properties []systemdDbus.Property + ) + + if c.Paths != nil { + paths := make(map[string]string) + for name, path := range c.Paths { + _, err := getSubsystemPath(m.Cgroups, name) + if err != nil { + // Don't fail if a cgroup hierarchy was not found, just skip this subsystem + if cgroups.IsNotFound(err) { + continue + } + return err + } + paths[name] = path + } + m.Paths = paths + return cgroups.EnterPid(m.Paths, pid) + } + + if c.Parent != "" { + slice = c.Parent + } + + properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) + + // if we create a slice, the parent is defined via a Wants= + if strings.HasSuffix(unitName, ".slice") { + properties = append(properties, systemdDbus.PropWants(slice)) + } else { + // otherwise, we use Slice= + properties = append(properties, systemdDbus.PropSlice(slice)) + } + + // only add pid if its valid, -1 is used w/ general slice creation. + if pid != -1 { + properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) + } + + // Check if we can delegate. This is only supported on systemd versions 218 and above. + if !strings.HasSuffix(unitName, ".slice") { + // Assume scopes always support delegation. + properties = append(properties, newProp("Delegate", true)) + } + + // Always enable accounting, this gets us the same behaviour as the fs implementation, + // plus the kernel has some problems with joining the memory cgroup at a later time. + properties = append(properties, + newProp("MemoryAccounting", true), + newProp("CPUAccounting", true), + newProp("BlockIOAccounting", true)) + + // Assume DefaultDependencies= will always work (the check for it was previously broken.) + properties = append(properties, + newProp("DefaultDependencies", false)) + + if c.Resources.Memory != 0 { + properties = append(properties, + newProp("MemoryLimit", uint64(c.Resources.Memory))) + } + + if c.Resources.CpuShares != 0 { + properties = append(properties, + newProp("CPUShares", c.Resources.CpuShares)) + } + + // cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd. + if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 { + // corresponds to USEC_INFINITY in systemd + // if USEC_INFINITY is provided, CPUQuota is left unbound by systemd + // always setting a property value ensures we can apply a quota and remove it later + cpuQuotaPerSecUSec := uint64(math.MaxUint64) + if c.Resources.CpuQuota > 0 { + // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota + // (integer percentage of CPU) internally. This means that if a fractional percent of + // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest + // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. + cpuQuotaPerSecUSec = uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod + if cpuQuotaPerSecUSec%10000 != 0 { + cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 + } + } + properties = append(properties, + newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) + } + + if c.Resources.BlkioWeight != 0 { + properties = append(properties, + newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight))) + } + + if c.Resources.PidsLimit > 0 { + properties = append(properties, + newProp("TasksAccounting", true), + newProp("TasksMax", uint64(c.Resources.PidsLimit))) + } + + // We have to set kernel memory here, as we can't change it once + // processes have been attached to the cgroup. + if c.Resources.KernelMemory != 0 { + if err := setKernelMemory(c); err != nil { + return err + } + } + + statusChan := make(chan string, 1) + if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil { + select { + case <-statusChan: + case <-time.After(time.Second): + logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName) + } + } else if !isUnitExists(err) { + return err + } + + if err := joinCgroupsV2(c, pid); err != nil { + return err + } + + paths := make(map[string]string) + for _, s := range unifiedSubsystems { + subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name()) + if err != nil { + // Don't fail if a cgroup hierarchy was not found, just skip this subsystem + if cgroups.IsNotFound(err) { + continue + } + return err + } + paths[s.Name()] = subsystemPath + } + m.Paths = paths + return nil +} + +func (m *UnifiedManager) Destroy() error { + if m.Cgroups.Paths != nil { + return nil + } + m.mu.Lock() + defer m.mu.Unlock() + theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil) + if err := cgroups.RemovePaths(m.Paths); err != nil { + return err + } + m.Paths = make(map[string]string) + return nil +} + +func (m *UnifiedManager) GetPaths() map[string]string { + m.mu.Lock() + paths := m.Paths + m.mu.Unlock() + return paths +} + +func createCgroupsv2Path(path string) (Err error) { + content, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers") + if err != nil { + return err + } + if !filepath.HasPrefix(path, "/sys/fs/cgroup") { + return fmt.Errorf("invalid cgroup path %s", path) + } + + res := "" + for i, c := range strings.Split(strings.TrimSpace(string(content)), " ") { + if i == 0 { + res = fmt.Sprintf("+%s", c) + } else { + res = res + fmt.Sprintf(" +%s", c) + } + } + resByte := []byte(res) + + current := "/sys/fs" + elements := strings.Split(path, "/") + for i, e := range elements[3:] { + current = filepath.Join(current, e) + if i > 0 { + if err := os.Mkdir(current, 0755); err != nil { + if !os.IsExist(err) { + return err + } + } else { + // If the directory was created, be sure it is not left around on errors. + defer func() { + if Err != nil { + os.Remove(current) + } + }() + } + } + if i < len(elements[3:])-1 { + if err := ioutil.WriteFile(filepath.Join(current, "cgroup.subtree_control"), resByte, 0755); err != nil { + return err + } + } + } + return nil +} + +func joinCgroupsV2(c *configs.Cgroup, pid int) error { + path, err := getSubsystemPath(c, "memory") + if err != nil { + return err + } + return createCgroupsv2Path(path) +} + +func (m *UnifiedManager) Freeze(state configs.FreezerState) error { + path, err := getSubsystemPath(m.Cgroups, "freezer") + if err != nil { + return err + } + prevState := m.Cgroups.Resources.Freezer + m.Cgroups.Resources.Freezer = state + freezer, err := unifiedSubsystems.Get("freezer") + if err != nil { + return err + } + err = freezer.Set(path, m.Cgroups) + if err != nil { + m.Cgroups.Resources.Freezer = prevState + return err + } + return nil +} + +func (m *UnifiedManager) GetPids() ([]int, error) { + path, err := getSubsystemPath(m.Cgroups, "devices") + if err != nil { + return nil, err + } + return cgroups.GetPids(path) +} + +func (m *UnifiedManager) GetAllPids() ([]int, error) { + path, err := getSubsystemPath(m.Cgroups, "devices") + if err != nil { + return nil, err + } + return cgroups.GetAllPids(path) +} + +func (m *UnifiedManager) GetStats() (*cgroups.Stats, error) { + m.mu.Lock() + defer m.mu.Unlock() + stats := cgroups.NewStats() + for name, path := range m.Paths { + sys, err := unifiedSubsystems.Get(name) + if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { + continue + } + if err := sys.GetStats(path, stats); err != nil { + return nil, err + } + } + + return stats, nil +} + +func (m *UnifiedManager) Set(container *configs.Config) error { + // If Paths are set, then we are just joining cgroups paths + // and there is no need to set any values. + if m.Cgroups.Paths != nil { + return nil + } + for _, sys := range unifiedSubsystems { + // Get the subsystem path, but don't error out for not found cgroups. + path, err := getSubsystemPath(container.Cgroups, sys.Name()) + if err != nil && !cgroups.IsNotFound(err) { + return err + } + + if err := sys.Set(path, container.Cgroups); err != nil { + return err + } + } + + if m.Paths["cpu"] != "" { + if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil { + return err + } + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go index ec79ae76723..60790f83b4e 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go @@ -11,6 +11,8 @@ import ( "path/filepath" "strconv" "strings" + "sync" + "syscall" "time" units "github.com/docker/go-units" @@ -22,6 +24,11 @@ const ( CgroupProcesses = "cgroup.procs" ) +var ( + isUnifiedOnce sync.Once + isUnified bool +) + // HugePageSizeUnitList is a list of the units used by the linux kernel when // naming the HugePage control files. // https://www.kernel.org/doc/Documentation/cgroup-v1/hugetlb.txt @@ -29,6 +36,18 @@ const ( // depends on https://github.com/docker/go-units/commit/a09cd47f892041a4fac473133d181f5aea6fa393 var HugePageSizeUnitList = []string{"B", "KB", "MB", "GB", "TB", "PB"} +// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode. +func IsCgroup2UnifiedMode() bool { + isUnifiedOnce.Do(func() { + var st syscall.Statfs_t + if err := syscall.Statfs("/sys/fs/cgroup", &st); err != nil { + panic("cannot statfs cgroup root") + } + isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC + }) + return isUnified +} + // https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) { mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem) @@ -49,6 +68,10 @@ func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, } defer f.Close() + if IsCgroup2UnifiedMode() { + subsystem = "" + } + return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem) } @@ -57,12 +80,12 @@ func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsyst for scanner.Scan() { txt := scanner.Text() fields := strings.Fields(txt) - if len(fields) < 5 { + if len(fields) < 9 { continue } if strings.HasPrefix(fields[4], cgroupPath) { for _, opt := range strings.Split(fields[len(fields)-1], ",") { - if opt == subsystem { + if (subsystem == "" && fields[9] == "cgroup2") || opt == subsystem { return fields[4], fields[3], nil } } @@ -76,6 +99,19 @@ func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsyst } func isSubsystemAvailable(subsystem string) bool { + if IsCgroup2UnifiedMode() { + controllers, err := GetAllSubsystems() + if err != nil { + return false + } + for _, c := range controllers { + if c == subsystem { + return true + } + } + return false + } + cgroups, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { return false @@ -120,7 +156,7 @@ func FindCgroupMountpointDir() (string, error) { return "", fmt.Errorf("Found no fields post '-' in %q", text) } - if postSeparatorFields[0] == "cgroup" { + if postSeparatorFields[0] == "cgroup" || postSeparatorFields[0] == "cgroup2" { // Check that the mount is properly formatted. if numPostFields < 3 { return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) @@ -193,6 +229,19 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, // GetCgroupMounts returns the mounts for the cgroup subsystems. // all indicates whether to return just the first instance or all the mounts. func GetCgroupMounts(all bool) ([]Mount, error) { + if IsCgroup2UnifiedMode() { + availableControllers, err := GetAllSubsystems() + if err != nil { + return nil, err + } + m := Mount{ + Mountpoint: "/sys/fs/cgroup", + Root: "/sys/fs/cgroup", + Subsystems: availableControllers, + } + return []Mount{m}, nil + } + f, err := os.Open("/proc/self/mountinfo") if err != nil { return nil, err @@ -356,6 +405,9 @@ func parseCgroupFromReader(r io.Reader) (map[string]string, error) { } func getControllerPath(subsystem string, cgroups map[string]string) (string, error) { + if IsCgroup2UnifiedMode() { + return "/", nil + } if p, ok := cgroups[subsystem]; ok { return p, nil diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/BUILD b/vendor/github.com/opencontainers/runc/libcontainer/configs/BUILD index 82b8b3cbe8d..44338e3ab19 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/BUILD +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/BUILD @@ -5,7 +5,7 @@ go_library( srcs = [ "blkio_device.go", "cgroup_linux.go", - "cgroup_windows.go", + "cgroup_unsupported.go", "config.go", "config_linux.go", "device.go", diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go index e0f3ca16533..fa195bf90f8 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go @@ -59,3 +59,8 @@ func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice { func (td *ThrottleDevice) String() string { return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate) } + +// StringName formats the struct to be writable to the cgroup specific file +func (td *ThrottleDevice) StringName(name string) string { + return fmt.Sprintf("%d:%d %s=%d", td.Major, td.Minor, name, td.Rate) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go index e15a662f522..58ed19c9e78 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go @@ -119,4 +119,12 @@ type Resources struct { // Set class identifier for container's network packets NetClsClassid uint32 `json:"net_cls_classid_u"` + + // Used on cgroups v2: + + // CpuWeight sets a proportional bandwidth limit. + CpuWeight uint64 `json:"cpu_weight"` + + // CpuMax sets she maximum bandwidth limit (format: max period). + CpuMax string `json:"cpu_max"` } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_windows.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go similarity index 89% rename from vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_windows.go rename to vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go index d74847b0db8..c0c23d70004 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_windows.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go @@ -1,3 +1,5 @@ +// +build !linux + package configs // TODO Windows: This can ultimately be entirely factored out on Windows as diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go index 7728522fef6..24989e9f534 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go @@ -44,6 +44,7 @@ const ( Trap Allow Trace + Log ) // Operator is a comparison operator to be used when matching syscall arguments in Seccomp diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go index d6c4ebdaa10..6ff4d96a5f5 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go @@ -19,7 +19,7 @@ import ( "syscall" // only for SysProcAttr and Signal "time" - "github.com/cyphar/filepath-securejoin" + securejoin "github.com/cyphar/filepath-securejoin" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/intelrdt" @@ -1176,7 +1176,7 @@ func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error { if err != nil { return err } - if err := checkMountDestination(c.config.Rootfs, dest); err != nil { + if err := checkProcMount(c.config.Rootfs, dest, ""); err != nil { return err } m.Destination = dest @@ -1814,7 +1814,14 @@ func (c *linuxContainer) isPaused() (bool, error) { // A container doesn't have a freezer cgroup return false, nil } - data, err := ioutil.ReadFile(filepath.Join(fcg, "freezer.state")) + pausedState := "FROZEN" + filename := "freezer.state" + if cgroups.IsCgroup2UnifiedMode() { + filename = "cgroup.freeze" + pausedState = "1" + } + + data, err := ioutil.ReadFile(filepath.Join(fcg, filename)) if err != nil { // If freezer cgroup is not mounted, the container would just be not paused. if os.IsNotExist(err) { @@ -1822,7 +1829,7 @@ func (c *linuxContainer) isPaused() (bool, error) { } return false, newSystemErrorWithCause(err, "checking if container is paused") } - return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil + return bytes.Equal(bytes.TrimSpace(data), []byte(pausedState)), nil } func (c *linuxContainer) currentState() (*State, error) { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go index f13b226e444..10888b499be 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go @@ -13,7 +13,7 @@ import ( "strings" "time" - "github.com/cyphar/filepath-securejoin" + securejoin "github.com/cyphar/filepath-securejoin" "github.com/mrunalp/fileutils" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" @@ -197,7 +197,7 @@ func prepareBindMount(m *configs.Mount, rootfs string) error { if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil { return err } - if err := checkMountDestination(rootfs, dest); err != nil { + if err := checkProcMount(rootfs, dest, m.Source); err != nil { return err } // update the mount with the correct dest after symlinks are resolved. @@ -209,6 +209,80 @@ func prepareBindMount(m *configs.Mount, rootfs string) error { return nil } +func mountCgroupV1(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error { + binds, err := getCgroupMounts(m) + if err != nil { + return err + } + var merged []string + for _, b := range binds { + ss := filepath.Base(b.Destination) + if strings.Contains(ss, ",") { + merged = append(merged, ss) + } + } + tmpfs := &configs.Mount{ + Source: "tmpfs", + Device: "tmpfs", + Destination: m.Destination, + Flags: defaultMountFlags, + Data: "mode=755", + PropagationFlags: m.PropagationFlags, + } + if err := mountToRootfs(tmpfs, rootfs, mountLabel, enableCgroupns); err != nil { + return err + } + for _, b := range binds { + if enableCgroupns { + subsystemPath := filepath.Join(rootfs, b.Destination) + if err := os.MkdirAll(subsystemPath, 0755); err != nil { + return err + } + flags := defaultMountFlags + if m.Flags&unix.MS_RDONLY != 0 { + flags = flags | unix.MS_RDONLY + } + cgroupmount := &configs.Mount{ + Source: "cgroup", + Device: "cgroup", + Destination: subsystemPath, + Flags: flags, + Data: filepath.Base(subsystemPath), + } + if err := mountNewCgroup(cgroupmount); err != nil { + return err + } + } else { + if err := mountToRootfs(b, rootfs, mountLabel, enableCgroupns); err != nil { + return err + } + } + } + for _, mc := range merged { + for _, ss := range strings.Split(mc, ",") { + // symlink(2) is very dumb, it will just shove the path into + // the link and doesn't do any checks or relative path + // conversion. Also, don't error out if the cgroup already exists. + if err := os.Symlink(mc, filepath.Join(rootfs, m.Destination, ss)); err != nil && !os.IsExist(err) { + return err + } + } + } + return nil +} + +func mountCgroupV2(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error { + cgroupPath, err := securejoin.SecureJoin(rootfs, m.Destination) + if err != nil { + return err + } + if err := os.MkdirAll(cgroupPath, 0755); err != nil { + return err + } + + return unix.Mount(m.Source, cgroupPath, "cgroup2", uintptr(m.Flags), m.Data) +} + func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error { var ( dest = m.Destination @@ -309,62 +383,14 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b } } case "cgroup": - binds, err := getCgroupMounts(m) - if err != nil { - return err - } - var merged []string - for _, b := range binds { - ss := filepath.Base(b.Destination) - if strings.Contains(ss, ",") { - merged = append(merged, ss) + if cgroups.IsCgroup2UnifiedMode() { + if err := mountCgroupV2(m, rootfs, mountLabel, enableCgroupns); err != nil { + return err } - } - tmpfs := &configs.Mount{ - Source: "tmpfs", - Device: "tmpfs", - Destination: m.Destination, - Flags: defaultMountFlags, - Data: "mode=755", - PropagationFlags: m.PropagationFlags, - } - if err := mountToRootfs(tmpfs, rootfs, mountLabel, enableCgroupns); err != nil { - return err - } - for _, b := range binds { - if enableCgroupns { - subsystemPath := filepath.Join(rootfs, b.Destination) - if err := os.MkdirAll(subsystemPath, 0755); err != nil { - return err - } - flags := defaultMountFlags - if m.Flags&unix.MS_RDONLY != 0 { - flags = flags | unix.MS_RDONLY - } - cgroupmount := &configs.Mount{ - Source: "cgroup", - Device: "cgroup", - Destination: subsystemPath, - Flags: flags, - Data: filepath.Base(subsystemPath), - } - if err := mountNewCgroup(cgroupmount); err != nil { - return err - } - } else { - if err := mountToRootfs(b, rootfs, mountLabel, enableCgroupns); err != nil { - return err - } - } - } - for _, mc := range merged { - for _, ss := range strings.Split(mc, ",") { - // symlink(2) is very dumb, it will just shove the path into - // the link and doesn't do any checks or relative path - // conversion. Also, don't error out if the cgroup already exists. - if err := os.Symlink(mc, filepath.Join(rootfs, m.Destination, ss)); err != nil && !os.IsExist(err) { - return err - } + } else { + + if err := mountCgroupV1(m, rootfs, mountLabel, enableCgroupns); err != nil { + return err } } if m.Flags&unix.MS_RDONLY != 0 { @@ -388,7 +414,7 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil { return err } - if err := checkMountDestination(rootfs, dest); err != nil { + if err := checkProcMount(rootfs, dest, m.Source); err != nil { return err } // update the mount with the correct dest after symlinks are resolved. @@ -435,12 +461,12 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) { return binds, nil } -// checkMountDestination checks to ensure that the mount destination is not over the top of /proc. +// checkProcMount checks to ensure that the mount destination is not over the top of /proc. // dest is required to be an abs path and have any symlinks resolved before calling this function. -func checkMountDestination(rootfs, dest string) error { - invalidDestinations := []string{ - "/proc", - } +// +// if source is nil, don't stat the filesystem. This is used for restore of a checkpoint. +func checkProcMount(rootfs, dest, source string) error { + const procPath = "/proc" // White list, it should be sub directories of invalid destinations validDestinations := []string{ // These entries can be bind mounted by files emulated by fuse, @@ -463,16 +489,40 @@ func checkMountDestination(rootfs, dest string) error { return nil } } - for _, invalid := range invalidDestinations { - path, err := filepath.Rel(filepath.Join(rootfs, invalid), dest) + path, err := filepath.Rel(filepath.Join(rootfs, procPath), dest) + if err != nil { + return err + } + // pass if the mount path is located outside of /proc + if strings.HasPrefix(path, "..") { + return nil + } + if path == "." { + // an empty source is pasted on restore + if source == "" { + return nil + } + // only allow a mount on-top of proc if it's source is "proc" + isproc, err := isProc(source) if err != nil { return err } - if path != "." && !strings.HasPrefix(path, "..") { - return fmt.Errorf("%q cannot be mounted because it is located inside %q", dest, invalid) + // pass if the mount is happening on top of /proc and the source of + // the mount is a proc filesystem + if isproc { + return nil } + return fmt.Errorf("%q cannot be mounted because it is not of type proc", dest) } - return nil + return fmt.Errorf("%q cannot be mounted because it is inside /proc", dest) +} + +func isProc(path string) (bool, error) { + var s unix.Statfs_t + if err := unix.Statfs(path, &s); err != nil { + return false, err + } + return s.Type == unix.PROC_SUPER_MAGIC, nil } func setupDevSymlinks(rootfs string) error { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go index ded5a6bbc8b..c3212279875 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go @@ -22,6 +22,7 @@ var actions = map[string]configs.Action{ "SCMP_ACT_TRAP": configs.Trap, "SCMP_ACT_ALLOW": configs.Allow, "SCMP_ACT_TRACE": configs.Trace, + "SCMP_ACT_LOG": configs.Log, } var archs = map[string]string{ diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go index d99f3fe640c..1b7a07118fe 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go @@ -19,6 +19,7 @@ var ( actTrap = libseccomp.ActTrap actKill = libseccomp.ActKill actTrace = libseccomp.ActTrace.SetReturnCode(int16(unix.EPERM)) + actLog = libseccomp.ActLog actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM)) ) @@ -112,6 +113,8 @@ func getAction(act configs.Action) (libseccomp.ScmpAction, error) { return actAllow, nil case configs.Trace: return actTrace, nil + case configs.Log: + return actLog, nil default: return libseccomp.ActInvalid, fmt.Errorf("invalid action, cannot use in rule") } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go b/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go index 11c3faafbf0..e05e30adc3b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go @@ -1,5 +1,5 @@ // +build linux -// +build arm64 amd64 mips mipsle mips64 mips64le ppc ppc64 ppc64le s390x +// +build arm64 amd64 mips mipsle mips64 mips64le ppc ppc64 ppc64le riscv64 s390x package system diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go index c96088988a6..1576f2d4ab6 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go @@ -3,33 +3,57 @@ package utils import ( - "io/ioutil" + "fmt" "os" "strconv" "golang.org/x/sys/unix" ) +// EnsureProcHandle returns whether or not the given file handle is on procfs. +func EnsureProcHandle(fh *os.File) error { + var buf unix.Statfs_t + if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil { + return fmt.Errorf("ensure %s is on procfs: %v", fh.Name(), err) + } + if buf.Type != unix.PROC_SUPER_MAGIC { + return fmt.Errorf("%s is not on procfs", fh.Name()) + } + return nil +} + +// CloseExecFrom applies O_CLOEXEC to all file descriptors currently open for +// the process (except for those below the given fd value). func CloseExecFrom(minFd int) error { - fdList, err := ioutil.ReadDir("/proc/self/fd") + fdDir, err := os.Open("/proc/self/fd") if err != nil { return err } - for _, fi := range fdList { - fd, err := strconv.Atoi(fi.Name()) + defer fdDir.Close() + + if err := EnsureProcHandle(fdDir); err != nil { + return err + } + + fdList, err := fdDir.Readdirnames(-1) + if err != nil { + return err + } + for _, fdStr := range fdList { + fd, err := strconv.Atoi(fdStr) + // Ignore non-numeric file names. if err != nil { - // ignore non-numeric file names continue } - + // Ignore descriptors lower than our specified minimum. if fd < minFd { - // ignore descriptors lower than our specified minimum continue } - - // intentionally ignore errors from unix.CloseOnExec + // Intentionally ignore errors from unix.CloseOnExec -- the cases where + // this might fail are basically file descriptors that have already + // been closed (including and especially the one that was created when + // ioutil.ReadDir did the "opendir" syscall). unix.CloseOnExec(fd) - // the cases where this might fail are basically file descriptors that have already been closed (including and especially the one that was created when ioutil.ReadDir did the "opendir" syscall) } return nil } diff --git a/vendor/modules.txt b/vendor/modules.txt index a45fb5fd73d..714b9cd3323 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -679,7 +679,7 @@ github.com/opencontainers/go-digest # github.com/opencontainers/image-spec v1.0.1 => github.com/opencontainers/image-spec v1.0.1 github.com/opencontainers/image-spec/specs-go github.com/opencontainers/image-spec/specs-go/v1 -# github.com/opencontainers/runc v1.0.0-rc2.0.20190611121236-6cc515888830 => github.com/opencontainers/runc v1.0.0-rc2.0.20190611121236-6cc515888830 +# github.com/opencontainers/runc v1.0.0-rc9 => github.com/opencontainers/runc v1.0.0-rc9 github.com/opencontainers/runc/libcontainer github.com/opencontainers/runc/libcontainer/apparmor github.com/opencontainers/runc/libcontainer/cgroups