diff --git a/Godeps/Godeps.json b/Godeps/Godeps.json index 7bb98cb5c81..348083cb035 100644 --- a/Godeps/Godeps.json +++ b/Godeps/Godeps.json @@ -2795,83 +2795,83 @@ }, { "ImportPath": "github.com/opencontainers/runc/libcontainer", - "Comment": "v1.0.0-rc5-46-g871ba2e58e2431", - "Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" + "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7", + "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/apparmor", - "Comment": "v1.0.0-rc5-46-g871ba2e58e2431", - "Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" + "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7", + "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups", - "Comment": "v1.0.0-rc5-46-g871ba2e58e2431", - "Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" + "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7", + "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/fs", - "Comment": "v1.0.0-rc5-46-g871ba2e58e2431", - "Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" + "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7", + "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/systemd", - "Comment": "v1.0.0-rc5-46-g871ba2e58e2431", - "Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" + "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7", + "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/configs", - "Comment": "v1.0.0-rc5-46-g871ba2e58e2431", - "Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" + "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7", + "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/configs/validate", - "Comment": "v1.0.0-rc5-46-g871ba2e58e2431", - "Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" + "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7", + "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/criurpc", - "Comment": "v1.0.0-rc5-46-g871ba2e58e2431", - "Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" + "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7", + "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/intelrdt", - "Comment": "v1.0.0-rc5-46-g871ba2e58e2431", - "Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" + "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7", + "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/keys", - "Comment": "v1.0.0-rc5-46-g871ba2e58e2431", - "Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" + "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7", + "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/mount", - "Comment": "v1.0.0-rc5-46-g871ba2e58e2431", - "Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" + "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7", + "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/seccomp", - "Comment": "v1.0.0-rc5-46-g871ba2e58e2431", - "Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" + "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7", + "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/stacktrace", - "Comment": "v1.0.0-rc5-46-g871ba2e58e2431", - "Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" + "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7", + "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/system", - "Comment": "v1.0.0-rc5-46-g871ba2e58e2431", - "Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" + "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7", + "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/user", - "Comment": "v1.0.0-rc5-46-g871ba2e58e2431", - "Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" + "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7", + "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/utils", - "Comment": "v1.0.0-rc5-46-g871ba2e58e2431", - "Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" + "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7", + "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24" }, { "ImportPath": "github.com/opencontainers/runtime-spec/specs-go", diff --git a/vendor/github.com/opencontainers/runc/libcontainer/BUILD b/vendor/github.com/opencontainers/runc/libcontainer/BUILD index 32e0b736873..d76cbf280a9 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/BUILD +++ b/vendor/github.com/opencontainers/runc/libcontainer/BUILD @@ -53,6 +53,7 @@ go_library( "//vendor/github.com/opencontainers/runc/libcontainer/system:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library", "//vendor/github.com/opencontainers/selinux/go-selinux/label:go_default_library", + "//vendor/github.com/pkg/errors:go_default_library", "//vendor/github.com/sirupsen/logrus:go_default_library", "//vendor/github.com/syndtr/gocapability/capability:go_default_library", "//vendor/github.com/vishvananda/netlink:go_default_library", diff --git a/vendor/github.com/opencontainers/runc/libcontainer/README.md b/vendor/github.com/opencontainers/runc/libcontainer/README.md index 42f3efe5639..f2a2f0c6c4b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/README.md +++ b/vendor/github.com/opencontainers/runc/libcontainer/README.md @@ -323,6 +323,7 @@ generated when building libcontainer with docker. ## Copyright and license -Code and documentation copyright 2014 Docker, inc. Code released under the Apache 2.0 license. -Docs released under Creative commons. - +Code and documentation copyright 2014 Docker, inc. +The code and documentation are released under the [Apache 2.0 license](../LICENSE). +The documentation is also released under Creative Commons Attribution 4.0 International License. +You may obtain a copy of the license, titled CC-BY-4.0, at http://creativecommons.org/licenses/by/4.0/. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/SPEC.md b/vendor/github.com/opencontainers/runc/libcontainer/SPEC.md index 4363b6f9f4e..18bf64704b8 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/SPEC.md +++ b/vendor/github.com/opencontainers/runc/libcontainer/SPEC.md @@ -156,17 +156,21 @@ init process will block waiting for the parent to finish setup. ### IntelRdt -Intel platforms with new Xeon CPU support Intel Resource Director Technology -(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which -currently supports L3 cache resource allocation. +Intel platforms with new Xeon CPU support Resource Director Technology (RDT). +Cache Allocation Technology (CAT) and Memory Bandwidth Allocation (MBA) are +two sub-features of RDT. -This feature provides a way for the software to restrict cache allocation to a -defined 'subset' of L3 cache which may be overlapping with other 'subsets'. -The different subsets are identified by class of service (CLOS) and each CLOS -has a capacity bitmask (CBM). +Cache Allocation Technology (CAT) provides a way for the software to restrict +cache allocation to a defined 'subset' of L3 cache which may be overlapping +with other 'subsets'. The different subsets are identified by class of +service (CLOS) and each CLOS has a capacity bitmask (CBM). -It can be used to handle L3 cache resource allocation for containers if -hardware and kernel support Intel RDT/CAT. +Memory Bandwidth Allocation (MBA) provides indirect and approximate throttle +over memory bandwidth for the software. A user controls the resource by +indicating the percentage of maximum memory bandwidth. + +It can be used to handle L3 cache and memory bandwidth resources allocation +for containers if hardware and kernel support Intel RDT CAT and MBA features. In Linux 4.10 kernel or newer, the interface is defined and exposed via "resource control" filesystem, which is a "cgroup-like" interface. @@ -175,6 +179,9 @@ Comparing with cgroups, it has similar process management lifecycle and interfaces in a container. But unlike cgroups' hierarchy, it has single level filesystem layout. +CAT and MBA features are introduced in Linux 4.10 and 4.12 kernel via +"resource control" filesystem. + Intel RDT "resource control" filesystem hierarchy: ``` mount -t resctrl resctrl /sys/fs/resctrl @@ -182,59 +189,85 @@ tree /sys/fs/resctrl /sys/fs/resctrl/ |-- info | |-- L3 -| |-- cbm_mask -| |-- min_cbm_bits +| | |-- cbm_mask +| | |-- min_cbm_bits +| | |-- num_closids +| |-- MB +| |-- bandwidth_gran +| |-- delay_linear +| |-- min_bandwidth | |-- num_closids -|-- cpus +|-- ... |-- schemata |-- tasks |-- - |-- cpus + |-- ... |-- schemata |-- tasks - ``` -For runc, we can make use of `tasks` and `schemata` configuration for L3 cache -resource constraints. +For runc, we can make use of `tasks` and `schemata` configuration for L3 +cache and memory bandwidth resources constraints. The file `tasks` has a list of tasks that belongs to this group (e.g., " group). Tasks can be added to a group by writing the task ID -to the "tasks" file (which will automatically remove them from the previous +to the "tasks" file (which will automatically remove them from the previous group to which they belonged). New tasks created by fork(2) and clone(2) are -added to the same group as their parent. If a pid is not in any sub group, it -is in root group. +added to the same group as their parent. -The file `schemata` has allocation masks/values for L3 cache on each socket, -which contains L3 cache id and capacity bitmask (CBM). +The file `schemata` has a list of all the resources available to this group. +Each resource (L3 cache, memory bandwidth) has its own line and format. + +L3 cache schema: +It has allocation bitmasks/values for L3 cache on each socket, which +contains L3 cache id and capacity bitmask (CBM). ``` Format: "L3:=;=;..." ``` -For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0` -Which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. +For example, on a two-socket machine, the schema line could be "L3:0=ff;1=c0" +which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. The valid L3 cache CBM is a *contiguous bits set* and number of bits that can be set is less than the max bit. The max bits in the CBM is varied among -supported Intel Xeon platforms. In Intel RDT "resource control" filesystem -layout, the CBM in a group should be a subset of the CBM in root. Kernel will -check if it is valid when writing. e.g., 0xfffff in root indicates the max bits -of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM -values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. +supported Intel CPU models. Kernel will check if it is valid when writing. +e.g., default value 0xfffff in root indicates the max bits of CBM is 20 +bits, which mapping to entire L3 cache capacity. Some valid CBM values to +set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. -For more information about Intel RDT/CAT kernel interface: +Memory bandwidth schema: +It has allocation values for memory bandwidth on each socket, which contains +L3 cache id and memory bandwidth percentage. +``` + Format: "MB:=bandwidth0;=bandwidth1;..." +``` +For example, on a two-socket machine, the schema line could be "MB:0=20;1=70" + +The minimum bandwidth percentage value for each CPU model is predefined and +can be looked up through "info/MB/min_bandwidth". The bandwidth granularity +that is allocated is also dependent on the CPU model and can be looked up at +"info/MB/bandwidth_gran". The available bandwidth control steps are: +min_bw + N * bw_gran. Intermediate values are rounded to the next control +step available on the hardware. + +For more information about Intel RDT kernel interface: https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt -An example for runc: ``` +An example for runc: Consider a two-socket machine with two L3 caches where the default CBM is -0xfffff and the max CBM length is 20 bits. With this configuration, tasks -inside the container only have access to the "upper" 80% of L3 cache id 0 and -the "lower" 50% L3 cache id 1: +0x7ff and the max CBM length is 11 bits, and minimum memory bandwidth of 10% +with a memory bandwidth granularity of 10%. + +Tasks inside the container only have access to the "upper" 7/11 of L3 cache +on socket 0 and the "lower" 5/11 L3 cache on socket 1, and may use a +maximum memory bandwidth of 20% on socket 0 and 70% on socket 1. "linux": { - "intelRdt": { - "l3CacheSchema": "L3:0=ffff0;1=3ff" - } + "intelRdt": { + "closID": "guaranteed_group", + "l3CacheSchema": "L3:0=7f0;1=1f", + "memBwSchema": "MB:0=20;1=70" + } } ``` diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/BUILD b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/BUILD index ff6c5c01009..8626e7a2360 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/BUILD +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/BUILD @@ -12,6 +12,7 @@ go_library( "freezer.go", "fs_unsupported.go", "hugetlb.go", + "kmem.go", "memory.go", "name.go", "net_cls.go", @@ -29,6 +30,7 @@ go_library( "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/system:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/utils:go_default_library", + "//vendor/github.com/pkg/errors:go_default_library", "//vendor/golang.org/x/sys/unix:go_default_library", ], "//conditions:default": [], diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go index 6d9123dc26c..74bfcf94f0a 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go @@ -3,7 +3,6 @@ package fs import ( - "errors" "fmt" "io" "io/ioutil" @@ -14,6 +13,8 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" + "github.com/pkg/errors" + "golang.org/x/sys/unix" ) var ( @@ -35,7 +36,7 @@ var ( HugePageSizes, _ = cgroups.GetHugePageSize() ) -var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") +var errSubsystemDoesNotExist = fmt.Errorf("cgroup: subsystem does not exist") type subsystemSet []subsystem @@ -62,9 +63,10 @@ type subsystem interface { } type Manager struct { - mu sync.Mutex - Cgroups *configs.Cgroup - Paths map[string]string + mu sync.Mutex + Cgroups *configs.Cgroup + Rootless bool // ignore permission-related errors + Paths map[string]string } // The absolute path to the root of the cgroup hierarchies. @@ -100,6 +102,33 @@ type cgroupData struct { pid int } +// isIgnorableError returns whether err is a permission error (in the loose +// sense of the word). This includes EROFS (which for an unprivileged user is +// basically a permission error) and EACCES (for similar reasons) as well as +// the normal EPERM. +func isIgnorableError(rootless bool, err error) bool { + // We do not ignore errors if we are root. + if !rootless { + return false + } + // Is it an ordinary EPERM? + if os.IsPermission(errors.Cause(err)) { + return true + } + + // Try to handle other errnos. + var errno error + switch err := errors.Cause(err).(type) { + case *os.PathError: + errno = err.Err + case *os.LinkError: + errno = err.Err + case *os.SyscallError: + errno = err.Err + } + return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES +} + func (m *Manager) Apply(pid int) (err error) { if m.Cgroups == nil { return nil @@ -145,11 +174,11 @@ func (m *Manager) Apply(pid int) (err error) { m.Paths[sys.Name()] = p if err := sys.Apply(d); err != nil { - if os.IsPermission(err) && m.Cgroups.Path == "" { - // If we didn't set a cgroup path, then let's defer the error here - // until we know whether we have set limits or not. - // If we hadn't set limits, then it's ok that we couldn't join this cgroup, because - // it will have the same limits as its parent. + // In the case of rootless (including euid=0 in userns), where an explicit cgroup path hasn't + // been set, we don't bail on error in case of permission problems. + // Cases where limits have been set (and we couldn't create our own + // cgroup) are handled by Set. + if isIgnorableError(m.Rootless, err) && m.Cgroups.Path == "" { delete(m.Paths, sys.Name()) continue } @@ -207,9 +236,16 @@ func (m *Manager) Set(container *configs.Config) error { for _, sys := range subsystems { path := paths[sys.Name()] if err := sys.Set(path, container.Cgroups); err != nil { + if m.Rootless && sys.Name() == "devices" { + continue + } + // When m.Rootless is true, errors from the device subsystem are ignored because it is really not expected to work. + // However, errors from other subsystems are not ignored. + // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" if path == "" { - // cgroup never applied - return fmt.Errorf("cannot set limits on the %s cgroup, as the container has not joined it", sys.Name()) + // We never created a path for this cgroup, so we cannot set + // limits for it (though we have already tried at this point). + return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name()) } return err } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go index b712bd0b1ed..e240a8313a2 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go @@ -46,11 +46,7 @@ func (s *CpuGroup) ApplyDir(path string, cgroup *configs.Cgroup, pid int) error } // because we are not using d.join we need to place the pid into the procs file // unlike the other subsystems - if err := cgroups.WriteCgroupProc(path, pid); err != nil { - return err - } - - return nil + return cgroups.WriteCgroupProc(path, pid) } func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error { @@ -83,11 +79,7 @@ func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error { return err } } - if err := s.SetRtSched(path, cgroup); err != nil { - return err - } - - return nil + return s.SetRtSched(path, cgroup) } func (s *CpuGroup) Remove(d *cgroupData) error { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go index 20c9eafac24..5a1d152ea1d 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go @@ -77,18 +77,14 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro // The logic is, if user specified cpuset configs, use these // specified configs, otherwise, inherit from parent. This makes // cpuset configs work correctly with 'cpuset.cpu_exclusive', and - // keep backward compatbility. + // keep backward compatibility. if err := s.ensureCpusAndMems(dir, cgroup); err != nil { return err } // because we are not using d.join we need to place the pid into the procs file // unlike the other subsystems - if err := cgroups.WriteCgroupProc(dir, pid); err != nil { - return err - } - - return nil + return cgroups.WriteCgroupProc(dir, pid) } func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []byte, err error) { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem.go new file mode 100644 index 00000000000..8df7377702a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem.go @@ -0,0 +1,55 @@ +// +build linux,!nokmem + +package fs + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "syscall" // for Errno type only + + "github.com/opencontainers/runc/libcontainer/cgroups" + "golang.org/x/sys/unix" +) + +const cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes" + +func EnableKernelMemoryAccounting(path string) error { + // Check if kernel memory is enabled + // We have to limit the kernel memory here as it won't be accounted at all + // until a limit is set on the cgroup and limit cannot be set once the + // cgroup has children, or if there are already tasks in the cgroup. + for _, i := range []int64{1, -1} { + if err := setKernelMemory(path, i); err != nil { + return err + } + } + return nil +} + +func setKernelMemory(path string, kernelMemoryLimit int64) error { + if path == "" { + return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit) + } + if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) { + // kernel memory is not enabled on the system so we should do nothing + return nil + } + if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil { + // Check if the error number returned by the syscall is "EBUSY" + // The EBUSY signal is returned on attempts to write to the + // memory.kmem.limit_in_bytes file if the cgroup has children or + // once tasks have been attached to the cgroup + if pathErr, ok := err.(*os.PathError); ok { + if errNo, ok := pathErr.Err.(syscall.Errno); ok { + if errNo == unix.EBUSY { + return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit) + } + } + } + return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err) + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem_disabled.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem_disabled.go new file mode 100644 index 00000000000..12253b6bbbf --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem_disabled.go @@ -0,0 +1,11 @@ +// +build linux,nokmem + +package fs + +func EnableKernelMemoryAccounting(path string) error { + return nil +} + +func setKernelMemory(path string, kernelMemoryLimit int64) error { + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go index ad395a5d621..d5310d569f2 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go @@ -5,23 +5,18 @@ package fs import ( "bufio" "fmt" - "io/ioutil" "os" "path/filepath" "strconv" "strings" - "syscall" // only for Errno "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" - - "golang.org/x/sys/unix" ) const ( - cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes" - cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" - cgroupMemoryLimit = "memory.limit_in_bytes" + cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" + cgroupMemoryLimit = "memory.limit_in_bytes" ) type MemoryGroup struct { @@ -67,44 +62,6 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) { return nil } -func EnableKernelMemoryAccounting(path string) error { - // Check if kernel memory is enabled - // We have to limit the kernel memory here as it won't be accounted at all - // until a limit is set on the cgroup and limit cannot be set once the - // cgroup has children, or if there are already tasks in the cgroup. - for _, i := range []int64{1, -1} { - if err := setKernelMemory(path, i); err != nil { - return err - } - } - return nil -} - -func setKernelMemory(path string, kernelMemoryLimit int64) error { - if path == "" { - return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit) - } - if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) { - // kernel memory is not enabled on the system so we should do nothing - return nil - } - if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil { - // Check if the error number returned by the syscall is "EBUSY" - // The EBUSY signal is returned on attempts to write to the - // memory.kmem.limit_in_bytes file if the cgroup has children or - // once tasks have been attached to the cgroup - if pathErr, ok := err.(*os.PathError); ok { - if errNo, ok := pathErr.Err.(syscall.Errno); ok { - if errNo == unix.EBUSY { - return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit) - } - } - } - return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err) - } - return nil -} - func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error { // If the memory update is set to -1 we should also // set swap to -1, it means unlimited memory. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go index 66502263e0f..30bf5eaeaec 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go @@ -5,6 +5,7 @@ package systemd import ( "errors" "fmt" + "math" "os" "path/filepath" "strings" @@ -295,13 +296,19 @@ func (m *Manager) Apply(pid int) error { // cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd. if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 { - cpuQuotaPerSecUSec := uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod - // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota - // (integer percentage of CPU) internally. This means that if a fractional percent of - // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest - // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. - if cpuQuotaPerSecUSec%10000 != 0 { - cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 + // corresponds to USEC_INFINITY in systemd + // if USEC_INFINITY is provided, CPUQuota is left unbound by systemd + // always setting a property value ensures we can apply a quota and remove it later + cpuQuotaPerSecUSec := uint64(math.MaxUint64) + if c.Resources.CpuQuota > 0 { + // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota + // (integer percentage of CPU) internally. This means that if a fractional percent of + // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest + // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. + cpuQuotaPerSecUSec = uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod + if cpuQuotaPerSecUSec%10000 != 0 { + cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 + } } properties = append(properties, newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) @@ -312,6 +319,12 @@ func (m *Manager) Apply(pid int) error { newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight))) } + if c.Resources.PidsLimit > 0 { + properties = append(properties, + newProp("TasksAccounting", true), + newProp("TasksMax", uint64(c.Resources.PidsLimit))) + } + // We have to set kernel memory here, as we can't change it once // processes have been attached to the cgroup. if c.Resources.KernelMemory != 0 { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go index 7c995efee51..7c61ff13fcc 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go @@ -13,7 +13,7 @@ import ( "strings" "time" - "github.com/docker/go-units" + units "github.com/docker/go-units" ) const ( @@ -103,7 +103,7 @@ func FindCgroupMountpointDir() (string, error) { } if postSeparatorFields[0] == "cgroup" { - // Check that the mount is properly formated. + // Check that the mount is properly formatted. if numPostFields < 3 { return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) } @@ -151,19 +151,20 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, Root: fields[3], } for _, opt := range strings.Split(fields[len(fields)-1], ",") { - if !ss[opt] { + seen, known := ss[opt] + if !known || (!all && seen) { continue } + ss[opt] = true if strings.HasPrefix(opt, cgroupNamePrefix) { - m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):]) - } else { - m.Subsystems = append(m.Subsystems, opt) - } - if !all { - numFound++ + opt = opt[len(cgroupNamePrefix):] } + m.Subsystems = append(m.Subsystems, opt) + numFound++ + } + if len(m.Subsystems) > 0 || all { + res = append(res, m) } - res = append(res, m) } if err := scanner.Err(); err != nil { return nil, err @@ -187,7 +188,7 @@ func GetCgroupMounts(all bool) ([]Mount, error) { allMap := make(map[string]bool) for s := range allSubsystems { - allMap[s] = true + allMap[s] = false } return getCgroupMountsHelper(allMap, f, all) } @@ -262,7 +263,7 @@ func getCgroupPathHelper(subsystem, cgroup string) (string, error) { } // This is needed for nested containers, because in /proc/self/cgroup we - // see pathes from host, which don't exist in container. + // see paths from host, which don't exist in container. relCgroup, err := filepath.Rel(root, cgroup) if err != nil { return "", err diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go index 3cae4fd8d96..b065f7f28ec 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go @@ -141,9 +141,10 @@ type Config struct { // OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores // for a process. Valid values are between the range [-1000, '1000'], where processes with - // higher scores are preferred for being killed. + // higher scores are preferred for being killed. If it is unset then we don't touch the current + // value. // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/ - OomScoreAdj int `json:"oom_score_adj"` + OomScoreAdj *int `json:"oom_score_adj,omitempty"` // UidMappings is an array of User ID mappings for User Namespaces UidMappings []IDMap `json:"uid_mappings"` @@ -185,12 +186,19 @@ type Config struct { // callers keyring in this case. NoNewKeyring bool `json:"no_new_keyring"` - // Rootless specifies whether the container is a rootless container. - Rootless bool `json:"rootless"` - - // IntelRdt specifies settings for Intel RDT/CAT group that the container is placed into - // to limit the resources (e.g., L3 cache) the container has available + // IntelRdt specifies settings for Intel RDT group that the container is placed into + // to limit the resources (e.g., L3 cache, memory bandwidth) the container has available IntelRdt *IntelRdt `json:"intel_rdt,omitempty"` + + // RootlessEUID is set when the runc was launched with non-zero EUID. + // Note that RootlessEUID is set to false when launched with EUID=0 in userns. + // When RootlessEUID is set, runc creates a new userns for the container. + // (config.json needs to contain userns settings) + RootlessEUID bool `json:"rootless_euid,omitempty"` + + // RootlessCgroups is set when unlikely to have the full access to cgroups. + // When RootlessCgroups is set, cgroups errors are ignored. + RootlessCgroups bool `json:"rootless_cgroups,omitempty"` } type Hooks struct { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go index 36bd5f96a11..6f47aac077d 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go @@ -4,4 +4,8 @@ type IntelRdt struct { // The schema for L3 cache id and capacity bitmask (CBM) // Format: "L3:=;=;..." L3CacheSchema string `json:"l3_cache_schema,omitempty"` + + // The schema of memory bandwidth percentage per L3 cache id + // Format: "MB:=bandwidth0;=bandwidth1;..." + MemBwSchema string `json:"memBwSchema,omitempty"` } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go index e532ac8fe28..393d9e81ee9 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go @@ -2,23 +2,18 @@ package validate import ( "fmt" - "os" - "reflect" "strings" "github.com/opencontainers/runc/libcontainer/configs" ) -var ( - geteuid = os.Geteuid - getegid = os.Getegid -) - -func (v *ConfigValidator) rootless(config *configs.Config) error { - if err := rootlessMappings(config); err != nil { +// rootlessEUID makes sure that the config can be applied when runc +// is being executed as a non-root user (euid != 0) in the current user namespace. +func (v *ConfigValidator) rootlessEUID(config *configs.Config) error { + if err := rootlessEUIDMappings(config); err != nil { return err } - if err := rootlessMount(config); err != nil { + if err := rootlessEUIDMount(config); err != nil { return err } @@ -38,11 +33,9 @@ func hasIDMapping(id int, mappings []configs.IDMap) bool { return false } -func rootlessMappings(config *configs.Config) error { - if euid := geteuid(); euid != 0 { - if !config.Namespaces.Contains(configs.NEWUSER) { - return fmt.Errorf("rootless containers require user namespaces") - } +func rootlessEUIDMappings(config *configs.Config) error { + if !config.Namespaces.Contains(configs.NEWUSER) { + return fmt.Errorf("rootless container requires user namespaces") } if len(config.UidMappings) == 0 { @@ -51,34 +44,13 @@ func rootlessMappings(config *configs.Config) error { if len(config.GidMappings) == 0 { return fmt.Errorf("rootless containers requires at least one GID mapping") } - - return nil -} - -// cgroup verifies that the user isn't trying to set any cgroup limits or paths. -func rootlessCgroup(config *configs.Config) error { - // Nothing set at all. - if config.Cgroups == nil || config.Cgroups.Resources == nil { - return nil - } - - // Used for comparing to the zero value. - left := reflect.ValueOf(*config.Cgroups.Resources) - right := reflect.Zero(left.Type()) - - // This is all we need to do, since specconv won't add cgroup options in - // rootless mode. - if !reflect.DeepEqual(left.Interface(), right.Interface()) { - return fmt.Errorf("cannot specify resource limits in rootless container") - } - return nil } // mount verifies that the user isn't trying to set up any mounts they don't have // the rights to do. In addition, it makes sure that no mount has a `uid=` or // `gid=` option that doesn't resolve to root. -func rootlessMount(config *configs.Config) error { +func rootlessEUIDMount(config *configs.Config) error { // XXX: We could whitelist allowed devices at this point, but I'm not // convinced that's a good idea. The kernel is the best arbiter of // access control. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go index cbbba9a03a2..9c78141ebd3 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go @@ -44,8 +44,8 @@ func (v *ConfigValidator) Validate(config *configs.Config) error { if err := v.intelrdt(config); err != nil { return err } - if config.Rootless { - if err := v.rootless(config); err != nil { + if config.RootlessEUID { + if err := v.rootlessEUID(config); err != nil { return err } } @@ -151,6 +151,16 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error { return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s) } } + if config.Namespaces.Contains(configs.NEWUTS) { + switch s { + case "kernel.domainname": + // This is namespaced and there's no explicit OCI field for it. + continue + case "kernel.hostname": + // This is namespaced but there's a conflicting (dedicated) OCI field for it. + return fmt.Errorf("sysctl %q is not allowed as it conflicts with the OCI %q field", s, "hostname") + } + } return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s) } @@ -159,11 +169,22 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error { func (v *ConfigValidator) intelrdt(config *configs.Config) error { if config.IntelRdt != nil { - if !intelrdt.IsEnabled() { - return fmt.Errorf("intelRdt is specified in config, but Intel RDT feature is not supported or enabled") + if !intelrdt.IsCatEnabled() && !intelrdt.IsMbaEnabled() { + return fmt.Errorf("intelRdt is specified in config, but Intel RDT is not supported or enabled") } - if config.IntelRdt.L3CacheSchema == "" { - return fmt.Errorf("intelRdt is specified in config, but intelRdt.l3CacheSchema is empty") + + if !intelrdt.IsCatEnabled() && config.IntelRdt.L3CacheSchema != "" { + return fmt.Errorf("intelRdt.l3CacheSchema is specified in config, but Intel RDT/CAT is not enabled") + } + if !intelrdt.IsMbaEnabled() && config.IntelRdt.MemBwSchema != "" { + return fmt.Errorf("intelRdt.memBwSchema is specified in config, but Intel RDT/MBA is not enabled") + } + + if intelrdt.IsCatEnabled() && config.IntelRdt.L3CacheSchema == "" { + return fmt.Errorf("Intel RDT/CAT is enabled and intelRdt is specified in config, but intelRdt.l3CacheSchema is empty") + } + if intelrdt.IsMbaEnabled() && config.IntelRdt.MemBwSchema == "" { + return fmt.Errorf("Intel RDT/MBA is enabled and intelRdt is specified in config, but intelRdt.memBwSchema is empty") } } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go index 246ec95f362..f8caa4b8991 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go @@ -28,7 +28,6 @@ import ( "github.com/golang/protobuf/proto" "github.com/sirupsen/logrus" - "github.com/syndtr/gocapability/capability" "github.com/vishvananda/netlink/nl" "golang.org/x/sys/unix" ) @@ -60,7 +59,8 @@ type State struct { // Platform specific fields below here - // Specifies if the container was started under the rootless mode. + // Specified if the container was started under the rootless mode. + // Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups Rootless bool `json:"rootless"` // Path to all the cgroups setup for a container. Key is cgroup subsystem name @@ -225,17 +225,13 @@ func (c *linuxContainer) Set(config configs.Config) error { func (c *linuxContainer) Start(process *Process) error { c.m.Lock() defer c.m.Unlock() - status, err := c.currentStatus() - if err != nil { - return err - } - if status == Stopped { + if process.Init { if err := c.createExecFifo(); err != nil { return err } } - if err := c.start(process, status == Stopped); err != nil { - if status == Stopped { + if err := c.start(process); err != nil { + if process.Init { c.deleteExecFifo() } return err @@ -244,17 +240,10 @@ func (c *linuxContainer) Start(process *Process) error { } func (c *linuxContainer) Run(process *Process) error { - c.m.Lock() - status, err := c.currentStatus() - if err != nil { - c.m.Unlock() - return err - } - c.m.Unlock() if err := c.Start(process); err != nil { return err } - if status == Stopped { + if process.Init { return c.exec() } return nil @@ -335,8 +324,8 @@ type openResult struct { err error } -func (c *linuxContainer) start(process *Process, isInit bool) error { - parent, err := c.newParentProcess(process, isInit) +func (c *linuxContainer) start(process *Process) error { + parent, err := c.newParentProcess(process) if err != nil { return newSystemErrorWithCause(err, "creating new parent process") } @@ -349,7 +338,7 @@ func (c *linuxContainer) start(process *Process, isInit bool) error { } // generate a timestamp indicating when the container was started c.created = time.Now().UTC() - if isInit { + if process.Init { c.state = &createdState{ c: c, } @@ -411,10 +400,7 @@ func (c *linuxContainer) createExecFifo() error { return err } unix.Umask(oldMask) - if err := os.Chown(fifoName, rootuid, rootgid); err != nil { - return err - } - return nil + return os.Chown(fifoName, rootuid, rootgid) } func (c *linuxContainer) deleteExecFifo() { @@ -439,7 +425,7 @@ func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error { return nil } -func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) { +func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) { parentPipe, childPipe, err := utils.NewSockPair("init") if err != nil { return nil, newSystemErrorWithCause(err, "creating new init pipe") @@ -448,7 +434,7 @@ func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProces if err != nil { return nil, newSystemErrorWithCause(err, "creating new command template") } - if !doInit { + if !p.Init { return c.newSetnsProcess(p, cmd, parentPipe, childPipe) } @@ -473,6 +459,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. if cmd.SysProcAttr == nil { cmd.SysProcAttr = &syscall.SysProcAttr{} } + cmd.Env = append(cmd.Env, fmt.Sprintf("GOMAXPROCS=%s", os.Getenv("GOMAXPROCS"))) cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...) if p.ConsoleSocket != nil { cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket) @@ -533,14 +520,15 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, return nil, err } return &setnsProcess{ - cmd: cmd, - cgroupPaths: c.cgroupManager.GetPaths(), - intelRdtPath: state.IntelRdtPath, - childPipe: childPipe, - parentPipe: parentPipe, - config: c.newInitConfig(p), - process: p, - bootstrapData: data, + cmd: cmd, + cgroupPaths: c.cgroupManager.GetPaths(), + rootlessCgroups: c.config.RootlessCgroups, + intelRdtPath: state.IntelRdtPath, + childPipe: childPipe, + parentPipe: parentPipe, + config: c.newInitConfig(p), + process: p, + bootstrapData: data, }, nil } @@ -556,7 +544,8 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig { PassedFilesCount: len(process.ExtraFiles), ContainerId: c.ID(), NoNewPrivileges: c.config.NoNewPrivileges, - Rootless: c.config.Rootless, + RootlessEUID: c.config.RootlessEUID, + RootlessCgroups: c.config.RootlessCgroups, AppArmorProfile: c.config.AppArmorProfile, ProcessLabel: c.config.ProcessLabel, Rlimits: c.config.Rlimits, @@ -624,16 +613,16 @@ func (c *linuxContainer) Resume() error { func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) { // XXX(cyphar): This requires cgroups. - if c.config.Rootless { - return nil, fmt.Errorf("cannot get OOM notifications from rootless container") + if c.config.RootlessCgroups { + logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups") } return notifyOnOOM(c.cgroupManager.GetPaths()) } func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) { // XXX(cyphar): This requires cgroups. - if c.config.Rootless { - return nil, fmt.Errorf("cannot get memory pressure notifications from rootless container") + if c.config.RootlessCgroups { + logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups") } return notifyMemoryPressure(c.cgroupManager.GetPaths(), level) } @@ -668,7 +657,7 @@ func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc. Features: criuFeat, } - err := c.criuSwrk(nil, req, criuOpts, false) + err := c.criuSwrk(nil, req, criuOpts, false, nil) if err != nil { logrus.Debugf("%s", err) return fmt.Errorf("CRIU feature check failed") @@ -781,7 +770,7 @@ func (c *linuxContainer) checkCriuVersion(minVersion int) error { Type: &t, } - err := c.criuSwrk(nil, req, nil, false) + err := c.criuSwrk(nil, req, nil, false, nil) if err != nil { return fmt.Errorf("CRIU version check failed: %s", err) } @@ -877,12 +866,11 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { c.m.Lock() defer c.m.Unlock() + // Checkpoint is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS(). + // (CLI prints a warning) // TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has // support for doing unprivileged dumps, but the setup of // rootless containers might make this complicated. - if c.config.Rootless { - return fmt.Errorf("cannot checkpoint a rootless container") - } // criu 1.5.2 => 10502 if err := c.checkCriuVersion(10502); err != nil { @@ -939,6 +927,33 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { LazyPages: proto.Bool(criuOpts.LazyPages), } + // If the container is running in a network namespace and has + // a path to the network namespace configured, we will dump + // that network namespace as an external namespace and we + // will expect that the namespace exists during restore. + // This basically means that CRIU will ignore the namespace + // and expect to be setup correctly. + nsPath := c.config.Namespaces.PathOf(configs.NEWNET) + if nsPath != "" { + // For this to work we need at least criu 3.11.0 => 31100. + // As there was already a successful version check we will + // not error out if it fails. runc will just behave as it used + // to do and ignore external network namespaces. + err := c.checkCriuVersion(31100) + if err == nil { + // CRIU expects the information about an external namespace + // like this: --external net[]: + // This is always 'extRootNetNS'. + var netns syscall.Stat_t + err = syscall.Stat(nsPath, &netns) + if err != nil { + return err + } + criuExternal := fmt.Sprintf("net[%d]:extRootNetNS", netns.Ino) + rpcOpts.External = append(rpcOpts.External, criuExternal) + } + } + fcg := c.cgroupManager.GetPaths()["freezer"] if fcg != "" { rpcOpts.FreezeCgroup = proto.String(fcg) @@ -1043,7 +1058,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { } } - err = c.criuSwrk(nil, req, criuOpts, false) + err = c.criuSwrk(nil, req, criuOpts, false, nil) if err != nil { return err } @@ -1087,11 +1102,12 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { c.m.Lock() defer c.m.Unlock() + var extraFiles []*os.File + + // Restore is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS(). + // (CLI prints a warning) // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have // support for unprivileged restore at the moment. - if c.config.Rootless { - return fmt.Errorf("cannot restore a rootless container") - } // criu 1.5.2 => 10502 if err := c.checkCriuVersion(10502); err != nil { @@ -1161,6 +1177,38 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { }, } + // Same as during checkpointing. If the container has a specific network namespace + // assigned to it, this now expects that the checkpoint will be restored in a + // already created network namespace. + nsPath := c.config.Namespaces.PathOf(configs.NEWNET) + if nsPath != "" { + // For this to work we need at least criu 3.11.0 => 31100. + // As there was already a successful version check we will + // not error out if it fails. runc will just behave as it used + // to do and ignore external network namespaces. + err := c.checkCriuVersion(31100) + if err == nil { + // CRIU wants the information about an existing network namespace + // like this: --inherit-fd fd[]: + // The needs to be the same as during checkpointing. + // We are always using 'extRootNetNS' as the key in this. + netns, err := os.Open(nsPath) + defer netns.Close() + if err != nil { + logrus.Errorf("If a specific network namespace is defined it must exist: %s", err) + return fmt.Errorf("Requested network namespace %v does not exist", nsPath) + } + inheritFd := new(criurpc.InheritFd) + inheritFd.Key = proto.String("extRootNetNS") + // The offset of four is necessary because 0, 1, 2 and 3 is already + // used by stdin, stdout, stderr, 'criu swrk' socket. + inheritFd.Fd = proto.Int32(int32(4 + len(extraFiles))) + req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) + // All open FDs need to be transferred to CRIU via extraFiles + extraFiles = append(extraFiles, netns) + } + } + for _, m := range c.config.Mounts { switch m.Device { case "bind": @@ -1219,7 +1267,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) } } - return c.criuSwrk(process, req, criuOpts, true) + return c.criuSwrk(process, req, criuOpts, true, extraFiles) } func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { @@ -1249,7 +1297,7 @@ func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { return nil } -func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool) error { +func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool, extraFiles []*os.File) error { fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) if err != nil { return err @@ -1290,6 +1338,9 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts * cmd.Stderr = process.Stderr } cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer) + if extraFiles != nil { + cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...) + } if err := cmd.Start(); err != nil { return err @@ -1664,7 +1715,7 @@ func (c *linuxContainer) currentState() (*State, error) { InitProcessStartTime: startTime, Created: c.created, }, - Rootless: c.config.Rootless, + Rootless: c.config.RootlessEUID && c.config.RootlessCgroups, CgroupPaths: c.cgroupManager.GetPaths(), IntelRdtPath: intelRdtPath, NamespacePaths: make(map[configs.NamespaceType]string), @@ -1765,7 +1816,7 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na if !joinExistingUser { // write uid mappings if len(c.config.UidMappings) > 0 { - if c.config.Rootless && c.newuidmapPath != "" { + if c.config.RootlessEUID && c.newuidmapPath != "" { r.AddData(&Bytemsg{ Type: UidmapPathAttr, Value: []byte(c.newuidmapPath), @@ -1791,38 +1842,33 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na Type: GidmapAttr, Value: b, }) - if c.config.Rootless && c.newgidmapPath != "" { + if c.config.RootlessEUID && c.newgidmapPath != "" { r.AddData(&Bytemsg{ Type: GidmapPathAttr, Value: []byte(c.newgidmapPath), }) } if requiresRootOrMappingTool(c.config) { - // check if we have CAP_SETGID to setgroup properly - pid, err := capability.NewPid(0) - if err != nil { - return nil, err - } - if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) { - r.AddData(&Boolmsg{ - Type: SetgroupAttr, - Value: true, - }) - } + r.AddData(&Boolmsg{ + Type: SetgroupAttr, + Value: true, + }) } } } - // write oom_score_adj - r.AddData(&Bytemsg{ - Type: OomScoreAdjAttr, - Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)), - }) + if c.config.OomScoreAdj != nil { + // write oom_score_adj + r.AddData(&Bytemsg{ + Type: OomScoreAdjAttr, + Value: []byte(fmt.Sprintf("%d", *c.config.OomScoreAdj)), + }) + } // write rootless r.AddData(&Boolmsg{ - Type: RootlessAttr, - Value: c.config.Rootless, + Type: RootlessEUIDAttr, + Value: c.config.RootlessEUID, }) return bytes.NewReader(r.Serialize()), nil diff --git a/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go index 7d53d5e04d8..e35957c3148 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go @@ -11,6 +11,7 @@ import ( "runtime/debug" "strconv" + "github.com/cyphar/filepath-securejoin" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs" "github.com/opencontainers/runc/libcontainer/cgroups/systemd" @@ -59,9 +60,9 @@ func SystemdCgroups(l *LinuxFactory) error { return nil } -// Cgroupfs is an options func to configure a LinuxFactory to return -// containers that use the native cgroups filesystem implementation to -// create and manage cgroups. +// Cgroupfs is an options func to configure a LinuxFactory to return containers +// that use the native cgroups filesystem implementation to create and manage +// cgroups. func Cgroupfs(l *LinuxFactory) error { l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { return &fs.Manager{ @@ -72,9 +73,26 @@ func Cgroupfs(l *LinuxFactory) error { return nil } +// RootlessCgroupfs is an options func to configure a LinuxFactory to return +// containers that use the native cgroups filesystem implementation to create +// and manage cgroups. The difference between RootlessCgroupfs and Cgroupfs is +// that RootlessCgroupfs can transparently handle permission errors that occur +// during rootless container (including euid=0 in userns) setup (while still allowing cgroup usage if +// they've been set up properly). +func RootlessCgroupfs(l *LinuxFactory) error { + l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + return &fs.Manager{ + Cgroups: config, + Rootless: true, + Paths: paths, + } + } + return nil +} + // IntelRdtfs is an options func to configure a LinuxFactory to return // containers that use the Intel RDT "resource control" filesystem to -// create and manage Intel Xeon platform shared resources (e.g., L3 cache). +// create and manage Intel RDT resources (e.g., L3 cache, memory bandwidth). func IntelRdtFs(l *LinuxFactory) error { l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager { return &intelrdt.IntelRdtManager{ @@ -178,7 +196,10 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err if err := l.Validator.Validate(config); err != nil { return nil, newGenericError(err, ConfigInvalid) } - containerRoot := filepath.Join(l.Root, id) + containerRoot, err := securejoin.SecureJoin(l.Root, id) + if err != nil { + return nil, err + } if _, err := os.Stat(containerRoot); err == nil { return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse) } else if !os.IsNotExist(err) { @@ -201,7 +222,7 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err newgidmapPath: l.NewgidmapPath, cgroupManager: l.NewCgroupsManager(config.Cgroups, nil), } - if intelrdt.IsEnabled() { + if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() { c.intelRdtManager = l.NewIntelRdtManager(config, id, "") } c.state = &stoppedState{c: c} @@ -212,7 +233,14 @@ func (l *LinuxFactory) Load(id string) (Container, error) { if l.Root == "" { return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid) } - containerRoot := filepath.Join(l.Root, id) + //when load, we need to check id is valid or not. + if err := l.validateID(id); err != nil { + return nil, err + } + containerRoot, err := securejoin.SecureJoin(l.Root, id) + if err != nil { + return nil, err + } state, err := l.loadState(containerRoot, id) if err != nil { return nil, err @@ -240,7 +268,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) { if err := c.refreshState(); err != nil { return nil, err } - if intelrdt.IsEnabled() { + if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() { c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath) } return c, nil @@ -322,7 +350,11 @@ func (l *LinuxFactory) StartInitialization() (err error) { } func (l *LinuxFactory) loadState(root, id string) (*State, error) { - f, err := os.Open(filepath.Join(root, stateFilename)) + stateFilePath, err := securejoin.SecureJoin(root, stateFilename) + if err != nil { + return nil, err + } + f, err := os.Open(stateFilePath) if err != nil { if os.IsNotExist(err) { return nil, newGenericError(fmt.Errorf("container %q does not exist", id), ContainerNotExists) @@ -338,7 +370,7 @@ func (l *LinuxFactory) loadState(root, id string) (*State, error) { } func (l *LinuxFactory) validateID(id string) error { - if !idRegex.MatchString(id) { + if !idRegex.MatchString(id) || string(os.PathSeparator)+id != utils.CleanPath(string(os.PathSeparator)+id) { return newGenericError(fmt.Errorf("invalid id format: %v", id), InvalidIdFormat) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go index 2770be30718..cd7ff67a702 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go @@ -6,6 +6,7 @@ import ( "encoding/json" "fmt" "io" + "io/ioutil" "net" "os" "strings" @@ -20,6 +21,7 @@ import ( "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/user" "github.com/opencontainers/runc/libcontainer/utils" + "github.com/pkg/errors" "github.com/sirupsen/logrus" "github.com/vishvananda/netlink" ) @@ -64,7 +66,8 @@ type initConfig struct { CreateConsole bool `json:"create_console"` ConsoleWidth uint16 `json:"console_width"` ConsoleHeight uint16 `json:"console_height"` - Rootless bool `json:"rootless"` + RootlessEUID bool `json:"rootless_euid,omitempty"` + RootlessCgroups bool `json:"rootless_cgroups,omitempty"` } type initer interface { @@ -121,7 +124,7 @@ func finalizeNamespace(config *initConfig) error { // inherited are marked close-on-exec so they stay out of the // container if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil { - return err + return errors.Wrap(err, "close exec fds") } capabilities := &configs.Capabilities{} @@ -136,20 +139,20 @@ func finalizeNamespace(config *initConfig) error { } // drop capabilities in bounding set before changing user if err := w.ApplyBoundingSet(); err != nil { - return err + return errors.Wrap(err, "apply bounding set") } // preserve existing capabilities while we change users if err := system.SetKeepCaps(); err != nil { - return err + return errors.Wrap(err, "set keep caps") } if err := setupUser(config); err != nil { - return err + return errors.Wrap(err, "setup user") } if err := system.ClearKeepCaps(); err != nil { - return err + return errors.Wrap(err, "clear keep caps") } if err := w.ApplyCaps(); err != nil { - return err + return errors.Wrap(err, "apply caps") } if config.Cwd != "" { if err := unix.Chdir(config.Cwd); err != nil { @@ -217,11 +220,7 @@ func syncParentReady(pipe io.ReadWriter) error { } // Wait for parent to give the all-clear. - if err := readSync(pipe, procRun); err != nil { - return err - } - - return nil + return readSync(pipe, procRun) } // syncParentHooks sends to the given pipe a JSON payload which indicates that @@ -234,11 +233,7 @@ func syncParentHooks(pipe io.ReadWriter) error { } // Wait for parent to give the all-clear. - if err := readSync(pipe, procResume); err != nil { - return err - } - - return nil + return readSync(pipe, procResume) } // setupUser changes the groups, gid, and uid for the user inside the container @@ -282,7 +277,7 @@ func setupUser(config *initConfig) error { return fmt.Errorf("cannot set gid to unmapped user in user namespace") } - if config.Rootless { + if config.RootlessEUID { // We cannot set any additional groups in a rootless container and thus // we bail if the user asked us to do so. TODO: We currently can't do // this check earlier, but if libcontainer.Process.User was typesafe @@ -298,11 +293,18 @@ func setupUser(config *initConfig) error { return err } + setgroups, err := ioutil.ReadFile("/proc/self/setgroups") + if err != nil && !os.IsNotExist(err) { + return err + } + // This isn't allowed in an unprivileged user namespace since Linux 3.19. // There's nothing we can do about /etc/group entries, so we silently // ignore setting groups here (since the user didn't explicitly ask us to // set the group). - if !config.Rootless { + allowSupGroups := !config.RootlessEUID && strings.TrimSpace(string(setgroups)) != "deny" + + if allowSupGroups { suppGroups := append(execUser.Sgids, addGroups...) if err := unix.Setgroups(suppGroups); err != nil { return err diff --git a/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go index 487c630af61..4e081ae6fe8 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go @@ -16,20 +16,25 @@ import ( ) /* - * About Intel RDT/CAT feature: + * About Intel RDT features: * Intel platforms with new Xeon CPU support Resource Director Technology (RDT). - * Intel Cache Allocation Technology (CAT) is a sub-feature of RDT. Currently L3 - * Cache is the only resource that is supported in RDT. + * Cache Allocation Technology (CAT) and Memory Bandwidth Allocation (MBA) are + * two sub-features of RDT. * - * This feature provides a way for the software to restrict cache allocation to a - * defined 'subset' of L3 cache which may be overlapping with other 'subsets'. - * The different subsets are identified by class of service (CLOS) and each CLOS - * has a capacity bitmask (CBM). + * Cache Allocation Technology (CAT) provides a way for the software to restrict + * cache allocation to a defined 'subset' of L3 cache which may be overlapping + * with other 'subsets'. The different subsets are identified by class of + * service (CLOS) and each CLOS has a capacity bitmask (CBM). * - * For more information about Intel RDT/CAT can be found in the section 17.17 - * of Intel Software Developer Manual. + * Memory Bandwidth Allocation (MBA) provides indirect and approximate throttle + * over memory bandwidth for the software. A user controls the resource by + * indicating the percentage of maximum memory bandwidth. * - * About Intel RDT/CAT kernel interface: + * More details about Intel RDT CAT and MBA can be found in the section 17.18 + * of Intel Software Developer Manual: + * https://software.intel.com/en-us/articles/intel-sdm + * + * About Intel RDT kernel interface: * In Linux 4.10 kernel or newer, the interface is defined and exposed via * "resource control" filesystem, which is a "cgroup-like" interface. * @@ -37,59 +42,86 @@ import ( * interfaces in a container. But unlike cgroups' hierarchy, it has single level * filesystem layout. * + * CAT and MBA features are introduced in Linux 4.10 and 4.12 kernel via + * "resource control" filesystem. + * * Intel RDT "resource control" filesystem hierarchy: * mount -t resctrl resctrl /sys/fs/resctrl * tree /sys/fs/resctrl * /sys/fs/resctrl/ * |-- info * | |-- L3 - * | |-- cbm_mask - * | |-- min_cbm_bits + * | | |-- cbm_mask + * | | |-- min_cbm_bits + * | | |-- num_closids + * | |-- MB + * | |-- bandwidth_gran + * | |-- delay_linear + * | |-- min_bandwidth * | |-- num_closids - * |-- cpus + * |-- ... * |-- schemata * |-- tasks * |-- - * |-- cpus + * |-- ... * |-- schemata * |-- tasks * - * For runc, we can make use of `tasks` and `schemata` configuration for L3 cache - * resource constraints. + * For runc, we can make use of `tasks` and `schemata` configuration for L3 + * cache and memory bandwidth resources constraints. * - * The file `tasks` has a list of tasks that belongs to this group (e.g., + * The file `tasks` has a list of tasks that belongs to this group (e.g., * " group). Tasks can be added to a group by writing the task ID - * to the "tasks" file (which will automatically remove them from the previous + * to the "tasks" file (which will automatically remove them from the previous * group to which they belonged). New tasks created by fork(2) and clone(2) are - * added to the same group as their parent. If a pid is not in any sub group, it is - * in root group. + * added to the same group as their parent. * - * The file `schemata` has allocation bitmasks/values for L3 cache on each socket, - * which contains L3 cache id and capacity bitmask (CBM). + * The file `schemata` has a list of all the resources available to this group. + * Each resource (L3 cache, memory bandwidth) has its own line and format. + * + * L3 cache schema: + * It has allocation bitmasks/values for L3 cache on each socket, which + * contains L3 cache id and capacity bitmask (CBM). * Format: "L3:=;=;..." - * For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0` + * For example, on a two-socket machine, the schema line could be "L3:0=ff;1=c0" * which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. * * The valid L3 cache CBM is a *contiguous bits set* and number of bits that can * be set is less than the max bit. The max bits in the CBM is varied among - * supported Intel Xeon platforms. In Intel RDT "resource control" filesystem - * layout, the CBM in a group should be a subset of the CBM in root. Kernel will - * check if it is valid when writing. e.g., 0xfffff in root indicates the max bits - * of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM - * values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. + * supported Intel CPU models. Kernel will check if it is valid when writing. + * e.g., default value 0xfffff in root indicates the max bits of CBM is 20 + * bits, which mapping to entire L3 cache capacity. Some valid CBM values to + * set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. * - * For more information about Intel RDT/CAT kernel interface: + * Memory bandwidth schema: + * It has allocation values for memory bandwidth on each socket, which contains + * L3 cache id and memory bandwidth percentage. + * Format: "MB:=bandwidth0;=bandwidth1;..." + * For example, on a two-socket machine, the schema line could be "MB:0=20;1=70" + * + * The minimum bandwidth percentage value for each CPU model is predefined and + * can be looked up through "info/MB/min_bandwidth". The bandwidth granularity + * that is allocated is also dependent on the CPU model and can be looked up at + * "info/MB/bandwidth_gran". The available bandwidth control steps are: + * min_bw + N * bw_gran. Intermediate values are rounded to the next control + * step available on the hardware. + * + * For more information about Intel RDT kernel interface: * https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt * * An example for runc: * Consider a two-socket machine with two L3 caches where the default CBM is - * 0xfffff and the max CBM length is 20 bits. With this configuration, tasks - * inside the container only have access to the "upper" 80% of L3 cache id 0 and - * the "lower" 50% L3 cache id 1: + * 0x7ff and the max CBM length is 11 bits, and minimum memory bandwidth of 10% + * with a memory bandwidth granularity of 10%. + * + * Tasks inside the container only have access to the "upper" 7/11 of L3 cache + * on socket 0 and the "lower" 5/11 L3 cache on socket 1, and may use a + * maximum memory bandwidth of 20% on socket 0 and 70% on socket 1. * * "linux": { - * "intelRdt": { - * "l3CacheSchema": "L3:0=ffff0;1=3ff" + * "intelRdt": { + * "l3CacheSchema": "L3:0=7f0;1=1f", + * "memBwSchema": "MB:0=20;1=70" * } * } */ @@ -129,8 +161,10 @@ var ( intelRdtRoot string intelRdtRootLock sync.Mutex - // The flag to indicate if Intel RDT is supported - isEnabled bool + // The flag to indicate if Intel RDT/CAT is enabled + isCatEnabled bool + // The flag to indicate if Intel RDT/MBA is enabled + isMbaEnabled bool ) type intelRdtData struct { @@ -139,19 +173,35 @@ type intelRdtData struct { pid int } -// Check if Intel RDT is enabled in init() +// Check if Intel RDT sub-features are enabled in init() func init() { - // 1. Check if hardware and kernel support Intel RDT/CAT feature - // "cat_l3" flag is set if supported - isFlagSet, err := parseCpuInfoFile("/proc/cpuinfo") - if !isFlagSet || err != nil { - isEnabled = false + // 1. Check if hardware and kernel support Intel RDT sub-features + // "cat_l3" flag for CAT and "mba" flag for MBA + isCatFlagSet, isMbaFlagSet, err := parseCpuInfoFile("/proc/cpuinfo") + if err != nil { return } // 2. Check if Intel RDT "resource control" filesystem is mounted // The user guarantees to mount the filesystem - isEnabled = isIntelRdtMounted() + if !isIntelRdtMounted() { + return + } + + // 3. Double check if Intel RDT sub-features are available in + // "resource control" filesystem. Intel RDT sub-features can be + // selectively disabled or enabled by kernel command line + // (e.g., rdt=!l3cat,mba) in 4.14 and newer kernel + if isCatFlagSet { + if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "L3")); err == nil { + isCatEnabled = true + } + } + if isMbaFlagSet { + if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "MB")); err == nil { + isMbaEnabled = true + } + } } // Return the mount point path of Intel RDT "resource control" filesysem @@ -177,7 +227,7 @@ func findIntelRdtMountpointDir() (string, error) { } if postSeparatorFields[0] == "resctrl" { - // Check that the mount is properly formated. + // Check that the mount is properly formatted. if numPostFields < 3 { return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) } @@ -223,30 +273,40 @@ func isIntelRdtMounted() bool { return true } -func parseCpuInfoFile(path string) (bool, error) { +func parseCpuInfoFile(path string) (bool, bool, error) { + isCatFlagSet := false + isMbaFlagSet := false + f, err := os.Open(path) if err != nil { - return false, err + return false, false, err } defer f.Close() s := bufio.NewScanner(f) for s.Scan() { if err := s.Err(); err != nil { - return false, err + return false, false, err } - text := s.Text() - flags := strings.Split(text, " ") + line := s.Text() - // "cat_l3" flag is set if Intel RDT/CAT is supported - for _, flag := range flags { - if flag == "cat_l3" { - return true, nil + // Search "cat_l3" and "mba" flags in first "flags" line + if strings.Contains(line, "flags") { + flags := strings.Split(line, " ") + // "cat_l3" flag for CAT and "mba" flag for MBA + for _, flag := range flags { + switch flag { + case "cat_l3": + isCatFlagSet = true + case "mba": + isMbaFlagSet = true + } } + return isCatFlagSet, isMbaFlagSet, nil } } - return false, nil + return isCatFlagSet, isMbaFlagSet, nil } func parseUint(s string, base, bitSize int) (uint64, error) { @@ -292,30 +352,6 @@ func getIntelRdtParamString(path, file string) (string, error) { return strings.TrimSpace(string(contents)), nil } -func readTasksFile(dir string) ([]int, error) { - f, err := os.Open(filepath.Join(dir, IntelRdtTasks)) - if err != nil { - return nil, err - } - defer f.Close() - - var ( - s = bufio.NewScanner(f) - out = []int{} - ) - - for s.Scan() { - if t := s.Text(); t != "" { - pid, err := strconv.Atoi(t) - if err != nil { - return nil, err - } - out = append(out, pid) - } - } - return out, nil -} - func writeFile(dir, file, data string) error { if dir == "" { return fmt.Errorf("no such directory for %s", file) @@ -368,6 +404,57 @@ func getL3CacheInfo() (*L3CacheInfo, error) { return l3CacheInfo, nil } +// Get the read-only memory bandwidth information +func getMemBwInfo() (*MemBwInfo, error) { + memBwInfo := &MemBwInfo{} + + rootPath, err := getIntelRdtRoot() + if err != nil { + return memBwInfo, err + } + + path := filepath.Join(rootPath, "info", "MB") + bandwidthGran, err := getIntelRdtParamUint(path, "bandwidth_gran") + if err != nil { + return memBwInfo, err + } + delayLinear, err := getIntelRdtParamUint(path, "delay_linear") + if err != nil { + return memBwInfo, err + } + minBandwidth, err := getIntelRdtParamUint(path, "min_bandwidth") + if err != nil { + return memBwInfo, err + } + numClosids, err := getIntelRdtParamUint(path, "num_closids") + if err != nil { + return memBwInfo, err + } + + memBwInfo.BandwidthGran = bandwidthGran + memBwInfo.DelayLinear = delayLinear + memBwInfo.MinBandwidth = minBandwidth + memBwInfo.NumClosids = numClosids + + return memBwInfo, nil +} + +// Get diagnostics for last filesystem operation error from file info/last_cmd_status +func getLastCmdStatus() (string, error) { + rootPath, err := getIntelRdtRoot() + if err != nil { + return "", err + } + + path := filepath.Join(rootPath, "info") + lastCmdStatus, err := getIntelRdtParamString(path, "last_cmd_status") + if err != nil { + return "", err + } + + return lastCmdStatus, nil +} + // WriteIntelRdtTasks writes the specified pid into the "tasks" file func WriteIntelRdtTasks(dir string, pid int) error { if dir == "" { @@ -383,9 +470,14 @@ func WriteIntelRdtTasks(dir string, pid int) error { return nil } -// Check if Intel RDT is enabled -func IsEnabled() bool { - return isEnabled +// Check if Intel RDT/CAT is enabled +func IsCatEnabled() bool { + return isCatEnabled +} + +// Check if Intel RDT/MBA is enabled +func IsMbaEnabled() bool { + return isMbaEnabled } // Get the 'container_id' path in Intel RDT "resource control" filesystem @@ -452,65 +544,130 @@ func (m *IntelRdtManager) GetStats() (*Stats, error) { defer m.mu.Unlock() stats := NewStats() - // The read-only L3 cache information - l3CacheInfo, err := getL3CacheInfo() - if err != nil { - return nil, err - } - stats.L3CacheInfo = l3CacheInfo - - // The read-only L3 cache schema in root rootPath, err := getIntelRdtRoot() if err != nil { return nil, err } + // The read-only L3 cache and memory bandwidth schemata in root tmpRootStrings, err := getIntelRdtParamString(rootPath, "schemata") if err != nil { return nil, err } - // L3 cache schema is in the first line schemaRootStrings := strings.Split(tmpRootStrings, "\n") - stats.L3CacheSchemaRoot = schemaRootStrings[0] - // The L3 cache schema in 'container_id' group + // The L3 cache and memory bandwidth schemata in 'container_id' group tmpStrings, err := getIntelRdtParamString(m.GetPath(), "schemata") if err != nil { return nil, err } - // L3 cache schema is in the first line schemaStrings := strings.Split(tmpStrings, "\n") - stats.L3CacheSchema = schemaStrings[0] + + if IsCatEnabled() { + // The read-only L3 cache information + l3CacheInfo, err := getL3CacheInfo() + if err != nil { + return nil, err + } + stats.L3CacheInfo = l3CacheInfo + + // The read-only L3 cache schema in root + for _, schemaRoot := range schemaRootStrings { + if strings.Contains(schemaRoot, "L3") { + stats.L3CacheSchemaRoot = strings.TrimSpace(schemaRoot) + } + } + + // The L3 cache schema in 'container_id' group + for _, schema := range schemaStrings { + if strings.Contains(schema, "L3") { + stats.L3CacheSchema = strings.TrimSpace(schema) + } + } + } + + if IsMbaEnabled() { + // The read-only memory bandwidth information + memBwInfo, err := getMemBwInfo() + if err != nil { + return nil, err + } + stats.MemBwInfo = memBwInfo + + // The read-only memory bandwidth information + for _, schemaRoot := range schemaRootStrings { + if strings.Contains(schemaRoot, "MB") { + stats.MemBwSchemaRoot = strings.TrimSpace(schemaRoot) + } + } + + // The memory bandwidth schema in 'container_id' group + for _, schema := range schemaStrings { + if strings.Contains(schema, "MB") { + stats.MemBwSchema = strings.TrimSpace(schema) + } + } + } return stats, nil } // Set Intel RDT "resource control" filesystem as configured. func (m *IntelRdtManager) Set(container *configs.Config) error { - path := m.GetPath() - - // About L3 cache schema file: - // The schema has allocation masks/values for L3 cache on each socket, + // About L3 cache schema: + // It has allocation bitmasks/values for L3 cache on each socket, // which contains L3 cache id and capacity bitmask (CBM). - // Format: "L3:=;=;..." - // For example, on a two-socket machine, L3's schema line could be: - // L3:0=ff;1=c0 - // Which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. + // Format: "L3:=;=;..." + // For example, on a two-socket machine, the schema line could be: + // L3:0=ff;1=c0 + // which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM + // is 0xc0. // - // About L3 cache CBM validity: // The valid L3 cache CBM is a *contiguous bits set* and number of // bits that can be set is less than the max bit. The max bits in the - // CBM is varied among supported Intel Xeon platforms. In Intel RDT - // "resource control" filesystem layout, the CBM in a group should - // be a subset of the CBM in root. Kernel will check if it is valid - // when writing. - // e.g., 0xfffff in root indicates the max bits of CBM is 20 bits, - // which mapping to entire L3 cache capacity. Some valid CBM values - // to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. + // CBM is varied among supported Intel CPU models. Kernel will check + // if it is valid when writing. e.g., default value 0xfffff in root + // indicates the max bits of CBM is 20 bits, which mapping to entire + // L3 cache capacity. Some valid CBM values to set in a group: + // 0xf, 0xf0, 0x3ff, 0x1f00 and etc. + // + // + // About memory bandwidth schema: + // It has allocation values for memory bandwidth on each socket, which + // contains L3 cache id and memory bandwidth percentage. + // Format: "MB:=bandwidth0;=bandwidth1;..." + // For example, on a two-socket machine, the schema line could be: + // "MB:0=20;1=70" + // + // The minimum bandwidth percentage value for each CPU model is + // predefined and can be looked up through "info/MB/min_bandwidth". + // The bandwidth granularity that is allocated is also dependent on + // the CPU model and can be looked up at "info/MB/bandwidth_gran". + // The available bandwidth control steps are: min_bw + N * bw_gran. + // Intermediate values are rounded to the next control step available + // on the hardware. if container.IntelRdt != nil { + path := m.GetPath() l3CacheSchema := container.IntelRdt.L3CacheSchema - if l3CacheSchema != "" { + memBwSchema := container.IntelRdt.MemBwSchema + + // Write a single joint schema string to schemata file + if l3CacheSchema != "" && memBwSchema != "" { + if err := writeFile(path, "schemata", l3CacheSchema+"\n"+memBwSchema); err != nil { + return NewLastCmdError(err) + } + } + + // Write only L3 cache schema string to schemata file + if l3CacheSchema != "" && memBwSchema == "" { if err := writeFile(path, "schemata", l3CacheSchema); err != nil { - return err + return NewLastCmdError(err) + } + } + + // Write only memory bandwidth schema string to schemata file + if l3CacheSchema == "" && memBwSchema != "" { + if err := writeFile(path, "schemata", memBwSchema); err != nil { + return NewLastCmdError(err) } } } @@ -521,11 +678,11 @@ func (m *IntelRdtManager) Set(container *configs.Config) error { func (raw *intelRdtData) join(id string) (string, error) { path := filepath.Join(raw.root, id) if err := os.MkdirAll(path, 0755); err != nil { - return "", err + return "", NewLastCmdError(err) } if err := WriteIntelRdtTasks(path, raw.pid); err != nil { - return "", err + return "", NewLastCmdError(err) } return path, nil } @@ -551,3 +708,23 @@ func IsNotFound(err error) bool { _, ok := err.(*NotFoundError) return ok } + +type LastCmdError struct { + LastCmdStatus string + Err error +} + +func (e *LastCmdError) Error() string { + return fmt.Sprintf(e.Err.Error() + ", last_cmd_status: " + e.LastCmdStatus) +} + +func NewLastCmdError(err error) error { + lastCmdStatus, err1 := getLastCmdStatus() + if err1 == nil { + return &LastCmdError{ + LastCmdStatus: lastCmdStatus, + Err: err, + } + } + return err +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go index 095c0a380cd..df5686f3b80 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go @@ -8,6 +8,13 @@ type L3CacheInfo struct { NumClosids uint64 `json:"num_closids,omitempty"` } +type MemBwInfo struct { + BandwidthGran uint64 `json:"bandwidth_gran,omitempty"` + DelayLinear uint64 `json:"delay_linear,omitempty"` + MinBandwidth uint64 `json:"min_bandwidth,omitempty"` + NumClosids uint64 `json:"num_closids,omitempty"` +} + type Stats struct { // The read-only L3 cache information L3CacheInfo *L3CacheInfo `json:"l3_cache_info,omitempty"` @@ -17,6 +24,15 @@ type Stats struct { // The L3 cache schema in 'container_id' group L3CacheSchema string `json:"l3_cache_schema,omitempty"` + + // The read-only memory bandwidth information + MemBwInfo *MemBwInfo `json:"mem_bw_info,omitempty"` + + // The read-only memory bandwidth schema in root + MemBwSchemaRoot string `json:"mem_bw_schema_root,omitempty"` + + // The memory bandwidth schema in 'container_id' group + MemBwSchema string `json:"mem_bw_schema,omitempty"` } func NewStats() *Stats { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/keys/BUILD b/vendor/github.com/opencontainers/runc/libcontainer/keys/BUILD index b539ae2d45b..280323836c0 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/keys/BUILD +++ b/vendor/github.com/opencontainers/runc/libcontainer/keys/BUILD @@ -8,6 +8,7 @@ go_library( visibility = ["//visibility:public"], deps = select({ "@io_bazel_rules_go//go/platform:linux": [ + "//vendor/github.com/pkg/errors:go_default_library", "//vendor/golang.org/x/sys/unix:go_default_library", ], "//conditions:default": [], diff --git a/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go b/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go index ce8b4e6b040..74dedd56ca8 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go @@ -7,6 +7,8 @@ import ( "strconv" "strings" + "github.com/pkg/errors" + "golang.org/x/sys/unix" ) @@ -15,7 +17,7 @@ type KeySerial uint32 func JoinSessionKeyring(name string) (KeySerial, error) { sessKeyId, err := unix.KeyctlJoinSessionKeyring(name) if err != nil { - return 0, fmt.Errorf("could not create session key: %v", err) + return 0, errors.Wrap(err, "create session key") } return KeySerial(sessKeyId), nil } @@ -42,9 +44,5 @@ func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error { perm := (uint32(perm64) & mask) | setbits - if err := unix.KeyctlSetperm(int(ringId), perm); err != nil { - return err - } - - return nil + return unix.KeyctlSetperm(int(ringId), perm) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go index ed7f986df8d..1d4f5033aa2 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go @@ -10,16 +10,16 @@ import ( // list of known message types we want to send to bootstrap program // The number is randomly chosen to not conflict with known netlink types const ( - InitMsg uint16 = 62000 - CloneFlagsAttr uint16 = 27281 - NsPathsAttr uint16 = 27282 - UidmapAttr uint16 = 27283 - GidmapAttr uint16 = 27284 - SetgroupAttr uint16 = 27285 - OomScoreAdjAttr uint16 = 27286 - RootlessAttr uint16 = 27287 - UidmapPathAttr uint16 = 27288 - GidmapPathAttr uint16 = 27289 + InitMsg uint16 = 62000 + CloneFlagsAttr uint16 = 27281 + NsPathsAttr uint16 = 27282 + UidmapAttr uint16 = 27283 + GidmapAttr uint16 = 27284 + SetgroupAttr uint16 = 27285 + OomScoreAdjAttr uint16 = 27286 + RootlessEUIDAttr uint16 = 27287 + UidmapPathAttr uint16 = 27288 + GidmapPathAttr uint16 = 27289 ) type Int32msg struct { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/network_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/network_linux.go index 5075bee4db5..569c53f6e8a 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/network_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/network_linux.go @@ -5,18 +5,15 @@ package libcontainer import ( "fmt" "io/ioutil" - "net" "path/filepath" "strconv" "strings" "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/utils" "github.com/vishvananda/netlink" ) var strategies = map[string]networkStrategy{ - "veth": &veth{}, "loopback": &loopback{}, } @@ -103,157 +100,3 @@ func (l *loopback) attach(n *configs.Network) (err error) { func (l *loopback) detach(n *configs.Network) (err error) { return nil } - -// veth is a network strategy that uses a bridge and creates -// a veth pair, one that is attached to the bridge on the host and the other -// is placed inside the container's namespace -type veth struct { -} - -func (v *veth) detach(n *configs.Network) (err error) { - return netlink.LinkSetMaster(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: n.HostInterfaceName}}, nil) -} - -// attach a container network interface to an external network -func (v *veth) attach(n *configs.Network) (err error) { - brl, err := netlink.LinkByName(n.Bridge) - if err != nil { - return err - } - br, ok := brl.(*netlink.Bridge) - if !ok { - return fmt.Errorf("Wrong device type %T", brl) - } - host, err := netlink.LinkByName(n.HostInterfaceName) - if err != nil { - return err - } - - if err := netlink.LinkSetMaster(host, br); err != nil { - return err - } - if err := netlink.LinkSetMTU(host, n.Mtu); err != nil { - return err - } - if n.HairpinMode { - if err := netlink.LinkSetHairpin(host, true); err != nil { - return err - } - } - if err := netlink.LinkSetUp(host); err != nil { - return err - } - - return nil -} - -func (v *veth) create(n *network, nspid int) (err error) { - tmpName, err := v.generateTempPeerName() - if err != nil { - return err - } - n.TempVethPeerName = tmpName - if n.Bridge == "" { - return fmt.Errorf("bridge is not specified") - } - veth := &netlink.Veth{ - LinkAttrs: netlink.LinkAttrs{ - Name: n.HostInterfaceName, - TxQLen: n.TxQueueLen, - }, - PeerName: n.TempVethPeerName, - } - if err := netlink.LinkAdd(veth); err != nil { - return err - } - defer func() { - if err != nil { - netlink.LinkDel(veth) - } - }() - if err := v.attach(&n.Network); err != nil { - return err - } - child, err := netlink.LinkByName(n.TempVethPeerName) - if err != nil { - return err - } - return netlink.LinkSetNsPid(child, nspid) -} - -func (v *veth) generateTempPeerName() (string, error) { - return utils.GenerateRandomName("veth", 7) -} - -func (v *veth) initialize(config *network) error { - peer := config.TempVethPeerName - if peer == "" { - return fmt.Errorf("peer is not specified") - } - child, err := netlink.LinkByName(peer) - if err != nil { - return err - } - if err := netlink.LinkSetDown(child); err != nil { - return err - } - if err := netlink.LinkSetName(child, config.Name); err != nil { - return err - } - // get the interface again after we changed the name as the index also changes. - if child, err = netlink.LinkByName(config.Name); err != nil { - return err - } - if config.MacAddress != "" { - mac, err := net.ParseMAC(config.MacAddress) - if err != nil { - return err - } - if err := netlink.LinkSetHardwareAddr(child, mac); err != nil { - return err - } - } - ip, err := netlink.ParseAddr(config.Address) - if err != nil { - return err - } - if err := netlink.AddrAdd(child, ip); err != nil { - return err - } - if config.IPv6Address != "" { - ip6, err := netlink.ParseAddr(config.IPv6Address) - if err != nil { - return err - } - if err := netlink.AddrAdd(child, ip6); err != nil { - return err - } - } - if err := netlink.LinkSetMTU(child, config.Mtu); err != nil { - return err - } - if err := netlink.LinkSetUp(child); err != nil { - return err - } - if config.Gateway != "" { - gw := net.ParseIP(config.Gateway) - if err := netlink.RouteAdd(&netlink.Route{ - Scope: netlink.SCOPE_UNIVERSE, - LinkIndex: child.Attrs().Index, - Gw: gw, - }); err != nil { - return err - } - } - if config.IPv6Gateway != "" { - gw := net.ParseIP(config.IPv6Gateway) - if err := netlink.RouteAdd(&netlink.Route{ - Scope: netlink.SCOPE_UNIVERSE, - LinkIndex: child.Attrs().Index, - Gw: gw, - }); err != nil { - return err - } - } - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/process.go b/vendor/github.com/opencontainers/runc/libcontainer/process.go index 86bf7387f8c..9a7c6014121 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/process.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/process.go @@ -72,6 +72,9 @@ type Process struct { // ConsoleSocket provides the masterfd console. ConsoleSocket *os.File + // Init specifies whether the process is the first process in the container. + Init bool + ops processOperations } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go index 58980b0594e..217c213f8a3 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go @@ -46,15 +46,16 @@ type parentProcess interface { } type setnsProcess struct { - cmd *exec.Cmd - parentPipe *os.File - childPipe *os.File - cgroupPaths map[string]string - intelRdtPath string - config *initConfig - fds []string - process *Process - bootstrapData io.Reader + cmd *exec.Cmd + parentPipe *os.File + childPipe *os.File + cgroupPaths map[string]string + rootlessCgroups bool + intelRdtPath string + config *initConfig + fds []string + process *Process + bootstrapData io.Reader } func (p *setnsProcess) startTime() (uint64, error) { @@ -86,7 +87,7 @@ func (p *setnsProcess) start() (err error) { return newSystemErrorWithCause(err, "executing setns process") } if len(p.cgroupPaths) > 0 { - if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil { + if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil && !p.rootlessCgroups { return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid()) } } @@ -537,7 +538,7 @@ func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) { } fds = append(fds, r.Fd(), w.Fd()) p.Stderr, i.Stderr = w, r - // change ownership of the pipes incase we are in a user namespace + // change ownership of the pipes in case we are in a user namespace for _, fd := range fds { if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil { return nil, err diff --git a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go index 7f852efceb2..a278dad35db 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go @@ -46,6 +46,7 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) { return newSystemErrorWithCause(err, "preparing rootfs") } + setupDev := needsSetupDev(config) for _, m := range config.Mounts { for _, precmd := range m.PremountCmds { if err := mountCmd(precmd); err != nil { @@ -64,8 +65,6 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) { } } - setupDev := needsSetupDev(config) - if setupDev { if err := createDevices(config); err != nil { return newSystemErrorWithCause(err, "creating device nodes") @@ -153,6 +152,26 @@ func finalizeRootfs(config *configs.Config) (err error) { return nil } +// /tmp has to be mounted as private to allow MS_MOVE to work in all situations +func prepareTmp(topTmpDir string) (string, error) { + tmpdir, err := ioutil.TempDir(topTmpDir, "runctop") + if err != nil { + return "", err + } + if err := unix.Mount(tmpdir, tmpdir, "bind", unix.MS_BIND, ""); err != nil { + return "", err + } + if err := unix.Mount("", tmpdir, "", uintptr(unix.MS_PRIVATE), ""); err != nil { + return "", err + } + return tmpdir, nil +} + +func cleanupTmp(tmpdir string) error { + unix.Unmount(tmpdir, 0) + return os.RemoveAll(tmpdir) +} + func mountCmd(cmd configs.Command) error { command := exec.Command(cmd.Path, cmd.Args[:]...) command.Env = cmd.Env @@ -200,7 +219,12 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error { } } if copyUp { - tmpDir, err = ioutil.TempDir("/tmp", "runctmpdir") + tmpdir, err := prepareTmp("/tmp") + if err != nil { + return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir") + } + defer cleanupTmp(tmpdir) + tmpDir, err = ioutil.TempDir(tmpdir, "runctmpdir") if err != nil { return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir") } @@ -397,6 +421,7 @@ func checkMountDestination(rootfs, dest string) error { "/proc/stat", "/proc/swaps", "/proc/uptime", + "/proc/loadavg", "/proc/net/dev", } for _, valid := range validDestinations { @@ -413,7 +438,7 @@ func checkMountDestination(rootfs, dest string) error { if err != nil { return err } - if path == "." || !strings.HasPrefix(path, "..") { + if path != "." && !strings.HasPrefix(path, "..") { return fmt.Errorf("%q cannot be mounted because it is located inside %q", dest, invalid) } } @@ -803,10 +828,7 @@ func remount(m *configs.Mount, rootfs string) error { if !strings.HasPrefix(dest, rootfs) { dest = filepath.Join(rootfs, dest) } - if err := unix.Mount(m.Source, dest, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), ""); err != nil { - return err - } - return nil + return unix.Mount(m.Source, dest, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "") } // Do the mount operation followed by additional mounts required to take care diff --git a/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go index 096c601e767..6613bb65cb5 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go @@ -5,12 +5,14 @@ package libcontainer import ( "fmt" "os" + "runtime" "github.com/opencontainers/runc/libcontainer/apparmor" "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/selinux/go-selinux/label" + "github.com/pkg/errors" "golang.org/x/sys/unix" ) @@ -28,10 +30,19 @@ func (l *linuxSetnsInit) getSessionRingName() string { } func (l *linuxSetnsInit) Init() error { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + if !l.config.Config.NoNewKeyring { - // do not inherit the parent's session keyring + // Do not inherit the parent's session keyring. if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil { - return err + // Same justification as in standart_init_linux.go as to why we + // don't bail on ENOSYS. + // + // TODO(cyphar): And we should have logging here too. + if errors.Cause(err) != unix.ENOSYS { + return errors.Wrap(err, "join session keyring") + } } } if l.config.CreateConsole { @@ -47,6 +58,10 @@ func (l *linuxSetnsInit) Init() error { return err } } + if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { + return err + } + defer label.SetProcessLabel("") // Without NoNewPrivileges seccomp is a privileged operation, so we need to // do this before dropping capabilities; otherwise do it as late as possible // just before execve so as few syscalls take place after it as possible. @@ -61,9 +76,6 @@ func (l *linuxSetnsInit) Init() error { if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { return err } - if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { - return err - } // Set seccomp as close to execve as possible, so as few syscalls take // place afterward (reducing the amount of syscalls that users need to // enable in their seccomp profiles). diff --git a/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go index e74d8002566..ad7ee8d8c8a 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go @@ -6,6 +6,7 @@ import ( "fmt" "os" "os/exec" + "runtime" "syscall" //only for Exec "github.com/opencontainers/runc/libcontainer/apparmor" @@ -14,6 +15,7 @@ import ( "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/selinux/go-selinux/label" + "github.com/pkg/errors" "golang.org/x/sys/unix" ) @@ -43,17 +45,31 @@ func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) { } func (l *linuxStandardInit) Init() error { + runtime.LockOSThread() + defer runtime.UnlockOSThread() if !l.config.Config.NoNewKeyring { ringname, keepperms, newperms := l.getSessionRingParams() // Do not inherit the parent's session keyring. - sessKeyId, err := keys.JoinSessionKeyring(ringname) - if err != nil { - return err - } - // Make session keyring searcheable. - if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil { - return err + if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil { + // If keyrings aren't supported then it is likely we are on an + // older kernel (or inside an LXC container). While we could bail, + // the security feature we are using here is best-effort (it only + // really provides marginal protection since VFS credentials are + // the only significant protection of keyrings). + // + // TODO(cyphar): Log this so people know what's going on, once we + // have proper logging in 'runc init'. + if errors.Cause(err) != unix.ENOSYS { + return errors.Wrap(err, "join session keyring") + } + } else { + // Make session keyring searcheable. If we've gotten this far we + // bail on any error -- we don't want to have a keyring with bad + // permissions. + if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil { + return errors.Wrap(err, "mod keyring permissions") + } } } @@ -76,7 +92,7 @@ func (l *linuxStandardInit) Init() error { return err } if err := system.Setctty(); err != nil { - return err + return errors.Wrap(err, "setctty") } } @@ -89,46 +105,47 @@ func (l *linuxStandardInit) Init() error { if hostname := l.config.Config.Hostname; hostname != "" { if err := unix.Sethostname([]byte(hostname)); err != nil { - return err + return errors.Wrap(err, "sethostname") } } if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { - return err - } - if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { - return err + return errors.Wrap(err, "apply apparmor profile") } for key, value := range l.config.Config.Sysctl { if err := writeSystemProperty(key, value); err != nil { - return err + return errors.Wrapf(err, "write sysctl key %s", key) } } for _, path := range l.config.Config.ReadonlyPaths { if err := readonlyPath(path); err != nil { - return err + return errors.Wrapf(err, "readonly path %s", path) } } for _, path := range l.config.Config.MaskPaths { if err := maskPath(path, l.config.Config.MountLabel); err != nil { - return err + return errors.Wrapf(err, "mask path %s", path) } } pdeath, err := system.GetParentDeathSignal() if err != nil { - return err + return errors.Wrap(err, "get pdeath signal") } if l.config.NoNewPrivileges { if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { - return err + return errors.Wrap(err, "set nonewprivileges") } } // Tell our parent that we're ready to Execv. This must be done before the // Seccomp rules have been applied, because we need to be able to read and // write to a socket. if err := syncParentReady(l.pipe); err != nil { - return err + return errors.Wrap(err, "sync ready") } + if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { + return errors.Wrap(err, "set process label") + } + defer label.SetProcessLabel("") // Without NoNewPrivileges seccomp is a privileged operation, so we need to // do this before dropping capabilities; otherwise do it as late as possible // just before execve so as few syscalls take place after it as possible. @@ -143,7 +160,7 @@ func (l *linuxStandardInit) Init() error { // finalizeNamespace can change user/group which clears the parent death // signal, so we restore it here. if err := pdeath.Restore(); err != nil { - return err + return errors.Wrap(err, "restore pdeath signal") } // Compare the parent from the initial start of the init process and make // sure that it did not change. if the parent changes that means it died diff --git a/vendor/github.com/opencontainers/runc/libcontainer/sync.go b/vendor/github.com/opencontainers/runc/libcontainer/sync.go index cf7b45bc329..a8704a2679d 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/sync.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/sync.go @@ -41,10 +41,7 @@ type syncT struct { // writeSync is used to write to a synchronisation pipe. An error is returned // if there was a problem writing the payload. func writeSync(pipe io.Writer, sync syncType) error { - if err := utils.WriteJSON(pipe, syncT{sync}); err != nil { - return err - } - return nil + return utils.WriteJSON(pipe, syncT{sync}) } // readSync is used to read from a synchronisation pipe. An error is returned diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/BUILD b/vendor/github.com/opencontainers/runc/libcontainer/system/BUILD index ad5438de66f..7efcde3b573 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/BUILD +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/BUILD @@ -17,6 +17,41 @@ go_library( importpath = "github.com/opencontainers/runc/libcontainer/system", visibility = ["//visibility:public"], deps = select({ + "@io_bazel_rules_go//go/platform:android": [ + "//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library", + ], + "@io_bazel_rules_go//go/platform:darwin": [ + "//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library", + ], + "@io_bazel_rules_go//go/platform:dragonfly": [ + "//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library", + ], + "@io_bazel_rules_go//go/platform:freebsd": [ + "//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library", + ], + "@io_bazel_rules_go//go/platform:linux": [ + "//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library", + ], + "@io_bazel_rules_go//go/platform:nacl": [ + "//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library", + ], + "@io_bazel_rules_go//go/platform:netbsd": [ + "//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library", + ], + "@io_bazel_rules_go//go/platform:openbsd": [ + "//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library", + ], + "@io_bazel_rules_go//go/platform:plan9": [ + "//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library", + ], + "@io_bazel_rules_go//go/platform:solaris": [ + "//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library", + ], + "@io_bazel_rules_go//go/platform:windows": [ + "//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library", + ], + "//conditions:default": [], + }) + select({ "@io_bazel_rules_go//go/platform:linux_386": [ "//vendor/golang.org/x/sys/unix:go_default_library", ], diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go index 5f124cd8bbc..a4ae8901acc 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go @@ -3,13 +3,12 @@ package system import ( - "bufio" - "fmt" "os" "os/exec" "syscall" // only for exec "unsafe" + "github.com/opencontainers/runc/libcontainer/user" "golang.org/x/sys/unix" ) @@ -102,34 +101,43 @@ func Setctty() error { } // RunningInUserNS detects whether we are currently running in a user namespace. -// Copied from github.com/lxc/lxd/shared/util.go +// Originally copied from github.com/lxc/lxd/shared/util.go func RunningInUserNS() bool { - file, err := os.Open("/proc/self/uid_map") + uidmap, err := user.CurrentProcessUIDMap() if err != nil { // This kernel-provided file only exists if user namespaces are supported return false } - defer file.Close() + return UIDMapInUserNS(uidmap) +} - buf := bufio.NewReader(file) - l, _, err := buf.ReadLine() - if err != nil { - return false - } - - line := string(l) - var a, b, c int64 - fmt.Sscanf(line, "%d %d %d", &a, &b, &c) +func UIDMapInUserNS(uidmap []user.IDMap) bool { /* * We assume we are in the initial user namespace if we have a full * range - 4294967295 uids starting at uid 0. */ - if a == 0 && b == 0 && c == 4294967295 { + if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 { return false } return true } +// GetParentNSeuid returns the euid within the parent user namespace +func GetParentNSeuid() int64 { + euid := int64(os.Geteuid()) + uidmap, err := user.CurrentProcessUIDMap() + if err != nil { + // This kernel-provided file only exists if user namespaces are supported + return euid + } + for _, um := range uidmap { + if um.ID <= euid && euid <= um.ID+um.Count-1 { + return um.ParentID + euid - um.ID + } + } + return euid +} + // SetSubreaper sets the value i as the subreaper setting for the calling process func SetSubreaper(i int) error { return unix.Prctl(PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go index e7cfd62b293..b94be74a664 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go @@ -2,8 +2,26 @@ package system +import ( + "os" + + "github.com/opencontainers/runc/libcontainer/user" +) + // RunningInUserNS is a stub for non-Linux systems // Always returns false func RunningInUserNS() bool { return false } + +// UIDMapInUserNS is a stub for non-Linux systems +// Always returns false +func UIDMapInUserNS(uidmap []user.IDMap) bool { + return false +} + +// GetParentNSeuid returns the euid within the parent user namespace +// Always returns os.Geteuid on non-linux +func GetParentNSeuid() int { + return os.Geteuid() +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go index c45e3004118..92b5ae8de01 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go @@ -5,6 +5,7 @@ package user import ( "io" "os" + "strconv" "golang.org/x/sys/unix" ) @@ -114,3 +115,30 @@ func CurrentUser() (User, error) { func CurrentGroup() (Group, error) { return LookupGid(unix.Getgid()) } + +func currentUserSubIDs(fileName string) ([]SubID, error) { + u, err := CurrentUser() + if err != nil { + return nil, err + } + filter := func(entry SubID) bool { + return entry.Name == u.Name || entry.Name == strconv.Itoa(u.Uid) + } + return ParseSubIDFileFilter(fileName, filter) +} + +func CurrentUserSubUIDs() ([]SubID, error) { + return currentUserSubIDs("/etc/subuid") +} + +func CurrentUserSubGIDs() ([]SubID, error) { + return currentUserSubIDs("/etc/subgid") +} + +func CurrentProcessUIDMap() ([]IDMap, error) { + return ParseIDMapFile("/proc/self/uid_map") +} + +func CurrentProcessGIDMap() ([]IDMap, error) { + return ParseIDMapFile("/proc/self/gid_map") +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go index 93414516caa..7b912bbf8b4 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go @@ -75,12 +75,29 @@ func groupFromOS(g *user.Group) (Group, error) { return newGroup, nil } +// SubID represents an entry in /etc/sub{u,g}id +type SubID struct { + Name string + SubID int64 + Count int64 +} + +// IDMap represents an entry in /proc/PID/{u,g}id_map +type IDMap struct { + ID int64 + ParentID int64 + Count int64 +} + func parseLine(line string, v ...interface{}) { - if line == "" { + parseParts(strings.Split(line, ":"), v...) +} + +func parseParts(parts []string, v ...interface{}) { + if len(parts) == 0 { return } - parts := strings.Split(line, ":") for i, p := range parts { // Ignore cases where we don't have enough fields to populate the arguments. // Some configuration files like to misbehave. @@ -96,6 +113,8 @@ func parseLine(line string, v ...interface{}) { case *int: // "numbers", with conversion errors ignored because of some misbehaving configuration files. *e, _ = strconv.Atoi(p) + case *int64: + *e, _ = strconv.ParseInt(p, 10, 64) case *[]string: // Comma-separated lists. if p != "" { @@ -105,7 +124,7 @@ func parseLine(line string, v ...interface{}) { } default: // Someone goof'd when writing code using this function. Scream so they can hear us. - panic(fmt.Sprintf("parseLine only accepts {*string, *int, *[]string} as arguments! %#v is not a pointer!", e)) + panic(fmt.Sprintf("parseLine only accepts {*string, *int, *int64, *[]string} as arguments! %#v is not a pointer!", e)) } } } @@ -479,3 +498,111 @@ func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int } return GetAdditionalGroups(additionalGroups, group) } + +func ParseSubIDFile(path string) ([]SubID, error) { + subid, err := os.Open(path) + if err != nil { + return nil, err + } + defer subid.Close() + return ParseSubID(subid) +} + +func ParseSubID(subid io.Reader) ([]SubID, error) { + return ParseSubIDFilter(subid, nil) +} + +func ParseSubIDFileFilter(path string, filter func(SubID) bool) ([]SubID, error) { + subid, err := os.Open(path) + if err != nil { + return nil, err + } + defer subid.Close() + return ParseSubIDFilter(subid, filter) +} + +func ParseSubIDFilter(r io.Reader, filter func(SubID) bool) ([]SubID, error) { + if r == nil { + return nil, fmt.Errorf("nil source for subid-formatted data") + } + + var ( + s = bufio.NewScanner(r) + out = []SubID{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + line := strings.TrimSpace(s.Text()) + if line == "" { + continue + } + + // see: man 5 subuid + p := SubID{} + parseLine(line, &p.Name, &p.SubID, &p.Count) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + + return out, nil +} + +func ParseIDMapFile(path string) ([]IDMap, error) { + r, err := os.Open(path) + if err != nil { + return nil, err + } + defer r.Close() + return ParseIDMap(r) +} + +func ParseIDMap(r io.Reader) ([]IDMap, error) { + return ParseIDMapFilter(r, nil) +} + +func ParseIDMapFileFilter(path string, filter func(IDMap) bool) ([]IDMap, error) { + r, err := os.Open(path) + if err != nil { + return nil, err + } + defer r.Close() + return ParseIDMapFilter(r, filter) +} + +func ParseIDMapFilter(r io.Reader, filter func(IDMap) bool) ([]IDMap, error) { + if r == nil { + return nil, fmt.Errorf("nil source for idmap-formatted data") + } + + var ( + s = bufio.NewScanner(r) + out = []IDMap{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + line := strings.TrimSpace(s.Text()) + if line == "" { + continue + } + + // see: man 7 user_namespaces + p := IDMap{} + parseParts(strings.Fields(line), &p.ID, &p.ParentID, &p.Count) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + + return out, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go index baa54c9ba2c..40ccfaa1a01 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go @@ -1,8 +1,6 @@ package utils import ( - "crypto/rand" - "encoding/hex" "encoding/json" "io" "os" @@ -17,19 +15,6 @@ const ( exitSignalOffset = 128 ) -// GenerateRandomName returns a new name joined with a prefix. This size -// specified is used to truncate the randomly generated value -func GenerateRandomName(prefix string, size int) (string, error) { - id := make([]byte, 32) - if _, err := io.ReadFull(rand.Reader, id); err != nil { - return "", err - } - if size > 64 { - size = 64 - } - return prefix + hex.EncodeToString(id)[:size], nil -} - // ResolveRootfs ensures that the current working directory is // not a symlink and returns the absolute path to the rootfs func ResolveRootfs(uncleanRootfs string) (string, error) {