From 9c0f9b204f67ab0f4232163e41ba22762af2ca59 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 23 Aug 2021 13:05:23 -0700 Subject: [PATCH 1/2] vendor: bump runc to 1.0.2 For the complete release notes, see - https://github.com/opencontainers/runc/releases/tag/v1.0.2 In particular, this fixes the check cgroup v1 systemd manager check if a container needs to be frozen before Set(), and adds a knob to skip the check/freeze entirely (to be used by the next commit). Signed-off-by: Kir Kolyshkin --- go.mod | 4 +-- go.sum | 4 +-- .../runc/libcontainer/cgroups/fs/cpu.go | 24 ++++++++++++++++-- .../libcontainer/cgroups/systemd/common.go | 12 +++++++-- .../runc/libcontainer/cgroups/systemd/v1.go | 25 +++++++++++++------ .../runc/libcontainer/configs/cgroup_linux.go | 12 +++++++++ .../libcontainer/seccomp/seccomp_linux.go | 21 ++++++++++------ vendor/modules.txt | 4 +-- 8 files changed, 81 insertions(+), 25 deletions(-) diff --git a/go.mod b/go.mod index 8fb0d449a38..28d0ee40cd6 100644 --- a/go.mod +++ b/go.mod @@ -65,7 +65,7 @@ require ( github.com/onsi/ginkgo v1.14.0 github.com/onsi/gomega v1.10.1 github.com/opencontainers/go-digest v1.0.0 - github.com/opencontainers/runc v1.0.1 + github.com/opencontainers/runc v1.0.2 github.com/opencontainers/selinux v1.8.2 github.com/pkg/errors v0.9.1 github.com/pmezard/go-difflib v1.0.0 @@ -360,7 +360,7 @@ replace ( github.com/onsi/gomega => github.com/onsi/gomega v1.10.1 github.com/opencontainers/go-digest => github.com/opencontainers/go-digest v1.0.0 github.com/opencontainers/image-spec => github.com/opencontainers/image-spec v1.0.1 - github.com/opencontainers/runc => github.com/opencontainers/runc v1.0.1 + github.com/opencontainers/runc => github.com/opencontainers/runc v1.0.2 github.com/opencontainers/runtime-spec => github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 github.com/opencontainers/selinux => github.com/opencontainers/selinux v1.8.2 github.com/opentracing/opentracing-go => github.com/opentracing/opentracing-go v1.1.0 diff --git a/go.sum b/go.sum index e9a47b6e336..7b24fc3620c 100644 --- a/go.sum +++ b/go.sum @@ -364,8 +364,8 @@ github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8 github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.0.1 h1:JMemWkRwHx4Zj+fVxWoMCFm/8sYGGrUVojFA6h/TRcI= github.com/opencontainers/image-spec v1.0.1/go.mod h1:BtxoFyWECRxE4U/7sNtV5W15zMzWCbyJoFRP3s7yZA0= -github.com/opencontainers/runc v1.0.1 h1:G18PGckGdAm3yVQRWDVQ1rLSLntiniKJ0cNRT2Tm5gs= -github.com/opencontainers/runc v1.0.1/go.mod h1:aTaHFFwQXuA71CiyxOdFFIorAoemI04suvGRQFzWTD0= +github.com/opencontainers/runc v1.0.2 h1:opHZMaswlyxz1OuGpBE53Dwe4/xF7EZTY0A2L/FpCOg= +github.com/opencontainers/runc v1.0.2/go.mod h1:aTaHFFwQXuA71CiyxOdFFIorAoemI04suvGRQFzWTD0= github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 h1:3snG66yBm59tKhhSPQrQ/0bCrv1LQbKt40LnUPiUxdc= github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/selinux v1.8.2 h1:c4ca10UMgRcvZ6h0K4HtS15UaVSBEaE+iln2LVpAuGc= diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go index 31c1c874ea0..7ca8b8a8d40 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go @@ -4,6 +4,7 @@ package fs import ( "bufio" + "errors" "fmt" "os" "strconv" @@ -11,6 +12,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" + "golang.org/x/sys/unix" ) type CpuGroup struct{} @@ -71,15 +73,33 @@ func (s *CpuGroup) Set(path string, r *configs.Resources) error { return fmt.Errorf("the minimum allowed cpu-shares is %d", sharesRead) } } + + var period string if r.CpuPeriod != 0 { - if err := cgroups.WriteFile(path, "cpu.cfs_period_us", strconv.FormatUint(r.CpuPeriod, 10)); err != nil { - return err + period = strconv.FormatUint(r.CpuPeriod, 10) + if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil { + // Sometimes when the period to be set is smaller + // than the current one, it is rejected by the kernel + // (EINVAL) as old_quota/new_period exceeds the parent + // cgroup quota limit. If this happens and the quota is + // going to be set, ignore the error for now and retry + // after setting the quota. + if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 { + return err + } + } else { + period = "" } } if r.CpuQuota != 0 { if err := cgroups.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(r.CpuQuota, 10)); err != nil { return err } + if period != "" { + if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil { + return err + } + } } return s.SetRtSched(path, r) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go index 3506c82746d..06c6c2e95a6 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go @@ -310,6 +310,14 @@ func getUnitName(c *configs.Cgroup) string { return c.Name } +// This code should be in sync with getUnitName. +func getUnitType(unitName string) string { + if strings.HasSuffix(unitName, ".slice") { + return "Slice" + } + return "Scope" +} + // isDbusError returns true if the error is a specific dbus error. func isDbusError(err error, name string) bool { if err != nil { @@ -388,10 +396,10 @@ func resetFailedUnit(cm *dbusConnManager, name string) { } } -func getUnitProperty(cm *dbusConnManager, unitName string, propertyName string) (*systemdDbus.Property, error) { +func getUnitTypeProperty(cm *dbusConnManager, unitName string, unitType string, propertyName string) (*systemdDbus.Property, error) { var prop *systemdDbus.Property err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) (Err error) { - prop, Err = c.GetUnitPropertyContext(context.TODO(), unitName, propertyName) + prop, Err = c.GetUnitTypePropertyContext(context.TODO(), unitName, unitType, propertyName) return Err }) return prop, err diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v1.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v1.go index 1a8e1e3c6c1..cd4720c5d44 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v1.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v1.go @@ -6,6 +6,7 @@ import ( "errors" "os" "path/filepath" + "reflect" "strings" "sync" @@ -345,6 +346,11 @@ func (m *legacyManager) freezeBeforeSet(unitName string, r *configs.Resources) ( // Special case for SkipDevices, as used by Kubernetes to create pod // cgroups with allow-all device policy). if r.SkipDevices { + if r.SkipFreezeOnSet { + // Both needsFreeze and needsThaw are false. + return + } + // No need to freeze if SkipDevices is set, and either // (1) systemd unit does not (yet) exist, or // (2) it has DevicePolicy=auto and empty DeviceAllow list. @@ -353,15 +359,20 @@ func (m *legacyManager) freezeBeforeSet(unitName string, r *configs.Resources) ( // a non-existent unit returns default properties, // and settings in (2) are the defaults. // - // Do not return errors from getUnitProperty, as they alone + // Do not return errors from getUnitTypeProperty, as they alone // should not prevent Set from working. - devPolicy, e := getUnitProperty(m.dbus, unitName, "DevicePolicy") + + unitType := getUnitType(unitName) + + devPolicy, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DevicePolicy") if e == nil && devPolicy.Value == dbus.MakeVariant("auto") { - devAllow, e := getUnitProperty(m.dbus, unitName, "DeviceAllow") - if e == nil && devAllow.Value == dbus.MakeVariant([]deviceAllowEntry{}) { - needsFreeze = false - needsThaw = false - return + devAllow, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DeviceAllow") + if e == nil { + if rv := reflect.ValueOf(devAllow.Value.Value()); rv.Kind() == reflect.Slice && rv.Len() == 0 { + needsFreeze = false + needsThaw = false + return + } } } } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go index a1e7f0afd44..5ea9d940cef 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go @@ -131,4 +131,16 @@ type Resources struct { // // NOTE it is impossible to start a container which has this flag set. SkipDevices bool `json:"-"` + + // SkipFreezeOnSet is a flag for cgroup manager to skip the cgroup + // freeze when setting resources. Only applicable to systemd legacy + // (i.e. cgroup v1) manager (which uses freeze by default to avoid + // spurious permission errors caused by systemd inability to update + // device rules in a non-disruptive manner). + // + // If not set, a few methods (such as looking into cgroup's + // devices.list and querying the systemd unit properties) are used + // during Set() to figure out whether the freeze is required. Those + // methods may be relatively slow, thus this flag. + SkipFreezeOnSet bool `json:"-"` } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go index b14d0ede3bb..f80118f2c9b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go @@ -67,7 +67,7 @@ func InitSeccomp(config *configs.Seccomp) error { if call == nil { return errors.New("encountered nil syscall while initializing Seccomp") } - if err := matchCall(filter, call); err != nil { + if err := matchCall(filter, call, defaultAction); err != nil { return err } } @@ -142,7 +142,7 @@ func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) { } // Add a rule to match a single syscall -func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error { +func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall, defAct libseccomp.ScmpAction) error { if call == nil || filter == nil { return errors.New("cannot use nil as syscall to block") } @@ -151,6 +151,17 @@ func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error { return errors.New("empty string is not a valid syscall") } + // Convert the call's action to the libseccomp equivalent + callAct, err := getAction(call.Action, call.ErrnoRet) + if err != nil { + return fmt.Errorf("action in seccomp profile is invalid: %w", err) + } + if callAct == defAct { + // This rule is redundant, silently skip it + // to avoid error from AddRule. + return nil + } + // If we can't resolve the syscall, assume it's not supported on this kernel // Ignore it, don't error out callNum, err := libseccomp.GetSyscallFromName(call.Name) @@ -158,12 +169,6 @@ func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error { return nil } - // Convert the call's action to the libseccomp equivalent - callAct, err := getAction(call.Action, call.ErrnoRet) - if err != nil { - return fmt.Errorf("action in seccomp profile is invalid: %s", err) - } - // Unconditional match - just add the rule if len(call.Args) == 0 { if err := filter.AddRule(callNum, callAct); err != nil { diff --git a/vendor/modules.txt b/vendor/modules.txt index 52f77324b8e..0bd22f81eb5 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -616,7 +616,7 @@ github.com/opencontainers/go-digest # github.com/opencontainers/image-spec v1.0.1 => github.com/opencontainers/image-spec v1.0.1 github.com/opencontainers/image-spec/specs-go github.com/opencontainers/image-spec/specs-go/v1 -# github.com/opencontainers/runc v1.0.1 => github.com/opencontainers/runc v1.0.1 +# github.com/opencontainers/runc v1.0.2 => github.com/opencontainers/runc v1.0.2 ## explicit github.com/opencontainers/runc/libcontainer github.com/opencontainers/runc/libcontainer/apparmor @@ -2604,7 +2604,7 @@ sigs.k8s.io/yaml # github.com/onsi/gomega => github.com/onsi/gomega v1.10.1 # github.com/opencontainers/go-digest => github.com/opencontainers/go-digest v1.0.0 # github.com/opencontainers/image-spec => github.com/opencontainers/image-spec v1.0.1 -# github.com/opencontainers/runc => github.com/opencontainers/runc v1.0.1 +# github.com/opencontainers/runc => github.com/opencontainers/runc v1.0.2 # github.com/opencontainers/runtime-spec => github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 # github.com/opencontainers/selinux => github.com/opencontainers/selinux v1.8.2 # github.com/opentracing/opentracing-go => github.com/opentracing/opentracing-go v1.1.0 From c06a851042cbd0467bf2c59783825c588ffc1978 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 23 Aug 2021 13:41:51 -0700 Subject: [PATCH 2/2] pkg/kubelet/cm: use SkipFreezeOnSet This is a knob added by runc 1.0.2 specifically for kubernetes, which tells runc/libcontainer/cgroups/systemd v1 manager to not freeze the cgroup in Set(). We set this knob here because this code is only used for pods (rather than containers) management, and in this place we create or update the pod cgroup with no device limits set, so we can skip the freeze. If this knob is not set, libcontainer's cgroup v1 manager tries to figure out whether the freeze is needed or not, but it's a somewhat expensive check to perform, thus the knob is a shortcut. Signed-off-by: Kir Kolyshkin --- pkg/kubelet/cm/cgroup_manager_linux.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go index 4936f4b166b..230173690d5 100644 --- a/pkg/kubelet/cm/cgroup_manager_linux.go +++ b/pkg/kubelet/cm/cgroup_manager_linux.go @@ -379,7 +379,8 @@ func getSupportedUnifiedControllers() sets.String { func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcontainerconfigs.Resources { resources := &libcontainerconfigs.Resources{ - SkipDevices: true, + SkipDevices: true, + SkipFreezeOnSet: true, } if resourceConfig == nil { return resources