diff --git a/pkg/api/pod/util.go b/pkg/api/pod/util.go index e9ca7c271d5..685db7a8851 100644 --- a/pkg/api/pod/util.go +++ b/pkg/api/pod/util.go @@ -365,6 +365,7 @@ func GetValidationOptionsFromPodSpecAndMeta(podSpec, oldPodSpec *api.PodSpec, po AllowInvalidLabelValueInSelector: false, AllowInvalidTopologySpreadConstraintLabelSelector: false, AllowMutableNodeSelectorAndNodeAffinity: utilfeature.DefaultFeatureGate.Enabled(features.PodSchedulingReadiness), + AllowNamespacedSysctlsForHostNetAndHostIPC: false, } if oldPodSpec != nil { @@ -377,6 +378,17 @@ func GetValidationOptionsFromPodSpecAndMeta(podSpec, oldPodSpec *api.PodSpec, po opts.AllowInvalidLabelValueInSelector = hasInvalidLabelValueInAffinitySelector(oldPodSpec) // if old spec has invalid labelSelector in topologySpreadConstraint, we must allow it opts.AllowInvalidTopologySpreadConstraintLabelSelector = hasInvalidTopologySpreadConstraintLabelSelector(oldPodSpec) + + // if old spec has invalid sysctl with hostNet or hostIPC, we must allow it when update + if oldPodSpec.SecurityContext != nil && len(oldPodSpec.SecurityContext.Sysctls) != 0 { + for _, s := range oldPodSpec.SecurityContext.Sysctls { + err := apivalidation.ValidateHostSysctl(s.Name, oldPodSpec.SecurityContext, nil) + if err != nil { + opts.AllowNamespacedSysctlsForHostNetAndHostIPC = true + break + } + } + } } if oldPodMeta != nil && !opts.AllowInvalidPodDeletionCost { // This is an update, so validate only if the existing object was valid. diff --git a/pkg/apis/core/validation/validation.go b/pkg/apis/core/validation/validation.go index a4d8aeba47a..1a0d83e8335 100644 --- a/pkg/apis/core/validation/validation.go +++ b/pkg/apis/core/validation/validation.go @@ -3813,6 +3813,8 @@ type PodValidationOptions struct { AllowInvalidTopologySpreadConstraintLabelSelector bool // Allow node selector additions for gated pods. AllowMutableNodeSelectorAndNodeAffinity bool + // Allow namespaced sysctls in hostNet and hostIPC pods + AllowNamespacedSysctlsForHostNetAndHostIPC bool // The top-level resource being validated is a Pod, not just a PodSpec // embedded in some other resource. ResourceIsPod bool @@ -4563,10 +4565,10 @@ func IsValidSysctlName(name string) bool { return sysctlContainSlashRegexp.MatchString(name) } -func validateSysctls(sysctls []core.Sysctl, fldPath *field.Path, hostNetwork, hostIPC bool) field.ErrorList { +func validateSysctls(securityContext *core.PodSecurityContext, fldPath *field.Path, opts PodValidationOptions) field.ErrorList { allErrs := field.ErrorList{} names := make(map[string]struct{}) - for i, s := range sysctls { + for i, s := range securityContext.Sysctls { if len(s.Name) == 0 { allErrs = append(allErrs, field.Required(fldPath.Index(i).Child("name"), "")) } else if !IsValidSysctlName(s.Name) { @@ -4574,41 +4576,27 @@ func validateSysctls(sysctls []core.Sysctl, fldPath *field.Path, hostNetwork, ho } else if _, ok := names[s.Name]; ok { allErrs = append(allErrs, field.Duplicate(fldPath.Index(i).Child("name"), s.Name)) } - // The parameters hostNet and hostIPC are used to forbid sysctls for pod sharing the - // respective namespaces with the host. - if !isValidSysctlWithHostNetIPC(s.Name, hostNetwork, hostIPC) { - allErrs = append(allErrs, field.Invalid(fldPath.Index(i).Child("name"), s.Name, "sysctl not allowed with host net/ipc enabled")) + if !opts.AllowNamespacedSysctlsForHostNetAndHostIPC { + err := ValidateHostSysctl(s.Name, securityContext, fldPath.Index(i).Child("name")) + if err != nil { + allErrs = append(allErrs, err) + } } - names[s.Name] = struct{}{} } return allErrs } -func isValidSysctlWithHostNetIPC(sysctl string, hostNet, hostIPC bool) bool { - sysctl = utilsysctl.ConvertSysctlVariableToDotsSeparator(sysctl) - var ns utilsysctl.Namespace - if strings.HasSuffix(sysctl, "*") { - prefix := sysctl[:len(sysctl)-1] - ns = utilsysctl.NamespacedBy(prefix) - if ns == utilsysctl.UnknownNamespace { - // don't handle unknown namespace here - return true - } - } else { - ns = utilsysctl.NamespacedBy(sysctl) - if ns == utilsysctl.UnknownNamespace { - // don't handle unknown namespace here - return true - } +// ValidateHostSysctl will return error if namespaced sysctls is applied to pod sharing the respective namespaces with the host. +func ValidateHostSysctl(sysctl string, securityContext *core.PodSecurityContext, fldPath *field.Path) *field.Error { + ns, _, _ := utilsysctl.GetNamespace(sysctl) + switch { + case securityContext.HostNetwork && ns == utilsysctl.NetNamespace: + return field.Invalid(fldPath, sysctl, "may not be specified when 'hostNetwork' is true") + case securityContext.HostIPC && ns == utilsysctl.IPCNamespace: + return field.Invalid(fldPath, sysctl, "may not be specified when 'hostIPC' is true") } - if ns == utilsysctl.IpcNamespace && hostIPC { - return false - } - if ns == utilsysctl.NetNamespace && hostNet { - return false - } - return true + return nil } // validatePodSpecSecurityContext verifies the SecurityContext of a PodSpec, @@ -4643,13 +4631,7 @@ func validatePodSpecSecurityContext(securityContext *core.PodSecurityContext, sp } if len(securityContext.Sysctls) != 0 { - var hostNetwork, hostIPC bool - if spec.SecurityContext != nil { - hostNetwork = spec.SecurityContext.HostNetwork - hostIPC = spec.SecurityContext.HostIPC - } - - allErrs = append(allErrs, validateSysctls(securityContext.Sysctls, fldPath.Child("sysctls"), hostNetwork, hostIPC)...) + allErrs = append(allErrs, validateSysctls(securityContext, fldPath.Child("sysctls"), opts)...) } if securityContext.FSGroupChangePolicy != nil { diff --git a/pkg/apis/core/validation/validation_test.go b/pkg/apis/core/validation/validation_test.go index 70b22540e7f..f8c2385086c 100644 --- a/pkg/apis/core/validation/validation_test.go +++ b/pkg/apis/core/validation/validation_test.go @@ -21505,12 +21505,18 @@ func TestValidateSysctls(t *testing.T) { "kernel.shmmax", "kernel.shmmax", } + opts := PodValidationOptions{ + AllowNamespacedSysctlsForHostNetAndHostIPC: false, + } sysctls := make([]core.Sysctl, len(valid)) + validSecurityContext := &core.PodSecurityContext{ + Sysctls: sysctls, + } for i, sysctl := range valid { sysctls[i].Name = sysctl } - errs := validateSysctls(sysctls, field.NewPath("foo"), false, false) + errs := validateSysctls(validSecurityContext, field.NewPath("foo"), opts) if len(errs) != 0 { t.Errorf("unexpected validation errors: %v", errs) } @@ -21519,7 +21525,10 @@ func TestValidateSysctls(t *testing.T) { for i, sysctl := range invalid { sysctls[i].Name = sysctl } - errs = validateSysctls(sysctls, field.NewPath("foo"), false, false) + inValidSecurityContext := &core.PodSecurityContext{ + Sysctls: sysctls, + } + errs = validateSysctls(inValidSecurityContext, field.NewPath("foo"), opts) if len(errs) != 2 { t.Errorf("expected 2 validation errors. Got: %v", errs) } else { @@ -21535,7 +21544,10 @@ func TestValidateSysctls(t *testing.T) { for i, sysctl := range duplicates { sysctls[i].Name = sysctl } - errs = validateSysctls(sysctls, field.NewPath("foo"), false, false) + securityContextWithDup := &core.PodSecurityContext{ + Sysctls: sysctls, + } + errs = validateSysctls(securityContextWithDup, field.NewPath("foo"), opts) if len(errs) != 1 { t.Errorf("unexpected validation errors: %v", errs) } else if errs[0].Type != field.ErrorTypeDuplicate { @@ -21546,19 +21558,40 @@ func TestValidateSysctls(t *testing.T) { for i, sysctl := range invalidWithHostNet { sysctls[i].Name = sysctl } - errs = validateSysctls(sysctls, field.NewPath("foo"), true, false) + invalidSecurityContextWithHostNet := &core.PodSecurityContext{ + Sysctls: sysctls, + HostIPC: false, + HostNetwork: true, + } + errs = validateSysctls(invalidSecurityContextWithHostNet, field.NewPath("foo"), opts) if len(errs) != 2 { t.Errorf("unexpected validation errors: %v", errs) } + opts.AllowNamespacedSysctlsForHostNetAndHostIPC = true + errs = validateSysctls(invalidSecurityContextWithHostNet, field.NewPath("foo"), opts) + if len(errs) != 0 { + t.Errorf("unexpected validation errors: %v", errs) + } sysctls = make([]core.Sysctl, len(invalidWithHostIPC)) for i, sysctl := range invalidWithHostIPC { sysctls[i].Name = sysctl } - errs = validateSysctls(sysctls, field.NewPath("foo"), false, true) + invalidSecurityContextWithHostIPC := &core.PodSecurityContext{ + Sysctls: sysctls, + HostIPC: true, + HostNetwork: false, + } + opts.AllowNamespacedSysctlsForHostNetAndHostIPC = false + errs = validateSysctls(invalidSecurityContextWithHostIPC, field.NewPath("foo"), opts) if len(errs) != 2 { t.Errorf("unexpected validation errors: %v", errs) } + opts.AllowNamespacedSysctlsForHostNetAndHostIPC = true + errs = validateSysctls(invalidSecurityContextWithHostIPC, field.NewPath("foo"), opts) + if len(errs) != 0 { + t.Errorf("unexpected validation errors: %v", errs) + } } func newNodeNameEndpoint(nodeName string) *core.Endpoints { diff --git a/pkg/kubelet/sysctl/allowlist.go b/pkg/kubelet/sysctl/allowlist.go index c00449da4c8..e9e2671c17e 100644 --- a/pkg/kubelet/sysctl/allowlist.go +++ b/pkg/kubelet/sysctl/allowlist.go @@ -55,20 +55,14 @@ func NewAllowlist(patterns []string) (*patternAllowlist, error) { policyvalidation.SysctlContainSlashPatternFmt, ) } - s = utilsysctl.ConvertSysctlVariableToDotsSeparator(s) - if strings.HasSuffix(s, "*") { - prefix := s[:len(s)-1] - ns := utilsysctl.NamespacedBy(prefix) - if ns == utilsysctl.UnknownNamespace { - return nil, fmt.Errorf("the sysctls %q are not known to be namespaced", s) - } - w.prefixes[prefix] = ns + ns, sysctlOrPrefix, prefixed := utilsysctl.GetNamespace(s) + if ns == utilsysctl.UnknownNamespace { + return nil, fmt.Errorf("the sysctls %q are not known to be namespaced", sysctlOrPrefix) + } + if prefixed { + w.prefixes[sysctlOrPrefix] = ns } else { - ns := utilsysctl.NamespacedBy(s) - if ns == utilsysctl.UnknownNamespace { - return nil, fmt.Errorf("the sysctl %q are not known to be namespaced", s) - } - w.sysctls[s] = ns + w.sysctls[sysctlOrPrefix] = ns } } return w, nil @@ -82,10 +76,10 @@ func NewAllowlist(patterns []string) (*patternAllowlist, error) { // respective namespaces with the host. This check is only possible for sysctls on // the static default allowlist, not those on the custom allowlist provided by the admin. func (w *patternAllowlist) validateSysctl(sysctl string, hostNet, hostIPC bool) error { - sysctl = utilsysctl.ConvertSysctlVariableToDotsSeparator(sysctl) + sysctl = utilsysctl.NormalizeName(sysctl) nsErrorFmt := "%q not allowed with host %s enabled" if ns, found := w.sysctls[sysctl]; found { - if ns == utilsysctl.IpcNamespace && hostIPC { + if ns == utilsysctl.IPCNamespace && hostIPC { return fmt.Errorf(nsErrorFmt, sysctl, ns) } if ns == utilsysctl.NetNamespace && hostNet { @@ -95,7 +89,7 @@ func (w *patternAllowlist) validateSysctl(sysctl string, hostNet, hostIPC bool) } for p, ns := range w.prefixes { if strings.HasPrefix(sysctl, p) { - if ns == utilsysctl.IpcNamespace && hostIPC { + if ns == utilsysctl.IPCNamespace && hostIPC { return fmt.Errorf(nsErrorFmt, sysctl, ns) } if ns == utilsysctl.NetNamespace && hostNet { diff --git a/pkg/kubelet/sysctl/allowlist_test.go b/pkg/kubelet/sysctl/allowlist_test.go index 9535eabbd9b..6eddb604c6a 100644 --- a/pkg/kubelet/sysctl/allowlist_test.go +++ b/pkg/kubelet/sysctl/allowlist_test.go @@ -1,3 +1,6 @@ +//go:build linux +// +build linux + /* Copyright 2016 The Kubernetes Authors. @@ -17,9 +20,10 @@ limitations under the License. package sysctl import ( - "k8s.io/api/core/v1" - "k8s.io/kubernetes/pkg/kubelet/lifecycle" "testing" + + v1 "k8s.io/api/core/v1" + "k8s.io/kubernetes/pkg/kubelet/lifecycle" ) func TestNewAllowlist(t *testing.T) { diff --git a/pkg/kubelet/sysctl/util.go b/pkg/kubelet/sysctl/util.go index 031cf80d6ea..dfd3930711a 100644 --- a/pkg/kubelet/sysctl/util.go +++ b/pkg/kubelet/sysctl/util.go @@ -29,7 +29,7 @@ func ConvertPodSysctlsVariableToDotsSeparator(securityContext *v1.PodSecurityCon return } for i, sysctl := range securityContext.Sysctls { - securityContext.Sysctls[i].Name = utilsysctl.ConvertSysctlVariableToDotsSeparator(sysctl.Name) + securityContext.Sysctls[i].Name = utilsysctl.NormalizeName(sysctl.Name) } return } diff --git a/staging/src/k8s.io/component-helpers/node/util/sysctl/namespace.go b/staging/src/k8s.io/component-helpers/node/util/sysctl/namespace.go index 591c0b8090b..7042766adec 100644 --- a/staging/src/k8s.io/component-helpers/node/util/sysctl/namespace.go +++ b/staging/src/k8s.io/component-helpers/node/util/sysctl/namespace.go @@ -24,37 +24,81 @@ import ( type Namespace string const ( + // refer to https://man7.org/linux/man-pages/man7/ipc_namespaces.7.html // the Linux IPC namespace - IpcNamespace = Namespace("ipc") + IPCNamespace = Namespace("IPC") + // refer to https://man7.org/linux/man-pages/man7/network_namespaces.7.html // the network namespace - NetNamespace = Namespace("net") + NetNamespace = Namespace("Net") // the zero value if no namespace is known UnknownNamespace = Namespace("") ) -var namespaces = map[string]Namespace{ - "kernel.sem": IpcNamespace, +var nameToNamespace = map[string]Namespace{ + // kernel semaphore parameters: SEMMSL, SEMMNS, SEMOPM, and SEMMNI. + "kernel.sem": IPCNamespace, + // kernel shared memory limits include shmall, shmmax, shmmni, and shm_rmid_forced. + "kernel.shmall": IPCNamespace, + "kernel.shmmax": IPCNamespace, + "kernel.shmmni": IPCNamespace, + "kernel.shm_rmid_forced": IPCNamespace, + // make backward compatibility to know the namespace of kernel.shm* + "kernel.shm": IPCNamespace, + // kernel messages include msgmni, msgmax and msgmnb. + "kernel.msgmax": IPCNamespace, + "kernel.msgmnb": IPCNamespace, + "kernel.msgmni": IPCNamespace, + // make backward compatibility to know the namespace of kernel.msg* + "kernel.msg": IPCNamespace, } -var prefixNamespaces = map[string]Namespace{ - "kernel.shm": IpcNamespace, - "kernel.msg": IpcNamespace, - "fs.mqueue.": IpcNamespace, - "net.": NetNamespace, +var prefixToNamespace = map[string]Namespace{ + "net": NetNamespace, + // mqueue filesystem provides the necessary kernel features to enable the creation + // of a user space library that implements the POSIX message queues API. + "fs.mqueue": IPCNamespace, } -// NamespacedBy returns the namespace of the Linux kernel for a sysctl, or +// namespaceOf returns the namespace of the Linux kernel for a sysctl, or // unknownNamespace if the sysctl is not known to be namespaced. -func NamespacedBy(val string) Namespace { - if ns, found := namespaces[val]; found { +// The second return is prefixed bool. +// It returns true if the key is prefixed with a key in the prefix map +func namespaceOf(val string) Namespace { + if ns, found := nameToNamespace[val]; found { return ns } - for p, ns := range prefixNamespaces { - if strings.HasPrefix(val, p) { + for p, ns := range prefixToNamespace { + if strings.HasPrefix(val, p+".") { return ns } } return UnknownNamespace } + +// GetNamespace extracts information from a sysctl string. It returns: +// 1. The sysctl namespace, which can be one of the following: IPC, Net, or unknown. +// 2. sysctlOrPrefix: the prefix of the sysctl parameter until the first '*'. +// If there is no '*', it will be the original string. +// 3. 'prefixed' is set to true if the sysctl parameter contains '*' or it is in the prefixToNamespace key list, in most cases, it is a suffix *. +// +// For example, if the input sysctl is 'net.ipv6.neigh.*', GetNamespace will return: +// - The Net namespace +// - The sysctlOrPrefix as 'net.ipv6.neigh' +// - 'prefixed' set to true +// +// For the input sysctl 'net.ipv6.conf.all.disable_ipv6', GetNamespace will return: +// - The Net namespace +// - The sysctlOrPrefix as 'net.ipv6.conf.all.disable_ipv6' +// - 'prefixed' set to false. +func GetNamespace(sysctl string) (ns Namespace, sysctlOrPrefix string, prefixed bool) { + sysctlOrPrefix = NormalizeName(sysctl) + firstIndex := strings.IndexAny(sysctlOrPrefix, "*") + if firstIndex != -1 { + sysctlOrPrefix = sysctlOrPrefix[:firstIndex] + prefixed = true + } + ns = namespaceOf(sysctlOrPrefix) + return +} diff --git a/staging/src/k8s.io/component-helpers/node/util/sysctl/namespace_test.go b/staging/src/k8s.io/component-helpers/node/util/sysctl/namespace_test.go index 821b1dfe620..167a8bda669 100644 --- a/staging/src/k8s.io/component-helpers/node/util/sysctl/namespace_test.go +++ b/staging/src/k8s.io/component-helpers/node/util/sysctl/namespace_test.go @@ -20,16 +20,16 @@ import ( "testing" ) -func TestNamespacedBy(t *testing.T) { +func TestNamespacedOf(t *testing.T) { tests := map[string]Namespace{ - "kernel.shm_rmid_forced": IpcNamespace, + "kernel.shm_rmid_forced": IPCNamespace, "net.a.b.c": NetNamespace, - "fs.mqueue.a.b.c": IpcNamespace, + "fs.mqueue.a.b.c": IPCNamespace, "foo": UnknownNamespace, } for sysctl, ns := range tests { - if got := NamespacedBy(sysctl); got != ns { + if got := namespaceOf(sysctl); got != ns { t.Errorf("wrong namespace for %q: got=%s want=%s", sysctl, got, ns) } } diff --git a/staging/src/k8s.io/component-helpers/node/util/sysctl/sysctl.go b/staging/src/k8s.io/component-helpers/node/util/sysctl/sysctl.go index 8e6268fcb7e..3233c7d181b 100644 --- a/staging/src/k8s.io/component-helpers/node/util/sysctl/sysctl.go +++ b/staging/src/k8s.io/component-helpers/node/util/sysctl/sysctl.go @@ -99,22 +99,25 @@ func (*procSysctl) SetSysctl(sysctl string, newVal int) error { return os.WriteFile(path.Join(sysctlBase, sysctl), []byte(strconv.Itoa(newVal)), 0640) } -// ConvertSysctlVariableToDotsSeparator can return sysctl variables in dots separator format. +// NormalizeName can return sysctl variables in dots separator format. // The '/' separator is also accepted in place of a '.'. // Convert the sysctl variables to dots separator format for validation. // More info: // // https://man7.org/linux/man-pages/man8/sysctl.8.html // https://man7.org/linux/man-pages/man5/sysctl.d.5.html -func ConvertSysctlVariableToDotsSeparator(val string) string { +func NormalizeName(val string) string { if val == "" { return val } firstSepIndex := strings.IndexAny(val, "./") + // if the first found is `.` like `net.ipv4.conf.eno2/100.rp_filter` if firstSepIndex == -1 || val[firstSepIndex] == '.' { return val } + // for `net/ipv4/conf/eno2.100/rp_filter`, swap the use of `.` and `/` + // to `net.ipv4.conf.eno2/100.rp_filter` f := func(r rune) rune { switch r { case '.': diff --git a/staging/src/k8s.io/component-helpers/node/util/sysctl/sysctl_test.go b/staging/src/k8s.io/component-helpers/node/util/sysctl/sysctl_test.go index a4ea1ef9bc1..0ab0f3fc8a5 100644 --- a/staging/src/k8s.io/component-helpers/node/util/sysctl/sysctl_test.go +++ b/staging/src/k8s.io/component-helpers/node/util/sysctl/sysctl_test.go @@ -40,7 +40,7 @@ func TestConvertSysctlVariableToDotsSeparator(t *testing.T) { } for _, test := range valid { - convertSysctlVal := ConvertSysctlVariableToDotsSeparator(test.in) + convertSysctlVal := NormalizeName(test.in) assert.Equalf(t, test.out, convertSysctlVal, "The sysctl variable was not converted correctly. got: %s, want: %s", convertSysctlVal, test.out) } }