diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index dae17977504..753365a029d 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -294,10 +294,9 @@ const ( // owner: @pweil- // alpha: v1.5 + // deprecated: v1.28 // - // Default userns=host for containers that are using other host namespaces, host mounts, the pod - // contains a privileged container, or specific non-namespaced capabilities (MKNOD, SYS_MODULE, - // SYS_TIME). This should only be enabled if user namespace remapping is enabled in the docker daemon. + // This flag used to be needed for dockershim CRI and currently does nothing. ExperimentalHostUserNamespaceDefaultingGate featuregate.Feature = "ExperimentalHostUserNamespaceDefaulting" // owner: @yuzhiquan, @bowei, @PxyUp, @SergeyKanzhelev @@ -998,7 +997,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS ExpandedDNSConfig: {Default: true, PreRelease: featuregate.Beta}, - ExperimentalHostUserNamespaceDefaultingGate: {Default: false, PreRelease: featuregate.Beta}, + ExperimentalHostUserNamespaceDefaultingGate: {Default: false, PreRelease: featuregate.Deprecated, LockToDefault: true}, // remove in 1.30 GRPCContainerProbe: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, //remove in 1.29 diff --git a/pkg/kubelet/container/runtime.go b/pkg/kubelet/container/runtime.go index 8a154f272c8..7fa8f44ef73 100644 --- a/pkg/kubelet/container/runtime.go +++ b/pkg/kubelet/container/runtime.go @@ -480,12 +480,6 @@ type RunContainerOptions struct { ReadOnly bool // hostname for pod containers Hostname string - // EnableHostUserNamespace sets userns=host when users request host namespaces (pid, ipc, net), - // are using non-namespaced capabilities (mknod, sys_time, sys_module), the pod contains a privileged container, - // or using host path volumes. - // This should only be enabled when the container runtime is performing user remapping AND if the - // experimental behavior is desired. - EnableHostUserNamespace bool } // VolumeInfo contains information about the volume. diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 4fedb38f2aa..14ca66f8a7f 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -517,56 +517,55 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, tracer := kubeDeps.TracerProvider.Tracer(instrumentationScope) klet := &Kubelet{ - hostname: hostname, - hostnameOverridden: hostnameOverridden, - nodeName: nodeName, - kubeClient: kubeDeps.KubeClient, - heartbeatClient: kubeDeps.HeartbeatClient, - onRepeatedHeartbeatFailure: kubeDeps.OnHeartbeatFailure, - rootDirectory: filepath.Clean(rootDirectory), - resyncInterval: kubeCfg.SyncFrequency.Duration, - sourcesReady: config.NewSourcesReady(kubeDeps.PodConfig.SeenAllSources), - registerNode: registerNode, - registerWithTaints: registerWithTaints, - registerSchedulable: registerSchedulable, - dnsConfigurer: dns.NewConfigurer(kubeDeps.Recorder, nodeRef, nodeIPs, clusterDNS, kubeCfg.ClusterDomain, kubeCfg.ResolverConfig), - serviceLister: serviceLister, - serviceHasSynced: serviceHasSynced, - nodeLister: nodeLister, - nodeHasSynced: nodeHasSynced, - streamingConnectionIdleTimeout: kubeCfg.StreamingConnectionIdleTimeout.Duration, - recorder: kubeDeps.Recorder, - cadvisor: kubeDeps.CAdvisorInterface, - cloud: kubeDeps.Cloud, - externalCloudProvider: cloudprovider.IsExternal(cloudProvider), - providerID: providerID, - nodeRef: nodeRef, - nodeLabels: nodeLabels, - nodeStatusUpdateFrequency: kubeCfg.NodeStatusUpdateFrequency.Duration, - nodeStatusReportFrequency: kubeCfg.NodeStatusReportFrequency.Duration, - os: kubeDeps.OSInterface, - oomWatcher: oomWatcher, - cgroupsPerQOS: kubeCfg.CgroupsPerQOS, - cgroupRoot: kubeCfg.CgroupRoot, - mounter: kubeDeps.Mounter, - hostutil: kubeDeps.HostUtil, - subpather: kubeDeps.Subpather, - maxPods: int(kubeCfg.MaxPods), - podsPerCore: int(kubeCfg.PodsPerCore), - syncLoopMonitor: atomic.Value{}, - daemonEndpoints: daemonEndpoints, - containerManager: kubeDeps.ContainerManager, - nodeIPs: nodeIPs, - nodeIPValidator: validateNodeIP, - clock: clock.RealClock{}, - enableControllerAttachDetach: kubeCfg.EnableControllerAttachDetach, - makeIPTablesUtilChains: kubeCfg.MakeIPTablesUtilChains, - iptablesMasqueradeBit: int(kubeCfg.IPTablesMasqueradeBit), - iptablesDropBit: int(kubeCfg.IPTablesDropBit), - experimentalHostUserNamespaceDefaulting: utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalHostUserNamespaceDefaultingGate), - keepTerminatedPodVolumes: keepTerminatedPodVolumes, - nodeStatusMaxImages: nodeStatusMaxImages, - tracer: tracer, + hostname: hostname, + hostnameOverridden: hostnameOverridden, + nodeName: nodeName, + kubeClient: kubeDeps.KubeClient, + heartbeatClient: kubeDeps.HeartbeatClient, + onRepeatedHeartbeatFailure: kubeDeps.OnHeartbeatFailure, + rootDirectory: filepath.Clean(rootDirectory), + resyncInterval: kubeCfg.SyncFrequency.Duration, + sourcesReady: config.NewSourcesReady(kubeDeps.PodConfig.SeenAllSources), + registerNode: registerNode, + registerWithTaints: registerWithTaints, + registerSchedulable: registerSchedulable, + dnsConfigurer: dns.NewConfigurer(kubeDeps.Recorder, nodeRef, nodeIPs, clusterDNS, kubeCfg.ClusterDomain, kubeCfg.ResolverConfig), + serviceLister: serviceLister, + serviceHasSynced: serviceHasSynced, + nodeLister: nodeLister, + nodeHasSynced: nodeHasSynced, + streamingConnectionIdleTimeout: kubeCfg.StreamingConnectionIdleTimeout.Duration, + recorder: kubeDeps.Recorder, + cadvisor: kubeDeps.CAdvisorInterface, + cloud: kubeDeps.Cloud, + externalCloudProvider: cloudprovider.IsExternal(cloudProvider), + providerID: providerID, + nodeRef: nodeRef, + nodeLabels: nodeLabels, + nodeStatusUpdateFrequency: kubeCfg.NodeStatusUpdateFrequency.Duration, + nodeStatusReportFrequency: kubeCfg.NodeStatusReportFrequency.Duration, + os: kubeDeps.OSInterface, + oomWatcher: oomWatcher, + cgroupsPerQOS: kubeCfg.CgroupsPerQOS, + cgroupRoot: kubeCfg.CgroupRoot, + mounter: kubeDeps.Mounter, + hostutil: kubeDeps.HostUtil, + subpather: kubeDeps.Subpather, + maxPods: int(kubeCfg.MaxPods), + podsPerCore: int(kubeCfg.PodsPerCore), + syncLoopMonitor: atomic.Value{}, + daemonEndpoints: daemonEndpoints, + containerManager: kubeDeps.ContainerManager, + nodeIPs: nodeIPs, + nodeIPValidator: validateNodeIP, + clock: clock.RealClock{}, + enableControllerAttachDetach: kubeCfg.EnableControllerAttachDetach, + makeIPTablesUtilChains: kubeCfg.MakeIPTablesUtilChains, + iptablesMasqueradeBit: int(kubeCfg.IPTablesMasqueradeBit), + iptablesDropBit: int(kubeCfg.IPTablesDropBit), + keepTerminatedPodVolumes: keepTerminatedPodVolumes, + nodeStatusMaxImages: nodeStatusMaxImages, + tracer: tracer, } if klet.cloud != nil { @@ -596,10 +595,6 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, klet.configMapManager = configMapManager } - if klet.experimentalHostUserNamespaceDefaulting { - klog.InfoS("Experimental host user namespace defaulting is enabled") - } - machineInfo, err := klet.cadvisor.MachineInfo() if err != nil { return nil, err @@ -1227,13 +1222,6 @@ type Kubelet struct { // The AppArmor validator for checking whether AppArmor is supported. appArmorValidator apparmor.Validator - // experimentalHostUserNamespaceDefaulting sets userns=true when users request host namespaces (pid, ipc, net), - // are using non-namespaced capabilities (mknod, sys_time, sys_module), the pod contains a privileged container, - // or using host path volumes. - // This should only be enabled when the container runtime is performing user remapping AND if the - // experimental behavior is desired. - experimentalHostUserNamespaceDefaulting bool - // StatsProvider provides the node and the container stats. StatsProvider *stats.Provider diff --git a/pkg/kubelet/kubelet_pods.go b/pkg/kubelet/kubelet_pods.go index fbc9104c428..411b0b5cbd1 100644 --- a/pkg/kubelet/kubelet_pods.go +++ b/pkg/kubelet/kubelet_pods.go @@ -38,7 +38,6 @@ import ( "k8s.io/apimachinery/pkg/util/sets" utilvalidation "k8s.io/apimachinery/pkg/util/validation" utilfeature "k8s.io/apiserver/pkg/util/feature" - "k8s.io/component-helpers/storage/ephemeral" runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" "k8s.io/klog/v2" podutil "k8s.io/kubernetes/pkg/api/v1/pod" @@ -515,11 +514,6 @@ func (kl *Kubelet) GenerateRunContainerOptions(ctx context.Context, pod *v1.Pod, } } - // only do this check if the experimental behavior is enabled, otherwise allow it to default to false - if kl.experimentalHostUserNamespaceDefaulting { - opts.EnableHostUserNamespace = kl.enableHostUserNamespace(ctx, pod) - } - return opts, cleanupAction, nil } @@ -2166,82 +2160,3 @@ func (kl *Kubelet) cleanupOrphanedPodCgroups(pcm cm.PodContainerManager, cgroupP go pcm.Destroy(val) } } - -// enableHostUserNamespace determines if the host user namespace should be used by the container runtime. -// Returns true if the pod is using a host pid, pic, or network namespace, the pod is using a non-namespaced -// capability, the pod contains a privileged container, or the pod has a host path volume. -// -// NOTE: when if a container shares any namespace with another container it must also share the user namespace -// or it will not have the correct capabilities in the namespace. This means that host user namespace -// is enabled per pod, not per container. -func (kl *Kubelet) enableHostUserNamespace(ctx context.Context, pod *v1.Pod) bool { - if kubecontainer.HasPrivilegedContainer(pod) || hasHostNamespace(pod) || - hasHostVolume(pod) || hasNonNamespacedCapability(pod) || kl.hasHostMountPVC(ctx, pod) { - return true - } - return false -} - -// hasNonNamespacedCapability returns true if MKNOD, SYS_TIME, or SYS_MODULE is requested for any container. -func hasNonNamespacedCapability(pod *v1.Pod) bool { - for _, c := range pod.Spec.Containers { - if c.SecurityContext != nil && c.SecurityContext.Capabilities != nil { - for _, cap := range c.SecurityContext.Capabilities.Add { - if cap == "MKNOD" || cap == "SYS_TIME" || cap == "SYS_MODULE" { - return true - } - } - } - } - - return false -} - -// hasHostVolume returns true if the pod spec has a HostPath volume. -func hasHostVolume(pod *v1.Pod) bool { - for _, v := range pod.Spec.Volumes { - if v.HostPath != nil { - return true - } - } - return false -} - -// hasHostNamespace returns true if hostIPC, hostNetwork, or hostPID are set to true. -func hasHostNamespace(pod *v1.Pod) bool { - if pod.Spec.SecurityContext == nil { - return false - } - return pod.Spec.HostIPC || pod.Spec.HostNetwork || pod.Spec.HostPID -} - -// hasHostMountPVC returns true if a PVC is referencing a HostPath volume. -func (kl *Kubelet) hasHostMountPVC(ctx context.Context, pod *v1.Pod) bool { - for _, volume := range pod.Spec.Volumes { - pvcName := "" - switch { - case volume.PersistentVolumeClaim != nil: - pvcName = volume.PersistentVolumeClaim.ClaimName - case volume.Ephemeral != nil: - pvcName = ephemeral.VolumeClaimName(pod, &volume) - default: - continue - } - pvc, err := kl.kubeClient.CoreV1().PersistentVolumeClaims(pod.Namespace).Get(ctx, pvcName, metav1.GetOptions{}) - if err != nil { - klog.InfoS("Unable to retrieve pvc", "pvc", klog.KRef(pod.Namespace, pvcName), "err", err) - continue - } - if pvc != nil { - referencedVolume, err := kl.kubeClient.CoreV1().PersistentVolumes().Get(ctx, pvc.Spec.VolumeName, metav1.GetOptions{}) - if err != nil { - klog.InfoS("Unable to retrieve pv", "pvName", pvc.Spec.VolumeName, "err", err) - continue - } - if referencedVolume != nil && referencedVolume.Spec.HostPath != nil { - return true - } - } - } - return false -} diff --git a/pkg/kubelet/kubelet_pods_test.go b/pkg/kubelet/kubelet_pods_test.go index 6d8b622d21d..14be78ba4d4 100644 --- a/pkg/kubelet/kubelet_pods_test.go +++ b/pkg/kubelet/kubelet_pods_test.go @@ -3437,222 +3437,6 @@ func TestGetPortForward(t *testing.T) { } } -func TestHasHostMountPVC(t *testing.T) { - type testcase struct { - pvError error - pvcError error - expected bool - podHasPVC bool - pvcIsHostPath bool - podHasEphemeral bool - } - tests := map[string]testcase{ - "no pvc": {podHasPVC: false, expected: false}, - "error fetching pvc": { - podHasPVC: true, - pvcError: fmt.Errorf("foo"), - expected: false, - }, - "error fetching pv": { - podHasPVC: true, - pvError: fmt.Errorf("foo"), - expected: false, - }, - "host path pvc": { - podHasPVC: true, - pvcIsHostPath: true, - expected: true, - }, - "enabled ephemeral host path": { - podHasEphemeral: true, - pvcIsHostPath: true, - expected: true, - }, - "non host path pvc": { - podHasPVC: true, - pvcIsHostPath: false, - expected: false, - }, - } - - run := func(t *testing.T, v testcase) { - ctx := context.Background() - testKubelet := newTestKubelet(t, false) - defer testKubelet.Cleanup() - pod := &v1.Pod{ - Spec: v1.PodSpec{}, - } - - volumeToReturn := &v1.PersistentVolume{ - Spec: v1.PersistentVolumeSpec{}, - } - - if v.podHasPVC { - pod.Spec.Volumes = []v1.Volume{ - { - VolumeSource: v1.VolumeSource{ - PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{}, - }, - }, - } - } - - if v.podHasEphemeral { - pod.Spec.Volumes = []v1.Volume{ - { - Name: "xyz", - VolumeSource: v1.VolumeSource{ - Ephemeral: &v1.EphemeralVolumeSource{}, - }, - }, - } - } - - if (v.podHasPVC || v.podHasEphemeral) && v.pvcIsHostPath { - volumeToReturn.Spec.PersistentVolumeSource = v1.PersistentVolumeSource{ - HostPath: &v1.HostPathVolumeSource{}, - } - } - - testKubelet.fakeKubeClient.AddReactor("get", "persistentvolumeclaims", func(action core.Action) (bool, runtime.Object, error) { - return true, &v1.PersistentVolumeClaim{ - Spec: v1.PersistentVolumeClaimSpec{ - VolumeName: "foo", - }, - }, v.pvcError - }) - testKubelet.fakeKubeClient.AddReactor("get", "persistentvolumes", func(action core.Action) (bool, runtime.Object, error) { - return true, volumeToReturn, v.pvError - }) - - actual := testKubelet.kubelet.hasHostMountPVC(ctx, pod) - if actual != v.expected { - t.Errorf("expected %t but got %t", v.expected, actual) - } - } - - for k, v := range tests { - t.Run(k, func(t *testing.T) { - run(t, v) - }) - } -} - -func TestHasNonNamespacedCapability(t *testing.T) { - createPodWithCap := func(caps []v1.Capability) *v1.Pod { - pod := &v1.Pod{ - Spec: v1.PodSpec{ - Containers: []v1.Container{{}}, - }, - } - - if len(caps) > 0 { - pod.Spec.Containers[0].SecurityContext = &v1.SecurityContext{ - Capabilities: &v1.Capabilities{ - Add: caps, - }, - } - } - return pod - } - - nilCaps := createPodWithCap([]v1.Capability{v1.Capability("foo")}) - nilCaps.Spec.Containers[0].SecurityContext = nil - - tests := map[string]struct { - pod *v1.Pod - expected bool - }{ - "nil security contxt": {createPodWithCap(nil), false}, - "nil caps": {nilCaps, false}, - "namespaced cap": {createPodWithCap([]v1.Capability{v1.Capability("foo")}), false}, - "non-namespaced cap MKNOD": {createPodWithCap([]v1.Capability{v1.Capability("MKNOD")}), true}, - "non-namespaced cap SYS_TIME": {createPodWithCap([]v1.Capability{v1.Capability("SYS_TIME")}), true}, - "non-namespaced cap SYS_MODULE": {createPodWithCap([]v1.Capability{v1.Capability("SYS_MODULE")}), true}, - } - - for k, v := range tests { - actual := hasNonNamespacedCapability(v.pod) - if actual != v.expected { - t.Errorf("%s failed, expected %t but got %t", k, v.expected, actual) - } - } -} - -func TestHasHostVolume(t *testing.T) { - pod := &v1.Pod{ - Spec: v1.PodSpec{ - Volumes: []v1.Volume{ - { - VolumeSource: v1.VolumeSource{ - HostPath: &v1.HostPathVolumeSource{}, - }, - }, - }, - }, - } - - result := hasHostVolume(pod) - if !result { - t.Errorf("expected host volume to enable host user namespace") - } - - pod.Spec.Volumes[0].VolumeSource.HostPath = nil - result = hasHostVolume(pod) - if result { - t.Errorf("expected nil host volume to not enable host user namespace") - } -} - -func TestHasHostNamespace(t *testing.T) { - tests := map[string]struct { - ps v1.PodSpec - expected bool - }{ - "nil psc": { - ps: v1.PodSpec{}, - expected: false}, - - "host pid true": { - ps: v1.PodSpec{ - HostPID: true, - SecurityContext: &v1.PodSecurityContext{}, - }, - expected: true, - }, - "host ipc true": { - ps: v1.PodSpec{ - HostIPC: true, - SecurityContext: &v1.PodSecurityContext{}, - }, - expected: true, - }, - "host net true": { - ps: v1.PodSpec{ - HostNetwork: true, - SecurityContext: &v1.PodSecurityContext{}, - }, - expected: true, - }, - "no host ns": { - ps: v1.PodSpec{ - SecurityContext: &v1.PodSecurityContext{}, - }, - expected: false, - }, - } - - for k, v := range tests { - pod := &v1.Pod{ - Spec: v.ps, - } - actual := hasHostNamespace(pod) - if actual != v.expected { - t.Errorf("%s failed, expected %t but got %t", k, v.expected, actual) - } - } -} - func TestTruncatePodHostname(t *testing.T) { for c, test := range map[string]struct { input string