Merge pull request #116723 from SergeyKanzhelev/ExperimentalHostUserNamespaceDefaulting

deprecate ExperimentalHostUserNamespaceDefaulting
This commit is contained in:
Kubernetes Prow Robot 2023-04-11 21:16:57 -07:00 committed by GitHub
commit 74ad7c397d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 52 additions and 372 deletions

View File

@ -294,10 +294,9 @@ const (
// owner: @pweil-
// alpha: v1.5
// deprecated: v1.28
//
// Default userns=host for containers that are using other host namespaces, host mounts, the pod
// contains a privileged container, or specific non-namespaced capabilities (MKNOD, SYS_MODULE,
// SYS_TIME). This should only be enabled if user namespace remapping is enabled in the docker daemon.
// This flag used to be needed for dockershim CRI and currently does nothing.
ExperimentalHostUserNamespaceDefaultingGate featuregate.Feature = "ExperimentalHostUserNamespaceDefaulting"
// owner: @yuzhiquan, @bowei, @PxyUp, @SergeyKanzhelev
@ -998,7 +997,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS
ExpandedDNSConfig: {Default: true, PreRelease: featuregate.Beta},
ExperimentalHostUserNamespaceDefaultingGate: {Default: false, PreRelease: featuregate.Beta},
ExperimentalHostUserNamespaceDefaultingGate: {Default: false, PreRelease: featuregate.Deprecated, LockToDefault: true}, // remove in 1.30
GRPCContainerProbe: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, //remove in 1.29

View File

@ -480,12 +480,6 @@ type RunContainerOptions struct {
ReadOnly bool
// hostname for pod containers
Hostname string
// EnableHostUserNamespace sets userns=host when users request host namespaces (pid, ipc, net),
// are using non-namespaced capabilities (mknod, sys_time, sys_module), the pod contains a privileged container,
// or using host path volumes.
// This should only be enabled when the container runtime is performing user remapping AND if the
// experimental behavior is desired.
EnableHostUserNamespace bool
}
// VolumeInfo contains information about the volume.

View File

@ -517,56 +517,55 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
tracer := kubeDeps.TracerProvider.Tracer(instrumentationScope)
klet := &Kubelet{
hostname: hostname,
hostnameOverridden: hostnameOverridden,
nodeName: nodeName,
kubeClient: kubeDeps.KubeClient,
heartbeatClient: kubeDeps.HeartbeatClient,
onRepeatedHeartbeatFailure: kubeDeps.OnHeartbeatFailure,
rootDirectory: filepath.Clean(rootDirectory),
resyncInterval: kubeCfg.SyncFrequency.Duration,
sourcesReady: config.NewSourcesReady(kubeDeps.PodConfig.SeenAllSources),
registerNode: registerNode,
registerWithTaints: registerWithTaints,
registerSchedulable: registerSchedulable,
dnsConfigurer: dns.NewConfigurer(kubeDeps.Recorder, nodeRef, nodeIPs, clusterDNS, kubeCfg.ClusterDomain, kubeCfg.ResolverConfig),
serviceLister: serviceLister,
serviceHasSynced: serviceHasSynced,
nodeLister: nodeLister,
nodeHasSynced: nodeHasSynced,
streamingConnectionIdleTimeout: kubeCfg.StreamingConnectionIdleTimeout.Duration,
recorder: kubeDeps.Recorder,
cadvisor: kubeDeps.CAdvisorInterface,
cloud: kubeDeps.Cloud,
externalCloudProvider: cloudprovider.IsExternal(cloudProvider),
providerID: providerID,
nodeRef: nodeRef,
nodeLabels: nodeLabels,
nodeStatusUpdateFrequency: kubeCfg.NodeStatusUpdateFrequency.Duration,
nodeStatusReportFrequency: kubeCfg.NodeStatusReportFrequency.Duration,
os: kubeDeps.OSInterface,
oomWatcher: oomWatcher,
cgroupsPerQOS: kubeCfg.CgroupsPerQOS,
cgroupRoot: kubeCfg.CgroupRoot,
mounter: kubeDeps.Mounter,
hostutil: kubeDeps.HostUtil,
subpather: kubeDeps.Subpather,
maxPods: int(kubeCfg.MaxPods),
podsPerCore: int(kubeCfg.PodsPerCore),
syncLoopMonitor: atomic.Value{},
daemonEndpoints: daemonEndpoints,
containerManager: kubeDeps.ContainerManager,
nodeIPs: nodeIPs,
nodeIPValidator: validateNodeIP,
clock: clock.RealClock{},
enableControllerAttachDetach: kubeCfg.EnableControllerAttachDetach,
makeIPTablesUtilChains: kubeCfg.MakeIPTablesUtilChains,
iptablesMasqueradeBit: int(kubeCfg.IPTablesMasqueradeBit),
iptablesDropBit: int(kubeCfg.IPTablesDropBit),
experimentalHostUserNamespaceDefaulting: utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalHostUserNamespaceDefaultingGate),
keepTerminatedPodVolumes: keepTerminatedPodVolumes,
nodeStatusMaxImages: nodeStatusMaxImages,
tracer: tracer,
hostname: hostname,
hostnameOverridden: hostnameOverridden,
nodeName: nodeName,
kubeClient: kubeDeps.KubeClient,
heartbeatClient: kubeDeps.HeartbeatClient,
onRepeatedHeartbeatFailure: kubeDeps.OnHeartbeatFailure,
rootDirectory: filepath.Clean(rootDirectory),
resyncInterval: kubeCfg.SyncFrequency.Duration,
sourcesReady: config.NewSourcesReady(kubeDeps.PodConfig.SeenAllSources),
registerNode: registerNode,
registerWithTaints: registerWithTaints,
registerSchedulable: registerSchedulable,
dnsConfigurer: dns.NewConfigurer(kubeDeps.Recorder, nodeRef, nodeIPs, clusterDNS, kubeCfg.ClusterDomain, kubeCfg.ResolverConfig),
serviceLister: serviceLister,
serviceHasSynced: serviceHasSynced,
nodeLister: nodeLister,
nodeHasSynced: nodeHasSynced,
streamingConnectionIdleTimeout: kubeCfg.StreamingConnectionIdleTimeout.Duration,
recorder: kubeDeps.Recorder,
cadvisor: kubeDeps.CAdvisorInterface,
cloud: kubeDeps.Cloud,
externalCloudProvider: cloudprovider.IsExternal(cloudProvider),
providerID: providerID,
nodeRef: nodeRef,
nodeLabels: nodeLabels,
nodeStatusUpdateFrequency: kubeCfg.NodeStatusUpdateFrequency.Duration,
nodeStatusReportFrequency: kubeCfg.NodeStatusReportFrequency.Duration,
os: kubeDeps.OSInterface,
oomWatcher: oomWatcher,
cgroupsPerQOS: kubeCfg.CgroupsPerQOS,
cgroupRoot: kubeCfg.CgroupRoot,
mounter: kubeDeps.Mounter,
hostutil: kubeDeps.HostUtil,
subpather: kubeDeps.Subpather,
maxPods: int(kubeCfg.MaxPods),
podsPerCore: int(kubeCfg.PodsPerCore),
syncLoopMonitor: atomic.Value{},
daemonEndpoints: daemonEndpoints,
containerManager: kubeDeps.ContainerManager,
nodeIPs: nodeIPs,
nodeIPValidator: validateNodeIP,
clock: clock.RealClock{},
enableControllerAttachDetach: kubeCfg.EnableControllerAttachDetach,
makeIPTablesUtilChains: kubeCfg.MakeIPTablesUtilChains,
iptablesMasqueradeBit: int(kubeCfg.IPTablesMasqueradeBit),
iptablesDropBit: int(kubeCfg.IPTablesDropBit),
keepTerminatedPodVolumes: keepTerminatedPodVolumes,
nodeStatusMaxImages: nodeStatusMaxImages,
tracer: tracer,
}
if klet.cloud != nil {
@ -596,10 +595,6 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
klet.configMapManager = configMapManager
}
if klet.experimentalHostUserNamespaceDefaulting {
klog.InfoS("Experimental host user namespace defaulting is enabled")
}
machineInfo, err := klet.cadvisor.MachineInfo()
if err != nil {
return nil, err
@ -1227,13 +1222,6 @@ type Kubelet struct {
// The AppArmor validator for checking whether AppArmor is supported.
appArmorValidator apparmor.Validator
// experimentalHostUserNamespaceDefaulting sets userns=true when users request host namespaces (pid, ipc, net),
// are using non-namespaced capabilities (mknod, sys_time, sys_module), the pod contains a privileged container,
// or using host path volumes.
// This should only be enabled when the container runtime is performing user remapping AND if the
// experimental behavior is desired.
experimentalHostUserNamespaceDefaulting bool
// StatsProvider provides the node and the container stats.
StatsProvider *stats.Provider

View File

@ -38,7 +38,6 @@ import (
"k8s.io/apimachinery/pkg/util/sets"
utilvalidation "k8s.io/apimachinery/pkg/util/validation"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/component-helpers/storage/ephemeral"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
"k8s.io/klog/v2"
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
@ -515,11 +514,6 @@ func (kl *Kubelet) GenerateRunContainerOptions(ctx context.Context, pod *v1.Pod,
}
}
// only do this check if the experimental behavior is enabled, otherwise allow it to default to false
if kl.experimentalHostUserNamespaceDefaulting {
opts.EnableHostUserNamespace = kl.enableHostUserNamespace(ctx, pod)
}
return opts, cleanupAction, nil
}
@ -2169,82 +2163,3 @@ func (kl *Kubelet) cleanupOrphanedPodCgroups(pcm cm.PodContainerManager, cgroupP
go pcm.Destroy(val)
}
}
// enableHostUserNamespace determines if the host user namespace should be used by the container runtime.
// Returns true if the pod is using a host pid, pic, or network namespace, the pod is using a non-namespaced
// capability, the pod contains a privileged container, or the pod has a host path volume.
//
// NOTE: when if a container shares any namespace with another container it must also share the user namespace
// or it will not have the correct capabilities in the namespace. This means that host user namespace
// is enabled per pod, not per container.
func (kl *Kubelet) enableHostUserNamespace(ctx context.Context, pod *v1.Pod) bool {
if kubecontainer.HasPrivilegedContainer(pod) || hasHostNamespace(pod) ||
hasHostVolume(pod) || hasNonNamespacedCapability(pod) || kl.hasHostMountPVC(ctx, pod) {
return true
}
return false
}
// hasNonNamespacedCapability returns true if MKNOD, SYS_TIME, or SYS_MODULE is requested for any container.
func hasNonNamespacedCapability(pod *v1.Pod) bool {
for _, c := range pod.Spec.Containers {
if c.SecurityContext != nil && c.SecurityContext.Capabilities != nil {
for _, cap := range c.SecurityContext.Capabilities.Add {
if cap == "MKNOD" || cap == "SYS_TIME" || cap == "SYS_MODULE" {
return true
}
}
}
}
return false
}
// hasHostVolume returns true if the pod spec has a HostPath volume.
func hasHostVolume(pod *v1.Pod) bool {
for _, v := range pod.Spec.Volumes {
if v.HostPath != nil {
return true
}
}
return false
}
// hasHostNamespace returns true if hostIPC, hostNetwork, or hostPID are set to true.
func hasHostNamespace(pod *v1.Pod) bool {
if pod.Spec.SecurityContext == nil {
return false
}
return pod.Spec.HostIPC || pod.Spec.HostNetwork || pod.Spec.HostPID
}
// hasHostMountPVC returns true if a PVC is referencing a HostPath volume.
func (kl *Kubelet) hasHostMountPVC(ctx context.Context, pod *v1.Pod) bool {
for _, volume := range pod.Spec.Volumes {
pvcName := ""
switch {
case volume.PersistentVolumeClaim != nil:
pvcName = volume.PersistentVolumeClaim.ClaimName
case volume.Ephemeral != nil:
pvcName = ephemeral.VolumeClaimName(pod, &volume)
default:
continue
}
pvc, err := kl.kubeClient.CoreV1().PersistentVolumeClaims(pod.Namespace).Get(ctx, pvcName, metav1.GetOptions{})
if err != nil {
klog.InfoS("Unable to retrieve pvc", "pvc", klog.KRef(pod.Namespace, pvcName), "err", err)
continue
}
if pvc != nil {
referencedVolume, err := kl.kubeClient.CoreV1().PersistentVolumes().Get(ctx, pvc.Spec.VolumeName, metav1.GetOptions{})
if err != nil {
klog.InfoS("Unable to retrieve pv", "pvName", pvc.Spec.VolumeName, "err", err)
continue
}
if referencedVolume != nil && referencedVolume.Spec.HostPath != nil {
return true
}
}
}
return false
}

View File

@ -3437,222 +3437,6 @@ func TestGetPortForward(t *testing.T) {
}
}
func TestHasHostMountPVC(t *testing.T) {
type testcase struct {
pvError error
pvcError error
expected bool
podHasPVC bool
pvcIsHostPath bool
podHasEphemeral bool
}
tests := map[string]testcase{
"no pvc": {podHasPVC: false, expected: false},
"error fetching pvc": {
podHasPVC: true,
pvcError: fmt.Errorf("foo"),
expected: false,
},
"error fetching pv": {
podHasPVC: true,
pvError: fmt.Errorf("foo"),
expected: false,
},
"host path pvc": {
podHasPVC: true,
pvcIsHostPath: true,
expected: true,
},
"enabled ephemeral host path": {
podHasEphemeral: true,
pvcIsHostPath: true,
expected: true,
},
"non host path pvc": {
podHasPVC: true,
pvcIsHostPath: false,
expected: false,
},
}
run := func(t *testing.T, v testcase) {
ctx := context.Background()
testKubelet := newTestKubelet(t, false)
defer testKubelet.Cleanup()
pod := &v1.Pod{
Spec: v1.PodSpec{},
}
volumeToReturn := &v1.PersistentVolume{
Spec: v1.PersistentVolumeSpec{},
}
if v.podHasPVC {
pod.Spec.Volumes = []v1.Volume{
{
VolumeSource: v1.VolumeSource{
PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{},
},
},
}
}
if v.podHasEphemeral {
pod.Spec.Volumes = []v1.Volume{
{
Name: "xyz",
VolumeSource: v1.VolumeSource{
Ephemeral: &v1.EphemeralVolumeSource{},
},
},
}
}
if (v.podHasPVC || v.podHasEphemeral) && v.pvcIsHostPath {
volumeToReturn.Spec.PersistentVolumeSource = v1.PersistentVolumeSource{
HostPath: &v1.HostPathVolumeSource{},
}
}
testKubelet.fakeKubeClient.AddReactor("get", "persistentvolumeclaims", func(action core.Action) (bool, runtime.Object, error) {
return true, &v1.PersistentVolumeClaim{
Spec: v1.PersistentVolumeClaimSpec{
VolumeName: "foo",
},
}, v.pvcError
})
testKubelet.fakeKubeClient.AddReactor("get", "persistentvolumes", func(action core.Action) (bool, runtime.Object, error) {
return true, volumeToReturn, v.pvError
})
actual := testKubelet.kubelet.hasHostMountPVC(ctx, pod)
if actual != v.expected {
t.Errorf("expected %t but got %t", v.expected, actual)
}
}
for k, v := range tests {
t.Run(k, func(t *testing.T) {
run(t, v)
})
}
}
func TestHasNonNamespacedCapability(t *testing.T) {
createPodWithCap := func(caps []v1.Capability) *v1.Pod {
pod := &v1.Pod{
Spec: v1.PodSpec{
Containers: []v1.Container{{}},
},
}
if len(caps) > 0 {
pod.Spec.Containers[0].SecurityContext = &v1.SecurityContext{
Capabilities: &v1.Capabilities{
Add: caps,
},
}
}
return pod
}
nilCaps := createPodWithCap([]v1.Capability{v1.Capability("foo")})
nilCaps.Spec.Containers[0].SecurityContext = nil
tests := map[string]struct {
pod *v1.Pod
expected bool
}{
"nil security contxt": {createPodWithCap(nil), false},
"nil caps": {nilCaps, false},
"namespaced cap": {createPodWithCap([]v1.Capability{v1.Capability("foo")}), false},
"non-namespaced cap MKNOD": {createPodWithCap([]v1.Capability{v1.Capability("MKNOD")}), true},
"non-namespaced cap SYS_TIME": {createPodWithCap([]v1.Capability{v1.Capability("SYS_TIME")}), true},
"non-namespaced cap SYS_MODULE": {createPodWithCap([]v1.Capability{v1.Capability("SYS_MODULE")}), true},
}
for k, v := range tests {
actual := hasNonNamespacedCapability(v.pod)
if actual != v.expected {
t.Errorf("%s failed, expected %t but got %t", k, v.expected, actual)
}
}
}
func TestHasHostVolume(t *testing.T) {
pod := &v1.Pod{
Spec: v1.PodSpec{
Volumes: []v1.Volume{
{
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{},
},
},
},
},
}
result := hasHostVolume(pod)
if !result {
t.Errorf("expected host volume to enable host user namespace")
}
pod.Spec.Volumes[0].VolumeSource.HostPath = nil
result = hasHostVolume(pod)
if result {
t.Errorf("expected nil host volume to not enable host user namespace")
}
}
func TestHasHostNamespace(t *testing.T) {
tests := map[string]struct {
ps v1.PodSpec
expected bool
}{
"nil psc": {
ps: v1.PodSpec{},
expected: false},
"host pid true": {
ps: v1.PodSpec{
HostPID: true,
SecurityContext: &v1.PodSecurityContext{},
},
expected: true,
},
"host ipc true": {
ps: v1.PodSpec{
HostIPC: true,
SecurityContext: &v1.PodSecurityContext{},
},
expected: true,
},
"host net true": {
ps: v1.PodSpec{
HostNetwork: true,
SecurityContext: &v1.PodSecurityContext{},
},
expected: true,
},
"no host ns": {
ps: v1.PodSpec{
SecurityContext: &v1.PodSecurityContext{},
},
expected: false,
},
}
for k, v := range tests {
pod := &v1.Pod{
Spec: v.ps,
}
actual := hasHostNamespace(pod)
if actual != v.expected {
t.Errorf("%s failed, expected %t but got %t", k, v.expected, actual)
}
}
}
func TestTruncatePodHostname(t *testing.T) {
for c, test := range map[string]struct {
input string