diff --git a/pkg/kubelet/container/runtime.go b/pkg/kubelet/container/runtime.go index 8de4682bd3a..94001216a17 100644 --- a/pkg/kubelet/container/runtime.go +++ b/pkg/kubelet/container/runtime.go @@ -434,6 +434,12 @@ type RunContainerOptions struct { ReadOnly bool // hostname for pod containers Hostname string + // EnableHostUserNamespace sets userns=host when the pod requests host namespaces (pid, ipc, net), + // uses non-namespaced capabilities (mknod, sys_time, sys_module), contains a privileged container, + // or uses host path volumes. + // This should only be enabled when the container runtime is performing user remapping AND if the + // experimental behavior is desired. + EnableHostUserNamespace bool } // VolumeInfo contains information about the volume. diff --git a/pkg/kubelet/dockershim/docker_service.go b/pkg/kubelet/dockershim/docker_service.go index 1cf1cb13bdd..516b5bb7049 100644 --- a/pkg/kubelet/dockershim/docker_service.go +++ b/pkg/kubelet/dockershim/docker_service.go @@ -57,6 +57,13 @@ const ( containerTypeLabelContainer = "container" containerLogPathLabelKey = "io.kubernetes.container.logpath" sandboxIDLabelKey = "io.kubernetes.sandbox.id" + + // TODO: https://github.com/kubernetes/kubernetes/pull/31169 provides experimental + // defaulting of host user namespace that may be enabled when the docker daemon + // is using remapped UIDs. + // Dockershim should provide detection support for a remapping environment. + // This should be included in the feature proposal. Defaulting may still occur according + // to kubelet behavior and system settings in addition to any API flags that may be introduced. 
) // NetworkPluginSettings is the subset of kubelet runtime args we pass diff --git a/pkg/kubelet/dockertools/docker_manager.go b/pkg/kubelet/dockertools/docker_manager.go index 743380ed4f9..423b18cb8ea 100644 --- a/pkg/kubelet/dockertools/docker_manager.go +++ b/pkg/kubelet/dockertools/docker_manager.go @@ -681,12 +681,18 @@ func (dm *DockerManager) runContainer( } } + userNsMode := "" + if opts.EnableHostUserNamespace { + userNsMode = "host" + } + hc := &dockercontainer.HostConfig{ Binds: binds, NetworkMode: dockercontainer.NetworkMode(netMode), IpcMode: dockercontainer.IpcMode(ipcMode), UTSMode: dockercontainer.UTSMode(utsMode), PidMode: dockercontainer.PidMode(pidMode), + UsernsMode: dockercontainer.UsernsMode(userNsMode), ReadonlyRootfs: readOnlyRootFilesystem(container), Resources: dockercontainer.Resources{ Memory: memoryLimit, diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index de7a64f139b..22ff1ad8ba5 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -450,15 +450,20 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub containerManager: kubeDeps.ContainerManager, nodeIP: net.ParseIP(kubeCfg.NodeIP), clock: clock.RealClock{}, - outOfDiskTransitionFrequency: kubeCfg.OutOfDiskTransitionFrequency.Duration, - reservation: *reservation, - enableCustomMetrics: kubeCfg.EnableCustomMetrics, - babysitDaemons: kubeCfg.BabysitDaemons, - enableControllerAttachDetach: kubeCfg.EnableControllerAttachDetach, - iptClient: utilipt.New(utilexec.New(), utildbus.New(), utilipt.ProtocolIpv4), - makeIPTablesUtilChains: kubeCfg.MakeIPTablesUtilChains, - iptablesMasqueradeBit: int(kubeCfg.IPTablesMasqueradeBit), - iptablesDropBit: int(kubeCfg.IPTablesDropBit), + outOfDiskTransitionFrequency: kubeCfg.OutOfDiskTransitionFrequency.Duration, + reservation: *reservation, + enableCustomMetrics: kubeCfg.EnableCustomMetrics, + babysitDaemons: kubeCfg.BabysitDaemons, + enableControllerAttachDetach: 
kubeCfg.EnableControllerAttachDetach, + iptClient: utilipt.New(utilexec.New(), utildbus.New(), utilipt.ProtocolIpv4), + makeIPTablesUtilChains: kubeCfg.MakeIPTablesUtilChains, + iptablesMasqueradeBit: int(kubeCfg.IPTablesMasqueradeBit), + iptablesDropBit: int(kubeCfg.IPTablesDropBit), + experimentalHostUserNamespaceDefaulting: utilconfig.DefaultFeatureGate.ExperimentalHostUserNamespaceDefaulting(), + } + + if klet.experimentalHostUserNamespaceDefaulting { + glog.Infof("Experimental host user namespace defaulting is enabled.") } if mode, err := effectiveHairpinMode(componentconfig.HairpinMode(kubeCfg.HairpinMode), kubeCfg.ContainerRuntime, kubeCfg.NetworkPluginName); err != nil { @@ -1087,6 +1092,13 @@ type Kubelet struct { // The handler serving CRI streaming calls (exec/attach/port-forward). criHandler http.Handler + + // experimentalHostUserNamespaceDefaulting sets userns=host when the pod requests host namespaces (pid, ipc, net), + // uses non-namespaced capabilities (mknod, sys_time, sys_module), contains a privileged container, + // or uses host path volumes. + // This should only be enabled when the container runtime is performing user remapping AND if the + // experimental behavior is desired. 
+ experimentalHostUserNamespaceDefaulting bool } // setupDataDirs creates: diff --git a/pkg/kubelet/kubelet_pods.go b/pkg/kubelet/kubelet_pods.go index ffbf901e396..d2b8406887b 100644 --- a/pkg/kubelet/kubelet_pods.go +++ b/pkg/kubelet/kubelet_pods.go @@ -315,6 +315,11 @@ func (kl *Kubelet) GenerateRunContainerOptions(pod *api.Pod, container *api.Cont return nil, err } + // only do this check if the experimental behavior is enabled, otherwise allow it to default to false + if kl.experimentalHostUserNamespaceDefaulting { + opts.EnableHostUserNamespace = kl.enableHostUserNamespace(pod) + } + return opts, nil } @@ -1397,3 +1402,87 @@ func (kl *Kubelet) cleanupOrphanedPodCgroups( } return nil } + +// enableHostUserNamespace determines if the host user namespace should be used by the container runtime. +// Returns true if the pod is using a host pid, ipc, or network namespace, the pod is using a non-namespaced +// capability, the pod contains a privileged container, or the pod has a host path volume (directly or via a PVC). +// +// NOTE: if a container shares any namespace with another container it must also share the user namespace +// or it will not have the correct capabilities in the namespace. This means that host user namespace +// is enabled per pod, not per container. +func (kl *Kubelet) enableHostUserNamespace(pod *api.Pod) bool { + if hasPrivilegedContainer(pod) || hasHostNamespace(pod) || + hasHostVolume(pod) || hasNonNamespacedCapability(pod) || kl.hasHostMountPVC(pod) { + return true + } + return false +} + +// hasPrivilegedContainer returns true if any of the containers in the pod are privileged. +func hasPrivilegedContainer(pod *api.Pod) bool { + for _, c := range pod.Spec.Containers { + if c.SecurityContext != nil && + c.SecurityContext.Privileged != nil && + *c.SecurityContext.Privileged { + return true + } + } + return false +} + +// hasNonNamespacedCapability returns true if MKNOD, SYS_TIME, or SYS_MODULE is requested for any container. 
+func hasNonNamespacedCapability(pod *api.Pod) bool { + for _, c := range pod.Spec.Containers { + if c.SecurityContext != nil && c.SecurityContext.Capabilities != nil { + for _, cap := range c.SecurityContext.Capabilities.Add { + if cap == "MKNOD" || cap == "SYS_TIME" || cap == "SYS_MODULE" { + return true + } + } + } + } + + return false +} + +// hasHostVolume returns true if the pod spec has a HostPath volume. +func hasHostVolume(pod *api.Pod) bool { + for _, v := range pod.Spec.Volumes { + if v.HostPath != nil { + return true + } + } + return false +} + +// hasHostNamespace returns true if hostIPC, hostNetwork, or hostPID are set to true. +func hasHostNamespace(pod *api.Pod) bool { + if pod.Spec.SecurityContext == nil { + return false + } + return pod.Spec.SecurityContext.HostIPC || pod.Spec.SecurityContext.HostNetwork || pod.Spec.SecurityContext.HostPID +} + +// hasHostMountPVC returns true if any PVC used by the pod resolves to a HostPath persistent volume; lookup errors are logged and that claim is skipped. +func (kl *Kubelet) hasHostMountPVC(pod *api.Pod) bool { + for _, volume := range pod.Spec.Volumes { + if volume.PersistentVolumeClaim != nil { + pvc, err := kl.kubeClient.Core().PersistentVolumeClaims(pod.Namespace).Get(volume.PersistentVolumeClaim.ClaimName) + if err != nil { + glog.Warningf("unable to retrieve pvc %s:%s - %v", pod.Namespace, volume.PersistentVolumeClaim.ClaimName, err) + continue + } + if pvc != nil { + referencedVolume, err := kl.kubeClient.Core().PersistentVolumes().Get(pvc.Spec.VolumeName) + if err != nil { + glog.Warningf("unable to retrieve pv %s - %v", pvc.Spec.VolumeName, err) + continue + } + if referencedVolume != nil && referencedVolume.Spec.HostPath != nil { + return true + } + } + } + } + return false +} diff --git a/pkg/kubelet/kubelet_pods_test.go b/pkg/kubelet/kubelet_pods_test.go index dd6a3b19058..6a02500d3a4 100644 --- a/pkg/kubelet/kubelet_pods_test.go +++ b/pkg/kubelet/kubelet_pods_test.go @@ -19,6 +19,7 @@ package kubelet import ( "bytes" "errors" + "fmt" "net" "sort" "testing" @@ -28,10 
+29,12 @@ import ( "k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api/resource" "k8s.io/kubernetes/pkg/apimachinery/registered" + "k8s.io/kubernetes/pkg/client/testing/core" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" containertest "k8s.io/kubernetes/pkg/kubelet/container/testing" "k8s.io/kubernetes/pkg/kubelet/server/remotecommand" "k8s.io/kubernetes/pkg/labels" + "k8s.io/kubernetes/pkg/runtime" "k8s.io/kubernetes/pkg/types" ) @@ -1264,3 +1267,230 @@ func TestMakeDevices(t *testing.T) { assert.Equal(t, test.devices, makeDevices(test.container), "[test %q]", test.test) } } + +func TestHasPrivilegedContainer(t *testing.T) { + newBoolPtr := func(b bool) *bool { + return &b + } + tests := map[string]struct { + securityContext *api.SecurityContext + expected bool + }{ + "nil sc": { + securityContext: nil, + expected: false, + }, + "nil privleged": { + securityContext: &api.SecurityContext{}, + expected: false, + }, + "false privleged": { + securityContext: &api.SecurityContext{Privileged: newBoolPtr(false)}, + expected: false, + }, + "true privleged": { + securityContext: &api.SecurityContext{Privileged: newBoolPtr(true)}, + expected: true, + }, + } + + for k, v := range tests { + pod := &api.Pod{ + Spec: api.PodSpec{ + Containers: []api.Container{ + {SecurityContext: v.securityContext}, + }, + }, + } + actual := hasPrivilegedContainer(pod) + if actual != v.expected { + t.Errorf("%s expected %t but got %t", k, v.expected, actual) + } + } +} + +func TestHasHostMountPVC(t *testing.T) { + tests := map[string]struct { + pvError error + pvcError error + expected bool + podHasPVC bool + pvcIsHostPath bool + }{ + "no pvc": {podHasPVC: false, expected: false}, + "error fetching pvc": { + podHasPVC: true, + pvcError: fmt.Errorf("foo"), + expected: false, + }, + "error fetching pv": { + podHasPVC: true, + pvError: fmt.Errorf("foo"), + expected: false, + }, + "host path pvc": { + podHasPVC: true, + pvcIsHostPath: true, + expected: true, + }, + "non host path pvc": 
{ + podHasPVC: true, + pvcIsHostPath: false, + expected: false, + }, + } + + for k, v := range tests { + testKubelet := newTestKubelet(t, false) + pod := &api.Pod{ + Spec: api.PodSpec{}, + } + + volumeToReturn := &api.PersistentVolume{ + Spec: api.PersistentVolumeSpec{}, + } + + if v.podHasPVC { + pod.Spec.Volumes = []api.Volume{ + { + VolumeSource: api.VolumeSource{ + PersistentVolumeClaim: &api.PersistentVolumeClaimVolumeSource{}, + }, + }, + } + + if v.pvcIsHostPath { + volumeToReturn.Spec.PersistentVolumeSource = api.PersistentVolumeSource{ + HostPath: &api.HostPathVolumeSource{}, + } + } + + } + + testKubelet.fakeKubeClient.AddReactor("get", "persistentvolumeclaims", func(action core.Action) (bool, runtime.Object, error) { + return true, &api.PersistentVolumeClaim{ + Spec: api.PersistentVolumeClaimSpec{ + VolumeName: "foo", + }, + }, v.pvcError + }) + testKubelet.fakeKubeClient.AddReactor("get", "persistentvolumes", func(action core.Action) (bool, runtime.Object, error) { + return true, volumeToReturn, v.pvError + }) + + actual := testKubelet.kubelet.hasHostMountPVC(pod) + if actual != v.expected { + t.Errorf("%s expected %t but got %t", k, v.expected, actual) + } + + } +} + +func TestHasNonNamespacedCapability(t *testing.T) { + createPodWithCap := func(caps []api.Capability) *api.Pod { + pod := &api.Pod{ + Spec: api.PodSpec{ + Containers: []api.Container{{}}, + }, + } + + if len(caps) > 0 { + pod.Spec.Containers[0].SecurityContext = &api.SecurityContext{ + Capabilities: &api.Capabilities{ + Add: caps, + }, + } + } + return pod + } + + nilCaps := createPodWithCap([]api.Capability{api.Capability("foo")}) + nilCaps.Spec.Containers[0].SecurityContext = nil + + tests := map[string]struct { + pod *api.Pod + expected bool + }{ + "nil security contxt": {createPodWithCap(nil), false}, + "nil caps": {nilCaps, false}, + "namespaced cap": {createPodWithCap([]api.Capability{api.Capability("foo")}), false}, + "non-namespaced cap MKNOD": 
{createPodWithCap([]api.Capability{api.Capability("MKNOD")}), true}, + "non-namespaced cap SYS_TIME": {createPodWithCap([]api.Capability{api.Capability("SYS_TIME")}), true}, + "non-namespaced cap SYS_MODULE": {createPodWithCap([]api.Capability{api.Capability("SYS_MODULE")}), true}, + } + + for k, v := range tests { + actual := hasNonNamespacedCapability(v.pod) + if actual != v.expected { + t.Errorf("%s failed, expected %t but got %t", k, v.expected, actual) + } + } +} + +func TestHasHostVolume(t *testing.T) { + pod := &api.Pod{ + Spec: api.PodSpec{ + Volumes: []api.Volume{ + { + VolumeSource: api.VolumeSource{ + HostPath: &api.HostPathVolumeSource{}, + }, + }, + }, + }, + } + + result := hasHostVolume(pod) + if !result { + t.Errorf("expected host volume to enable host user namespace") + } + + pod.Spec.Volumes[0].VolumeSource.HostPath = nil + result = hasHostVolume(pod) + if result { + t.Errorf("expected nil host volume to not enable host user namespace") + } +} + +func TestHasHostNamespace(t *testing.T) { + tests := map[string]struct { + psc *api.PodSecurityContext + expected bool + }{ + "nil psc": {psc: nil, expected: false}, + "host pid true": { + psc: &api.PodSecurityContext{ + HostPID: true, + }, + expected: true, + }, + "host ipc true": { + psc: &api.PodSecurityContext{ + HostIPC: true, + }, + expected: true, + }, + "host net true": { + psc: &api.PodSecurityContext{ + HostNetwork: true, + }, + expected: true, + }, + "no host ns": { + psc: &api.PodSecurityContext{}, + expected: false, + }, + } + + for k, v := range tests { + pod := &api.Pod{ + Spec: api.PodSpec{ + SecurityContext: v.psc, + }, + } + actual := hasHostNamespace(pod) + if actual != v.expected { + t.Errorf("%s failed, expected %t but got %t", k, v.expected, actual) + } + } +} diff --git a/pkg/util/config/feature_gate.go b/pkg/util/config/feature_gate.go index 11996dabf14..ff90f2fb289 100644 --- a/pkg/util/config/feature_gate.go +++ b/pkg/util/config/feature_gate.go @@ -43,18 +43,26 @@ const ( 
dynamicKubeletConfig = "DynamicKubeletConfig" dynamicVolumeProvisioning = "DynamicVolumeProvisioning" streamingProxyRedirects = "StreamingProxyRedirects" + + // experimentalHostUserNamespaceDefaulting Default userns=host for containers + // that are using other host namespaces, host mounts, the pod contains a privileged container, + // or specific non-namespaced capabilities + // (MKNOD, SYS_MODULE, SYS_TIME). This should only be enabled if user namespace remapping is enabled + // in the docker daemon. + experimentalHostUserNamespaceDefaultingGate = "ExperimentalHostUserNamespaceDefaulting" ) var ( // Default values for recorded features. Every new feature gate should be // represented here. knownFeatures = map[string]featureSpec{ - allAlphaGate: {false, alpha}, - externalTrafficLocalOnly: {true, beta}, - appArmor: {true, beta}, - dynamicKubeletConfig: {false, alpha}, - dynamicVolumeProvisioning: {true, alpha}, - streamingProxyRedirects: {false, alpha}, + allAlphaGate: {false, alpha}, + externalTrafficLocalOnly: {true, beta}, + appArmor: {true, beta}, + dynamicKubeletConfig: {false, alpha}, + dynamicVolumeProvisioning: {true, alpha}, + streamingProxyRedirects: {false, alpha}, + experimentalHostUserNamespaceDefaultingGate: {false, alpha}, } // Special handling for a few gates. @@ -115,6 +123,10 @@ type FeatureGate interface { // owner: timstclair // alpha: v1.5 StreamingProxyRedirects() bool + + // owner: @pweil- + // alpha: v1.5 + ExperimentalHostUserNamespaceDefaulting() bool } // featureGate implements FeatureGate as well as pflag.Value for flag parsing. 
@@ -209,6 +221,11 @@ func (f *featureGate) StreamingProxyRedirects() bool { return f.lookup(streamingProxyRedirects) } +// ExperimentalHostUserNamespaceDefaulting returns value for experimentalHostUserNamespaceDefaulting +func (f *featureGate) ExperimentalHostUserNamespaceDefaulting() bool { + return f.lookup(experimentalHostUserNamespaceDefaultingGate) +} + func (f *featureGate) lookup(key string) bool { defaultValue := f.known[key].enabled if f.enabled != nil {