diff --git a/cmd/kubelet/app/options/options.go b/cmd/kubelet/app/options/options.go index 144f38c1171..cf9d7c235b8 100644 --- a/cmd/kubelet/app/options/options.go +++ b/cmd/kubelet/app/options/options.go @@ -309,6 +309,7 @@ func (c *kubeletConfiguration) addFlags(fs *pflag.FlagSet) { fs.StringVar(&c.RemoteRuntimeEndpoint, "container-runtime-endpoint", c.RemoteRuntimeEndpoint, "[Experimental] The unix socket endpoint of remote runtime service. The endpoint is used only when CRI integration is enabled (--enable-cri)") fs.StringVar(&c.RemoteImageEndpoint, "image-service-endpoint", c.RemoteImageEndpoint, "[Experimental] The unix socket endpoint of remote image service. If not specified, it will be the same with container-runtime-endpoint by default. The endpoint is used only when CRI integration is enabled (--enable-cri)") + fs.BoolVar(&c.DockerEnableSharedPID, "experimental-docker-enable-shared-pid", c.DockerEnableSharedPID, "[Experimental] The Container Runtime Interface (CRI) will eventually default to using a shared PID namespace for containers in a pod. Setting this flag allows previewing this behavior when running with the CRI enabled and Docker version 1.13.1 or higher.") fs.BoolVar(&c.ExperimentalCheckNodeCapabilitiesBeforeMount, "experimental-check-node-capabilities-before-mount", c.ExperimentalCheckNodeCapabilitiesBeforeMount, "[Experimental] if set true, the kubelet will check the underlying node for required componenets (binaries, etc.) 
before performing the mount") diff --git a/cmd/kubelet/app/server.go b/cmd/kubelet/app/server.go index 30279dff836..92a41383de7 100644 --- a/cmd/kubelet/app/server.go +++ b/cmd/kubelet/app/server.go @@ -976,7 +976,8 @@ func RunDockershim(c *componentconfig.KubeletConfiguration, dockershimRootDir st } ds, err := dockershim.NewDockerService(dockerClient, c.SeccompProfileRoot, c.PodInfraContainerImage, - streamingConfig, &pluginSettings, c.RuntimeCgroups, c.CgroupDriver, dockerExecHandler, dockershimRootDir) + streamingConfig, &pluginSettings, c.RuntimeCgroups, c.CgroupDriver, dockerExecHandler, dockershimRootDir, + !c.DockerEnableSharedPID) if err != nil { return err } diff --git a/hack/verify-flags/known-flags.txt b/hack/verify-flags/known-flags.txt index 76f40b59b44..f100797cf0a 100644 --- a/hack/verify-flags/known-flags.txt +++ b/hack/verify-flags/known-flags.txt @@ -244,6 +244,7 @@ experimental-check-node-capabilities-before-mount experimental-cri experimental-dockershim experimental-dockershim-root-directory +experimental-docker-enable-shared-pid experimental-fail-swap-on experimental-kernel-memcg-notification experimental-keystone-ca-file diff --git a/pkg/apis/componentconfig/types.go b/pkg/apis/componentconfig/types.go index 4790c495be6..6ca58411e67 100644 --- a/pkg/apis/componentconfig/types.go +++ b/pkg/apis/componentconfig/types.go @@ -481,6 +481,11 @@ type KubeletConfiguration struct { // This flag, if set, instructs the kubelet to keep volumes from terminated pods mounted to the node. // This can be useful for debugging volume related issues. KeepTerminatedPodVolumes bool + // This flag, if set, enables use of a shared PID namespace for pods running in the docker CRI runtime. + // A shared PID namespace is the only option in non-docker runtimes and is required by the CRI. The ability to + // disable it for docker will be removed unless a compelling use case is discovered with widespread use. 
+ // TODO: Remove once we no longer support disabling shared PID namespace (https://issues.k8s.io/41938) + DockerEnableSharedPID bool /* following flags are meant for Node Allocatable */ diff --git a/pkg/apis/componentconfig/v1alpha1/types.go b/pkg/apis/componentconfig/v1alpha1/types.go index 847168ae707..a85cc657d99 100644 --- a/pkg/apis/componentconfig/v1alpha1/types.go +++ b/pkg/apis/componentconfig/v1alpha1/types.go @@ -536,6 +536,8 @@ type KubeletConfiguration struct { // This flag, if set, instructs the kubelet to keep volumes from terminated pods mounted to the node. // This can be useful for debugging volume related issues. KeepTerminatedPodVolumes bool `json:"keepTerminatedPodVolumes,omitempty"` + // This flag, if set, enables use of a shared PID namespace for pods run by the docker CRI runtime. + DockerEnableSharedPID *bool `json:"dockerEnableSharedPID,omitempty"` /* following flags are meant for Node Allocatable */ diff --git a/pkg/apis/componentconfig/v1alpha1/zz_generated.conversion.go b/pkg/apis/componentconfig/v1alpha1/zz_generated.conversion.go index 73eadeae197..932f8d9638b 100644 --- a/pkg/apis/componentconfig/v1alpha1/zz_generated.conversion.go +++ b/pkg/apis/componentconfig/v1alpha1/zz_generated.conversion.go @@ -436,6 +436,9 @@ func autoConvert_v1alpha1_KubeletConfiguration_To_componentconfig_KubeletConfigu out.ExperimentalFailSwapOn = in.ExperimentalFailSwapOn out.ExperimentalCheckNodeCapabilitiesBeforeMount = in.ExperimentalCheckNodeCapabilitiesBeforeMount out.KeepTerminatedPodVolumes = in.KeepTerminatedPodVolumes + if err := v1.Convert_Pointer_bool_To_bool(&in.DockerEnableSharedPID, &out.DockerEnableSharedPID, s); err != nil { + return err + } out.SystemReserved = *(*componentconfig.ConfigurationMap)(unsafe.Pointer(&in.SystemReserved)) out.KubeReserved = *(*componentconfig.ConfigurationMap)(unsafe.Pointer(&in.KubeReserved)) out.SystemReservedCgroup = in.SystemReservedCgroup @@ -637,6 +640,9 @@ func 
autoConvert_componentconfig_KubeletConfiguration_To_v1alpha1_KubeletConfigu out.ExperimentalFailSwapOn = in.ExperimentalFailSwapOn out.ExperimentalCheckNodeCapabilitiesBeforeMount = in.ExperimentalCheckNodeCapabilitiesBeforeMount out.KeepTerminatedPodVolumes = in.KeepTerminatedPodVolumes + if err := v1.Convert_bool_To_Pointer_bool(&in.DockerEnableSharedPID, &out.DockerEnableSharedPID, s); err != nil { + return err + } out.SystemReserved = *(*map[string]string)(unsafe.Pointer(&in.SystemReserved)) out.KubeReserved = *(*map[string]string)(unsafe.Pointer(&in.KubeReserved)) out.SystemReservedCgroup = in.SystemReservedCgroup diff --git a/pkg/apis/componentconfig/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/componentconfig/v1alpha1/zz_generated.deepcopy.go index f8e5b76a776..4ca4c4d306b 100644 --- a/pkg/apis/componentconfig/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/componentconfig/v1alpha1/zz_generated.deepcopy.go @@ -303,6 +303,11 @@ func DeepCopy_v1alpha1_KubeletConfiguration(in interface{}, out interface{}, c * *out = new(bool) **out = **in } + if in.DockerEnableSharedPID != nil { + in, out := &in.DockerEnableSharedPID, &out.DockerEnableSharedPID + *out = new(bool) + **out = **in + } if in.SystemReserved != nil { in, out := &in.SystemReserved, &out.SystemReserved *out = make(map[string]string) diff --git a/pkg/kubelet/dockershim/docker_container.go b/pkg/kubelet/dockershim/docker_container.go index 428c8c15551..45c85b5b0f4 100644 --- a/pkg/kubelet/dockershim/docker_container.go +++ b/pkg/kubelet/dockershim/docker_container.go @@ -163,6 +163,7 @@ func (ds *dockerService) CreateContainer(podSandboxID string, config *runtimeapi // Apply security context. applyContainerSecurityContext(lc, podSandboxID, createConfig.Config, hc, securityOptSep) + modifyPIDNamespaceOverrides(ds.disableSharedPID, apiVersion, hc) } // Apply cgroupsParent derived from the sandbox config. 
diff --git a/pkg/kubelet/dockershim/docker_service.go b/pkg/kubelet/dockershim/docker_service.go index 08558c5dc4c..44696c59200 100644 --- a/pkg/kubelet/dockershim/docker_service.go +++ b/pkg/kubelet/dockershim/docker_service.go @@ -147,7 +147,7 @@ var internalLabelKeys []string = []string{containerTypeLabelKey, containerLogPat // NOTE: Anything passed to DockerService should be eventually handled in another way when we switch to running the shim as a different process. func NewDockerService(client dockertools.DockerInterface, seccompProfileRoot string, podSandboxImage string, streamingConfig *streaming.Config, - pluginSettings *NetworkPluginSettings, cgroupsName string, kubeCgroupDriver string, execHandler dockertools.ExecHandler, dockershimRootDir string) (DockerService, error) { + pluginSettings *NetworkPluginSettings, cgroupsName string, kubeCgroupDriver string, execHandler dockertools.ExecHandler, dockershimRootDir string, disableSharedPID bool) (DockerService, error) { c := dockertools.NewInstrumentedDockerInterface(client) checkpointHandler, err := NewPersistentCheckpointHandler(dockershimRootDir) if err != nil { @@ -164,6 +164,7 @@ func NewDockerService(client dockertools.DockerInterface, seccompProfileRoot str }, containerManager: cm.NewContainerManager(cgroupsName, client), checkpointHandler: checkpointHandler, + disableSharedPID: disableSharedPID, } // check docker version compatibility. @@ -249,6 +250,11 @@ type dockerService struct { // version checking for some operations. Use this cache to avoid querying // the docker daemon every time we need to do such checks. versionCache *cache.ObjectCache + // This option provides an escape hatch to override the new default behavior for Docker under + // the CRI to use a shared PID namespace for all pods. It is temporary and will be removed. + // See proposals/pod-pid-namespace.md for details. 
+ // TODO: Remove once the escape hatch is no longer used (https://issues.k8s.io/41938) + disableSharedPID bool } // Version returns the runtime name, runtime version and runtime API version diff --git a/pkg/kubelet/dockershim/security_context.go b/pkg/kubelet/dockershim/security_context.go index f1f3025b9a6..55b7030e4a5 100644 --- a/pkg/kubelet/dockershim/security_context.go +++ b/pkg/kubelet/dockershim/security_context.go @@ -19,7 +19,9 @@ package dockershim import ( "fmt" "strconv" + "strings" + "github.com/blang/semver" dockercontainer "github.com/docker/engine-api/types/container" "k8s.io/kubernetes/pkg/api/v1" @@ -123,6 +125,7 @@ func modifyContainerNamespaceOptions(nsOpts *runtimeapi.NamespaceOption, sandbox if nsOpts != nil { hostNetwork = nsOpts.HostNetwork } + hostConfig.PidMode = dockercontainer.PidMode(fmt.Sprintf("container:%v", sandboxID)) modifyCommonNamespaceOptions(nsOpts, hostConfig) modifyHostNetworkOptionForContainer(hostNetwork, sandboxID, hostConfig) } @@ -172,3 +175,19 @@ func modifyHostNetworkOptionForContainer(hostNetwork bool, sandboxID string, hc hc.UTSMode = namespaceModeHost } } + +// modifyPIDNamespaceOverrides implements two temporary overrides for the default PID namespace sharing for Docker: +// 1. Docker engine prior to API Version 1.24 doesn't support attaching to another container's +// PID namespace, and it didn't stabilize until 1.26. This check can be removed when Kubernetes' +// minimum Docker version is at least 1.13.1 (API version 1.26). +// 2. The administrator has overridden the default behavior by means of a kubelet flag. This is an +// "escape hatch" to return to previous behavior of isolated namespaces and should be removed once +// no longer needed. 
+func modifyPIDNamespaceOverrides(disableSharedPID bool, version *semver.Version, hc *dockercontainer.HostConfig) {
+	if hc == nil || !strings.HasPrefix(string(hc.PidMode), "container:") { // only sandbox-attached PID modes are overridden; "host" and "" pass through untouched
+		return
+	}
+	if disableSharedPID || version == nil || version.LT(semver.Version{Major: 1, Minor: 26}) { // a nil (unknown) API version is treated like a too-old daemon instead of panicking
+		hc.PidMode = ""
+	}
+}
diff --git a/pkg/kubelet/dockershim/security_context_test.go b/pkg/kubelet/dockershim/security_context_test.go index 9b89b46703f..1074ee411a2 100644 --- a/pkg/kubelet/dockershim/security_context_test.go +++ b/pkg/kubelet/dockershim/security_context_test.go @@ -21,6 +21,7 @@ import ( "strconv" "testing" + "github.com/blang/semver" dockercontainer "github.com/docker/engine-api/types/container" "github.com/stretchr/testify/assert" @@ -172,12 +173,14 @@ func TestModifyHostConfigAndNamespaceOptionsForContainer(t *testing.T) { Privileged: true, IpcMode: dockercontainer.IpcMode(sandboxNSMode), NetworkMode: dockercontainer.NetworkMode(sandboxNSMode), + PidMode: dockercontainer.PidMode(sandboxNSMode), } setCapsHC := &dockercontainer.HostConfig{ CapAdd: []string{"addCapA", "addCapB"}, CapDrop: []string{"dropCapA", "dropCapB"}, IpcMode: dockercontainer.IpcMode(sandboxNSMode), NetworkMode: dockercontainer.NetworkMode(sandboxNSMode), + PidMode: dockercontainer.PidMode(sandboxNSMode), } setSELinuxHC := &dockercontainer.HostConfig{ SecurityOpt: []string{ @@ -188,6 +191,7 @@ func TestModifyHostConfigAndNamespaceOptionsForContainer(t *testing.T) { }, IpcMode: dockercontainer.IpcMode(sandboxNSMode), NetworkMode: dockercontainer.NetworkMode(sandboxNSMode), + PidMode: dockercontainer.PidMode(sandboxNSMode), } cases := []struct { @@ -286,6 +290,7 @@ func TestModifyContainerNamespaceOptions(t *testing.T) { NetworkMode: dockercontainer.NetworkMode(sandboxNSMode), IpcMode: dockercontainer.IpcMode(sandboxNSMode), UTSMode: namespaceModeHost, + PidMode: dockercontainer.PidMode(sandboxNSMode), }, }, { @@ -296,6 +301,7 @@ func TestModifyContainerNamespaceOptions(t *testing.T) { expected: 
&dockercontainer.HostConfig{ NetworkMode: dockercontainer.NetworkMode(sandboxNSMode), IpcMode: dockercontainer.IpcMode(sandboxNSMode), + PidMode: dockercontainer.PidMode(sandboxNSMode), }, }, { @@ -317,6 +323,63 @@ func TestModifyContainerNamespaceOptions(t *testing.T) { } }
+func TestModifyContainerNamespacePIDOverride(t *testing.T) {
+	tests := []struct {
+		name             string
+		disableSharedPID bool
+		apiVersion       *semver.Version
+		pidMode, want    dockercontainer.PidMode
+	}{
+		{ // sharing enabled on a new-enough API: the sandbox PID mode is kept
+			name:             "SharedPID.Enable",
+			disableSharedPID: false,
+			apiVersion:       &semver.Version{Major: 1, Minor: 26},
+			pidMode:          "container:sandbox",
+			want:             "container:sandbox",
+		},
+		{ // escape hatch set: the sandbox PID mode is cleared
+			name:             "SharedPID.Disable",
+			disableSharedPID: true,
+			apiVersion:       &semver.Version{Major: 1, Minor: 26},
+			pidMode:          "container:sandbox",
+			want:             "",
+		},
+		{ // API version below 1.26: the sandbox PID mode is cleared
+			name:             "SharedPID.OldDocker",
+			disableSharedPID: false,
+			apiVersion:       &semver.Version{Major: 1, Minor: 25},
+			pidMode:          "container:sandbox",
+			want:             "",
+		},
+		{ // host PID mode is left alone even with the escape hatch set
+			name:             "SharedPID.HostPid",
+			disableSharedPID: true,
+			apiVersion:       &semver.Version{Major: 1, Minor: 27},
+			pidMode:          "host",
+			want:             "host",
+		},
+		{ // a later major version still compares as new enough
+			name:             "SharedPID.DistantFuture",
+			disableSharedPID: false,
+			apiVersion:       &semver.Version{Major: 2, Minor: 10},
+			pidMode:          "container:sandbox",
+			want:             "container:sandbox",
+		},
+		{ // an empty PID mode stays empty
+			name:             "SharedPID.EmptyPidMode",
+			disableSharedPID: true,
+			apiVersion:       &semver.Version{Major: 1, Minor: 25},
+			pidMode:          "",
+			want:             "",
+		},
+	}
+	for _, tt := range tests {
+		hostCfg := &dockercontainer.HostConfig{PidMode: tt.pidMode}
+		modifyPIDNamespaceOverrides(tt.disableSharedPID, tt.apiVersion, hostCfg)
+		assert.Equal(t, tt.want, hostCfg.PidMode, "[Test case %q]", tt.name)
+	}
+}
+
func fullValidSecurityContext() *runtimeapi.LinuxContainerSecurityContext { return &runtimeapi.LinuxContainerSecurityContext{ Privileged: true, diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index e4c3e062b81..f60e5e89db7 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -553,7 +553,8 @@ func NewMainKubelet(kubeCfg 
*componentconfig.KubeletConfiguration, kubeDeps *Kub // Create and start the CRI shim running as a grpc server. streamingConfig := getStreamingConfig(kubeCfg, kubeDeps) ds, err := dockershim.NewDockerService(klet.dockerClient, kubeCfg.SeccompProfileRoot, kubeCfg.PodInfraContainerImage, - streamingConfig, &pluginSettings, kubeCfg.RuntimeCgroups, kubeCfg.CgroupDriver, dockerExecHandler, dockershimRootDir) + streamingConfig, &pluginSettings, kubeCfg.RuntimeCgroups, kubeCfg.CgroupDriver, dockerExecHandler, dockershimRootDir, + !kubeCfg.DockerEnableSharedPID) if err != nil { return nil, err }