From c7f52b34f34cc1f0baa557a877d606720cec8cec Mon Sep 17 00:00:00 2001 From: Akihiro Suda Date: Sat, 9 Mar 2024 09:48:17 +0900 Subject: [PATCH] kubelet: KEP-3857: Recursive Read-only (RRO) mounts See . An example manifest: ```yaml apiVersion: v1 kind: Pod metadata: name: rro spec: volumes: - name: mnt hostPath: # tmpfs is mounted on /mnt/tmpfs path: /mnt containers: - name: busybox image: busybox args: ["sleep", "infinity"] volumeMounts: # /mnt-rro/tmpfs is not writable - name: mnt mountPath: /mnt-rro readOnly: true mountPropagation: None recursiveReadOnly: IfPossible # /mnt-ro/tmpfs is writable - name: mnt mountPath: /mnt-ro readOnly: true # /mnt-rw/tmpfs is writable - name: mnt mountPath: /mnt-rw ``` Requirements: - Feature gate "RecursiveReadOnlyMounts" to be enabled - Linux kernel >= 5.12 - runc >= 1.1 Signed-off-by: Akihiro Suda --- pkg/kubelet/container/runtime.go | 3 + pkg/kubelet/kubelet_pods.go | 85 ++++++++++-- pkg/kubelet/kubelet_pods_linux_test.go | 3 +- pkg/kubelet/kubelet_pods_test.go | 124 ++++++++++++++++++ pkg/kubelet/kubelet_pods_windows_test.go | 2 +- .../kuberuntime/kuberuntime_container.go | 11 +- 6 files changed, 213 insertions(+), 15 deletions(-) diff --git a/pkg/kubelet/container/runtime.go b/pkg/kubelet/container/runtime.go index e04b5fa192f..b53214ca017 100644 --- a/pkg/kubelet/container/runtime.go +++ b/pkg/kubelet/container/runtime.go @@ -440,6 +440,9 @@ type Mount struct { HostPath string // Whether the mount is read-only. ReadOnly bool + // Whether the mount is recursive read-only. + // Must not be true if ReadOnly is false. + RecursiveReadOnly bool // Whether the mount needs SELinux relabeling SELinuxRelabel bool // Requested propagation mode diff --git a/pkg/kubelet/kubelet_pods.go b/pkg/kubelet/kubelet_pods.go index b35642f3310..b8804b9b3cb 100644 --- a/pkg/kubelet/kubelet_pods.go +++ b/pkg/kubelet/kubelet_pods.go @@ -242,7 +242,7 @@ func shouldMountHostsFile(pod *v1.Pod, podIPs []string) bool { } // makeMounts determines the mount points for the given container. -func makeMounts(pod *v1.Pod, podDir string, container *v1.Container, hostName, hostDomain string, podIPs []string, podVolumes kubecontainer.VolumeMap, hu hostutil.HostUtils, subpather subpath.Interface, expandEnvs []kubecontainer.EnvVar) ([]kubecontainer.Mount, func(), error) { +func makeMounts(pod *v1.Pod, podDir string, container *v1.Container, hostName, hostDomain string, podIPs []string, podVolumes kubecontainer.VolumeMap, hu hostutil.HostUtils, subpather subpath.Interface, expandEnvs []kubecontainer.EnvVar, supportsRRO bool) ([]kubecontainer.Mount, func(), error) { mountEtcHostsFile := shouldMountHostsFile(pod, podIPs) klog.V(3).InfoS("Creating hosts mount for container", "pod", klog.KObj(pod), "containerName", container.Name, "podIPs", podIPs, "path", mountEtcHostsFile) mounts := []kubecontainer.Mount{} @@ -343,13 +343,22 @@ func makeMounts(pod *v1.Pod, podDir string, container *v1.Container, hostName, h klog.V(5).InfoS("Mount has propagation", "pod", klog.KObj(pod), "containerName", container.Name, "volumeMountName", mount.Name, "propagation", propagation) mustMountRO := vol.Mounter.GetAttributes().ReadOnly + rro, err := resolveRecursiveReadOnly(mount, supportsRRO) + if err != nil { + return nil, cleanupAction, fmt.Errorf("failed to resolve recursive read-only mode: %w", err) + } + if rro && !utilfeature.DefaultFeatureGate.Enabled(features.RecursiveReadOnlyMounts) { + return nil, cleanupAction, fmt.Errorf("recursive read-only mount needs feature gate %q to be enabled", features.RecursiveReadOnlyMounts) + } + mounts = append(mounts, kubecontainer.Mount{ - Name: mount.Name, - ContainerPath: containerPath, - HostPath: hostPath, - ReadOnly: mount.ReadOnly || mustMountRO, - SELinuxRelabel: relabelVolume, - Propagation: propagation, + Name: mount.Name, + ContainerPath: containerPath, + HostPath: hostPath, + ReadOnly: mount.ReadOnly || mustMountRO, + RecursiveReadOnly: rro, + SELinuxRelabel: relabelVolume, + Propagation: propagation, }) } if mountEtcHostsFile { @@ -554,6 +563,8 @@ func (kl *Kubelet) GetPodCgroupParent(pod *v1.Pod) string { // GenerateRunContainerOptions generates the RunContainerOptions, which can be used by // the container runtime to set parameters for launching a container. func (kl *Kubelet) GenerateRunContainerOptions(ctx context.Context, pod *v1.Pod, container *v1.Container, podIP string, podIPs []string) (*kubecontainer.RunContainerOptions, func(), error) { + supportsRRO := kl.runtimeClassSupportsRecursiveReadOnlyMounts(pod) + opts, err := kl.containerManager.GetResources(pod, container) if err != nil { return nil, nil, err @@ -587,7 +598,7 @@ func (kl *Kubelet) GenerateRunContainerOptions(ctx context.Context, pod *v1.Pod, opts.Envs = append(opts.Envs, envs...) // only podIPs is sent to makeMounts, as podIPs is populated even if dual-stack feature flag is not enabled. - mounts, cleanupAction, err := makeMounts(pod, kl.getPodDir(pod.UID), container, hostname, hostDomainName, podIPs, volumes, kl.hostutil, kl.subpather, opts.Envs) + mounts, cleanupAction, err := makeMounts(pod, kl.getPodDir(pod.UID), container, hostname, hostDomainName, podIPs, volumes, kl.hostutil, kl.subpather, opts.Envs, supportsRRO) if err != nil { return nil, cleanupAction, err } @@ -2114,6 +2125,8 @@ func (kl *Kubelet) convertToAPIContainerStatuses(pod *v1.Pod, podStatus *kubecon defaultWaitingState = v1.ContainerState{Waiting: &v1.ContainerStateWaiting{Reason: PodInitializing}} } + supportsRRO := kl.runtimeClassSupportsRecursiveReadOnlyMounts(pod) + for _, container := range containers { status := &v1.ContainerStatus{ Name: container.Name, @@ -2131,6 +2144,16 @@ func (kl *Kubelet) convertToAPIContainerStatuses(pod *v1.Pod, podStatus *kubecon } if vol.ReadOnly { rroMode := v1.RecursiveReadOnlyDisabled + if b, err := resolveRecursiveReadOnly(vol, supportsRRO); err != nil { + klog.ErrorS(err, "failed to resolve recursive read-only mode", "mode", *vol.RecursiveReadOnly) + } else if b { + if utilfeature.DefaultFeatureGate.Enabled(features.RecursiveReadOnlyMounts) { + rroMode = v1.RecursiveReadOnlyEnabled + } else { + klog.ErrorS(nil, "recursive read-only mount needs feature gate to be enabled", + "featureGate", features.RecursiveReadOnlyMounts) + } + } volStatus.RecursiveReadOnly = &rroMode // Disabled or Enabled } status.VolumeMounts = append(status.VolumeMounts, volStatus) @@ -2420,3 +2443,49 @@ func (kl *Kubelet) cleanupOrphanedPodCgroups(pcm cm.PodContainerManager, cgroupP go pcm.Destroy(val) } } + +func (kl *Kubelet) runtimeClassSupportsRecursiveReadOnlyMounts(pod *v1.Pod) bool { + var runtimeClassName string + if pod.Spec.RuntimeClassName != nil { + runtimeClassName = *pod.Spec.RuntimeClassName + } + runtimeHandlers := kl.runtimeState.runtimeHandlers() + return runtimeClassSupportsRecursiveReadOnlyMounts(runtimeClassName, runtimeHandlers) +} + +// runtimeClassSupportsRecursiveReadOnlyMounts checks whether the runtime class supports recursive read-only mounts. +// The kubelet feature gate is not checked here. +func runtimeClassSupportsRecursiveReadOnlyMounts(runtimeClassName string, runtimeHandlers []kubecontainer.RuntimeHandler) bool { + for _, h := range runtimeHandlers { + if h.Name == runtimeClassName { + return h.SupportsRecursiveReadOnlyMounts + } + } + klog.ErrorS(nil, "unknown runtime class", "runtimeClassName", runtimeClassName) + return false +} + +// resolveRecursiveReadOnly resolves the recursive read-only mount mode. +func resolveRecursiveReadOnly(m v1.VolumeMount, runtimeSupportsRRO bool) (bool, error) { + if m.RecursiveReadOnly == nil || *m.RecursiveReadOnly == v1.RecursiveReadOnlyDisabled { + return false, nil + } + if !m.ReadOnly { + return false, fmt.Errorf("volume %q requested recursive read-only mode, but it is not read-only", m.Name) + } + if m.MountPropagation != nil && *m.MountPropagation != v1.MountPropagationNone { + return false, fmt.Errorf("volume %q requested recursive read-only mode, but it is not compatible with propagation %q", + m.Name, *m.MountPropagation) + } + switch rroMode := *m.RecursiveReadOnly; rroMode { + case v1.RecursiveReadOnlyIfPossible: + return runtimeSupportsRRO, nil + case v1.RecursiveReadOnlyEnabled: + if !runtimeSupportsRRO { + return false, fmt.Errorf("volume %q requested recursive read-only mode, but it is not supported by the runtime", m.Name) + } + return true, nil + default: + return false, fmt.Errorf("unknown recursive read-only mode %q", rroMode) + } +} diff --git a/pkg/kubelet/kubelet_pods_linux_test.go b/pkg/kubelet/kubelet_pods_linux_test.go index 51730c36aed..4d6765afb46 100644 --- a/pkg/kubelet/kubelet_pods_linux_test.go +++ b/pkg/kubelet/kubelet_pods_linux_test.go @@ -42,6 +42,7 @@ func TestMakeMounts(t *testing.T) { testCases := map[string]struct { container v1.Container podVolumes kubecontainer.VolumeMap + supportsRRO bool expectErr bool expectedErrMsg string expectedMounts []kubecontainer.Mount @@ -250,7 +251,7 @@ func TestMakeMounts(t *testing.T) { }, } - mounts, _, err := makeMounts(&pod, "/pod", &tc.container, "fakepodname", "", []string{""}, tc.podVolumes, fhu, fsp, nil) + mounts, _, err := makeMounts(&pod, "/pod", &tc.container, "fakepodname", "", []string{""}, tc.podVolumes, fhu, fsp, nil, tc.supportsRRO) // validate only the error if we expect an error if tc.expectErr { diff --git a/pkg/kubelet/kubelet_pods_test.go b/pkg/kubelet/kubelet_pods_test.go index 73fe56d23d5..589a53a4141 100644 --- a/pkg/kubelet/kubelet_pods_test.go +++ b/pkg/kubelet/kubelet_pods_test.go @@ -58,6 +58,7 @@ import ( "k8s.io/kubernetes/pkg/kubelet/status" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" netutils "k8s.io/utils/net" + "k8s.io/utils/ptr" ) var containerRestartPolicyAlways = v1.ContainerRestartPolicyAlways @@ -6075,3 +6076,126 @@ func TestParseGetSubIdsOutput(t *testing.T) { }) } } + +func TestResolveRecursiveReadOnly(t *testing.T) { + testCases := []struct { + m v1.VolumeMount + runtimeSupportsRRO bool + expected bool + expectedErr string + }{ + { + m: v1.VolumeMount{Name: "rw"}, + runtimeSupportsRRO: true, + expected: false, + expectedErr: "", + }, + { + m: v1.VolumeMount{Name: "ro", ReadOnly: true}, + runtimeSupportsRRO: true, + expected: false, + expectedErr: "", + }, + { + m: v1.VolumeMount{Name: "ro", ReadOnly: true, RecursiveReadOnly: ptr.To(v1.RecursiveReadOnlyDisabled)}, + runtimeSupportsRRO: true, + expected: false, + expectedErr: "", + }, + { + m: v1.VolumeMount{Name: "rro-if-possible", ReadOnly: true, RecursiveReadOnly: ptr.To(v1.RecursiveReadOnlyIfPossible)}, + runtimeSupportsRRO: true, + expected: true, + expectedErr: "", + }, + { + m: v1.VolumeMount{Name: "rro-if-possible", ReadOnly: true, RecursiveReadOnly: ptr.To(v1.RecursiveReadOnlyIfPossible), + MountPropagation: ptr.To(v1.MountPropagationNone)}, + runtimeSupportsRRO: true, + expected: true, + expectedErr: "", + }, + { + m: v1.VolumeMount{Name: "rro-if-possible", ReadOnly: true, RecursiveReadOnly: ptr.To(v1.RecursiveReadOnlyIfPossible), + MountPropagation: ptr.To(v1.MountPropagationHostToContainer)}, + runtimeSupportsRRO: true, + expected: false, + expectedErr: "not compatible with propagation", + }, + { + m: v1.VolumeMount{Name: "rro-if-possible", ReadOnly: true, RecursiveReadOnly: ptr.To(v1.RecursiveReadOnlyIfPossible), + MountPropagation: ptr.To(v1.MountPropagationBidirectional)}, + runtimeSupportsRRO: true, + expected: false, + expectedErr: "not compatible with propagation", + }, + { + m: v1.VolumeMount{Name: "rro-if-possible", ReadOnly: false, RecursiveReadOnly: ptr.To(v1.RecursiveReadOnlyIfPossible)}, + runtimeSupportsRRO: true, + expected: false, + expectedErr: "not read-only", + }, + { + m: v1.VolumeMount{Name: "rro-if-possible", ReadOnly: false, RecursiveReadOnly: ptr.To(v1.RecursiveReadOnlyIfPossible)}, + runtimeSupportsRRO: false, + expected: false, + expectedErr: "not read-only", + }, + { + m: v1.VolumeMount{Name: "rro", ReadOnly: true, RecursiveReadOnly: ptr.To(v1.RecursiveReadOnlyEnabled)}, + runtimeSupportsRRO: true, + expected: true, + expectedErr: "", + }, + { + m: v1.VolumeMount{Name: "rro", ReadOnly: true, RecursiveReadOnly: ptr.To(v1.RecursiveReadOnlyEnabled), + MountPropagation: ptr.To(v1.MountPropagationNone)}, + runtimeSupportsRRO: true, + expected: true, + expectedErr: "", + }, + { + m: v1.VolumeMount{Name: "rro", ReadOnly: true, RecursiveReadOnly: ptr.To(v1.RecursiveReadOnlyEnabled), + MountPropagation: ptr.To(v1.MountPropagationHostToContainer)}, + runtimeSupportsRRO: true, + expected: false, + expectedErr: "not compatible with propagation", + }, + { + m: v1.VolumeMount{Name: "rro", ReadOnly: true, RecursiveReadOnly: ptr.To(v1.RecursiveReadOnlyEnabled), + MountPropagation: ptr.To(v1.MountPropagationBidirectional)}, + runtimeSupportsRRO: true, + expected: false, + expectedErr: "not compatible with propagation", + }, + { + m: v1.VolumeMount{Name: "rro", RecursiveReadOnly: ptr.To(v1.RecursiveReadOnlyEnabled)}, + runtimeSupportsRRO: true, + expected: false, + expectedErr: "not read-only", + }, + { + m: v1.VolumeMount{Name: "rro", ReadOnly: true, RecursiveReadOnly: ptr.To(v1.RecursiveReadOnlyEnabled)}, + runtimeSupportsRRO: false, + expected: false, + expectedErr: "not supported by the runtime", + }, + { + m: v1.VolumeMount{Name: "invalid", ReadOnly: true, RecursiveReadOnly: ptr.To(v1.RecursiveReadOnlyMode("foo"))}, + runtimeSupportsRRO: true, + expected: false, + expectedErr: "unknown recursive read-only mode", + }, + } + + for _, tc := range testCases { + got, err := resolveRecursiveReadOnly(tc.m, tc.runtimeSupportsRRO) + t.Logf("resolveRecursiveReadOnly(%+v, %v) = (%v, %v)", tc.m, tc.runtimeSupportsRRO, got, err) + if tc.expectedErr == "" { + assert.Equal(t, tc.expected, got) + assert.NoError(t, err) + } else { + assert.ErrorContains(t, err, tc.expectedErr) + } + } +} diff --git a/pkg/kubelet/kubelet_pods_windows_test.go b/pkg/kubelet/kubelet_pods_windows_test.go index 29a38bb0265..ee57ce02e1e 100644 --- a/pkg/kubelet/kubelet_pods_windows_test.go +++ b/pkg/kubelet/kubelet_pods_windows_test.go @@ -90,7 +90,7 @@ func TestMakeMountsWindows(t *testing.T) { podDir, err := os.MkdirTemp("", "test-rotate-logs") require.NoError(t, err) defer os.RemoveAll(podDir) - mounts, _, err := makeMounts(&pod, podDir, &container, "fakepodname", "", []string{""}, podVolumes, fhu, fsp, nil) + mounts, _, err := makeMounts(&pod, podDir, &container, "fakepodname", "", []string{""}, podVolumes, fhu, fsp, nil, false) require.NoError(t, err) expectedMounts := []kubecontainer.Mount{ diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container.go b/pkg/kubelet/kuberuntime/kuberuntime_container.go index 9b7a2a04f1d..3b534f1536f 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container.go @@ -430,11 +430,12 @@ func (m *kubeGenericRuntimeManager) makeMounts(opts *kubecontainer.RunContainerO v := opts.Mounts[idx] selinuxRelabel := v.SELinuxRelabel && selinux.GetEnabled() mount := &runtimeapi.Mount{ - HostPath: v.HostPath, - ContainerPath: v.ContainerPath, - Readonly: v.ReadOnly, - SelinuxRelabel: selinuxRelabel, - Propagation: v.Propagation, + HostPath: v.HostPath, + ContainerPath: v.ContainerPath, + Readonly: v.ReadOnly, + SelinuxRelabel: selinuxRelabel, + Propagation: v.Propagation, + RecursiveReadOnly: v.RecursiveReadOnly, } volumeMounts = append(volumeMounts, mount)