Merge pull request #111090 from kinvolk/rata/userns-support-2022

Add support for user namespaces phase 1 (KEP 127)
This commit is contained in:
Kubernetes Prow Robot 2022-08-03 13:05:47 -07:00 committed by GitHub
commit 4b6134b6dc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
104 changed files with 2763 additions and 947 deletions

View File

@ -7824,6 +7824,10 @@
"description": "Use the host's pid namespace. Optional: Default to false.",
"type": "boolean"
},
"hostUsers": {
"description": "Use the host's user namespace. Optional: Default to true. If set to true or not present, the pod will be run in the host user namespace, useful for when the pod needs a feature only available to the host user namespace, such as loading a kernel module with CAP_SYS_MODULE. When set to false, a new userns is created for the pod. Setting false is useful for mitigating container breakout vulnerabilities even allowing users to run their containers as root without actually having root privileges on the host. This field is alpha-level and is only honored by servers that enable the UserNamespacesStatelessPodsSupport feature.",
"type": "boolean"
},
"hostname": {
"description": "Specifies the hostname of the Pod If not specified, the pod's hostname will be set to a system-defined value.",
"type": "string"
@ -7860,7 +7864,7 @@
},
"os": {
"$ref": "#/definitions/io.k8s.api.core.v1.PodOS",
"description": "Specifies the OS of the containers in the pod. Some pod and container fields are restricted if this is set.\n\nIf the OS field is set to linux, the following fields must be unset: -securityContext.windowsOptions\n\nIf the OS field is set to windows, following fields must be unset: - spec.hostPID - spec.hostIPC - spec.securityContext.seLinuxOptions - spec.securityContext.seccompProfile - spec.securityContext.fsGroup - spec.securityContext.fsGroupChangePolicy - spec.securityContext.sysctls - spec.shareProcessNamespace - spec.securityContext.runAsUser - spec.securityContext.runAsGroup - spec.securityContext.supplementalGroups - spec.containers[*].securityContext.seLinuxOptions - spec.containers[*].securityContext.seccompProfile - spec.containers[*].securityContext.capabilities - spec.containers[*].securityContext.readOnlyRootFilesystem - spec.containers[*].securityContext.privileged - spec.containers[*].securityContext.allowPrivilegeEscalation - spec.containers[*].securityContext.procMount - spec.containers[*].securityContext.runAsUser - spec.containers[*].securityContext.runAsGroup"
"description": "Specifies the OS of the containers in the pod. Some pod and container fields are restricted if this is set.\n\nIf the OS field is set to linux, the following fields must be unset: -securityContext.windowsOptions\n\nIf the OS field is set to windows, following fields must be unset: - spec.hostPID - spec.hostIPC - spec.hostUsers - spec.securityContext.seLinuxOptions - spec.securityContext.seccompProfile - spec.securityContext.fsGroup - spec.securityContext.fsGroupChangePolicy - spec.securityContext.sysctls - spec.shareProcessNamespace - spec.securityContext.runAsUser - spec.securityContext.runAsGroup - spec.securityContext.supplementalGroups - spec.containers[*].securityContext.seLinuxOptions - spec.containers[*].securityContext.seccompProfile - spec.containers[*].securityContext.capabilities - spec.containers[*].securityContext.readOnlyRootFilesystem - spec.containers[*].securityContext.privileged - spec.containers[*].securityContext.allowPrivilegeEscalation - spec.containers[*].securityContext.procMount - spec.containers[*].securityContext.runAsUser - spec.containers[*].securityContext.runAsGroup"
},
"overhead": {
"additionalProperties": {

View File

@ -5032,6 +5032,10 @@
"description": "Use the host's pid namespace. Optional: Default to false.",
"type": "boolean"
},
"hostUsers": {
"description": "Use the host's user namespace. Optional: Default to true. If set to true or not present, the pod will be run in the host user namespace, useful for when the pod needs a feature only available to the host user namespace, such as loading a kernel module with CAP_SYS_MODULE. When set to false, a new userns is created for the pod. Setting false is useful for mitigating container breakout vulnerabilities even allowing users to run their containers as root without actually having root privileges on the host. This field is alpha-level and is only honored by servers that enable the UserNamespacesStatelessPodsSupport feature.",
"type": "boolean"
},
"hostname": {
"description": "Specifies the hostname of the Pod If not specified, the pod's hostname will be set to a system-defined value.",
"type": "string"
@ -5083,7 +5087,7 @@
"$ref": "#/components/schemas/io.k8s.api.core.v1.PodOS"
}
],
"description": "Specifies the OS of the containers in the pod. Some pod and container fields are restricted if this is set.\n\nIf the OS field is set to linux, the following fields must be unset: -securityContext.windowsOptions\n\nIf the OS field is set to windows, following fields must be unset: - spec.hostPID - spec.hostIPC - spec.securityContext.seLinuxOptions - spec.securityContext.seccompProfile - spec.securityContext.fsGroup - spec.securityContext.fsGroupChangePolicy - spec.securityContext.sysctls - spec.shareProcessNamespace - spec.securityContext.runAsUser - spec.securityContext.runAsGroup - spec.securityContext.supplementalGroups - spec.containers[*].securityContext.seLinuxOptions - spec.containers[*].securityContext.seccompProfile - spec.containers[*].securityContext.capabilities - spec.containers[*].securityContext.readOnlyRootFilesystem - spec.containers[*].securityContext.privileged - spec.containers[*].securityContext.allowPrivilegeEscalation - spec.containers[*].securityContext.procMount - spec.containers[*].securityContext.runAsUser - spec.containers[*].securityContext.runAsGroup"
"description": "Specifies the OS of the containers in the pod. Some pod and container fields are restricted if this is set.\n\nIf the OS field is set to linux, the following fields must be unset: -securityContext.windowsOptions\n\nIf the OS field is set to windows, following fields must be unset: - spec.hostPID - spec.hostIPC - spec.hostUsers - spec.securityContext.seLinuxOptions - spec.securityContext.seccompProfile - spec.securityContext.fsGroup - spec.securityContext.fsGroupChangePolicy - spec.securityContext.sysctls - spec.shareProcessNamespace - spec.securityContext.runAsUser - spec.securityContext.runAsGroup - spec.securityContext.supplementalGroups - spec.containers[*].securityContext.seLinuxOptions - spec.containers[*].securityContext.seccompProfile - spec.containers[*].securityContext.capabilities - spec.containers[*].securityContext.readOnlyRootFilesystem - spec.containers[*].securityContext.privileged - spec.containers[*].securityContext.allowPrivilegeEscalation - spec.containers[*].securityContext.procMount - spec.containers[*].securityContext.runAsUser - spec.containers[*].securityContext.runAsGroup"
},
"overhead": {
"additionalProperties": {

View File

@ -3455,6 +3455,10 @@
"description": "Use the host's pid namespace. Optional: Default to false.",
"type": "boolean"
},
"hostUsers": {
"description": "Use the host's user namespace. Optional: Default to true. If set to true or not present, the pod will be run in the host user namespace, useful for when the pod needs a feature only available to the host user namespace, such as loading a kernel module with CAP_SYS_MODULE. When set to false, a new userns is created for the pod. Setting false is useful for mitigating container breakout vulnerabilities even allowing users to run their containers as root without actually having root privileges on the host. This field is alpha-level and is only honored by servers that enable the UserNamespacesStatelessPodsSupport feature.",
"type": "boolean"
},
"hostname": {
"description": "Specifies the hostname of the Pod If not specified, the pod's hostname will be set to a system-defined value.",
"type": "string"
@ -3506,7 +3510,7 @@
"$ref": "#/components/schemas/io.k8s.api.core.v1.PodOS"
}
],
"description": "Specifies the OS of the containers in the pod. Some pod and container fields are restricted if this is set.\n\nIf the OS field is set to linux, the following fields must be unset: -securityContext.windowsOptions\n\nIf the OS field is set to windows, following fields must be unset: - spec.hostPID - spec.hostIPC - spec.securityContext.seLinuxOptions - spec.securityContext.seccompProfile - spec.securityContext.fsGroup - spec.securityContext.fsGroupChangePolicy - spec.securityContext.sysctls - spec.shareProcessNamespace - spec.securityContext.runAsUser - spec.securityContext.runAsGroup - spec.securityContext.supplementalGroups - spec.containers[*].securityContext.seLinuxOptions - spec.containers[*].securityContext.seccompProfile - spec.containers[*].securityContext.capabilities - spec.containers[*].securityContext.readOnlyRootFilesystem - spec.containers[*].securityContext.privileged - spec.containers[*].securityContext.allowPrivilegeEscalation - spec.containers[*].securityContext.procMount - spec.containers[*].securityContext.runAsUser - spec.containers[*].securityContext.runAsGroup"
"description": "Specifies the OS of the containers in the pod. Some pod and container fields are restricted if this is set.\n\nIf the OS field is set to linux, the following fields must be unset: -securityContext.windowsOptions\n\nIf the OS field is set to windows, following fields must be unset: - spec.hostPID - spec.hostIPC - spec.hostUsers - spec.securityContext.seLinuxOptions - spec.securityContext.seccompProfile - spec.securityContext.fsGroup - spec.securityContext.fsGroupChangePolicy - spec.securityContext.sysctls - spec.shareProcessNamespace - spec.securityContext.runAsUser - spec.securityContext.runAsGroup - spec.securityContext.supplementalGroups - spec.containers[*].securityContext.seLinuxOptions - spec.containers[*].securityContext.seccompProfile - spec.containers[*].securityContext.capabilities - spec.containers[*].securityContext.readOnlyRootFilesystem - spec.containers[*].securityContext.privileged - spec.containers[*].securityContext.allowPrivilegeEscalation - spec.containers[*].securityContext.procMount - spec.containers[*].securityContext.runAsUser - spec.containers[*].securityContext.runAsGroup"
},
"overhead": {
"additionalProperties": {

View File

@ -2534,6 +2534,10 @@
"description": "Use the host's pid namespace. Optional: Default to false.",
"type": "boolean"
},
"hostUsers": {
"description": "Use the host's user namespace. Optional: Default to true. If set to true or not present, the pod will be run in the host user namespace, useful for when the pod needs a feature only available to the host user namespace, such as loading a kernel module with CAP_SYS_MODULE. When set to false, a new userns is created for the pod. Setting false is useful for mitigating container breakout vulnerabilities even allowing users to run their containers as root without actually having root privileges on the host. This field is alpha-level and is only honored by servers that enable the UserNamespacesStatelessPodsSupport feature.",
"type": "boolean"
},
"hostname": {
"description": "Specifies the hostname of the Pod If not specified, the pod's hostname will be set to a system-defined value.",
"type": "string"
@ -2585,7 +2589,7 @@
"$ref": "#/components/schemas/io.k8s.api.core.v1.PodOS"
}
],
"description": "Specifies the OS of the containers in the pod. Some pod and container fields are restricted if this is set.\n\nIf the OS field is set to linux, the following fields must be unset: -securityContext.windowsOptions\n\nIf the OS field is set to windows, following fields must be unset: - spec.hostPID - spec.hostIPC - spec.securityContext.seLinuxOptions - spec.securityContext.seccompProfile - spec.securityContext.fsGroup - spec.securityContext.fsGroupChangePolicy - spec.securityContext.sysctls - spec.shareProcessNamespace - spec.securityContext.runAsUser - spec.securityContext.runAsGroup - spec.securityContext.supplementalGroups - spec.containers[*].securityContext.seLinuxOptions - spec.containers[*].securityContext.seccompProfile - spec.containers[*].securityContext.capabilities - spec.containers[*].securityContext.readOnlyRootFilesystem - spec.containers[*].securityContext.privileged - spec.containers[*].securityContext.allowPrivilegeEscalation - spec.containers[*].securityContext.procMount - spec.containers[*].securityContext.runAsUser - spec.containers[*].securityContext.runAsGroup"
"description": "Specifies the OS of the containers in the pod. Some pod and container fields are restricted if this is set.\n\nIf the OS field is set to linux, the following fields must be unset: -securityContext.windowsOptions\n\nIf the OS field is set to windows, following fields must be unset: - spec.hostPID - spec.hostIPC - spec.hostUsers - spec.securityContext.seLinuxOptions - spec.securityContext.seccompProfile - spec.securityContext.fsGroup - spec.securityContext.fsGroupChangePolicy - spec.securityContext.sysctls - spec.shareProcessNamespace - spec.securityContext.runAsUser - spec.securityContext.runAsGroup - spec.securityContext.supplementalGroups - spec.containers[*].securityContext.seLinuxOptions - spec.containers[*].securityContext.seccompProfile - spec.containers[*].securityContext.capabilities - spec.containers[*].securityContext.readOnlyRootFilesystem - spec.containers[*].securityContext.privileged - spec.containers[*].securityContext.allowPrivilegeEscalation - spec.containers[*].securityContext.procMount - spec.containers[*].securityContext.runAsUser - spec.containers[*].securityContext.runAsGroup"
},
"overhead": {
"additionalProperties": {

View File

@ -539,6 +539,15 @@ func dropDisabledFields(
})
}
// If the feature is disabled and not in use, drop the hostUsers field.
if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesStatelessPodsSupport) && !hostUsersInUse(oldPodSpec) {
// Drop the field in podSpec only if SecurityContext is not nil.
// If it is nil, there is no need to set hostUsers=nil (it will be nil too).
if podSpec.SecurityContext != nil {
podSpec.SecurityContext.HostUsers = nil
}
}
dropDisabledProcMountField(podSpec, oldPodSpec)
dropDisabledCSIVolumeSourceAlphaFields(podSpec, oldPodSpec)
@ -672,6 +681,15 @@ func nodeTaintsPolicyInUse(podSpec *api.PodSpec) bool {
return false
}
// hostUsersInUse returns true if the pod spec has spec.hostUsers field set.
// hostUsersInUse reports whether podSpec explicitly sets the hostUsers field.
// A nil podSpec, or one without a SecurityContext, is treated as not using it.
func hostUsersInUse(podSpec *api.PodSpec) bool {
	if podSpec == nil || podSpec.SecurityContext == nil {
		return false
	}
	return podSpec.SecurityContext.HostUsers != nil
}
// procMountInUse returns true if the pod spec is non-nil and has a SecurityContext's ProcMount field set to a non-default value
func procMountInUse(podSpec *api.PodSpec) bool {
if podSpec == nil {

View File

@ -1949,3 +1949,100 @@ func TestDropDisabledMatchLabelKeysField(t *testing.T) {
})
}
}
// TestDropHostUsers runs DropDisabledPodFields for every combination of
// feature-gate state, old pod and new pod, and checks that hostUsers is only
// cleared from the new pod when the gate is off and the old pod did not set it.
func TestDropHostUsers(t *testing.T) {
	hostUsersFalse, hostUsersTrue := false, true

	// build returns a constructor for a pod whose security context carries the
	// given hostUsers pointer; nil leaves the field unset.
	build := func(hostUsers *bool) func() *api.Pod {
		return func() *api.Pod {
			return &api.Pod{
				Spec: api.PodSpec{
					SecurityContext: &api.PodSecurityContext{HostUsers: hostUsers},
				},
			}
		}
	}

	type variant struct {
		description  string
		hasHostUsers bool
		pod          func() *api.Pod
	}
	variants := []variant{
		{description: "with hostUsers=true", hasHostUsers: true, pod: build(&hostUsersTrue)},
		{description: "with hostUsers=false", hasHostUsers: true, pod: build(&hostUsersFalse)},
		{description: "with hostUsers=nil", pod: func() *api.Pod { return nil }},
	}

	for _, gateOn := range []bool{true, false} {
		for _, before := range variants {
			for _, after := range variants {
				oldPod, updated := before.pod(), after.pod()
				if updated == nil {
					// Combinations with a nil new pod are not exercised.
					continue
				}

				// The field is expected to be cleared only when the gate is off,
				// the old pod did not set it, and the new pod does.
				wantDropped := !gateOn && !before.hasHostUsers && after.hasHostUsers

				name := fmt.Sprintf("feature enabled=%v, old pod %v, new pod %v", gateOn, before.description, after.description)
				t.Run(name, func(t *testing.T) {
					defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.UserNamespacesStatelessPodsSupport, gateOn)()

					DropDisabledPodFields(updated, oldPod)

					// The old pod must never be mutated.
					if pristine := before.pod(); !reflect.DeepEqual(oldPod, pristine) {
						t.Errorf("old pod changed: %v", cmp.Diff(oldPod, pristine))
					}

					if !wantDropped {
						// Gate enabled, field already in use, or nothing to drop:
						// the new pod must come back untouched.
						if pristine := after.pod(); !reflect.DeepEqual(updated, pristine) {
							t.Errorf("new pod changed: %v", cmp.Diff(updated, pristine))
						}
						return
					}

					// The new pod must differ from its freshly built form...
					if reflect.DeepEqual(updated, after.pod()) {
						t.Errorf("new pod was not changed")
					}
					// ...and must now match a pod that never set hostUsers.
					if exp := build(nil)(); !reflect.DeepEqual(updated, exp) {
						t.Errorf("new pod had hostUsers: %v", cmp.Diff(updated, exp))
					}
				})
			}
		}
	}
}

View File

@ -2976,6 +2976,7 @@ type PodSpec struct {
// If the OS field is set to windows, following fields must be unset:
// - spec.hostPID
// - spec.hostIPC
// - spec.hostUsers
// - spec.securityContext.seLinuxOptions
// - spec.securityContext.seccompProfile
// - spec.securityContext.fsGroup
@ -3078,6 +3079,18 @@ type PodSecurityContext struct {
// +k8s:conversion-gen=false
// +optional
ShareProcessNamespace *bool
// Use the host's user namespace.
// Optional: Default to true.
// If set to true or not present, the pod will be run in the host user namespace, useful
// for when the pod needs a feature only available to the host user namespace, such as
// loading a kernel module with CAP_SYS_MODULE.
// When set to false, a new user namespace is created for the pod. Setting false is useful
// for mitigating container breakout vulnerabilities even allowing users to run their
// containers as root without actually having root privileges on the host.
// Note that this field cannot be set when spec.os.name is windows.
// +k8s:conversion-gen=false
// +optional
HostUsers *bool
// The SELinux context to be applied to all containers.
// If unspecified, the container runtime will allocate a random SELinux context for each
// container. May also be set in SecurityContext. If set in

View File

@ -303,6 +303,7 @@ func Convert_core_PodSpec_To_v1_PodSpec(in *core.PodSpec, out *v1.PodSpec, s con
out.HostNetwork = in.SecurityContext.HostNetwork
out.HostIPC = in.SecurityContext.HostIPC
out.ShareProcessNamespace = in.SecurityContext.ShareProcessNamespace
out.HostUsers = in.SecurityContext.HostUsers
}
return nil
@ -358,6 +359,7 @@ func Convert_v1_PodSpec_To_core_PodSpec(in *v1.PodSpec, out *core.PodSpec, s con
out.SecurityContext.HostPID = in.HostPID
out.SecurityContext.HostIPC = in.HostIPC
out.SecurityContext.ShareProcessNamespace = in.ShareProcessNamespace
out.SecurityContext.HostUsers = in.HostUsers
return nil
}

View File

@ -6093,6 +6093,7 @@ func autoConvert_core_PodSecurityContext_To_v1_PodSecurityContext(in *core.PodSe
// INFO: in.HostPID opted out of conversion generation
// INFO: in.HostIPC opted out of conversion generation
// INFO: in.ShareProcessNamespace opted out of conversion generation
// INFO: in.HostUsers opted out of conversion generation
out.SELinuxOptions = (*v1.SELinuxOptions)(unsafe.Pointer(in.SELinuxOptions))
out.WindowsOptions = (*v1.WindowsSecurityContextOptions)(unsafe.Pointer(in.WindowsOptions))
out.RunAsUser = (*int64)(unsafe.Pointer(in.RunAsUser))
@ -6186,6 +6187,7 @@ func autoConvert_v1_PodSpec_To_core_PodSpec(in *v1.PodSpec, out *core.PodSpec, s
out.TopologySpreadConstraints = *(*[]core.TopologySpreadConstraint)(unsafe.Pointer(&in.TopologySpreadConstraints))
out.SetHostnameAsFQDN = (*bool)(unsafe.Pointer(in.SetHostnameAsFQDN))
out.OS = (*core.PodOS)(unsafe.Pointer(in.OS))
// INFO: in.HostUsers opted out of conversion generation
return nil
}

View File

@ -3099,6 +3099,52 @@ func validateContainerCommon(ctr *core.Container, volumes map[string]core.Volume
allErrs = append(allErrs, validatePullPolicy(ctr.ImagePullPolicy, path.Child("imagePullPolicy"))...)
allErrs = append(allErrs, ValidateResourceRequirements(&ctr.Resources, path.Child("resources"), opts)...)
allErrs = append(allErrs, ValidateSecurityContext(ctr.SecurityContext, path.Child("securityContext"))...)
return allErrs
}
// validateHostUsers validates the pod spec restrictions that apply when the
// pod opts out of the host user namespace (hostUsers=false).
//
// fldPath is the path of the pod spec; reported errors are rooted at it. No
// errors are returned when the security context is nil, hostUsers is unset, or
// hostUsers is true. Otherwise every volume whose source is not one of
// emptyDir, secret, downwardAPI, configMap or projected is forbidden, and so
// are hostNetwork, hostPID and hostIPC.
func validateHostUsers(spec *core.PodSpec, fldPath *field.Path) field.ErrorList {
	allErrs := field.ErrorList{}

	// Only make the following checks if hostUsers is false (otherwise, the container uses the
	// same userns as the host, and so there isn't anything to check).
	if spec.SecurityContext == nil || spec.SecurityContext.HostUsers == nil || *spec.SecurityContext.HostUsers {
		return allErrs
	}

	// For now only these volumes are supported:
	// - configmap
	// - secret
	// - downwardAPI
	// - emptyDir
	// - projected
	// So reject anything else.
	for i, vol := range spec.Volumes {
		switch {
		case vol.EmptyDir != nil:
		case vol.Secret != nil:
		case vol.DownwardAPI != nil:
		case vol.ConfigMap != nil:
		case vol.Projected != nil:
		default:
			allErrs = append(allErrs, field.Forbidden(fldPath.Child("volumes").Index(i), "volume type not supported when `pod.Spec.HostUsers` is false"))
		}
	}

	// We decided to restrict the usage of userns with other host namespaces:
	// 	https://github.com/kubernetes/kubernetes/pull/111090#discussion_r935994282
	// The tl;dr is: you can easily run into permission issues that seem unexpected, we don't
	// know of any good use case and we can always enable them later.

	// Note we already validated above spec.SecurityContext is not nil.
	// Field paths use the serialized (lowerCamelCase) field names so that the
	// reported path matches what users wrote in their manifests.
	if spec.SecurityContext.HostNetwork {
		allErrs = append(allErrs, field.Forbidden(fldPath.Child("hostNetwork"), "when `pod.Spec.HostUsers` is false"))
	}
	if spec.SecurityContext.HostPID {
		allErrs = append(allErrs, field.Forbidden(fldPath.Child("hostPID"), "when `pod.Spec.HostUsers` is false"))
	}
	if spec.SecurityContext.HostIPC {
		allErrs = append(allErrs, field.Forbidden(fldPath.Child("hostIPC"), "when `pod.Spec.HostUsers` is false"))
	}

	return allErrs
}
@ -3569,6 +3615,7 @@ func ValidatePodSpec(spec *core.PodSpec, podMeta *metav1.ObjectMeta, fldPath *fi
allErrs = append(allErrs, validateReadinessGates(spec.ReadinessGates, fldPath.Child("readinessGates"))...)
allErrs = append(allErrs, validateTopologySpreadConstraints(spec.TopologySpreadConstraints, fldPath.Child("topologySpreadConstraints"))...)
allErrs = append(allErrs, validateWindowsHostProcessPod(spec, fldPath, opts)...)
allErrs = append(allErrs, validateHostUsers(spec, fldPath)...)
if len(spec.ServiceAccountName) > 0 {
for _, msg := range ValidateServiceAccountName(spec.ServiceAccountName, false) {
allErrs = append(allErrs, field.Invalid(fldPath.Child("serviceAccountName"), spec.ServiceAccountName, msg))
@ -3661,6 +3708,9 @@ func validateWindows(spec *core.PodSpec, fldPath *field.Path) field.ErrorList {
if securityContext.SELinuxOptions != nil {
allErrs = append(allErrs, field.Forbidden(fldPath.Child("securityContext").Child("seLinuxOptions"), "cannot be set for a windows pod"))
}
if securityContext.HostUsers != nil {
allErrs = append(allErrs, field.Forbidden(fldPath.Child("hostUsers"), "cannot be set for a windows pod"))
}
if securityContext.HostPID {
allErrs = append(allErrs, field.Forbidden(fldPath.Child("hostPID"), "cannot be set for a windows pod"))
}

View File

@ -18399,6 +18399,7 @@ func TestValidateOSFields(t *testing.T) {
"SecurityContext.HostIPC",
"SecurityContext.HostNetwork",
"SecurityContext.HostPID",
"SecurityContext.HostUsers",
"SecurityContext.RunAsGroup",
"SecurityContext.RunAsUser",
"SecurityContext.SELinuxOptions",
@ -20694,6 +20695,172 @@ func TestValidateNonSpecialIP(t *testing.T) {
}
}
// TestValidateHostUsers feeds validateHostUsers pod specs that combine the
// hostUsers setting with volumes and the other host namespace flags. Each case
// only asserts whether validation produced any error at all.
func TestValidateHostUsers(t *testing.T) {
	hostUsersOff, hostUsersOn := false, true

	// hostPathVolumes returns a fresh volume list holding one hostPath volume,
	// used by the cases that exercise an unsupported volume source.
	hostPathVolumes := func() []core.Volume {
		return []core.Volume{
			{
				Name: "host-path",
				VolumeSource: core.VolumeSource{
					HostPath: &core.HostPathVolumeSource{},
				},
			},
		}
	}

	testCases := []struct {
		name    string
		success bool
		spec    *core.PodSpec
	}{
		{
			name:    "empty",
			success: true,
			spec:    &core.PodSpec{},
		},
		{
			name:    "hostUsers unset",
			success: true,
			spec:    &core.PodSpec{SecurityContext: &core.PodSecurityContext{}},
		},
		{
			name:    "hostUsers=false",
			success: true,
			spec: &core.PodSpec{
				SecurityContext: &core.PodSecurityContext{HostUsers: &hostUsersOff},
			},
		},
		{
			name:    "hostUsers=true",
			success: true,
			spec: &core.PodSpec{
				SecurityContext: &core.PodSecurityContext{HostUsers: &hostUsersOn},
			},
		},
		{
			name:    "hostUsers=false & volumes",
			success: true,
			spec: &core.PodSpec{
				SecurityContext: &core.PodSecurityContext{HostUsers: &hostUsersOff},
				Volumes: []core.Volume{
					{
						Name: "configmap",
						VolumeSource: core.VolumeSource{
							ConfigMap: &core.ConfigMapVolumeSource{
								LocalObjectReference: core.LocalObjectReference{Name: "configmap"},
							},
						},
					},
					{
						Name: "secret",
						VolumeSource: core.VolumeSource{
							Secret: &core.SecretVolumeSource{SecretName: "secret"},
						},
					},
					{
						Name: "downward-api",
						VolumeSource: core.VolumeSource{
							DownwardAPI: &core.DownwardAPIVolumeSource{},
						},
					},
					{
						Name: "proj",
						VolumeSource: core.VolumeSource{
							Projected: &core.ProjectedVolumeSource{},
						},
					},
					{
						Name: "empty-dir",
						VolumeSource: core.VolumeSource{
							EmptyDir: &core.EmptyDirVolumeSource{},
						},
					},
				},
			},
		},
		{
			name:    "hostUsers=false - unsupported volume",
			success: false,
			spec: &core.PodSpec{
				SecurityContext: &core.PodSecurityContext{HostUsers: &hostUsersOff},
				Volumes:         hostPathVolumes(),
			},
		},
		{
			// It should ignore unsupported volumes with hostUsers=true.
			name:    "hostUsers=true - unsupported volume",
			success: true,
			spec: &core.PodSpec{
				SecurityContext: &core.PodSecurityContext{HostUsers: &hostUsersOn},
				Volumes:         hostPathVolumes(),
			},
		},
		{
			name:    "hostUsers=false & HostNetwork",
			success: false,
			spec: &core.PodSpec{
				SecurityContext: &core.PodSecurityContext{
					HostUsers:   &hostUsersOff,
					HostNetwork: true,
				},
			},
		},
		{
			name:    "hostUsers=false & HostPID",
			success: false,
			spec: &core.PodSpec{
				SecurityContext: &core.PodSecurityContext{
					HostUsers: &hostUsersOff,
					HostPID:   true,
				},
			},
		},
		{
			name:    "hostUsers=false & HostIPC",
			success: false,
			spec: &core.PodSpec{
				SecurityContext: &core.PodSecurityContext{
					HostUsers: &hostUsersOff,
					HostIPC:   true,
				},
			},
		},
	}

	for _, tt := range testCases {
		t.Run(tt.name, func(t *testing.T) {
			errs := validateHostUsers(tt.spec, field.NewPath("spec"))
			failed := len(errs) != 0

			switch {
			case tt.success && failed:
				t.Errorf("Unexpected error(s): %v", errs)
			case !tt.success && !failed:
				t.Errorf("Unexpected success")
			}
		})
	}
}
func TestValidateWindowsHostProcessPod(t *testing.T) {
const containerName = "container"
falseVar := false

View File

@ -3736,6 +3736,11 @@ func (in *PodSecurityContext) DeepCopyInto(out *PodSecurityContext) {
*out = new(bool)
**out = **in
}
if in.HostUsers != nil {
in, out := &in.HostUsers, &out.HostUsers
*out = new(bool)
**out = **in
}
if in.SELinuxOptions != nil {
in, out := &in.SELinuxOptions, &out.SELinuxOptions
*out = new(SELinuxOptions)

View File

@ -819,6 +819,10 @@ func (adc *attachDetachController) GetPodVolumeDir(podUID types.UID, pluginName,
return ""
}
// GetHostIDsForPod is a stub on attachDetachController: it ignores pod,
// containerUID and containerGID and always returns nil host IDs and a nil error.
// NOTE(review): presumably present only to satisfy a volume host interface — confirm.
func (adc *attachDetachController) GetHostIDsForPod(pod *v1.Pod, containerUID, containerGID *int64) (hostUID, hostGID *int64, err error) {
return nil, nil, nil
}
// GetPodPluginDir is a stub on attachDetachController: it ignores its
// arguments and always returns the empty string.
func (adc *attachDetachController) GetPodPluginDir(podUID types.UID, pluginName string) string {
return ""
}

View File

@ -394,6 +394,10 @@ func (expc *expandController) GetPodsDir() string {
return ""
}
// GetHostIDsForPod is a stub on expandController: it ignores pod,
// containerUID and containerGID and always returns nil host IDs and a nil error.
// NOTE(review): presumably present only to satisfy a volume host interface — confirm.
func (expc *expandController) GetHostIDsForPod(pod *v1.Pod, containerUID, containerGID *int64) (hostUID, hostGID *int64, err error) {
return nil, nil, nil
}
// GetPodVolumeDir is a stub on expandController: it ignores its arguments and
// always returns the empty string.
func (expc *expandController) GetPodVolumeDir(podUID types.UID, pluginName string, volumeName string) string {
return ""
}

View File

@ -55,6 +55,10 @@ func (ctrl *PersistentVolumeController) GetPodVolumeDir(podUID types.UID, plugin
return ""
}
// GetHostIDsForPod is a stub on PersistentVolumeController: it ignores pod,
// containerUID and containerGID and always returns nil host IDs and a nil error.
// NOTE(review): presumably present only to satisfy a volume host interface — confirm.
func (ctrl *PersistentVolumeController) GetHostIDsForPod(pod *v1.Pod, containerUID, containerGID *int64) (hostUID, hostGID *int64, err error) {
return nil, nil, nil
}
// GetPodPluginDir is a stub on PersistentVolumeController: it ignores its
// arguments and always returns the empty string.
func (ctrl *PersistentVolumeController) GetPodPluginDir(podUID types.UID, pluginName string) string {
return ""
}

View File

@ -821,6 +821,13 @@ const (
// Enable resource managers to make NUMA aligned decisions
TopologyManager featuregate.Feature = "TopologyManager"
// owner: @rata, @giuseppe
// kep: http://kep.k8s.io/127
// alpha: v1.25
//
// Enables user namespace support for stateless pods.
UserNamespacesStatelessPodsSupport featuregate.Feature = "UserNamespacesStatelessPodsSupport"
// owner: @cofyc
// alpha: v1.21
VolumeCapacityPriority featuregate.Feature = "VolumeCapacityPriority"
@ -1081,6 +1088,8 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS
VolumeCapacityPriority: {Default: false, PreRelease: featuregate.Alpha},
UserNamespacesStatelessPodsSupport: {Default: false, PreRelease: featuregate.Alpha},
WinDSR: {Default: false, PreRelease: featuregate.Alpha},
WinOverlay: {Default: true, PreRelease: featuregate.Beta},

View File

@ -22196,10 +22196,17 @@ func schema_k8sio_api_core_v1_PodSpec(ref common.ReferenceCallback) common.OpenA
},
"os": {
SchemaProps: spec.SchemaProps{
Description: "Specifies the OS of the containers in the pod. Some pod and container fields are restricted if this is set.\n\nIf the OS field is set to linux, the following fields must be unset: -securityContext.windowsOptions\n\nIf the OS field is set to windows, following fields must be unset: - spec.hostPID - spec.hostIPC - spec.securityContext.seLinuxOptions - spec.securityContext.seccompProfile - spec.securityContext.fsGroup - spec.securityContext.fsGroupChangePolicy - spec.securityContext.sysctls - spec.shareProcessNamespace - spec.securityContext.runAsUser - spec.securityContext.runAsGroup - spec.securityContext.supplementalGroups - spec.containers[*].securityContext.seLinuxOptions - spec.containers[*].securityContext.seccompProfile - spec.containers[*].securityContext.capabilities - spec.containers[*].securityContext.readOnlyRootFilesystem - spec.containers[*].securityContext.privileged - spec.containers[*].securityContext.allowPrivilegeEscalation - spec.containers[*].securityContext.procMount - spec.containers[*].securityContext.runAsUser - spec.containers[*].securityContext.runAsGroup",
Description: "Specifies the OS of the containers in the pod. Some pod and container fields are restricted if this is set.\n\nIf the OS field is set to linux, the following fields must be unset: -securityContext.windowsOptions\n\nIf the OS field is set to windows, following fields must be unset: - spec.hostPID - spec.hostIPC - spec.hostUsers - spec.securityContext.seLinuxOptions - spec.securityContext.seccompProfile - spec.securityContext.fsGroup - spec.securityContext.fsGroupChangePolicy - spec.securityContext.sysctls - spec.shareProcessNamespace - spec.securityContext.runAsUser - spec.securityContext.runAsGroup - spec.securityContext.supplementalGroups - spec.containers[*].securityContext.seLinuxOptions - spec.containers[*].securityContext.seccompProfile - spec.containers[*].securityContext.capabilities - spec.containers[*].securityContext.readOnlyRootFilesystem - spec.containers[*].securityContext.privileged - spec.containers[*].securityContext.allowPrivilegeEscalation - spec.containers[*].securityContext.procMount - spec.containers[*].securityContext.runAsUser - spec.containers[*].securityContext.runAsGroup",
Ref: ref("k8s.io/api/core/v1.PodOS"),
},
},
"hostUsers": {
SchemaProps: spec.SchemaProps{
Description: "Use the host's user namespace. Optional: Default to true. If set to true or not present, the pod will be run in the host user namespace, useful for when the pod needs a feature only available to the host user namespace, such as loading a kernel module with CAP_SYS_MODULE. When set to false, a new userns is created for the pod. Setting false is useful for mitigating container breakout vulnerabilities even allowing users to run their containers as root without actually having root privileges on the host. This field is alpha-level and is only honored by servers that enable the UserNamespacesSupport feature.",
Type: []string{"boolean"},
Format: "",
},
},
},
Required: []string{"containers"},
},

View File

@ -56,6 +56,9 @@ type RuntimeHelper interface {
// supplemental groups for the Pod. These extra supplemental groups come
// from annotations on persistent volumes that the pod depends on.
GetExtraSupplementalGroupsForPod(pod *v1.Pod) []int64
// GetOrCreateUserNamespaceMappings returns the configuration for the sandbox user namespace
GetOrCreateUserNamespaceMappings(pod *v1.Pod) (*runtimeapi.UserNamespace, error)
}
// ShouldContainerBeRestarted checks whether a container needs to be restarted.

View File

@ -65,3 +65,7 @@ func (f *FakeRuntimeHelper) GetPodDir(podUID kubetypes.UID) string {
func (f *FakeRuntimeHelper) GetExtraSupplementalGroupsForPod(pod *v1.Pod) []int64 {
return nil
}
// GetOrCreateUserNamespaceMappings is the fake implementation of the runtime
// helper method: it ignores pod and always returns a nil mapping and a nil error.
func (f *FakeRuntimeHelper) GetOrCreateUserNamespaceMappings(pod *v1.Pod) (*runtimeapi.UserNamespace, error) {
return nil, nil
}

View File

@ -837,6 +837,10 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
StateDirectory: rootDirectory,
})
klet.shutdownManager = shutdownManager
klet.usernsManager, err = MakeUserNsManager(klet)
if err != nil {
return nil, err
}
klet.admitHandlers.AddPodAdmitHandler(shutdownAdmitHandler)
// Finally, put the most recent version of the config on the Kubelet, so
@ -1175,6 +1179,9 @@ type Kubelet struct {
// Handles node shutdown events for the Node.
shutdownManager nodeshutdown.Manager
// Manage user namespaces
usernsManager *usernsManager
}
// ListPodStats is delegated to StatsProvider, which implements stats.Provider interface
@ -1889,6 +1896,8 @@ func (kl *Kubelet) syncTerminatedPod(ctx context.Context, pod *v1.Pod, podStatus
klog.V(4).InfoS("Pod termination removed cgroups", "pod", klog.KObj(pod), "podUID", pod.UID)
}
kl.usernsManager.Release(pod.UID)
// mark the final pod status
kl.statusManager.TerminatePod(pod)
klog.V(4).InfoS("Pod is terminated and will need no more status updates", "pod", klog.KObj(pod), "podUID", pod.UID)

View File

@ -420,6 +420,15 @@ func truncatePodHostnameIfNeeded(podName, hostname string) (string, error) {
return truncated, nil
}
// GetOrCreateUserNamespaceMappings returns the configuration for the sandbox user namespace.
// It delegates to the kubelet's user namespace manager and returns its result unchanged.
func (kl *Kubelet) GetOrCreateUserNamespaceMappings(pod *v1.Pod) (*runtimeapi.UserNamespace, error) {
return kl.usernsManager.GetOrCreateUserNamespaceMappings(pod)
}
// getHostIDsForPod delegates to the kubelet's user namespace manager, passing pod,
// containerUID and containerGID through and returning the manager's host UID,
// host GID and error unchanged.
func (kl *Kubelet) getHostIDsForPod(pod *v1.Pod, containerUID, containerGID *int64) (hostUID, hostGID *int64, err error) {
return kl.usernsManager.getHostIDsForPod(pod, containerUID, containerGID)
}
// GeneratePodHostNameAndDomain creates a hostname and domain name for a pod,
// given that pod's spec and annotations or returns an error.
func (kl *Kubelet) GeneratePodHostNameAndDomain(pod *v1.Pod) (string, string, error) {
@ -1154,6 +1163,12 @@ func (kl *Kubelet) HandlePodCleanups() error {
return err
}
// Remove orphaned pod user namespace allocations (if any).
klog.V(3).InfoS("Clean up orphaned pod user namespace allocations")
if err = kl.usernsManager.CleanupOrphanedPodUsernsAllocations(allPods, runningRuntimePods); err != nil {
klog.ErrorS(err, "Failed cleaning up orphaned pod user namespaces allocations")
}
// Remove orphaned volumes from pods that are known not to have any
// containers. Note that we pass all pods (including terminated pods) to
// the function, so that we don't remove volumes associated with terminated

View File

@ -45,15 +45,23 @@ func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config
libcontainercgroups.IsCgroup2UnifiedMode() {
enforceMemoryQoS = true
}
config.Linux = m.generateLinuxContainerConfig(container, pod, uid, username, nsTarget, enforceMemoryQoS)
cl, err := m.generateLinuxContainerConfig(container, pod, uid, username, nsTarget, enforceMemoryQoS)
if err != nil {
return err
}
config.Linux = cl
return nil
}
// generateLinuxContainerConfig generates linux container config for kubelet runtime v1.
func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID, enforceMemoryQoS bool) *runtimeapi.LinuxContainerConfig {
func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID, enforceMemoryQoS bool) (*runtimeapi.LinuxContainerConfig, error) {
sc, err := m.determineEffectiveSecurityContext(pod, container, uid, username)
if err != nil {
return nil, err
}
lc := &runtimeapi.LinuxContainerConfig{
Resources: &runtimeapi.LinuxContainerResources{},
SecurityContext: m.determineEffectiveSecurityContext(pod, container, uid, username),
SecurityContext: sc,
}
if nsTarget != nil && lc.SecurityContext.NamespaceOptions.Pid == runtimeapi.NamespaceMode_CONTAINER {
@ -124,7 +132,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.C
}
}
return lc
return lc, nil
}
// calculateLinuxResources will create the linuxContainerResources type based on the provided CPU and memory resource requests, limits

View File

@ -47,6 +47,8 @@ func makeExpectedConfig(m *kubeGenericRuntimeManager, pod *v1.Pod, containerInde
restartCountUint32 := uint32(restartCount)
envs := make([]*runtimeapi.KeyValue, len(opts.Envs))
l, _ := m.generateLinuxContainerConfig(container, pod, new(int64), "", nil, enforceMemoryQoS)
expectedConfig := &runtimeapi.ContainerConfig{
Metadata: &runtimeapi.ContainerMetadata{
Name: container.Name,
@ -64,7 +66,7 @@ func makeExpectedConfig(m *kubeGenericRuntimeManager, pod *v1.Pod, containerInde
Stdin: container.Stdin,
StdinOnce: container.StdinOnce,
Tty: container.TTY,
Linux: m.generateLinuxContainerConfig(container, pod, new(int64), "", nil, enforceMemoryQoS),
Linux: l,
Envs: envs,
}
return expectedConfig
@ -215,7 +217,8 @@ func TestGenerateLinuxContainerConfigResources(t *testing.T) {
},
}
linuxConfig := m.generateLinuxContainerConfig(&pod.Spec.Containers[0], pod, new(int64), "", nil, false)
linuxConfig, err := m.generateLinuxContainerConfig(&pod.Spec.Containers[0], pod, new(int64), "", nil, false)
assert.NoError(t, err)
assert.Equal(t, test.expected.CpuPeriod, linuxConfig.GetResources().CpuPeriod, test.name)
assert.Equal(t, test.expected.CpuQuota, linuxConfig.GetResources().CpuQuota, test.name)
assert.Equal(t, test.expected.CpuShares, linuxConfig.GetResources().CpuShares, test.name)
@ -329,6 +332,8 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) {
memoryLow int64
memoryHigh int64
}
l1, _ := m.generateLinuxContainerConfig(&pod1.Spec.Containers[0], pod1, new(int64), "", nil, true)
l2, _ := m.generateLinuxContainerConfig(&pod2.Spec.Containers[0], pod2, new(int64), "", nil, true)
tests := []struct {
name string
pod *v1.Pod
@ -338,7 +343,7 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) {
name: "Request128MBLimit256MB",
pod: pod1,
expected: &expectedResult{
m.generateLinuxContainerConfig(&pod1.Spec.Containers[0], pod1, new(int64), "", nil, true),
l1,
128 * 1024 * 1024,
int64(float64(256*1024*1024) * m.memoryThrottlingFactor),
},
@ -347,7 +352,7 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) {
name: "Request128MBWithoutLimit",
pod: pod2,
expected: &expectedResult{
m.generateLinuxContainerConfig(&pod2.Spec.Containers[0], pod2, new(int64), "", nil, true),
l2,
128 * 1024 * 1024,
int64(pod2MemoryHigh),
},
@ -355,7 +360,8 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) {
}
for _, test := range tests {
linuxConfig := m.generateLinuxContainerConfig(&test.pod.Spec.Containers[0], test.pod, new(int64), "", nil, true)
linuxConfig, err := m.generateLinuxContainerConfig(&test.pod.Spec.Containers[0], test.pod, new(int64), "", nil, true)
assert.NoError(t, err)
assert.Equal(t, test.expected.containerConfig, linuxConfig, test.name)
assert.Equal(t, linuxConfig.GetResources().GetUnified()["memory.min"], strconv.FormatInt(test.expected.memoryLow, 10), test.name)
assert.Equal(t, linuxConfig.GetResources().GetUnified()["memory.high"], strconv.FormatInt(test.expected.memoryHigh, 10), test.name)
@ -577,7 +583,8 @@ func TestGenerateLinuxContainerConfigNamespaces(t *testing.T) {
},
} {
t.Run(tc.name, func(t *testing.T) {
got := m.generateLinuxContainerConfig(&tc.pod.Spec.Containers[0], tc.pod, nil, "", tc.target, false)
got, err := m.generateLinuxContainerConfig(&tc.pod.Spec.Containers[0], tc.pod, nil, "", tc.target, false)
assert.NoError(t, err)
if diff := cmp.Diff(tc.want, got.SecurityContext.NamespaceOptions); diff != "" {
t.Errorf("%v: diff (-want +got):\n%v", t.Name(), diff)
}
@ -668,7 +675,8 @@ func TestGenerateLinuxContainerConfigSwap(t *testing.T) {
} {
t.Run(tc.name, func(t *testing.T) {
m.memorySwapBehavior = tc.swapSetting
actual := m.generateLinuxContainerConfig(&tc.pod.Spec.Containers[0], tc.pod, nil, "", nil, false)
actual, err := m.generateLinuxContainerConfig(&tc.pod.Spec.Containers[0], tc.pod, nil, "", nil, false)
assert.NoError(t, err)
assert.Equal(t, tc.expected, actual.Resources.MemorySwapLimitInBytes, "memory swap config for %s", tc.name)
})
}

View File

@ -195,7 +195,11 @@ func (m *kubeGenericRuntimeManager) generatePodSandboxLinuxConfig(pod *v1.Pod) (
if sc.RunAsGroup != nil && runtime.GOOS != "windows" {
lc.SecurityContext.RunAsGroup = &runtimeapi.Int64Value{Value: int64(*sc.RunAsGroup)}
}
lc.SecurityContext.NamespaceOptions = runtimeutil.NamespacesForPod(pod)
namespaceOptions, err := runtimeutil.NamespacesForPod(pod, m.runtimeHelper)
if err != nil {
return nil, err
}
lc.SecurityContext.NamespaceOptions = namespaceOptions
if sc.FSGroup != nil && runtime.GOOS != "windows" {
lc.SecurityContext.SupplementalGroups = append(lc.SecurityContext.SupplementalGroups, int64(*sc.FSGroup))

View File

@ -25,7 +25,7 @@ import (
)
// determineEffectiveSecurityContext gets container's security context from v1.Pod and v1.Container.
func (m *kubeGenericRuntimeManager) determineEffectiveSecurityContext(pod *v1.Pod, container *v1.Container, uid *int64, username string) *runtimeapi.LinuxContainerSecurityContext {
func (m *kubeGenericRuntimeManager) determineEffectiveSecurityContext(pod *v1.Pod, container *v1.Container, uid *int64, username string) (*runtimeapi.LinuxContainerSecurityContext, error) {
effectiveSc := securitycontext.DetermineEffectiveSecurityContext(pod, container)
synthesized := convertToRuntimeSecurityContext(effectiveSc)
if synthesized == nil {
@ -53,7 +53,11 @@ func (m *kubeGenericRuntimeManager) determineEffectiveSecurityContext(pod *v1.Po
}
// set namespace options and supplemental groups.
synthesized.NamespaceOptions = runtimeutil.NamespacesForPod(pod)
namespaceOptions, err := runtimeutil.NamespacesForPod(pod, m.runtimeHelper)
if err != nil {
return nil, err
}
synthesized.NamespaceOptions = namespaceOptions
podSc := pod.Spec.SecurityContext
if podSc != nil {
if podSc.FSGroup != nil {
@ -75,7 +79,7 @@ func (m *kubeGenericRuntimeManager) determineEffectiveSecurityContext(pod *v1.Po
synthesized.MaskedPaths = securitycontext.ConvertToRuntimeMaskedPaths(effectiveSc.ProcMount)
synthesized.ReadonlyPaths = securitycontext.ConvertToRuntimeReadonlyPaths(effectiveSc.ProcMount)
return synthesized
return synthesized, nil
}
// convertToRuntimeSecurityContext converts v1.SecurityContext to runtimeapi.SecurityContext.

View File

@ -97,12 +97,18 @@ func PidNamespaceForPod(pod *v1.Pod) runtimeapi.NamespaceMode {
return runtimeapi.NamespaceMode_CONTAINER
}
// NamespacesForPod returns the runtimeapi.NamespaceOption for a given pod.
// NamespacesForPod returns the runtimeapi.NamespaceOption for a given pod.
// An empty or nil pod can be used to get the namespace defaults for v1.Pod.
func NamespacesForPod(pod *v1.Pod) *runtimeapi.NamespaceOption {
return &runtimeapi.NamespaceOption{
Ipc: IpcNamespaceForPod(pod),
Network: NetworkNamespaceForPod(pod),
Pid: PidNamespaceForPod(pod),
func NamespacesForPod(pod *v1.Pod, runtimeHelper kubecontainer.RuntimeHelper) (*runtimeapi.NamespaceOption, error) {
userNs, err := runtimeHelper.GetOrCreateUserNamespaceMappings(pod)
if err != nil {
return nil, err
}
return &runtimeapi.NamespaceOption{
Ipc: IpcNamespaceForPod(pod),
Network: NetworkNamespaceForPod(pod),
Pid: PidNamespaceForPod(pod),
UsernsOptions: userNs,
}, nil
}

View File

@ -24,6 +24,7 @@ import (
v1 "k8s.io/api/core/v1"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
kubecontainertest "k8s.io/kubernetes/pkg/kubelet/container/testing"
)
func TestPodSandboxChanged(t *testing.T) {
@ -222,7 +223,8 @@ func TestNamespacesForPod(t *testing.T) {
},
} {
t.Run(desc, func(t *testing.T) {
actual := NamespacesForPod(test.input)
actual, err := NamespacesForPod(test.input, &kubecontainertest.FakeRuntimeHelper{})
require.NoError(t, err)
require.Equal(t, test.expected, actual)
})
}

View File

@ -0,0 +1,595 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package kubelet
import (
"encoding/json"
"fmt"
"math"
"os"
"path/filepath"
"sync"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
utilfeature "k8s.io/apiserver/pkg/util/feature"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/features"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
utilstore "k8s.io/kubernetes/pkg/kubelet/util/store"
utilfs "k8s.io/kubernetes/pkg/util/filesystem"
)
// bitsDataElement is the number of bits in a bitArray.data element.
const bitsDataElement = 32
type bitArray struct {
data []uint32
firstIndex int
}
func makeBitArray(size uint32) *bitArray {
m := bitArray{
data: make([]uint32, (size+bitsDataElement-1)/bitsDataElement),
firstIndex: 0,
}
return &m
}
func (b *bitArray) set(index uint32) {
b.data[index/bitsDataElement] |= (uint32(1) << (index % bitsDataElement))
}
func (b *bitArray) isSet(index uint32) bool {
return (b.data[index/bitsDataElement]>>(index%bitsDataElement))&0x1 == 1
}
func (b *bitArray) findAvailable() (uint32, bool) {
for i := b.firstIndex; i < len(b.data); i++ {
// Check if all bits are used (all 1s).
if b.data[i] == math.MaxUint32 {
continue
}
for j := uint32(0); j < bitsDataElement; j++ {
if (b.data[i]>>j)&0x1 == 0 {
v := uint32(i)*bitsDataElement + j
b.set(v)
// Update firstIndex to the current
// data element since there are no other
// unset bits before the current index.
b.firstIndex = int(i)
return v, true
}
}
}
return 0, false
}
func (b *bitArray) clear(index uint32) {
i := index / bitsDataElement
// update firstIndex if the index found is less than
// the current one.
if i < uint32(b.firstIndex) {
b.firstIndex = int(i)
}
// clear the bit by ANDing the data element with the
// complement of the bitmask to be cleared.
b.data[i] &= ^(1 << (index % bitsDataElement))
}
// userNsLength is the length of the ID range for each user namespace to create (65536).
const userNsLength = (1 << 16)
// maxPods limits the total number of pods using userns in this node to this value.
// This is an alpha limitation that will probably be lifted later.
const maxPods = 1024
// mapReInitializeThreshold is the number of removed pods after which a new map
// is created, to avoid memory leaks since Go maps never free memory.
const mapReInitializeThreshold = 1000
// userNsPodsManager is the set of pod-directory operations the user namespace
// manager depends on.
type userNsPodsManager interface {
// getPodDir returns the directory used for the pod with the given UID; the
// manager stores the pod's mappings file in it.
getPodDir(podUID types.UID) string
// listPodsFromDisk returns the UIDs of the pods found on disk.
listPodsFromDisk() ([]types.UID, error)
}
// usernsManager tracks which host ID ranges are allocated to which pods for
// their user namespaces.
type usernsManager struct {
// used has one bit per range of userNsLength IDs; a set bit means the range is taken.
used *bitArray
usedBy map[types.UID]uint32 // Map pod.UID to range used
// removed counts released allocations; it drives the periodic re-creation of usedBy.
removed int
// numAllocated is the number of pods that currently hold an allocation.
numAllocated int
kl userNsPodsManager
// This protects all members except for kl.
lock sync.Mutex
}
// userNamespace holds the configuration for the user namespace, in the form
// that is serialized as JSON.
type userNamespace struct {
// UIDs mappings for the user namespace.
UIDMappings []idMapping `json:"uidMappings"`
// GIDs mappings for the user namespace.
GIDMappings []idMapping `json:"gidMappings"`
}
// idMapping is a single pod user namespace mapping: a range of Length IDs
// starting at ContainerId inside the container and at HostId on the host.
type idMapping struct {
// Required. First ID of the range on the host.
HostId uint32 `json:"hostId"`
// Required. First ID of the range inside the container.
ContainerId uint32 `json:"containerId"`
// Required. Number of IDs in the range.
Length uint32 `json:"length"`
}
// mappingsFile is the file where the user namespace mappings are persisted.
const mappingsFile = "userns"
// writeMappingsToFile serializes userNs as JSON and stores it under mappingsFile
// in the directory of the given pod, then fsyncs that directory.
func (m *usernsManager) writeMappingsToFile(pod types.UID, userNs userNamespace) error {
	podDir := m.kl.getPodDir(pod)

	encoded, err := json.Marshal(userNs)
	if err != nil {
		return err
	}

	store, err := utilstore.NewFileStore(podDir, &utilfs.DefaultFs{})
	if err != nil {
		return err
	}
	if err = store.Write(mappingsFile, encoded); err != nil {
		return err
	}

	// The file store takes care of writing atomically; syncing the parent
	// directory adds durability, so the file is guaranteed to be there.
	dirHandle, err := os.Open(podDir)
	if err != nil {
		return err
	}
	syncErr := dirHandle.Sync()
	closeErr := dirHandle.Close()
	if syncErr != nil {
		// The sync failure is the one reported; a close error is dropped here.
		return syncErr
	}
	return closeErr
}
// readMappingsFromFile returns the raw content stored under mappingsFile in the
// directory of the given pod.
func (m *usernsManager) readMappingsFromFile(pod types.UID) ([]byte, error) {
	store, err := utilstore.NewFileStore(m.kl.getPodDir(pod), &utilfs.DefaultFs{})
	if err != nil {
		return nil, err
	}
	return store.Read(mappingsFile)
}
// MakeUserNsManager builds a usernsManager backed by kl. The first two ID
// ranges are always marked as used. When the UserNamespacesStatelessPodsSupport
// feature gate is enabled, the mappings of every pod listed on disk are
// recorded too; a failure to list or record them is returned as an error,
// except that a not-exist error from the listing is treated as "no pods".
func MakeUserNsManager(kl userNsPodsManager) (*usernsManager, error) {
	manager := &usernsManager{
		// The bitArray covers the whole UID space (2^32), one bit per range.
		// As a by-product, no uint32-derived index can be out of bounds.
		used:   makeBitArray((math.MaxUint32 + 1) / userNsLength),
		usedBy: map[types.UID]uint32{},
		kl:     kl,
	}
	// Range 0 is reserved for the host.
	manager.used.set(0)
	// Range 1 will be used for phase II, so it is not handed out for now.
	manager.used.set(1)

	// Without the feature there is no point in reading the list of pods.
	if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesStatelessPodsSupport) {
		return manager, nil
	}

	podUIDs, err := kl.listPodsFromDisk()
	switch {
	case err == nil:
		// Fall through to recording the pods below.
	case os.IsNotExist(err):
		return manager, nil
	default:
		return nil, fmt.Errorf("user namespace manager can't read pods from disk: %w", err)
	}

	for _, uid := range podUIDs {
		klog.V(5).InfoS("reading pod from disk for user namespace", "podUID", uid)
		if recErr := manager.recordPodMappings(uid); recErr != nil {
			return nil, recErr
		}
	}
	return manager, nil
}
// recordPodMappings registers the range used for the user namespace of the
// given pod if a non-empty mappingsFile exists in the pod directory. A pod
// without that file (or with an empty one) has no userns and is left alone.
func (m *usernsManager) recordPodMappings(pod types.UID) error {
	data, readErr := m.readMappingsFromFile(pod)
	switch {
	case readErr != nil && readErr != utilstore.ErrKeyNotFound:
		return readErr
	case len(data) == 0:
		// Nothing stored: the pod doesn't use a user namespace.
		return nil
	}

	if _, err := m.parseUserNsFileAndRecord(pod, data); err != nil {
		return err
	}
	return nil
}
// isSet reports whether the range of userNsLength IDs that contains the ID v
// is marked as used.
func (m *usernsManager) isSet(v uint32) bool {
	return m.used.isSet(v / userNsLength)
}
// allocateOne picks a free ID range and assigns it to the specified pod.
// It returns the first ID of the range and the range length, or an error when
// the pod limit is reached or no range is free.
func (m *usernsManager) allocateOne(pod types.UID) (firstID uint32, length uint32, err error) {
	if m.numAllocated >= maxPods {
		return 0, 0, fmt.Errorf("limit on count of pods with user namespaces exceeded (limit is %v, current pods with userns: %v)", maxPods, m.numAllocated)
	}

	slot, ok := m.used.findAvailable()
	if !ok {
		return 0, 0, fmt.Errorf("could not find an empty slot to allocate a user namespace")
	}

	// Only a successful allocation counts towards the limit.
	m.numAllocated++
	klog.V(5).InfoS("new pod user namespace allocation", "podUID", pod)

	firstID = slot * userNsLength
	m.usedBy[pod] = firstID
	return firstID, userNsLength, nil
}
// record registers the ID range that starts at from and spans length IDs as
// used by the specified pod. The range must be exactly userNsLength long and
// aligned to userNsLength. Recording the same range again for the same pod is a
// no-op; a different range for a known pod, or a range already taken, is an error.
func (m *usernsManager) record(pod types.UID, from, length uint32) (err error) {
	if length != userNsLength {
		return fmt.Errorf("wrong user namespace length %v", length)
	}
	if from%userNsLength != 0 {
		return fmt.Errorf("wrong user namespace offset specified %v", from)
	}

	if existing, known := m.usedBy[pod]; known {
		if existing != from {
			return fmt.Errorf("different user namespace range already used by pod %q", pod)
		}
		// The pod is already registered with this very range.
		return nil
	}

	// Unknown pod: the range it asks for has to be free.
	slot := from / userNsLength
	if m.used.isSet(slot) {
		return fmt.Errorf("range picked for pod %q already taken", pod)
	}

	if m.numAllocated >= maxPods {
		return fmt.Errorf("limit on count of pods with user namespaces exceeded (limit is %v, current pods with userns: %v)", maxPods, m.numAllocated)
	}
	m.numAllocated++

	klog.V(5).InfoS("new pod user namespace allocation", "podUID", pod)

	// "from" is an ID (UID/GID); mark the range of size userNsLength that
	// holds it in the bit-array.
	m.used.set(slot)
	m.usedBy[pod] = from
	return nil
}
// Release frees the user namespace allocation held by the specified pod.
// It does nothing when the UserNamespacesStatelessPodsSupport feature gate is off.
func (m *usernsManager) Release(podUID types.UID) {
	if utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesStatelessPodsSupport) {
		m.lock.Lock()
		defer m.lock.Unlock()

		m.releaseWithLock(podUID)
	}
}
// releaseWithLock frees the ID range held by the specified pod, removes the
// pod's mappings file and updates the bookkeeping counters. It is a no-op for
// a pod without an allocation. The caller must hold m.lock.
func (m *usernsManager) releaseWithLock(pod types.UID) {
	v, ok := m.usedBy[pod]
	if !ok {
		klog.V(5).InfoS("pod user namespace allocation not present", "podUID", pod)
		return
	}
	delete(m.usedBy, pod)

	klog.V(5).InfoS("releasing pod user namespace allocation", "podUID", pod)
	m.numAllocated--
	m.removed++

	// Removing the file is best-effort and must not block the release, but a
	// failure other than "already gone" is worth surfacing: a stale file may
	// be picked up again the next time mappings are read from disk.
	mappingsPath := filepath.Join(m.kl.getPodDir(pod), mappingsFile)
	if err := os.Remove(mappingsPath); err != nil && !os.IsNotExist(err) {
		klog.ErrorS(err, "Failed to remove pod user namespace mappings file", "podUID", pod, "path", mappingsPath)
	}

	// Go maps never shrink, so after enough removals copy the live entries
	// into a fresh map to give the memory back.
	if m.removed%mapReInitializeThreshold == 0 {
		n := make(map[types.UID]uint32, len(m.usedBy))
		for k, v := range m.usedBy {
			n[k] = v
		}
		m.usedBy = n
		m.removed = 0
	}

	m.used.clear(v / userNsLength)
}
// parseUserNsFileAndRecord decodes the JSON content of a pod's mappings file,
// validates it and records the range it describes as used by pod.
//
// A valid configuration has exactly one UID mapping and one GID mapping, both
// identical and both starting at container ID 0. On any error the returned
// userNamespace is the zero value.
func (m *usernsManager) parseUserNsFileAndRecord(pod types.UID, content []byte) (userNs userNamespace, err error) {
	var parsed userNamespace
	if err := json.Unmarshal(content, &parsed); err != nil {
		return userNamespace{}, fmt.Errorf("can't parse file: %w", err)
	}

	if n := len(parsed.UIDMappings); n != 1 {
		return userNamespace{}, fmt.Errorf("invalid user namespace configuration: exactly one UID mapping is required, found %d", n)
	}

	if len(parsed.UIDMappings) != len(parsed.GIDMappings) {
		return userNamespace{}, fmt.Errorf("invalid user namespace configuration: GID and UID mappings should be identical.")
	}

	uidMapping := parsed.UIDMappings[0]
	if uidMapping != parsed.GIDMappings[0] {
		return userNamespace{}, fmt.Errorf("invalid user namespace configuration: GID and UID mapping should be identical")
	}

	// We don't produce configs without root mapped and some runtimes assume it is mapped.
	// Validate the file has something we produced and can digest. The GID
	// mapping equals the UID mapping at this point, so this covers GID 0 too.
	if uidMapping.ContainerId != 0 {
		return userNamespace{}, fmt.Errorf("invalid user namespace configuration: UID 0 must be mapped")
	}

	if err := m.record(pod, uidMapping.HostId, uidMapping.Length); err != nil {
		return userNamespace{}, err
	}
	return parsed, nil
}
// createUserNs allocates a new ID range for pod, builds the user namespace
// configuration that maps container ID 0 onto that range for both UIDs and
// GIDs, and writes it to the pod's mappings file. If the write fails, the
// allocation is released again and the write error is returned.
func (m *usernsManager) createUserNs(pod *v1.Pod) (userNs userNamespace, err error) {
	firstID, length, err := m.allocateOne(pod.UID)
	if err != nil {
		return userNs, err
	}

	// UIDs and GIDs share the very same mapping.
	rootMapping := idMapping{
		ContainerId: 0,
		HostId:      firstID,
		Length:      length,
	}
	userNs = userNamespace{
		UIDMappings: []idMapping{rootMapping},
		GIDMappings: []idMapping{rootMapping},
	}

	if err = m.writeMappingsToFile(pod.UID, userNs); err != nil {
		// Nothing was persisted, so give the range back.
		m.releaseWithLock(pod.UID)
		return userNs, err
	}
	return userNs, nil
}
// GetOrCreateUserNamespaceMappings returns the configuration for the sandbox user namespace.
//
// With the UserNamespacesStatelessPodsSupport feature gate off it returns nil, nil.
// For a pod whose HostUsers is unset or true it returns a NODE-mode
// configuration. Otherwise it reuses the mappings stored in the pod directory,
// or creates and stores new ones, and returns them in POD mode.
func (m *usernsManager) GetOrCreateUserNamespaceMappings(pod *v1.Pod) (*runtimeapi.UserNamespace, error) {
	if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesStatelessPodsSupport) {
		return nil, nil
	}

	m.lock.Lock()
	defer m.lock.Unlock()

	if hostUsers := pod.Spec.HostUsers; hostUsers == nil || *hostUsers {
		return &runtimeapi.UserNamespace{
			Mode: runtimeapi.NamespaceMode_NODE,
		}, nil
	}

	stored, err := m.readMappingsFromFile(pod.UID)
	if err != nil && err != utilstore.ErrKeyNotFound {
		return nil, err
	}

	var userNs userNamespace
	if len(stored) == 0 {
		// No mappings on disk yet: allocate a range for this pod.
		userNs, err = m.createUserNs(pod)
	} else {
		userNs, err = m.parseUserNsFileAndRecord(pod.UID, stored)
	}
	if err != nil {
		return nil, err
	}

	// toRuntime converts the on-disk mappings into their CRI representation.
	toRuntime := func(mappings []idMapping) []*runtimeapi.IDMapping {
		var converted []*runtimeapi.IDMapping
		for _, entry := range mappings {
			converted = append(converted, &runtimeapi.IDMapping{
				HostId:      entry.HostId,
				ContainerId: entry.ContainerId,
				Length:      entry.Length,
			})
		}
		return converted
	}

	return &runtimeapi.UserNamespace{
		Mode: runtimeapi.NamespaceMode_POD,
		Uids: toRuntime(userNs.UIDMappings),
		Gids: toRuntime(userNs.GIDMappings),
	}, nil
}
// CleanupOrphanedPodUsernsAllocations reconciles the state of user namespace
// allocations with the pods passed in. It releases the allocation of every pod
// found on disk that is in neither pods nor runningPods, and of every pod that
// holds an allocation but is not found on disk. It does nothing when the
// UserNamespacesStatelessPodsSupport feature gate is off.
func (m *usernsManager) CleanupOrphanedPodUsernsAllocations(pods []*v1.Pod, runningPods []*kubecontainer.Pod) error {
	if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesStatelessPodsSupport) {
		return nil
	}

	m.lock.Lock()
	defer m.lock.Unlock()

	known := sets.NewString()
	for _, p := range pods {
		known.Insert(string(p.UID))
	}
	for _, p := range runningPods {
		known.Insert(string(p.ID))
	}

	onDisk, err := m.kl.listPodsFromDisk()
	if err != nil {
		return err
	}
	onDiskSet := sets.NewString()
	for _, uid := range onDisk {
		onDiskSet.Insert(string(uid))
	}

	// Pods present on disk but unknown to the caller.
	for _, uid := range onDisk {
		if !known.Has(string(uid)) {
			klog.V(5).InfoS("Clean up orphaned pod user namespace possible allocation", "podUID", uid)
			m.releaseWithLock(uid)
		}
	}

	// Allocations whose pod is no longer present on disk.
	for uid := range m.usedBy {
		if !onDiskSet.Has(string(uid)) {
			klog.V(5).InfoS("Clean up orphaned pod user namespace possible allocation", "podUID", uid)
			m.releaseWithLock(uid)
		}
	}

	return nil
}
// getHostIDsForPod translates a UID and GID inside the container into the host
// UID and GID they are mapped to, for a pod that uses user namespaces. A nil
// containerUID/containerGID stands for ID 0 inside the container.
// When the feature gate is off, pod is nil, or the pod does not use user
// namespaces, no mapping is needed and containerUID and containerGID are
// returned as they were passed in.
func (m *usernsManager) getHostIDsForPod(pod *v1.Pod, containerUID, containerGID *int64) (hostUID, hostGID *int64, err error) {
	featureOn := utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesStatelessPodsSupport)
	if !featureOn || pod == nil || pod.Spec.HostUsers == nil || *pod.Spec.HostUsers {
		return containerUID, containerGID, nil
	}

	mapping, err := m.GetOrCreateUserNamespaceMappings(pod)
	if err != nil {
		return nil, nil, fmt.Errorf("Error getting pod user namespace mapping: %w", err)
	}

	uid, err := hostIDFromMapping(mapping.Uids, containerUID)
	if err != nil {
		return nil, nil, fmt.Errorf("Error getting host UID: %w", err)
	}

	gid, err := hostIDFromMapping(mapping.Gids, containerGID)
	if err != nil {
		return nil, nil, fmt.Errorf("Error getting host GID: %w", err)
	}

	return &uid, &gid, nil
}
// hostIDFromMapping returns the host ID that containerId maps to according to
// mapping. A nil containerId means ID 0 (root inside the container). An error
// is returned for an empty mapping or when no entry covers the ID.
func hostIDFromMapping(mapping []*runtimeapi.IDMapping, containerId *int64) (int64, error) {
	if len(mapping) == 0 {
		return 0, fmt.Errorf("can't use empty user namespace mapping")
	}

	var wanted int64 // defaults to root inside the container
	if containerId != nil {
		wanted = *containerId
	}

	for _, entry := range mapping {
		if entry == nil {
			continue
		}
		start := int64(entry.ContainerId)
		end := start + int64(entry.Length) // one past the last ID of the range
		if wanted < start || wanted >= end {
			continue
		}
		// Same offset into the range, but counted from the host side.
		return int64(entry.HostId) + wanted - start, nil
	}

	return 0, fmt.Errorf("ID: %v not present in pod user namespace", wanted)
}

View File

@ -0,0 +1,346 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package kubelet
import (
"fmt"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"k8s.io/apimachinery/pkg/types"
utilfeature "k8s.io/apiserver/pkg/util/feature"
featuregatetesting "k8s.io/component-base/featuregate/testing"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
pkgfeatures "k8s.io/kubernetes/pkg/features"
)
// testUserNsPodsManager is a stub that the tests pass to MakeUserNsManager.
type testUserNsPodsManager struct {
}
// getPodDir ignores podUID and always returns the same fixed path.
func (m *testUserNsPodsManager) getPodDir(podUID types.UID) string {
return "/tmp/non-existant-dir.This-is-not-used-in-tests"
}
// listPodsFromDisk reports no pods and no error.
func (m *testUserNsPodsManager) listPodsFromDisk() ([]types.UID, error) {
return nil, nil
}
// TestUserNsManagerAllocate exercises allocateOne, record and Release on a
// manager built with the feature gate enabled: the first two ranges are
// reserved, allocations are handed out in increasing order starting at the
// third range, and released ranges become free again.
func TestUserNsManagerAllocate(t *testing.T) {
defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, pkgfeatures.UserNamespacesStatelessPodsSupport, true)()
testUserNsPodsManager := &testUserNsPodsManager{}
m, err := MakeUserNsManager(testUserNsPodsManager)
require.NoError(t, err)
// The first two ranges are marked as used by MakeUserNsManager.
assert.Equal(t, true, m.isSet(0), "m.isSet(0) should be true")
assert.Equal(t, true, m.isSet(1), "m.isSet(1) should be true")
allocated, length, err := m.allocateOne("one")
assert.NoError(t, err)
assert.Equal(t, userNsLength, int(length), "m.isSet(%d).length=%v", allocated, length)
assert.Equal(t, true, m.isSet(allocated), "m.isSet(%d)", allocated)
assert.Equal(t, userNsLength*2, int(allocated))
allocated2, length2, err := m.allocateOne("two")
assert.NoError(t, err)
assert.NotEqual(t, allocated, allocated2, "allocated != allocated2")
assert.Equal(t, length, length2, "length == length2")
assert.Equal(t, uint32(userNsLength*3), allocated2)
// verify that re-adding the same pod with the same settings won't fail
err = m.record("two", allocated2, length2)
assert.NoError(t, err)
// but it fails if anything is different
err = m.record("two", allocated2+1, length2)
assert.Error(t, err)
m.Release("one")
m.Release("two")
assert.Equal(t, false, m.isSet(allocated), "m.isSet(%d)", allocated)
assert.Equal(t, false, m.isSet(allocated2), "m.nsSet(%d)", allocated2)
// Allocate 1000 ranges in a row; each one is expected right after the previous.
var allocs []uint32
for i := 0; i < 1000; i++ {
allocated, length, err = m.allocateOne(types.UID(fmt.Sprintf("%d", i)))
assert.Equal(t, userNsLength, int(length), "length is not the expected. iter: %v", i)
assert.Equal(t, userNsLength*(i+2), int(allocated), "firstID is not the expected. iter: %v", i)
assert.NoError(t, err)
allocs = append(allocs, allocated)
}
// Release each range, record it again for the same pod, and release it once more.
for i, v := range allocs {
assert.Equal(t, true, m.isSet(v), "m.isSet(%d) should be true", v)
m.Release(types.UID(fmt.Sprintf("%d", i)))
assert.Equal(t, false, m.isSet(v), "m.isSet(%d) should be false", v)
err = m.record(types.UID(fmt.Sprintf("%d", i)), v, userNsLength)
assert.NoError(t, err)
m.Release(types.UID(fmt.Sprintf("%d", i)))
assert.Equal(t, false, m.isSet(v), "m.isSet(%d) should be false", v)
}
}
// TestUserNsManagerParseUserNsFile feeds serialized user namespace mapping
// files to parseUserNsFileAndRecord and checks only whether each one is
// accepted or rejected. The parsed mappings themselves are not inspected.
//
// Every rejected case must be syntactically valid JSON, so that the rejection
// is caused by the property named in the case and not by a JSON syntax error.
func TestUserNsManagerParseUserNsFile(t *testing.T) {
	defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, pkgfeatures.UserNamespacesStatelessPodsSupport, true)()

	cases := []struct {
		name    string
		file    string
		success bool
	}{
		{
			name: "basic",
			file: `{
"uidMappings":[ { "hostId":131072, "containerId":0, "length":65536 } ],
"gidMappings":[ { "hostId":131072, "containerId":0, "length":65536 } ]
}`,
			success: true,
		},
		{
			name: "invalid length",
			file: `{
"uidMappings":[ { "hostId":131072, "containerId":0, "length":0 } ],
"gidMappings":[ { "hostId":131072, "containerId":0, "length":0 } ]
}`,
			success: false,
		},
		{
			name: "wrong offset",
			file: `{
"uidMappings":[ {"hostId":131072, "containerId":0, "length":65536 } ],
"gidMappings":[ {"hostId":1, "containerId":0, "length":65536 } ]
}`,
			success: false,
		},
		{
			name: "two GID mappings",
			// The lengths are numeric literals: a Go identifier inside the raw
			// string would make the document invalid JSON and the case would
			// be rejected before the number of GID mappings is ever looked at.
			file: `{
"uidMappings":[ { "hostId":131072, "containerId":0, "length":65536 } ],
"gidMappings":[ { "hostId":131072, "containerId":0, "length":65536 }, { "hostId":196608, "containerId":0, "length":65536 } ]
}`,
			success: false,
		},
		{
			name: "two UID mappings",
			file: `{
"uidMappings":[ { "hostId":131072, "containerId":0, "length":65536 }, { "hostId":196608, "containerId":0, "length":65536 } ],
"gidMappings":[ { "hostId":131072, "containerId":0, "length":65536 } ]
}`,
			success: false,
		},
		{
			name: "no root UID",
			file: `{
"uidMappings":[ { "hostId":131072, "containerId":1, "length":65536 } ],
"gidMappings":[ { "hostId":131072, "containerId":0, "length":65536 } ]
}`,
			success: false,
		},
		{
			name: "no root GID",
			file: `{
"uidMappings":[ { "hostId":131072, "containerId":0, "length":65536 } ],
"gidMappings":[ { "hostId":131072, "containerId":1, "length":65536 } ]
}`,
			success: false,
		},
	}

	testUserNsPodsManager := &testUserNsPodsManager{}
	m, err := MakeUserNsManager(testUserNsPodsManager)
	// Stop here on failure: every subtest below calls a method on m.
	require.NoError(t, err)

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// We don't validate the result. It was parsed with the json parser, we trust that.
			// err is local to the subtest so one case cannot leak its
			// outcome into the enclosing function's variable.
			_, err := m.parseUserNsFileAndRecord(types.UID(tc.name), []byte(tc.file))
			if (err == nil) != tc.success {
				t.Errorf("expected success: %v but got error: %v", tc.success, err)
			}
		})
	}
}
// TestUserNsManagerHostIDFromMapping drives hostIDFromMapping through a table
// of ID mappings and container IDs. Each case states whether the lookup is
// expected to succeed and, when it is, which host ID must come back.
func TestUserNsManagerHostIDFromMapping(t *testing.T) {
	type lookupCase struct {
		name        string
		mappings    []*runtimeapi.IDMapping
		containerID int64 // -1 means hostIDFromMapping is called with a nil pointer.
		wantOK      bool
		wantHostID  int64
	}

	tests := []lookupCase{
		{
			name: "one basic mapping",
			mappings: []*runtimeapi.IDMapping{
				{HostId: 0, ContainerId: 0, Length: userNsLength},
			},
			containerID: -1,
			wantOK:      true,
			wantHostID:  0,
		},
		{
			name: "one unprivileged mapping",
			mappings: []*runtimeapi.IDMapping{
				{HostId: userNsLength * 2, ContainerId: 0, Length: userNsLength},
			},
			containerID: -1,
			wantOK:      true,
			wantHostID:  userNsLength * 2,
		},
		{
			name: "one unprivileged mapping random id",
			mappings: []*runtimeapi.IDMapping{
				{HostId: userNsLength * 2, ContainerId: 0, Length: userNsLength},
			},
			containerID: 3,
			wantOK:      true,
			wantHostID:  userNsLength*2 + 3,
		},
		{
			name: "two unprivileged mapping",
			mappings: []*runtimeapi.IDMapping{
				{HostId: userNsLength * 2, ContainerId: 0, Length: 1},
				{HostId: userNsLength*2 + 10, ContainerId: 1, Length: 1},
			},
			containerID: 0,
			wantOK:      true,
			wantHostID:  userNsLength*2 + 0,
		},
		{
			name: "two unprivileged mapping - random id",
			mappings: []*runtimeapi.IDMapping{
				{HostId: userNsLength * 2, ContainerId: 0, Length: 1},
				{HostId: userNsLength*2 + 10, ContainerId: 1, Length: 1},
			},
			containerID: 1,
			wantOK:      true,
			wantHostID:  userNsLength*2 + 10,
		},
		{
			name: "two unprivileged mapping - not mapped user",
			mappings: []*runtimeapi.IDMapping{
				{HostId: userNsLength * 2, ContainerId: 0, Length: 1},
				{HostId: userNsLength*2 + 1, ContainerId: 1, Length: 1},
			},
			containerID: 3,
			wantOK:      false,
		},
		{
			// Zero values throughout: nil mappings, container ID 0.
			name:   "no mappings",
			wantOK: false,
		},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			// Translate the -1 sentinel into the nil pointer the function accepts.
			var idPtr *int64
			if tc.containerID != -1 {
				idPtr = &tc.containerID
			}

			got, err := hostIDFromMapping(tc.mappings, idPtr)
			if succeeded := err == nil; succeeded != tc.wantOK {
				t.Fatalf("%v: expected success: %v - got error: %v", tc.name, tc.wantOK, err)
			}
			// An expected failure carries no host ID worth comparing.
			if err != nil {
				return
			}
			if got != tc.wantHostID {
				t.Errorf("expected: %v - got: %v", tc.wantHostID, got)
			}
		})
	}
}
// BenchmarkBitmaskFindAndSetFirstZero measures userNsLength consecutive
// findAvailable calls on a freshly made bit array, asserting that every call
// reports success.
//
// The workload is repeated t.N times, as the testing package requires for a
// meaningful ns/op figure. A new array is made for every repetition so each
// one starts from the same state; the allocation is part of the timed region.
func BenchmarkBitmaskFindAndSetFirstZero(t *testing.B) {
	for n := 0; n < t.N; n++ {
		b := makeBitArray(userNsLength)
		for i := 0; i < userNsLength; i++ {
			_, found := b.findAvailable()
			assert.True(t, found)
		}
	}
}
// BenchmarkBitmaskSetAndClear measures setting and immediately clearing every
// index in [0, userNsLength) of a bit array.
//
// The workload is repeated t.N times, as the testing package requires for a
// meaningful ns/op figure. A new array is made for every repetition so each
// one starts from the same state; the allocation is part of the timed region.
func BenchmarkBitmaskSetAndClear(t *testing.B) {
	for n := 0; n < t.N; n++ {
		b := makeBitArray(userNsLength)
		for i := uint32(0); i < userNsLength; i++ {
			b.set(i)
			b.clear(i)
		}
	}
}
// BenchmarkBitmaskFindAndSetFirstZeroAndClear measures userNsLength rounds of
// findAvailable followed by clearing the index it returned, asserting that
// every findAvailable call reports success.
//
// The workload is repeated t.N times, as the testing package requires for a
// meaningful ns/op figure. A new array is made for every repetition so each
// one starts from the same state; the allocation is part of the timed region.
func BenchmarkBitmaskFindAndSetFirstZeroAndClear(t *testing.B) {
	for n := 0; n < t.N; n++ {
		b := makeBitArray(userNsLength)
		for i := 0; i < userNsLength; i++ {
			ret, found := b.findAvailable()
			assert.True(t, found)
			b.clear(ret)
		}
	}
}
// BenchmarkBitmaskFindAndSetFirstZeroAndClear0Every2 measures userNsLength
// findAvailable calls where index 0 is cleared after every even-numbered call.
//
// The workload is repeated t.N times, as the testing package requires for a
// meaningful ns/op figure. A new array is made for every repetition: one pass
// leaves entries set, so reusing the array would change what later
// repetitions measure. The allocation is part of the timed region.
func BenchmarkBitmaskFindAndSetFirstZeroAndClear0Every2(t *testing.B) {
	for n := 0; n < t.N; n++ {
		// it is an interesting edge case as it forces a full scan
		// on each second allocation.
		b := makeBitArray(userNsLength)
		for i := 0; i < userNsLength; i++ {
			_, found := b.findAvailable()
			assert.True(t, found)
			if i%2 == 0 {
				b.clear(0)
			}
		}
	}
}

View File

@ -128,6 +128,16 @@ func (kvh *kubeletVolumeHost) GetPodsDir() string {
return kvh.kubelet.getPodsDir()
}
// GetHostIDsForPod translates container-side IDs into host-side IDs for pod by
// delegating to the kubelet's getHostIDsForPod.
// When the pod uses user namespaces, containerUID and containerGID are the IDs
// inside the container and the results are the host UID and GID they are
// mapped to; a nil containerUID/containerGID asks for the host ID that ID 0
// inside the container maps to.
// When the pod does not use user namespaces no mapping is needed, and the same
// containerUID and containerGID params are returned.
func (kvh *kubeletVolumeHost) GetHostIDsForPod(pod *v1.Pod, containerUID, containerGID *int64) (hostUID, hostGID *int64, err error) {
	hostUID, hostGID, err = kvh.kubelet.getHostIDsForPod(pod, containerUID, containerGID)
	return hostUID, hostGID, err
}
func (kvh *kubeletVolumeHost) GetPodVolumeDir(podUID types.UID, pluginName string, volumeName string) string {
dir := kvh.kubelet.getPodVolumeDir(podUID, pluginName, volumeName)
if runtime.GOOS == "windows" {

View File

@ -340,6 +340,13 @@ type KubeletVolumeHost interface {
WaitForCacheSync() error
// Returns hostutil.HostUtils
GetHostUtil() hostutil.HostUtils
// GetHostIDsForPod if the pod uses user namespaces, takes the uid and
// gid inside the container and returns the host UID and GID those are
// mapped to on the host. If containerUID/containerGID is nil, then it
// returns the host UID/GID for ID 0 inside the container.
// If the pod is not using user namespaces, as there is no mapping needed, the
// same containerUID and containerGID params are returned.
GetHostIDsForPod(pod *v1.Pod, containerUID, containerGID *int64) (hostUID, hostGID *int64, err error)
}
// AttachDetachVolumeHost is a AttachDetach Controller specific interface that plugins can use

View File

@ -118,6 +118,10 @@ func (f *fakeVolumeHost) GetPodsDir() string {
return filepath.Join(f.rootDir, "pods")
}
// GetHostIDsForPod is the fake implementation: it performs no ID mapping and
// hands containerUID and containerGID back unchanged with a nil error.
func (f *fakeVolumeHost) GetHostIDsForPod(pod *v1.Pod, containerUID, containerGID *int64) (hostUID, hostGID *int64, err error) {
	hostUID, hostGID = containerUID, containerGID
	return hostUID, hostGID, nil
}
// GetPodVolumeDir returns the path
// <rootDir>/pods/<podUID>/volumes/<pluginName>/<volumeName>, joined with
// filepath.Join.
func (f *fakeVolumeHost) GetPodVolumeDir(podUID types.UID, pluginName, volumeName string) string {
	segments := []string{
		f.rootDir,
		"pods",
		string(podUID),
		"volumes",
		pluginName,
		volumeName,
	}
	return filepath.Join(segments...)
}

View File

@ -669,10 +669,23 @@ func (og *operationGenerator) GenerateMountVolumeFunc(
resizeOptions.DeviceStagePath = deviceMountPath
}
kvh, ok := og.GetVolumePluginMgr().Host.(volume.KubeletVolumeHost)
if !ok {
eventErr, detailedErr := volumeToMount.GenerateError("MountVolume type assertion error", fmt.Errorf("volume host does not implement KubeletVolumeHost interface"))
return volumetypes.NewOperationContext(eventErr, detailedErr, migrated)
}
uid := util.FsUserFrom(volumeToMount.Pod)
hostUID, hostGID, err := kvh.GetHostIDsForPod(volumeToMount.Pod, uid, fsGroup)
if err != nil {
msg := fmt.Sprintf("MountVolume.GetHostIDsForPod failed to find host ID in user namespace (UID: %v GID: %v)", uid, fsGroup)
eventErr, detailedErr := volumeToMount.GenerateError(msg, err)
return volumetypes.NewOperationContext(eventErr, detailedErr, migrated)
}
// Execute mount
mountErr := volumeMounter.SetUp(volume.MounterArgs{
FsUser: util.FsUserFrom(volumeToMount.Pod),
FsGroup: fsGroup,
FsUser: hostUID,
FsGroup: hostGID,
DesiredSize: volumeToMount.DesiredSizeLimit,
FSGroupChangePolicy: fsGroupChangePolicy,
})

View File

@ -342,7 +342,7 @@ func verifyDirectoryPermission(path string, readonly bool) bool {
func TestSetVolumeOwnershipOwner(t *testing.T) {
fsGroup := int64(3000)
currentUid := os.Getuid()
currentUid := os.Geteuid()
if currentUid != 0 {
t.Skip("running as non-root")
}

File diff suppressed because it is too large Load Diff

View File

@ -3712,6 +3712,7 @@ message PodSpec {
// If the OS field is set to windows, following fields must be unset:
// - spec.hostPID
// - spec.hostIPC
// - spec.hostUsers
// - spec.securityContext.seLinuxOptions
// - spec.securityContext.seccompProfile
// - spec.securityContext.fsGroup
@ -3732,6 +3733,19 @@ message PodSpec {
// - spec.containers[*].securityContext.runAsGroup
// +optional
optional PodOS os = 36;
// Use the host's user namespace.
// Optional: Default to true.
// If set to true or not present, the pod will be run in the host user namespace, useful
// for when the pod needs a feature only available to the host user namespace, such as
// loading a kernel module with CAP_SYS_MODULE.
// When set to false, a new userns is created for the pod. Setting false is useful for
// mitigating container breakout vulnerabilities even allowing users to run their
// containers as root without actually having root privileges on the host.
// This field is alpha-level and is only honored by servers that enable the UserNamespacesSupport feature.
// +k8s:conversion-gen=false
// +optional
optional bool hostUsers = 37;
}
// PodStatus represents information about the status of a pod. Status may trail the actual

View File

@ -3289,6 +3289,7 @@ type PodSpec struct {
// If the OS field is set to windows, following fields must be unset:
// - spec.hostPID
// - spec.hostIPC
// - spec.hostUsers
// - spec.securityContext.seLinuxOptions
// - spec.securityContext.seccompProfile
// - spec.securityContext.fsGroup
@ -3309,6 +3310,18 @@ type PodSpec struct {
// - spec.containers[*].securityContext.runAsGroup
// +optional
OS *PodOS `json:"os,omitempty" protobuf:"bytes,36,opt,name=os"`
// Use the host's user namespace.
// Optional: Default to true.
// If set to true or not present, the pod will be run in the host user namespace, useful
// for when the pod needs a feature only available to the host user namespace, such as
// loading a kernel module with CAP_SYS_MODULE.
// When set to false, a new userns is created for the pod. Setting false is useful for
// mitigating container breakout vulnerabilities even allowing users to run their
// containers as root without actually having root privileges on the host.
// This field is alpha-level and is only honored by servers that enable the UserNamespacesSupport feature.
// +k8s:conversion-gen=false
// +optional
HostUsers *bool `json:"hostUsers,omitempty" protobuf:"bytes,37,opt,name=hostUsers"`
}
// OSName is the set of OS'es that can be used in OS.

View File

@ -1670,7 +1670,8 @@ var map_PodSpec = map[string]string{
"overhead": "Overhead represents the resource overhead associated with running a pod for a given RuntimeClass. This field will be autopopulated at admission time by the RuntimeClass admission controller. If the RuntimeClass admission controller is enabled, overhead must not be set in Pod create requests. The RuntimeClass admission controller will reject Pod create requests which have the overhead already set. If RuntimeClass is configured and selected in the PodSpec, Overhead will be set to the value defined in the corresponding RuntimeClass, otherwise it will remain unset and treated as zero. More info: https://git.k8s.io/enhancements/keps/sig-node/688-pod-overhead/README.md",
"topologySpreadConstraints": "TopologySpreadConstraints describes how a group of pods ought to spread across topology domains. Scheduler will schedule pods in a way which abides by the constraints. All topologySpreadConstraints are ANDed.",
"setHostnameAsFQDN": "If true the pod's hostname will be configured as the pod's FQDN, rather than the leaf name (the default). In Linux containers, this means setting the FQDN in the hostname field of the kernel (the nodename field of struct utsname). In Windows containers, this means setting the registry value of hostname for the registry key HKEY_LOCAL_MACHINE\\SYSTEM\\CurrentControlSet\\Services\\Tcpip\\Parameters to FQDN. If a pod does not have FQDN, this has no effect. Default to false.",
"os": "Specifies the OS of the containers in the pod. Some pod and container fields are restricted if this is set.\n\nIf the OS field is set to linux, the following fields must be unset: -securityContext.windowsOptions\n\nIf the OS field is set to windows, following fields must be unset: - spec.hostPID - spec.hostIPC - spec.securityContext.seLinuxOptions - spec.securityContext.seccompProfile - spec.securityContext.fsGroup - spec.securityContext.fsGroupChangePolicy - spec.securityContext.sysctls - spec.shareProcessNamespace - spec.securityContext.runAsUser - spec.securityContext.runAsGroup - spec.securityContext.supplementalGroups - spec.containers[*].securityContext.seLinuxOptions - spec.containers[*].securityContext.seccompProfile - spec.containers[*].securityContext.capabilities - spec.containers[*].securityContext.readOnlyRootFilesystem - spec.containers[*].securityContext.privileged - spec.containers[*].securityContext.allowPrivilegeEscalation - spec.containers[*].securityContext.procMount - spec.containers[*].securityContext.runAsUser - spec.containers[*].securityContext.runAsGroup",
"os": "Specifies the OS of the containers in the pod. Some pod and container fields are restricted if this is set.\n\nIf the OS field is set to linux, the following fields must be unset: -securityContext.windowsOptions\n\nIf the OS field is set to windows, following fields must be unset: - spec.hostPID - spec.hostIPC - spec.hostUsers - spec.securityContext.seLinuxOptions - spec.securityContext.seccompProfile - spec.securityContext.fsGroup - spec.securityContext.fsGroupChangePolicy - spec.securityContext.sysctls - spec.shareProcessNamespace - spec.securityContext.runAsUser - spec.securityContext.runAsGroup - spec.securityContext.supplementalGroups - spec.containers[*].securityContext.seLinuxOptions - spec.containers[*].securityContext.seccompProfile - spec.containers[*].securityContext.capabilities - spec.containers[*].securityContext.readOnlyRootFilesystem - spec.containers[*].securityContext.privileged - spec.containers[*].securityContext.allowPrivilegeEscalation - spec.containers[*].securityContext.procMount - spec.containers[*].securityContext.runAsUser - spec.containers[*].securityContext.runAsGroup",
"hostUsers": "Use the host's user namespace. Optional: Default to true. If set to true or not present, the pod will be run in the host user namespace, useful for when the pod needs a feature only available to the host user namespace, such as loading a kernel module with CAP_SYS_MODULE. When set to false, a new userns is created for the pod. Setting false is useful for mitigating container breakout vulnerabilities even allowing users to run their containers as root without actually having root privileges on the host. This field is alpha-level and is only honored by servers that enable the UserNamespacesSupport feature.",
}
func (PodSpec) SwaggerDoc() map[string]string {

View File

@ -3954,6 +3954,11 @@ func (in *PodSpec) DeepCopyInto(out *PodSpec) {
*out = new(PodOS)
**out = **in
}
if in.HostUsers != nil {
in, out := &in.HostUsers, &out.HostUsers
*out = new(bool)
**out = **in
}
return
}

View File

@ -1625,7 +1625,8 @@
"setHostnameAsFQDN": true,
"os": {
"name": "nameValue"
}
},
"hostUsers": true
}
},
"updateStrategy": {

View File

@ -592,6 +592,7 @@ spec:
hostIPC: true
hostNetwork: true
hostPID: true
hostUsers: true
hostname: hostnameValue
imagePullSecrets:
- name: nameValue

View File

@ -1626,7 +1626,8 @@
"setHostnameAsFQDN": true,
"os": {
"name": "nameValue"
}
},
"hostUsers": true
}
},
"strategy": {

View File

@ -600,6 +600,7 @@ spec:
hostIPC: true
hostNetwork: true
hostPID: true
hostUsers: true
hostname: hostnameValue
imagePullSecrets:
- name: nameValue

View File

@ -1627,7 +1627,8 @@
"setHostnameAsFQDN": true,
"os": {
"name": "nameValue"
}
},
"hostUsers": true
}
}
},

View File

@ -592,6 +592,7 @@ spec:
hostIPC: true
hostNetwork: true
hostPID: true
hostUsers: true
hostname: hostnameValue
imagePullSecrets:
- name: nameValue

View File

@ -1626,7 +1626,8 @@
"setHostnameAsFQDN": true,
"os": {
"name": "nameValue"
}
},
"hostUsers": true
}
},
"volumeClaimTemplates": [

View File

@ -598,6 +598,7 @@ spec:
hostIPC: true
hostNetwork: true
hostPID: true
hostUsers: true
hostname: hostnameValue
imagePullSecrets:
- name: nameValue

View File

@ -1626,7 +1626,8 @@
"setHostnameAsFQDN": true,
"os": {
"name": "nameValue"
}
},
"hostUsers": true
}
},
"strategy": {

View File

@ -602,6 +602,7 @@ spec:
hostIPC: true
hostNetwork: true
hostPID: true
hostUsers: true
hostname: hostnameValue
imagePullSecrets:
- name: nameValue

View File

@ -1626,7 +1626,8 @@
"setHostnameAsFQDN": true,
"os": {
"name": "nameValue"
}
},
"hostUsers": true
}
},
"volumeClaimTemplates": [

View File

@ -598,6 +598,7 @@ spec:
hostIPC: true
hostNetwork: true
hostPID: true
hostUsers: true
hostname: hostnameValue
imagePullSecrets:
- name: nameValue

View File

@ -1625,7 +1625,8 @@
"setHostnameAsFQDN": true,
"os": {
"name": "nameValue"
}
},
"hostUsers": true
}
},
"updateStrategy": {

View File

@ -592,6 +592,7 @@ spec:
hostIPC: true
hostNetwork: true
hostPID: true
hostUsers: true
hostname: hostnameValue
imagePullSecrets:
- name: nameValue

View File

@ -1626,7 +1626,8 @@
"setHostnameAsFQDN": true,
"os": {
"name": "nameValue"
}
},
"hostUsers": true
}
},
"strategy": {

View File

@ -600,6 +600,7 @@ spec:
hostIPC: true
hostNetwork: true
hostPID: true
hostUsers: true
hostname: hostnameValue
imagePullSecrets:
- name: nameValue

View File

@ -1627,7 +1627,8 @@
"setHostnameAsFQDN": true,
"os": {
"name": "nameValue"
}
},
"hostUsers": true
}
}
},

View File

@ -592,6 +592,7 @@ spec:
hostIPC: true
hostNetwork: true
hostPID: true
hostUsers: true
hostname: hostnameValue
imagePullSecrets:
- name: nameValue

View File

@ -1626,7 +1626,8 @@
"setHostnameAsFQDN": true,
"os": {
"name": "nameValue"
}
},
"hostUsers": true
}
},
"volumeClaimTemplates": [

View File

@ -598,6 +598,7 @@ spec:
hostIPC: true
hostNetwork: true
hostPID: true
hostUsers: true
hostname: hostnameValue
imagePullSecrets:
- name: nameValue

View File

@ -1679,7 +1679,8 @@
"setHostnameAsFQDN": true,
"os": {
"name": "nameValue"
}
},
"hostUsers": true
}
},
"ttlSecondsAfterFinished": 8,

View File

@ -633,6 +633,7 @@ spec:
hostIPC: true
hostNetwork: true
hostPID: true
hostUsers: true
hostname: hostnameValue
imagePullSecrets:
- name: nameValue

View File

@ -1630,7 +1630,8 @@
"setHostnameAsFQDN": true,
"os": {
"name": "nameValue"
}
},
"hostUsers": true
}
},
"ttlSecondsAfterFinished": 8,

View File

@ -597,6 +597,7 @@ spec:
hostIPC: true
hostNetwork: true
hostPID: true
hostUsers: true
hostname: hostnameValue
imagePullSecrets:
- name: nameValue

View File

@ -1679,7 +1679,8 @@
"setHostnameAsFQDN": true,
"os": {
"name": "nameValue"
}
},
"hostUsers": true
}
},
"ttlSecondsAfterFinished": 8,

View File

@ -633,6 +633,7 @@ spec:
hostIPC: true
hostNetwork: true
hostPID: true
hostUsers: true
hostname: hostnameValue
imagePullSecrets:
- name: nameValue

View File

@ -1673,7 +1673,8 @@
"setHostnameAsFQDN": true,
"os": {
"name": "nameValue"
}
},
"hostUsers": true
}
},
"ttlSecondsAfterFinished": 8,

View File

@ -630,6 +630,7 @@ template:
hostIPC: true
hostNetwork: true
hostPID: true
hostUsers: true
hostname: hostnameValue
imagePullSecrets:
- name: nameValue

View File

@ -1567,7 +1567,8 @@
"setHostnameAsFQDN": true,
"os": {
"name": "nameValue"
}
},
"hostUsers": true
},
"status": {
"phase": "phaseValue",

View File

@ -548,6 +548,7 @@ spec:
hostIPC: true
hostNetwork: true
hostPID: true
hostUsers: true
hostname: hostnameValue
imagePullSecrets:
- name: nameValue

View File

@ -1610,7 +1610,8 @@
"setHostnameAsFQDN": true,
"os": {
"name": "nameValue"
}
},
"hostUsers": true
}
}
}

View File

@ -581,6 +581,7 @@ template:
hostIPC: true
hostNetwork: true
hostPID: true
hostUsers: true
hostname: hostnameValue
imagePullSecrets:
- name: nameValue

View File

@ -1616,7 +1616,8 @@
"setHostnameAsFQDN": true,
"os": {
"name": "nameValue"
}
},
"hostUsers": true
}
}
},

View File

@ -586,6 +586,7 @@ spec:
hostIPC: true
hostNetwork: true
hostPID: true
hostUsers: true
hostname: hostnameValue
imagePullSecrets:
- name: nameValue

View File

@ -1625,7 +1625,8 @@
"setHostnameAsFQDN": true,
"os": {
"name": "nameValue"
}
},
"hostUsers": true
}
},
"updateStrategy": {

View File

@ -592,6 +592,7 @@ spec:
hostIPC: true
hostNetwork: true
hostPID: true
hostUsers: true
hostname: hostnameValue
imagePullSecrets:
- name: nameValue

View File

@ -1626,7 +1626,8 @@
"setHostnameAsFQDN": true,
"os": {
"name": "nameValue"
}
},
"hostUsers": true
}
},
"strategy": {

View File

@ -602,6 +602,7 @@ spec:
hostIPC: true
hostNetwork: true
hostPID: true
hostUsers: true
hostname: hostnameValue
imagePullSecrets:
- name: nameValue

View File

@ -1627,7 +1627,8 @@
"setHostnameAsFQDN": true,
"os": {
"name": "nameValue"
}
},
"hostUsers": true
}
}
},

View File

@ -592,6 +592,7 @@ spec:
hostIPC: true
hostNetwork: true
hostPID: true
hostUsers: true
hostname: hostnameValue
imagePullSecrets:
- name: nameValue

View File

@ -61,6 +61,7 @@ type PodSpecApplyConfiguration struct {
TopologySpreadConstraints []TopologySpreadConstraintApplyConfiguration `json:"topologySpreadConstraints,omitempty"`
SetHostnameAsFQDN *bool `json:"setHostnameAsFQDN,omitempty"`
OS *PodOSApplyConfiguration `json:"os,omitempty"`
HostUsers *bool `json:"hostUsers,omitempty"`
}
// PodSpecApplyConfiguration constructs an declarative configuration of the PodSpec type for use with
@ -407,3 +408,11 @@ func (b *PodSpecApplyConfiguration) WithOS(value *PodOSApplyConfiguration) *PodS
b.OS = value
return b
}
// WithHostUsers stores a pointer to a copy of value in the HostUsers field of
// the declarative configuration and returns the receiver, which allows "With"
// calls to be chained when building an object.
// Calling it again overwrites the field, so the last call wins.
func (b *PodSpecApplyConfiguration) WithHostUsers(value bool) *PodSpecApplyConfiguration {
	hostUsers := value
	b.HostUsers = &hostUsers
	return b
}

Some files were not shown because too many files have changed in this diff Show More