diff --git a/pkg/apis/core/types.go b/pkg/apis/core/types.go index c5ada193eff..af4d3dba507 100644 --- a/pkg/apis/core/types.go +++ b/pkg/apis/core/types.go @@ -583,9 +583,10 @@ type StorageMedium string // These are the valid value for StorageMedium const ( - StorageMediumDefault StorageMedium = "" // use whatever the default is for the node - StorageMediumMemory StorageMedium = "Memory" // use memory (tmpfs) - StorageMediumHugePages StorageMedium = "HugePages" // use hugepages + StorageMediumDefault StorageMedium = "" // use whatever the default is for the node + StorageMediumMemory StorageMedium = "Memory" // use memory (tmpfs) + StorageMediumHugePages StorageMedium = "HugePages" // use hugepages + StorageMediumHugePagesPrefix StorageMedium = "HugePages-" // prefix for full medium notation HugePages-<size> ) // Protocol defines network protocols supported for things like container ports. diff --git a/pkg/apis/core/v1/helper/helpers.go b/pkg/apis/core/v1/helper/helpers.go index a930520f1b1..ac3018c9992 100644 --- a/pkg/apis/core/v1/helper/helpers.go +++ b/pkg/apis/core/v1/helper/helpers.go @@ -103,6 +103,27 @@ func HugePageUnitSizeFromByteSize(size int64) (string, error) { return fmt.Sprintf("%d%s", size, hugePageSizeUnitList[idx]), nil } +// IsHugePageMedium returns true if the volume medium is in 'HugePages[-size]' format +func IsHugePageMedium(medium v1.StorageMedium) bool { + if medium == v1.StorageMediumHugePages { + return true + } + return strings.HasPrefix(string(medium), string(v1.StorageMediumHugePagesPrefix)) +} + +// HugePageSizeFromMedium returns the page size for the specified huge page medium. +// If the specified input is not a valid huge page medium an error is returned.
+func HugePageSizeFromMedium(medium v1.StorageMedium) (resource.Quantity, error) { + if !IsHugePageMedium(medium) { + return resource.Quantity{}, fmt.Errorf("medium: %s is not a hugepage medium", medium) + } + if medium == v1.StorageMediumHugePages { + return resource.Quantity{}, fmt.Errorf("medium: %s doesn't have size information", medium) + } + pageSize := strings.TrimPrefix(string(medium), string(v1.StorageMediumHugePagesPrefix)) + return resource.ParseQuantity(pageSize) +} + // IsOvercommitAllowed returns true if the resource is in the default // namespace and is not hugepages. func IsOvercommitAllowed(name v1.ResourceName) bool { diff --git a/pkg/apis/core/validation/validation.go b/pkg/apis/core/validation/validation.go index cec7ebf1d32..6c13172208c 100644 --- a/pkg/apis/core/validation/validation.go +++ b/pkg/apis/core/validation/validation.go @@ -3083,8 +3083,33 @@ func validateContainersOnlyForPod(containers []core.Container, fldPath *field.Pa return allErrs } +// PodValidationOptions contains the different settings for pod validation +type PodValidationOptions struct { + // Allow pod spec to have more than one huge page resource (with different sizes) + AllowMultipleHugePageResources bool +} + +// ValidatePodSingleHugePageResources checks if there are multiple huge +// pages resources in the pod object. 
+func ValidatePodSingleHugePageResources(pod *core.Pod, specPath *field.Path) field.ErrorList { + allErrs := field.ErrorList{} + hugePageResources := sets.NewString() + for i := range pod.Spec.Containers { + resourceSet := toContainerResourcesSet(&pod.Spec.Containers[i]) + for resourceStr := range resourceSet { + if v1helper.IsHugePageResourceName(v1.ResourceName(resourceStr)) { + hugePageResources.Insert(resourceStr) + } + } + } + if len(hugePageResources) > 1 { + allErrs = append(allErrs, field.Invalid(specPath, hugePageResources.List(), "must use a single hugepage size in a pod spec")) + } + return allErrs +} + // ValidatePod tests if required fields in the pod are set. -func ValidatePod(pod *core.Pod) field.ErrorList { +func ValidatePod(pod *core.Pod, opts PodValidationOptions) field.ErrorList { fldPath := field.NewPath("metadata") allErrs := ValidateObjectMeta(&pod.ObjectMeta, true, ValidatePodName, fldPath) allErrs = append(allErrs, ValidatePodSpecificAnnotations(pod.ObjectMeta.Annotations, &pod.Spec, fldPath.Child("annotations"))...) @@ -3111,17 +3136,8 @@ func ValidatePod(pod *core.Pod) field.ErrorList { allErrs = append(allErrs, validateContainersOnlyForPod(pod.Spec.Containers, specPath.Child("containers"))...) allErrs = append(allErrs, validateContainersOnlyForPod(pod.Spec.InitContainers, specPath.Child("initContainers"))...) - hugePageResources := sets.NewString() - for i := range pod.Spec.Containers { - resourceSet := toContainerResourcesSet(&pod.Spec.Containers[i]) - for resourceStr := range resourceSet { - if v1helper.IsHugePageResourceName(v1.ResourceName(resourceStr)) { - hugePageResources.Insert(resourceStr) - } - } - } - if len(hugePageResources) > 1 { - allErrs = append(allErrs, field.Invalid(specPath, hugePageResources, "must use a single hugepage size in a pod spec")) + if !opts.AllowMultipleHugePageResources { + allErrs = append(allErrs, ValidatePodSingleHugePageResources(pod, specPath)...) 
} podIPsField := field.NewPath("status", "podIPs") @@ -3679,8 +3695,8 @@ func ValidateContainerUpdates(newContainers, oldContainers []core.Container, fld } // ValidatePodCreate validates a pod in the context of its initial create -func ValidatePodCreate(pod *core.Pod) field.ErrorList { - allErrs := ValidatePod(pod) +func ValidatePodCreate(pod *core.Pod, opts PodValidationOptions) field.ErrorList { + allErrs := ValidatePod(pod, opts) fldPath := field.NewPath("spec") // EphemeralContainers can only be set on update using the ephemeralcontainers subresource @@ -3693,12 +3709,16 @@ func ValidatePodCreate(pod *core.Pod) field.ErrorList { // ValidatePodUpdate tests to see if the update is legal for an end user to make. newPod is updated with fields // that cannot be changed. -func ValidatePodUpdate(newPod, oldPod *core.Pod) field.ErrorList { +func ValidatePodUpdate(newPod, oldPod *core.Pod, opts PodValidationOptions) field.ErrorList { fldPath := field.NewPath("metadata") allErrs := ValidateObjectMetaUpdate(&newPod.ObjectMeta, &oldPod.ObjectMeta, fldPath) allErrs = append(allErrs, ValidatePodSpecificAnnotationUpdates(newPod, oldPod, fldPath.Child("annotations"))...) specPath := field.NewPath("spec") + if !opts.AllowMultipleHugePageResources { + allErrs = append(allErrs, ValidatePodSingleHugePageResources(newPod, specPath)...) + } + // validate updateable fields: // 1. spec.containers[*].image // 2. spec.initContainers[*].image diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index 5413aaf5176..7319820dda9 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -539,6 +539,14 @@ const ( // // Enables a feature to make secrets and configmaps data immutable. ImmutableEphemeralVolumes featuregate.Feature = "ImmutableEphemeralVolumes" + + // owner: @bart0sh + // alpha: v1.18 + // + // Enables usage of HugePages-<size> in a volume medium, + // e.g.
emptyDir: + // medium: HugePages-1Gi + HugePageStorageMediumSize featuregate.Feature = "HugePageStorageMediumSize" ) func init() { @@ -624,6 +632,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS PodDisruptionBudget: {Default: true, PreRelease: featuregate.Beta}, ServiceTopology: {Default: false, PreRelease: featuregate.Alpha}, ImmutableEphemeralVolumes: {Default: false, PreRelease: featuregate.Alpha}, + HugePageStorageMediumSize: {Default: false, PreRelease: featuregate.Alpha}, // inherited features from generic apiserver, relisted here to get a conflict if it is changed // unintentionally on either side: diff --git a/pkg/kubelet/config/BUILD b/pkg/kubelet/config/BUILD index 25595bd3e9b..b34fb86f199 100644 --- a/pkg/kubelet/config/BUILD +++ b/pkg/kubelet/config/BUILD @@ -24,6 +24,7 @@ go_library( "//pkg/apis/core/install:go_default_library", "//pkg/apis/core/v1:go_default_library", "//pkg/apis/core/validation:go_default_library", + "//pkg/features:go_default_library", "//pkg/kubelet/checkpoint:go_default_library", "//pkg/kubelet/checkpointmanager:go_default_library", "//pkg/kubelet/container:go_default_library", @@ -40,6 +41,7 @@ go_library( "//staging/src/k8s.io/apimachinery/pkg/util/sets:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/util/yaml:go_default_library", + "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library", "//staging/src/k8s.io/client-go/kubernetes:go_default_library", "//staging/src/k8s.io/client-go/tools/cache:go_default_library", "//staging/src/k8s.io/client-go/tools/record:go_default_library", diff --git a/pkg/kubelet/config/common.go b/pkg/kubelet/config/common.go index cd557c28147..6ddf219b4f1 100644 --- a/pkg/kubelet/config/common.go +++ b/pkg/kubelet/config/common.go @@ -27,8 +27,10 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" utilyaml "k8s.io/apimachinery/pkg/util/yaml" + 
utilfeature "k8s.io/apiserver/pkg/util/feature" api "k8s.io/kubernetes/pkg/apis/core" "k8s.io/kubernetes/pkg/apis/core/helper" + "k8s.io/kubernetes/pkg/features" // TODO: remove this import if // api.Registry.GroupOrDie(v1.GroupName).GroupVersion.String() is changed @@ -133,7 +135,10 @@ func tryDecodeSinglePod(data []byte, defaultFn defaultFunc) (parsed bool, pod *v if err = defaultFn(newPod); err != nil { return true, pod, err } - if errs := validation.ValidatePod(newPod); len(errs) > 0 { + opts := validation.PodValidationOptions{ + AllowMultipleHugePageResources: utilfeature.DefaultFeatureGate.Enabled(features.HugePageStorageMediumSize), + } + if errs := validation.ValidatePod(newPod, opts); len(errs) > 0 { return true, pod, fmt.Errorf("invalid pod: %v", errs) } v1Pod := &v1.Pod{} @@ -157,13 +162,17 @@ func tryDecodePodList(data []byte, defaultFn defaultFunc) (parsed bool, pods v1. return false, pods, err } + opts := validation.PodValidationOptions{ + AllowMultipleHugePageResources: utilfeature.DefaultFeatureGate.Enabled(features.HugePageStorageMediumSize), + } + // Apply default values and validate pods. 
for i := range newPods.Items { newPod := &newPods.Items[i] if err = defaultFn(newPod); err != nil { return true, pods, err } - if errs := validation.ValidatePod(newPod); len(errs) > 0 { + if errs := validation.ValidatePod(newPod, opts); len(errs) > 0 { err = fmt.Errorf("invalid pod: %v", errs) return true, pods, err } diff --git a/pkg/kubelet/config/common_test.go b/pkg/kubelet/config/common_test.go index c220a14e504..b29b83d080f 100644 --- a/pkg/kubelet/config/common_test.go +++ b/pkg/kubelet/config/common_test.go @@ -251,7 +251,7 @@ func TestStaticPodNameGenerate(t *testing.T) { if c.overwrite != "" { pod.Name = c.overwrite } - errs := validation.ValidatePod(pod) + errs := validation.ValidatePod(pod, validation.PodValidationOptions{}) if c.shouldErr { specNameErrored := false for _, err := range errs { diff --git a/pkg/kubelet/config/file_linux_test.go b/pkg/kubelet/config/file_linux_test.go index ee9f1517b75..4c7a7ba8f34 100644 --- a/pkg/kubelet/config/file_linux_test.go +++ b/pkg/kubelet/config/file_linux_test.go @@ -90,7 +90,7 @@ func TestReadPodsFromFileExistAlready(t *testing.T) { if err := k8s_api_v1.Convert_v1_Pod_To_core_Pod(pod, internalPod, nil); err != nil { t.Fatalf("%s: Cannot convert pod %#v, %#v", testCase.desc, pod, err) } - if errs := validation.ValidatePod(internalPod); len(errs) > 0 { + if errs := validation.ValidatePod(internalPod, validation.PodValidationOptions{}); len(errs) > 0 { t.Fatalf("%s: Invalid pod %#v, %#v", testCase.desc, internalPod, errs) } } @@ -369,7 +369,7 @@ func expectUpdate(t *testing.T, ch chan interface{}, testCase *testCase) { if err := k8s_api_v1.Convert_v1_Pod_To_core_Pod(pod, internalPod, nil); err != nil { t.Fatalf("%s: Cannot convert pod %#v, %#v", testCase.desc, pod, err) } - if errs := validation.ValidatePod(internalPod); len(errs) > 0 { + if errs := validation.ValidatePod(internalPod, validation.PodValidationOptions{}); len(errs) > 0 { t.Fatalf("%s: Invalid pod %#v, %#v", testCase.desc, internalPod, errs) } } 
diff --git a/pkg/kubelet/config/http_test.go b/pkg/kubelet/config/http_test.go index 398dbc8e679..2778bfc535b 100644 --- a/pkg/kubelet/config/http_test.go +++ b/pkg/kubelet/config/http_test.go @@ -319,7 +319,7 @@ func TestExtractPodsFromHTTP(t *testing.T) { if err := k8s_api_v1.Convert_v1_Pod_To_core_Pod(pod, internalPod, nil); err != nil { t.Fatalf("%s: Cannot convert pod %#v, %#v", testCase.desc, pod, err) } - if errs := validation.ValidatePod(internalPod); len(errs) != 0 { + if errs := validation.ValidatePod(internalPod, validation.PodValidationOptions{}); len(errs) != 0 { t.Errorf("%s: Expected no validation errors on %#v, Got %v", testCase.desc, pod, errs.ToAggregate()) } } diff --git a/pkg/registry/core/pod/strategy.go b/pkg/registry/core/pod/strategy.go index 917769e400c..2f9e8235685 100644 --- a/pkg/registry/core/pod/strategy.go +++ b/pkg/registry/core/pod/strategy.go @@ -88,7 +88,11 @@ func (podStrategy) PrepareForUpdate(ctx context.Context, obj, old runtime.Object // Validate validates a new pod. func (podStrategy) Validate(ctx context.Context, obj runtime.Object) field.ErrorList { pod := obj.(*api.Pod) - allErrs := validation.ValidatePodCreate(pod) + opts := validation.PodValidationOptions{ + // Allow multiple huge pages on pod create if feature is enabled + AllowMultipleHugePageResources: utilfeature.DefaultFeatureGate.Enabled(features.HugePageStorageMediumSize), + } + allErrs := validation.ValidatePodCreate(pod, opts) allErrs = append(allErrs, validation.ValidateConditionalPod(pod, nil, field.NewPath(""))...) return allErrs } @@ -104,8 +108,13 @@ func (podStrategy) AllowCreateOnUpdate() bool { // ValidateUpdate is the default update validation for an end user. func (podStrategy) ValidateUpdate(ctx context.Context, obj, old runtime.Object) field.ErrorList { - errorList := validation.ValidatePod(obj.(*api.Pod)) - errorList = append(errorList, validation.ValidatePodUpdate(obj.(*api.Pod), old.(*api.Pod))...) 
+ oldFailsSingleHugepagesValidation := len(validation.ValidatePodSingleHugePageResources(old.(*api.Pod), field.NewPath("spec"))) > 0 + opts := validation.PodValidationOptions{ + // Allow multiple huge pages on pod update if feature is enabled or if the old pod already has multiple hugepages specified + AllowMultipleHugePageResources: oldFailsSingleHugepagesValidation || utilfeature.DefaultFeatureGate.Enabled(features.HugePageStorageMediumSize), + } + errorList := validation.ValidatePod(obj.(*api.Pod), opts) + errorList = append(errorList, validation.ValidatePodUpdate(obj.(*api.Pod), old.(*api.Pod), opts)...) errorList = append(errorList, validation.ValidateConditionalPod(obj.(*api.Pod), old.(*api.Pod), field.NewPath(""))...) return errorList } diff --git a/pkg/volume/emptydir/BUILD b/pkg/volume/emptydir/BUILD index c4a58fda540..b220a50107e 100644 --- a/pkg/volume/emptydir/BUILD +++ b/pkg/volume/emptydir/BUILD @@ -44,6 +44,7 @@ go_test( embed = [":go_default_library"], deps = select({ "@io_bazel_rules_go//go/platform:android": [ + "//pkg/features:go_default_library", "//pkg/volume:go_default_library", "//pkg/volume/testing:go_default_library", "//pkg/volume/util:go_default_library", @@ -51,10 +52,13 @@ go_test( "//staging/src/k8s.io/apimachinery/pkg/api/resource:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/types:go_default_library", + "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library", "//staging/src/k8s.io/client-go/util/testing:go_default_library", + "//staging/src/k8s.io/component-base/featuregate/testing:go_default_library", "//vendor/k8s.io/utils/mount:go_default_library", ], "@io_bazel_rules_go//go/platform:linux": [ + "//pkg/features:go_default_library", "//pkg/volume:go_default_library", "//pkg/volume/testing:go_default_library", "//pkg/volume/util:go_default_library", @@ -62,7 +66,9 @@ go_test(
"//staging/src/k8s.io/apimachinery/pkg/api/resource:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/types:go_default_library", + "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library", "//staging/src/k8s.io/client-go/util/testing:go_default_library", + "//staging/src/k8s.io/component-base/featuregate/testing:go_default_library", "//vendor/k8s.io/utils/mount:go_default_library", ], "//conditions:default": [], diff --git a/pkg/volume/emptydir/empty_dir.go b/pkg/volume/emptydir/empty_dir.go index a890a34802c..1102262f766 100644 --- a/pkg/volume/emptydir/empty_dir.go +++ b/pkg/volume/emptydir/empty_dir.go @@ -216,12 +216,12 @@ func (ed *emptyDir) SetUpAt(dir string, mounterArgs volume.MounterArgs) error { } } - switch ed.medium { - case v1.StorageMediumDefault: + switch { + case ed.medium == v1.StorageMediumDefault: err = ed.setupDir(dir) - case v1.StorageMediumMemory: + case ed.medium == v1.StorageMediumMemory: err = ed.setupTmpfs(dir) - case v1.StorageMediumHugePages: + case v1helper.IsHugePageMedium(ed.medium): err = ed.setupHugepages(dir) default: err = fmt.Errorf("unknown storage medium %q", ed.medium) @@ -290,11 +290,11 @@ func (ed *emptyDir) setupHugepages(dir string) error { } // If the directory is a mountpoint with medium hugepages, there is no // work to do since we are already in the desired state. 
- if isMnt && medium == v1.StorageMediumHugePages { + if isMnt && v1helper.IsHugePageMedium(medium) { return nil } - pageSizeMountOption, err := getPageSizeMountOptionFromPod(ed.pod) + pageSizeMountOption, err := getPageSizeMountOption(ed.medium, ed.pod) if err != nil { return err } @@ -303,33 +303,52 @@ func (ed *emptyDir) setupHugepages(dir string) error { return ed.mounter.Mount("nodev", dir, "hugetlbfs", []string{pageSizeMountOption}) } -// getPageSizeMountOptionFromPod retrieves pageSize mount option from Pod's resources -// and validates pageSize options in all containers of given Pod. -func getPageSizeMountOptionFromPod(pod *v1.Pod) (string, error) { +// getPageSizeMountOption retrieves pageSize mount option from Pod's resources +// and medium and validates pageSize options in all containers of given Pod. +func getPageSizeMountOption(medium v1.StorageMedium, pod *v1.Pod) (string, error) { pageSizeFound := false pageSize := resource.Quantity{} - // In some rare cases init containers can also consume Huge pages. - containers := append(pod.Spec.Containers, pod.Spec.InitContainers...) - for _, container := range containers { + + var mediumPageSize resource.Quantity + if medium != v1.StorageMediumHugePages { + // medium is: Hugepages-<size> + var err error + mediumPageSize, err = v1helper.HugePageSizeFromMedium(medium) + if err != nil { + return "", err + } + } + + // In some rare cases init containers can also consume Huge pages + for _, container := range append(pod.Spec.Containers, pod.Spec.InitContainers...) { // We can take request because limit and requests must match. for requestName := range container.Resources.Requests { - if v1helper.IsHugePageResourceName(requestName) { - currentPageSize, err := v1helper.HugePageSizeFromResourceName(requestName) - if err != nil { - return "", err - } - // PageSize for all volumes in a POD are equal, except for the first one discovered.
+ if !v1helper.IsHugePageResourceName(requestName) { + continue + } + currentPageSize, err := v1helper.HugePageSizeFromResourceName(requestName) + if err != nil { + return "", err + } + if medium == v1.StorageMediumHugePages { // medium is: Hugepages, size is not specified + // PageSize for all volumes in a POD must be equal if medium is "Hugepages" if pageSizeFound && pageSize.Cmp(currentPageSize) != 0 { - return "", fmt.Errorf("multiple pageSizes for huge pages in a single PodSpec") + return "", fmt.Errorf("medium: %s can't be used if container requests multiple huge page sizes", medium) } - pageSize = currentPageSize + pageSizeFound = true + pageSize = currentPageSize + } else { // medium is: Hugepages-<size> + if currentPageSize.Cmp(mediumPageSize) == 0 { + pageSizeFound = true + pageSize = currentPageSize + } } } } if !pageSizeFound { - return "", fmt.Errorf("hugePages storage requested, but there is no resource request for huge pages") + return "", fmt.Errorf("medium %s: hugePages storage requested, but there is no resource request for huge pages", medium) } return fmt.Sprintf("%s=%s", hugePagesPageSizeMountOption, pageSize.String()), nil diff --git a/staging/src/k8s.io/api/core/v1/types.go b/staging/src/k8s.io/api/core/v1/types.go index fb350175dc8..ecc177e7189 100644 --- a/staging/src/k8s.io/api/core/v1/types.go +++ b/staging/src/k8s.io/api/core/v1/types.go @@ -887,9 +887,10 @@ type FlockerVolumeSource struct { type StorageMedium string const ( - StorageMediumDefault StorageMedium = "" // use whatever the default is for the node, assume anything we don't explicitly handle is this - StorageMediumMemory StorageMedium = "Memory" // use memory (e.g. tmpfs on linux) - StorageMediumHugePages StorageMedium = "HugePages" // use hugepages + StorageMediumDefault StorageMedium = "" // use whatever the default is for the node, assume anything we don't explicitly handle is this + StorageMediumMemory StorageMedium = "Memory" // use memory (e.g.
tmpfs on linux) + StorageMediumHugePages StorageMedium = "HugePages" // use hugepages + StorageMediumHugePagesPrefix StorageMedium = "HugePages-" // prefix for full medium notation HugePages-<size> ) // Protocol defines network protocols supported for things like container ports.