diff --git a/pkg/api/pod/util.go b/pkg/api/pod/util.go index 847b3abd15b..0694bc2e925 100644 --- a/pkg/api/pod/util.go +++ b/pkg/api/pod/util.go @@ -345,29 +345,34 @@ func usesMultipleHugePageResources(podSpec *api.PodSpec) bool { return len(hugePageResources) > 1 } -// GetValidationOptionsFromPodSpec returns validation options based on pod specs -func GetValidationOptionsFromPodSpec(podSpec, oldPodSpec *api.PodSpec) apivalidation.PodValidationOptions { +// GetValidationOptionsFromPodSpecAndMeta returns validation options based on pod specs and metadata +func GetValidationOptionsFromPodSpecAndMeta(podSpec, oldPodSpec *api.PodSpec, podMeta, oldPodMeta *metav1.ObjectMeta) apivalidation.PodValidationOptions { // default pod validation options based on feature gate opts := validation.PodValidationOptions{ // Allow multiple huge pages on pod create if feature is enabled AllowMultipleHugePageResources: utilfeature.DefaultFeatureGate.Enabled(features.HugePageStorageMediumSize), // Allow pod spec to use hugepages in downward API if feature is enabled - AllowDownwardAPIHugePages: utilfeature.DefaultFeatureGate.Enabled(features.DownwardAPIHugePages), + AllowDownwardAPIHugePages: utilfeature.DefaultFeatureGate.Enabled(features.DownwardAPIHugePages), + AllowInvalidPodDeletionCost: !utilfeature.DefaultFeatureGate.Enabled(features.PodDeletionCost), } - // if we are not doing an update operation, just return with default options - if oldPodSpec == nil { - return opts + + if oldPodSpec != nil { + // if old spec used multiple huge page sizes, we must allow it + opts.AllowMultipleHugePageResources = opts.AllowMultipleHugePageResources || usesMultipleHugePageResources(oldPodSpec) + // if old spec used hugepages in downward api, we must allow it + opts.AllowDownwardAPIHugePages = opts.AllowDownwardAPIHugePages || usesHugePagesInProjectedVolume(oldPodSpec) + // determine if any container is using hugepages in env var + if !opts.AllowDownwardAPIHugePages { + VisitContainers(oldPodSpec, AllContainers, func(c *api.Container, containerType ContainerType) bool { + opts.AllowDownwardAPIHugePages = opts.AllowDownwardAPIHugePages || usesHugePagesInProjectedEnv(*c) + return !opts.AllowDownwardAPIHugePages + }) + } } - // if old spec used multiple huge page sizes, we must allow it - opts.AllowMultipleHugePageResources = opts.AllowMultipleHugePageResources || usesMultipleHugePageResources(oldPodSpec) - // if old spec used hugepages in downward api, we must allow it - opts.AllowDownwardAPIHugePages = opts.AllowDownwardAPIHugePages || usesHugePagesInProjectedVolume(oldPodSpec) - // determine if any container is using hugepages in env var - if !opts.AllowDownwardAPIHugePages { - VisitContainers(oldPodSpec, AllContainers, func(c *api.Container, containerType ContainerType) bool { - opts.AllowDownwardAPIHugePages = opts.AllowDownwardAPIHugePages || usesHugePagesInProjectedEnv(*c) - return !opts.AllowDownwardAPIHugePages - }) + if oldPodMeta != nil && !opts.AllowInvalidPodDeletionCost { + // This is an update, so validate only if the existing object was valid. + _, err := helper.GetDeletionCostFromPodAnnotations(oldPodMeta.Annotations) + opts.AllowInvalidPodDeletionCost = err != nil } return opts } @@ -375,15 +380,18 @@ func GetValidationOptionsFromPodSpec(podSpec, oldPodSpec *api.PodSpec) apivalida // GetValidationOptionsFromPodTemplate will return pod validation options for specified template. func GetValidationOptionsFromPodTemplate(podTemplate, oldPodTemplate *api.PodTemplateSpec) apivalidation.PodValidationOptions { var newPodSpec, oldPodSpec *api.PodSpec + var newPodMeta, oldPodMeta *metav1.ObjectMeta // we have to be careful about nil pointers here // replication controller in particular is prone to passing nil if podTemplate != nil { newPodSpec = &podTemplate.Spec + newPodMeta = &podTemplate.ObjectMeta } if oldPodTemplate != nil { oldPodSpec = &oldPodTemplate.Spec + oldPodMeta = &oldPodTemplate.ObjectMeta } - return GetValidationOptionsFromPodSpec(newPodSpec, oldPodSpec) + return GetValidationOptionsFromPodSpecAndMeta(newPodSpec, oldPodSpec, newPodMeta, oldPodMeta) } // DropDisabledTemplateFields removes disabled fields from the pod template metadata and spec. diff --git a/pkg/api/pod/util_test.go b/pkg/api/pod/util_test.go index b3373e963a8..80385b9a1f1 100644 --- a/pkg/api/pod/util_test.go +++ b/pkg/api/pod/util_test.go @@ -32,6 +32,7 @@ import ( "k8s.io/apimachinery/pkg/util/validation/field" utilfeature "k8s.io/apiserver/pkg/util/feature" featuregatetesting "k8s.io/component-base/featuregate/testing" + "k8s.io/kubernetes/pkg/apis/core" api "k8s.io/kubernetes/pkg/apis/core" "k8s.io/kubernetes/pkg/features" ) @@ -1328,3 +1329,53 @@ func TestDropEphemeralContainers(t *testing.T) { } } } + +func TestValidatePodDeletionCostOption(t *testing.T) { + testCases := []struct { + name string + oldPodMeta *metav1.ObjectMeta + featureEnabled bool + wantAllowInvalidPodDeletionCost bool + }{ + { + name: "CreateFeatureEnabled", + featureEnabled: true, + wantAllowInvalidPodDeletionCost: false, + }, + { + name: "CreateFeatureDisabled", + featureEnabled: false, + wantAllowInvalidPodDeletionCost: true, + }, + { + name: "UpdateFeatureDisabled", + oldPodMeta: &metav1.ObjectMeta{Annotations: map[string]string{core.PodDeletionCost: "100"}}, + featureEnabled: false, + wantAllowInvalidPodDeletionCost: true, + }, + { + name: "UpdateFeatureEnabledValidOldValue", + oldPodMeta: &metav1.ObjectMeta{Annotations: map[string]string{core.PodDeletionCost: "100"}}, + featureEnabled: true, + wantAllowInvalidPodDeletionCost: false, + }, + { + name: "UpdateFeatureEnabledValidOldValue", + oldPodMeta: &metav1.ObjectMeta{Annotations: map[string]string{core.PodDeletionCost: "invalid-value"}}, + featureEnabled: true, + wantAllowInvalidPodDeletionCost: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.PodDeletionCost, tc.featureEnabled)() + // The new pod doesn't impact the outcome. + gotOptions := GetValidationOptionsFromPodSpecAndMeta(nil, nil, nil, tc.oldPodMeta) + if tc.wantAllowInvalidPodDeletionCost != gotOptions.AllowInvalidPodDeletionCost { + t.Errorf("unexpected diff, want: %v, got: %v", tc.wantAllowInvalidPodDeletionCost, gotOptions.AllowInvalidPodDeletionCost) + } + + }) + } +} diff --git a/pkg/api/testing/backward_compatibility_test.go b/pkg/api/testing/backward_compatibility_test.go index c9cfb888a39..94f6f9d5fff 100644 --- a/pkg/api/testing/backward_compatibility_test.go +++ b/pkg/api/testing/backward_compatibility_test.go @@ -159,7 +159,7 @@ func TestCompatibility_v1_PodSecurityContext(t *testing.T) { } validator := func(obj runtime.Object) field.ErrorList { - opts := podutil.GetValidationOptionsFromPodSpec(&(obj.(*api.Pod).Spec), nil) + opts := podutil.GetValidationOptionsFromPodSpecAndMeta(&(obj.(*api.Pod).Spec), nil, &(obj.(*api.Pod).ObjectMeta), nil) return validation.ValidatePodSpec(&(obj.(*api.Pod).Spec), &(obj.(*api.Pod).ObjectMeta), field.NewPath("spec"), opts) } diff --git a/pkg/apis/apps/validation/validation.go b/pkg/apis/apps/validation/validation.go index e1705bb75f8..4699e85b98f 100644 --- a/pkg/apis/apps/validation/validation.go +++ b/pkg/apis/apps/validation/validation.go @@ -46,7 +46,7 @@ func ValidateStatefulSetName(name string, prefix bool) []string { } // ValidatePodTemplateSpecForStatefulSet validates the given template and ensures that it is in accordance with the desired selector. -func ValidatePodTemplateSpecForStatefulSet(template *api.PodTemplateSpec, selector labels.Selector, fldPath *field.Path) field.ErrorList { +func ValidatePodTemplateSpecForStatefulSet(template *api.PodTemplateSpec, selector labels.Selector, fldPath *field.Path, opts apivalidation.PodValidationOptions) field.ErrorList { allErrs := field.ErrorList{} if template == nil { allErrs = append(allErrs, field.Required(fldPath, "")) @@ -64,13 +64,13 @@ func ValidatePodTemplateSpecForStatefulSet(template *api.PodTemplateSpec, select // allErrs = append(allErrs, apivalidation.ValidatePodTemplateSpec(template, fldPath)...) allErrs = append(allErrs, unversionedvalidation.ValidateLabels(template.Labels, fldPath.Child("labels"))...) allErrs = append(allErrs, apivalidation.ValidateAnnotations(template.Annotations, fldPath.Child("annotations"))...) - allErrs = append(allErrs, apivalidation.ValidatePodSpecificAnnotations(template.Annotations, &template.Spec, fldPath.Child("annotations"))...) + allErrs = append(allErrs, apivalidation.ValidatePodSpecificAnnotations(template.Annotations, &template.Spec, fldPath.Child("annotations"), opts)...) } return allErrs } // ValidateStatefulSetSpec tests if required fields in the StatefulSet spec are set. -func ValidateStatefulSetSpec(spec *apps.StatefulSetSpec, fldPath *field.Path) field.ErrorList { +func ValidateStatefulSetSpec(spec *apps.StatefulSetSpec, fldPath *field.Path, opts apivalidation.PodValidationOptions) field.ErrorList { allErrs := field.ErrorList{} switch spec.PodManagementPolicy { @@ -122,7 +122,7 @@ func ValidateStatefulSetSpec(spec *apps.StatefulSetSpec, fldPath *field.Path) fi if err != nil { allErrs = append(allErrs, field.Invalid(fldPath.Child("selector"), spec.Selector, "")) } else { - allErrs = append(allErrs, ValidatePodTemplateSpecForStatefulSet(&spec.Template, selector, fldPath.Child("template"))...) + allErrs = append(allErrs, ValidatePodTemplateSpecForStatefulSet(&spec.Template, selector, fldPath.Child("template"), opts)...) } if spec.Template.Spec.RestartPolicy != api.RestartPolicyAlways { @@ -136,9 +136,9 @@ func ValidateStatefulSetSpec(spec *apps.StatefulSetSpec, fldPath *field.Path) fi } // ValidateStatefulSet validates a StatefulSet. -func ValidateStatefulSet(statefulSet *apps.StatefulSet) field.ErrorList { +func ValidateStatefulSet(statefulSet *apps.StatefulSet, opts apivalidation.PodValidationOptions) field.ErrorList { allErrs := apivalidation.ValidateObjectMeta(&statefulSet.ObjectMeta, true, ValidateStatefulSetName, field.NewPath("metadata")) - allErrs = append(allErrs, ValidateStatefulSetSpec(&statefulSet.Spec, field.NewPath("spec"))...) + allErrs = append(allErrs, ValidateStatefulSetSpec(&statefulSet.Spec, field.NewPath("spec"), opts)...) return allErrs } diff --git a/pkg/apis/apps/validation/validation_test.go b/pkg/apis/apps/validation/validation_test.go index ee6f650917b..96c441d2052 100644 --- a/pkg/apis/apps/validation/validation_test.go +++ b/pkg/apis/apps/validation/validation_test.go @@ -131,7 +131,7 @@ func TestValidateStatefulSet(t *testing.T) { for i, successCase := range successCases { t.Run("success case "+strconv.Itoa(i), func(t *testing.T) { - if errs := ValidateStatefulSet(&successCase); len(errs) != 0 { + if errs := ValidateStatefulSet(&successCase, corevalidation.PodValidationOptions{}); len(errs) != 0 { t.Errorf("expected success: %v", errs) } }) @@ -356,7 +356,7 @@ func TestValidateStatefulSet(t *testing.T) { for k, v := range errorCases { t.Run(k, func(t *testing.T) { - errs := ValidateStatefulSet(&v) + errs := ValidateStatefulSet(&v, corevalidation.PodValidationOptions{}) if len(errs) == 0 { t.Errorf("expected failure for %s", k) } diff --git a/pkg/apis/core/annotation_key_constants.go b/pkg/apis/core/annotation_key_constants.go index df0c15fd69c..f7194fb2168 100644 --- a/pkg/apis/core/annotation_key_constants.go +++ b/pkg/apis/core/annotation_key_constants.go @@ -106,4 +106,14 @@ const ( // This annotation is used by the Attach Detach Controller to determine whether to use the in-tree or // CSI Backend for a volume plugin on a specific node. MigratedPluginsAnnotationKey = "storage.alpha.kubernetes.io/migrated-plugins" + + // PodDeletionCost can be used to set to an int32 that represent the cost of deleting + // a pod compared to other pods belonging to the same ReplicaSet. Pods with lower + // deletion cost are preferred to be deleted before pods with higher deletion cost. + // Note that this is honored on a best-effort basis, and so it does not offer guarantees on + // pod deletion order. + // The implicit deletion cost for pods that don't set the annotation is 0, negative values are permitted. + // + // This annotation is alpha-level and is only honored when PodDeletionCost feature is enabled. + PodDeletionCost = "controller.kubernetes.io/pod-deletion-cost" ) diff --git a/pkg/apis/core/helper/helpers.go b/pkg/apis/core/helper/helpers.go index adf5f28eca4..4cbe3100bad 100644 --- a/pkg/apis/core/helper/helpers.go +++ b/pkg/apis/core/helper/helpers.go @@ -19,6 +19,7 @@ package helper import ( "encoding/json" "fmt" + "strconv" "strings" "k8s.io/apimachinery/pkg/api/resource" @@ -532,3 +533,29 @@ func ToPodResourcesSet(podSpec *core.PodSpec) sets.String { } return result } + +// GetDeletionCostFromPodAnnotations returns the integer value of pod-deletion-cost. Returns 0 +// if not set or the value is invalid. +func GetDeletionCostFromPodAnnotations(annotations map[string]string) (int32, error) { + if value, exist := annotations[core.PodDeletionCost]; exist { + // values that start with plus sign (e.g, "+10") or leading zeros (e.g., "008") are not valid. + if !validFirstDigit(value) { + return 0, fmt.Errorf("invalid value %q", value) + } + + i, err := strconv.ParseInt(value, 10, 32) + if err != nil { + // make sure we default to 0 on error. + return 0, err + } + return int32(i), nil + } + return 0, nil +} + +func validFirstDigit(str string) bool { + if len(str) == 0 { + return false + } + return str[0] == '-' || (str[0] == '0' && str == "0") || (str[0] >= '1' && str[0] <= '9') +} diff --git a/pkg/apis/core/validation/validation.go b/pkg/apis/core/validation/validation.go index 5d5289daaf7..9f0409b116f 100644 --- a/pkg/apis/core/validation/validation.go +++ b/pkg/apis/core/validation/validation.go @@ -131,7 +131,7 @@ func ValidateDNS1123Subdomain(value string, fldPath *field.Path) field.ErrorList return allErrs } -func ValidatePodSpecificAnnotations(annotations map[string]string, spec *core.PodSpec, fldPath *field.Path) field.ErrorList { +func ValidatePodSpecificAnnotations(annotations map[string]string, spec *core.PodSpec, fldPath *field.Path, opts PodValidationOptions) field.ErrorList { allErrs := field.ErrorList{} if value, isMirror := annotations[core.MirrorPodAnnotationKey]; isMirror { @@ -144,6 +144,12 @@ func ValidatePodSpecificAnnotations(annotations map[string]string, spec *core.Po allErrs = append(allErrs, ValidateTolerationsInPodAnnotations(annotations, fldPath)...) } + if !opts.AllowInvalidPodDeletionCost { + if _, err := helper.GetDeletionCostFromPodAnnotations(annotations); err != nil { + allErrs = append(allErrs, field.Invalid(fldPath.Key(core.PodDeletionCost), annotations[core.PodDeletionCost], "must be a 32bit integer")) + } + } + allErrs = append(allErrs, ValidateSeccompPodAnnotations(annotations, fldPath)...) allErrs = append(allErrs, ValidateAppArmorPodAnnotations(annotations, spec, fldPath)...) @@ -167,7 +173,7 @@ func ValidateTolerationsInPodAnnotations(annotations map[string]string, fldPath return allErrs } -func ValidatePodSpecificAnnotationUpdates(newPod, oldPod *core.Pod, fldPath *field.Path) field.ErrorList { +func ValidatePodSpecificAnnotationUpdates(newPod, oldPod *core.Pod, fldPath *field.Path, opts PodValidationOptions) field.ErrorList { allErrs := field.ErrorList{} newAnnotations := newPod.Annotations oldAnnotations := oldPod.Annotations @@ -194,7 +200,7 @@ func ValidatePodSpecificAnnotationUpdates(newPod, oldPod *core.Pod, fldPath *fie allErrs = append(allErrs, field.Forbidden(fldPath.Key(k), "may not add mirror pod annotation")) } } - allErrs = append(allErrs, ValidatePodSpecificAnnotations(newAnnotations, &newPod.Spec, fldPath)...) + allErrs = append(allErrs, ValidatePodSpecificAnnotations(newAnnotations, &newPod.Spec, fldPath, opts)...) return allErrs } @@ -3185,6 +3191,8 @@ type PodValidationOptions struct { AllowMultipleHugePageResources bool // Allow pod spec to use hugepages in downward API AllowDownwardAPIHugePages bool + // Allow invalid pod-deletion-cost annotation value for backward compatibility. + AllowInvalidPodDeletionCost bool } // ValidatePodSingleHugePageResources checks if there are multiple huge @@ -3209,7 +3217,7 @@ func ValidatePodSingleHugePageResources(pod *core.Pod, specPath *field.Path) fie func validatePodMetadataAndSpec(pod *core.Pod, opts PodValidationOptions) field.ErrorList { fldPath := field.NewPath("metadata") allErrs := ValidateObjectMeta(&pod.ObjectMeta, true, ValidatePodName, fldPath) - allErrs = append(allErrs, ValidatePodSpecificAnnotations(pod.ObjectMeta.Annotations, &pod.Spec, fldPath.Child("annotations"))...) + allErrs = append(allErrs, ValidatePodSpecificAnnotations(pod.ObjectMeta.Annotations, &pod.Spec, fldPath.Child("annotations"), opts)...) allErrs = append(allErrs, ValidatePodSpec(&pod.Spec, &pod.ObjectMeta, field.NewPath("spec"), opts)...) // we do additional validation only pertinent for pods and not pod templates @@ -3928,7 +3936,7 @@ func ValidatePodUpdate(newPod, oldPod *core.Pod, opts PodValidationOptions) fiel fldPath := field.NewPath("metadata") allErrs := ValidateObjectMetaUpdate(&newPod.ObjectMeta, &oldPod.ObjectMeta, fldPath) allErrs = append(allErrs, validatePodMetadataAndSpec(newPod, opts)...) - allErrs = append(allErrs, ValidatePodSpecificAnnotationUpdates(newPod, oldPod, fldPath.Child("annotations"))...) + allErrs = append(allErrs, ValidatePodSpecificAnnotationUpdates(newPod, oldPod, fldPath.Child("annotations"), opts)...) specPath := field.NewPath("spec") if !opts.AllowMultipleHugePageResources { @@ -4035,10 +4043,10 @@ func ValidateContainerStateTransition(newStatuses, oldStatuses []core.ContainerS // ValidatePodStatusUpdate tests to see if the update is legal for an end user to make. newPod is updated with fields // that cannot be changed. -func ValidatePodStatusUpdate(newPod, oldPod *core.Pod) field.ErrorList { +func ValidatePodStatusUpdate(newPod, oldPod *core.Pod, opts PodValidationOptions) field.ErrorList { fldPath := field.NewPath("metadata") allErrs := ValidateObjectMetaUpdate(&newPod.ObjectMeta, &oldPod.ObjectMeta, fldPath) - allErrs = append(allErrs, ValidatePodSpecificAnnotationUpdates(newPod, oldPod, fldPath.Child("annotations"))...) + allErrs = append(allErrs, ValidatePodSpecificAnnotationUpdates(newPod, oldPod, fldPath.Child("annotations"), opts)...) allErrs = append(allErrs, validatePodConditions(newPod.Status.Conditions, fldPath.Child("conditions"))...) fldPath = field.NewPath("status") @@ -4573,7 +4581,7 @@ func ValidatePodTemplateSpec(spec *core.PodTemplateSpec, fldPath *field.Path, op allErrs := field.ErrorList{} allErrs = append(allErrs, unversionedvalidation.ValidateLabels(spec.Labels, fldPath.Child("labels"))...) allErrs = append(allErrs, ValidateAnnotations(spec.Annotations, fldPath.Child("annotations"))...) - allErrs = append(allErrs, ValidatePodSpecificAnnotations(spec.Annotations, &spec.Spec, fldPath.Child("annotations"))...) + allErrs = append(allErrs, ValidatePodSpecificAnnotations(spec.Annotations, &spec.Spec, fldPath.Child("annotations"), opts)...) allErrs = append(allErrs, ValidatePodSpec(&spec.Spec, nil, fldPath.Child("spec"), opts)...) allErrs = append(allErrs, validateSeccompAnnotationsAndFields(spec.ObjectMeta, &spec.Spec, fldPath.Child("spec"))...) diff --git a/pkg/apis/core/validation/validation_test.go b/pkg/apis/core/validation/validation_test.go index f80ce27dee8..c8109cf404e 100644 --- a/pkg/apis/core/validation/validation_test.go +++ b/pkg/apis/core/validation/validation_test.go @@ -7838,6 +7838,22 @@ func TestValidatePod(t *testing.T) { DNSPolicy: core.DNSClusterFirst, }, }, + "negative pod-deletion-cost": { + ObjectMeta: metav1.ObjectMeta{Name: "123", Namespace: "ns", Annotations: map[string]string{core.PodDeletionCost: "-100"}}, + Spec: core.PodSpec{ + Containers: []core.Container{{Name: "ctr", Image: "image", ImagePullPolicy: "IfNotPresent", TerminationMessagePolicy: "File"}}, + RestartPolicy: core.RestartPolicyAlways, + DNSPolicy: core.DNSClusterFirst, + }, + }, + "positive pod-deletion-cost": { + ObjectMeta: metav1.ObjectMeta{Name: "123", Namespace: "ns", Annotations: map[string]string{core.PodDeletionCost: "100"}}, + Spec: core.PodSpec{ + Containers: []core.Container{{Name: "ctr", Image: "image", ImagePullPolicy: "IfNotPresent", TerminationMessagePolicy: "File"}}, + RestartPolicy: core.RestartPolicyAlways, + DNSPolicy: core.DNSClusterFirst, + }, + }, } for k, v := range successCases { t.Run(k, func(t *testing.T) { @@ -8764,10 +8780,43 @@ func TestValidatePod(t *testing.T) { }, }, }, + "invalid pod-deletion-cost": { + expectedError: "metadata.annotations[controller.kubernetes.io/pod-deletion-cost]: Invalid value: \"text\": must be a 32bit integer", + spec: core.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "123", Namespace: "ns", Annotations: map[string]string{core.PodDeletionCost: "text"}}, + Spec: core.PodSpec{ + Containers: []core.Container{{Name: "ctr", Image: "image", ImagePullPolicy: "IfNotPresent", TerminationMessagePolicy: "File"}}, + RestartPolicy: core.RestartPolicyAlways, + DNSPolicy: core.DNSClusterFirst, + }, + }, + }, + "invalid leading zeros pod-deletion-cost": { + expectedError: "metadata.annotations[controller.kubernetes.io/pod-deletion-cost]: Invalid value: \"008\": must be a 32bit integer", + spec: core.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "123", Namespace: "ns", Annotations: map[string]string{core.PodDeletionCost: "008"}}, + Spec: core.PodSpec{ + Containers: []core.Container{{Name: "ctr", Image: "image", ImagePullPolicy: "IfNotPresent", TerminationMessagePolicy: "File"}}, + RestartPolicy: core.RestartPolicyAlways, + DNSPolicy: core.DNSClusterFirst, + }, + }, + }, + "invalid leading plus sign pod-deletion-cost": { + expectedError: "metadata.annotations[controller.kubernetes.io/pod-deletion-cost]: Invalid value: \"+10\": must be a 32bit integer", + spec: core.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "123", Namespace: "ns", Annotations: map[string]string{core.PodDeletionCost: "+10"}}, + Spec: core.PodSpec{ + Containers: []core.Container{{Name: "ctr", Image: "image", ImagePullPolicy: "IfNotPresent", TerminationMessagePolicy: "File"}}, + RestartPolicy: core.RestartPolicyAlways, + DNSPolicy: core.DNSClusterFirst, + }, + }, + }, } for k, v := range errorCases { t.Run(k, func(t *testing.T) { - if errs := ValidatePodCreate(&v.spec, PodValidationOptions{}); len(errs) == 0 { + if errs := ValidatePodCreate(&v.spec, PodValidationOptions{AllowInvalidPodDeletionCost: false}); len(errs) == 0 { t.Errorf("expected failure") } else if v.expectedError == "" { t.Errorf("missing expectedError, got %q", errs.ToAggregate().Error()) @@ -9711,7 +9760,7 @@ func TestValidatePodStatusUpdate(t *testing.T) { for _, test := range tests { test.new.ObjectMeta.ResourceVersion = "1" test.old.ObjectMeta.ResourceVersion = "1" - errs := ValidatePodStatusUpdate(&test.new, &test.old) + errs := ValidatePodStatusUpdate(&test.new, &test.old, PodValidationOptions{}) if test.err == "" { if len(errs) != 0 { t.Errorf("unexpected invalid: %s (%+v)\nA: %+v\nB: %+v", test.test, errs, test.new, test.old) @@ -16350,7 +16399,7 @@ func TestPodIPsValidation(t *testing.T) { oldPod.ResourceVersion = "1" oldPod.Name = newPod.Name - errs := ValidatePodStatusUpdate(newPod, oldPod) + errs := ValidatePodStatusUpdate(newPod, oldPod, PodValidationOptions{}) if oldTestCase.expectError { // The old pod was invalid, tolerate invalid IPs in the new pod as well if len(errs) > 0 { diff --git a/pkg/controller/controller_utils.go b/pkg/controller/controller_utils.go index 1e22da40876..8ca76a2b6f2 100644 --- a/pkg/controller/controller_utils.go +++ b/pkg/controller/controller_utils.go @@ -39,13 +39,16 @@ import ( "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/strategicpatch" "k8s.io/apimachinery/pkg/util/wait" + utilfeature "k8s.io/apiserver/pkg/util/feature" clientset "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/record" clientretry "k8s.io/client-go/util/retry" podutil "k8s.io/kubernetes/pkg/api/v1/pod" + "k8s.io/kubernetes/pkg/apis/core/helper" _ "k8s.io/kubernetes/pkg/apis/core/install" "k8s.io/kubernetes/pkg/apis/core/validation" + "k8s.io/kubernetes/pkg/features" hashutil "k8s.io/kubernetes/pkg/util/hash" taintutils "k8s.io/kubernetes/pkg/util/taints" "k8s.io/utils/integer" @@ -791,15 +794,17 @@ func (s ActivePods) Less(i, j int) bool { // is unknown, and a pod whose phase is unknown comes before a running pod. // 3. If exactly one of the pods is ready, the pod that is not ready comes // before the ready pod. -// 4. If the pods' ranks differ, the pod with greater rank comes before the pod +// 4. If controller.kubernetes.io/pod-deletion-cost annotation is set, then +// the pod with the lower value will come first. +// 5. If the pods' ranks differ, the pod with greater rank comes before the pod // with lower rank. -// 5. If both pods are ready but have not been ready for the same amount of +// 6. If both pods are ready but have not been ready for the same amount of // time, the pod that has been ready for a shorter amount of time comes // before the pod that has been ready for longer. -// 6. If one pod has a container that has restarted more than any container in +// 7. If one pod has a container that has restarted more than any container in // the other pod, the pod with the container with more restarts comes // before the other pod. -// 7. If the pods' creation times differ, the pod that was created more recently +// 8. If the pods' creation times differ, the pod that was created more recently // comes before the older pod. // // If none of these rules matches, the second pod comes before the first pod. @@ -842,7 +847,17 @@ func (s ActivePodsWithRanks) Less(i, j int) bool { if podutil.IsPodReady(s.Pods[i]) != podutil.IsPodReady(s.Pods[j]) { return !podutil.IsPodReady(s.Pods[i]) } - // 4. Doubled up < not doubled up + + // 4. higher pod-deletion-cost < lower pod-deletion cost + if utilfeature.DefaultFeatureGate.Enabled(features.PodDeletionCost) { + pi, _ := helper.GetDeletionCostFromPodAnnotations(s.Pods[i].Annotations) + pj, _ := helper.GetDeletionCostFromPodAnnotations(s.Pods[j].Annotations) + if pi != pj { + return pi < pj + } + } + + // 5. Doubled up < not doubled up // If one of the two pods is on the same node as one or more additional // ready pods that belong to the same replicaset, whichever pod has more // colocated ready pods is less @@ -851,7 +866,7 @@ func (s ActivePodsWithRanks) Less(i, j int) bool { } // TODO: take availability into account when we push minReadySeconds information from deployment into pods, // see https://github.com/kubernetes/kubernetes/issues/22065 - // 5. Been ready for empty time < less time < more time + // 6. Been ready for empty time < less time < more time // If both pods are ready, the latest ready one is smaller if podutil.IsPodReady(s.Pods[i]) && podutil.IsPodReady(s.Pods[j]) { readyTime1 := podReadyTime(s.Pods[i]) @@ -860,11 +875,11 @@ func (s ActivePodsWithRanks) Less(i, j int) bool { return afterOrZero(readyTime1, readyTime2) } } - // 6. Pods with containers with higher restart counts < lower restart counts + // 7. Pods with containers with higher restart counts < lower restart counts if maxContainerRestarts(s.Pods[i]) != maxContainerRestarts(s.Pods[j]) { return maxContainerRestarts(s.Pods[i]) > maxContainerRestarts(s.Pods[j]) } - // 7. Empty creation time pods < newer pods < older pods + // 8. Empty creation time pods < newer pods < older pods if !s.Pods[i].CreationTimestamp.Equal(&s.Pods[j].CreationTimestamp) { return afterOrZero(&s.Pods[i].CreationTimestamp, &s.Pods[j].CreationTimestamp) } diff --git a/pkg/controller/controller_utils_test.go b/pkg/controller/controller_utils_test.go index 625695f3aaa..4f0d563644e 100644 --- a/pkg/controller/controller_utils_test.go +++ b/pkg/controller/controller_utils_test.go @@ -38,6 +38,7 @@ import ( "k8s.io/apimachinery/pkg/util/clock" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/uuid" + utilfeature "k8s.io/apiserver/pkg/util/feature" clientset "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/fake" clientscheme "k8s.io/client-go/kubernetes/scheme" @@ -45,8 +46,11 @@ import ( "k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/record" utiltesting "k8s.io/client-go/util/testing" + featuregatetesting "k8s.io/component-base/featuregate/testing" + "k8s.io/kubernetes/pkg/apis/core" _ "k8s.io/kubernetes/pkg/apis/core/install" "k8s.io/kubernetes/pkg/controller/testutil" + "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/securitycontext" "github.com/stretchr/testify/assert" @@ -435,7 +439,7 @@ func TestSortingActivePodsWithRanks(t *testing.T) { now := metav1.Now() then := metav1.Time{Time: now.AddDate(0, -1, 0)} zeroTime := metav1.Time{} - pod := func(podName, nodeName string, phase v1.PodPhase, ready bool, restarts int32, readySince metav1.Time, created metav1.Time) *v1.Pod { + pod := func(podName, nodeName string, phase v1.PodPhase, ready bool, restarts int32, readySince metav1.Time, created metav1.Time, annotations map[string]string) *v1.Pod { var conditions []v1.PodCondition var containerStatuses []v1.ContainerStatus if ready { @@ -446,6 +450,7 @@ func TestSortingActivePodsWithRanks(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ CreationTimestamp: created, Name: podName, + Annotations: annotations, }, Spec: v1.PodSpec{NodeName: nodeName}, Status: v1.PodStatus{ @@ -456,15 +461,17 @@ func TestSortingActivePodsWithRanks(t *testing.T) { } } var ( - unscheduledPod = pod("unscheduled", "", v1.PodPending, false, 0, zeroTime, zeroTime) - scheduledPendingPod = pod("pending", "node", v1.PodPending, false, 0, zeroTime, zeroTime) - unknownPhasePod = pod("unknown-phase", "node", v1.PodUnknown, false, 0, zeroTime, zeroTime) - runningNotReadyPod = pod("not-ready", "node", v1.PodRunning, false, 0, zeroTime, zeroTime) - runningReadyNoLastTransitionTimePod = pod("ready-no-last-transition-time", "node", v1.PodRunning, true, 0, zeroTime, zeroTime) - runningReadyNow = pod("ready-now", "node", v1.PodRunning, true, 0, now, now) - runningReadyThen = pod("ready-then", "node", v1.PodRunning, true, 0, then, then) - runningReadyNowHighRestarts = pod("ready-high-restarts", "node", v1.PodRunning, true, 9001, now, now) - runningReadyNowCreatedThen = pod("ready-now-created-then", "node", v1.PodRunning, true, 0, now, then) + unscheduledPod = pod("unscheduled", "", v1.PodPending, false, 0, zeroTime, zeroTime, nil) + scheduledPendingPod = pod("pending", "node", v1.PodPending, false, 0, zeroTime, zeroTime, nil) + unknownPhasePod = pod("unknown-phase", "node", v1.PodUnknown, false, 0, zeroTime, zeroTime, nil) + runningNotReadyPod = pod("not-ready", "node", v1.PodRunning, false, 0, zeroTime, zeroTime, nil) + runningReadyNoLastTransitionTimePod = pod("ready-no-last-transition-time", "node", v1.PodRunning, true, 0, zeroTime, zeroTime, nil) + runningReadyNow = pod("ready-now", "node", v1.PodRunning, true, 0, now, now, nil) + runningReadyThen = pod("ready-then", "node", v1.PodRunning, true, 0, then, then, nil) + runningReadyNowHighRestarts = pod("ready-high-restarts", "node", v1.PodRunning, true, 9001, now, now, nil) + runningReadyNowCreatedThen = pod("ready-now-created-then", "node", v1.PodRunning, true, 0, now, then, nil) + lowPodDeletionCost = pod("low-deletion-cost", "node", v1.PodRunning, true, 0, now, then, map[string]string{core.PodDeletionCost: "10"}) + highPodDeletionCost = pod("high-deletion-cost", "node", v1.PodRunning, true, 0, now, then, map[string]string{core.PodDeletionCost: "100"}) ) equalityTests := []*v1.Pod{ unscheduledPod, @@ -491,33 +498,40 @@ func TestSortingActivePodsWithRanks(t *testing.T) { rank int } inequalityTests := []struct { - lesser, greater podWithRank + lesser, greater podWithRank + disablePodDeletioncost bool }{ - {podWithRank{unscheduledPod, 1}, podWithRank{scheduledPendingPod, 2}}, - {podWithRank{unscheduledPod, 2}, podWithRank{scheduledPendingPod, 1}}, - {podWithRank{scheduledPendingPod, 1}, podWithRank{unknownPhasePod, 2}}, - {podWithRank{unknownPhasePod, 1}, podWithRank{runningNotReadyPod, 2}}, - {podWithRank{runningNotReadyPod, 1}, podWithRank{runningReadyNoLastTransitionTimePod, 1}}, - {podWithRank{runningReadyNoLastTransitionTimePod, 1}, podWithRank{runningReadyNow, 1}}, - {podWithRank{runningReadyNow, 2}, podWithRank{runningReadyNoLastTransitionTimePod, 1}}, - {podWithRank{runningReadyNow, 1}, podWithRank{runningReadyThen, 1}}, - {podWithRank{runningReadyNow, 2}, podWithRank{runningReadyThen, 1}}, - {podWithRank{runningReadyNowHighRestarts, 1}, podWithRank{runningReadyNow, 1}}, - {podWithRank{runningReadyNow, 2}, podWithRank{runningReadyNowHighRestarts, 1}}, - {podWithRank{runningReadyNow, 1}, podWithRank{runningReadyNowCreatedThen, 1}}, - {podWithRank{runningReadyNowCreatedThen, 2}, podWithRank{runningReadyNow, 1}}, + {lesser: podWithRank{unscheduledPod, 1}, greater: podWithRank{scheduledPendingPod, 2}}, + {lesser: podWithRank{unscheduledPod, 2}, greater: podWithRank{scheduledPendingPod, 1}}, + {lesser: podWithRank{scheduledPendingPod, 1}, greater: podWithRank{unknownPhasePod, 2}}, + {lesser: podWithRank{unknownPhasePod, 1}, greater: podWithRank{runningNotReadyPod, 2}}, + {lesser: podWithRank{runningNotReadyPod, 1}, greater: podWithRank{runningReadyNoLastTransitionTimePod, 1}}, + {lesser: podWithRank{runningReadyNoLastTransitionTimePod, 1}, greater: podWithRank{runningReadyNow, 1}}, + {lesser: podWithRank{runningReadyNow, 2}, greater: podWithRank{runningReadyNoLastTransitionTimePod, 1}}, + {lesser: podWithRank{runningReadyNow, 1}, greater: podWithRank{runningReadyThen, 1}}, + {lesser: podWithRank{runningReadyNow, 2}, greater: podWithRank{runningReadyThen, 1}}, + {lesser: podWithRank{runningReadyNowHighRestarts, 1}, greater: podWithRank{runningReadyNow, 1}}, + {lesser: podWithRank{runningReadyNow, 2}, greater: podWithRank{runningReadyNowHighRestarts, 1}}, + {lesser: podWithRank{runningReadyNow, 1}, greater: podWithRank{runningReadyNowCreatedThen, 1}}, + {lesser: podWithRank{runningReadyNowCreatedThen, 2}, greater: podWithRank{runningReadyNow, 1}}, + {lesser: podWithRank{lowPodDeletionCost, 2}, greater: podWithRank{highPodDeletionCost, 1}}, + {lesser: podWithRank{highPodDeletionCost, 2}, greater: podWithRank{lowPodDeletionCost, 1}, disablePodDeletioncost: true}, } - for _, test := range inequalityTests { - podsWithRanks := ActivePodsWithRanks{ - Pods: []*v1.Pod{test.lesser.pod, test.greater.pod}, - Rank: []int{test.lesser.rank, test.greater.rank}, - } - if !podsWithRanks.Less(0, 1) { - t.Errorf("expected pod %q with rank %v to be less than %q with rank %v", podsWithRanks.Pods[0].Name, podsWithRanks.Rank[0], podsWithRanks.Pods[1].Name, podsWithRanks.Rank[1]) - } - if podsWithRanks.Less(1, 0) { - t.Errorf("expected pod %q with rank %v not to be less than %v with rank %v", podsWithRanks.Pods[1].Name, podsWithRanks.Rank[1], podsWithRanks.Pods[0].Name, podsWithRanks.Rank[0]) - } + for i, test := range inequalityTests { + t.Run(fmt.Sprintf("test%d", i), func(t *testing.T) { + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.PodDeletionCost, !test.disablePodDeletioncost)() + + podsWithRanks := ActivePodsWithRanks{ + Pods: []*v1.Pod{test.lesser.pod, test.greater.pod}, + Rank: []int{test.lesser.rank, test.greater.rank}, + } + if !podsWithRanks.Less(0, 1) { + t.Errorf("expected pod %q with rank %v to be less than %q with rank %v", podsWithRanks.Pods[0].Name, podsWithRanks.Rank[0], podsWithRanks.Pods[1].Name, podsWithRanks.Rank[1]) + } + if podsWithRanks.Less(1, 0) { + t.Errorf("expected pod %q with rank %v not to be less than %v with rank %v", podsWithRanks.Pods[1].Name, podsWithRanks.Rank[1], podsWithRanks.Pods[0].Name, podsWithRanks.Rank[0]) + } + }) } } diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index b4adfb2e536..f5bb0f2f4c7 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -663,6 +663,12 @@ const ( // // Enables the usage of different protocols in the same Service with type=LoadBalancer MixedProtocolLBService featuregate.Feature = "MixedProtocolLBService" + + // owner: @ahg-g + // alpha: v1.21 + // + // Enables controlling pod ranking on replicaset scale-down. + PodDeletionCost featuregate.Feature = "PodDeletionCost" ) func init() { @@ -763,6 +769,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS MixedProtocolLBService: {Default: false, PreRelease: featuregate.Alpha}, PreferNominatedNode: {Default: false, PreRelease: featuregate.Alpha}, RunAsGroup: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // remove in 1.22 + PodDeletionCost: {Default: false, PreRelease: featuregate.Alpha}, // inherited features from generic apiserver, relisted here to get a conflict if it is changed // unintentionally on either side: diff --git a/pkg/registry/apps/statefulset/strategy.go b/pkg/registry/apps/statefulset/strategy.go index 4bca4d693eb..a1f27417fb3 100644 --- a/pkg/registry/apps/statefulset/strategy.go +++ b/pkg/registry/apps/statefulset/strategy.go @@ -96,7 +96,8 @@ func (statefulSetStrategy) PrepareForUpdate(ctx context.Context, obj, old runtim // Validate validates a new StatefulSet. func (statefulSetStrategy) Validate(ctx context.Context, obj runtime.Object) field.ErrorList { statefulSet := obj.(*apps.StatefulSet) - return validation.ValidateStatefulSet(statefulSet) + opts := pod.GetValidationOptionsFromPodTemplate(&statefulSet.Spec.Template, nil) + return validation.ValidateStatefulSet(statefulSet, opts) } // Canonicalize normalizes the object after validation. @@ -112,7 +113,9 @@ func (statefulSetStrategy) AllowCreateOnUpdate() bool { func (statefulSetStrategy) ValidateUpdate(ctx context.Context, obj, old runtime.Object) field.ErrorList { newStatefulSet := obj.(*apps.StatefulSet) oldStatefulSet := old.(*apps.StatefulSet) - validationErrorList := validation.ValidateStatefulSet(newStatefulSet) + + opts := pod.GetValidationOptionsFromPodTemplate(&newStatefulSet.Spec.Template, &oldStatefulSet.Spec.Template) + validationErrorList := validation.ValidateStatefulSet(newStatefulSet, opts) updateErrorList := validation.ValidateStatefulSetUpdate(newStatefulSet, oldStatefulSet) return append(validationErrorList, updateErrorList...) } diff --git a/pkg/registry/core/pod/strategy.go b/pkg/registry/core/pod/strategy.go index 703e9542e5b..566a4af5231 100644 --- a/pkg/registry/core/pod/strategy.go +++ b/pkg/registry/core/pod/strategy.go @@ -91,7 +91,7 @@ func (podStrategy) PrepareForUpdate(ctx context.Context, obj, old runtime.Object // Validate validates a new pod. func (podStrategy) Validate(ctx context.Context, obj runtime.Object) field.ErrorList { pod := obj.(*api.Pod) - opts := podutil.GetValidationOptionsFromPodSpec(&pod.Spec, nil) + opts := podutil.GetValidationOptionsFromPodSpecAndMeta(&pod.Spec, nil, &pod.ObjectMeta, nil) return validation.ValidatePodCreate(pod, opts) } @@ -109,7 +109,7 @@ func (podStrategy) ValidateUpdate(ctx context.Context, obj, old runtime.Object) // Allow downward api usage of hugepages on pod update if feature is enabled or if the old pod already had used them. pod := obj.(*api.Pod) oldPod := old.(*api.Pod) - opts := podutil.GetValidationOptionsFromPodSpec(&pod.Spec, &oldPod.Spec) + opts := podutil.GetValidationOptionsFromPodSpecAndMeta(&pod.Spec, &oldPod.Spec, &pod.ObjectMeta, &oldPod.ObjectMeta) return validation.ValidatePodUpdate(obj.(*api.Pod), old.(*api.Pod), opts) } @@ -167,7 +167,11 @@ func (podStatusStrategy) PrepareForUpdate(ctx context.Context, obj, old runtime. } func (podStatusStrategy) ValidateUpdate(ctx context.Context, obj, old runtime.Object) field.ErrorList { - return validation.ValidatePodStatusUpdate(obj.(*api.Pod), old.(*api.Pod)) + pod := obj.(*api.Pod) + oldPod := old.(*api.Pod) + opts := podutil.GetValidationOptionsFromPodSpecAndMeta(&pod.Spec, &oldPod.Spec, &pod.ObjectMeta, &oldPod.ObjectMeta) + + return validation.ValidatePodStatusUpdate(obj.(*api.Pod), old.(*api.Pod), opts) } type podEphemeralContainersStrategy struct { @@ -180,7 +184,7 @@ var EphemeralContainersStrategy = podEphemeralContainersStrategy{Strategy} func (podEphemeralContainersStrategy) ValidateUpdate(ctx context.Context, obj, old runtime.Object) field.ErrorList { newPod := obj.(*api.Pod) oldPod := old.(*api.Pod) - opts := podutil.GetValidationOptionsFromPodSpec(&newPod.Spec, &oldPod.Spec) + opts := podutil.GetValidationOptionsFromPodSpecAndMeta(&newPod.Spec, &oldPod.Spec, &newPod.ObjectMeta, &oldPod.ObjectMeta) return validation.ValidatePodEphemeralContainersUpdate(newPod, oldPod, opts) } diff --git a/staging/src/k8s.io/api/core/v1/annotation_key_constants.go b/staging/src/k8s.io/api/core/v1/annotation_key_constants.go index d3ebf862836..b3902b6d16f 100644 --- a/staging/src/k8s.io/api/core/v1/annotation_key_constants.go +++ b/staging/src/k8s.io/api/core/v1/annotation_key_constants.go @@ -128,4 +128,14 @@ const ( // This annotation is used by the Attach Detach Controller to determine whether to use the in-tree or // CSI Backend for a volume plugin on a specific node. MigratedPluginsAnnotationKey = "storage.alpha.kubernetes.io/migrated-plugins" + + // PodDeletionCost can be used to set to an int32 that represent the cost of deleting + // a pod compared to other pods belonging to the same ReplicaSet. Pods with lower + // deletion cost are preferred to be deleted before pods with higher deletion cost. + // Note that this is honored on a best-effort basis, and so it does not offer guarantees on + // pod deletion order. + // The implicit deletion cost for pods that don't set the annotation is 0, negative values are permitted. + // + // This annotation is alpha-level and is only honored when PodDeletionCost feature is enabled. + PodDeletionCost = "controller.kubernetes.io/pod-deletion-cost" )