From 49da505a9a7b2bcf56571c28d90e70a06633b64a Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Fri, 26 Apr 2019 10:08:52 -0700 Subject: [PATCH] EvenPodsSpread: api changes --- pkg/api/pod/util.go | 12 +++ pkg/apis/core/types.go | 68 ++++++++++++++ pkg/apis/core/validation/validation.go | 76 ++++++++++++++++ pkg/apis/core/validation/validation_test.go | 98 ++++++++++++++++++++- pkg/features/kube_features.go | 7 ++ staging/src/k8s.io/api/core/v1/types.go | 73 +++++++++++++++ 6 files changed, 333 insertions(+), 1 deletion(-) diff --git a/pkg/api/pod/util.go b/pkg/api/pod/util.go index 35417fec68e..70c10d4f775 100644 --- a/pkg/api/pod/util.go +++ b/pkg/api/pod/util.go @@ -401,6 +401,11 @@ func dropDisabledFields( // does not specify any values for these fields. podSpec.PreemptionPolicy = nil } + + if !utilfeature.DefaultFeatureGate.Enabled(features.EvenPodsSpread) && !topologySpreadConstraintsInUse(oldPodSpec) { + // Set TopologySpreadConstraints to nil only if feature is disabled and it is not used + podSpec.TopologySpreadConstraints = nil + } } // dropDisabledRunAsGroupField removes disabled fields from PodSpec related @@ -562,7 +567,14 @@ func overheadInUse(podSpec *api.PodSpec) bool { return true } return false +} +// topologySpreadConstraintsInUse returns true if the pod spec is non-nil and has a TopologySpreadConstraints slice +func topologySpreadConstraintsInUse(podSpec *api.PodSpec) bool { + if podSpec == nil { + return false + } + return len(podSpec.TopologySpreadConstraints) > 0 } // procMountInUse returns true if the pod spec is non-nil and has a SecurityContext's ProcMount field set to a non-default value diff --git a/pkg/apis/core/types.go b/pkg/apis/core/types.go index 70e4e710842..634642c5162 100644 --- a/pkg/apis/core/types.go +++ b/pkg/apis/core/types.go @@ -2715,6 +2715,13 @@ type PodSpec struct { // If not specified, the default is true. // +optional EnableServiceLinks *bool + // TopologySpreadConstraints describes how a group of pods ought to spread across topology + // domains. Scheduler will schedule pods in a way which abides by the constraints. + // This field is alpha-level and is only honored by clusters that enables the EvenPodsSpread + // feature. + // All topologySpreadConstraints are ANDed. + // +optional + TopologySpreadConstraints []TopologySpreadConstraint } // HostAlias holds the mapping between IP and hostnames that will be injected as an entry in the @@ -4834,3 +4841,64 @@ const ( // DefaultHardPodAffinityWeight defines the weight of the implicit PreferredDuringScheduling affinity rule. DefaultHardPodAffinitySymmetricWeight int32 = 1 ) + +type UnsatisfiableConstraintAction string + +const ( + // DoNotSchedule instructs the scheduler not to schedule the pod + // when constraints are not satisfied. + DoNotSchedule UnsatisfiableConstraintAction = "DoNotSchedule" + // ScheduleAnyway instructs the scheduler to schedule the pod + // even if constraints are not satisfied. + ScheduleAnyway UnsatisfiableConstraintAction = "ScheduleAnyway" +) + +// TopologySpreadConstraint specifies how to spread matching pods among the given topology. +type TopologySpreadConstraint struct { + // MaxSkew describes the degree to which pods may be unevenly distributed. + // It's the maximum permitted difference between the number of matching pods in + // any two topology domains of a given topology type. 
+ // For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same + // labelSelector spread as 1/1/0: + // +-------+-------+-------+ + // | zone1 | zone2 | zone3 | + // +-------+-------+-------+ + // | P | P | | + // +-------+-------+-------+ + // - if MaxSkew is 1, incoming pod can only be scheduled to zone3 to become 1/1/1; + // scheduling it onto zone1(zone2) would make the ActualSkew(2-0) on zone1(zone2) + // violate MaxSkew(1). + // - if MaxSkew is 2, incoming pod can be scheduled onto any zone. + // It's a required field. Default value is 1 and 0 is not allowed. + MaxSkew int32 + // TopologyKey is the key of node labels. Nodes that have a label with this key + // and identical values are considered to be in the same topology. + // We consider each as a "bucket", and try to put balanced number + // of pods into each bucket. + // It's a required field. + TopologyKey string + // WhenUnsatisfiable indicates how to deal with a pod if it doesn't satisfy + // the spread constraint. + // - DoNotSchedule (default) tells the scheduler not to schedule it + // - ScheduleAnyway tells the scheduler to still schedule it + // It's considered as "Unsatisfiable" if and only if placing incoming pod on any + // topology violates "MaxSkew". + // For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same + // labelSelector spread as 3/1/1: + // +-------+-------+-------+ + // | zone1 | zone2 | zone3 | + // +-------+-------+-------+ + // | P P P | P | P | + // +-------+-------+-------+ + // If WhenUnsatisfiable is set to DoNotSchedule, incoming pod can only be scheduled + // to zone2(zone3) to become 3/2/1(3/1/2) as ActualSkew(2-1) on zone2(zone3) satisfies + // MaxSkew(1). In other words, the cluster can still be imbalanced, but scheduler + // won't make it *more* imbalanced. + // It's a required field. + WhenUnsatisfiable UnsatisfiableConstraintAction + // LabelSelector is used to find matching pods. + // Pods that match this label selector are counted to determine the number of pods + // in their corresponding topology domain. + // +optional + LabelSelector *metav1.LabelSelector +} diff --git a/pkg/apis/core/validation/validation.go b/pkg/apis/core/validation/validation.go index afc92112093..e0e5cda3a8f 100644 --- a/pkg/apis/core/validation/validation.go +++ b/pkg/apis/core/validation/validation.go @@ -3091,6 +3091,7 @@ func ValidatePodSpec(spec *core.PodSpec, fldPath *field.Path) field.ErrorList { allErrs = append(allErrs, validateAffinity(spec.Affinity, fldPath.Child("affinity"))...) allErrs = append(allErrs, validatePodDNSConfig(spec.DNSConfig, &spec.DNSPolicy, fldPath.Child("dnsConfig"))...) allErrs = append(allErrs, validateReadinessGates(spec.ReadinessGates, fldPath.Child("readinessGates"))...) + allErrs = append(allErrs, validateTopologySpreadConstraints(spec.TopologySpreadConstraints, fldPath.Child("topologySpreadConstraints"))...) 
if len(spec.ServiceAccountName) > 0 { for _, msg := range ValidateServiceAccountName(spec.ServiceAccountName, false) { allErrs = append(allErrs, field.Invalid(fldPath.Child("serviceAccountName"), spec.ServiceAccountName, msg)) @@ -5561,3 +5562,78 @@ func ValidateProcMountType(fldPath *field.Path, procMountType core.ProcMountType return field.NotSupported(fldPath, procMountType, []string{string(core.DefaultProcMount), string(core.UnmaskedProcMount)}) } } + +var ( + supportedScheduleActions = sets.NewString(string(core.DoNotSchedule), string(core.ScheduleAnyway)) +) + +type spreadConstraintPair struct { + topologyKey string + whenUnsatisfiable core.UnsatisfiableConstraintAction +} + +// validateTopologySpreadConstraints validates given TopologySpreadConstraints. +func validateTopologySpreadConstraints(constraints []core.TopologySpreadConstraint, fldPath *field.Path) field.ErrorList { + allErrs := field.ErrorList{} + + var existingConstraintPairs []spreadConstraintPair + for i, constraint := range constraints { + subFldPath := fldPath.Index(i) + if err := ValidateMaxSkew(subFldPath.Child("maxSkew"), constraint.MaxSkew); err != nil { + allErrs = append(allErrs, err) + } + if err := ValidateTopologyKey(subFldPath.Child("topologyKey"), constraint.TopologyKey); err != nil { + allErrs = append(allErrs, err) + } + if err := ValidateWhenUnsatisfiable(subFldPath.Child("whenUnsatisfiable"), constraint.WhenUnsatisfiable); err != nil { + allErrs = append(allErrs, err) + } + // tuple {topologyKey, whenUnsatisfiable} denotes one kind of spread constraint + pair := spreadConstraintPair{ + topologyKey: constraint.TopologyKey, + whenUnsatisfiable: constraint.WhenUnsatisfiable, + } + if err := ValidateSpreadConstraintPair(subFldPath.Child("{topologyKey, whenUnsatisfiable}"), pair, existingConstraintPairs); err != nil { + allErrs = append(allErrs, err) + } else { + existingConstraintPairs = append(existingConstraintPairs, pair) + } + } + + return allErrs +} + +// ValidateMaxSkew tests that the argument is a valid MaxSkew. +func ValidateMaxSkew(fldPath *field.Path, maxSkew int32) *field.Error { + if maxSkew <= 0 { + return field.Invalid(fldPath, maxSkew, isNotPositiveErrorMsg) + } + return nil +} + +// ValidateTopologyKey tests that the argument is a valid TopologyKey. +func ValidateTopologyKey(fldPath *field.Path, topologyKey string) *field.Error { + if len(topologyKey) == 0 { + return field.Required(fldPath, "can not be empty") + } + return nil +} + +// ValidateWhenUnsatisfiable tests that the argument is a valid UnsatisfiableConstraintAction. +func ValidateWhenUnsatisfiable(fldPath *field.Path, action core.UnsatisfiableConstraintAction) *field.Error { + if !supportedScheduleActions.Has(string(action)) { + return field.NotSupported(fldPath, action, supportedScheduleActions.List()) + } + return nil +} + +// ValidateSpreadConstraintPair tests that if `pair` exists in `existingConstraintPairs`. 
+func ValidateSpreadConstraintPair(fldPath *field.Path, pair spreadConstraintPair, existingConstraintPairs []spreadConstraintPair) *field.Error { + for _, existingPair := range existingConstraintPairs { + if pair.topologyKey == existingPair.topologyKey && + pair.whenUnsatisfiable == existingPair.whenUnsatisfiable { + return field.Duplicate(fldPath, pair) + } + } + return nil +} diff --git a/pkg/apis/core/validation/validation_test.go b/pkg/apis/core/validation/validation_test.go index b544631af59..2e226a68d18 100644 --- a/pkg/apis/core/validation/validation_test.go +++ b/pkg/apis/core/validation/validation_test.go @@ -13663,7 +13663,6 @@ func testDataSourceInSpec(name string, kind string, apiGroup string) *core.Persi } func TestAlphaVolumePVCDataSource(t *testing.T) { - testCases := []struct { testName string claimSpec core.PersistentVolumeClaimSpec @@ -13704,7 +13703,104 @@ func TestAlphaVolumePVCDataSource(t *testing.T) { if errs := ValidatePersistentVolumeClaimSpec(&tc.claimSpec, field.NewPath("spec")); len(errs) != 0 { t.Errorf("expected success: %v", errs) } + } + } +} +func TestValidateTopologySpreadConstraints(t *testing.T) { + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.EvenPodsSpread, true)() + testCases := []struct { + name string + constraints []core.TopologySpreadConstraint + errtype field.ErrorType + errfield string + }{ + { + name: "all required fields ok", + constraints: []core.TopologySpreadConstraint{ + {MaxSkew: 1, TopologyKey: "k8s.io/zone", WhenUnsatisfiable: core.DoNotSchedule}, + }, + }, + { + name: "missing MaxSkew", + constraints: []core.TopologySpreadConstraint{ + {TopologyKey: "k8s.io/zone", WhenUnsatisfiable: core.DoNotSchedule}, + }, + errtype: field.ErrorTypeInvalid, + errfield: "maxSkew", + }, + { + name: "invalid MaxSkew", + constraints: []core.TopologySpreadConstraint{ + {MaxSkew: 0, TopologyKey: "k8s.io/zone", WhenUnsatisfiable: core.DoNotSchedule}, + }, + errtype: field.ErrorTypeInvalid, + errfield: "maxSkew", + }, + { + name: "missing TopologyKey", + constraints: []core.TopologySpreadConstraint{ + {MaxSkew: 1, WhenUnsatisfiable: core.DoNotSchedule}, + }, + errtype: field.ErrorTypeRequired, + errfield: "topologyKey", + }, + { + name: "missing scheduling mode", + constraints: []core.TopologySpreadConstraint{ + {MaxSkew: 1, TopologyKey: "k8s.io/zone"}, + }, + errtype: field.ErrorTypeNotSupported, + errfield: "whenUnsatisfiable", + }, + { + name: "unsupported scheduling mode", + constraints: []core.TopologySpreadConstraint{ + {MaxSkew: 1, TopologyKey: "k8s.io/zone", WhenUnsatisfiable: core.UnsatisfiableConstraintAction("N/A")}, + }, + errtype: field.ErrorTypeNotSupported, + errfield: "whenUnsatisfiable", + }, + { + name: "multiple constraints ok with all required fields", + constraints: []core.TopologySpreadConstraint{ + {MaxSkew: 1, TopologyKey: "k8s.io/zone", WhenUnsatisfiable: core.DoNotSchedule}, + {MaxSkew: 2, TopologyKey: "k8s.io/node", WhenUnsatisfiable: core.ScheduleAnyway}, + }, + }, + { + name: "multiple constraints missing TopologyKey on partial ones", + constraints: []core.TopologySpreadConstraint{ + {MaxSkew: 1, TopologyKey: "k8s.io/zone", WhenUnsatisfiable: core.DoNotSchedule}, + {MaxSkew: 2, WhenUnsatisfiable: core.ScheduleAnyway}, + }, + errtype: field.ErrorTypeRequired, + errfield: "topologyKey", + }, + { + name: "duplicate constraints", + constraints: []core.TopologySpreadConstraint{ + {MaxSkew: 1, TopologyKey: "k8s.io/zone", WhenUnsatisfiable: core.DoNotSchedule}, + {MaxSkew: 2, TopologyKey: 
"k8s.io/zone", WhenUnsatisfiable: core.DoNotSchedule}, + }, + errtype: field.ErrorTypeDuplicate, + errfield: "{topologyKey, whenUnsatisfiable}", + }, + } + + for i, tc := range testCases { + errs := validateTopologySpreadConstraints(tc.constraints, field.NewPath("field")) + + if len(errs) > 0 && tc.errtype == "" { + t.Errorf("[%d: %q] unexpected error(s): %v", i, tc.name, errs) + } else if len(errs) == 0 && tc.errtype != "" { + t.Errorf("[%d: %q] expected error type %v", i, tc.name, tc.errtype) + } else if len(errs) >= 1 { + if errs[0].Type != tc.errtype { + t.Errorf("[%d: %q] expected error type %v, got %v", i, tc.name, tc.errtype, errs[0].Type) + } else if !strings.HasSuffix(errs[0].Field, "."+tc.errfield) { + t.Errorf("[%d: %q] expected error on field %q, got %q", i, tc.name, tc.errfield, errs[0].Field) + } } } } diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index ad19f4c2600..d7cbf15a407 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -462,6 +462,12 @@ const ( // // Enables ipv6 dual stack IPv6DualStack featuregate.Feature = "IPv6DualStack" + + // owner: @Huang-Wei + // alpha: v1.16 + // + // Schedule pods evenly across available topology domains. + EvenPodsSpread featuregate.Feature = "EvenPodsSpread" ) func init() { @@ -539,6 +545,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS VolumePVCDataSource: {Default: false, PreRelease: featuregate.Alpha}, PodOverhead: {Default: false, PreRelease: featuregate.Alpha}, IPv6DualStack: {Default: false, PreRelease: featuregate.Alpha}, + EvenPodsSpread: {Default: false, PreRelease: featuregate.Alpha}, // inherited features from generic apiserver, relisted here to get a conflict if it is changed // unintentionally on either side: diff --git a/staging/src/k8s.io/api/core/v1/types.go b/staging/src/k8s.io/api/core/v1/types.go index 4a6510930ca..493a9377a60 100644 --- a/staging/src/k8s.io/api/core/v1/types.go +++ b/staging/src/k8s.io/api/core/v1/types.go @@ -3011,6 +3011,79 @@ type PodSpec struct { // This field is alpha-level as of Kubernetes v1.16, and is only honored by servers that enable the PodOverhead feature. // +optional Overhead ResourceList `json:"overhead,omitempty" protobuf:"bytes,32,opt,name=overhead"` + // TopologySpreadConstraints describes how a group of pods ought to spread across topology + // domains. Scheduler will schedule pods in a way which abides by the constraints. + // This field is alpha-level and is only honored by clusters that enables the EvenPodsSpread + // feature. + // All topologySpreadConstraints are ANDed. + // +optional + // +patchMergeKey=topologyKey + // +patchStrategy=merge + // +listType=map + // +listMapKey=topologyKey + // +listMapKey=whenUnsatisfiable + TopologySpreadConstraints []TopologySpreadConstraint `json:"topologySpreadConstraints,omitempty" patchStrategy:"merge" patchMergeKey:"topologyKey" protobuf:"bytes,33,opt,name=topologySpreadConstraints"` +} + +type UnsatisfiableConstraintAction string + +const ( + // DoNotSchedule instructs the scheduler not to schedule the pod + // when constraints are not satisfied. + DoNotSchedule UnsatisfiableConstraintAction = "DoNotSchedule" + // ScheduleAnyway instructs the scheduler to schedule the pod + // even if constraints are not satisfied. + ScheduleAnyway UnsatisfiableConstraintAction = "ScheduleAnyway" +) + +// TopologySpreadConstraint specifies how to spread matching pods among the given topology. 
+type TopologySpreadConstraint struct { + // MaxSkew describes the degree to which pods may be unevenly distributed. + // It's the maximum permitted difference between the number of matching pods in + // any two topology domains of a given topology type. + // For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same + // labelSelector spread as 1/1/0: + // +-------+-------+-------+ + // | zone1 | zone2 | zone3 | + // +-------+-------+-------+ + // | P | P | | + // +-------+-------+-------+ + // - if MaxSkew is 1, incoming pod can only be scheduled to zone3 to become 1/1/1; + // scheduling it onto zone1(zone2) would make the ActualSkew(2-0) on zone1(zone2) + // violate MaxSkew(1). + // - if MaxSkew is 2, incoming pod can be scheduled onto any zone. + // It's a required field. Default value is 1 and 0 is not allowed. + MaxSkew int32 `json:"maxSkew" protobuf:"varint,1,opt,name=maxSkew"` + // TopologyKey is the key of node labels. Nodes that have a label with this key + // and identical values are considered to be in the same topology. + // We consider each as a "bucket", and try to put balanced number + // of pods into each bucket. + // It's a required field. + TopologyKey string `json:"topologyKey" protobuf:"bytes,2,opt,name=topologyKey"` + // WhenUnsatisfiable indicates how to deal with a pod if it doesn't satisfy + // the spread constraint. + // - DoNotSchedule (default) tells the scheduler not to schedule it + // - ScheduleAnyway tells the scheduler to still schedule it + // It's considered as "Unsatisfiable" if and only if placing incoming pod on any + // topology violates "MaxSkew". + // For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same + // labelSelector spread as 3/1/1: + // +-------+-------+-------+ + // | zone1 | zone2 | zone3 | + // +-------+-------+-------+ + // | P P P | P | P | + // +-------+-------+-------+ + // If WhenUnsatisfiable is set to DoNotSchedule, incoming pod can only be scheduled + // to zone2(zone3) to become 3/2/1(3/1/2) as ActualSkew(2-1) on zone2(zone3) satisfies + // MaxSkew(1). In other words, the cluster can still be imbalanced, but scheduler + // won't make it *more* imbalanced. + // It's a required field. + WhenUnsatisfiable UnsatisfiableConstraintAction `json:"whenUnsatisfiable" protobuf:"bytes,3,opt,name=whenUnsatisfiable,casttype=UnsatisfiableConstraintAction"` + // LabelSelector is used to find matching pods. + // Pods that match this label selector are counted to determine the number of pods + // in their corresponding topology domain. + // +optional + LabelSelector *metav1.LabelSelector `json:"labelSelector,omitempty" protobuf:"bytes,4,opt,name=labelSelector"` } const (
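
Editor's note on usage (not part of the patch): a minimal sketch of how a client built against the v1 types added above might populate the new field, assuming the EvenPodsSpread feature gate is enabled on the cluster. The node label key and the pod labels are illustrative placeholders, not values taken from this change.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// Spread pods labeled app=web across zones, allowing at most a skew of 1
	// between the most and least populated zone; refuse to schedule otherwise.
	spec := v1.PodSpec{
		Containers: []v1.Container{{Name: "app", Image: "nginx"}},
		TopologySpreadConstraints: []v1.TopologySpreadConstraint{
			{
				MaxSkew:           1,
				TopologyKey:       "topology.kubernetes.io/zone", // placeholder node label key
				WhenUnsatisfiable: v1.DoNotSchedule,
				LabelSelector: &metav1.LabelSelector{
					MatchLabels: map[string]string{"app": "web"},
				},
			},
		},
	}
	fmt.Printf("constraint: %+v\n", spec.TopologySpreadConstraints[0])
}

A spec like this would pass the validation added in this patch (positive maxSkew, non-empty topologyKey, supported whenUnsatisfiable, no duplicate {topologyKey, whenUnsatisfiable} pair); with the feature gate disabled, dropDisabledFields clears the field unless the old pod spec already used it.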