diff --git a/pkg/scheduler/apis/config/register.go b/pkg/scheduler/apis/config/register.go index 81a65d365df..e66fd001f44 100644 --- a/pkg/scheduler/apis/config/register.go +++ b/pkg/scheduler/apis/config/register.go @@ -39,6 +39,7 @@ func addKnownTypes(scheme *runtime.Scheme) error { scheme.AddKnownTypes(SchemeGroupVersion, &KubeSchedulerConfiguration{}, &Policy{}, + &DefaultPreemptionArgs{}, &InterPodAffinityArgs{}, &NodeLabelArgs{}, &NodeResourcesFitArgs{}, diff --git a/pkg/scheduler/apis/config/scheme/scheme_test.go b/pkg/scheduler/apis/config/scheme/scheme_test.go index dad18b97cb5..c473a9a6647 100644 --- a/pkg/scheduler/apis/config/scheme/scheme_test.go +++ b/pkg/scheduler/apis/config/scheme/scheme_test.go @@ -46,6 +46,10 @@ apiVersion: kubescheduler.config.k8s.io/v1beta1 kind: KubeSchedulerConfiguration profiles: - pluginConfig: + - name: DefaultPreemption + args: + minCandidateNodesPercentage: 50 + minCandidateNodesAbsolute: 500 - name: InterPodAffinity args: hardPodAffinityWeight: 5 @@ -88,6 +92,10 @@ profiles: { SchedulerName: "default-scheduler", PluginConfig: []config.PluginConfig{ + { + Name: "DefaultPreemption", + Args: &config.DefaultPreemptionArgs{MinCandidateNodesPercentage: 50, MinCandidateNodesAbsolute: 500}, + }, { Name: "InterPodAffinity", Args: &config.InterPodAffinityArgs{HardPodAffinityWeight: 5}, @@ -249,6 +257,8 @@ apiVersion: kubescheduler.config.k8s.io/v1beta1 kind: KubeSchedulerConfiguration profiles: - pluginConfig: + - name: DefaultPreemption + args: - name: InterPodAffinity args: - name: NodeResourcesFit @@ -266,6 +276,10 @@ profiles: { SchedulerName: "default-scheduler", PluginConfig: []config.PluginConfig{ + { + Name: "DefaultPreemption", + Args: &config.DefaultPreemptionArgs{MinCandidateNodesPercentage: 10, MinCandidateNodesAbsolute: 100}, + }, { Name: "InterPodAffinity", Args: &config.InterPodAffinityArgs{ diff --git a/pkg/scheduler/apis/config/types_pluginargs.go b/pkg/scheduler/apis/config/types_pluginargs.go index 11bd5b6c9ef..9e75c8ab949 100644 --- a/pkg/scheduler/apis/config/types_pluginargs.go +++ b/pkg/scheduler/apis/config/types_pluginargs.go @@ -23,6 +23,28 @@ import ( // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +// DefaultPreemptionArgs holds arguments used to configure the +// DefaultPreemption plugin. +type DefaultPreemptionArgs struct { + metav1.TypeMeta + + // MinCandidateNodesPercentage is the minimum number of candidates to + // shortlist when dry running preemption as a percentage of number of nodes. + // Must be in the range [0, 100]. Defaults to 10% of the cluster size if + // unspecified. + MinCandidateNodesPercentage int32 + // MinCandidateNodesAbsolute is the absolute minimum number of candidates to + // shortlist. The likely number of candidates enumerated for dry running + // preemption is given by the formula: + // numCandidates = max(numNodes * minCandidateNodesPercentage, minCandidateNodesAbsolute) + // We say "likely" because there are other factors such as PDB violations + // that play a role in the number of candidates shortlisted. Must be at least + // 0 nodes. Defaults to 100 nodes if unspecified. + MinCandidateNodesAbsolute int32 +} + +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object + // InterPodAffinityArgs holds arguments used to configure the InterPodAffinity plugin. 
type InterPodAffinityArgs struct { metav1.TypeMeta diff --git a/pkg/scheduler/apis/config/v1beta1/defaults.go b/pkg/scheduler/apis/config/v1beta1/defaults.go index 18dd9d9b6e2..fe3729cb67c 100644 --- a/pkg/scheduler/apis/config/v1beta1/defaults.go +++ b/pkg/scheduler/apis/config/v1beta1/defaults.go @@ -163,6 +163,15 @@ func SetDefaults_KubeSchedulerConfiguration(obj *v1beta1.KubeSchedulerConfigurat } } +func SetDefaults_DefaultPreemptionArgs(obj *v1beta1.DefaultPreemptionArgs) { + if obj.MinCandidateNodesPercentage == nil { + obj.MinCandidateNodesPercentage = pointer.Int32Ptr(10) + } + if obj.MinCandidateNodesAbsolute == nil { + obj.MinCandidateNodesAbsolute = pointer.Int32Ptr(100) + } +} + func SetDefaults_InterPodAffinityArgs(obj *v1beta1.InterPodAffinityArgs) { // Note that an object is created manually in cmd/kube-scheduler/app/options/deprecated.go // DeprecatedOptions#ApplyTo. diff --git a/pkg/scheduler/apis/config/v1beta1/defaults_test.go b/pkg/scheduler/apis/config/v1beta1/defaults_test.go index effc2a5ada3..e902f52a539 100644 --- a/pkg/scheduler/apis/config/v1beta1/defaults_test.go +++ b/pkg/scheduler/apis/config/v1beta1/defaults_test.go @@ -312,6 +312,24 @@ func TestPluginArgsDefaults(t *testing.T) { in runtime.Object want runtime.Object }{ + { + name: "DefaultPreemptionArgs empty", + in: &v1beta1.DefaultPreemptionArgs{}, + want: &v1beta1.DefaultPreemptionArgs{ + MinCandidateNodesPercentage: pointer.Int32Ptr(10), + MinCandidateNodesAbsolute: pointer.Int32Ptr(100), + }, + }, + { + name: "DefaultPreemptionArgs with value", + in: &v1beta1.DefaultPreemptionArgs{ + MinCandidateNodesPercentage: pointer.Int32Ptr(50), + }, + want: &v1beta1.DefaultPreemptionArgs{ + MinCandidateNodesPercentage: pointer.Int32Ptr(50), + MinCandidateNodesAbsolute: pointer.Int32Ptr(100), + }, + }, { name: "InterPodAffinityArgs empty", in: &v1beta1.InterPodAffinityArgs{}, diff --git a/pkg/scheduler/apis/config/v1beta1/zz_generated.conversion.go b/pkg/scheduler/apis/config/v1beta1/zz_generated.conversion.go index 46840442166..bd254af06ab 100644 --- a/pkg/scheduler/apis/config/v1beta1/zz_generated.conversion.go +++ b/pkg/scheduler/apis/config/v1beta1/zz_generated.conversion.go @@ -24,11 +24,11 @@ import ( unsafe "unsafe" corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" conversion "k8s.io/apimachinery/pkg/conversion" runtime "k8s.io/apimachinery/pkg/runtime" v1alpha1 "k8s.io/component-base/config/v1alpha1" - v1 "k8s.io/kube-scheduler/config/v1" + configv1 "k8s.io/kube-scheduler/config/v1" v1beta1 "k8s.io/kube-scheduler/config/v1beta1" config "k8s.io/kubernetes/pkg/scheduler/apis/config" ) @@ -40,6 +40,16 @@ func init() { // RegisterConversions adds conversion functions to the given scheme. // Public to allow building arbitrary schemes. 
func RegisterConversions(s *runtime.Scheme) error { + if err := s.AddGeneratedConversionFunc((*v1beta1.DefaultPreemptionArgs)(nil), (*config.DefaultPreemptionArgs)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_v1beta1_DefaultPreemptionArgs_To_config_DefaultPreemptionArgs(a.(*v1beta1.DefaultPreemptionArgs), b.(*config.DefaultPreemptionArgs), scope) + }); err != nil { + return err + } + if err := s.AddGeneratedConversionFunc((*config.DefaultPreemptionArgs)(nil), (*v1beta1.DefaultPreemptionArgs)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_config_DefaultPreemptionArgs_To_v1beta1_DefaultPreemptionArgs(a.(*config.DefaultPreemptionArgs), b.(*v1beta1.DefaultPreemptionArgs), scope) + }); err != nil { + return err + } if err := s.AddGeneratedConversionFunc((*v1beta1.Extender)(nil), (*config.Extender)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1beta1_Extender_To_config_Extender(a.(*v1beta1.Extender), b.(*config.Extender), scope) }); err != nil { @@ -223,6 +233,36 @@ func RegisterConversions(s *runtime.Scheme) error { return nil } +func autoConvert_v1beta1_DefaultPreemptionArgs_To_config_DefaultPreemptionArgs(in *v1beta1.DefaultPreemptionArgs, out *config.DefaultPreemptionArgs, s conversion.Scope) error { + if err := v1.Convert_Pointer_int32_To_int32(&in.MinCandidateNodesPercentage, &out.MinCandidateNodesPercentage, s); err != nil { + return err + } + if err := v1.Convert_Pointer_int32_To_int32(&in.MinCandidateNodesAbsolute, &out.MinCandidateNodesAbsolute, s); err != nil { + return err + } + return nil +} + +// Convert_v1beta1_DefaultPreemptionArgs_To_config_DefaultPreemptionArgs is an autogenerated conversion function. +func Convert_v1beta1_DefaultPreemptionArgs_To_config_DefaultPreemptionArgs(in *v1beta1.DefaultPreemptionArgs, out *config.DefaultPreemptionArgs, s conversion.Scope) error { + return autoConvert_v1beta1_DefaultPreemptionArgs_To_config_DefaultPreemptionArgs(in, out, s) +} + +func autoConvert_config_DefaultPreemptionArgs_To_v1beta1_DefaultPreemptionArgs(in *config.DefaultPreemptionArgs, out *v1beta1.DefaultPreemptionArgs, s conversion.Scope) error { + if err := v1.Convert_int32_To_Pointer_int32(&in.MinCandidateNodesPercentage, &out.MinCandidateNodesPercentage, s); err != nil { + return err + } + if err := v1.Convert_int32_To_Pointer_int32(&in.MinCandidateNodesAbsolute, &out.MinCandidateNodesAbsolute, s); err != nil { + return err + } + return nil +} + +// Convert_config_DefaultPreemptionArgs_To_v1beta1_DefaultPreemptionArgs is an autogenerated conversion function. 
+func Convert_config_DefaultPreemptionArgs_To_v1beta1_DefaultPreemptionArgs(in *config.DefaultPreemptionArgs, out *v1beta1.DefaultPreemptionArgs, s conversion.Scope) error { + return autoConvert_config_DefaultPreemptionArgs_To_v1beta1_DefaultPreemptionArgs(in, out, s) +} + func autoConvert_v1beta1_Extender_To_config_Extender(in *v1beta1.Extender, out *config.Extender, s conversion.Scope) error { out.URLPrefix = in.URLPrefix out.FilterVerb = in.FilterVerb @@ -252,10 +292,10 @@ func autoConvert_config_Extender_To_v1beta1_Extender(in *config.Extender, out *v out.Weight = in.Weight out.BindVerb = in.BindVerb out.EnableHTTPS = in.EnableHTTPS - out.TLSConfig = (*v1.ExtenderTLSConfig)(unsafe.Pointer(in.TLSConfig)) + out.TLSConfig = (*configv1.ExtenderTLSConfig)(unsafe.Pointer(in.TLSConfig)) out.HTTPTimeout = in.HTTPTimeout out.NodeCacheCapable = in.NodeCacheCapable - out.ManagedResources = *(*[]v1.ExtenderManagedResource)(unsafe.Pointer(&in.ManagedResources)) + out.ManagedResources = *(*[]configv1.ExtenderManagedResource)(unsafe.Pointer(&in.ManagedResources)) out.Ignorable = in.Ignorable return nil } @@ -266,7 +306,7 @@ func Convert_config_Extender_To_v1beta1_Extender(in *config.Extender, out *v1bet } func autoConvert_v1beta1_InterPodAffinityArgs_To_config_InterPodAffinityArgs(in *v1beta1.InterPodAffinityArgs, out *config.InterPodAffinityArgs, s conversion.Scope) error { - if err := metav1.Convert_Pointer_int32_To_int32(&in.HardPodAffinityWeight, &out.HardPodAffinityWeight, s); err != nil { + if err := v1.Convert_Pointer_int32_To_int32(&in.HardPodAffinityWeight, &out.HardPodAffinityWeight, s); err != nil { return err } return nil @@ -278,7 +318,7 @@ func Convert_v1beta1_InterPodAffinityArgs_To_config_InterPodAffinityArgs(in *v1b } func autoConvert_config_InterPodAffinityArgs_To_v1beta1_InterPodAffinityArgs(in *config.InterPodAffinityArgs, out *v1beta1.InterPodAffinityArgs, s conversion.Scope) error { - if err := metav1.Convert_int32_To_Pointer_int32(&in.HardPodAffinityWeight, &out.HardPodAffinityWeight, s); err != nil { + if err := v1.Convert_int32_To_Pointer_int32(&in.HardPodAffinityWeight, &out.HardPodAffinityWeight, s); err != nil { return err } return nil @@ -290,7 +330,7 @@ func Convert_config_InterPodAffinityArgs_To_v1beta1_InterPodAffinityArgs(in *con } func autoConvert_v1beta1_KubeSchedulerConfiguration_To_config_KubeSchedulerConfiguration(in *v1beta1.KubeSchedulerConfiguration, out *config.KubeSchedulerConfiguration, s conversion.Scope) error { - if err := metav1.Convert_Pointer_int32_To_int32(&in.Parallelism, &out.Parallelism, s); err != nil { + if err := v1.Convert_Pointer_int32_To_int32(&in.Parallelism, &out.Parallelism, s); err != nil { return err } if err := v1alpha1.Convert_v1alpha1_LeaderElectionConfiguration_To_config_LeaderElectionConfiguration(&in.LeaderElection, &out.LeaderElection, s); err != nil { @@ -299,22 +339,22 @@ func autoConvert_v1beta1_KubeSchedulerConfiguration_To_config_KubeSchedulerConfi if err := v1alpha1.Convert_v1alpha1_ClientConnectionConfiguration_To_config_ClientConnectionConfiguration(&in.ClientConnection, &out.ClientConnection, s); err != nil { return err } - if err := metav1.Convert_Pointer_string_To_string(&in.HealthzBindAddress, &out.HealthzBindAddress, s); err != nil { + if err := v1.Convert_Pointer_string_To_string(&in.HealthzBindAddress, &out.HealthzBindAddress, s); err != nil { return err } - if err := metav1.Convert_Pointer_string_To_string(&in.MetricsBindAddress, &out.MetricsBindAddress, s); err != nil { + if err := 
v1.Convert_Pointer_string_To_string(&in.MetricsBindAddress, &out.MetricsBindAddress, s); err != nil { return err } if err := v1alpha1.Convert_v1alpha1_DebuggingConfiguration_To_config_DebuggingConfiguration(&in.DebuggingConfiguration, &out.DebuggingConfiguration, s); err != nil { return err } - if err := metav1.Convert_Pointer_int32_To_int32(&in.PercentageOfNodesToScore, &out.PercentageOfNodesToScore, s); err != nil { + if err := v1.Convert_Pointer_int32_To_int32(&in.PercentageOfNodesToScore, &out.PercentageOfNodesToScore, s); err != nil { return err } - if err := metav1.Convert_Pointer_int64_To_int64(&in.PodInitialBackoffSeconds, &out.PodInitialBackoffSeconds, s); err != nil { + if err := v1.Convert_Pointer_int64_To_int64(&in.PodInitialBackoffSeconds, &out.PodInitialBackoffSeconds, s); err != nil { return err } - if err := metav1.Convert_Pointer_int64_To_int64(&in.PodMaxBackoffSeconds, &out.PodMaxBackoffSeconds, s); err != nil { + if err := v1.Convert_Pointer_int64_To_int64(&in.PodMaxBackoffSeconds, &out.PodMaxBackoffSeconds, s); err != nil { return err } if in.Profiles != nil { @@ -333,7 +373,7 @@ func autoConvert_v1beta1_KubeSchedulerConfiguration_To_config_KubeSchedulerConfi } func autoConvert_config_KubeSchedulerConfiguration_To_v1beta1_KubeSchedulerConfiguration(in *config.KubeSchedulerConfiguration, out *v1beta1.KubeSchedulerConfiguration, s conversion.Scope) error { - if err := metav1.Convert_int32_To_Pointer_int32(&in.Parallelism, &out.Parallelism, s); err != nil { + if err := v1.Convert_int32_To_Pointer_int32(&in.Parallelism, &out.Parallelism, s); err != nil { return err } // WARNING: in.AlgorithmSource requires manual conversion: does not exist in peer-type @@ -343,22 +383,22 @@ func autoConvert_config_KubeSchedulerConfiguration_To_v1beta1_KubeSchedulerConfi if err := v1alpha1.Convert_config_ClientConnectionConfiguration_To_v1alpha1_ClientConnectionConfiguration(&in.ClientConnection, &out.ClientConnection, s); err != nil { return err } - if err := metav1.Convert_string_To_Pointer_string(&in.HealthzBindAddress, &out.HealthzBindAddress, s); err != nil { + if err := v1.Convert_string_To_Pointer_string(&in.HealthzBindAddress, &out.HealthzBindAddress, s); err != nil { return err } - if err := metav1.Convert_string_To_Pointer_string(&in.MetricsBindAddress, &out.MetricsBindAddress, s); err != nil { + if err := v1.Convert_string_To_Pointer_string(&in.MetricsBindAddress, &out.MetricsBindAddress, s); err != nil { return err } if err := v1alpha1.Convert_config_DebuggingConfiguration_To_v1alpha1_DebuggingConfiguration(&in.DebuggingConfiguration, &out.DebuggingConfiguration, s); err != nil { return err } - if err := metav1.Convert_int32_To_Pointer_int32(&in.PercentageOfNodesToScore, &out.PercentageOfNodesToScore, s); err != nil { + if err := v1.Convert_int32_To_Pointer_int32(&in.PercentageOfNodesToScore, &out.PercentageOfNodesToScore, s); err != nil { return err } - if err := metav1.Convert_int64_To_Pointer_int64(&in.PodInitialBackoffSeconds, &out.PodInitialBackoffSeconds, s); err != nil { + if err := v1.Convert_int64_To_Pointer_int64(&in.PodInitialBackoffSeconds, &out.PodInitialBackoffSeconds, s); err != nil { return err } - if err := metav1.Convert_int64_To_Pointer_int64(&in.PodMaxBackoffSeconds, &out.PodMaxBackoffSeconds, s); err != nil { + if err := v1.Convert_int64_To_Pointer_int64(&in.PodMaxBackoffSeconds, &out.PodMaxBackoffSeconds, s); err != nil { return err } if in.Profiles != nil { @@ -377,7 +417,7 @@ func autoConvert_config_KubeSchedulerConfiguration_To_v1beta1_KubeSchedulerConfi 
} func autoConvert_v1beta1_KubeSchedulerProfile_To_config_KubeSchedulerProfile(in *v1beta1.KubeSchedulerProfile, out *config.KubeSchedulerProfile, s conversion.Scope) error { - if err := metav1.Convert_Pointer_string_To_string(&in.SchedulerName, &out.SchedulerName, s); err != nil { + if err := v1.Convert_Pointer_string_To_string(&in.SchedulerName, &out.SchedulerName, s); err != nil { return err } if in.Plugins != nil { @@ -409,7 +449,7 @@ func Convert_v1beta1_KubeSchedulerProfile_To_config_KubeSchedulerProfile(in *v1b } func autoConvert_config_KubeSchedulerProfile_To_v1beta1_KubeSchedulerProfile(in *config.KubeSchedulerProfile, out *v1beta1.KubeSchedulerProfile, s conversion.Scope) error { - if err := metav1.Convert_string_To_Pointer_string(&in.SchedulerName, &out.SchedulerName, s); err != nil { + if err := v1.Convert_string_To_Pointer_string(&in.SchedulerName, &out.SchedulerName, s); err != nil { return err } if in.Plugins != nil { @@ -530,7 +570,7 @@ func Convert_config_NodeResourcesMostAllocatedArgs_To_v1beta1_NodeResourcesMostA func autoConvert_v1beta1_Plugin_To_config_Plugin(in *v1beta1.Plugin, out *config.Plugin, s conversion.Scope) error { out.Name = in.Name - if err := metav1.Convert_Pointer_int32_To_int32(&in.Weight, &out.Weight, s); err != nil { + if err := v1.Convert_Pointer_int32_To_int32(&in.Weight, &out.Weight, s); err != nil { return err } return nil @@ -543,7 +583,7 @@ func Convert_v1beta1_Plugin_To_config_Plugin(in *v1beta1.Plugin, out *config.Plu func autoConvert_config_Plugin_To_v1beta1_Plugin(in *config.Plugin, out *v1beta1.Plugin, s conversion.Scope) error { out.Name = in.Name - if err := metav1.Convert_int32_To_Pointer_int32(&in.Weight, &out.Weight, s); err != nil { + if err := v1.Convert_int32_To_Pointer_int32(&in.Weight, &out.Weight, s); err != nil { return err } return nil @@ -969,7 +1009,7 @@ func Convert_config_UtilizationShapePoint_To_v1beta1_UtilizationShapePoint(in *c } func autoConvert_v1beta1_VolumeBindingArgs_To_config_VolumeBindingArgs(in *v1beta1.VolumeBindingArgs, out *config.VolumeBindingArgs, s conversion.Scope) error { - if err := metav1.Convert_Pointer_int64_To_int64(&in.BindTimeoutSeconds, &out.BindTimeoutSeconds, s); err != nil { + if err := v1.Convert_Pointer_int64_To_int64(&in.BindTimeoutSeconds, &out.BindTimeoutSeconds, s); err != nil { return err } return nil @@ -981,7 +1021,7 @@ func Convert_v1beta1_VolumeBindingArgs_To_config_VolumeBindingArgs(in *v1beta1.V } func autoConvert_config_VolumeBindingArgs_To_v1beta1_VolumeBindingArgs(in *config.VolumeBindingArgs, out *v1beta1.VolumeBindingArgs, s conversion.Scope) error { - if err := metav1.Convert_int64_To_Pointer_int64(&in.BindTimeoutSeconds, &out.BindTimeoutSeconds, s); err != nil { + if err := v1.Convert_int64_To_Pointer_int64(&in.BindTimeoutSeconds, &out.BindTimeoutSeconds, s); err != nil { return err } return nil diff --git a/pkg/scheduler/apis/config/v1beta1/zz_generated.defaults.go b/pkg/scheduler/apis/config/v1beta1/zz_generated.defaults.go index a6a78548741..cd3c50689bb 100644 --- a/pkg/scheduler/apis/config/v1beta1/zz_generated.defaults.go +++ b/pkg/scheduler/apis/config/v1beta1/zz_generated.defaults.go @@ -29,6 +29,7 @@ import ( // Public to allow building arbitrary schemes. // All generated defaulters are covering - they call all nested defaulters. 
func RegisterDefaults(scheme *runtime.Scheme) error { + scheme.AddTypeDefaultingFunc(&v1beta1.DefaultPreemptionArgs{}, func(obj interface{}) { SetObjectDefaults_DefaultPreemptionArgs(obj.(*v1beta1.DefaultPreemptionArgs)) }) scheme.AddTypeDefaultingFunc(&v1beta1.InterPodAffinityArgs{}, func(obj interface{}) { SetObjectDefaults_InterPodAffinityArgs(obj.(*v1beta1.InterPodAffinityArgs)) }) scheme.AddTypeDefaultingFunc(&v1beta1.KubeSchedulerConfiguration{}, func(obj interface{}) { SetObjectDefaults_KubeSchedulerConfiguration(obj.(*v1beta1.KubeSchedulerConfiguration)) @@ -47,6 +48,10 @@ func RegisterDefaults(scheme *runtime.Scheme) error { return nil } +func SetObjectDefaults_DefaultPreemptionArgs(in *v1beta1.DefaultPreemptionArgs) { + SetDefaults_DefaultPreemptionArgs(in) +} + func SetObjectDefaults_InterPodAffinityArgs(in *v1beta1.InterPodAffinityArgs) { SetDefaults_InterPodAffinityArgs(in) } diff --git a/pkg/scheduler/apis/config/validation/validation_pluginargs.go b/pkg/scheduler/apis/config/validation/validation_pluginargs.go index 47418ed330d..5af44427157 100644 --- a/pkg/scheduler/apis/config/validation/validation_pluginargs.go +++ b/pkg/scheduler/apis/config/validation/validation_pluginargs.go @@ -26,6 +26,38 @@ import ( "k8s.io/kubernetes/pkg/scheduler/apis/config" ) +// ValidateDefaultPreemptionArgs validates that DefaultPreemptionArgs are correct. +func ValidateDefaultPreemptionArgs(args config.DefaultPreemptionArgs) error { + if err := validateMinCandidateNodesPercentage(args.MinCandidateNodesPercentage); err != nil { + return err + } + if err := validateMinCandidateNodesAbsolute(args.MinCandidateNodesAbsolute); err != nil { + return err + } + if args.MinCandidateNodesPercentage == 0 && args.MinCandidateNodesAbsolute == 0 { + return fmt.Errorf("both minCandidateNodesPercentage and minCandidateNodesAbsolute cannot be zero") + } + return nil +} + +// validateMinCandidateNodesPercentage validates that +// minCandidateNodesPercentage is within the allowed range. +func validateMinCandidateNodesPercentage(minCandidateNodesPercentage int32) error { + if minCandidateNodesPercentage < 0 || minCandidateNodesPercentage > 100 { + return fmt.Errorf("minCandidateNodesPercentage is not in the range [0, 100]") + } + return nil +} + +// validateMinCandidateNodesAbsolute validates that minCandidateNodesAbsolute +// is within the allowed range. +func validateMinCandidateNodesAbsolute(minCandidateNodesAbsolute int32) error { + if minCandidateNodesAbsolute < 0 { + return fmt.Errorf("minCandidateNodesAbsolute is not in the range [0, inf)") + } + return nil +} + // ValidateInterPodAffinityArgs validates that InterPodAffinityArgs are correct. 
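A minimal standalone sketch (separate from the patch itself, using only identifiers this diff adds) of how the new defaulting and validation compose: an empty v1beta1 object picks up the 10%/100 defaults and passes validation, while explicitly zeroing both fields is rejected.

package main

import (
	"fmt"

	kubeschedulerconfigv1beta1 "k8s.io/kube-scheduler/config/v1beta1"
	"k8s.io/kubernetes/pkg/scheduler/apis/config"
	configv1beta1 "k8s.io/kubernetes/pkg/scheduler/apis/config/v1beta1"
	"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
)

func main() {
	// An empty versioned object picks up the new defaults: 10% / 100 nodes.
	versioned := &kubeschedulerconfigv1beta1.DefaultPreemptionArgs{}
	configv1beta1.SetDefaults_DefaultPreemptionArgs(versioned)

	// Convert to the internal type, the same way getDefaultDefaultPreemptionArgs
	// does in the test file further down in this patch.
	internal := &config.DefaultPreemptionArgs{}
	configv1beta1.Convert_v1beta1_DefaultPreemptionArgs_To_config_DefaultPreemptionArgs(versioned, internal, nil)

	fmt.Println(validation.ValidateDefaultPreemptionArgs(*internal)) // <nil>: the defaults are valid

	// Explicitly zeroing both knobs fails validation:
	// "both minCandidateNodesPercentage and minCandidateNodesAbsolute cannot be zero".
	fmt.Println(validation.ValidateDefaultPreemptionArgs(config.DefaultPreemptionArgs{}))
}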
func ValidateInterPodAffinityArgs(args config.InterPodAffinityArgs) error { return ValidateHardPodAffinityWeight(field.NewPath("hardPodAffinityWeight"), args.HardPodAffinityWeight) diff --git a/pkg/scheduler/apis/config/validation/validation_pluginargs_test.go b/pkg/scheduler/apis/config/validation/validation_pluginargs_test.go index 4cd158902ad..52fd5a17784 100644 --- a/pkg/scheduler/apis/config/validation/validation_pluginargs_test.go +++ b/pkg/scheduler/apis/config/validation/validation_pluginargs_test.go @@ -24,6 +24,55 @@ import ( "k8s.io/kubernetes/pkg/scheduler/apis/config" ) +func TestValidateDefaultPreemptionArgs(t *testing.T) { + cases := map[string]struct { + args config.DefaultPreemptionArgs + wantErr string + }{ + "valid args (default)": { + args: config.DefaultPreemptionArgs{ + MinCandidateNodesPercentage: 10, + MinCandidateNodesAbsolute: 100, + }, + }, + "negative minCandidateNodesPercentage": { + args: config.DefaultPreemptionArgs{ + MinCandidateNodesPercentage: -1, + MinCandidateNodesAbsolute: 100, + }, + wantErr: "minCandidateNodesPercentage is not in the range [0, 100]", + }, + "minCandidateNodesPercentage over 100": { + args: config.DefaultPreemptionArgs{ + MinCandidateNodesPercentage: 900, + MinCandidateNodesAbsolute: 100, + }, + wantErr: "minCandidateNodesPercentage is not in the range [0, 100]", + }, + "negative minCandidateNodesAbsolute": { + args: config.DefaultPreemptionArgs{ + MinCandidateNodesPercentage: 20, + MinCandidateNodesAbsolute: -1, + }, + wantErr: "minCandidateNodesAbsolute is not in the range [0, inf)", + }, + "all zero": { + args: config.DefaultPreemptionArgs{ + MinCandidateNodesPercentage: 0, + MinCandidateNodesAbsolute: 0, + }, + wantErr: "both minCandidateNodesPercentage and minCandidateNodesAbsolute cannot be zero", + }, + } + + for name, tc := range cases { + t.Run(name, func(t *testing.T) { + err := ValidateDefaultPreemptionArgs(tc.args) + assertErr(t, tc.wantErr, err) + }) + } +} + func TestValidateInterPodAffinityArgs(t *testing.T) { cases := map[string]struct { args config.InterPodAffinityArgs diff --git a/pkg/scheduler/apis/config/zz_generated.deepcopy.go b/pkg/scheduler/apis/config/zz_generated.deepcopy.go index cda2ce45aa2..d102f4c5a79 100644 --- a/pkg/scheduler/apis/config/zz_generated.deepcopy.go +++ b/pkg/scheduler/apis/config/zz_generated.deepcopy.go @@ -25,6 +25,31 @@ import ( runtime "k8s.io/apimachinery/pkg/runtime" ) +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DefaultPreemptionArgs) DeepCopyInto(out *DefaultPreemptionArgs) { + *out = *in + out.TypeMeta = in.TypeMeta + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DefaultPreemptionArgs. +func (in *DefaultPreemptionArgs) DeepCopy() *DefaultPreemptionArgs { + if in == nil { + return nil + } + out := new(DefaultPreemptionArgs) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *DefaultPreemptionArgs) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *Extender) DeepCopyInto(out *Extender) { *out = *in diff --git a/pkg/scheduler/framework/plugins/defaultpreemption/BUILD b/pkg/scheduler/framework/plugins/defaultpreemption/BUILD index 946e034de46..a9a43db7dd2 100644 --- a/pkg/scheduler/framework/plugins/defaultpreemption/BUILD +++ b/pkg/scheduler/framework/plugins/defaultpreemption/BUILD @@ -10,6 +10,8 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/features:go_default_library", + "//pkg/scheduler/apis/config:go_default_library", + "//pkg/scheduler/apis/config/validation:go_default_library", "//pkg/scheduler/core:go_default_library", "//pkg/scheduler/framework:go_default_library", "//pkg/scheduler/internal/parallelize:go_default_library", @@ -51,6 +53,8 @@ go_test( embed = [":go_default_library"], deps = [ "//pkg/controller/volume/scheduling:go_default_library", + "//pkg/scheduler/apis/config:go_default_library", + "//pkg/scheduler/apis/config/v1beta1:go_default_library", "//pkg/scheduler/framework:go_default_library", "//pkg/scheduler/framework/plugins/defaultbinder:go_default_library", "//pkg/scheduler/framework/plugins/interpodaffinity:go_default_library", @@ -66,6 +70,7 @@ go_test( "//pkg/scheduler/framework/plugins/volumezone:go_default_library", "//pkg/scheduler/framework/runtime:go_default_library", "//pkg/scheduler/internal/cache:go_default_library", + "//pkg/scheduler/internal/parallelize:go_default_library", "//pkg/scheduler/internal/queue:go_default_library", "//pkg/scheduler/testing:go_default_library", "//staging/src/k8s.io/api/core/v1:go_default_library", @@ -77,6 +82,7 @@ go_test( "//staging/src/k8s.io/client-go/kubernetes/fake:go_default_library", "//staging/src/k8s.io/client-go/testing:go_default_library", "//staging/src/k8s.io/client-go/tools/events:go_default_library", + "//staging/src/k8s.io/kube-scheduler/config/v1beta1:go_default_library", "//staging/src/k8s.io/kube-scheduler/extender/v1:go_default_library", "//vendor/github.com/google/go-cmp/cmp:go_default_library", ], diff --git a/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go b/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go index c623b611c0b..234142cd59b 100644 --- a/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go +++ b/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go @@ -18,9 +18,11 @@ package defaultpreemption import ( "context" + "fmt" "math" + "math/rand" "sort" - "sync" + "sync/atomic" "time" "k8s.io/klog/v2" @@ -38,6 +40,8 @@ import ( corev1helpers "k8s.io/component-helpers/scheduling/corev1" extenderv1 "k8s.io/kube-scheduler/extender/v1" kubefeatures "k8s.io/kubernetes/pkg/features" + "k8s.io/kubernetes/pkg/scheduler/apis/config" + "k8s.io/kubernetes/pkg/scheduler/apis/config/validation" "k8s.io/kubernetes/pkg/scheduler/core" "k8s.io/kubernetes/pkg/scheduler/framework" "k8s.io/kubernetes/pkg/scheduler/internal/parallelize" @@ -53,6 +57,7 @@ const ( // DefaultPreemption is a PostFilter plugin implements the preemption logic. type DefaultPreemption struct { fh framework.Handle + args config.DefaultPreemptionArgs podLister corelisters.PodLister pdbLister policylisters.PodDisruptionBudgetLister } @@ -65,9 +70,17 @@ func (pl *DefaultPreemption) Name() string { } // New initializes a new plugin and returns it. 
-func New(_ runtime.Object, fh framework.Handle) (framework.Plugin, error) { +func New(dpArgs runtime.Object, fh framework.Handle) (framework.Plugin, error) { + args, ok := dpArgs.(*config.DefaultPreemptionArgs) + if !ok { + return nil, fmt.Errorf("got args of type %T, want *DefaultPreemptionArgs", dpArgs) + } + if err := validation.ValidateDefaultPreemptionArgs(*args); err != nil { + return nil, err + } pl := DefaultPreemption{ fh: fh, + args: *args, podLister: fh.SharedInformerFactory().Core().V1().Pods().Lister(), pdbLister: getPDBLister(fh.SharedInformerFactory()), } @@ -124,7 +137,7 @@ func (pl *DefaultPreemption) preempt(ctx context.Context, state *framework.Cycle } // 2) Find all preemption candidates. - candidates, err := FindCandidates(ctx, cs, state, pod, m, ph, nodeLister, pl.pdbLister) + candidates, err := pl.FindCandidates(ctx, state, pod, m) if err != nil || len(candidates) == 0 { return "", err } @@ -149,12 +162,31 @@ func (pl *DefaultPreemption) preempt(ctx context.Context, state *framework.Cycle return bestCandidate.Name(), nil } +// calculateNumCandidates returns the number of candidates the FindCandidates +// method must produce from dry running based on the constraints given by +// <minCandidateNodesPercentage> and <minCandidateNodesAbsolute>. The number of +// candidates returned will never be greater than <numNodes>. +func (pl *DefaultPreemption) calculateNumCandidates(numNodes int32) int32 { + n := (numNodes * pl.args.MinCandidateNodesPercentage) / 100 + if n < pl.args.MinCandidateNodesAbsolute { + n = pl.args.MinCandidateNodesAbsolute + } + if n > numNodes { + n = numNodes + } + return n +} + +// getOffsetAndNumCandidates chooses a random offset and calculates the number +// of candidates that should be shortlisted for dry running preemption. +func (pl *DefaultPreemption) getOffsetAndNumCandidates(numNodes int32) (int32, int32) { + return rand.Int31n(numNodes), pl.calculateNumCandidates(numNodes) +}
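To make these two knobs concrete, here is a minimal standalone sketch of the same arithmetic (a re-implementation for illustration, not the plugin code itself):

package main

import "fmt"

// numCandidates mirrors calculateNumCandidates: a percentage-based value,
// raised to an absolute floor, then capped at the total number of nodes.
func numCandidates(numNodes, pct, abs int32) int32 {
	n := numNodes * pct / 100
	if n < abs {
		n = abs
	}
	if n > numNodes {
		n = numNodes
	}
	return n
}

func main() {
	// With the v1beta1 defaults (10%, 100):
	fmt.Println(numCandidates(20, 10, 100))   // 20: small clusters dry run every node
	fmt.Println(numCandidates(5000, 10, 100)) // 500: the percentage dominates the floor

	// With the args used by the new TestDryRunPreemption cases (40%, 1):
	fmt.Println(numCandidates(5, 40, 1)) // 2, which is why those cases expect two candidates

	// The random offset from getOffsetAndNumCandidates only changes where the
	// scan starts; dryRunPreemption below wraps indices modulo the node count,
	// so offset 4 over 5 nodes visits node5, node1, node2, ...
}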
+ // FindCandidates calculates a slice of preemption candidates. // Each candidate is executable to make the given <pod> schedulable. -func FindCandidates(ctx context.Context, cs kubernetes.Interface, state *framework.CycleState, pod *v1.Pod, - m framework.NodeToStatusMap, ph framework.PreemptHandle, nodeLister framework.NodeInfoLister, - pdbLister policylisters.PodDisruptionBudgetLister) ([]Candidate, error) { - allNodes, err := nodeLister.List() +func (pl *DefaultPreemption) FindCandidates(ctx context.Context, state *framework.CycleState, pod *v1.Pod, m framework.NodeToStatusMap) ([]Candidate, error) { + allNodes, err := pl.fh.SnapshotSharedLister().NodeInfos().List() if err != nil { return nil, err } @@ -166,24 +198,28 @@ func FindCandidates(ctx context.Context, cs kubernetes.Interface, state *framewo if len(potentialNodes) == 0 { klog.V(3).Infof("Preemption will not help schedule pod %v/%v on any node.", pod.Namespace, pod.Name) // In this case, we should clean-up any existing nominated node name of the pod. - if err := util.ClearNominatedNodeName(cs, pod); err != nil { + if err := util.ClearNominatedNodeName(pl.fh.ClientSet(), pod); err != nil { klog.Errorf("Cannot clear 'NominatedNodeName' field of pod %v/%v: %v", pod.Namespace, pod.Name, err) // We do not return as this error is not critical. } return nil, nil } - if klog.V(5).Enabled() { - var sample []string - for i := 0; i < 10 && i < len(potentialNodes); i++ { - sample = append(sample, potentialNodes[i].Node().Name) - } - klog.Infof("%v potential nodes for preemption, first %v are: %v", len(potentialNodes), len(sample), sample) - } - pdbs, err := getPodDisruptionBudgets(pdbLister) + + pdbs, err := getPodDisruptionBudgets(pl.pdbLister) if err != nil { return nil, err } - return dryRunPreemption(ctx, ph, state, pod, potentialNodes, pdbs), nil + + offset, numCandidates := pl.getOffsetAndNumCandidates(int32(len(potentialNodes))) + if klog.V(5).Enabled() { + var sample []string + for i := offset; i < offset+10 && i < int32(len(potentialNodes)); i++ { + sample = append(sample, potentialNodes[i].Node().Name) + } + klog.Infof("from a pool of %d nodes (offset: %d, sample %d nodes: %v), ~%d candidates will be chosen", len(potentialNodes), offset, len(sample), sample, numCandidates) + } + + return dryRunPreemption(ctx, pl.fh.PreemptHandle(), state, pod, potentialNodes, pdbs, offset, numCandidates), nil } // PodEligibleToPreemptOthers determines whether this pod should be considered @@ -234,33 +270,77 @@ func nodesWherePreemptionMightHelp(nodes []*framework.NodeInfo, m framework.Node return potentialNodes } +type candidateList struct { + idx int32 + items []Candidate +} + +func newCandidateList(size int32) *candidateList { + return &candidateList{idx: -1, items: make([]Candidate, size)} +} + +// add adds a new candidate to the internal array atomically. +func (cl *candidateList) add(c *candidate) { + if idx := atomic.AddInt32(&cl.idx, 1); idx < int32(len(cl.items)) { + cl.items[idx] = c + } +} + +// size returns the number of candidates stored. Note that some add() operations +// might still be executing when this is called, so care must be taken to +// ensure that all add() operations complete before accessing the elements of +// the list. +func (cl *candidateList) size() int32 { + n := atomic.LoadInt32(&cl.idx) + 1 + if n >= int32(len(cl.items)) { + n = int32(len(cl.items)) + } + return n +} + +// get returns the internal candidate array. This function is NOT atomic and +// assumes that all add() operations have been completed. +func (cl *candidateList) get() []Candidate { + return cl.items[:cl.size()] +}
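candidateList replaces the mutex-guarded slice the old dryRunPreemption used with a fixed-capacity array indexed atomically. A standalone sketch of that pattern, with a simplified element type for illustration:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// list mimics candidateList: idx starts at -1, atomic.AddInt32 hands each
// writer a unique slot, and writers that land past the capacity are dropped.
type list struct {
	idx   int32
	items []string
}

func newList(size int32) *list { return &list{idx: -1, items: make([]string, size)} }

func (l *list) add(s string) {
	if i := atomic.AddInt32(&l.idx, 1); i < int32(len(l.items)) {
		l.items[i] = s
	}
}

func (l *list) size() int32 {
	n := atomic.LoadInt32(&l.idx) + 1
	if n > int32(len(l.items)) {
		n = int32(len(l.items))
	}
	return n
}

func main() {
	l := newList(2) // capacity 2, playing the role of numCandidates
	var wg sync.WaitGroup
	for _, name := range []string{"node1", "node2", "node3", "node4"} {
		wg.Add(1)
		go func(n string) { defer wg.Done(); l.add(n) }(name)
	}
	// Wait for every add() to complete before reading, as the comment on
	// size() above requires.
	wg.Wait()
	fmt.Println(l.size(), l.items[:l.size()]) // 2 [...]: only the first two writers are kept
}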
+ // dryRunPreemption simulates Preemption logic on <potentialNodes> in parallel, -// and returns all possible preemption candidates. -func dryRunPreemption(ctx context.Context, fh framework.PreemptHandle, state *framework.CycleState, - pod *v1.Pod, potentialNodes []*framework.NodeInfo, pdbs []*policy.PodDisruptionBudget) []Candidate { - var resultLock sync.Mutex - var candidates []Candidate +// and returns preemption candidates. The number of candidates depends on the +// constraints defined in the plugin's args. In the returned list of +// candidates, ones that do not violate PDB are preferred over ones that do. +func dryRunPreemption(ctx context.Context, fh framework.PreemptHandle, + state *framework.CycleState, pod *v1.Pod, potentialNodes []*framework.NodeInfo, + pdbs []*policy.PodDisruptionBudget, offset int32, numCandidates int32) []Candidate { + nonViolatingCandidates := newCandidateList(numCandidates) + violatingCandidates := newCandidateList(numCandidates) + parallelCtx, cancel := context.WithCancel(ctx) checkNode := func(i int) { - nodeInfoCopy := potentialNodes[i].Clone() + nodeInfoCopy := potentialNodes[(int(offset)+i)%len(potentialNodes)].Clone() stateCopy := state.Clone() pods, numPDBViolations, fits := selectVictimsOnNode(ctx, fh, stateCopy, pod, nodeInfoCopy, pdbs) if fits { - resultLock.Lock() victims := extenderv1.Victims{ Pods: pods, NumPDBViolations: int64(numPDBViolations), } - c := candidate{ + c := &candidate{ victims: &victims, name: nodeInfoCopy.Node().Name, } - candidates = append(candidates, &c) - resultLock.Unlock() + if numPDBViolations == 0 { + nonViolatingCandidates.add(c) + } else { + violatingCandidates.add(c) + } + nvcSize, vcSize := nonViolatingCandidates.size(), violatingCandidates.size() + if nvcSize > 0 && nvcSize+vcSize >= numCandidates { + cancel() + } } } - parallelize.Until(ctx, len(potentialNodes), checkNode) - return candidates + parallelize.Until(parallelCtx, len(potentialNodes), checkNode) + return append(nonViolatingCandidates.get(), violatingCandidates.get()...) } // CallExtenders calls given <extenders> to select the list of feasible candidates. diff --git a/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go b/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go index ebf2ddd5572..1d0bb339cd9 100644 --- a/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go +++ b/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go @@ -19,6 +19,7 @@ package defaultpreemption import ( "context" "fmt" + "math/rand" "reflect" "sort" "strings" @@ -35,8 +36,11 @@ import ( clientsetfake "k8s.io/client-go/kubernetes/fake" clienttesting "k8s.io/client-go/testing" "k8s.io/client-go/tools/events" + kubeschedulerconfigv1beta1 "k8s.io/kube-scheduler/config/v1beta1" extenderv1 "k8s.io/kube-scheduler/extender/v1" volumescheduling "k8s.io/kubernetes/pkg/controller/volume/scheduling" + "k8s.io/kubernetes/pkg/scheduler/apis/config" + configv1beta1 "k8s.io/kubernetes/pkg/scheduler/apis/config/v1beta1" "k8s.io/kubernetes/pkg/scheduler/framework" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultbinder" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity" @@ -52,6 +56,7 @@ import ( "k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumezone" frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime" internalcache "k8s.io/kubernetes/pkg/scheduler/internal/cache" + "k8s.io/kubernetes/pkg/scheduler/internal/parallelize" internalqueue "k8s.io/kubernetes/pkg/scheduler/internal/queue" st "k8s.io/kubernetes/pkg/scheduler/testing" ) @@ -85,6 +90,14 @@ var ( epochTime6 = metav1.NewTime(time.Unix(0, 6)) ) +func getDefaultDefaultPreemptionArgs() *config.DefaultPreemptionArgs { + v1beta1dpa := &kubeschedulerconfigv1beta1.DefaultPreemptionArgs{} + configv1beta1.SetDefaults_DefaultPreemptionArgs(v1beta1dpa) + dpa := &config.DefaultPreemptionArgs{} + configv1beta1.Convert_v1beta1_DefaultPreemptionArgs_To_config_DefaultPreemptionArgs(v1beta1dpa, dpa, nil) + return dpa +} + func TestPostFilter(t *testing.T) { onePodRes := map[v1.ResourceName]string{v1.ResourcePods: "1"} tests :=
[]struct { @@ -222,6 +235,7 @@ func TestPostFilter(t *testing.T) { fh: f, podLister: informerFactory.Core().V1().Pods().Lister(), pdbLister: getPDBLister(informerFactory), + args: *getDefaultDefaultPreemptionArgs(), } state := framework.NewCycleState() @@ -246,14 +260,16 @@ func TestPostFilter(t *testing.T) { func TestDryRunPreemption(t *testing.T) { tests := []struct { name string + args *config.DefaultPreemptionArgs nodeNames []string - pod *v1.Pod - pods []*v1.Pod + testPods []*v1.Pod + initPods []*v1.Pod registerPlugins []st.RegisterPluginFunc pdbs []*policy.PodDisruptionBudget fakeFilterRC framework.Code // return code for fake filter plugin - expected []Candidate - expectedNumFilterCalled int32 + disableParallelism bool + expected [][]Candidate + expectedNumFilterCalled []int32 }{ { name: "a pod that does not fit on any node", @@ -261,13 +277,15 @@ func TestDryRunPreemption(t *testing.T) { st.RegisterFilterPlugin("FalseFilter", st.NewFalseFilterPlugin), }, nodeNames: []string{"node1", "node2"}, - pod: st.MakePod().Name("p").UID("p").Priority(highPriority).Obj(), - pods: []*v1.Pod{ + testPods: []*v1.Pod{ + st.MakePod().Name("p").UID("p").Priority(highPriority).Obj(), + }, + initPods: []*v1.Pod{ st.MakePod().Name("p1").UID("p1").Node("node1").Priority(midPriority).Obj(), st.MakePod().Name("p2").UID("p2").Node("node2").Priority(midPriority).Obj(), }, - expected: nil, - expectedNumFilterCalled: 2, + expected: [][]Candidate{{}}, + expectedNumFilterCalled: []int32{2}, }, { name: "a pod that fits with no preemption", @@ -275,16 +293,20 @@ func TestDryRunPreemption(t *testing.T) { st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), }, nodeNames: []string{"node1", "node2"}, - pod: st.MakePod().Name("p").UID("p").Priority(highPriority).Obj(), - pods: []*v1.Pod{ + testPods: []*v1.Pod{ + st.MakePod().Name("p").UID("p").Priority(highPriority).Obj(), + }, + initPods: []*v1.Pod{ st.MakePod().Name("p1").UID("p1").Node("node1").Priority(midPriority).Obj(), st.MakePod().Name("p2").UID("p2").Node("node2").Priority(midPriority).Obj(), }, - expected: []Candidate{ - &candidate{victims: &extenderv1.Victims{}, name: "node1"}, - &candidate{victims: &extenderv1.Victims{}, name: "node2"}, + expected: [][]Candidate{ + { + &candidate{victims: &extenderv1.Victims{}, name: "node1"}, + &candidate{victims: &extenderv1.Victims{}, name: "node2"}, + }, }, - expectedNumFilterCalled: 4, + expectedNumFilterCalled: []int32{4}, }, { name: "a pod that fits on one node with no preemption", @@ -292,16 +314,20 @@ func TestDryRunPreemption(t *testing.T) { st.RegisterFilterPlugin("MatchFilter", st.NewMatchFilterPlugin), }, nodeNames: []string{"node1", "node2"}, - // Name the pod as "node1" to fit "MatchFilter" plugin. - pod: st.MakePod().Name("node1").UID("node1").Priority(highPriority).Obj(), - pods: []*v1.Pod{ + testPods: []*v1.Pod{ + // Name the pod as "node1" to fit "MatchFilter" plugin. 
+ st.MakePod().Name("node1").UID("node1").Priority(highPriority).Obj(), + }, + initPods: []*v1.Pod{ st.MakePod().Name("p1").UID("p1").Node("node1").Priority(midPriority).Obj(), st.MakePod().Name("p2").UID("p2").Node("node2").Priority(midPriority).Obj(), }, - expected: []Candidate{ - &candidate{victims: &extenderv1.Victims{}, name: "node1"}, + expected: [][]Candidate{ + { + &candidate{victims: &extenderv1.Victims{}, name: "node1"}, + }, }, - expectedNumFilterCalled: 3, + expectedNumFilterCalled: []int32{3}, }, { name: "a pod that fits on both nodes when lower priority pods are preempted", @@ -309,26 +335,30 @@ func TestDryRunPreemption(t *testing.T) { st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), }, nodeNames: []string{"node1", "node2"}, - pod: st.MakePod().Name("p").UID("p").Priority(highPriority).Req(largeRes).Obj(), - pods: []*v1.Pod{ + testPods: []*v1.Pod{ + st.MakePod().Name("p").UID("p").Priority(highPriority).Req(largeRes).Obj(), + }, + initPods: []*v1.Pod{ st.MakePod().Name("p1").UID("p1").Node("node1").Priority(midPriority).Req(largeRes).Obj(), st.MakePod().Name("p2").UID("p2").Node("node2").Priority(midPriority).Req(largeRes).Obj(), }, - expected: []Candidate{ - &candidate{ - victims: &extenderv1.Victims{ - Pods: []*v1.Pod{st.MakePod().Name("p1").UID("p1").Node("node1").Priority(midPriority).Req(largeRes).Obj()}, + expected: [][]Candidate{ + { + &candidate{ + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{st.MakePod().Name("p1").UID("p1").Node("node1").Priority(midPriority).Req(largeRes).Obj()}, + }, + name: "node1", }, - name: "node1", - }, - &candidate{ - victims: &extenderv1.Victims{ - Pods: []*v1.Pod{st.MakePod().Name("p2").UID("p2").Node("node2").Priority(midPriority).Req(largeRes).Obj()}, + &candidate{ + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{st.MakePod().Name("p2").UID("p2").Node("node2").Priority(midPriority).Req(largeRes).Obj()}, + }, + name: "node2", }, - name: "node2", }, }, - expectedNumFilterCalled: 4, + expectedNumFilterCalled: []int32{4}, }, { name: "a pod that would fit on the nodes, but other pods running are higher priority, no preemption would happen", @@ -336,13 +366,15 @@ func TestDryRunPreemption(t *testing.T) { st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), }, nodeNames: []string{"node1", "node2"}, - pod: st.MakePod().Name("p").UID("p").Priority(lowPriority).Req(largeRes).Obj(), - pods: []*v1.Pod{ + testPods: []*v1.Pod{ + st.MakePod().Name("p").UID("p").Priority(lowPriority).Req(largeRes).Obj(), + }, + initPods: []*v1.Pod{ st.MakePod().Name("p1").UID("p1").Node("node1").Priority(midPriority).Req(largeRes).Obj(), st.MakePod().Name("p2").UID("p2").Node("node2").Priority(midPriority).Req(largeRes).Obj(), }, - expected: nil, - expectedNumFilterCalled: 0, + expected: [][]Candidate{{}}, + expectedNumFilterCalled: []int32{0}, }, { name: "medium priority pod is preempted, but lower priority one stays as it is small", @@ -350,27 +382,31 @@ func TestDryRunPreemption(t *testing.T) { st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), }, nodeNames: []string{"node1", "node2"}, - pod: st.MakePod().Name("p").UID("p").Priority(highPriority).Req(largeRes).Obj(), - pods: []*v1.Pod{ + testPods: []*v1.Pod{ + st.MakePod().Name("p").UID("p").Priority(highPriority).Req(largeRes).Obj(), + }, + initPods: []*v1.Pod{ st.MakePod().Name("p1.1").UID("p1.1").Node("node1").Priority(lowPriority).Req(smallRes).Obj(), 
st.MakePod().Name("p1.2").UID("p1.2").Node("node1").Priority(midPriority).Req(largeRes).Obj(), st.MakePod().Name("p2").UID("p2").Node("node2").Priority(midPriority).Req(largeRes).Obj(), }, - expected: []Candidate{ - &candidate{ - victims: &extenderv1.Victims{ - Pods: []*v1.Pod{st.MakePod().Name("p1.2").UID("p1.2").Node("node1").Priority(midPriority).Req(largeRes).Obj()}, + expected: [][]Candidate{ + { + &candidate{ + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{st.MakePod().Name("p1.2").UID("p1.2").Node("node1").Priority(midPriority).Req(largeRes).Obj()}, + }, + name: "node1", }, - name: "node1", - }, - &candidate{ - victims: &extenderv1.Victims{ - Pods: []*v1.Pod{st.MakePod().Name("p2").UID("p2").Node("node2").Priority(midPriority).Req(largeRes).Obj()}, + &candidate{ + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{st.MakePod().Name("p2").UID("p2").Node("node2").Priority(midPriority).Req(largeRes).Obj()}, + }, + name: "node2", }, - name: "node2", }, }, - expectedNumFilterCalled: 5, + expectedNumFilterCalled: []int32{5}, }, { name: "mixed priority pods are preempted", @@ -378,26 +414,30 @@ func TestDryRunPreemption(t *testing.T) { st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), }, nodeNames: []string{"node1", "node2"}, - pod: st.MakePod().Name("p").UID("p").Priority(highPriority).Req(largeRes).Obj(), - pods: []*v1.Pod{ + testPods: []*v1.Pod{ + st.MakePod().Name("p").UID("p").Priority(highPriority).Req(largeRes).Obj(), + }, + initPods: []*v1.Pod{ st.MakePod().Name("p1.1").UID("p1.1").Node("node1").Priority(midPriority).Req(smallRes).Obj(), st.MakePod().Name("p1.2").UID("p1.2").Node("node1").Priority(lowPriority).Req(smallRes).Obj(), st.MakePod().Name("p1.3").UID("p1.3").Node("node1").Priority(midPriority).Req(mediumRes).Obj(), st.MakePod().Name("p1.4").UID("p1.4").Node("node1").Priority(highPriority).Req(smallRes).Obj(), st.MakePod().Name("p2").UID("p2").Node("node2").Priority(highPriority).Req(largeRes).Obj(), }, - expected: []Candidate{ - &candidate{ - victims: &extenderv1.Victims{ - Pods: []*v1.Pod{ - st.MakePod().Name("p1.2").UID("p1.2").Node("node1").Priority(lowPriority).Req(smallRes).Obj(), - st.MakePod().Name("p1.3").UID("p1.3").Node("node1").Priority(midPriority).Req(mediumRes).Obj(), + expected: [][]Candidate{ + { + &candidate{ + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{ + st.MakePod().Name("p1.2").UID("p1.2").Node("node1").Priority(lowPriority).Req(smallRes).Obj(), + st.MakePod().Name("p1.3").UID("p1.3").Node("node1").Priority(midPriority).Req(mediumRes).Obj(), + }, }, + name: "node1", }, - name: "node1", }, }, - expectedNumFilterCalled: 4, + expectedNumFilterCalled: []int32{4}, }, { name: "mixed priority pods are preempted, pick later StartTime one when priorities are equal", @@ -405,26 +445,30 @@ func TestDryRunPreemption(t *testing.T) { st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), }, nodeNames: []string{"node1", "node2"}, - pod: st.MakePod().Name("p").UID("p").Priority(highPriority).Req(largeRes).Obj(), - pods: []*v1.Pod{ + testPods: []*v1.Pod{ + st.MakePod().Name("p").UID("p").Priority(highPriority).Req(largeRes).Obj(), + }, + initPods: []*v1.Pod{ st.MakePod().Name("p1.1").UID("p1.1").Node("node1").Priority(lowPriority).Req(smallRes).StartTime(epochTime5).Obj(), st.MakePod().Name("p1.2").UID("p1.2").Node("node1").Priority(lowPriority).Req(smallRes).StartTime(epochTime4).Obj(), 
st.MakePod().Name("p1.3").UID("p1.3").Node("node1").Priority(midPriority).Req(mediumRes).StartTime(epochTime3).Obj(), st.MakePod().Name("p1.4").UID("p1.4").Node("node1").Priority(highPriority).Req(smallRes).StartTime(epochTime2).Obj(), st.MakePod().Name("p2").UID("p2").Node("node2").Priority(highPriority).Req(largeRes).StartTime(epochTime1).Obj(), }, - expected: []Candidate{ - &candidate{ - victims: &extenderv1.Victims{ - Pods: []*v1.Pod{ - st.MakePod().Name("p1.1").UID("p1.1").Node("node1").Priority(lowPriority).Req(smallRes).StartTime(epochTime5).Obj(), - st.MakePod().Name("p1.3").UID("p1.3").Node("node1").Priority(midPriority).Req(mediumRes).StartTime(epochTime3).Obj(), + expected: [][]Candidate{ + { + &candidate{ + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{ + st.MakePod().Name("p1.1").UID("p1.1").Node("node1").Priority(lowPriority).Req(smallRes).StartTime(epochTime5).Obj(), + st.MakePod().Name("p1.3").UID("p1.3").Node("node1").Priority(midPriority).Req(mediumRes).StartTime(epochTime3).Obj(), + }, }, + name: "node1", }, - name: "node1", }, }, - expectedNumFilterCalled: 4, // no preemption would happen on node2 and no filter call is counted. + expectedNumFilterCalled: []int32{4}, // no preemption would happen on node2 and no filter call is counted. }, { name: "pod with anti-affinity is preempted", @@ -433,26 +477,30 @@ func TestDryRunPreemption(t *testing.T) { st.RegisterPluginAsExtensions(interpodaffinity.Name, interpodaffinity.New, "Filter", "PreFilter"), }, nodeNames: []string{"node1", "node2"}, - pod: st.MakePod().Name("p").UID("p").Label("foo", "").Priority(highPriority).Req(smallRes).Obj(), - pods: []*v1.Pod{ + testPods: []*v1.Pod{ + st.MakePod().Name("p").UID("p").Label("foo", "").Priority(highPriority).Req(smallRes).Obj(), + }, + initPods: []*v1.Pod{ st.MakePod().Name("p1.1").UID("p1.1").Node("node1").Label("foo", "").Priority(lowPriority).Req(smallRes). PodAntiAffinityExists("foo", "hostname", st.PodAntiAffinityWithRequiredReq).Obj(), st.MakePod().Name("p1.2").UID("p1.2").Node("node1").Priority(midPriority).Req(smallRes).Obj(), st.MakePod().Name("p1.3").UID("p1.3").Node("node1").Priority(highPriority).Req(smallRes).Obj(), st.MakePod().Name("p2").UID("p2").Node("node2").Priority(highPriority).Req(smallRes).Obj(), }, - expected: []Candidate{ - &candidate{ - victims: &extenderv1.Victims{ - Pods: []*v1.Pod{ - st.MakePod().Name("p1.1").UID("p1.1").Node("node1").Label("foo", "").Priority(lowPriority).Req(smallRes). - PodAntiAffinityExists("foo", "hostname", st.PodAntiAffinityWithRequiredReq).Obj(), + expected: [][]Candidate{ + { + &candidate{ + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{ + st.MakePod().Name("p1.1").UID("p1.1").Node("node1").Label("foo", "").Priority(lowPriority).Req(smallRes). + PodAntiAffinityExists("foo", "hostname", st.PodAntiAffinityWithRequiredReq).Obj(), + }, }, + name: "node1", }, - name: "node1", }, }, - expectedNumFilterCalled: 3, // no preemption would happen on node2 and no filter call is counted. + expectedNumFilterCalled: []int32{3}, // no preemption would happen on node2 and no filter call is counted. }, { name: "preemption to resolve pod topology spread filter failure", @@ -460,32 +508,36 @@ func TestDryRunPreemption(t *testing.T) { st.RegisterPluginAsExtensions(podtopologyspread.Name, podtopologyspread.New, "PreFilter", "Filter"), }, nodeNames: []string{"node-a/zone1", "node-b/zone1", "node-x/zone2"}, - pod: st.MakePod().Name("p").UID("p").Label("foo", "").Priority(highPriority). 
- SpreadConstraint(1, "zone", v1.DoNotSchedule, st.MakeLabelSelector().Exists("foo").Obj()). - SpreadConstraint(1, "hostname", v1.DoNotSchedule, st.MakeLabelSelector().Exists("foo").Obj()). - Obj(), - pods: []*v1.Pod{ + testPods: []*v1.Pod{ + st.MakePod().Name("p").UID("p").Label("foo", "").Priority(highPriority). + SpreadConstraint(1, "zone", v1.DoNotSchedule, st.MakeLabelSelector().Exists("foo").Obj()). + SpreadConstraint(1, "hostname", v1.DoNotSchedule, st.MakeLabelSelector().Exists("foo").Obj()). + Obj(), + }, + initPods: []*v1.Pod{ st.MakePod().Name("pod-a1").UID("pod-a1").Node("node-a").Label("foo", "").Priority(midPriority).Obj(), st.MakePod().Name("pod-a2").UID("pod-a2").Node("node-a").Label("foo", "").Priority(lowPriority).Obj(), st.MakePod().Name("pod-b1").UID("pod-b1").Node("node-b").Label("foo", "").Priority(lowPriority).Obj(), st.MakePod().Name("pod-x1").UID("pod-x1").Node("node-x").Label("foo", "").Priority(highPriority).Obj(), st.MakePod().Name("pod-x2").UID("pod-x2").Node("node-x").Label("foo", "").Priority(highPriority).Obj(), }, - expected: []Candidate{ - &candidate{ - victims: &extenderv1.Victims{ - Pods: []*v1.Pod{st.MakePod().Name("pod-a2").UID("pod-a2").Node("node-a").Label("foo", "").Priority(lowPriority).Obj()}, + expected: [][]Candidate{ + { + &candidate{ + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{st.MakePod().Name("pod-a2").UID("pod-a2").Node("node-a").Label("foo", "").Priority(lowPriority).Obj()}, + }, + name: "node-a", }, - name: "node-a", - }, - &candidate{ - victims: &extenderv1.Victims{ - Pods: []*v1.Pod{st.MakePod().Name("pod-b1").UID("pod-b1").Node("node-b").Label("foo", "").Priority(lowPriority).Obj()}, + &candidate{ + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{st.MakePod().Name("pod-b1").UID("pod-b1").Node("node-b").Label("foo", "").Priority(lowPriority).Obj()}, + }, + name: "node-b", }, - name: "node-b", }, }, - expectedNumFilterCalled: 5, // node-a (3), node-b (2), node-x (0) + expectedNumFilterCalled: []int32{5}, // node-a (3), node-b (2), node-x (0) }, { name: "get Unschedulable in the preemption phase when the filter plugins filtering the nodes", @@ -493,14 +545,16 @@ func TestDryRunPreemption(t *testing.T) { st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), }, nodeNames: []string{"node1", "node2"}, - pod: st.MakePod().Name("p").UID("p").Priority(highPriority).Req(largeRes).Obj(), - pods: []*v1.Pod{ + testPods: []*v1.Pod{ + st.MakePod().Name("p").UID("p").Priority(highPriority).Req(largeRes).Obj(), + }, + initPods: []*v1.Pod{ st.MakePod().Name("p1").UID("p1").Node("node1").Priority(midPriority).Req(largeRes).Obj(), st.MakePod().Name("p2").UID("p2").Node("node2").Priority(midPriority).Req(largeRes).Obj(), }, fakeFilterRC: framework.Unschedulable, - expected: nil, - expectedNumFilterCalled: 2, + expected: [][]Candidate{{}}, + expectedNumFilterCalled: []int32{2}, }, { name: "preemption with violation of same pdb", @@ -508,8 +562,10 @@ func TestDryRunPreemption(t *testing.T) { st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), }, nodeNames: []string{"node1"}, - pod: st.MakePod().Name("p").UID("p").Priority(highPriority).Req(veryLargeRes).Obj(), - pods: []*v1.Pod{ + testPods: []*v1.Pod{ + st.MakePod().Name("p").UID("p").Priority(highPriority).Req(veryLargeRes).Obj(), + }, + initPods: []*v1.Pod{ st.MakePod().Name("p1.1").UID("p1.1").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), 
st.MakePod().Name("p1.2").UID("p1.2").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), }, @@ -519,19 +575,21 @@ func TestDryRunPreemption(t *testing.T) { Status: policy.PodDisruptionBudgetStatus{DisruptionsAllowed: 1}, }, }, - expected: []Candidate{ - &candidate{ - victims: &extenderv1.Victims{ - Pods: []*v1.Pod{ - st.MakePod().Name("p1.1").UID("p1.1").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), - st.MakePod().Name("p1.2").UID("p1.2").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), + expected: [][]Candidate{ + { + &candidate{ + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{ + st.MakePod().Name("p1.1").UID("p1.1").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), + st.MakePod().Name("p1.2").UID("p1.2").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), + }, + NumPDBViolations: 1, }, - NumPDBViolations: 1, + name: "node1", }, - name: "node1", }, }, - expectedNumFilterCalled: 3, + expectedNumFilterCalled: []int32{3}, }, { name: "preemption with violation of the pdb with pod whose eviction was processed, the victim doesn't belong to DisruptedPods", @@ -539,8 +597,10 @@ func TestDryRunPreemption(t *testing.T) { st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), }, nodeNames: []string{"node1"}, - pod: st.MakePod().Name("p").UID("p").Priority(highPriority).Req(veryLargeRes).Obj(), - pods: []*v1.Pod{ + testPods: []*v1.Pod{ + st.MakePod().Name("p").UID("p").Priority(highPriority).Req(veryLargeRes).Obj(), + }, + initPods: []*v1.Pod{ st.MakePod().Name("p1.1").UID("p1.1").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), st.MakePod().Name("p1.2").UID("p1.2").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), }, @@ -550,19 +610,21 @@ func TestDryRunPreemption(t *testing.T) { Status: policy.PodDisruptionBudgetStatus{DisruptionsAllowed: 1, DisruptedPods: map[string]metav1.Time{"p2": {Time: time.Now()}}}, }, }, - expected: []Candidate{ - &candidate{ - victims: &extenderv1.Victims{ - Pods: []*v1.Pod{ - st.MakePod().Name("p1.1").UID("p1.1").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), - st.MakePod().Name("p1.2").UID("p1.2").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), + expected: [][]Candidate{ + { + &candidate{ + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{ + st.MakePod().Name("p1.1").UID("p1.1").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), + st.MakePod().Name("p1.2").UID("p1.2").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), + }, + NumPDBViolations: 1, }, - NumPDBViolations: 1, + name: "node1", }, - name: "node1", }, }, - expectedNumFilterCalled: 3, + expectedNumFilterCalled: []int32{3}, }, { name: "preemption with violation of the pdb with pod whose eviction was processed, the victim belongs to DisruptedPods", @@ -570,8 +632,10 @@ func TestDryRunPreemption(t *testing.T) { st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), }, nodeNames: []string{"node1"}, - pod: st.MakePod().Name("p").UID("p").Priority(highPriority).Req(veryLargeRes).Obj(), - pods: []*v1.Pod{ + testPods: []*v1.Pod{ + st.MakePod().Name("p").UID("p").Priority(highPriority).Req(veryLargeRes).Obj(), + }, + initPods: []*v1.Pod{ st.MakePod().Name("p1.1").UID("p1.1").Node("node1").Label("app", 
"foo").Priority(midPriority).Req(mediumRes).Obj(), st.MakePod().Name("p1.2").UID("p1.2").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), }, @@ -581,19 +645,21 @@ func TestDryRunPreemption(t *testing.T) { Status: policy.PodDisruptionBudgetStatus{DisruptionsAllowed: 1, DisruptedPods: map[string]metav1.Time{"p1.2": {Time: time.Now()}}}, }, }, - expected: []Candidate{ - &candidate{ - victims: &extenderv1.Victims{ - Pods: []*v1.Pod{ - st.MakePod().Name("p1.1").UID("p1.1").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), - st.MakePod().Name("p1.2").UID("p1.2").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), + expected: [][]Candidate{ + { + &candidate{ + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{ + st.MakePod().Name("p1.1").UID("p1.1").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), + st.MakePod().Name("p1.2").UID("p1.2").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), + }, + NumPDBViolations: 0, }, - NumPDBViolations: 0, + name: "node1", }, - name: "node1", }, }, - expectedNumFilterCalled: 3, + expectedNumFilterCalled: []int32{3}, }, { name: "preemption with violation of the pdb with pod whose eviction was processed, the victim which belongs to DisruptedPods is treated as 'nonViolating'", @@ -601,8 +667,10 @@ func TestDryRunPreemption(t *testing.T) { st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), }, nodeNames: []string{"node1"}, - pod: st.MakePod().Name("p").UID("p").Priority(highPriority).Req(veryLargeRes).Obj(), - pods: []*v1.Pod{ + testPods: []*v1.Pod{ + st.MakePod().Name("p").UID("p").Priority(highPriority).Req(veryLargeRes).Obj(), + }, + initPods: []*v1.Pod{ st.MakePod().Name("p1.1").UID("p1.1").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), st.MakePod().Name("p1.2").UID("p1.2").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), st.MakePod().Name("p1.3").UID("p1.3").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), @@ -613,26 +681,229 @@ func TestDryRunPreemption(t *testing.T) { Status: policy.PodDisruptionBudgetStatus{DisruptionsAllowed: 1, DisruptedPods: map[string]metav1.Time{"p1.3": {Time: time.Now()}}}, }, }, - expected: []Candidate{ - &candidate{ - victims: &extenderv1.Victims{ - Pods: []*v1.Pod{ - st.MakePod().Name("p1.1").UID("p1.1").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), - st.MakePod().Name("p1.2").UID("p1.2").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), - st.MakePod().Name("p1.3").UID("p1.3").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), + expected: [][]Candidate{ + { + &candidate{ + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{ + st.MakePod().Name("p1.1").UID("p1.1").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), + st.MakePod().Name("p1.2").UID("p1.2").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), + st.MakePod().Name("p1.3").UID("p1.3").Node("node1").Label("app", "foo").Priority(midPriority).Req(mediumRes).Obj(), + }, + NumPDBViolations: 1, }, - NumPDBViolations: 1, + name: "node1", }, - name: "node1", }, }, - expectedNumFilterCalled: 4, + expectedNumFilterCalled: []int32{4}, + }, + { + name: "all nodes are possible candidates, but DefaultPreemptionArgs limits to 2", + args: 
&config.DefaultPreemptionArgs{MinCandidateNodesPercentage: 40, MinCandidateNodesAbsolute: 1}, + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + }, + nodeNames: []string{"node1", "node2", "node3", "node4", "node5"}, + testPods: []*v1.Pod{ + st.MakePod().Name("p").UID("p").Priority(highPriority).Req(largeRes).Obj(), + }, + initPods: []*v1.Pod{ + st.MakePod().Name("p1").UID("p1").Node("node1").Priority(midPriority).Req(largeRes).Obj(), + st.MakePod().Name("p2").UID("p2").Node("node2").Priority(midPriority).Req(largeRes).Obj(), + st.MakePod().Name("p3").UID("p3").Node("node3").Priority(midPriority).Req(largeRes).Obj(), + st.MakePod().Name("p4").UID("p4").Node("node4").Priority(midPriority).Req(largeRes).Obj(), + st.MakePod().Name("p5").UID("p5").Node("node5").Priority(midPriority).Req(largeRes).Obj(), + }, + disableParallelism: true, + expected: [][]Candidate{ + { + // cycle=0 => offset=4 => node5 (yes), node1 (yes) + &candidate{ + name: "node1", + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{st.MakePod().Name("p1").UID("p1").Node("node1").Priority(midPriority).Req(largeRes).Obj()}, + }, + }, + &candidate{ + name: "node5", + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{st.MakePod().Name("p5").UID("p5").Node("node5").Priority(midPriority).Req(largeRes).Obj()}, + }, + }, + }, + }, + expectedNumFilterCalled: []int32{4}, + }, + { + name: "some nodes are not possible candidates, DefaultPreemptionArgs limits to 2", + args: &config.DefaultPreemptionArgs{MinCandidateNodesPercentage: 40, MinCandidateNodesAbsolute: 1}, + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + }, + nodeNames: []string{"node1", "node2", "node3", "node4", "node5"}, + testPods: []*v1.Pod{ + st.MakePod().Name("p").UID("p").Priority(highPriority).Req(largeRes).Obj(), + }, + initPods: []*v1.Pod{ + st.MakePod().Name("p1").UID("p1").Node("node1").Priority(midPriority).Req(largeRes).Obj(), + st.MakePod().Name("p2").UID("p2").Node("node2").Priority(veryHighPriority).Req(largeRes).Obj(), + st.MakePod().Name("p3").UID("p3").Node("node3").Priority(midPriority).Req(largeRes).Obj(), + st.MakePod().Name("p4").UID("p4").Node("node4").Priority(midPriority).Req(largeRes).Obj(), + st.MakePod().Name("p5").UID("p5").Node("node5").Priority(veryHighPriority).Req(largeRes).Obj(), + }, + disableParallelism: true, + expected: [][]Candidate{ + { + // cycle=0 => offset=4 => node5 (no), node1 (yes), node2 (no), node3 (yes) + &candidate{ + name: "node1", + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{st.MakePod().Name("p1").UID("p1").Node("node1").Priority(midPriority).Req(largeRes).Obj()}, + }, + }, + &candidate{ + name: "node3", + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{st.MakePod().Name("p3").UID("p3").Node("node3").Priority(midPriority).Req(largeRes).Obj()}, + }, + }, + }, + }, + expectedNumFilterCalled: []int32{4}, + }, + { + name: "preemption offset across multiple scheduling cycles and wrap around", + args: &config.DefaultPreemptionArgs{MinCandidateNodesPercentage: 40, MinCandidateNodesAbsolute: 1}, + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + }, + nodeNames: []string{"node1", "node2", "node3", "node4", "node5"}, + testPods: []*v1.Pod{ + st.MakePod().Name("tp1").UID("tp1").Priority(highPriority).Req(largeRes).Obj(), + 
st.MakePod().Name("tp2").UID("tp2").Priority(highPriority).Req(largeRes).Obj(), + st.MakePod().Name("tp3").UID("tp3").Priority(highPriority).Req(largeRes).Obj(), + }, + initPods: []*v1.Pod{ + st.MakePod().Name("p1").UID("p1").Node("node1").Priority(midPriority).Req(largeRes).Obj(), + st.MakePod().Name("p2").UID("p2").Node("node2").Priority(midPriority).Req(largeRes).Obj(), + st.MakePod().Name("p3").UID("p3").Node("node3").Priority(midPriority).Req(largeRes).Obj(), + st.MakePod().Name("p4").UID("p4").Node("node4").Priority(midPriority).Req(largeRes).Obj(), + st.MakePod().Name("p5").UID("p5").Node("node5").Priority(midPriority).Req(largeRes).Obj(), + }, + disableParallelism: true, + expected: [][]Candidate{ + { + // cycle=0 => offset=4 => node5 (yes), node1 (yes) + &candidate{ + name: "node1", + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{st.MakePod().Name("p1").UID("p1").Node("node1").Priority(midPriority).Req(largeRes).Obj()}, + }, + }, + &candidate{ + name: "node5", + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{st.MakePod().Name("p5").UID("p5").Node("node5").Priority(midPriority).Req(largeRes).Obj()}, + }, + }, + }, + { + // cycle=1 => offset=1 => node2 (yes), node3 (yes) + &candidate{ + name: "node2", + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{st.MakePod().Name("p2").UID("p2").Node("node2").Priority(midPriority).Req(largeRes).Obj()}, + }, + }, + &candidate{ + name: "node3", + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{st.MakePod().Name("p3").UID("p3").Node("node3").Priority(midPriority).Req(largeRes).Obj()}, + }, + }, + }, + { + // cycle=2 => offset=3 => node4 (yes), node5 (yes) + &candidate{ + name: "node4", + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{st.MakePod().Name("p4").UID("p4").Node("node4").Priority(midPriority).Req(largeRes).Obj()}, + }, + }, + &candidate{ + name: "node5", + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{st.MakePod().Name("p5").UID("p5").Node("node5").Priority(midPriority).Req(largeRes).Obj()}, + }, + }, + }, + }, + expectedNumFilterCalled: []int32{4, 4, 4}, + }, + { + name: "preemption looks past numCandidates until a non-PDB violating node is found", + args: &config.DefaultPreemptionArgs{MinCandidateNodesPercentage: 40, MinCandidateNodesAbsolute: 2}, + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterPluginAsExtensions(noderesources.FitName, noderesources.NewFit, "Filter", "PreFilter"), + }, + nodeNames: []string{"node1", "node2", "node3", "node4", "node5"}, + testPods: []*v1.Pod{ + st.MakePod().Name("p").UID("p").Priority(highPriority).Req(largeRes).Obj(), + }, + initPods: []*v1.Pod{ + st.MakePod().Name("p1").UID("p1").Node("node1").Label("app", "foo").Priority(midPriority).Req(largeRes).Obj(), + st.MakePod().Name("p2").UID("p2").Node("node2").Label("app", "foo").Priority(midPriority).Req(largeRes).Obj(), + st.MakePod().Name("p3").UID("p3").Node("node3").Priority(midPriority).Req(largeRes).Obj(), + st.MakePod().Name("p4").UID("p4").Node("node4").Priority(midPriority).Req(largeRes).Obj(), + st.MakePod().Name("p5").UID("p5").Node("node5").Label("app", "foo").Priority(midPriority).Req(largeRes).Obj(), + }, + pdbs: []*policy.PodDisruptionBudget{ + { + Spec: policy.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "foo"}}}, + Status: policy.PodDisruptionBudgetStatus{DisruptionsAllowed: 0}, + }, + }, + disableParallelism: true, + expected: [][]Candidate{ + { + // Even though the DefaultPreemptionArgs constraints suggest that the + // minimum number of candidates is 2, we get three 
candidates here + // because we're okay with being a little over (in production, if a + // non-PDB violating candidate isn't found close to the offset, the + // number of additional candidates returned will be at most + // approximately equal to the parallelism in dryRunPreemption). + // cycle=0 => offset=4 => node5 (yes, pdb), node1 (yes, pdb), node2 (no, pdb), node3 (yes) + &candidate{ + name: "node1", + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{st.MakePod().Name("p1").UID("p1").Node("node1").Label("app", "foo").Priority(midPriority).Req(largeRes).Obj()}, + NumPDBViolations: 1, + }, + }, + &candidate{ + name: "node3", + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{st.MakePod().Name("p3").UID("p3").Node("node3").Priority(midPriority).Req(largeRes).Obj()}, + }, + }, + &candidate{ + name: "node5", + victims: &extenderv1.Victims{ + Pods: []*v1.Pod{st.MakePod().Name("p5").UID("p5").Node("node5").Label("app", "foo").Priority(midPriority).Req(largeRes).Obj()}, + NumPDBViolations: 1, + }, + }, + }, + }, + expectedNumFilterCalled: []int32{8}, }, } labelKeys := []string{"hostname", "zone", "region"} for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { + rand.Seed(4) nodes := make([]*v1.Node, len(tt.nodeNames)) fakeFilterRCMap := make(map[string]framework.Code, len(tt.nodeNames)) for i, nodeName := range tt.nodeNames { @@ -647,7 +918,7 @@ func TestDryRunPreemption(t *testing.T) { nodes[i] = nodeWrapper.Obj() fakeFilterRCMap[nodeName] = tt.fakeFilterRC } - snapshot := internalcache.NewSnapshot(tt.pods, nodes) + snapshot := internalcache.NewSnapshot(tt.initPods, nodes) // For each test, register a FakeFilterPlugin along with essential plugins and tt.registerPlugins. fakePlugin := st.FakeFilterPlugin{ @@ -675,36 +946,59 @@ func TestDryRunPreemption(t *testing.T) { t.Fatal(err) } - state := framework.NewCycleState() - // Some tests rely on PreFilter plugin to compute its CycleState. - if status := fwk.RunPreFilterPlugins(context.Background(), state, tt.pod); !status.IsSuccess() { - t.Errorf("Unexpected PreFilter Status: %v", status) - } - nodeInfos, err := snapshot.NodeInfos().List() if err != nil { t.Fatal(err) } - got := dryRunPreemption(context.Background(), fwk.PreemptHandle(), state, tt.pod, nodeInfos, tt.pdbs) - if err != nil { - t.Fatal(err) - } - // Sort the values (inner victims) and the candidate itself (by its NominatedNodeName). - for i := range got { - victims := got[i].Victims().Pods - sort.Slice(victims, func(i, j int) bool { - return victims[i].Name < victims[j].Name - }) - } - sort.Slice(got, func(i, j int) bool { - return got[i].Name() < got[j].Name() + sort.Slice(nodeInfos, func(i, j int) bool { + return nodeInfos[i].Node().Name < nodeInfos[j].Node().Name }) - if tt.expectedNumFilterCalled != fakePlugin.NumFilterCalled { - t.Errorf("expected fakePlugin.numFilterCalled is %d, but got %d", tt.expectedNumFilterCalled, fakePlugin.NumFilterCalled) + if tt.disableParallelism { + // We need disableParallelism because of the non-deterministic nature + // of the results of tests that set custom minCandidateNodesPercentage + // or minCandidateNodesAbsolute. This is only done in a handful of tests. 
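// A minimal sketch of the enumeration order implied by the per-cycle comments
// in the test cases above ("cycle=0 => offset=4 => node5 (yes), node1 (yes)"):
// a wrap-around walk over the node list that stops once enough candidates are
// found. Not part of the patch; the function name, signature, and the fits
// callback are illustrative only. The real dryRunPreemption evaluates victims
// per node and runs the checks in parallel, which is why it can shortlist a
// few more than numCandidates and why these tests pin parallelism to 1.
package main

import "fmt"

// Candidate i in the search order is nodes[(offset+i) % len(nodes)]; the walk
// stops once numCandidates fitting nodes have been collected.
func enumerateCandidates(nodes []string, offset, numCandidates int, fits func(string) bool) []string {
	var out []string
	for i := 0; i < len(nodes) && len(out) < numCandidates; i++ {
		if n := nodes[(offset+i)%len(nodes)]; fits(n) {
			out = append(out, n)
		}
	}
	return out
}

func main() {
	nodes := []string{"node1", "node2", "node3", "node4", "node5"}
	// offset=4, numCandidates=2, every node preemptible: node5 then node1, as in
	// "all nodes are possible candidates, but DefaultPreemptionArgs limits to 2".
	fmt.Println(enumerateCandidates(nodes, 4, 2, func(string) bool { return true }))
	// With node2 and node5 hosting veryHighPriority pods and therefore skipped,
	// the same walk yields node1 and node3, as in the "some nodes are not
	// possible candidates" case.
	unfit := map[string]bool{"node2": true, "node5": true}
	fmt.Println(enumerateCandidates(nodes, 4, 2, func(n string) bool { return !unfit[n] }))
}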
+ oldParallelism := parallelize.GetParallelism() + parallelize.SetParallelism(1) + t.Cleanup(func() { + parallelize.SetParallelism(oldParallelism) + }) } - if diff := cmp.Diff(tt.expected, got, cmp.AllowUnexported(candidate{})); diff != "" { - t.Errorf("Unexpected candidates (-want, +got): %s", diff) + + if tt.args == nil { + tt.args = getDefaultDefaultPreemptionArgs() + } + pl := &DefaultPreemption{args: *tt.args} + + var prevNumFilterCalled int32 + for cycle, pod := range tt.testPods { + state := framework.NewCycleState() + // Some tests rely on PreFilter plugin to compute its CycleState. + if status := fwk.RunPreFilterPlugins(context.Background(), state, pod); !status.IsSuccess() { + t.Errorf("cycle %d: Unexpected PreFilter Status: %v", cycle, status) + } + offset, numCandidates := pl.getOffsetAndNumCandidates(int32(len(nodeInfos))) + got := dryRunPreemption(context.Background(), fwk.PreemptHandle(), state, pod, nodeInfos, tt.pdbs, offset, numCandidates) + if err != nil { + t.Fatal(err) + } + // Sort the values (inner victims) and the candidate itself (by its NominatedNodeName). + for i := range got { + victims := got[i].Victims().Pods + sort.Slice(victims, func(i, j int) bool { + return victims[i].Name < victims[j].Name + }) + } + sort.Slice(got, func(i, j int) bool { + return got[i].Name() < got[j].Name() + }) + if fakePlugin.NumFilterCalled-prevNumFilterCalled != tt.expectedNumFilterCalled[cycle] { + t.Errorf("cycle %d: got NumFilterCalled=%d, want %d", cycle, fakePlugin.NumFilterCalled-prevNumFilterCalled, tt.expectedNumFilterCalled[cycle]) + } + prevNumFilterCalled = fakePlugin.NumFilterCalled + if diff := cmp.Diff(tt.expected[cycle], got, cmp.AllowUnexported(candidate{})); diff != "" { + t.Errorf("cycle %d: unexpected candidates (-want, +got): %s", cycle, diff) + } } }) } @@ -876,6 +1170,7 @@ func TestSelectBestCandidate(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { + rand.Seed(4) nodes := make([]*v1.Node, len(tt.nodeNames)) for i, nodeName := range tt.nodeNames { nodes[i] = st.MakeNode().Name(nodeName).Capacity(veryLargeRes).Obj() @@ -903,7 +1198,10 @@ func TestSelectBestCandidate(t *testing.T) { if err != nil { t.Fatal(err) } - candidates := dryRunPreemption(context.Background(), fwk.PreemptHandle(), state, tt.pod, nodeInfos, nil) + + pl := &DefaultPreemption{args: *getDefaultDefaultPreemptionArgs()} + offset, numCandidates := pl.getOffsetAndNumCandidates(int32(len(nodeInfos))) + candidates := dryRunPreemption(context.Background(), fwk.PreemptHandle(), state, tt.pod, nodeInfos, nil, offset, numCandidates) s := SelectCandidate(candidates) found := false for _, nodeName := range tt.expected { @@ -1312,6 +1610,7 @@ func TestPreempt(t *testing.T) { fh: fwk, podLister: informerFactory.Core().V1().Pods().Lister(), pdbLister: getPDBLister(informerFactory), + args: *getDefaultDefaultPreemptionArgs(), } node, err := pl.preempt(context.Background(), state, test.pod, make(framework.NodeToStatusMap)) if err != nil { diff --git a/pkg/scheduler/internal/parallelize/parallelism.go b/pkg/scheduler/internal/parallelize/parallelism.go index 4eebc62e33b..db2df1c5eaa 100644 --- a/pkg/scheduler/internal/parallelize/parallelism.go +++ b/pkg/scheduler/internal/parallelize/parallelism.go @@ -27,6 +27,11 @@ var ( parallelism = 16 ) +// GetParallelism returns the currently set parallelism. +func GetParallelism() int { + return parallelism +} + // SetParallelism sets the parallelism for all scheduler algorithms. 
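// A sketch of what getOffsetAndNumCandidates, called in the test loop above,
// might look like. Its implementation is not shown in this hunk; the shape
// below is inferred from the rand.Seed(4) calls added to the tests and from
// the numCandidates formula in the DefaultPreemptionArgs field docs. The
// stand-in args struct, the helper name calculateNumCandidates, and the cap
// at numNodes are assumptions.
package main

import (
	"fmt"
	"math/rand"
)

type defaultPreemptionArgs struct {
	MinCandidateNodesPercentage int32
	MinCandidateNodesAbsolute   int32
}

// The random offset spreads the starting point of the wrap-around walk so
// repeated preemption attempts do not always probe the same nodes first.
func getOffsetAndNumCandidates(args defaultPreemptionArgs, numNodes int32) (int32, int32) {
	return rand.Int31n(numNodes), calculateNumCandidates(args, numNodes)
}

// numCandidates = max(numNodes*MinCandidateNodesPercentage/100,
// MinCandidateNodesAbsolute), never exceeding the number of nodes.
func calculateNumCandidates(args defaultPreemptionArgs, numNodes int32) int32 {
	n := numNodes * args.MinCandidateNodesPercentage / 100
	if n < args.MinCandidateNodesAbsolute {
		n = args.MinCandidateNodesAbsolute
	}
	if n > numNodes {
		n = numNodes
	}
	return n
}

func main() {
	// The test args {40%, 1} over 5 nodes give max(2, 1) = 2 candidates,
	// matching the "limits to 2" test names; the defaults {10%, 100} over a
	// 5000-node cluster would give max(500, 100) = 500.
	fmt.Println(calculateNumCandidates(defaultPreemptionArgs{40, 1}, 5))
	fmt.Println(calculateNumCandidates(defaultPreemptionArgs{10, 100}, 5000))
	rand.Seed(4) // mirrors the determinism trick used in the tests above
	offset, n := getOffsetAndNumCandidates(defaultPreemptionArgs{40, 1}, 5)
	fmt.Println(offset, n)
}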
// TODO(#95952): Remove global setter in favor of a struct that holds the configuration. func SetParallelism(p int) { diff --git a/staging/src/k8s.io/kube-scheduler/config/v1beta1/register.go b/staging/src/k8s.io/kube-scheduler/config/v1beta1/register.go index 98a8f7b22e9..602053ac627 100644 --- a/staging/src/k8s.io/kube-scheduler/config/v1beta1/register.go +++ b/staging/src/k8s.io/kube-scheduler/config/v1beta1/register.go @@ -38,6 +38,7 @@ var ( func addKnownTypes(scheme *runtime.Scheme) error { scheme.AddKnownTypes(SchemeGroupVersion, &KubeSchedulerConfiguration{}, + &DefaultPreemptionArgs{}, &InterPodAffinityArgs{}, &NodeLabelArgs{}, &NodeResourcesFitArgs{}, diff --git a/staging/src/k8s.io/kube-scheduler/config/v1beta1/types_pluginargs.go b/staging/src/k8s.io/kube-scheduler/config/v1beta1/types_pluginargs.go index d9092b8277e..0c7d14b85ac 100644 --- a/staging/src/k8s.io/kube-scheduler/config/v1beta1/types_pluginargs.go +++ b/staging/src/k8s.io/kube-scheduler/config/v1beta1/types_pluginargs.go @@ -23,6 +23,28 @@ import ( // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +// DefaultPreemptionArgs holds arguments used to configure the +// DefaultPreemption plugin. +type DefaultPreemptionArgs struct { + metav1.TypeMeta `json:",inline"` + + // MinCandidateNodesPercentage is the minimum number of candidates to + // shortlist when dry running preemption as a percentage of number of nodes. + // Must be in the range [0, 100]. Defaults to 10% of the cluster size if + // unspecified. + MinCandidateNodesPercentage *int32 `json:"minCandidateNodesPercentage,omitempty"` + // MinCandidateNodesAbsolute is the absolute minimum number of candidates to + // shortlist. The likely number of candidates enumerated for dry running + // preemption is given by the formula: + // numCandidates = max(numNodes * minCandidateNodesPercentage, minCandidateNodesAbsolute) + // We say "likely" because there are other factors such as PDB violations + // that play a role in the number of candidates shortlisted. Must be at least + // 0 nodes. Defaults to 100 nodes if unspecified. + MinCandidateNodesAbsolute *int32 `json:"minCandidateNodesAbsolute,omitempty"` +} + +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object + // InterPodAffinityArgs holds arguments used to configure the InterPodAffinity plugin. type InterPodAffinityArgs struct { metav1.TypeMeta `json:",inline"` diff --git a/staging/src/k8s.io/kube-scheduler/config/v1beta1/zz_generated.deepcopy.go b/staging/src/k8s.io/kube-scheduler/config/v1beta1/zz_generated.deepcopy.go index 8bc3650b38b..00d64b2ff09 100644 --- a/staging/src/k8s.io/kube-scheduler/config/v1beta1/zz_generated.deepcopy.go +++ b/staging/src/k8s.io/kube-scheduler/config/v1beta1/zz_generated.deepcopy.go @@ -26,6 +26,41 @@ import ( v1 "k8s.io/kube-scheduler/config/v1" ) +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DefaultPreemptionArgs) DeepCopyInto(out *DefaultPreemptionArgs) { + *out = *in + out.TypeMeta = in.TypeMeta + if in.MinCandidateNodesPercentage != nil { + in, out := &in.MinCandidateNodesPercentage, &out.MinCandidateNodesPercentage + *out = new(int32) + **out = **in + } + if in.MinCandidateNodesAbsolute != nil { + in, out := &in.MinCandidateNodesAbsolute, &out.MinCandidateNodesAbsolute + *out = new(int32) + **out = **in + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DefaultPreemptionArgs. 
+func (in *DefaultPreemptionArgs) DeepCopy() *DefaultPreemptionArgs { + if in == nil { + return nil + } + out := new(DefaultPreemptionArgs) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *DefaultPreemptionArgs) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Extender) DeepCopyInto(out *Extender) { *out = *in
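// Usage sketch for the generated DeepCopy above: both int32 fields are copied
// into freshly allocated pointers, so a copy of the v1beta1 args can be
// defaulted or mutated without touching the original object. Assumes the
// k8s.io/utils/pointer helpers used elsewhere in this change; illustrative
// only, not part of the patch.
package main

import (
	"fmt"

	"k8s.io/kube-scheduler/config/v1beta1"
	"k8s.io/utils/pointer"
)

func main() {
	in := &v1beta1.DefaultPreemptionArgs{
		MinCandidateNodesPercentage: pointer.Int32Ptr(50),
	}
	out := in.DeepCopy()
	out.MinCandidateNodesAbsolute = pointer.Int32Ptr(100) // fill the missing field on the copy
	*out.MinCandidateNodesPercentage = 10                 // rewrite through the copied pointer
	// The original still reports 50 and a nil MinCandidateNodesAbsolute.
	fmt.Println(*in.MinCandidateNodesPercentage, in.MinCandidateNodesAbsolute == nil)
}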