Merge pull request #50949 from bsalamat/preemption_eviction

Automatic merge from submit-queue Add pod preemption to the scheduler **What this PR does / why we need it**: This is the last of a series of PRs to add priority-based preemption to the scheduler. This PR connects the preemption logic to the scheduler workflow. **Which issue this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close that issue when PR gets merged)*: fixes #48646 **Special notes for your reviewer**: This PR includes other PRs which are under review (#50805, #50405, #50190). All the new code is located in 43627afdf9. **Release note**: ```release-note Add priority-based preemption to the scheduler. ``` ref/ #47604 /assign @davidopp @kubernetes/sig-scheduling-pr-reviews
2025-10-22 06:59:03 +00:00 · 2017-09-08 14:19:42 -07:00
parent ed154988c5 c0b718373b
commit f695a3120a
34 changed files with 1900 additions and 91 deletions
--- a/test/integration/scheduler/BUILD
+++ b/test/integration/scheduler/BUILD
@@ -21,12 +21,14 @@ go_test(
    deps = [
        "//pkg/api:go_default_library",
        "//pkg/api/testapi:go_default_library",
+        "//pkg/features:go_default_library",
        "//plugin/cmd/kube-scheduler/app:go_default_library",
        "//plugin/cmd/kube-scheduler/app/options:go_default_library",
        "//plugin/pkg/scheduler:go_default_library",
        "//plugin/pkg/scheduler/algorithm:go_default_library",
        "//plugin/pkg/scheduler/algorithmprovider:go_default_library",
        "//plugin/pkg/scheduler/api:go_default_library",
+        "//plugin/pkg/scheduler/core:go_default_library",
        "//plugin/pkg/scheduler/factory:go_default_library",
        "//plugin/pkg/scheduler/schedulercache:go_default_library",
        "//test/e2e/framework:go_default_library",
@@ -37,6 +39,7 @@ go_test(
        "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
        "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
        "//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library",
+        "//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library",
        "//vendor/k8s.io/client-go/informers:go_default_library",
        "//vendor/k8s.io/client-go/kubernetes:go_default_library",
        "//vendor/k8s.io/client-go/kubernetes/typed/core/v1:go_default_library",
--- a/test/integration/scheduler/priorities_test.go
+++ b/test/integration/scheduler/priorities_test.go
@@ -51,7 +51,7 @@ func TestNodeAffinity(t *testing.T) {
 	}
 	// Create a pod with node affinity.
 	podName := "pod-with-node-affinity"
-	pod, err := runPausePod(context.clientSet, &pausePodConfig{
+	pod, err := runPausePod(context.clientSet, initPausePod(context.clientSet, &pausePodConfig{
 		Name:      podName,
 		Namespace: context.ns.Name,
 		Affinity: &v1.Affinity{
@@ -72,7 +72,7 @@ func TestNodeAffinity(t *testing.T) {
 				},
 			},
 		},
-	})
+	}))
 	if err != nil {
 		t.Fatalf("Error running pause pod: %v", err)
 	}
@@ -110,11 +110,11 @@ func TestPodAffinity(t *testing.T) {
 	// Add a pod with a label and wait for it to schedule.
 	labelKey := "service"
 	labelValue := "S1"
-	_, err = runPausePod(context.clientSet, &pausePodConfig{
+	_, err = runPausePod(context.clientSet, initPausePod(context.clientSet, &pausePodConfig{
 		Name:      "attractor-pod",
 		Namespace: context.ns.Name,
 		Labels:    map[string]string{labelKey: labelValue},
-	})
+	}))
 	if err != nil {
 		t.Fatalf("Error running the attractor pod: %v", err)
 	}
@@ -125,7 +125,7 @@ func TestPodAffinity(t *testing.T) {
 	}
 	// Add a new pod with affinity to the attractor pod.
 	podName := "pod-with-podaffinity"
-	pod, err := runPausePod(context.clientSet, &pausePodConfig{
+	pod, err := runPausePod(context.clientSet, initPausePod(context.clientSet, &pausePodConfig{
 		Name:      podName,
 		Namespace: context.ns.Name,
 		Affinity: &v1.Affinity{
@@ -158,7 +158,7 @@ func TestPodAffinity(t *testing.T) {
 				},
 			},
 		},
-	})
+	}))
 	if err != nil {
 		t.Fatalf("Error running pause pod: %v", err)
 	}
--- a/test/integration/scheduler/scheduler_test.go
+++ b/test/integration/scheduler/scheduler_test.go
@@ -24,9 +24,11 @@ import (
 	"time"

 	"k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/wait"
+	utilfeature "k8s.io/apiserver/pkg/util/feature"
 	"k8s.io/client-go/informers"
 	clientset "k8s.io/client-go/kubernetes"
 	clientv1core "k8s.io/client-go/kubernetes/typed/core/v1"
@@ -36,15 +38,18 @@ import (
 	"k8s.io/client-go/tools/record"
 	"k8s.io/kubernetes/pkg/api"
 	"k8s.io/kubernetes/pkg/api/testapi"
+	"k8s.io/kubernetes/pkg/features"
 	"k8s.io/kubernetes/plugin/cmd/kube-scheduler/app"
 	"k8s.io/kubernetes/plugin/cmd/kube-scheduler/app/options"
 	"k8s.io/kubernetes/plugin/pkg/scheduler"
 	"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
 	_ "k8s.io/kubernetes/plugin/pkg/scheduler/algorithmprovider"
 	schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
+	"k8s.io/kubernetes/plugin/pkg/scheduler/core"
 	"k8s.io/kubernetes/plugin/pkg/scheduler/factory"
 	"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
 	"k8s.io/kubernetes/test/integration/framework"
+	testutils "k8s.io/kubernetes/test/utils"
 )

 const enableEquivalenceCache = true
@@ -56,11 +61,11 @@ type nodeStateManager struct {
 	makeUnSchedulable nodeMutationFunc
 }

+// PredicateOne is a stub fit predicate for tests. It ignores all of its
+// arguments and always returns true with no failure reasons and no error.
-func PredicateOne(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
+func PredicateOne(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
 	return true, nil, nil
 }

+// PredicateTwo is a second stub fit predicate for tests. Like PredicateOne it
+// ignores its arguments and always returns true with no failure reasons and
+// no error.
-func PredicateTwo(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
+func PredicateTwo(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
 	return true, nil, nil
 }

@@ -457,13 +462,13 @@ func TestMultiScheduler(t *testing.T) {
 	}

 	defaultScheduler := "default-scheduler"
-	testPodFitsDefault, err := createPausePod(context.clientSet, &pausePodConfig{Name: "pod-fits-default", Namespace: context.ns.Name, SchedulerName: defaultScheduler})
+	testPodFitsDefault, err := createPausePod(context.clientSet, initPausePod(context.clientSet, &pausePodConfig{Name: "pod-fits-default", Namespace: context.ns.Name, SchedulerName: defaultScheduler}))
 	if err != nil {
 		t.Fatalf("Failed to create pod: %v", err)
 	}

 	fooScheduler := "foo-scheduler"
-	testPodFitsFoo, err := createPausePod(context.clientSet, &pausePodConfig{Name: "pod-fits-foo", Namespace: context.ns.Name, SchedulerName: fooScheduler})
+	testPodFitsFoo, err := createPausePod(context.clientSet, initPausePod(context.clientSet, &pausePodConfig{Name: "pod-fits-foo", Namespace: context.ns.Name, SchedulerName: fooScheduler}))
 	if err != nil {
 		t.Fatalf("Failed to create pod: %v", err)
 	}
@@ -647,3 +652,251 @@ func TestAllocatable(t *testing.T) {
 		t.Logf("Test allocatable awareness: %s Pod not scheduled as expected", testAllocPod2.Name)
 	}
 }
+
+// TestPreemption tests a few preemption scenarios.
+//
+// Every case runs against a single labelled node. For each case the test:
+//  1. creates the existing pods and waits for them to be scheduled,
+//  2. creates the preemptor pod,
+//  3. waits for every pod listed in preemptedPodIndexes to get a deletion
+//     timestamp and checks that the remaining pods have not been deleted,
+//  4. when at least one victim is expected, waits for the preemptor to carry a
+//     non-empty nominated-node annotation,
+//  5. deletes all pods of the case before moving on to the next one.
+func TestPreemption(t *testing.T) {
+	// Enable PodPriority feature gate.
+	if err := utilfeature.DefaultFeatureGate.Set(fmt.Sprintf("%s=true", features.PodPriority)); err != nil {
+		t.Fatalf("Cannot enable feature gate %v: %v", features.PodPriority, err)
+	}
+	// Initialize scheduler.
+	context := initTest(t, "preemption")
+	defer cleanupTest(t, context)
+	cs := context.clientSet
+
+	lowPriority, mediumPriority, highPriority := int32(100), int32(200), int32(300)
+	defaultPodRes := &v1.ResourceRequirements{Requests: v1.ResourceList{
+		v1.ResourceCPU:    *resource.NewMilliQuantity(100, resource.DecimalSI),
+		v1.ResourceMemory: *resource.NewQuantity(100, resource.BinarySI)},
+	}
+
+	tests := []struct {
+		description  string
+		existingPods []*v1.Pod
+		pod          *v1.Pod
+		// preemptedPodIndexes holds the indexes into existingPods of the pods
+		// that are expected to be preempted.
+		preemptedPodIndexes map[int]struct{}
+	}{
+		{
+			description: "basic pod preemption",
+			existingPods: []*v1.Pod{
+				initPausePod(cs, &pausePodConfig{
+					Name:      "victim-pod",
+					Namespace: context.ns.Name,
+					Priority:  &lowPriority,
+					Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
+						v1.ResourceCPU:    *resource.NewMilliQuantity(400, resource.DecimalSI),
+						v1.ResourceMemory: *resource.NewQuantity(200, resource.BinarySI)},
+					},
+				}),
+			},
+			pod: initPausePod(cs, &pausePodConfig{
+				Name:      "preemptor-pod",
+				Namespace: context.ns.Name,
+				Priority:  &highPriority,
+				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
+					v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
+					v1.ResourceMemory: *resource.NewQuantity(200, resource.BinarySI)},
+				},
+			}),
+			preemptedPodIndexes: map[int]struct{}{0: {}},
+		},
+		{
+			description: "preemption is performed to satisfy anti-affinity",
+			existingPods: []*v1.Pod{
+				initPausePod(cs, &pausePodConfig{
+					Name: "pod-0", Namespace: context.ns.Name,
+					Priority:  &mediumPriority,
+					Labels:    map[string]string{"pod": "p0"},
+					Resources: defaultPodRes,
+				}),
+				initPausePod(cs, &pausePodConfig{
+					Name: "pod-1", Namespace: context.ns.Name,
+					Priority:  &lowPriority,
+					Labels:    map[string]string{"pod": "p1"},
+					Resources: defaultPodRes,
+					Affinity: &v1.Affinity{
+						PodAntiAffinity: &v1.PodAntiAffinity{
+							RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
+								{
+									LabelSelector: &metav1.LabelSelector{
+										MatchExpressions: []metav1.LabelSelectorRequirement{
+											{
+												Key:      "pod",
+												Operator: metav1.LabelSelectorOpIn,
+												Values:   []string{"preemptor"},
+											},
+										},
+									},
+									TopologyKey: "node",
+								},
+							},
+						},
+					},
+				}),
+			},
+			// A higher priority pod with anti-affinity.
+			pod: initPausePod(cs, &pausePodConfig{
+				Name:      "preemptor-pod",
+				Namespace: context.ns.Name,
+				Priority:  &highPriority,
+				Labels:    map[string]string{"pod": "preemptor"},
+				Resources: defaultPodRes,
+				Affinity: &v1.Affinity{
+					PodAntiAffinity: &v1.PodAntiAffinity{
+						RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
+							{
+								LabelSelector: &metav1.LabelSelector{
+									MatchExpressions: []metav1.LabelSelectorRequirement{
+										{
+											Key:      "pod",
+											Operator: metav1.LabelSelectorOpIn,
+											Values:   []string{"p0"},
+										},
+									},
+								},
+								TopologyKey: "node",
+							},
+						},
+					},
+				},
+			}),
+			preemptedPodIndexes: map[int]struct{}{0: {}, 1: {}},
+		},
+		{
+			// This is similar to the previous case, except that pod-1 has high priority.
+			description: "preemption is not performed when anti-affinity is not satisfied",
+			existingPods: []*v1.Pod{
+				initPausePod(cs, &pausePodConfig{
+					Name: "pod-0", Namespace: context.ns.Name,
+					Priority:  &mediumPriority,
+					Labels:    map[string]string{"pod": "p0"},
+					Resources: defaultPodRes,
+				}),
+				initPausePod(cs, &pausePodConfig{
+					Name: "pod-1", Namespace: context.ns.Name,
+					Priority:  &highPriority,
+					Labels:    map[string]string{"pod": "p1"},
+					Resources: defaultPodRes,
+					Affinity: &v1.Affinity{
+						PodAntiAffinity: &v1.PodAntiAffinity{
+							RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
+								{
+									LabelSelector: &metav1.LabelSelector{
+										MatchExpressions: []metav1.LabelSelectorRequirement{
+											{
+												Key:      "pod",
+												Operator: metav1.LabelSelectorOpIn,
+												Values:   []string{"preemptor"},
+											},
+										},
+									},
+									TopologyKey: "node",
+								},
+							},
+						},
+					},
+				}),
+			},
+			// A high priority pod with anti-affinity. pod-1 has the same
+			// priority as this pod.
+			pod: initPausePod(cs, &pausePodConfig{
+				Name:      "preemptor-pod",
+				Namespace: context.ns.Name,
+				Priority:  &highPriority,
+				Labels:    map[string]string{"pod": "preemptor"},
+				Resources: defaultPodRes,
+				Affinity: &v1.Affinity{
+					PodAntiAffinity: &v1.PodAntiAffinity{
+						RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
+							{
+								LabelSelector: &metav1.LabelSelector{
+									MatchExpressions: []metav1.LabelSelectorRequirement{
+										{
+											Key:      "pod",
+											Operator: metav1.LabelSelectorOpIn,
+											Values:   []string{"p0"},
+										},
+									},
+								},
+								TopologyKey: "node",
+							},
+						},
+					},
+				},
+			}),
+			preemptedPodIndexes: map[int]struct{}{},
+		},
+	}
+
+	// Create a node with some resources and a label.
+	nodeRes := &v1.ResourceList{
+		v1.ResourcePods:   *resource.NewQuantity(32, resource.DecimalSI),
+		v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
+		v1.ResourceMemory: *resource.NewQuantity(500, resource.BinarySI),
+	}
+	node, err := createNode(cs, "node1", nodeRes)
+	if err != nil {
+		t.Fatalf("Error creating nodes: %v", err)
+	}
+	// The "node" label is the topology key used by the anti-affinity terms above.
+	nodeLabels := map[string]string{"node": node.Name}
+	if err = testutils.AddLabelsToNode(cs, node.Name, nodeLabels); err != nil {
+		t.Fatalf("Cannot add labels to node: %v", err)
+	}
+	if err = waitForNodeLabels(cs, node.Name, nodeLabels); err != nil {
+		t.Fatalf("Adding labels to node didn't succeed: %v", err)
+	}
+
+	for _, test := range tests {
+		pods := make([]*v1.Pod, len(test.existingPods))
+		// Create and run existingPods.
+		for i, p := range test.existingPods {
+			pods[i], err = runPausePod(cs, p)
+			if err != nil {
+				t.Fatalf("Test [%v]: Error running pause pod: %v", test.description, err)
+			}
+		}
+		// Create the "pod". Nothing below is meaningful without it, so a
+		// failure here aborts the test instead of continuing.
+		preemptor, err := createPausePod(cs, test.pod)
+		if err != nil {
+			t.Fatalf("Test [%v]: Error while creating high priority pod: %v", test.description, err)
+		}
+		// Wait for preemption of pods and make sure the other ones are not preempted.
+		for i, p := range pods {
+			if _, found := test.preemptedPodIndexes[i]; found {
+				if err = wait.Poll(time.Second, wait.ForeverTestTimeout, podIsGettingEvicted(cs, p.Namespace, p.Name)); err != nil {
+					t.Errorf("Test [%v]: Pod %v is not getting evicted.", test.description, p.Name)
+				}
+				continue
+			}
+			// p was fetched before the preemptor existed, so its deletion
+			// timestamp can never be set; read the current object instead.
+			// This is a point-in-time check and cannot rule out a later
+			// preemption.
+			current, getErr := cs.CoreV1().Pods(p.Namespace).Get(p.Name, metav1.GetOptions{})
+			switch {
+			case errors.IsNotFound(getErr):
+				t.Errorf("Test [%v]: Didn't expect pod %v to get preempted.", test.description, p.Name)
+			case getErr != nil:
+				t.Errorf("Test [%v]: error getting pod %v: %v", test.description, p.Name, getErr)
+			case current.DeletionTimestamp != nil:
+				t.Errorf("Test [%v]: Didn't expect pod %v to get preempted.", test.description, p.Name)
+			}
+		}
+		// Also check that the preemptor pod gets the annotation for nominated node name.
+		if len(test.preemptedPodIndexes) > 0 {
+			if err = wait.Poll(time.Second, wait.ForeverTestTimeout, func() (bool, error) {
+				pod, getErr := cs.CoreV1().Pods(preemptor.Namespace).Get(preemptor.Name, metav1.GetOptions{})
+				if getErr != nil {
+					// Stop polling; the error is reported once below.
+					return false, getErr
+				}
+				return len(pod.Annotations[core.NominatedNodeAnnotationKey]) > 0, nil
+			}); err != nil {
+				t.Errorf("Test [%v]: Pod annotation did not get set: %v", test.description, err)
+			}
+		}
+
+		// Cleanup
+		pods = append(pods, preemptor)
+		for _, p := range pods {
+			err = cs.CoreV1().Pods(p.Namespace).Delete(p.Name, metav1.NewDeleteOptions(0))
+			if err != nil && !errors.IsNotFound(err) {
+				t.Errorf("Test [%v]: error, %v, while deleting pod during test.", test.description, err)
+			}
+			err = wait.Poll(time.Second, wait.ForeverTestTimeout, podDeleted(cs, p.Namespace, p.Name))
+			if err != nil {
+				t.Errorf("Test [%v]: error, %v, while waiting for pod to get deleted.", test.description, err)
+			}
+		}
+	}
+}
--- a/test/integration/scheduler/util.go
+++ b/test/integration/scheduler/util.go
@@ -205,6 +205,7 @@ type pausePodConfig struct {
 	Tolerations                       []v1.Toleration
 	NodeName                          string
 	SchedulerName                     string
+	Priority                          *int32
 }

 // initPausePod initializes a pod API object from the given config. It is used
@@ -213,6 +214,7 @@ func initPausePod(cs clientset.Interface, conf *pausePodConfig) *v1.Pod {
 	pod := &v1.Pod{
 		ObjectMeta: metav1.ObjectMeta{
 			Name:        conf.Name,
+			Namespace:   conf.Namespace,
 			Labels:      conf.Labels,
 			Annotations: conf.Annotations,
 		},
@@ -228,6 +230,7 @@ func initPausePod(cs clientset.Interface, conf *pausePodConfig) *v1.Pod {
 			Tolerations:   conf.Tolerations,
 			NodeName:      conf.NodeName,
 			SchedulerName: conf.SchedulerName,
+			Priority:      conf.Priority,
 		},
 	}
 	if conf.Resources != nil {
@@ -238,9 +241,8 @@ func initPausePod(cs clientset.Interface, conf *pausePodConfig) *v1.Pod {

-// createPausePod creates a pod with "Pause" image and the given config and
-// return its pointer and error status.
+// createPausePod creates the given pod through the API, in the namespace named
+// by p.Namespace. It returns the result of the Create call: the created pod
+// and an error status. Callers usually build p with initPausePod.
-func createPausePod(cs clientset.Interface, conf *pausePodConfig) (*v1.Pod, error) {
-	p := initPausePod(cs, conf)
-	return cs.CoreV1().Pods(conf.Namespace).Create(p)
+func createPausePod(cs clientset.Interface, p *v1.Pod) (*v1.Pod, error) {
+	return cs.CoreV1().Pods(p.Namespace).Create(p)
 }

 // createPausePodWithResource creates a pod with "Pause" image and the given
@@ -262,22 +264,21 @@ func createPausePodWithResource(cs clientset.Interface, podName string, nsName s
 			},
 		}
 	}
-	return createPausePod(cs, &conf)
+	return createPausePod(cs, initPausePod(cs, &conf))
 }

 // runPausePod creates a pod with "Pause" image and the given config and waits
 // until it is scheduled. It returns its pointer and error status.
-func runPausePod(cs clientset.Interface, conf *pausePodConfig) (*v1.Pod, error) {
-	p := initPausePod(cs, conf)
-	pod, err := cs.CoreV1().Pods(conf.Namespace).Create(p)
+func runPausePod(cs clientset.Interface, pod *v1.Pod) (*v1.Pod, error) {
+	pod, err := cs.CoreV1().Pods(pod.Namespace).Create(pod)
 	if err != nil {
 		return nil, fmt.Errorf("Error creating pause pod: %v", err)
 	}
 	if err = waitForPodToSchedule(cs, pod); err != nil {
 		return pod, fmt.Errorf("Pod %v didn't schedule successfully. Error: %v", pod.Name, err)
 	}
-	if pod, err = cs.CoreV1().Pods(conf.Namespace).Get(conf.Name, metav1.GetOptions{}); err != nil {
-		return pod, fmt.Errorf("Error getting pod %v info: %v", conf.Name, err)
+	if pod, err = cs.CoreV1().Pods(pod.Namespace).Get(pod.Name, metav1.GetOptions{}); err != nil {
+		return pod, fmt.Errorf("Error getting pod %v info: %v", pod.Name, err)
 	}
 	return pod, nil
 }
@@ -285,7 +286,10 @@ func runPausePod(cs clientset.Interface, conf *pausePodConfig) (*v1.Pod, error)
 // podDeleted returns true if a pod is not found in the given namespace.
 func podDeleted(c clientset.Interface, podNamespace, podName string) wait.ConditionFunc {
 	return func() (bool, error) {
-		_, err := c.CoreV1().Pods(podNamespace).Get(podName, metav1.GetOptions{})
+		pod, err := c.CoreV1().Pods(podNamespace).Get(podName, metav1.GetOptions{})
+		if pod.DeletionTimestamp != nil {
+			return true, nil
+		}
 		if errors.IsNotFound(err) {
 			return true, nil
 		}
@@ -293,6 +297,20 @@ func podDeleted(c clientset.Interface, podNamespace, podName string) wait.Condit
 	}
 }

+// podIsGettingEvicted builds a wait.ConditionFunc for the named pod. The
+// condition reports true once the pod's deletion timestamp is set and false
+// while it is unset; an error from looking the pod up is returned unchanged.
+func podIsGettingEvicted(c clientset.Interface, podNamespace, podName string) wait.ConditionFunc {
+	return func() (bool, error) {
+		evictee, getErr := c.CoreV1().Pods(podNamespace).Get(podName, metav1.GetOptions{})
+		if getErr != nil {
+			return false, getErr
+		}
+		return evictee.DeletionTimestamp != nil, nil
+	}
+}
+
 // podScheduled returns true if a node is assigned to the given pod.
 func podScheduled(c clientset.Interface, podNamespace, podName string) wait.ConditionFunc {
 	return func() (bool, error) {