From 08e8403e2eae3f4c8d3d336fc917818c935410a6 Mon Sep 17 00:00:00 2001
From: Kensei Nakada
Date: Thu, 31 Oct 2024 17:10:15 +0900
Subject: [PATCH] feat: add the e2e test for the async preemption

---
 test/e2e/feature/feature.go       |   4 +
 test/e2e/scheduling/preemption.go | 157 ++++++++++++++++++++++++++++++
 2 files changed, 161 insertions(+)

diff --git a/test/e2e/feature/feature.go b/test/e2e/feature/feature.go
index 557d0930cee..166518eee0c 100644
--- a/test/e2e/feature/feature.go
+++ b/test/e2e/feature/feature.go
@@ -303,6 +303,10 @@ var (
 	// TODO: document the feature (owning SIG, when to use this feature for a test)
 	RegularResourceUsageTracking = framework.WithFeature(framework.ValidFeatures.Add("RegularResourceUsageTracking"))
 
+	// Owner: sig-scheduling
+	// Marks tests of asynchronous preemption (KEP-4832) that require the `SchedulerAsyncPreemption` feature gate to be enabled.
+	SchedulerAsyncPreemption = framework.WithFeature(framework.ValidFeatures.Add("SchedulerAsyncPreemption"))
+
 	// Owner: sig-network
 	// Marks tests that require a pod networking implementation that supports SCTP
 	// traffic between pods.
diff --git a/test/e2e/scheduling/preemption.go b/test/e2e/scheduling/preemption.go
index a3668c94fd4..7ae3873fc4a 100644
--- a/test/e2e/scheduling/preemption.go
+++ b/test/e2e/scheduling/preemption.go
@@ -43,6 +43,7 @@ import (
 	clientset "k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/tools/cache"
 	"k8s.io/kubernetes/pkg/apis/scheduling"
+	"k8s.io/kubernetes/test/e2e/feature"
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
@@ -305,6 +306,162 @@ var _ = SIGDescribe("SchedulerPreemption", framework.WithSerial(), func() {
 		}
 	})
 
+	/*
+		Release: v1.32
+		Testname: Scheduler runs the preemption with various priority classes expectedly
+		Description: When Pods with various priority classes trigger preemption,
+		the scheduler must favor the Pods with the higher priority class.
+	*/
+	framework.It("validates various priority Pods preempt expectedly with the async preemption", feature.SchedulerAsyncPreemption, func(ctx context.Context) {
+		var podRes v1.ResourceList
+		// Create 10 low priority pods on each node, consuming all of the node's extended resources.
+		ginkgo.By("Create 10 low-priority pods on each node.")
+		lowPriorityPods := make([]*v1.Pod, 0, 10*len(nodeList.Items))
+		// Create the low priority pods in the cluster.
+		// All of them will become victims of preemption by the high priority pods.
+		for i, node := range nodeList.Items {
+			// Update each node to advertise 10 available extended resources
+			nodeCopy := node.DeepCopy()
+			nodeCopy.Status.Capacity[testExtendedResource] = resource.MustParse("10")
+			nodeCopy.Status.Allocatable[testExtendedResource] = resource.MustParse("10")
+			err := patchNode(ctx, cs, &node, nodeCopy)
+			framework.ExpectNoError(err)
+
+			// Create 10 low priority pods on each node, which will use up 10/10 of the node's resources.
+			for j := 0; j < 10; j++ {
+				// Request 1 of the available resources for each victim pod
+				podRes = v1.ResourceList{}
+				podRes[testExtendedResource] = resource.MustParse("1")
+				pausePod := createPausePod(ctx, f, pausePodConfig{
+					Name:              fmt.Sprintf("pod%d-%d-%v", i, j, lowPriorityClassName),
+					PriorityClassName: lowPriorityClassName,
+					// This victim pod will be preempted by the high priority pod.
+					// However, its deletion will be blocked by the finalizer.
+					//
+					// The finalizer is needed to prevent the medium Pods from being scheduled instead of the high Pods,
+					// depending on when the scheduler notices the existence of all the high Pods we create.
+					Finalizers: []string{testFinalizer},
+					Resources: &v1.ResourceRequirements{
+						Requests: podRes,
+						Limits:   podRes,
+					},
+					Affinity: &v1.Affinity{
+						NodeAffinity: &v1.NodeAffinity{
+							RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
+								NodeSelectorTerms: []v1.NodeSelectorTerm{
+									{
+										MatchFields: []v1.NodeSelectorRequirement{
+											{Key: "metadata.name", Operator: v1.NodeSelectorOpIn, Values: []string{node.Name}},
+										},
+									},
+								},
+							},
+						},
+					},
+				})
+				lowPriorityPods = append(lowPriorityPods, pausePod)
+				framework.Logf("Created pod: %v", pausePod.Name)
+			}
+		}
+
+		ginkgo.By("Wait for low priority pods to be scheduled.")
+		for _, pod := range lowPriorityPods {
+			framework.ExpectNoError(e2epod.WaitForPodRunningInNamespace(ctx, cs, pod))
+		}
+
+		highPriorityPods := make([]*v1.Pod, 0, 5*len(nodeList.Items))
+		mediumPriorityPods := make([]*v1.Pod, 0, 10*len(nodeList.Items))
+
+		ginkgo.By("Run high/medium priority pods that have the same resource requirements as the low priority pods")
+		for i := range nodeList.Items {
+			// Create the medium priority pods first
+			// to confirm that the scheduler eventually prioritizes the high priority pods over the medium priority pods.
+			for j := 0; j < 10; j++ {
+				// 5 of these 10 pods per node will stay unschedulable
+				// because each node only has 10 of the extended resource and the high priority pods will take 5 of them.
+				p := createPausePod(ctx, f, pausePodConfig{
+					Name:              fmt.Sprintf("pod%d-%d-%v", i, j, mediumPriorityClassName),
+					PriorityClassName: mediumPriorityClassName,
+					Resources: &v1.ResourceRequirements{
+						// Set the pod request to the low priority pod's resources
+						Requests: lowPriorityPods[0].Spec.Containers[0].Resources.Requests,
+						Limits:   lowPriorityPods[0].Spec.Containers[0].Resources.Requests,
+					},
+				})
+				mediumPriorityPods = append(mediumPriorityPods, p)
+			}
+
+			for j := 0; j < 5; j++ {
+				p := createPausePod(ctx, f, pausePodConfig{
+					Name:              fmt.Sprintf("pod%d-%d-%v", i, j, highPriorityClassName),
+					PriorityClassName: highPriorityClassName,
+					Resources: &v1.ResourceRequirements{
+						// Set the pod request to the low priority pod's resources
+						Requests: lowPriorityPods[0].Spec.Containers[0].Resources.Requests,
+						Limits:   lowPriorityPods[0].Spec.Containers[0].Resources.Requests,
+					},
+				})
+				highPriorityPods = append(highPriorityPods, p)
+			}
+		}
+
+		// All low priority Pods should be the targets of preemption.
+		// Because they carry the finalizer, they should not actually be deleted yet at this point.
+		ginkgo.By("Check that all low priority pods are about to be preempted.")
+		for _, pod := range lowPriorityPods {
+			framework.ExpectNoError(wait.PollUntilContextTimeout(ctx, time.Second, framework.PodStartTimeout, false, func(ctx context.Context) (bool, error) {
+				preemptedPod, err := cs.CoreV1().Pods(pod.Namespace).Get(ctx, pod.Name, metav1.GetOptions{})
+				if err != nil {
+					return false, err
+				}
+				return preemptedPod.DeletionTimestamp != nil, nil
+			}))
+		}
+
+		// All high priority Pods should become schedulable once the low priority Pods are removed.
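+		// The scheduler records the node it chose in .status.nominatedNodeName on each high
+		// priority pod once it has selected victims on that node, so a non-empty
+		// NominatedNodeName means the asynchronous preemption has been triggered for that pod.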
+		ginkgo.By("Wait for high priority pods to have their preemption triggered.")
+		for _, pod := range highPriorityPods {
+			framework.ExpectNoError(wait.PollUntilContextTimeout(ctx, time.Second, framework.PodStartTimeout, false, func(ctx context.Context) (bool, error) {
+				highPod, err := cs.CoreV1().Pods(pod.Namespace).Get(ctx, pod.Name, metav1.GetOptions{})
+				if err != nil {
+					return false, err
+				}
+				return highPod.Status.NominatedNodeName != "", nil
+			}))
+		}
+
+		ginkgo.By("Remove the finalizer from all low priority pods to let the preemption proceed.")
+		for _, pod := range lowPriorityPods {
+			// Remove the finalizer so that the pending deletion of the victim pod can complete.
+			e2epod.NewPodClient(f).RemoveFinalizer(ctx, pod.Name, testFinalizer)
+		}
+
+		ginkgo.By("Wait for high priority pods to be scheduled.")
+		for _, pod := range highPriorityPods {
+			framework.ExpectNoError(e2epod.WaitForPodRunningInNamespace(ctx, cs, pod))
+		}
+
+		ginkgo.By("Wait for 5 medium priority pods per node to be scheduled.")
+		framework.ExpectNoError(wait.PollUntilContextTimeout(ctx, time.Second, framework.PodStartTimeout, false, func(ctx context.Context) (bool, error) {
+			scheduled := 0
+			for _, pod := range mediumPriorityPods {
+				medPod, err := cs.CoreV1().Pods(pod.Namespace).Get(ctx, pod.Name, metav1.GetOptions{})
+				if err != nil {
+					return false, err
+				}
+
+				if medPod.Spec.NodeName != "" {
+					scheduled++
+				}
+			}
+			if scheduled > 5*len(nodeList.Items) {
+				return false, fmt.Errorf("expected %d medium priority pods to be scheduled, but got %d", 5*len(nodeList.Items), scheduled)
+			}
+
+			return scheduled == 5*len(nodeList.Items), nil
+		}))
+	})
+
 	/*
 		Release: v1.31
 		Testname: Verify the DisruptionTarget condition is added to the preempted pod