From f240b3abf526705f7c4be6f59961ec3b935e275b Mon Sep 17 00:00:00 2001
From: Jan Chaloupka
Date: Tue, 13 May 2025 13:50:20 +0200
Subject: [PATCH] SchedulerPreemption [Serial] validates various priority Pods
 preempt expectedly with the async preemption: replace finalizers with preStop
 hook and TerminationGracePeriodSeconds

Finalizers do not work as expected when an informer with a field selector is
used. Any time a pod changes its state and gets excluded by the field
selector, a synthetic delete event is issued even though the pod with the
finalizer set is still present. As a result, the scheduler schedules the high
and medium priority pods before any of the low priority pods' finalizers are
removed.

Instead, rely on a preStop hook and TerminationGracePeriodSeconds to keep all
low priority pods included by the field selector long enough so that all high
priority pods can get their .status.nominatedNodeName field set.

Also, update the check for how many medium priority pods are expected to be
scheduled. Each node can accept 10 pods of the given extended resource. Given
that 5 high priority pods are created per node, there are always
5*(number of nodes) spots left for the medium priority pods.
---
 test/e2e/scheduling/predicates.go | 14 ++++++++++++
 test/e2e/scheduling/preemption.go | 39 ++++++++++++++++++-------------
 2 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/test/e2e/scheduling/predicates.go b/test/e2e/scheduling/predicates.go
index d7adb87d032..e3af318bcf6 100644
--- a/test/e2e/scheduling/predicates.go
+++ b/test/e2e/scheduling/predicates.go
@@ -75,6 +75,8 @@ type pausePodConfig struct {
 	DeletionGracePeriodSeconds    *int64
 	TopologySpreadConstraints     []v1.TopologySpreadConstraint
 	SchedulingGates               []v1.PodSchedulingGate
+	TerminationGracePeriodSeconds *int64
+	PreStopHookSleepSeconds       *int64
 }
 
 var _ = SIGDescribe("SchedulerPredicates", framework.WithSerial(), func() {
@@ -1009,6 +1011,18 @@ func initPausePod(f *framework.Framework, conf pausePodConfig) *v1.Pod {
 	if conf.DeletionGracePeriodSeconds != nil {
 		pod.ObjectMeta.DeletionGracePeriodSeconds = conf.DeletionGracePeriodSeconds
 	}
+	if conf.TerminationGracePeriodSeconds != nil {
+		pod.Spec.TerminationGracePeriodSeconds = conf.TerminationGracePeriodSeconds
+	}
+	if conf.PreStopHookSleepSeconds != nil {
+		pod.Spec.Containers[0].Lifecycle = &v1.Lifecycle{
+			PreStop: &v1.LifecycleHandler{
+				Sleep: &v1.SleepAction{
+					Seconds: *conf.PreStopHookSleepSeconds,
+				},
+			},
+		}
+	}
 	return pod
 }
 
diff --git a/test/e2e/scheduling/preemption.go b/test/e2e/scheduling/preemption.go
index 644fa1ec508..a4e3129c33a 100644
--- a/test/e2e/scheduling/preemption.go
+++ b/test/e2e/scheduling/preemption.go
@@ -50,6 +50,7 @@ import (
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
 	e2ereplicaset "k8s.io/kubernetes/test/e2e/framework/replicaset"
 	admissionapi "k8s.io/pod-security-admission/api"
+	"k8s.io/utils/ptr"
 )
 
 type priorityPair struct {
@@ -317,7 +318,8 @@ var _ = SIGDescribe("SchedulerPreemption", framework.WithSerial(), func() {
 		var podRes v1.ResourceList
 		// Create 10 pods per node that will eat up all the node's resources.
 		ginkgo.By("Create 10 low-priority pods on each node.")
-		lowPriorityPods := make([]*v1.Pod, 0, 10*len(nodeList.Items))
+		nodeListLen := len(nodeList.Items)
+		lowPriorityPods := make([]*v1.Pod, 0, 10*nodeListLen)
 		// Create pods in the cluster.
 		for i, node := range nodeList.Items {
 			// Update each node to advertise 3 available extended resources
@@ -331,12 +333,6 @@ var _ = SIGDescribe("SchedulerPreemption", framework.WithSerial(), func() {
 				pausePod := createPausePod(ctx, f, pausePodConfig{
 					Name:              fmt.Sprintf("pod%d-%d-%v", i, j, lowPriorityClassName),
 					PriorityClassName: lowPriorityClassName,
-					// This victim pod will be preempted by the high priority pod.
-					// But, the deletion will be blocked by the finalizer.
-					//
-					// The finalizer is needed to prevent the medium Pods from being scheduled instead of the high Pods,
-					// depending on when the scheduler notices the existence of all the high Pods we create.
-					Finalizers: []string{testFinalizer},
 					Resources: &v1.ResourceRequirements{
 						Requests: podRes,
 						Limits:   podRes,
@@ -354,6 +350,15 @@ var _ = SIGDescribe("SchedulerPreemption", framework.WithSerial(), func() {
 							},
 						},
 					},
+					// This victim pod will be preempted by the high priority pod.
+					// But, the deletion will be blocked by the preStop hook with
+					// TerminationGracePeriodSeconds set.
+					//
+					// The preStop hook + TerminationGracePeriodSeconds are needed to prevent the medium Pods
+					// from being scheduled instead of the high Pods,
+					// depending on when the scheduler notices the existence of all the high Pods we create.
+					TerminationGracePeriodSeconds: ptr.To[int64](80),
+					PreStopHookSleepSeconds:       ptr.To[int64](79),
 				})
 				lowPriorityPods = append(lowPriorityPods, pausePod)
 				framework.Logf("Created pod: %v", pausePod.Name)
@@ -365,8 +370,8 @@ var _ = SIGDescribe("SchedulerPreemption", framework.WithSerial(), func() {
 			framework.ExpectNoError(e2epod.WaitForPodRunningInNamespace(ctx, cs, pod))
 		}
 
-		highPriorityPods := make([]*v1.Pod, 0, 5*len(nodeList.Items))
-		mediumPriorityPods := make([]*v1.Pod, 0, 10*len(nodeList.Items))
+		highPriorityPods := make([]*v1.Pod, 0, 5*nodeListLen)
+		mediumPriorityPods := make([]*v1.Pod, 0, 10*nodeListLen)
 
 		ginkgo.By("Run high/medium priority pods that have same requirements as that of lower priority pod")
 		for i := range nodeList.Items {
@@ -426,10 +431,12 @@ var _ = SIGDescribe("SchedulerPreemption", framework.WithSerial(), func() {
 			}))
 		}
 
-		ginkgo.By("Remove the finalizer from all low priority pods to proceed the preemption.")
+		ginkgo.By("Delete all low priority pods to proceed the preemption faster.")
 		for _, pod := range lowPriorityPods {
-			// Remove the finalizer so that the pod can be deleted by GC
-			e2epod.NewPodClient(f).RemoveFinalizer(ctx, pod.Name, testFinalizer)
+			err := cs.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{GracePeriodSeconds: ptr.To[int64](0)})
+			if err != nil && !apierrors.IsNotFound(err) {
+				framework.Logf("Deleting %v pod failed: %v", pod.Name, err)
+			}
 		}
 
 		ginkgo.By("Wait for high priority pods to be scheduled.")
@@ -437,7 +444,7 @@ var _ = SIGDescribe("SchedulerPreemption", framework.WithSerial(), func() {
 			framework.ExpectNoError(e2epod.WaitForPodRunningInNamespace(ctx, cs, pod))
 		}
 
-		ginkgo.By("Wait for 5 medium priority pods to be scheduled.")
+		ginkgo.By(fmt.Sprintf("Wait for %v medium priority pods to be scheduled.", 5*nodeListLen))
 		framework.ExpectNoError(wait.PollUntilContextTimeout(ctx, time.Second, framework.PodStartTimeout, false, func(ctx context.Context) (bool, error) {
 			scheduled := 0
 			for _, pod := range mediumPriorityPods {
@@ -450,11 +457,11 @@ var _ = SIGDescribe("SchedulerPreemption", framework.WithSerial(), func() {
 					scheduled++
 				}
 			}
-			if scheduled > 5 {
-				return false, fmt.Errorf("expected 5 medium priority pods to be scheduled, but got %d", scheduled)
+			if scheduled > 5*nodeListLen {
+				return false, fmt.Errorf("expected %v medium priority pods to be scheduled, but got %d", 5*nodeListLen, scheduled)
 			}
 
-			return scheduled == 5, nil
+			return scheduled == 5*nodeListLen, nil
 		}))
 	})
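
Reviewer note (not part of the patch): for anyone unfamiliar with the pod lifecycle fields used above, the sketch below shows roughly the kind of pod initPausePod now produces for a low priority victim. It is only an illustration: the helper name buildVictimPod, the pause image tag, and the inlined 80s/79s values are assumptions mirroring the test, not code from this change. The idea is that a preempted (deleted) victim keeps executing its preStop sleep for almost the whole termination grace period, so it remains visible to the scheduler's field-selector-based informer until the high priority pods have .status.nominatedNodeName set; the sleep is kept slightly shorter than the grace period so the container exits on its own before the kubelet force-kills it.

package victimpod

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/utils/ptr"
)

// buildVictimPod (illustrative name, not in the patch) builds a pod shaped like
// the low priority victims created by the test: TerminationGracePeriodSeconds
// plus a preStop Sleep hook keep the pod alive, and therefore visible to the
// scheduler's informer, for up to ~80s after it is deleted by preemption.
func buildVictimPod(name, priorityClassName string, res corev1.ResourceList) *corev1.Pod {
	return &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: name},
		Spec: corev1.PodSpec{
			PriorityClassName: priorityClassName,
			// The kubelet waits up to 80s for the container to stop before
			// force-killing it.
			TerminationGracePeriodSeconds: ptr.To[int64](80),
			Containers: []corev1.Container{{
				Name:  "pause",
				Image: "registry.k8s.io/pause:3.10", // placeholder; the test uses its own pause image helper
				Resources: corev1.ResourceRequirements{
					Requests: res,
					Limits:   res,
				},
				Lifecycle: &corev1.Lifecycle{
					PreStop: &corev1.LifecycleHandler{
						// Sleep just under the grace period so the container
						// exits on its own before the kubelet's force kill.
						// Requires the PodLifecycleSleepAction feature.
						Sleep: &corev1.SleepAction{Seconds: 79},
					},
				},
			}},
		},
	}
}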