From f240b3abf526705f7c4be6f59961ec3b935e275b Mon Sep 17 00:00:00 2001
From: Jan Chaloupka
Date: Tue, 13 May 2025 13:50:20 +0200
Subject: [PATCH] SchedulerPreemption [Serial] validates various priority Pods
 preempt expectedly with the async preemption: replace finalizers with preStop
 hook and TerminationGracePeriodSeconds

Finalizers do not work as expected when an informer with a field selector is
used. Any time a pod changes its state and gets excluded by the field
selector, a synthetic delete event is issued even though the pod with the
finalizer set is still present. As a result, the scheduler schedules the high
and medium priority pods before any of the low priority pods' finalizers are
removed.

Instead, rely on a preStop hook and TerminationGracePeriodSeconds to keep all
low priority pods included by the field selector long enough so that all high
priority pods can get their .status.nominatedNodeName field set.

Also, update the check for how many medium priority pods are expected to be
scheduled. Each node can accept 10 pods of the given extended resource. Given
that 5 high priority pods are created per node, there are always
5*(number of nodes) spots left for the medium priority pods.
---
 test/e2e/scheduling/predicates.go | 14 ++++++++++++
 test/e2e/scheduling/preemption.go | 39 ++++++++++++++++++-------------
 2 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/test/e2e/scheduling/predicates.go b/test/e2e/scheduling/predicates.go
index d7adb87d032..e3af318bcf6 100644
--- a/test/e2e/scheduling/predicates.go
+++ b/test/e2e/scheduling/predicates.go
@@ -75,6 +75,8 @@ type pausePodConfig struct {
 	DeletionGracePeriodSeconds    *int64
 	TopologySpreadConstraints     []v1.TopologySpreadConstraint
 	SchedulingGates               []v1.PodSchedulingGate
+	TerminationGracePeriodSeconds *int64
+	PreStopHookSleepSeconds       *int64
 }
 
 var _ = SIGDescribe("SchedulerPredicates", framework.WithSerial(), func() {
@@ -1009,6 +1011,18 @@ func initPausePod(f *framework.Framework, conf pausePodConfig) *v1.Pod {
 	if conf.DeletionGracePeriodSeconds != nil {
 		pod.ObjectMeta.DeletionGracePeriodSeconds = conf.DeletionGracePeriodSeconds
 	}
+	if conf.TerminationGracePeriodSeconds != nil {
+		pod.Spec.TerminationGracePeriodSeconds = conf.TerminationGracePeriodSeconds
+	}
+	if conf.PreStopHookSleepSeconds != nil {
+		pod.Spec.Containers[0].Lifecycle = &v1.Lifecycle{
+			PreStop: &v1.LifecycleHandler{
+				Sleep: &v1.SleepAction{
+					Seconds: *conf.PreStopHookSleepSeconds,
+				},
+			},
+		}
+	}
 	return pod
 }
 
diff --git a/test/e2e/scheduling/preemption.go b/test/e2e/scheduling/preemption.go
index 644fa1ec508..a4e3129c33a 100644
--- a/test/e2e/scheduling/preemption.go
+++ b/test/e2e/scheduling/preemption.go
@@ -50,6 +50,7 @@ import (
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
 	e2ereplicaset "k8s.io/kubernetes/test/e2e/framework/replicaset"
 	admissionapi "k8s.io/pod-security-admission/api"
+	"k8s.io/utils/ptr"
 )
 
 type priorityPair struct {
@@ -317,7 +318,8 @@ var _ = SIGDescribe("SchedulerPreemption", framework.WithSerial(), func() {
 		var podRes v1.ResourceList
 		// Create 10 pods per node that will eat up all the node's resources.
 		ginkgo.By("Create 10 low-priority pods on each node.")
-		lowPriorityPods := make([]*v1.Pod, 0, 10*len(nodeList.Items))
+		nodeListLen := len(nodeList.Items)
+		lowPriorityPods := make([]*v1.Pod, 0, 10*nodeListLen)
 		// Create pods in the cluster.
 		for i, node := range nodeList.Items {
 			// Update each node to advertise 3 available extended resources
@@ -331,12 +333,6 @@ var _ = SIGDescribe("SchedulerPreemption", framework.WithSerial(), func() {
 				pausePod := createPausePod(ctx, f, pausePodConfig{
 					Name:              fmt.Sprintf("pod%d-%d-%v", i, j, lowPriorityClassName),
 					PriorityClassName: lowPriorityClassName,
-					// This victim pod will be preempted by the high priority pod.
-					// But, the deletion will be blocked by the finalizer.
-					//
-					// The finalizer is needed to prevent the medium Pods from being scheduled instead of the high Pods,
-					// depending on when the scheduler notices the existence of all the high Pods we create.
-					Finalizers: []string{testFinalizer},
 					Resources: &v1.ResourceRequirements{
 						Requests: podRes,
 						Limits:   podRes,
@@ -354,6 +350,15 @@ var _ = SIGDescribe("SchedulerPreemption", framework.WithSerial(), func() {
 							},
 						},
 					},
+					// This victim pod will be preempted by the high priority pod.
+					// But, the deletion will be blocked by the preStop hook with
+					// TerminationGracePeriodSeconds set.
+					//
+					// The preStop hook + TerminationGracePeriodSeconds are needed to prevent the medium Pods
+					// from being scheduled instead of the high Pods,
+					// depending on when the scheduler notices the existence of all the high Pods we create.
+					TerminationGracePeriodSeconds: ptr.To[int64](80),
+					PreStopHookSleepSeconds:       ptr.To[int64](79),
 				})
 				lowPriorityPods = append(lowPriorityPods, pausePod)
 				framework.Logf("Created pod: %v", pausePod.Name)
@@ -365,8 +370,8 @@ var _ = SIGDescribe("SchedulerPreemption", framework.WithSerial(), func() {
 			framework.ExpectNoError(e2epod.WaitForPodRunningInNamespace(ctx, cs, pod))
 		}
 
-		highPriorityPods := make([]*v1.Pod, 0, 5*len(nodeList.Items))
-		mediumPriorityPods := make([]*v1.Pod, 0, 10*len(nodeList.Items))
+		highPriorityPods := make([]*v1.Pod, 0, 5*nodeListLen)
+		mediumPriorityPods := make([]*v1.Pod, 0, 10*nodeListLen)
 
 		ginkgo.By("Run high/medium priority pods that have same requirements as that of lower priority pod")
 		for i := range nodeList.Items {
@@ -426,10 +431,12 @@ var _ = SIGDescribe("SchedulerPreemption", framework.WithSerial(), func() {
 			}))
 		}
 
-		ginkgo.By("Remove the finalizer from all low priority pods to proceed the preemption.")
+		ginkgo.By("Delete all low priority pods to proceed the preemption faster.")
 		for _, pod := range lowPriorityPods {
-			// Remove the finalizer so that the pod can be deleted by GC
-			e2epod.NewPodClient(f).RemoveFinalizer(ctx, pod.Name, testFinalizer)
+			err := cs.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{GracePeriodSeconds: ptr.To[int64](0)})
+			if err != nil && !apierrors.IsNotFound(err) {
+				framework.Logf("Deleting %v pod failed: %v", pod.Name, err)
+			}
 		}
 
 		ginkgo.By("Wait for high priority pods to be scheduled.")
@@ -437,7 +444,7 @@ var _ = SIGDescribe("SchedulerPreemption", framework.WithSerial(), func() {
 			framework.ExpectNoError(e2epod.WaitForPodRunningInNamespace(ctx, cs, pod))
 		}
 
-		ginkgo.By("Wait for 5 medium priority pods to be scheduled.")
+		ginkgo.By(fmt.Sprintf("Wait for %v medium priority pods to be scheduled.", 5*nodeListLen))
 		framework.ExpectNoError(wait.PollUntilContextTimeout(ctx, time.Second, framework.PodStartTimeout, false, func(ctx context.Context) (bool, error) {
 			scheduled := 0
 			for _, pod := range mediumPriorityPods {
@@ -450,11 +457,11 @@ var _ = SIGDescribe("SchedulerPreemption", framework.WithSerial(), func() {
 					scheduled++
 				}
 			}
-			if scheduled > 5 {
-				return false, fmt.Errorf("expected 5 medium priority pods to be scheduled, but got %d", scheduled)
+			if scheduled > 5*nodeListLen {
+				return false, fmt.Errorf("expected %v medium priority pods to be scheduled, but got %d", 5*nodeListLen, scheduled)
 			}
 
-			return scheduled == 5, nil
+			return scheduled == 5*nodeListLen, nil
 		}))
 	})
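
Reviewer note (not part of the patch): for anyone unfamiliar with the pod lifecycle fields used above, the sketch below shows roughly the kind of pod initPausePod now produces for a low priority victim. It is only an illustration: the helper name buildVictimPod, the pause image tag, and the inlined 80s/79s values are assumptions mirroring the test, not code from this change. The idea is that a preempted (deleted) victim keeps executing its preStop sleep for almost the whole termination grace period, so it remains visible to the scheduler's field-selector-based informer until the high priority pods have .status.nominatedNodeName set; the sleep is kept slightly shorter than the grace period so the container exits on its own before the kubelet force-kills it.

package victimpod

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/utils/ptr"
)

// buildVictimPod (illustrative name, not in the patch) builds a pod shaped like
// the low priority victims created by the test: TerminationGracePeriodSeconds
// plus a preStop Sleep hook keep the pod alive, and therefore visible to the
// scheduler's informer, for up to ~80s after it is deleted by preemption.
func buildVictimPod(name, priorityClassName string, res corev1.ResourceList) *corev1.Pod {
	return &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: name},
		Spec: corev1.PodSpec{
			PriorityClassName: priorityClassName,
			// The kubelet waits up to 80s for the container to stop before
			// force-killing it.
			TerminationGracePeriodSeconds: ptr.To[int64](80),
			Containers: []corev1.Container{{
				Name:  "pause",
				Image: "registry.k8s.io/pause:3.10", // placeholder; the test uses its own pause image helper
				Resources: corev1.ResourceRequirements{
					Requests: res,
					Limits:   res,
				},
				Lifecycle: &corev1.Lifecycle{
					PreStop: &corev1.LifecycleHandler{
						// Sleep just under the grace period so the container
						// exits on its own before the kubelet's force kill.
						// Requires the PodLifecycleSleepAction feature.
						Sleep: &corev1.SleepAction{Seconds: 79},
					},
				},
			}},
		},
	}
}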