From 08e8403e2eae3f4c8d3d336fc917818c935410a6 Mon Sep 17 00:00:00 2001
From: Kensei Nakada
Date: Thu, 31 Oct 2024 17:10:15 +0900
Subject: [PATCH] feat: add the e2e test for the async preemption

---
 test/e2e/feature/feature.go       |   4 +
 test/e2e/scheduling/preemption.go | 157 ++++++++++++++++++++++++++++++
 2 files changed, 161 insertions(+)

diff --git a/test/e2e/feature/feature.go b/test/e2e/feature/feature.go
index 557d0930cee..166518eee0c 100644
--- a/test/e2e/feature/feature.go
+++ b/test/e2e/feature/feature.go
@@ -303,6 +303,10 @@ var (
 	// TODO: document the feature (owning SIG, when to use this feature for a test)
 	RegularResourceUsageTracking = framework.WithFeature(framework.ValidFeatures.Add("RegularResourceUsageTracking"))
 
+	// Owner: sig-scheduling
+	// Marks tests of asynchronous preemption (KEP-4832) that require the `SchedulerAsyncPreemption` feature gate to be enabled.
+	SchedulerAsyncPreemption = framework.WithFeature(framework.ValidFeatures.Add("SchedulerAsyncPreemption"))
+
 	// Owner: sig-network
 	// Marks tests that require a pod networking implementation that supports SCTP
 	// traffic between pods.
diff --git a/test/e2e/scheduling/preemption.go b/test/e2e/scheduling/preemption.go
index a3668c94fd4..7ae3873fc4a 100644
--- a/test/e2e/scheduling/preemption.go
+++ b/test/e2e/scheduling/preemption.go
@@ -43,6 +43,7 @@ import (
 	clientset "k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/tools/cache"
 	"k8s.io/kubernetes/pkg/apis/scheduling"
+	"k8s.io/kubernetes/test/e2e/feature"
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
@@ -305,6 +306,162 @@ var _ = SIGDescribe("SchedulerPreemption", framework.WithSerial(), func() {
 		}
 	})
 
+	/*
+		Release: v1.32
+		Testname: Scheduler runs the preemption with various priority classes expectedly
+		Description: When Pods with various priority classes trigger preemption,
+		the scheduler must favor the Pods with the higher priority class.
+	*/
+	framework.It("validates various priority Pods preempt expectedly with the async preemption", feature.SchedulerAsyncPreemption, func(ctx context.Context) {
+		var podRes v1.ResourceList
+		// Create 10 low priority pods on each node, consuming all of the node's extended resources.
+		ginkgo.By("Create 10 low-priority pods on each node.")
+		lowPriorityPods := make([]*v1.Pod, 0, 10*len(nodeList.Items))
+		// Create the low priority pods in the cluster.
+		// All of them will become victims of preemption by the high priority pods.
+		for i, node := range nodeList.Items {
+			// Update each node to advertise 10 available extended resources
+			nodeCopy := node.DeepCopy()
+			nodeCopy.Status.Capacity[testExtendedResource] = resource.MustParse("10")
+			nodeCopy.Status.Allocatable[testExtendedResource] = resource.MustParse("10")
+			err := patchNode(ctx, cs, &node, nodeCopy)
+			framework.ExpectNoError(err)
+
+			// Create 10 low priority pods on each node, which will use up 10/10 of the node's resources.
+			for j := 0; j < 10; j++ {
+				// Request 1 of the available resources for each victim pod
+				podRes = v1.ResourceList{}
+				podRes[testExtendedResource] = resource.MustParse("1")
+				pausePod := createPausePod(ctx, f, pausePodConfig{
+					Name:              fmt.Sprintf("pod%d-%d-%v", i, j, lowPriorityClassName),
+					PriorityClassName: lowPriorityClassName,
+					// This victim pod will be preempted by the high priority pod.
+					// However, its deletion will be blocked by the finalizer.
+					//
+					// The finalizer is needed to prevent the medium Pods from being scheduled instead of the high Pods,
+					// depending on when the scheduler notices the existence of all the high Pods we create.
+					Finalizers: []string{testFinalizer},
+					Resources: &v1.ResourceRequirements{
+						Requests: podRes,
+						Limits:   podRes,
+					},
+					Affinity: &v1.Affinity{
+						NodeAffinity: &v1.NodeAffinity{
+							RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
+								NodeSelectorTerms: []v1.NodeSelectorTerm{
+									{
+										MatchFields: []v1.NodeSelectorRequirement{
+											{Key: "metadata.name", Operator: v1.NodeSelectorOpIn, Values: []string{node.Name}},
+										},
+									},
+								},
+							},
+						},
+					},
+				})
+				lowPriorityPods = append(lowPriorityPods, pausePod)
+				framework.Logf("Created pod: %v", pausePod.Name)
+			}
+		}
+
+		ginkgo.By("Wait for low priority pods to be scheduled.")
+		for _, pod := range lowPriorityPods {
+			framework.ExpectNoError(e2epod.WaitForPodRunningInNamespace(ctx, cs, pod))
+		}
+
+		highPriorityPods := make([]*v1.Pod, 0, 5*len(nodeList.Items))
+		mediumPriorityPods := make([]*v1.Pod, 0, 10*len(nodeList.Items))
+
+		ginkgo.By("Run high/medium priority pods that have the same resource requirements as the low priority pods")
+		for i := range nodeList.Items {
+			// Create the medium priority pods first
+			// to confirm that the scheduler eventually prioritizes the high priority pods over the medium priority pods.
+			for j := 0; j < 10; j++ {
+				// 5 of these 10 pods per node will stay unschedulable
+				// because each node only has 10 of the extended resource and the high priority pods will take 5 of them.
+				p := createPausePod(ctx, f, pausePodConfig{
+					Name:              fmt.Sprintf("pod%d-%d-%v", i, j, mediumPriorityClassName),
+					PriorityClassName: mediumPriorityClassName,
+					Resources: &v1.ResourceRequirements{
+						// Set the pod request to the low priority pod's resources
+						Requests: lowPriorityPods[0].Spec.Containers[0].Resources.Requests,
+						Limits:   lowPriorityPods[0].Spec.Containers[0].Resources.Requests,
+					},
+				})
+				mediumPriorityPods = append(mediumPriorityPods, p)
+			}
+
+			for j := 0; j < 5; j++ {
+				p := createPausePod(ctx, f, pausePodConfig{
+					Name:              fmt.Sprintf("pod%d-%d-%v", i, j, highPriorityClassName),
+					PriorityClassName: highPriorityClassName,
+					Resources: &v1.ResourceRequirements{
+						// Set the pod request to the low priority pod's resources
+						Requests: lowPriorityPods[0].Spec.Containers[0].Resources.Requests,
+						Limits:   lowPriorityPods[0].Spec.Containers[0].Resources.Requests,
+					},
+				})
+				highPriorityPods = append(highPriorityPods, p)
+			}
+		}
+
+		// All low priority Pods should be the targets of preemption.
+		// Because they carry the finalizer, they should not actually be deleted yet at this point.
+		ginkgo.By("Check that all low priority pods are about to be preempted.")
+		for _, pod := range lowPriorityPods {
+			framework.ExpectNoError(wait.PollUntilContextTimeout(ctx, time.Second, framework.PodStartTimeout, false, func(ctx context.Context) (bool, error) {
+				preemptedPod, err := cs.CoreV1().Pods(pod.Namespace).Get(ctx, pod.Name, metav1.GetOptions{})
+				if err != nil {
+					return false, err
+				}
+				return preemptedPod.DeletionTimestamp != nil, nil
+			}))
+		}
+
+		// All high priority Pods should become schedulable once the low priority Pods are removed.
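+		// The scheduler records the node it chose in .status.nominatedNodeName on each high
+		// priority pod once it has selected victims on that node, so a non-empty
+		// NominatedNodeName means the asynchronous preemption has been triggered for that pod.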
+		ginkgo.By("Wait for high priority pods to have their preemption triggered.")
+		for _, pod := range highPriorityPods {
+			framework.ExpectNoError(wait.PollUntilContextTimeout(ctx, time.Second, framework.PodStartTimeout, false, func(ctx context.Context) (bool, error) {
+				highPod, err := cs.CoreV1().Pods(pod.Namespace).Get(ctx, pod.Name, metav1.GetOptions{})
+				if err != nil {
+					return false, err
+				}
+				return highPod.Status.NominatedNodeName != "", nil
+			}))
+		}
+
+		ginkgo.By("Remove the finalizer from all low priority pods to let the preemption proceed.")
+		for _, pod := range lowPriorityPods {
+			// Remove the finalizer so that the pending deletion of the victim pod can complete.
+			e2epod.NewPodClient(f).RemoveFinalizer(ctx, pod.Name, testFinalizer)
+		}
+
+		ginkgo.By("Wait for high priority pods to be scheduled.")
+		for _, pod := range highPriorityPods {
+			framework.ExpectNoError(e2epod.WaitForPodRunningInNamespace(ctx, cs, pod))
+		}
+
+		ginkgo.By("Wait for 5 medium priority pods per node to be scheduled.")
+		framework.ExpectNoError(wait.PollUntilContextTimeout(ctx, time.Second, framework.PodStartTimeout, false, func(ctx context.Context) (bool, error) {
+			scheduled := 0
+			for _, pod := range mediumPriorityPods {
+				medPod, err := cs.CoreV1().Pods(pod.Namespace).Get(ctx, pod.Name, metav1.GetOptions{})
+				if err != nil {
+					return false, err
+				}
+
+				if medPod.Spec.NodeName != "" {
+					scheduled++
+				}
+			}
+			if scheduled > 5*len(nodeList.Items) {
+				return false, fmt.Errorf("expected %d medium priority pods to be scheduled, but got %d", 5*len(nodeList.Items), scheduled)
+			}
+
+			return scheduled == 5*len(nodeList.Items), nil
+		}))
+	})
+
 	/*
 		Release: v1.31
 		Testname: Verify the DisruptionTarget condition is added to the preempted pod