From 369a9001c6f7192eaef6cc4460aeb4b4373f25e5 Mon Sep 17 00:00:00 2001
From: Wei Huang
Date: Wed, 3 Jun 2020 17:33:58 -0700
Subject: [PATCH] Fix an issue that a Pod's nominatedNodeName cannot be
 cleared when the nominated node is deleted

---
 pkg/scheduler/scheduler.go                    |  11 ++++-------
 pkg/scheduler/scheduler_test.go               |   4 ++--
 test/integration/scheduler/preemption_test.go |  96 +++++++++++++++++++++
 test/integration/scheduler/util.go            |   4 ++--
 4 files changed, 104 insertions(+), 11 deletions(-)

diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go
index 483e4e7bbc9..28f277bd88d 100644
--- a/pkg/scheduler/scheduler.go
+++ b/pkg/scheduler/scheduler.go
@@ -386,13 +386,10 @@ func updatePod(client clientset.Interface, pod *v1.Pod, condition *v1.PodCondition, nominatedNode string) error {
 	podCopy := pod.DeepCopy()
-	// NominatedNodeName is updated only if we are trying to set it, and the value is
-	// different from the existing one.
-	if !podutil.UpdatePodCondition(&podCopy.Status, condition) &&
-		(len(nominatedNode) == 0 || pod.Status.NominatedNodeName == nominatedNode) {
+	// NominatedNodeName is updated whenever it differs from the existing value;
+	// in particular, passing an empty nominatedNode clears a stale nomination.
+	if !podutil.UpdatePodCondition(&podCopy.Status, condition) && pod.Status.NominatedNodeName == nominatedNode {
 		return nil
 	}
-	if nominatedNode != "" {
-		podCopy.Status.NominatedNodeName = nominatedNode
-	}
+	podCopy.Status.NominatedNodeName = nominatedNode
 	return patchPod(client, pod, podCopy)
 }
 
diff --git a/pkg/scheduler/scheduler_test.go b/pkg/scheduler/scheduler_test.go
index 529f97bfdb7..c4e15e92373 100644
--- a/pkg/scheduler/scheduler_test.go
+++ b/pkg/scheduler/scheduler_test.go
@@ -1455,7 +1455,7 @@ func TestUpdatePod(t *testing.T) {
 			expectedPatchDataPattern: `{"status":{"\$setElementOrder/conditions":\[{"type":"currentType"}],"conditions":\[{"lastProbeTime":"2020-05-13T01:01:01Z","message":"newMessage","reason":"newReason","type":"currentType"}]}}`,
 		},
 		{
-			name: "Should not make patch request if pod condition already exists and is identical and nominated node name is not set",
+			name: "Should make patch request if pod condition already exists and is identical but nominated node name is different",
 			currentPodConditions: []v1.PodCondition{
 				{
 					Type:    "currentType",
@@ -1475,7 +1475,7 @@ func TestUpdatePod(t *testing.T) {
 				Message: "currentMessage",
 			},
 			currentNominatedNodeName: "node1",
-			expectedPatchRequests:    0,
+			expectedPatchRequests:    1,
 		},
 		{
 			name: "Should make patch request if pod condition already exists and is identical but nominated node name is set and different",
diff --git a/test/integration/scheduler/preemption_test.go b/test/integration/scheduler/preemption_test.go
index e3fffcfc9dd..ceceb292ee1 100644
--- a/test/integration/scheduler/preemption_test.go
+++ b/test/integration/scheduler/preemption_test.go
@@ -26,6 +26,7 @@ import (
 
 	v1 "k8s.io/api/core/v1"
 	policy "k8s.io/api/policy/v1beta1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
@@ -1023,6 +1024,101 @@ func TestNominatedNodeCleanUp(t *testing.T) {
 	}
 }
 
+func TestNominatedNodeCleanUpUponNodeDeletion(t *testing.T) {
+	// Initialize the scheduler.
+	testCtx := initTest(t, "preemption")
+	defer testutils.CleanupTest(t, testCtx)
+
+	cs := testCtx.ClientSet
+	defer cleanupPodsInNamespace(cs, t, testCtx.NS.Name)
+
+	// Create a couple of nodes, each with limited resources.
+	nodeRes := &v1.ResourceList{
+		v1.ResourcePods:   *resource.NewQuantity(32, resource.DecimalSI),
+		v1.ResourceCPU:    *resource.NewMilliQuantity(100, resource.DecimalSI),
+		v1.ResourceMemory: *resource.NewQuantity(100, resource.DecimalSI),
+	}
+	nodeNames := []string{"node1", "node2"}
+	for _, nodeName := range nodeNames {
+		_, err := createNode(testCtx.ClientSet, nodeName, nodeRes)
+		if err != nil {
+			t.Fatalf("Error creating node %v: %v", nodeName, err)
+		}
+	}
+
+	// Fill the cluster with one high-priority and one low-priority Pod.
+	highPriPod, err := createPausePod(cs, mkPriorityPodWithGrace(testCtx, "high-pod", highPriority, 60))
+	if err != nil {
+		t.Fatalf("Error creating high-priority pod: %v", err)
+	}
+	// Make sure the pod gets scheduled.
+	if err := testutils.WaitForPodToSchedule(cs, highPriPod); err != nil {
+		t.Fatalf("Pod %v/%v didn't get scheduled: %v", highPriPod.Namespace, highPriPod.Name, err)
+	}
+
+	lowPriPod, err := createPausePod(cs, mkPriorityPodWithGrace(testCtx, "low-pod", lowPriority, 60))
+	if err != nil {
+		t.Fatalf("Error creating low-priority pod: %v", err)
+	}
+	// Make sure the pod gets scheduled.
+	if err := testutils.WaitForPodToSchedule(cs, lowPriPod); err != nil {
+		t.Fatalf("Pod %v/%v didn't get scheduled: %v", lowPriPod.Namespace, lowPriPod.Name, err)
+	}
+
+	// Create a medium-priority Pod, which can only run by preempting the low-priority Pod.
+	medPriPod, err := createPausePod(cs, mkPriorityPodWithGrace(testCtx, "med-pod", mediumPriority, 60))
+	if err != nil {
+		t.Fatalf("Error creating med-priority pod: %v", err)
+	}
+	// Check that its nominatedNodeName field is set properly.
+	if err := waitForNominatedNodeName(cs, medPriPod); err != nil {
+		t.Fatalf("NominatedNodeName was not set for pod %v/%v: %v", medPriPod.Namespace, medPriPod.Name, err)
+	}
+
+	// Get the live versions of the low- and med-priority Pods.
+	lowPriPod, err = getPod(cs, lowPriPod.Name, lowPriPod.Namespace)
+	if err != nil {
+		t.Fatalf("Error getting the low-priority pod: %v", err)
+	}
+	medPriPod, err = getPod(cs, medPriPod.Name, medPriPod.Namespace)
+	if err != nil {
+		t.Fatalf("Error getting the med-priority pod: %v", err)
+	}
+
+	want, got := lowPriPod.Spec.NodeName, medPriPod.Status.NominatedNodeName
+	if want != got {
+		t.Fatalf("Expected med-priority pod's nominatedNodeName to be %v, but got %v.", want, got)
+	}
+
+	// Delete the node that the med-priority Pod was nominated to run on.
+	if err := cs.CoreV1().Nodes().Delete(context.TODO(), got, metav1.DeleteOptions{}); err != nil {
+		t.Fatalf("Error deleting node %v: %v", got, err)
+	}
+	if err := wait.Poll(200*time.Millisecond, wait.ForeverTestTimeout, func() (bool, error) {
+		_, err := cs.CoreV1().Nodes().Get(context.TODO(), got, metav1.GetOptions{})
+		if apierrors.IsNotFound(err) {
+			return true, nil
+		}
+		return false, err
+	}); err != nil {
+		t.Fatalf("Node %v was not deleted: %v.", got, err)
+	}
+
+	// Finally, verify that the med-priority Pod's nominatedNodeName gets cleared.
+	if err := wait.Poll(200*time.Millisecond, wait.ForeverTestTimeout, func() (bool, error) {
+		pod, err := cs.CoreV1().Pods(medPriPod.Namespace).Get(context.TODO(), medPriPod.Name, metav1.GetOptions{})
+		if err != nil {
+			return false, err
+		}
+		if len(pod.Status.NominatedNodeName) == 0 {
+			return true, nil
+		}
+		return false, nil
+	}); err != nil {
+		t.Errorf("The nominated node name of the med-priority pod was not cleared: %v", err)
+	}
+}
+
 func mkMinAvailablePDB(name, namespace string, uid types.UID, minAvailable int, matchLabels map[string]string) *policy.PodDisruptionBudget {
 	intMinAvailable := intstr.FromInt(minAvailable)
 	return &policy.PodDisruptionBudget{
diff --git a/test/integration/scheduler/util.go b/test/integration/scheduler/util.go
index 343725ee9ba..1087c1ea8a1 100644
--- a/test/integration/scheduler/util.go
+++ b/test/integration/scheduler/util.go
@@ -483,7 +483,7 @@ func noPodsInNamespace(c clientset.Interface, podNamespace string) wait.ConditionFunc {
 // cleanupPodsInNamespace deletes the pods in the given namespace and waits for them to
 // be actually deleted.
 func cleanupPodsInNamespace(cs clientset.Interface, t *testing.T, ns string) {
-	if err := cs.CoreV1().Pods(ns).DeleteCollection(context.TODO(), metav1.DeleteOptions{}, metav1.ListOptions{}); err != nil {
-		t.Errorf("error while listing pod in namespace %v: %v", ns, err)
+	if err := cs.CoreV1().Pods(ns).DeleteCollection(context.TODO(), *metav1.NewDeleteOptions(0), metav1.ListOptions{}); err != nil {
+		t.Errorf("error while deleting pods in namespace %v: %v", ns, err)
 		return
 	}
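
Note for reviewers: after this patch, updatePod unconditionally copies nominatedNode into the pod copy before diffing, so a caller can now clear .status.nominatedNodeName by passing an empty string; previously the empty value was dropped and the stale nomination survived. A minimal sketch of a caller relying on the new behavior, assuming the updatePod signature from the scheduler.go hunk above (the condition values and error handling here are illustrative only, not part of this patch):

	// Hypothetical caller inside the scheduler package: the nominated node is
	// gone, so re-record the unschedulable condition and pass an empty
	// nominatedNode. Because updatePod now assigns nominatedNode
	// unconditionally, the generated patch removes .status.nominatedNodeName
	// instead of silently leaving the stale value behind.
	condition := &v1.PodCondition{
		Type:    v1.PodScheduled,
		Status:  v1.ConditionFalse,
		Reason:  v1.PodReasonUnschedulable,
		Message: "nominated node was deleted",
	}
	if err := updatePod(client, pod, condition, "" /* clear NominatedNodeName */); err != nil {
		klog.Errorf("Error clearing NominatedNodeName for pod %s/%s: %v", pod.Namespace, pod.Name, err)
	}

The integration test added above exercises exactly this path end to end: it forces a preemption so the med-priority Pod gets nominated, deletes the nominated node, and then polls until the field is observed to be empty.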