From ca90621b2a6bccf773aa2547e263825498e6b765 Mon Sep 17 00:00:00 2001
From: Michal Wozniak
Date: Fri, 5 Aug 2022 16:27:07 +0200
Subject: [PATCH] Simplify the integration test for node lifecycle manager

---
 test/integration/node/lifecycle_test.go | 174 ++++++------------------
 1 file changed, 44 insertions(+), 130 deletions(-)

diff --git a/test/integration/node/lifecycle_test.go b/test/integration/node/lifecycle_test.go
index c274d691b7f..0e077eb8ac2 100644
--- a/test/integration/node/lifecycle_test.go
+++ b/test/integration/node/lifecycle_test.go
@@ -23,7 +23,6 @@ import (
 	"time"
 
 	v1 "k8s.io/api/core/v1"
-	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/wait"
@@ -44,11 +43,6 @@ import (
 	imageutils "k8s.io/kubernetes/test/utils/image"
 )
 
-// poll is how often to poll pods, nodes and claims.
-const poll = 2 * time.Second
-
-type podCondition func(pod *v1.Pod) (bool, error)
-
 // TestEvictionForNoExecuteTaintAddedByUser tests taint-based eviction for a node tainted NoExecute
 func TestEvictionForNoExecuteTaintAddedByUser(t *testing.T) {
 	tests := map[string]struct {
@@ -187,9 +181,9 @@ func TestEvictionForNoExecuteTaintAddedByUser(t *testing.T)
 func TestTaintBasedEvictions(t *testing.T) {
 	// we need at least 2 nodes to prevent lifecycle manager from entering "fully-disrupted" mode
 	nodeCount := 3
+	nodeIndex := 1 // the exact node doesn't matter, pick one
 	zero := int64(0)
 	gracePeriod := int64(1)
-	heartbeatInternal := time.Second * 2
 	testPod := &v1.Pod{
 		ObjectMeta: metav1.ObjectMeta{Name: "testpod1", DeletionGracePeriodSeconds: &zero},
 		Spec: v1.PodSpec{
@@ -206,19 +200,20 @@ func TestTaintBasedEvictions(t *testing.T) {
 			TerminationGracePeriodSeconds: &gracePeriod,
 		},
 	}
-	tolerationSeconds := []int64{200, 300, 0}
 	tests := []struct {
 		name                        string
 		nodeTaints                  []v1.Taint
 		nodeConditions              []v1.NodeCondition
 		pod                         *v1.Pod
+		tolerationSeconds           int64
 		expectedWaitForPodCondition string
 	}{
 		{
 			name:                        "Taint based evictions for NodeNotReady and 200 tolerationseconds",
 			nodeTaints:                  []v1.Taint{{Key: v1.TaintNodeNotReady, Effect: v1.TaintEffectNoExecute}},
 			nodeConditions:              []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}},
-			pod:                         testPod,
+			pod:                         testPod.DeepCopy(),
+			tolerationSeconds:           200,
 			expectedWaitForPodCondition: "updated with tolerationSeconds of 200",
 		},
 		{
@@ -233,13 +228,15 @@ func TestTaintBasedEvictions(t *testing.T) {
 					},
 				},
 			},
+			tolerationSeconds:           300,
 			expectedWaitForPodCondition: "updated with tolerationSeconds=300",
 		},
 		{
 			name:                        "Taint based evictions for NodeNotReady and 0 tolerationseconds",
 			nodeTaints:                  []v1.Taint{{Key: v1.TaintNodeNotReady, Effect: v1.TaintEffectNoExecute}},
 			nodeConditions:              []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}},
-			pod:                         testPod,
+			pod:                         testPod.DeepCopy(),
+			tolerationSeconds:           0,
 			expectedWaitForPodCondition: "terminating",
 		},
 		{
@@ -255,7 +252,7 @@ func TestTaintBasedEvictions(t *testing.T) {
 		podTolerations,
 		defaulttolerationseconds.NewDefaultTolerationSeconds(),
 	)
-	for i, test := range tests {
+	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			testCtx := testutils.InitTestAPIServer(t, "taint-based-evictions", admission)
 
@@ -267,7 +264,6 @@ func TestTaintBasedEvictions(t *testing.T) {
 			podTolerations.SetExternalKubeClientSet(externalClientset)
 			podTolerations.SetExternalKubeInformerFactory(externalInformers)
 
-			testCtx = testutils.InitTestScheduler(t, testCtx)
 			defer testutils.CleanupTest(t, testCtx)
 			cs := testCtx.ClientSet
 
@@ -279,7 +275,7 @@ func TestTaintBasedEvictions(t *testing.T) {
 				externalInformers.Core().V1().Nodes(),
 				externalInformers.Apps().V1().DaemonSets(),
 				cs,
-				5*time.Second,    // Node monitor grace period
+				1*time.Second,    // Node monitor grace period
 				time.Minute,      // Node startup grace period
 				time.Millisecond, // Node monitor period
 				time.Second,      // Pod eviction timeout
@@ -290,18 +286,15 @@ func TestTaintBasedEvictions(t *testing.T) {
 				true, // Run taint manager
 			)
 			if err != nil {
-				t.Errorf("Failed to create node controller: %v", err)
-				return
+				t.Fatalf("Failed to create node controller: %v", err)
 			}
 
 			// Waiting for all controllers to sync
 			externalInformers.Start(testCtx.Ctx.Done())
 			externalInformers.WaitForCacheSync(testCtx.Ctx.Done())
-			testutils.SyncInformerFactory(testCtx)
 
-			// Run all controllers
+			// Run the controller
 			go nc.Run(testCtx.Ctx)
-			go testCtx.Scheduler.Run(testCtx.Ctx)
 
 			nodeRes := v1.ResourceList{
 				v1.ResourceCPU:    resource.MustParse("4000m"),
@@ -311,120 +304,68 @@ func TestTaintBasedEvictions(t *testing.T) {
 
 			var nodes []*v1.Node
 			for i := 0; i < nodeCount; i++ {
-				nodes = append(nodes, &v1.Node{
+				node := &v1.Node{
 					ObjectMeta: metav1.ObjectMeta{
-						Name:   fmt.Sprintf("node-%d", i),
-						Labels: map[string]string{v1.LabelTopologyRegion: "region1", v1.LabelTopologyZone: "zone1"},
+						Name: fmt.Sprintf("node-%d", i),
+						Labels: map[string]string{
+							v1.LabelTopologyRegion:                  "region1",
+							v1.LabelTopologyZone:                    "zone1",
+							"node.kubernetes.io/exclude-disruption": "true",
+						},
 					},
 					Spec: v1.NodeSpec{},
 					Status: v1.NodeStatus{
 						Capacity:    nodeRes,
 						Allocatable: nodeRes,
-						Conditions: []v1.NodeCondition{
-							{
-								Type:              v1.NodeReady,
-								Status:            v1.ConditionTrue,
-								LastHeartbeatTime: metav1.Now(),
-							},
-						},
 					},
-				})
-				if _, err := cs.CoreV1().Nodes().Create(context.TODO(), nodes[i], metav1.CreateOptions{}); err != nil {
-					t.Errorf("Failed to create node, err: %v", err)
+				}
+				if i == nodeIndex {
+					node.Status.Conditions = append(node.Status.Conditions, test.nodeConditions...)
+				} else {
+					node.Status.Conditions = append(node.Status.Conditions, v1.NodeCondition{
+						Type:   v1.NodeReady,
+						Status: v1.ConditionTrue,
+					})
+				}
+				nodes = append(nodes, node)
+				if _, err := cs.CoreV1().Nodes().Create(context.TODO(), node, metav1.CreateOptions{}); err != nil {
+					t.Fatalf("Failed to create node: %q, err: %v", klog.KObj(node), err)
 				}
 			}
-			neededNode := nodes[1]
 
 			if test.pod != nil {
-				test.pod.Name = fmt.Sprintf("testpod-%d", i)
+				test.pod.Spec.NodeName = nodes[nodeIndex].Name
+				test.pod.Name = "testpod"
 				if len(test.pod.Spec.Tolerations) > 0 {
-					test.pod.Spec.Tolerations[0].TolerationSeconds = &tolerationSeconds[i]
+					test.pod.Spec.Tolerations[0].TolerationSeconds = &test.tolerationSeconds
 				}
 
 				test.pod, err = cs.CoreV1().Pods(testCtx.NS.Name).Create(context.TODO(), test.pod, metav1.CreateOptions{})
 				if err != nil {
-					t.Fatalf("Test Failed: error: %v, while creating pod", err)
-				}
-
-				if err := testutils.WaitForPodToSchedule(cs, test.pod); err != nil {
-					t.Errorf("Failed to schedule pod %s/%s on the node, err: %v",
-						test.pod.Namespace, test.pod.Name, err)
-				}
-				test.pod, err = cs.CoreV1().Pods(testCtx.NS.Name).Get(context.TODO(), test.pod.Name, metav1.GetOptions{})
-				if err != nil {
-					t.Fatalf("Test Failed: error: %v, while creating pod", err)
-				}
-				neededNode, err = cs.CoreV1().Nodes().Get(context.TODO(), test.pod.Spec.NodeName, metav1.GetOptions{})
-				if err != nil {
-					t.Fatalf("Error while getting node associated with pod %v with err %v", test.pod.Name, err)
+					t.Fatalf("Test Failed: error: %q, while creating pod %q", err, klog.KObj(test.pod))
 				}
 			}
 
-			// Regularly send heartbeat event to APIServer so that the cluster doesn't enter fullyDisruption mode.
-			// TODO(Huang-Wei): use "NodeDisruptionExclusion" feature to simply the below logic when it's beta.
-			for i := 0; i < nodeCount; i++ {
-				var conditions []v1.NodeCondition
-				// If current node is not <neededNode>
-				if neededNode.Name != nodes[i].Name {
-					conditions = []v1.NodeCondition{
-						{
-							Type:   v1.NodeReady,
-							Status: v1.ConditionTrue,
-						},
-					}
-				} else {
-					c, err := testutils.NodeReadyStatus(test.nodeConditions)
-					if err != nil {
-						t.Error(err)
-					}
-					// Need to distinguish NodeReady/False and NodeReady/Unknown.
-					// If we try to update the node with condition NotReady/False, i.e. expect a NotReady:NoExecute taint
-					// we need to keep sending the update event to keep it alive, rather than just sending once.
-					if c == v1.ConditionFalse {
-						conditions = test.nodeConditions
-					} else if c == v1.ConditionUnknown {
-						// If it's expected to update the node with condition NotReady/Unknown,
-						// i.e. expect a Unreachable:NoExecute taint,
-						// we need to only send the update event once to simulate the network unreachable scenario.
-						nodeCopy := testutils.NodeCopyWithConditions(nodes[i], test.nodeConditions)
-						if err := testutils.UpdateNodeStatus(cs, nodeCopy); err != nil && !apierrors.IsNotFound(err) {
-							t.Errorf("Cannot update node: %v", err)
-						}
-						continue
-					}
-				}
-				// Keeping sending NodeReady/True or NodeReady/False events.
-				go func(i int) {
-					for {
-						select {
-						case <-testCtx.Ctx.Done():
-							return
-						case <-time.Tick(heartbeatInternal):
-							nodeCopy := testutils.NodeCopyWithConditions(nodes[i], conditions)
-							if err := testutils.UpdateNodeStatus(cs, nodeCopy); err != nil && !apierrors.IsNotFound(err) {
-								t.Errorf("Cannot update node: %v", err)
-							}
-						}
-					}
-				}(i)
-			}
-
-			if err := testutils.WaitForNodeTaints(cs, neededNode, test.nodeTaints); err != nil {
-				t.Errorf("Failed to taint node in test %d <%s>, err: %v", i, neededNode.Name, err)
+			if err := testutils.WaitForNodeTaints(cs, nodes[nodeIndex], test.nodeTaints); err != nil {
+				t.Errorf("Failed to taint node %q, err: %v", klog.KObj(nodes[nodeIndex]), err)
 			}
 
 			if test.pod != nil {
-				err = waitForPodCondition(cs, testCtx.NS.Name, test.pod.Name, test.expectedWaitForPodCondition, time.Second*15, func(pod *v1.Pod) (bool, error) {
+				err = wait.PollImmediate(time.Second, time.Second*15, func() (bool, error) {
+					pod, err := cs.CoreV1().Pods(test.pod.Namespace).Get(context.TODO(), test.pod.Name, metav1.GetOptions{})
+					if err != nil {
+						return false, err
+					}
 					// as node is unreachable, pod0 is expected to be in Terminating status
 					// rather than getting deleted
-					if tolerationSeconds[i] == 0 {
+					if test.tolerationSeconds == 0 {
 						return pod.DeletionTimestamp != nil, nil
 					}
 					if seconds, err := testutils.GetTolerationSeconds(pod.Spec.Tolerations); err == nil {
-						return seconds == tolerationSeconds[i], nil
+						return seconds == test.tolerationSeconds, nil
 					}
 					return false, nil
-				}, t)
+				})
 				if err != nil {
 					pod, _ := cs.CoreV1().Pods(testCtx.NS.Name).Get(context.TODO(), test.pod.Name, metav1.GetOptions{})
 					t.Fatalf("Error: %v, Expected test pod to be %s but it's %v", err, test.expectedWaitForPodCondition, pod)
@@ -432,33 +373,6 @@ func TestTaintBasedEvictions(t *testing.T) {
 				testutils.CleanupPods(cs, t, []*v1.Pod{test.pod})
 			}
 			testutils.CleanupNodes(cs, t)
-			testutils.WaitForSchedulerCacheCleanup(testCtx.Scheduler, t)
 		})
 	}
 }
-
-// waitForPodCondition waits a pods to be matched to the given condition.
-func waitForPodCondition(c clientset.Interface, ns, podName, desc string, timeout time.Duration, condition podCondition, t *testing.T) error {
-	t.Logf("Waiting up to %v for pod %q in namespace %q to be %q", timeout, podName, ns, desc)
-	for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
-		pod, err := c.CoreV1().Pods(ns).Get(context.TODO(), podName, metav1.GetOptions{})
-		if err != nil {
-			if apierrors.IsNotFound(err) {
-				t.Logf("Pod %q in namespace %q not found. Error: %v", podName, ns, err)
-				return err
-			}
-			t.Logf("Get pod %q in namespace %q failed, ignoring for %v. Error: %v", podName, ns, poll, err)
-			continue
-		}
-		// log now so that current pod info is reported before calling `condition()`
-		t.Logf("Pod %q: Phase=%q, Reason=%q, readiness=%t. Elapsed: %v",
-			podName, pod.Status.Phase, pod.Status.Reason, podutil.IsPodReady(pod), time.Since(start))
-		if done, err := condition(pod); done {
-			if err == nil {
-				t.Logf("Pod %q satisfied condition %q", podName, desc)
-			}
-			return err
-		}
-	}
-	return fmt.Errorf("gave up after waiting %v for pod %q to be %q", timeout, podName, desc)
-}