From ca90621b2a6bccf773aa2547e263825498e6b765 Mon Sep 17 00:00:00 2001
From: Michal Wozniak
Date: Fri, 5 Aug 2022 16:27:07 +0200
Subject: [PATCH] Simplify the integration test for node lifecycle manager

---
 test/integration/node/lifecycle_test.go | 174 ++++++------------------
 1 file changed, 44 insertions(+), 130 deletions(-)

diff --git a/test/integration/node/lifecycle_test.go b/test/integration/node/lifecycle_test.go
index c274d691b7f..0e077eb8ac2 100644
--- a/test/integration/node/lifecycle_test.go
+++ b/test/integration/node/lifecycle_test.go
@@ -23,7 +23,6 @@ import (
 	"time"
 
 	v1 "k8s.io/api/core/v1"
-	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/wait"
@@ -44,11 +43,6 @@ import (
 	imageutils "k8s.io/kubernetes/test/utils/image"
 )
 
-// poll is how often to poll pods, nodes and claims.
-const poll = 2 * time.Second
-
-type podCondition func(pod *v1.Pod) (bool, error)
-
 // TestEvictionForNoExecuteTaintAddedByUser tests taint-based eviction for a node tainted NoExecute
 func TestEvictionForNoExecuteTaintAddedByUser(t *testing.T) {
 	tests := map[string]struct {
@@ -187,9 +181,9 @@ func TestEvictionForNoExecuteTaintAddedByUser(t *testing.T)
 func TestTaintBasedEvictions(t *testing.T) {
 	// we need at least 2 nodes to prevent lifecycle manager from entering "fully-disrupted" mode
 	nodeCount := 3
+	nodeIndex := 1 // the exact node doesn't matter, pick one
 	zero := int64(0)
 	gracePeriod := int64(1)
-	heartbeatInternal := time.Second * 2
 	testPod := &v1.Pod{
 		ObjectMeta: metav1.ObjectMeta{Name: "testpod1", DeletionGracePeriodSeconds: &zero},
 		Spec: v1.PodSpec{
@@ -206,19 +200,20 @@ func TestTaintBasedEvictions(t *testing.T) {
 			TerminationGracePeriodSeconds: &gracePeriod,
 		},
 	}
-	tolerationSeconds := []int64{200, 300, 0}
 	tests := []struct {
 		name                        string
 		nodeTaints                  []v1.Taint
 		nodeConditions              []v1.NodeCondition
 		pod                         *v1.Pod
+		tolerationSeconds           int64
 		expectedWaitForPodCondition string
 	}{
 		{
 			name:                        "Taint based evictions for NodeNotReady and 200 tolerationseconds",
 			nodeTaints:                  []v1.Taint{{Key: v1.TaintNodeNotReady, Effect: v1.TaintEffectNoExecute}},
 			nodeConditions:              []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}},
-			pod:                         testPod,
+			pod:                         testPod.DeepCopy(),
+			tolerationSeconds:           200,
 			expectedWaitForPodCondition: "updated with tolerationSeconds of 200",
 		},
 		{
@@ -233,13 +228,15 @@ func TestTaintBasedEvictions(t *testing.T) {
 					},
 				},
 			},
+			tolerationSeconds:           300,
 			expectedWaitForPodCondition: "updated with tolerationSeconds=300",
 		},
 		{
 			name:                        "Taint based evictions for NodeNotReady and 0 tolerationseconds",
 			nodeTaints:                  []v1.Taint{{Key: v1.TaintNodeNotReady, Effect: v1.TaintEffectNoExecute}},
 			nodeConditions:              []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}},
-			pod:                         testPod,
+			pod:                         testPod.DeepCopy(),
+			tolerationSeconds:           0,
 			expectedWaitForPodCondition: "terminating",
 		},
 		{
@@ -255,7 +252,7 @@ func TestTaintBasedEvictions(t *testing.T) {
 		podTolerations,
 		defaulttolerationseconds.NewDefaultTolerationSeconds(),
 	)
-	for i, test := range tests {
+	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			testCtx := testutils.InitTestAPIServer(t, "taint-based-evictions", admission)
 
@@ -267,7 +264,6 @@ func TestTaintBasedEvictions(t *testing.T) {
 			podTolerations.SetExternalKubeClientSet(externalClientset)
 			podTolerations.SetExternalKubeInformerFactory(externalInformers)
 
-			testCtx = testutils.InitTestScheduler(t, testCtx)
 			defer testutils.CleanupTest(t, testCtx)
 			cs := testCtx.ClientSet
 
@@ -279,7 +275,7 @@ func TestTaintBasedEvictions(t *testing.T) {
 				externalInformers.Core().V1().Nodes(),
 				externalInformers.Apps().V1().DaemonSets(),
 				cs,
-				5*time.Second,    // Node monitor grace period
+				1*time.Second,    // Node monitor grace period
 				time.Minute,      // Node startup grace period
 				time.Millisecond, // Node monitor period
 				time.Second,      // Pod eviction timeout
@@ -290,18 +286,15 @@ func TestTaintBasedEvictions(t *testing.T) {
 				true, // Run taint manager
 			)
 			if err != nil {
-				t.Errorf("Failed to create node controller: %v", err)
-				return
+				t.Fatalf("Failed to create node controller: %v", err)
 			}
 
 			// Waiting for all controllers to sync
 			externalInformers.Start(testCtx.Ctx.Done())
 			externalInformers.WaitForCacheSync(testCtx.Ctx.Done())
-			testutils.SyncInformerFactory(testCtx)
 
-			// Run all controllers
+			// Run the controller
 			go nc.Run(testCtx.Ctx)
-			go testCtx.Scheduler.Run(testCtx.Ctx)
 
 			nodeRes := v1.ResourceList{
 				v1.ResourceCPU:    resource.MustParse("4000m"),
@@ -311,120 +304,68 @@ func TestTaintBasedEvictions(t *testing.T) {
 
 			var nodes []*v1.Node
 			for i := 0; i < nodeCount; i++ {
-				nodes = append(nodes, &v1.Node{
+				node := &v1.Node{
 					ObjectMeta: metav1.ObjectMeta{
-						Name:   fmt.Sprintf("node-%d", i),
-						Labels: map[string]string{v1.LabelTopologyRegion: "region1", v1.LabelTopologyZone: "zone1"},
+						Name: fmt.Sprintf("node-%d", i),
+						Labels: map[string]string{
+							v1.LabelTopologyRegion:                  "region1",
+							v1.LabelTopologyZone:                    "zone1",
+							"node.kubernetes.io/exclude-disruption": "true",
+						},
 					},
 					Spec: v1.NodeSpec{},
 					Status: v1.NodeStatus{
 						Capacity:    nodeRes,
 						Allocatable: nodeRes,
-						Conditions: []v1.NodeCondition{
-							{
-								Type:              v1.NodeReady,
-								Status:            v1.ConditionTrue,
-								LastHeartbeatTime: metav1.Now(),
-							},
-						},
 					},
-				})
-				if _, err := cs.CoreV1().Nodes().Create(context.TODO(), nodes[i], metav1.CreateOptions{}); err != nil {
-					t.Errorf("Failed to create node, err: %v", err)
+				}
+				if i == nodeIndex {
+					node.Status.Conditions = append(node.Status.Conditions, test.nodeConditions...)
+				} else {
+					node.Status.Conditions = append(node.Status.Conditions, v1.NodeCondition{
+						Type:   v1.NodeReady,
+						Status: v1.ConditionTrue,
+					})
+				}
+				nodes = append(nodes, node)
+				if _, err := cs.CoreV1().Nodes().Create(context.TODO(), node, metav1.CreateOptions{}); err != nil {
+					t.Fatalf("Failed to create node: %q, err: %v", klog.KObj(node), err)
 				}
 			}
-			neededNode := nodes[1]
 
 			if test.pod != nil {
-				test.pod.Name = fmt.Sprintf("testpod-%d", i)
+				test.pod.Spec.NodeName = nodes[nodeIndex].Name
+				test.pod.Name = "testpod"
 				if len(test.pod.Spec.Tolerations) > 0 {
-					test.pod.Spec.Tolerations[0].TolerationSeconds = &tolerationSeconds[i]
+					test.pod.Spec.Tolerations[0].TolerationSeconds = &test.tolerationSeconds
 				}
 
 				test.pod, err = cs.CoreV1().Pods(testCtx.NS.Name).Create(context.TODO(), test.pod, metav1.CreateOptions{})
 				if err != nil {
-					t.Fatalf("Test Failed: error: %v, while creating pod", err)
-				}
-
-				if err := testutils.WaitForPodToSchedule(cs, test.pod); err != nil {
-					t.Errorf("Failed to schedule pod %s/%s on the node, err: %v",
-						test.pod.Namespace, test.pod.Name, err)
-				}
-				test.pod, err = cs.CoreV1().Pods(testCtx.NS.Name).Get(context.TODO(), test.pod.Name, metav1.GetOptions{})
-				if err != nil {
-					t.Fatalf("Test Failed: error: %v, while creating pod", err)
-				}
-				neededNode, err = cs.CoreV1().Nodes().Get(context.TODO(), test.pod.Spec.NodeName, metav1.GetOptions{})
-				if err != nil {
-					t.Fatalf("Error while getting node associated with pod %v with err %v", test.pod.Name, err)
+					t.Fatalf("Test Failed: error: %q, while creating pod %q", err, klog.KObj(test.pod))
 				}
 			}
 
-			// Regularly send heartbeat event to APIServer so that the cluster doesn't enter fullyDisruption mode.
-			// TODO(Huang-Wei): use "NodeDisruptionExclusion" feature to simply the below logic when it's beta.
-			for i := 0; i < nodeCount; i++ {
-				var conditions []v1.NodeCondition
-				// If current node is not <neededNode>
-				if neededNode.Name != nodes[i].Name {
-					conditions = []v1.NodeCondition{
-						{
-							Type:   v1.NodeReady,
-							Status: v1.ConditionTrue,
-						},
-					}
-				} else {
-					c, err := testutils.NodeReadyStatus(test.nodeConditions)
-					if err != nil {
-						t.Error(err)
-					}
-					// Need to distinguish NodeReady/False and NodeReady/Unknown.
-					// If we try to update the node with condition NotReady/False, i.e. expect a NotReady:NoExecute taint
-					// we need to keep sending the update event to keep it alive, rather than just sending once.
-					if c == v1.ConditionFalse {
-						conditions = test.nodeConditions
-					} else if c == v1.ConditionUnknown {
-						// If it's expected to update the node with condition NotReady/Unknown,
-						// i.e. expect a Unreachable:NoExecute taint,
-						// we need to only send the update event once to simulate the network unreachable scenario.
-						nodeCopy := testutils.NodeCopyWithConditions(nodes[i], test.nodeConditions)
-						if err := testutils.UpdateNodeStatus(cs, nodeCopy); err != nil && !apierrors.IsNotFound(err) {
-							t.Errorf("Cannot update node: %v", err)
-						}
-						continue
-					}
-				}
-				// Keeping sending NodeReady/True or NodeReady/False events.
-				go func(i int) {
-					for {
-						select {
-						case <-testCtx.Ctx.Done():
-							return
-						case <-time.Tick(heartbeatInternal):
-							nodeCopy := testutils.NodeCopyWithConditions(nodes[i], conditions)
-							if err := testutils.UpdateNodeStatus(cs, nodeCopy); err != nil && !apierrors.IsNotFound(err) {
-								t.Errorf("Cannot update node: %v", err)
-							}
-						}
-					}
-				}(i)
-			}
-
-			if err := testutils.WaitForNodeTaints(cs, neededNode, test.nodeTaints); err != nil {
-				t.Errorf("Failed to taint node in test %d <%s>, err: %v", i, neededNode.Name, err)
+			if err := testutils.WaitForNodeTaints(cs, nodes[nodeIndex], test.nodeTaints); err != nil {
+				t.Errorf("Failed to taint node %q, err: %v", klog.KObj(nodes[nodeIndex]), err)
 			}
 
 			if test.pod != nil {
-				err = waitForPodCondition(cs, testCtx.NS.Name, test.pod.Name, test.expectedWaitForPodCondition, time.Second*15, func(pod *v1.Pod) (bool, error) {
+				err = wait.PollImmediate(time.Second, time.Second*15, func() (bool, error) {
+					pod, err := cs.CoreV1().Pods(test.pod.Namespace).Get(context.TODO(), test.pod.Name, metav1.GetOptions{})
+					if err != nil {
+						return false, err
+					}
 					// as node is unreachable, pod0 is expected to be in Terminating status
 					// rather than getting deleted
-					if tolerationSeconds[i] == 0 {
+					if test.tolerationSeconds == 0 {
 						return pod.DeletionTimestamp != nil, nil
 					}
 					if seconds, err := testutils.GetTolerationSeconds(pod.Spec.Tolerations); err == nil {
-						return seconds == tolerationSeconds[i], nil
+						return seconds == test.tolerationSeconds, nil
 					}
 					return false, nil
-				}, t)
+				})
 				if err != nil {
 					pod, _ := cs.CoreV1().Pods(testCtx.NS.Name).Get(context.TODO(), test.pod.Name, metav1.GetOptions{})
 					t.Fatalf("Error: %v, Expected test pod to be %s but it's %v", err, test.expectedWaitForPodCondition, pod)
@@ -432,33 +373,6 @@ func TestTaintBasedEvictions(t *testing.T) {
 				testutils.CleanupPods(cs, t, []*v1.Pod{test.pod})
 			}
 			testutils.CleanupNodes(cs, t)
-			testutils.WaitForSchedulerCacheCleanup(testCtx.Scheduler, t)
 		})
 	}
 }
-
-// waitForPodCondition waits a pods to be matched to the given condition.
-func waitForPodCondition(c clientset.Interface, ns, podName, desc string, timeout time.Duration, condition podCondition, t *testing.T) error {
-	t.Logf("Waiting up to %v for pod %q in namespace %q to be %q", timeout, podName, ns, desc)
-	for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
-		pod, err := c.CoreV1().Pods(ns).Get(context.TODO(), podName, metav1.GetOptions{})
-		if err != nil {
-			if apierrors.IsNotFound(err) {
-				t.Logf("Pod %q in namespace %q not found. Error: %v", podName, ns, err)
-				return err
-			}
-			t.Logf("Get pod %q in namespace %q failed, ignoring for %v. Error: %v", podName, ns, poll, err)
-			continue
-		}
-		// log now so that current pod info is reported before calling `condition()`
-		t.Logf("Pod %q: Phase=%q, Reason=%q, readiness=%t. Elapsed: %v",
-			podName, pod.Status.Phase, pod.Status.Reason, podutil.IsPodReady(pod), time.Since(start))
-		if done, err := condition(pod); done {
-			if err == nil {
-				t.Logf("Pod %q satisfied condition %q", podName, desc)
-			}
-			return err
-		}
-	}
-	return fmt.Errorf("gave up after waiting %v for pod %q to be %q", timeout, podName, desc)
-}