E2E test node upgrade (to same version)

2025-09-10 13:42:02 +00:00 · 2015-05-21 14:43:42 -07:00
parent a8a3e9d0c7
commit a6c47a07de
4 changed files with 519 additions and 117 deletions
--- a/test/e2e/util.go
+++ b/test/e2e/util.go
@@ -42,6 +42,7 @@ import (
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/runtime"
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/util/wait"
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/watch"

 	"code.google.com/p/go-uuid/uuid"
@@ -60,15 +61,28 @@ const (
 	// String used to mark pod deletion
 	nonExist = "NonExist"

-	// How often to poll pods.
-	podPoll = 5 * time.Second
+	// How often to poll pods and nodes.
+	poll = 5 * time.Second

 	// service accounts are provisioned after namespace creation
 	// a service account is required to support pod creation in a namespace as part of admission control
 	serviceAccountProvisionTimeout = 2 * time.Minute

-	// How often to poll for service accounts
-	serviceAccountPoll = 5 * time.Second
+	// How long to try single API calls (like 'get' or 'list'). Used to prevent
+	// transient failures from failing tests.
+	singleCallTimeout = 30 * time.Second
+
+	// How long nodes have to be "ready" when a test begins. They should already
+	// be "ready" before the test starts, so this is small.
+	nodeReadyInitialTimeout = 20 * time.Second
+
+	// How long pods have to be "ready" when a test begins. They should already
+	// be "ready" before the test starts, so this is small.
+	podReadyBeforeTimeout = 20 * time.Second
+
+	// How wide to print pod names, by default. Useful for aligning printing to
+	// quickly scan through output.
+	podPrintWidth = 55
 )

 type CloudConfig struct {
@@ -218,7 +232,6 @@ func podRunningReady(p *api.Pod) (bool, error) {
 	if !podReady(p) {
 		return false, fmt.Errorf("pod '%s' on '%s' didn't have condition {%v %v}; conditions: %v",
 			p.ObjectMeta.Name, p.Spec.NodeName, api.PodReady, api.ConditionTrue, p.Status.Conditions)
-
 	}
 	return true, nil
 }
@@ -233,15 +246,16 @@ func waitForPodsRunningReady(ns string, minPods int, timeout time.Duration) erro
 	if err != nil {
 		return err
 	}
+	start := time.Now()
 	Logf("Waiting up to %v for all pods (need at least %d) in namespace '%s' to be running and ready",
 		timeout, minPods, ns)
-	for start := time.Now(); time.Since(start) < timeout; time.Sleep(podPoll) {
+	if wait.Poll(poll, timeout, func() (bool, error) {
 		// We get the new list of pods in every iteration beause more pods come
 		// online during startup and we want to ensure they are also checked.
 		podList, err := c.Pods(ns).List(labels.Everything(), fields.Everything())
 		if err != nil {
 			Logf("Error getting pods in namespace '%s': %v", ns, err)
-			continue
+			return false, nil
 		}
 		nOk, badPods := 0, []api.Pod{}
 		for _, pod := range podList.Items {
@@ -254,14 +268,17 @@ func waitForPodsRunningReady(ns string, minPods int, timeout time.Duration) erro
 		Logf("%d / %d pods in namespace '%s' are running and ready (%d seconds elapsed)",
 			nOk, len(podList.Items), ns, int(time.Since(start).Seconds()))
 		if nOk == len(podList.Items) && nOk >= minPods {
-			return nil
+			return true, nil
 		}
 		logPodStates(badPods)
+		return false, nil
+	}) != nil {
+		return fmt.Errorf("Not all pods in namespace '%s' running and ready within %v", ns, timeout)
 	}
-	return fmt.Errorf("Not all pods in namespace '%s' running and ready within %v", ns, timeout)
+	return nil
 }

-func waitForServiceAccountInNamespace(c *client.Client, ns, serviceAccountName string, poll, timeout time.Duration) error {
+func waitForServiceAccountInNamespace(c *client.Client, ns, serviceAccountName string, timeout time.Duration) error {
 	Logf("Waiting up to %v for service account %s to be provisioned in ns %s", timeout, serviceAccountName, ns)
 	for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
 		_, err := c.ServiceAccounts(ns).Get(serviceAccountName)
@@ -275,20 +292,23 @@ func waitForServiceAccountInNamespace(c *client.Client, ns, serviceAccountName s
 	return fmt.Errorf("Service account %s in namespace %s not ready within %v", serviceAccountName, ns, timeout)
 }

-func waitForPodCondition(c *client.Client, ns, podName, desc string, poll, timeout time.Duration, condition podCondition) error {
-	Logf("Waiting up to %v for pod %s status to be %s", timeout, podName, desc)
+func waitForPodCondition(c *client.Client, ns, podName, desc string, timeout time.Duration, condition podCondition) error {
+	Logf("Waiting up to %[1]v for pod %-[2]*[3]s status to be %[4]s", timeout, podPrintWidth, podName, desc)
 	for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
 		pod, err := c.Pods(ns).Get(podName)
 		if err != nil {
-			Logf("Get pod %s in ns %s failed, ignoring for %v: %v", podName, ns, poll, err)
+			// Aligning this text makes it much more readable
+			Logf("Get pod %-[1]*[2]s in namespace '%[3]s' failed, ignoring for %[4]v. Error: %[5]v",
+				podPrintWidth, podName, ns, poll, err)
 			continue
 		}
 		done, err := condition(pod)
 		if done {
 			return err
 		}
-		Logf("Waiting for pod '%s' in namespace '%s' status to be '%q' (found phase: '%q', readiness: %t) (%v)",
-			podName, ns, desc, pod.Status.Phase, podReady(pod), time.Since(start))
+		Logf("Waiting for pod %-[1]*[2]s in namespace '%[3]s' status to be '%[4]s'"+
+			"(found phase: %[5]q, readiness: %[6]t) (%[7]v elapsed)",
+			podPrintWidth, podName, ns, desc, pod.Status.Phase, podReady(pod), time.Since(start))
 	}
 	return fmt.Errorf("gave up waiting for pod '%s' to be '%s' after %v", podName, desc, timeout)
 }
@@ -297,7 +317,7 @@ func waitForPodCondition(c *client.Client, ns, podName, desc string, poll, timeo
 // the default service account is what is associated with pods when they do not specify a service account
 // as a result, pods are not able to be provisioned in a namespace until the service account is provisioned
 func waitForDefaultServiceAccountInNamespace(c *client.Client, namespace string) error {
-	return waitForServiceAccountInNamespace(c, namespace, "default", serviceAccountPoll, serviceAccountProvisionTimeout)
+	return waitForServiceAccountInNamespace(c, namespace, "default", serviceAccountProvisionTimeout)
 }

 // createNS should be used by every test, note that we append a common prefix to the provided test name.
@@ -315,7 +335,7 @@ func createTestingNS(baseName string, c *client.Client) (*api.Namespace, error)
 }

 func waitForPodRunningInNamespace(c *client.Client, podName string, namespace string) error {
-	return waitForPodCondition(c, namespace, podName, "running", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) {
+	return waitForPodCondition(c, namespace, podName, "running", podStartTimeout, func(pod *api.Pod) (bool, error) {
 		if pod.Status.Phase == api.PodRunning {
 			return true, nil
 		}
@@ -332,7 +352,7 @@ func waitForPodRunning(c *client.Client, podName string) error {

 // waitForPodNotPending returns an error if it took too long for the pod to go out of pending state.
 func waitForPodNotPending(c *client.Client, ns, podName string) error {
-	return waitForPodCondition(c, ns, podName, "!pending", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) {
+	return waitForPodCondition(c, ns, podName, "!pending", podStartTimeout, func(pod *api.Pod) (bool, error) {
 		if pod.Status.Phase != api.PodPending {
 			Logf("Saw pod '%s' in namespace '%s' out of pending state (found '%q')", podName, ns, pod.Status.Phase)
 			return true, nil
@@ -343,7 +363,7 @@ func waitForPodNotPending(c *client.Client, ns, podName string) error {

 // waitForPodSuccessInNamespace returns nil if the pod reached state success, or an error if it reached failure or ran too long.
 func waitForPodSuccessInNamespace(c *client.Client, podName string, contName string, namespace string) error {
-	return waitForPodCondition(c, namespace, podName, "success or failure", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) {
+	return waitForPodCondition(c, namespace, podName, "success or failure", podStartTimeout, func(pod *api.Pod) (bool, error) {
 		// Cannot use pod.Status.Phase == api.PodSucceeded/api.PodFailed due to #2632
 		ci, ok := api.GetContainerStatus(pod.Status.ContainerStatuses, contName)
 		if !ok {
@@ -986,6 +1006,19 @@ func DeleteRC(c *client.Client, ns, name string) error {
 	return err
 }

+// Convenient wrapper around listing nodes supporting retries.
+func listNodes(c *client.Client, label labels.Selector, field fields.Selector) (*api.NodeList, error) {
+	var nodes *api.NodeList
+	var errLast error
+	if wait.Poll(poll, singleCallTimeout, func() (bool, error) {
+		nodes, errLast = c.Nodes().List(label, field)
+		return errLast == nil, nil
+	}) != nil {
+		return nil, fmt.Errorf("listNodes() failed with last error: %v", errLast)
+	}
+	return nodes, nil
+}
+
 // FailedContainers inspects all containers in a pod and returns failure
 // information for containers that have failed or been restarted.
 // A map is returned where the key is the containerID and the value is a
@@ -1111,6 +1144,74 @@ func getSigner(provider string) (ssh.Signer, error) {
 	return util.MakePrivateKeySigner(key)
 }

+// checkPodsRunning returns whether all pods whose names are listed in podNames
+// are running and ready.
+func checkPodsRunningReady(c *client.Client, podNames []string, timeout time.Duration) bool {
+	np, desc := len(podNames), "running and ready"
+	Logf("Waiting up to %v for the following %d pods to be %s: %s", timeout, np, desc, podNames)
+	result := make(chan bool, len(podNames))
+	for ix := range podNames {
+		// Launch off pod readiness checkers.
+		go func(name string) {
+			err := waitForPodCondition(c, api.NamespaceDefault, name, desc, timeout, podRunningReady)
+			result <- err == nil
+		}(podNames[ix])
+	}
+	// Wait for them all to finish.
+	success := true
+	// TODO(mbforbes): Change to `for range` syntax and remove logging once we
+	// support only Go >= 1.4.
+	for _, podName := range podNames {
+		if !<-result {
+			Logf("Pod %-[1]*[2]s failed to be %[3]s.", podPrintWidth, podName, desc)
+			success = false
+		}
+	}
+	Logf("Wanted all %d pods to be %s. Result: %t. Pods: %v", np, desc, success, podNames)
+	return success
+}
+
+// waitForNodeToBeReady returns whether node name is ready within timeout.
+func waitForNodeToBeReady(c *client.Client, name string, timeout time.Duration) bool {
+	return waitForNodeToBe(c, name, true, timeout)
+}
+
+// waitForNodeToBeNotReady returns whether node name is not ready (i.e. the
+// readiness condition is anything but ready, e.g false or unknown) within
+// timeout.
+func waitForNodeToBeNotReady(c *client.Client, name string, timeout time.Duration) bool {
+	return waitForNodeToBe(c, name, false, timeout)
+}
+
+// waitForNodeToBe returns whether node name's readiness state matches wantReady
+// within timeout. If wantReady is true, it will ensure the node is ready; if
+// it's false, it ensures the node is in any state other than ready (e.g. not
+// ready or unknown).
+func waitForNodeToBe(c *client.Client, name string, wantReady bool, timeout time.Duration) bool {
+	Logf("Waiting up to %v for node %s readiness to be %t", timeout, name, wantReady)
+	for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
+		node, err := c.Nodes().Get(name)
+		if err != nil {
+			Logf("Couldn't get node %s", name)
+			continue
+		}
+
+		// Check the node readiness condition (logging all).
+		for i, cond := range node.Status.Conditions {
+			Logf("Node %s condition %d/%d: type: %v, status: %v",
+				name, i+1, len(node.Status.Conditions), cond.Type, cond.Status)
+			// Ensure that the condition type is readiness and the status
+			// matches as desired.
+			if cond.Type == api.NodeReady && (cond.Status == api.ConditionTrue) == wantReady {
+				Logf("Successfully found node %s readiness to be %t", name, wantReady)
+				return true
+			}
+		}
+	}
+	Logf("Node %s didn't reach desired readiness (%t) within %v", name, wantReady, timeout)
+	return false
+}
+
 // LatencyMetrics stores data about request latency at a given quantile
 // broken down by verb (e.g. GET, PUT, LIST) and resource (e.g. pods, services).
 type LatencyMetric struct {