From 5d1d37f9ca61bb6bf109ac9351857e2ca7c48119 Mon Sep 17 00:00:00 2001
From: Max Forbes
Date: Wed, 29 Apr 2015 15:28:48 -0700
Subject: [PATCH] Add a node reboot test.

---
 test/e2e/reboot.go | 281 +++++++++++++++++++++++++++++++++++++++++++++
 test/e2e/ssh.go    |  27 +----
 test/e2e/util.go   |  50 ++++++--
 3 files changed, 325 insertions(+), 33 deletions(-)
 create mode 100644 test/e2e/reboot.go

diff --git a/test/e2e/reboot.go b/test/e2e/reboot.go
new file mode 100644
index 00000000000..aca6d01b673
--- /dev/null
+++ b/test/e2e/reboot.go
@@ -0,0 +1,281 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package e2e
+
+import (
+	"fmt"
+	"time"
+
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
+
+	. "github.com/onsi/ginkgo"
+	. "github.com/onsi/gomega"
+)
+
+const (
+	// How long to pause between polling node or pod status.
+	poll = 5 * time.Second
+
+	// How long nodes have to be "ready" before the reboot. They should already
+	// be "ready" before the test starts, so this is small.
+	nodeReadyInitialTimeout = 20 * time.Second
+
+	// How long pods have to be "ready" before the reboot. They should already
+	// be "ready" before the test starts, so this is small.
+	podReadyBeforeTimeout = 20 * time.Second
+
+	// How long a node is allowed to go from "Ready" to "NotReady" after a
+	// reboot is issued before the test is considered failed.
+	rebootNotReadyTimeout = 2 * time.Minute
+
+	// How long a node is allowed to go from "NotReady" to "Ready" after a
+	// reboot is issued and it is found to be "NotReady" before the test is
+	// considered failed.
+	rebootReadyAgainTimeout = 5 * time.Minute
+
+	// How long pods have to be "ready" after the reboot.
+	podReadyAgainTimeout = 5 * time.Minute
+)
+
+var _ = Describe("Reboot", func() {
+	var c *client.Client
+
+	BeforeEach(func() {
+		var err error
+		c, err = loadClient()
+		Expect(err).NotTo(HaveOccurred())
+	})
+
+	It("should reboot each node and ensure they function upon restart", func() {
+		// This test requires SSH, so the provider check should be identical
+		// to the SSH test's (the limiting factor is the implementation of
+		// util.go's getSigner(...)).
+		provider := testContext.Provider
+		if !providerIs("gce") {
+			By(fmt.Sprintf("Skipping reboot test, which is not implemented for %s", provider))
+			return
+		}
+
+		// Get all nodes, and kick off the test on each.
+		nodelist, err := c.Nodes().List(labels.Everything(), fields.Everything())
+		if err != nil {
+			Failf("Error getting nodes: %v", err)
+		}
+		result := make(chan bool, len(nodelist.Items))
+		for _, n := range nodelist.Items {
+			go rebootNode(c, provider, n.ObjectMeta.Name, result)
+		}
+
+		// Wait for all to finish and check the final result.
+		failed := false
+		// TODO(mbforbes): Change to `for range` syntax and remove logging once
+		// we support only Go >= 1.4.
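+		// Each rebootNode goroutine sends exactly one value on the buffered
+		// result channel, so one receive per node collects every outcome.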
+		for _, n := range nodelist.Items {
+			if !<-result {
+				Logf("Node %s failed reboot test.", n.ObjectMeta.Name)
+				failed = true
+			}
+		}
+		if failed {
+			Failf("Test failed; at least one node failed to reboot in the time given.")
+		}
+	})
+})
+
+// rebootNode takes node name on provider through the following steps using c:
+// - ensures the node is ready
+// - ensures all pods on the node are running and ready
+// - reboots the node
+// - ensures the node reaches some non-ready state
+// - ensures the node becomes ready again
+// - ensures all pods on the node become running and ready again
+//
+// It returns true through result only if all of the steps pass; at the first
+// failed step, it will return false through result and not run the rest.
+func rebootNode(c *client.Client, provider, name string, result chan bool) {
+	// Get the node initially.
+	Logf("Getting %s", name)
+	node, err := c.Nodes().Get(name)
+	if err != nil {
+		Logf("Couldn't get node %s", name)
+		result <- false
+		return
+	}
+
+	// Node sanity check: ensure it is "ready".
+	if !waitForNodeToBeReady(c, name, nodeReadyInitialTimeout) {
+		result <- false
+		return
+	}
+
+	// Get all the pods on the node.
+	podList, err := c.Pods(api.NamespaceDefault).List(
+		labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name))
+	if err != nil {
+		Logf("Error getting pods for node %s: %v", name, err)
+		result <- false
+		return
+	}
+	podNames := make([]string, len(podList.Items))
+	for i, p := range podList.Items {
+		podNames[i] = p.ObjectMeta.Name
+	}
+	Logf("Node %s has %d pods: %v", name, len(podNames), podNames)
+
+	// For each pod, we do a sanity check to ensure it's running / healthy
+	// now, as that's what we'll be checking later.
+	if !checkPodsRunning(c, podNames, podReadyBeforeTimeout) {
+		result <- false
+		return
+	}
+
+	// Reboot the node.
+	Logf("Getting external IP address for %s", name)
+	host := ""
+	for _, a := range node.Status.Addresses {
+		if a.Type == api.NodeExternalIP {
+			host = a.Address + ":22"
+			break
+		}
+	}
+	if host == "" {
+		Logf("Couldn't find external IP address for node %s", name)
+		result <- false
+		return
+	}
+	Logf("Calling reboot on %s", name)
+	rebootCmd := "sudo reboot"
+	if _, _, code, err := SSH(rebootCmd, host, provider); code != 0 || err != nil {
+		Logf("Expected 0 exit code and nil error when running %s on %s, got %d and %v",
+			rebootCmd, name, code, err)
+		result <- false
+		return
+	}
+
+	// Wait for some kind of "not ready" status.
+	if !waitForNodeToBeNotReady(c, name, rebootNotReadyTimeout) {
+		result <- false
+		return
+	}
+
+	// Wait for some kind of "ready" status.
+	if !waitForNodeToBeReady(c, name, rebootReadyAgainTimeout) {
+		result <- false
+		return
+	}
+
+	// Ensure all of the pods that we found on this node before the reboot are
+	// running / healthy.
+	if !checkPodsRunning(c, podNames, podReadyAgainTimeout) {
+		result <- false
+		return
+	}
+
+	Logf("Reboot successful on node %s", name)
+	result <- true
+}
+
+// podRunningReady is the checker function passed to waitForPodCondition(...)
+// (found in util.go). It ensures that the pod's phase is Running and that its
+// ready condition is true.
+func podRunningReady(p *api.Pod) (bool, error) {
+	// Check the phase is running.
+	if p.Status.Phase != api.PodRunning {
+		return false, fmt.Errorf("want pod %s on %s to be %v but was %v",
+			p.ObjectMeta.Name, p.Spec.Host, api.PodRunning, p.Status.Phase)
+	}
+	// Check the ready condition is true.
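+	// A pod can be Running before it is Ready (e.g. while its readiness
+	// check is still failing), so the phase check above is not sufficient
+	// on its own.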
+	for _, cond := range p.Status.Conditions {
+		if cond.Type == api.PodReady && cond.Status == api.ConditionTrue {
+			return true, nil
+		}
+	}
+	return false, fmt.Errorf("pod %s on %s didn't have condition %v, %v; conditions: %v",
+		p.ObjectMeta.Name, p.Spec.Host, api.PodReady, api.ConditionTrue, p.Status.Conditions)
+}
+
+// checkPodsRunning returns whether all pods whose names are listed in podNames
+// are running.
+func checkPodsRunning(c *client.Client, podNames []string, timeout time.Duration) bool {
+	desc := "running and ready"
+	Logf("Waiting up to %v for the following pods to be %s: %s", timeout, desc, podNames)
+	result := make(chan bool, len(podNames))
+	for _, podName := range podNames {
+		// Shadow podName so that each goroutine below captures its own copy
+		// of the loop variable.
+		podName := podName
+		// Launch a readiness checker for this pod.
+		go func() {
+			err := waitForPodCondition(c, api.NamespaceDefault, podName, desc,
+				poll, timeout, podRunningReady)
+			result <- err == nil
+		}()
+	}
+	// Wait for them all to finish.
+	success := true
+	// TODO(mbforbes): Change to `for range` syntax and remove logging once we
+	// support only Go >= 1.4.
+	for _, podName := range podNames {
+		if !<-result {
+			Logf("Pod %s failed to be %s.", podName, desc)
+			success = false
+		}
+	}
+	Logf("Wanted all pods to be %s. Result: %t. Pods: %v", desc, success, podNames)
+	return success
+}
+
+// waitForNodeToBeReady returns whether node name is ready within timeout.
+func waitForNodeToBeReady(c *client.Client, name string, timeout time.Duration) bool {
+	return waitForNodeToBe(c, name, true, timeout)
+}
+
+// waitForNodeToBeNotReady returns whether node name is not ready (i.e. the
+// readiness condition is anything but ready, e.g. false or unknown) within
+// timeout.
+func waitForNodeToBeNotReady(c *client.Client, name string, timeout time.Duration) bool {
+	return waitForNodeToBe(c, name, false, timeout)
+}
+
+// waitForNodeToBe returns whether node name's readiness state matches wantReady
+// within timeout. If wantReady is true, it will ensure the node is ready; if
+// it's false, it ensures the node is in any state other than ready (e.g. not
+// ready or unknown).
+func waitForNodeToBe(c *client.Client, name string, wantReady bool, timeout time.Duration) bool {
+	Logf("Waiting up to %v for node %s readiness to be %t", timeout, name, wantReady)
+	for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
+		node, err := c.Nodes().Get(name)
+		if err != nil {
+			Logf("Couldn't get node %s", name)
+			continue
+		}
+
+		// Check the node readiness condition (logging all).
+		for i, cond := range node.Status.Conditions {
+			Logf("Node %s condition %d/%d: type: %v, status: %v",
+				name, i+1, len(node.Status.Conditions), cond.Type, cond.Status)
+			// Ensure that the condition type is readiness and the status
+			// matches as desired.
+			if cond.Type == api.NodeReady && (cond.Status == api.ConditionTrue) == wantReady {
+				Logf("Successfully found node %s readiness to be %t", name, wantReady)
+				return true
+			}
+		}
+	}
+	Logf("Node %s didn't reach desired readiness (%t) within %v", name, wantReady, timeout)
+	return false
+}
diff --git a/test/e2e/ssh.go b/test/e2e/ssh.go
index d1568c98dd7..59aa7144321 100644
--- a/test/e2e/ssh.go
+++ b/test/e2e/ssh.go
@@ -20,10 +20,6 @@ import (
 	"fmt"
 	"strings"
 
-	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
-	"github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
-	"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
-
 	. "github.com/onsi/ginkgo"
 	. "github.com/onsi/gomega"
 )
@@ -46,28 +42,9 @@ var _ = Describe("SSH", func() {
 
 		// Get all nodes' external IPs.
 		By("Getting all nodes' SSH-able IP addresses")
-		nodelist, err := c.Nodes().List(labels.Everything(), fields.Everything())
+		hosts, err := NodeSSHHosts(c)
 		if err != nil {
-			Failf("Error getting nodes: %v", err)
-		}
-		hosts := make([]string, 0, len(nodelist.Items))
-		for _, n := range nodelist.Items {
-			for _, addr := range n.Status.Addresses {
-				// Use the first external IP address we find on the node, and
-				// use at most one per node.
-				// NOTE: Until #7412 is fixed this will repeatedly ssh into the
-				// master node and not check any of the minions.
-				if addr.Type == api.NodeExternalIP {
-					hosts = append(hosts, addr.Address+":22")
-					break
-				}
-			}
-		}
-
-		// Fail if any node didn't have an external IP.
-		if len(hosts) != len(nodelist.Items) {
-			Failf("Only found %d external IPs on nodes, but found %d nodes. Nodelist: %v",
-				len(hosts), len(nodelist.Items), nodelist)
+			Failf("Error getting node hostnames: %v", err)
 		}
 
 		testCases := []struct {
diff --git a/test/e2e/util.go b/test/e2e/util.go
index b91255ca485..77f2f2ffd0b 100644
--- a/test/e2e/util.go
+++ b/test/e2e/util.go
@@ -52,6 +52,9 @@ const (
 
 	// String used to mark pod deletion
 	nonExist = "NonExist"
+
+	// How often to poll pods.
+	podPoll = 5 * time.Second
 )
 
 type TestContextType struct {
@@ -88,12 +91,12 @@ func providerIs(providers ...string) bool {
 
 type podCondition func(pod *api.Pod) (bool, error)
 
-func waitForPodCondition(c *client.Client, ns, podName, desc string, condition podCondition) error {
-	By(fmt.Sprintf("waiting up to %v for pod %s status to be %s", podStartTimeout, podName, desc))
-	for start := time.Now(); time.Since(start) < podStartTimeout; time.Sleep(5 * time.Second) {
+func waitForPodCondition(c *client.Client, ns, podName, desc string, poll, timeout time.Duration, condition podCondition) error {
+	Logf("Waiting up to %v for pod %s status to be %s", timeout, podName, desc)
+	for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
 		pod, err := c.Pods(ns).Get(podName)
 		if err != nil {
-			Logf("Get pod %v in ns %v failed, ignoring for 5s: %v", podName, ns, err)
+			Logf("Get pod %s in ns %s failed, ignoring for %v: %v", podName, ns, poll, err)
 			continue
 		}
 		done, err := condition(pod)
@@ -102,7 +105,7 @@ func waitForPodCondition(c *client.Client, ns, podName, desc string, condition p
 		}
 		Logf("Waiting for pod %s in namespace %s status to be %q (found %q) (%v)", podName, ns, desc, pod.Status.Phase, time.Since(start))
 	}
-	return fmt.Errorf("gave up waiting for pod %s to be %s after %.2f seconds", podName, desc, podStartTimeout.Seconds())
+	return fmt.Errorf("gave up waiting for pod %s to be %s after %v", podName, desc, timeout)
 }
 
 // createNS should be used by every test, note that we append a common prefix to the provided test name.
@@ -119,7 +122,7 @@ func createTestingNS(baseName string, c *client.Client) (*api.Namespace, error)
 }
 
 func waitForPodRunningInNamespace(c *client.Client, podName string, namespace string) error {
-	return waitForPodCondition(c, namespace, podName, "running", func(pod *api.Pod) (bool, error) {
+	return waitForPodCondition(c, namespace, podName, "running", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) {
 		return (pod.Status.Phase == api.PodRunning), nil
 	})
 }
@@ -130,7 +133,7 @@ func waitForPodRunning(c *client.Client, podName string) error {
 
 // waitForPodNotPending returns an error if it took too long for the pod to go out of pending state.
func waitForPodNotPending(c *client.Client, ns, podName string) error { - return waitForPodCondition(c, ns, podName, "!pending", func(pod *api.Pod) (bool, error) { + return waitForPodCondition(c, ns, podName, "!pending", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) { if pod.Status.Phase != api.PodPending { Logf("Saw pod %s in namespace %s out of pending state (found %q)", podName, ns, pod.Status.Phase) return true, nil @@ -141,7 +144,7 @@ func waitForPodNotPending(c *client.Client, ns, podName string) error { // waitForPodSuccessInNamespace returns nil if the pod reached state success, or an error if it reached failure or ran too long. func waitForPodSuccessInNamespace(c *client.Client, podName string, contName string, namespace string) error { - return waitForPodCondition(c, namespace, podName, "success or failure", func(pod *api.Pod) (bool, error) { + return waitForPodCondition(c, namespace, podName, "success or failure", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) { // Cannot use pod.Status.Phase == api.PodSucceeded/api.PodFailed due to #2632 ci, ok := api.GetContainerStatus(pod.Status.ContainerStatuses, contName) if !ok { @@ -741,6 +744,37 @@ func BadEvents(events []*api.Event) int { return badEvents } +// NodeSSHHosts returns SSH-able host names for all nodes. It returns an error +// if it can't find an external IP for every node, though it still returns all +// hosts that it found in that case. +func NodeSSHHosts(c *client.Client) ([]string, error) { + var hosts []string + nodelist, err := c.Nodes().List(labels.Everything(), fields.Everything()) + if err != nil { + return hosts, fmt.Errorf("error getting nodes: %v", err) + } + for _, n := range nodelist.Items { + for _, addr := range n.Status.Addresses { + // Use the first external IP address we find on the node, and + // use at most one per node. + // TODO(mbforbes): Use the "preferred" address for the node, once + // such a thing is defined (#2462). + if addr.Type == api.NodeExternalIP { + hosts = append(hosts, addr.Address+":22") + break + } + } + } + + // Error if any node didn't have an external IP. + if len(hosts) != len(nodelist.Items) { + return hosts, fmt.Errorf( + "only found %d external IPs on nodes, but found %d nodes. Nodelist: %v", + len(hosts), len(nodelist.Items), nodelist) + } + return hosts, nil +} + // SSH synchronously SSHs to a node running on provider and runs cmd. If there // is no error performing the SSH, the stdout, stderr, and exit code are // returned.
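
As a usage illustration (not part of the patch): a test that wants to run a
command on every node can compose the new NodeSSHHosts helper with the
existing SSH function. This is a minimal sketch; logUptimeOnAllNodes is a
hypothetical name, and SSH is assumed to return (stdout, stderr string,
code int, err error), matching its call sites above.

// Illustration only. Runs "uptime" over SSH on every node and logs the
// output, using the e2e package helpers loadClient, NodeSSHHosts, SSH,
// Logf, and testContext.
func logUptimeOnAllNodes() error {
	c, err := loadClient()
	if err != nil {
		return fmt.Errorf("error loading client: %v", err)
	}
	hosts, err := NodeSSHHosts(c) // each entry is "address:22"
	if err != nil {
		return fmt.Errorf("error getting node SSH hosts: %v", err)
	}
	for _, host := range hosts {
		stdout, stderr, code, err := SSH("uptime", host, testContext.Provider)
		if code != 0 || err != nil {
			return fmt.Errorf("uptime on %s failed: code %d, stderr %q, err %v",
				host, code, stderr, err)
		}
		Logf("uptime on %s: %s", host, stdout)
	}
	return nil
}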