From 5d1d37f9ca61bb6bf109ac9351857e2ca7c48119 Mon Sep 17 00:00:00 2001
From: Max Forbes
Date: Wed, 29 Apr 2015 15:28:48 -0700
Subject: [PATCH] Add a node reboot test.

---
 test/e2e/reboot.go | 281 +++++++++++++++++++++++++++++++++++++++++++++
 test/e2e/ssh.go    |  27 +----
 test/e2e/util.go   |  50 ++++++--
 3 files changed, 325 insertions(+), 33 deletions(-)
 create mode 100644 test/e2e/reboot.go

diff --git a/test/e2e/reboot.go b/test/e2e/reboot.go
new file mode 100644
index 00000000000..aca6d01b673
--- /dev/null
+++ b/test/e2e/reboot.go
@@ -0,0 +1,281 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package e2e
+
+import (
+	"fmt"
+	"time"
+
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
+
+	. "github.com/onsi/ginkgo"
+	. "github.com/onsi/gomega"
+)
+
+const (
+	// How long to pause between polling node or pod status.
+	poll = 5 * time.Second
+
+	// How long nodes have to be "ready" before the reboot. They should already
+	// be "ready" before the test starts, so this is small.
+	nodeReadyInitialTimeout = 20 * time.Second
+
+	// How long pods have to be "ready" before the reboot. They should already
+	// be "ready" before the test starts, so this is small.
+	podReadyBeforeTimeout = 20 * time.Second
+
+	// How long a node is allowed to go from "Ready" to "NotReady" after a
+	// reboot is issued before the test is considered failed.
+	rebootNotReadyTimeout = 2 * time.Minute
+
+	// How long a node is allowed to go from "NotReady" to "Ready" after a
+	// reboot is issued and it is found to be "NotReady" before the test is
+	// considered failed.
+	rebootReadyAgainTimeout = 5 * time.Minute
+
+	// How long pods have to be "ready" after the reboot.
+	podReadyAgainTimeout = 5 * time.Minute
+)
+
+var _ = Describe("Reboot", func() {
+	var c *client.Client
+
+	BeforeEach(func() {
+		var err error
+		c, err = loadClient()
+		Expect(err).NotTo(HaveOccurred())
+	})
+
+	It("should reboot each node and ensure they function upon restart", func() {
+		// This test requires SSH, so the provider check should be identical
+		// to the SSH test's (the limiting factor is the implementation of
+		// util.go's getSigner(...)).
+		provider := testContext.Provider
+		if !providerIs("gce") {
+			By(fmt.Sprintf("Skipping reboot test, which is not implemented for %s", provider))
+			return
+		}
+
+		// Get all nodes, and kick off the test on each.
+		nodelist, err := c.Nodes().List(labels.Everything(), fields.Everything())
+		if err != nil {
+			Failf("Error getting nodes: %v", err)
+		}
+		result := make(chan bool, len(nodelist.Items))
+		for _, n := range nodelist.Items {
+			go rebootNode(c, provider, n.ObjectMeta.Name, result)
+		}
+
+		// Wait for all to finish and check the final result.
+		failed := false
+		// TODO(mbforbes): Change to `for range` syntax and remove logging once
+		// we support only Go >= 1.4.
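+		// Each rebootNode goroutine sends exactly one value on the buffered
+		// result channel, so one receive per node collects every outcome.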
+		for _, n := range nodelist.Items {
+			if !<-result {
+				Logf("Node %s failed reboot test.", n.ObjectMeta.Name)
+				failed = true
+			}
+		}
+		if failed {
+			Failf("Test failed; at least one node failed to reboot in the time given.")
+		}
+	})
+})
+
+// rebootNode takes node name on provider through the following steps using c:
+// - ensures the node is ready
+// - ensures all pods on the node are running and ready
+// - reboots the node
+// - ensures the node reaches some non-ready state
+// - ensures the node becomes ready again
+// - ensures all pods on the node become running and ready again
+//
+// It returns true through result only if all of the steps pass; at the first
+// failed step, it will return false through result and not run the rest.
+func rebootNode(c *client.Client, provider, name string, result chan bool) {
+	// Get the node initially.
+	Logf("Getting %s", name)
+	node, err := c.Nodes().Get(name)
+	if err != nil {
+		Logf("Couldn't get node %s", name)
+		result <- false
+		return
+	}
+
+	// Node sanity check: ensure it is "ready".
+	if !waitForNodeToBeReady(c, name, nodeReadyInitialTimeout) {
+		result <- false
+		return
+	}
+
+	// Get all the pods on the node.
+	podList, err := c.Pods(api.NamespaceDefault).List(
+		labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name))
+	if err != nil {
+		Logf("Error getting pods for node %s: %v", name, err)
+		result <- false
+		return
+	}
+	podNames := make([]string, len(podList.Items))
+	for i, p := range podList.Items {
+		podNames[i] = p.ObjectMeta.Name
+	}
+	Logf("Node %s has %d pods: %v", name, len(podNames), podNames)
+
+	// For each pod, we do a sanity check to ensure it's running / healthy
+	// now, as that's what we'll be checking later.
+	if !checkPodsRunning(c, podNames, podReadyBeforeTimeout) {
+		result <- false
+		return
+	}
+
+	// Reboot the node.
+	Logf("Getting external IP address for %s", name)
+	host := ""
+	for _, a := range node.Status.Addresses {
+		if a.Type == api.NodeExternalIP {
+			host = a.Address + ":22"
+			break
+		}
+	}
+	if host == "" {
+		Logf("Couldn't find external IP address for node %s", name)
+		result <- false
+		return
+	}
+	Logf("Calling reboot on %s", name)
+	rebootCmd := "sudo reboot"
+	if _, _, code, err := SSH(rebootCmd, host, provider); code != 0 || err != nil {
+		Logf("Expected 0 exit code and nil error when running %s on %s, got %d and %v",
+			rebootCmd, name, code, err)
+		result <- false
+		return
+	}
+
+	// Wait for some kind of "not ready" status.
+	if !waitForNodeToBeNotReady(c, name, rebootNotReadyTimeout) {
+		result <- false
+		return
+	}
+
+	// Wait for some kind of "ready" status.
+	if !waitForNodeToBeReady(c, name, rebootReadyAgainTimeout) {
+		result <- false
+		return
+	}
+
+	// Ensure all of the pods that we found on this node before the reboot are
+	// running / healthy.
+	if !checkPodsRunning(c, podNames, podReadyAgainTimeout) {
+		result <- false
+		return
+	}
+
+	Logf("Reboot successful on node %s", name)
+	result <- true
+}
+
+// podRunningReady is the checker function passed to waitForPodCondition(...)
+// (found in util.go). It ensures that the pod's phase is Running and that its
+// ready condition is true.
+func podRunningReady(p *api.Pod) (bool, error) {
+	// Check the phase is running.
+	if p.Status.Phase != api.PodRunning {
+		return false, fmt.Errorf("want pod %s on %s to be %v but was %v",
+			p.ObjectMeta.Name, p.Spec.Host, api.PodRunning, p.Status.Phase)
+	}
+	// Check the ready condition is true.
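+	// A pod can be Running before it is Ready (e.g. while its readiness
+	// check is still failing), so the phase check above is not sufficient
+	// on its own.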
+	for _, cond := range p.Status.Conditions {
+		if cond.Type == api.PodReady && cond.Status == api.ConditionTrue {
+			return true, nil
+		}
+	}
+	return false, fmt.Errorf("pod %s on %s didn't have condition %v, %v; conditions: %v",
+		p.ObjectMeta.Name, p.Spec.Host, api.PodReady, api.ConditionTrue, p.Status.Conditions)
+}
+
+// checkPodsRunning returns whether all pods whose names are listed in podNames
+// are running.
+func checkPodsRunning(c *client.Client, podNames []string, timeout time.Duration) bool {
+	desc := "running and ready"
+	Logf("Waiting up to %v for the following pods to be %s: %s", timeout, desc, podNames)
+	result := make(chan bool, len(podNames))
+	for _, podName := range podNames {
+		// Shadow podName so that each goroutine below captures its own copy
+		// of the loop variable.
+		podName := podName
+		// Launch a readiness checker for this pod.
+		go func() {
+			err := waitForPodCondition(c, api.NamespaceDefault, podName, desc,
+				poll, timeout, podRunningReady)
+			result <- err == nil
+		}()
+	}
+	// Wait for them all to finish.
+	success := true
+	// TODO(mbforbes): Change to `for range` syntax and remove logging once we
+	// support only Go >= 1.4.
+	for _, podName := range podNames {
+		if !<-result {
+			Logf("Pod %s failed to be %s.", podName, desc)
+			success = false
+		}
+	}
+	Logf("Wanted all pods to be %s. Result: %t. Pods: %v", desc, success, podNames)
+	return success
+}
+
+// waitForNodeToBeReady returns whether node name is ready within timeout.
+func waitForNodeToBeReady(c *client.Client, name string, timeout time.Duration) bool {
+	return waitForNodeToBe(c, name, true, timeout)
+}
+
+// waitForNodeToBeNotReady returns whether node name is not ready (i.e. the
+// readiness condition is anything but ready, e.g. false or unknown) within
+// timeout.
+func waitForNodeToBeNotReady(c *client.Client, name string, timeout time.Duration) bool {
+	return waitForNodeToBe(c, name, false, timeout)
+}
+
+// waitForNodeToBe returns whether node name's readiness state matches wantReady
+// within timeout. If wantReady is true, it will ensure the node is ready; if
+// it's false, it ensures the node is in any state other than ready (e.g. not
+// ready or unknown).
+func waitForNodeToBe(c *client.Client, name string, wantReady bool, timeout time.Duration) bool {
+	Logf("Waiting up to %v for node %s readiness to be %t", timeout, name, wantReady)
+	for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
+		node, err := c.Nodes().Get(name)
+		if err != nil {
+			Logf("Couldn't get node %s", name)
+			continue
+		}
+
+		// Check the node readiness condition (logging all).
+		for i, cond := range node.Status.Conditions {
+			Logf("Node %s condition %d/%d: type: %v, status: %v",
+				name, i+1, len(node.Status.Conditions), cond.Type, cond.Status)
+			// Ensure that the condition type is readiness and the status
+			// matches as desired.
+			if cond.Type == api.NodeReady && (cond.Status == api.ConditionTrue) == wantReady {
+				Logf("Successfully found node %s readiness to be %t", name, wantReady)
+				return true
+			}
+		}
+	}
+	Logf("Node %s didn't reach desired readiness (%t) within %v", name, wantReady, timeout)
+	return false
+}
diff --git a/test/e2e/ssh.go b/test/e2e/ssh.go
index d1568c98dd7..59aa7144321 100644
--- a/test/e2e/ssh.go
+++ b/test/e2e/ssh.go
@@ -20,10 +20,6 @@ import (
 	"fmt"
 	"strings"
 
-	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
-	"github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
-	"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
-
 	. "github.com/onsi/ginkgo"
 	. "github.com/onsi/gomega"
 )
@@ -46,28 +42,9 @@ var _ = Describe("SSH", func() {
 
 		// Get all nodes' external IPs.
 		By("Getting all nodes' SSH-able IP addresses")
-		nodelist, err := c.Nodes().List(labels.Everything(), fields.Everything())
+		hosts, err := NodeSSHHosts(c)
 		if err != nil {
-			Failf("Error getting nodes: %v", err)
-		}
-		hosts := make([]string, 0, len(nodelist.Items))
-		for _, n := range nodelist.Items {
-			for _, addr := range n.Status.Addresses {
-				// Use the first external IP address we find on the node, and
-				// use at most one per node.
-				// NOTE: Until #7412 is fixed this will repeatedly ssh into the
-				// master node and not check any of the minions.
-				if addr.Type == api.NodeExternalIP {
-					hosts = append(hosts, addr.Address+":22")
-					break
-				}
-			}
-		}
-
-		// Fail if any node didn't have an external IP.
-		if len(hosts) != len(nodelist.Items) {
-			Failf("Only found %d external IPs on nodes, but found %d nodes. Nodelist: %v",
-				len(hosts), len(nodelist.Items), nodelist)
+			Failf("Error getting node hostnames: %v", err)
 		}
 
 		testCases := []struct {
diff --git a/test/e2e/util.go b/test/e2e/util.go
index b91255ca485..77f2f2ffd0b 100644
--- a/test/e2e/util.go
+++ b/test/e2e/util.go
@@ -52,6 +52,9 @@ const (
 
 	// String used to mark pod deletion
 	nonExist = "NonExist"
+
+	// How often to poll pods.
+	podPoll = 5 * time.Second
 )
 
 type TestContextType struct {
@@ -88,12 +91,12 @@ func providerIs(providers ...string) bool {
 
 type podCondition func(pod *api.Pod) (bool, error)
 
-func waitForPodCondition(c *client.Client, ns, podName, desc string, condition podCondition) error {
-	By(fmt.Sprintf("waiting up to %v for pod %s status to be %s", podStartTimeout, podName, desc))
-	for start := time.Now(); time.Since(start) < podStartTimeout; time.Sleep(5 * time.Second) {
+func waitForPodCondition(c *client.Client, ns, podName, desc string, poll, timeout time.Duration, condition podCondition) error {
+	Logf("Waiting up to %v for pod %s status to be %s", timeout, podName, desc)
+	for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
 		pod, err := c.Pods(ns).Get(podName)
 		if err != nil {
-			Logf("Get pod %v in ns %v failed, ignoring for 5s: %v", podName, ns, err)
+			Logf("Get pod %s in ns %s failed, ignoring for %v: %v", podName, ns, poll, err)
 			continue
 		}
 		done, err := condition(pod)
@@ -102,7 +105,7 @@ func waitForPodCondition(c *client.Client, ns, podName, desc string, condition p
 		}
 		Logf("Waiting for pod %s in namespace %s status to be %q (found %q) (%v)", podName, ns, desc, pod.Status.Phase, time.Since(start))
 	}
-	return fmt.Errorf("gave up waiting for pod %s to be %s after %.2f seconds", podName, desc, podStartTimeout.Seconds())
+	return fmt.Errorf("gave up waiting for pod %s to be %s after %v", podName, desc, timeout)
 }
 
 // createNS should be used by every test, note that we append a common prefix to the provided test name.
@@ -119,7 +122,7 @@ func createTestingNS(baseName string, c *client.Client) (*api.Namespace, error)
 }
 
 func waitForPodRunningInNamespace(c *client.Client, podName string, namespace string) error {
-	return waitForPodCondition(c, namespace, podName, "running", func(pod *api.Pod) (bool, error) {
+	return waitForPodCondition(c, namespace, podName, "running", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) {
 		return (pod.Status.Phase == api.PodRunning), nil
 	})
 }
@@ -130,7 +133,7 @@ func waitForPodRunning(c *client.Client, podName string) error {
 
 // waitForPodNotPending returns an error if it took too long for the pod to go out of pending state.
func waitForPodNotPending(c *client.Client, ns, podName string) error { - return waitForPodCondition(c, ns, podName, "!pending", func(pod *api.Pod) (bool, error) { + return waitForPodCondition(c, ns, podName, "!pending", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) { if pod.Status.Phase != api.PodPending { Logf("Saw pod %s in namespace %s out of pending state (found %q)", podName, ns, pod.Status.Phase) return true, nil @@ -141,7 +144,7 @@ func waitForPodNotPending(c *client.Client, ns, podName string) error { // waitForPodSuccessInNamespace returns nil if the pod reached state success, or an error if it reached failure or ran too long. func waitForPodSuccessInNamespace(c *client.Client, podName string, contName string, namespace string) error { - return waitForPodCondition(c, namespace, podName, "success or failure", func(pod *api.Pod) (bool, error) { + return waitForPodCondition(c, namespace, podName, "success or failure", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) { // Cannot use pod.Status.Phase == api.PodSucceeded/api.PodFailed due to #2632 ci, ok := api.GetContainerStatus(pod.Status.ContainerStatuses, contName) if !ok { @@ -741,6 +744,37 @@ func BadEvents(events []*api.Event) int { return badEvents } +// NodeSSHHosts returns SSH-able host names for all nodes. It returns an error +// if it can't find an external IP for every node, though it still returns all +// hosts that it found in that case. +func NodeSSHHosts(c *client.Client) ([]string, error) { + var hosts []string + nodelist, err := c.Nodes().List(labels.Everything(), fields.Everything()) + if err != nil { + return hosts, fmt.Errorf("error getting nodes: %v", err) + } + for _, n := range nodelist.Items { + for _, addr := range n.Status.Addresses { + // Use the first external IP address we find on the node, and + // use at most one per node. + // TODO(mbforbes): Use the "preferred" address for the node, once + // such a thing is defined (#2462). + if addr.Type == api.NodeExternalIP { + hosts = append(hosts, addr.Address+":22") + break + } + } + } + + // Error if any node didn't have an external IP. + if len(hosts) != len(nodelist.Items) { + return hosts, fmt.Errorf( + "only found %d external IPs on nodes, but found %d nodes. Nodelist: %v", + len(hosts), len(nodelist.Items), nodelist) + } + return hosts, nil +} + // SSH synchronously SSHs to a node running on provider and runs cmd. If there // is no error performing the SSH, the stdout, stderr, and exit code are // returned.
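
As a usage illustration (not part of the patch): a test that wants to run a
command on every node can compose the new NodeSSHHosts helper with the
existing SSH function. This is a minimal sketch; logUptimeOnAllNodes is a
hypothetical name, and SSH is assumed to return (stdout, stderr string,
code int, err error), matching its call sites above.

// Illustration only. Runs "uptime" over SSH on every node and logs the
// output, using the e2e package helpers loadClient, NodeSSHHosts, SSH,
// Logf, and testContext.
func logUptimeOnAllNodes() error {
	c, err := loadClient()
	if err != nil {
		return fmt.Errorf("error loading client: %v", err)
	}
	hosts, err := NodeSSHHosts(c) // each entry is "address:22"
	if err != nil {
		return fmt.Errorf("error getting node SSH hosts: %v", err)
	}
	for _, host := range hosts {
		stdout, stderr, code, err := SSH("uptime", host, testContext.Provider)
		if code != 0 || err != nil {
			return fmt.Errorf("uptime on %s failed: code %d, stderr %q, err %v",
				host, code, stderr, err)
		}
		Logf("uptime on %s: %s", host, stdout)
	}
	return nil
}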