diff --git a/pkg/util/wait/wait.go b/pkg/util/wait/wait.go index d6585cbe64b..45109dd0652 100644 --- a/pkg/util/wait/wait.go +++ b/pkg/util/wait/wait.go @@ -57,7 +57,7 @@ type WaitFunc func() <-chan struct{} // placed on the channel and once more when the channel is closed. If c // returns an error the loop ends and that error is returned, and if c returns // true the loop ends and nil is returned. ErrWaitTimeout will be returned if -// the channel is closed without c every returning true. +// the channel is closed without c ever returning true. func WaitFor(wait WaitFunc, c ConditionFunc) error { w := wait() for { diff --git a/test/e2e/reboot.go b/test/e2e/reboot.go index 44fd607066a..2a71629d853 100644 --- a/test/e2e/reboot.go +++ b/test/e2e/reboot.go @@ -30,28 +30,17 @@ import ( ) const ( - // How long to pause between polling node or pod status. - poll = 5 * time.Second - - // How long nodes have to be "ready" before the reboot. They should already - // be "ready" before the test starts, so this is small. - nodeReadyInitialTimeout = 20 * time.Second - - // How long pods have to be "ready" before the reboot. They should already - // be "ready" before the test starts, so this is small. - podReadyBeforeTimeout = 20 * time.Second - // How long a node is allowed to go from "Ready" to "NotReady" after a // reboot is issued before the test is considered failed. - rebootNotReadyTimeout = 2 * time.Minute + rebootNodeNotReadyTimeout = 2 * time.Minute // How long a node is allowed to go from "NotReady" to "Ready" after a // reboot is issued and it is found to be "NotReady" before the test is // considered failed. - rebootReadyAgainTimeout = 5 * time.Minute + rebootNodeReadyAgainTimeout = 5 * time.Minute // How long pods have to be "ready" after the reboot. - podReadyAgainTimeout = 5 * time.Minute + rebootPodReadyAgainTimeout = 5 * time.Minute ) var _ = Describe("Reboot", func() { @@ -105,7 +94,7 @@ func testReboot(c *client.Client, rebootCmd string) { } // Get all nodes, and kick off the test on each. - nodelist, err := c.Nodes().List(labels.Everything(), fields.Everything()) + nodelist, err := listNodes(c, labels.Everything(), fields.Everything()) if err != nil { Failf("Error getting nodes: %v", err) } @@ -159,6 +148,10 @@ func issueSSHCommand(node *api.Node, provider, cmd string) error { // It returns true through result only if all of the steps pass; at the first // failed step, it will return false through result and not run the rest. func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan bool) { + // Setup + ps := newPodStore(c, api.NamespaceDefault, labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name)) + defer ps.Stop() + // Get the node initially. Logf("Getting %s", name) node, err := c.Nodes().Get(name) @@ -175,22 +168,16 @@ func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan } // Get all the pods on the node. - podList, err := c.Pods(api.NamespaceDefault).List( - labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name)) - if err != nil { - Logf("Error getting pods for node %s: %v", name, err) - result <- false - return - } - podNames := make([]string, len(podList.Items)) - for i, p := range podList.Items { + pods := ps.List() + podNames := make([]string, len(pods)) + for i, p := range pods { podNames[i] = p.ObjectMeta.Name } Logf("Node %s has %d pods: %v", name, len(podNames), podNames) // For each pod, we do a sanity check to ensure it's running / healthy // now, as that's what we'll be checking later. 
- if !checkPodsRunning(c, podNames, podReadyBeforeTimeout) { + if !checkPodsRunningReady(c, podNames, podReadyBeforeTimeout) { result <- false return } @@ -202,20 +189,20 @@ func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan } // Wait for some kind of "not ready" status. - if !waitForNodeToBeNotReady(c, name, rebootNotReadyTimeout) { + if !waitForNodeToBeNotReady(c, name, rebootNodeNotReadyTimeout) { result <- false return } // Wait for some kind of "ready" status. - if !waitForNodeToBeReady(c, name, rebootReadyAgainTimeout) { + if !waitForNodeToBeReady(c, name, rebootNodeReadyAgainTimeout) { result <- false return } // Ensure all of the pods that we found on this node before the reboot are // running / healthy. - if !checkPodsRunning(c, podNames, podReadyAgainTimeout) { + if !checkPodsRunningReady(c, podNames, rebootPodReadyAgainTimeout) { result <- false return } @@ -223,72 +210,3 @@ func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan Logf("Reboot successful on node %s", name) result <- true } - -// checkPodsRunning returns whether all pods whose names are listed in podNames -// are running. -func checkPodsRunning(c *client.Client, podNames []string, timeout time.Duration) bool { - desc := "running and ready" - Logf("Waiting up to %v for the following pods to be %s: %s", timeout, desc, podNames) - result := make(chan bool, len(podNames)) - for ix := range podNames { - // Launch off pod readiness checkers. - go func(name string) { - err := waitForPodCondition(c, api.NamespaceDefault, name, desc, - poll, timeout, podRunningReady) - result <- err == nil - }(podNames[ix]) - } - // Wait for them all to finish. - success := true - // TODO(mbforbes): Change to `for range` syntax and remove logging once we - // support only Go >= 1.4. - for _, podName := range podNames { - if !<-result { - Logf("Pod %s failed to be %s.", podName, desc) - success = false - } - } - Logf("Wanted all pods to be %s. Result: %t. Pods: %v", desc, success, podNames) - return success -} - -// waitForNodeToBeReady returns whether node name is ready within timeout. -func waitForNodeToBeReady(c *client.Client, name string, timeout time.Duration) bool { - return waitForNodeToBe(c, name, true, timeout) -} - -// waitForNodeToBeNotReady returns whether node name is not ready (i.e. the -// readiness condition is anything but ready, e.g false or unknown) within -// timeout. -func waitForNodeToBeNotReady(c *client.Client, name string, timeout time.Duration) bool { - return waitForNodeToBe(c, name, false, timeout) -} - -// waitForNodeToBe returns whether node name's readiness state matches wantReady -// within timeout. If wantReady is true, it will ensure the node is ready; if -// it's false, it ensures the node is in any state other than ready (e.g. not -// ready or unknown). -func waitForNodeToBe(c *client.Client, name string, wantReady bool, timeout time.Duration) bool { - Logf("Waiting up to %v for node %s readiness to be %t", timeout, name, wantReady) - for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) { - node, err := c.Nodes().Get(name) - if err != nil { - Logf("Couldn't get node %s", name) - continue - } - - // Check the node readiness condition (logging all). - for i, cond := range node.Status.Conditions { - Logf("Node %s condition %d/%d: type: %v, status: %v", - name, i+1, len(node.Status.Conditions), cond.Type, cond.Status) - // Ensure that the condition type is readiness and the status - // matches as desired. 
- if cond.Type == api.NodeReady && (cond.Status == api.ConditionTrue) == wantReady { - Logf("Successfully found node %s readiness to be %t", name, wantReady) - return true - } - } - } - Logf("Node %s didn't reach desired readiness (%t) within %v", name, wantReady, timeout) - return false -} diff --git a/test/e2e/restart.go b/test/e2e/restart.go new file mode 100644 index 00000000000..bae8b275513 --- /dev/null +++ b/test/e2e/restart.go @@ -0,0 +1,383 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "fmt" + "os/exec" + "strings" + "time" + + "github.com/GoogleCloudPlatform/kubernetes/pkg/api" + "github.com/GoogleCloudPlatform/kubernetes/pkg/client" + "github.com/GoogleCloudPlatform/kubernetes/pkg/fields" + "github.com/GoogleCloudPlatform/kubernetes/pkg/labels" + "github.com/GoogleCloudPlatform/kubernetes/pkg/util/wait" + + . "github.com/onsi/ginkgo" + . "github.com/onsi/gomega" +) + +const ( + // How long each node is given during a process that restarts all nodes + // before the test is considered failed. (Note that the total time to + // restart all nodes will be this number times the number of nodes.) + restartPerNodeTimeout = 5 * time.Minute + + // How often to poll the status of a restart. + restartPoll = 20 * time.Second + + // How long a node is allowed to become "Ready" after it is restarted before + // the test is considered failed. + restartNodeReadyAgainTimeout = 5 * time.Minute + + // How long a pod is allowed to become "running" and "ready" after a node + // restart before the test is considered failed. + restartPodReadyAgainTimeout = 5 * time.Minute +) + +var _ = Describe("Restart", func() { + var c *client.Client + var ps *podStore + + BeforeEach(func() { + var err error + c, err = loadClient() + Expect(err).NotTo(HaveOccurred()) + ps = newPodStore(c, api.NamespaceDefault, labels.Everything(), fields.Everything()) + }) + + AfterEach(func() { + ps.Stop() + }) + + It("should restart all nodes and ensure all nodes and pods recover", func() { + // This test requires the ability to restart all nodes, so this provider + // check must stay in sync with the providers that restartNodes(...) supports.
+ provider := testContext.Provider + nn := testContext.CloudConfig.NumNodes + if !providerIs("gce") { + By(fmt.Sprintf("Skipping restart test, which is not implemented for %s", provider)) + return + } + + By("ensuring all nodes are ready") + nodeNamesBefore, err := checkNodesReady(c, nodeReadyInitialTimeout, nn) + Expect(err).NotTo(HaveOccurred()) + Logf("Got the following nodes before restart: %v", nodeNamesBefore) + + By("ensuring all pods are running and ready") + pods := ps.List() + podNamesBefore := make([]string, len(pods)) + for i, p := range pods { + podNamesBefore[i] = p.ObjectMeta.Name + } + if !checkPodsRunningReady(c, podNamesBefore, podReadyBeforeTimeout) { + Failf("At least one pod wasn't running and ready at test start.") + } + + By("restarting all of the nodes") + err = restartNodes(provider, restartPerNodeTimeout) + Expect(err).NotTo(HaveOccurred()) + + By("ensuring all nodes are ready after the restart") + nodeNamesAfter, err := checkNodesReady(c, restartNodeReadyAgainTimeout, nn) + Expect(err).NotTo(HaveOccurred()) + Logf("Got the following nodes after restart: %v", nodeNamesAfter) + + // Make sure that we have the same number of nodes. We're not checking + // that the names match because that's implementation specific. + By("ensuring the same number of nodes exist after the restart") + if len(nodeNamesBefore) != len(nodeNamesAfter) { + Failf("Had %d nodes before nodes were restarted, but now only have %d", + len(nodeNamesBefore), len(nodeNamesAfter)) + } + + // Make sure that we have the same number of pods. We're not checking + // that the names match because they are recreated with different names + // across node restarts. + By("ensuring the same number of pods are running and ready after restart") + podCheckStart := time.Now() + podNamesAfter, err := waitForNPods(ps, len(podNamesBefore), restartPodReadyAgainTimeout) + Expect(err).NotTo(HaveOccurred()) + remaining := restartPodReadyAgainTimeout - time.Since(podCheckStart) + if !checkPodsRunningReady(c, podNamesAfter, remaining) { + Failf("At least one pod wasn't running and ready after the restart.") + } + }) +}) + +// waitForNPods tries to list pods using ps until it finds expect of them, +// returning their names if it can do so before timeout. +func waitForNPods(ps *podStore, expect int, timeout time.Duration) ([]string, error) { + // Loop until we find expect pods or timeout is passed. + var pods []*api.Pod + var errLast error + found := wait.Poll(poll, timeout, func() (bool, error) { + pods = ps.List() + if len(pods) != expect { + errLast = fmt.Errorf("expected to find %d pods but found only %d", expect, len(pods)) + Logf("Error getting pods: %v", errLast) + return false, nil + } + return true, nil + }) == nil + // Extract the names of all found pods. + podNames := make([]string, len(pods)) + for i, p := range pods { + podNames[i] = p.ObjectMeta.Name + } + if !found { + return podNames, fmt.Errorf("couldn't find %d pods within %v; last error: %v", + expect, timeout, errLast) + } + return podNames, nil + } + +// checkNodesReady waits up to nt for expect nodes accessed by c to be ready, +// returning an error if this doesn't happen in time. It returns the names of +// nodes it finds. +func checkNodesReady(c *client.Client, nt time.Duration, expect int) ([]string, error) { + // First, keep getting all of the nodes until we get the number we expect. + var nodeList *api.NodeList + var errLast error + start := time.Now() + found := wait.Poll(poll, nt, func() (bool, error) { + // Even though listNodes(...) 
has its own retries, a rolling-update + // (GCE/GKE implementation of restart) can complete before the apiserver + // knows about all of the nodes. Thus, we retry the list nodes call + // until we get the expected number of nodes. + nodeList, errLast = listNodes(c, labels.Everything(), fields.Everything()) + if errLast != nil { + return false, nil + } + if len(nodeList.Items) != expect { + errLast = fmt.Errorf("expected to find %d nodes but found only %d", expect, len(nodeList.Items)) + Logf("%v", errLast) + return false, nil + } + return true, nil + }) == nil + nodeNames := make([]string, len(nodeList.Items)) + for i, n := range nodeList.Items { + nodeNames[i] = n.ObjectMeta.Name + } + if !found { + return nodeNames, fmt.Errorf("couldn't find %d nodes within %v; last error: %v", + expect, nt, errLast) + } + + // Next, ensure in parallel that all the nodes are ready. We subtract the + // time we spent waiting above. + timeout := nt - time.Since(start) + result := make(chan bool, len(nodeList.Items)) + for _, n := range nodeNames { + n := n + go func() { result <- waitForNodeToBeReady(c, n, timeout) }() + } + failed := false + // TODO(mbforbes): Change to `for range` syntax once we support only Go + // >= 1.4. + for i := range nodeList.Items { + _ = i + if !<-result { + failed = true + } + } + if failed { + return nodeNames, fmt.Errorf("at least one node failed to be ready") + } + return nodeNames, nil +} + +// restartNodes uses provider to do a restart of all nodes in the cluster, +// allowing up to nt per node. +func restartNodes(provider string, nt time.Duration) error { + switch provider { + case "gce": + return migRollingUpdate(nt) + default: + return fmt.Errorf("restartNodes(...) not implemented for %s", provider) + } +} + +// migRollingUpdate starts a MIG rolling update and waits up to nt times the +// number of nodes for it to complete. +func migRollingUpdate(nt time.Duration) error { + By("getting the name of the template for the managed instance group") + templ, err := migTemplate() + if err != nil { + return fmt.Errorf("couldn't get MIG template name: %v", err) + } + + By("starting the managed instance group rolling update") + id, err := migRollingUpdateStart(templ, nt) + if err != nil { + return fmt.Errorf("couldn't start the MIG rolling update: %v", err) + } + + By("polling the managed instance group rolling update until it completes") + if err := migRollingUpdatePoll(id, nt); err != nil { + return fmt.Errorf("error waiting until update completed: %v", err) + } + + return nil +} + +// migTemplate (GCE/GKE-only) returns the name of the MIG template that the +// nodes of the cluster use. +func migTemplate() (string, error) { + var errLast error + var templ string + key := "instanceTemplate" + if wait.Poll(poll, singleCallTimeout, func() (bool, error) { + // TODO(mbforbes): make this hit the compute API directly instead of + // shelling out to gcloud. + o, err := exec.Command("gcloud", "preview", "managed-instance-groups", + fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID), + fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone), + "describe", + testContext.CloudConfig.NodeInstanceGroup).CombinedOutput() + if err != nil { + errLast = fmt.Errorf("gcloud preview managed-instance-groups describe call failed with err: %v", err) + return false, nil + } + output := string(o) + + // The 'describe' call probably succeeded; parse the output and try to + // find the line that looks like "instanceTemplate: url/to/<templ>" and + // return <templ>.
+ if val := parseKVLines(output, key); len(val) > 0 { + url := strings.Split(val, "/") + templ = url[len(url)-1] + Logf("MIG group %s using template: %s", testContext.CloudConfig.NodeInstanceGroup, templ) + return true, nil + } + errLast = fmt.Errorf("couldn't find %s in output to get MIG template. Output: %s", key, output) + return false, nil + }) != nil { + return "", fmt.Errorf("migTemplate() failed with last error: %v", errLast) + } + return templ, nil +} + +// migRollingUpdateStart (GCE/GKE-only) starts a MIG rolling update using templ +// as the new template, waiting up to nt per node, and returns the ID of that +// update. +func migRollingUpdateStart(templ string, nt time.Duration) (string, error) { + var errLast error + var id string + prefix, suffix := "Started [", "]." + if err := wait.Poll(poll, singleCallTimeout, func() (bool, error) { + // TODO(mbforbes): make this hit the compute API directly instead of + // shelling out to gcloud. + o, err := exec.Command("gcloud", "preview", "rolling-updates", + fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID), + fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone), + "start", + // Required args. + fmt.Sprintf("--group=%s", testContext.CloudConfig.NodeInstanceGroup), + fmt.Sprintf("--template=%s", templ), + // Optional args to fine-tune behavior. + fmt.Sprintf("--instance-startup-timeout=%ds", int(nt.Seconds())), + // NOTE: We can speed up this process by increasing + // --max-num-concurrent-instances. + fmt.Sprintf("--max-num-concurrent-instances=%d", 1), + fmt.Sprintf("--max-num-failed-instances=%d", 0), + fmt.Sprintf("--min-instance-update-time=%ds", 0)).CombinedOutput() + if err != nil { + errLast = fmt.Errorf("gcloud preview rolling-updates call failed with err: %v", err) + return false, nil + } + output := string(o) + + // The 'start' call probably succeeded; parse the output and try to find + // the line that looks like "Started [url/to/<id>]." and return <id>. + for _, line := range strings.Split(output, "\n") { + // As a sanity check, ensure the line starts with prefix and ends + // with suffix. + if strings.Index(line, prefix) != 0 || strings.Index(line, suffix) != len(line)-len(suffix) { + continue + } + url := strings.Split(strings.TrimSuffix(strings.TrimPrefix(line, prefix), suffix), "/") + id = url[len(url)-1] + Logf("Started MIG rolling update; ID: %s", id) + return true, nil + } + errLast = fmt.Errorf("couldn't find line like '%s ... %s' in output to MIG rolling-update start. Output: %s", + prefix, suffix, output) + return false, nil + }); err != nil { + return "", fmt.Errorf("migRollingUpdateStart() failed with last error: %v", errLast) + } + return id, nil +} + +// migRollingUpdatePoll (GCE/GKE-only) polls the progress of the MIG rolling +// update with ID id until it is complete. It returns an error if this takes +// longer than nt times the number of nodes. +func migRollingUpdatePoll(id string, nt time.Duration) error { + // Two keys and a val. 
+ status, progress, done := "status", "statusMessage", "ROLLED_OUT" + start, timeout := time.Now(), nt*time.Duration(testContext.CloudConfig.NumNodes) + var errLast error + Logf("Waiting up to %v for MIG rolling update to complete.", timeout) + if wait.Poll(restartPoll, timeout, func() (bool, error) { + o, err := exec.Command("gcloud", "preview", "rolling-updates", + fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID), + fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone), + "describe", + id).CombinedOutput() + if err != nil { + errLast = fmt.Errorf("Error calling rolling-updates describe %s: %v", id, err) + Logf("%v", errLast) + return false, nil + } + output := string(o) + + // The 'describe' call probably succeeded; parse the output and try to + // find the line that looks like "status: <status>" and see whether it's + // done. + Logf("Waiting for MIG rolling update: %s (%v elapsed)", + parseKVLines(output, progress), time.Since(start)) + if st := parseKVLines(output, status); st == done { + return true, nil + } + return false, nil + }) != nil { + return fmt.Errorf("timeout waiting %v for MIG rolling update to complete. Last error: %v", timeout, errLast) + } + return nil +} + +// parseKVLines parses output that looks like lines containing "<key>: <val>" +// and returns <val> if <key> is found. Otherwise, it returns the empty string. +func parseKVLines(output, key string) string { + delim := ":" + key = key + delim + for _, line := range strings.Split(output, "\n") { + pieces := strings.SplitAfterN(line, delim, 2) + if len(pieces) != 2 { + continue + } + k, v := pieces[0], pieces[1] + if k == key { + return strings.TrimSpace(v) + } + } + return "" +} diff --git a/test/e2e/util.go b/test/e2e/util.go index b662a204294..00afda28f36 100644 --- a/test/e2e/util.go +++ b/test/e2e/util.go @@ -61,15 +61,28 @@ const ( // String used to mark pod deletion nonExist = "NonExist" - // How often to poll pods. - podPoll = 5 * time.Second + // How often to poll pods and nodes. + poll = 5 * time.Second // service accounts are provisioned after namespace creation // a service account is required to support pod creation in a namespace as part of admission control serviceAccountProvisionTimeout = 2 * time.Minute - // How often to poll for service accounts - serviceAccountPoll = 5 * time.Second + // How long to try single API calls (like 'get' or 'list'). Used to prevent + // transient failures from failing tests. + singleCallTimeout = 30 * time.Second + + // How long nodes have to be "ready" when a test begins. They should already + // be "ready" before the test starts, so this is small. + nodeReadyInitialTimeout = 20 * time.Second + + // How long pods have to be "ready" when a test begins. They should already + // be "ready" before the test starts, so this is small. + podReadyBeforeTimeout = 20 * time.Second + + // How wide to print pod names, by default. Useful for aligning output so + // it can be scanned quickly. 
+ podPrintWidth = 55 ) type CloudConfig struct { @@ -220,7 +233,6 @@ func podRunningReady(p *api.Pod) (bool, error) { if !podReady(p) { return false, fmt.Errorf("pod '%s' on '%s' didn't have condition {%v %v}; conditions: %v", p.ObjectMeta.Name, p.Spec.NodeName, api.PodReady, api.ConditionTrue, p.Status.Conditions) - } return true, nil } @@ -235,15 +247,16 @@ func waitForPodsRunningReady(ns string, minPods int, timeout time.Duration) erro if err != nil { return err } + start := time.Now() Logf("Waiting up to %v for all pods (need at least %d) in namespace '%s' to be running and ready", timeout, minPods, ns) - for start := time.Now(); time.Since(start) < timeout; time.Sleep(podPoll) { + if wait.Poll(poll, timeout, func() (bool, error) { // We get the new list of pods in every iteration because more pods come // online during startup and we want to ensure they are also checked. podList, err := c.Pods(ns).List(labels.Everything(), fields.Everything()) if err != nil { Logf("Error getting pods in namespace '%s': %v", ns, err) - continue + return false, nil } nOk, badPods := 0, []api.Pod{} for _, pod := range podList.Items { @@ -256,14 +269,17 @@ func waitForPodsRunningReady(ns string, minPods int, timeout time.Duration) erro Logf("%d / %d pods in namespace '%s' are running and ready (%d seconds elapsed)", nOk, len(podList.Items), ns, int(time.Since(start).Seconds())) if nOk == len(podList.Items) && nOk >= minPods { - return nil + return true, nil } logPodStates(badPods) + return false, nil + }) != nil { + return fmt.Errorf("Not all pods in namespace '%s' running and ready within %v", ns, timeout) } - return fmt.Errorf("Not all pods in namespace '%s' running and ready within %v", ns, timeout) + return nil } -func waitForServiceAccountInNamespace(c *client.Client, ns, serviceAccountName string, poll, timeout time.Duration) error { +func waitForServiceAccountInNamespace(c *client.Client, ns, serviceAccountName string, timeout time.Duration) error { Logf("Waiting up to %v for service account %s to be provisioned in ns %s", timeout, serviceAccountName, ns) for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) { _, err := c.ServiceAccounts(ns).Get(serviceAccountName) @@ -277,20 +293,23 @@ func waitForServiceAccountInNamespace(c *client.Client, ns, serviceAccountName s return fmt.Errorf("Service account %s in namespace %s not ready within %v", serviceAccountName, ns, timeout) } -func waitForPodCondition(c *client.Client, ns, podName, desc string, poll, timeout time.Duration, condition podCondition) error { - Logf("Waiting up to %v for pod %s status to be %s", timeout, podName, desc) +func waitForPodCondition(c *client.Client, ns, podName, desc string, timeout time.Duration, condition podCondition) error { + Logf("Waiting up to %[1]v for pod %-[2]*[3]s status to be %[4]s", timeout, podPrintWidth, podName, desc) for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) { pod, err := c.Pods(ns).Get(podName) if err != nil { - Logf("Get pod %s in ns %s failed, ignoring for %v: %v", podName, ns, poll, err) + // Aligning this text makes it much more readable + Logf("Get pod %-[1]*[2]s in namespace '%[3]s' failed, ignoring for %[4]v. 
Error: %[5]v", + podPrintWidth, podName, ns, poll, err) continue } done, err := condition(pod) if done { return err } - Logf("Waiting for pod '%s' in namespace '%s' status to be '%q' (found phase: '%q', readiness: %t) (%v)", - podName, ns, desc, pod.Status.Phase, podReady(pod), time.Since(start)) + Logf("Waiting for pod %-[1]*[2]s in namespace '%[3]s' status to be '%[4]s' "+ "(found phase: %[5]q, readiness: %[6]t) (%[7]v elapsed)", + podPrintWidth, podName, ns, desc, pod.Status.Phase, podReady(pod), time.Since(start)) } return fmt.Errorf("gave up waiting for pod '%s' to be '%s' after %v", podName, desc, timeout) } @@ -299,7 +318,7 @@ func waitForPodCondition(c *client.Client, ns, podName, desc string, poll, timeo // the default service account is what is associated with pods when they do not specify a service account // as a result, pods are not able to be provisioned in a namespace until the service account is provisioned func waitForDefaultServiceAccountInNamespace(c *client.Client, namespace string) error { - return waitForServiceAccountInNamespace(c, namespace, "default", serviceAccountPoll, serviceAccountProvisionTimeout) + return waitForServiceAccountInNamespace(c, namespace, "default", serviceAccountProvisionTimeout) } // waitForPersistentVolumePhase waits for a PersistentVolume to be in a specific phase or until timeout occurs, whichever comes first. @@ -343,7 +362,7 @@ func createTestingNS(baseName string, c *client.Client) (*api.Namespace, error) } func waitForPodRunningInNamespace(c *client.Client, podName string, namespace string) error { - return waitForPodCondition(c, namespace, podName, "running", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) { + return waitForPodCondition(c, namespace, podName, "running", podStartTimeout, func(pod *api.Pod) (bool, error) { if pod.Status.Phase == api.PodRunning { return true, nil } @@ -360,7 +379,7 @@ func waitForPodRunning(c *client.Client, podName string) error { // waitForPodNotPending returns an error if it took too long for the pod to go out of pending state. func waitForPodNotPending(c *client.Client, ns, podName string) error { - return waitForPodCondition(c, ns, podName, "!pending", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) { + return waitForPodCondition(c, ns, podName, "!pending", podStartTimeout, func(pod *api.Pod) (bool, error) { if pod.Status.Phase != api.PodPending { Logf("Saw pod '%s' in namespace '%s' out of pending state (found '%q')", podName, ns, pod.Status.Phase) return true, nil @@ -371,7 +390,7 @@ func waitForPodNotPending(c *client.Client, ns, podName string) error { // waitForPodSuccessInNamespace returns nil if the pod reached state success, or an error if it reached failure or ran too long. func waitForPodSuccessInNamespace(c *client.Client, podName string, contName string, namespace string) error { - return waitForPodCondition(c, namespace, podName, "success or failure", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) { + return waitForPodCondition(c, namespace, podName, "success or failure", podStartTimeout, func(pod *api.Pod) (bool, error) { // Cannot use pod.Status.Phase == api.PodSucceeded/api.PodFailed due to #2632 ci, ok := api.GetContainerStatus(pod.Status.ContainerStatuses, contName) if !ok { @@ -1062,6 +1081,19 @@ func DeleteRC(c *client.Client, ns, name string) error { return err } +// listNodes is a convenience wrapper around listing nodes, supporting retries. 
+func listNodes(c *client.Client, label labels.Selector, field fields.Selector) (*api.NodeList, error) { + var nodes *api.NodeList + var errLast error + if wait.Poll(poll, singleCallTimeout, func() (bool, error) { + nodes, errLast = c.Nodes().List(label, field) + return errLast == nil, nil + }) != nil { + return nil, fmt.Errorf("listNodes() failed with last error: %v", errLast) + } + return nodes, nil +} + // FailedContainers inspects all containers in a pod and returns failure // information for containers that have failed or been restarted. // A map is returned where the key is the containerID and the value is a @@ -1187,6 +1219,74 @@ func getSigner(provider string) (ssh.Signer, error) { return util.MakePrivateKeySigner(key) } +// checkPodsRunningReady returns whether all pods whose names are listed in +// podNames are running and ready. +func checkPodsRunningReady(c *client.Client, podNames []string, timeout time.Duration) bool { + np, desc := len(podNames), "running and ready" + Logf("Waiting up to %v for the following %d pods to be %s: %s", timeout, np, desc, podNames) + result := make(chan bool, len(podNames)) + for ix := range podNames { + // Launch off pod readiness checkers. + go func(name string) { + err := waitForPodCondition(c, api.NamespaceDefault, name, desc, timeout, podRunningReady) + result <- err == nil + }(podNames[ix]) + } + // Wait for them all to finish. + success := true + // TODO(mbforbes): Change to `for range` syntax and remove logging once we + // support only Go >= 1.4. + for _, podName := range podNames { + if !<-result { + Logf("Pod %-[1]*[2]s failed to be %[3]s.", podPrintWidth, podName, desc) + success = false + } + } + Logf("Wanted all %d pods to be %s. Result: %t. Pods: %v", np, desc, success, podNames) + return success +} + +// waitForNodeToBeReady returns whether node name is ready within timeout. +func waitForNodeToBeReady(c *client.Client, name string, timeout time.Duration) bool { + return waitForNodeToBe(c, name, true, timeout) +} + +// waitForNodeToBeNotReady returns whether node name is not ready (i.e. the +// readiness condition is anything but ready, e.g. false or unknown) within +// timeout. +func waitForNodeToBeNotReady(c *client.Client, name string, timeout time.Duration) bool { + return waitForNodeToBe(c, name, false, timeout) +} + +// waitForNodeToBe returns whether node name's readiness state matches wantReady +// within timeout. If wantReady is true, it will ensure the node is ready; if +// it's false, it ensures the node is in any state other than ready (e.g. not +// ready or unknown). +func waitForNodeToBe(c *client.Client, name string, wantReady bool, timeout time.Duration) bool { + Logf("Waiting up to %v for node %s readiness to be %t", timeout, name, wantReady) + for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) { + node, err := c.Nodes().Get(name) + if err != nil { + Logf("Couldn't get node %s", name) + continue + } + + // Check the node readiness condition (logging all). + for i, cond := range node.Status.Conditions { + Logf("Node %s condition %d/%d: type: %v, status: %v", + name, i+1, len(node.Status.Conditions), cond.Type, cond.Status) + // Ensure that the condition type is readiness and the status + // matches as desired. 
+ if cond.Type == api.NodeReady && (cond.Status == api.ConditionTrue) == wantReady { + Logf("Successfully found node %s readiness to be %t", name, wantReady) + return true + } + } + } + Logf("Node %s didn't reach desired readiness (%t) within %v", name, wantReady, timeout) + return false +} + // LatencyMetrics stores data about request latency at a given quantile // broken down by verb (e.g. GET, PUT, LIST) and resource (e.g. pods, services). type LatencyMetric struct {
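The helpers this diff adds (listNodes, waitForNPods, checkNodesReady, migTemplate, migRollingUpdateStart, migRollingUpdatePoll) all share one retry idiom: run the check inside wait.Poll, treat transient failures as "not done yet" by returning (false, nil), stash the most recent failure in errLast, and surface errLast only if the whole poll times out. Below is a minimal, self-contained sketch of that idiom; the poll helper is a hypothetical standard-library stand-in that mirrors wait.Poll's semantics, not the real pkg/util/wait API.

package main

import (
	"errors"
	"fmt"
	"time"
)

// poll mirrors the semantics of wait.Poll as this diff uses it: invoke
// condition every interval until it returns true, returns an error, or
// timeout elapses. Stand-in for illustration only.
func poll(interval, timeout time.Duration, condition func() (bool, error)) error {
	for start := time.Now(); time.Since(start) < timeout; time.Sleep(interval) {
		done, err := condition()
		if err != nil {
			return err
		}
		if done {
			return nil
		}
	}
	return errors.New("timed out waiting for the condition")
}

func main() {
	// The errLast idiom: swallow transient failures inside the condition
	// (return false, nil), remember the most recent one, and report it
	// only if the overall poll times out.
	var errLast error
	attempts := 0
	err := poll(10*time.Millisecond, 100*time.Millisecond, func() (bool, error) {
		attempts++
		if attempts < 3 { // simulate two transient API failures
			errLast = fmt.Errorf("transient failure on attempt %d", attempts)
			return false, nil
		}
		return true, nil
	})
	if err != nil {
		fmt.Printf("failed with last error: %v\n", errLast)
		return
	}
	fmt.Printf("succeeded after %d attempts\n", attempts)
}

The payoff of this pattern is that a single flaky API call never fails a test, while a genuine timeout still reports the most informative error seen, rather than a bare "timed out".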