From ed91d5564b84cb8aa7228172dcaca279ee9d684b Mon Sep 17 00:00:00 2001
From: Brendan Burns
Date: Sat, 7 Nov 2015 21:53:36 -0800
Subject: [PATCH] Refactor the reboot test to print accurate information about
 node failures

As well as events from the kube-system namespace
---
 test/e2e/reboot.go | 82 +++++++++++++++++++++++++++++-----------------
 1 file changed, 52 insertions(+), 30 deletions(-)

diff --git a/test/e2e/reboot.go b/test/e2e/reboot.go
index 9e31863192d..ee44fe6992d 100644
--- a/test/e2e/reboot.go
+++ b/test/e2e/reboot.go
@@ -18,6 +18,7 @@ package e2e

 import (
 	"fmt"
+	"sync"
 	"time"

 	"k8s.io/kubernetes/pkg/api"
@@ -26,6 +27,7 @@ import (
 	"k8s.io/kubernetes/pkg/labels"

 	. "github.com/onsi/ginkgo"
+	. "github.com/onsi/gomega"
 )

 const (
@@ -43,16 +45,33 @@ const (
 )

 var _ = Describe("Reboot", func() {
-	f := NewFramework("reboot")
+	var f *Framework

 	BeforeEach(func() {
 		// These tests requires SSH to nodes, so the provider check should be identical to there
 		// (the limiting factor is the implementation of util.go's getSigner(...)).

 		// Cluster must support node reboot
-		SkipUnlessProviderIs("gce", "gke", "aws")
+		SkipUnlessProviderIs(providersWithSSH...)
 	})

+	AfterEach(func() {
+		if CurrentGinkgoTestDescription().Failed {
+			// Most of the reboot tests just make sure that addon/system pods are running, so dump
+			// events for the kube-system namespace on failures
+			namespaceName := api.NamespaceSystem
+			By(fmt.Sprintf("Collecting events from namespace %q.", namespaceName))
+			events, err := f.Client.Events(namespaceName).List(labels.Everything(), fields.Everything())
+			Expect(err).NotTo(HaveOccurred())
+
+			for _, e := range events.Items {
+				Logf("event for %v: %v %v: %v", e.InvolvedObject.Name, e.Source, e.Reason, e.Message)
+			}
+		}
+	})
+
+	f = NewFramework("reboot")
+
 	It("each node by ordering clean reboot and ensure they function upon restart", func() {
 		// clean shutdown and restart
 		// We sleep 10 seconds to give some time for ssh command to cleanly finish before the node is rebooted.
@@ -100,22 +119,32 @@ func testReboot(c *client.Client, rebootCmd string) {
 	if err != nil {
 		Failf("Error getting nodes: %v", err)
 	}
-	result := make(chan bool, len(nodelist.Items))
-	for _, n := range nodelist.Items {
-		go rebootNode(c, testContext.Provider, n.ObjectMeta.Name, rebootCmd, result)
+	result := make([]bool, len(nodelist.Items))
+	wg := sync.WaitGroup{}
+	wg.Add(len(nodelist.Items))
+
+	failed := false
+	for ix := range nodelist.Items {
+		go func(ix int) {
+			defer wg.Done()
+			n := nodelist.Items[ix]
+			result[ix] = rebootNode(c, testContext.Provider, n.ObjectMeta.Name, rebootCmd)
+			if !result[ix] {
+				failed = true
+			}
+		}(ix)
 	}

 	// Wait for all to finish and check the final result.
-	failed := false
-	// TODO(a-robinson): Change to `for range` syntax and remove logging once
-	// we support only Go >= 1.4.
-	for _, n := range nodelist.Items {
-		if !<-result {
-			Failf("Node %s failed reboot test.", n.ObjectMeta.Name)
-			failed = true
-		}
-	}
-	if failed {
+	wg.Wait()
+
+	if failed {
+		for ix := range nodelist.Items {
+			n := nodelist.Items[ix]
+			if !result[ix] {
+				Logf("Node %s failed reboot test.", n.ObjectMeta.Name)
+			}
+		}
 		Failf("Test failed; at least one node failed to reboot in the time given.")
 	}
 }
@@ -149,7 +178,7 @@ func issueSSHCommand(node *api.Node, provider, cmd string) error {
 //
 // It returns true through result only if all of the steps pass; at the first
 // failed step, it will return false through result and not run the rest.
-func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan bool) {
+func rebootNode(c *client.Client, provider, name, rebootCmd string) bool {
 	// Setup
 	ns := api.NamespaceSystem
 	ps := newPodStore(c, ns, labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name))
@@ -160,14 +189,12 @@ func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan
 	node, err := c.Nodes().Get(name)
 	if err != nil {
 		Logf("Couldn't get node %s", name)
-		result <- false
-		return
+		return false
 	}

 	// Node sanity check: ensure it is "ready".
 	if !waitForNodeToBeReady(c, name, nodeReadyInitialTimeout) {
-		result <- false
-		return
+		return false
 	}

 	// Get all the pods on the node that don't have liveness probe set.
@@ -191,36 +218,31 @@ func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan
 	// For each pod, we do a sanity check to ensure it's running / healthy
 	// now, as that's what we'll be checking later.
 	if !checkPodsRunningReady(c, ns, podNames, podReadyBeforeTimeout) {
-		result <- false
-		return
+		return false
 	}

 	// Reboot the node.
 	if err = issueSSHCommand(node, provider, rebootCmd); err != nil {
 		Logf("Error while issuing ssh command: %v", err)
-		result <- false
-		return
+		return false
 	}

 	// Wait for some kind of "not ready" status.
 	if !waitForNodeToBeNotReady(c, name, rebootNodeNotReadyTimeout) {
-		result <- false
-		return
+		return false
 	}

 	// Wait for some kind of "ready" status.
 	if !waitForNodeToBeReady(c, name, rebootNodeReadyAgainTimeout) {
-		result <- false
-		return
+		return false
 	}

 	// Ensure all of the pods that we found on this node before the reboot are
 	// running / healthy.
 	if !checkPodsRunningReady(c, ns, podNames, rebootPodReadyAgainTimeout) {
-		result <- false
-		return
+		return false
 	}

 	Logf("Reboot successful on node %s", name)
-	result <- true
+	return true
 }
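Note for reviewers: the core of the change is replacing a buffered result channel with a fan-out over a sync.WaitGroup plus a per-index result slice. Each goroutine writes only its own slice element, and wg.Wait() provides the happens-before edge that makes reading the slice afterwards safe. The sketch below is a minimal, self-contained illustration of that pattern, not code from the patch; checkNode is a hypothetical stand-in for rebootNode, and the node names are made up. Unlike the patch, it derives the failed flag after Wait rather than setting a shared bool from inside the goroutines, which sidesteps the unsynchronized boolean writes that Go's race detector would flag.

// fanout.go: minimal sketch of the WaitGroup + result-slice pattern adopted
// by the patch. checkNode is a hypothetical stand-in for rebootNode.
package main

import (
	"fmt"
	"sync"
)

// checkNode simulates a per-node check that reports success or failure.
func checkNode(name string) bool {
	return name != "node-1" // pretend node-1 fails its reboot
}

func main() {
	nodes := []string{"node-0", "node-1", "node-2"}
	result := make([]bool, len(nodes))

	wg := sync.WaitGroup{}
	wg.Add(len(nodes))
	for ix := range nodes {
		go func(ix int) {
			defer wg.Done()
			// Each goroutine writes only result[ix], so the slice needs no
			// locking; wg.Wait() below orders these writes before the reads
			// in the summary loop.
			result[ix] = checkNode(nodes[ix])
		}(ix)
	}
	wg.Wait()

	// Derive the aggregate flag here, after Wait, instead of setting a
	// shared bool inside the goroutines; this keeps the sketch race-free.
	failed := false
	for ix := range nodes {
		if !result[ix] {
			failed = true
			fmt.Printf("Node %s failed reboot test.\n", nodes[ix])
		}
	}
	if failed {
		fmt.Println("Test failed; at least one node failed to reboot in the time given.")
	}
}

Run as-is, the sketch prints the per-node failure line for node-1 followed by the aggregate failure message, mirroring the Logf-then-Failf reporting order the patch introduces in testReboot.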