From 881ff89d9275b2d74dae8cee83228578c8fe8064 Mon Sep 17 00:00:00 2001 From: Jeff Lowdermilk Date: Fri, 20 Feb 2015 13:55:20 -0800 Subject: [PATCH] Deflake cadvisor e2e by adding retry loop Wait up to 5 minutes for cadvisor to start on nodes to reduce test failures from running immediately after cluster turnup. --- test/e2e/cadvisor.go | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/test/e2e/cadvisor.go b/test/e2e/cadvisor.go index 27dc943cb0f..d1f833c2ccb 100644 --- a/test/e2e/cadvisor.go +++ b/test/e2e/cadvisor.go @@ -34,21 +34,31 @@ var _ = Describe("Cadvisor", func() { expectNoError(err) }) - It("cadvisor should be healthy on every node.", func() { - CheckCadvisorHealthOnAllNodes(c) + It("should be healthy on every node.", func() { + CheckCadvisorHealthOnAllNodes(c, 5*time.Minute) }) }) -func CheckCadvisorHealthOnAllNodes(c *client.Client) { +func CheckCadvisorHealthOnAllNodes(c *client.Client, timeout time.Duration) { By("getting list of nodes") nodeList, err := c.Nodes().List() expectNoError(err) - for _, node := range nodeList.Items { - // cadvisor is not accessible directly unless its port (4194 by default) is exposed. - // Here, we access '/stats/' REST endpoint on the kubelet which polls cadvisor internally. - statsResource := fmt.Sprintf("api/v1beta1/proxy/minions/%s/stats/", node.Name) - By(fmt.Sprintf("Querying stats from node %s using url %s", node.Name, statsResource)) - _, err = c.Get().AbsPath(statsResource).Timeout(1 * time.Second).Do().Raw() - expectNoError(err) + var errors []error + for start := time.Now(); time.Since(start) < timeout; time.Sleep(5 * time.Second) { + errors = []error{} + for _, node := range nodeList.Items { + // cadvisor is not accessible directly unless its port (4194 by default) is exposed. + // Here, we access '/stats/' REST endpoint on the kubelet which polls cadvisor internally. + statsResource := fmt.Sprintf("api/v1beta1/proxy/minions/%s/stats/", node.Name) + By(fmt.Sprintf("Querying stats from node %s using url %s", node.Name, statsResource)) + _, err = c.Get().AbsPath(statsResource).Timeout(1 * time.Second).Do().Raw() + if err != nil { + errors = append(errors, err) + } + } + if len(errors) == 0 { + return + } } + Failf("Timed out after %v waiting for cadvisor to be healthy on all nodes. Errors:\n%v", timeout, errors) }