Deflake cadvisor e2e by adding retry loop

Wait up to 5 minutes for cadvisor to start on nodes to reduce
test failures from running immediately after cluster turnup.
This commit is contained in:
Jeff Lowdermilk 2015-02-20 13:55:20 -08:00
parent c2c86b0a6d
commit 881ff89d92

View File

@ -34,21 +34,31 @@ var _ = Describe("Cadvisor", func() {
expectNoError(err)
})
It("cadvisor should be healthy on every node.", func() {
CheckCadvisorHealthOnAllNodes(c)
It("should be healthy on every node.", func() {
CheckCadvisorHealthOnAllNodes(c, 5*time.Minute)
})
})
func CheckCadvisorHealthOnAllNodes(c *client.Client) {
func CheckCadvisorHealthOnAllNodes(c *client.Client, timeout time.Duration) {
By("getting list of nodes")
nodeList, err := c.Nodes().List()
expectNoError(err)
for _, node := range nodeList.Items {
// cadvisor is not accessible directly unless its port (4194 by default) is exposed.
// Here, we access '/stats/' REST endpoint on the kubelet which polls cadvisor internally.
statsResource := fmt.Sprintf("api/v1beta1/proxy/minions/%s/stats/", node.Name)
By(fmt.Sprintf("Querying stats from node %s using url %s", node.Name, statsResource))
_, err = c.Get().AbsPath(statsResource).Timeout(1 * time.Second).Do().Raw()
expectNoError(err)
var errors []error
for start := time.Now(); time.Since(start) < timeout; time.Sleep(5 * time.Second) {
errors = []error{}
for _, node := range nodeList.Items {
// cadvisor is not accessible directly unless its port (4194 by default) is exposed.
// Here, we access '/stats/' REST endpoint on the kubelet which polls cadvisor internally.
statsResource := fmt.Sprintf("api/v1beta1/proxy/minions/%s/stats/", node.Name)
By(fmt.Sprintf("Querying stats from node %s using url %s", node.Name, statsResource))
_, err = c.Get().AbsPath(statsResource).Timeout(1 * time.Second).Do().Raw()
if err != nil {
errors = append(errors, err)
}
}
if len(errors) == 0 {
return
}
}
Failf("Timed out after %v waiting for cadvisor to be healthy on all nodes. Errors:\n%v", timeout, errors)
}