From 8ce6c2db96a9db8d8d10094fae3e278cc47226c2 Mon Sep 17 00:00:00 2001
From: Justin Santa Barbara
Date: Mon, 6 Jul 2015 00:38:53 -0400
Subject: [PATCH 1/4] e2e: Check that node is healthy before isolating from master

We simulate network failure, check the node is offline, and then check it
recovers. This adds a check that the node is online before simulating
network failure.
---
 test/e2e/resize_nodes.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/e2e/resize_nodes.go b/test/e2e/resize_nodes.go
index f3488187cdf..0ef0434a0e0 100644
--- a/test/e2e/resize_nodes.go
+++ b/test/e2e/resize_nodes.go
@@ -375,6 +375,9 @@ func performTemporaryNetworkFailure(c *client.Client, ns, rcName string, replica
 		}
 	}()
 
+	Logf("Waiting for node %s to be ready", node.Name)
+	waitForNodeToBe(c, node.Name, true, 2*time.Minute)
+
 	// The command will block all outgoing network traffic from the node to the master
 	// When multi-master is implemented, this test will have to be improved to block
 	// network traffic to all masters.

From 638a19eb2970c30a0f80b9f6c15763e95ca720d3 Mon Sep 17 00:00:00 2001
From: Justin Santa Barbara
Date: Mon, 6 Jul 2015 08:32:56 -0400
Subject: [PATCH 2/4] Spelling (non-code): test/e2e/resize_nodes.go

---
 test/e2e/resize_nodes.go | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/test/e2e/resize_nodes.go b/test/e2e/resize_nodes.go
index 0ef0434a0e0..683259db0b2 100644
--- a/test/e2e/resize_nodes.go
+++ b/test/e2e/resize_nodes.go
@@ -40,7 +40,7 @@ const serveHostnameImage = "gcr.io/google_containers/serve_hostname:1.1"
 
 func resizeGroup(size int) error {
 	if testContext.Provider == "gce" || testContext.Provider == "gke" {
-		// TODO: make this hit the compute API directly instread of shelling out to gcloud.
+		// TODO: make this hit the compute API directly instead of shelling out to gcloud.
 		// TODO: make gce/gke implement InstanceGroups, so we can eliminate the per-provider logic
 		output, err := exec.Command("gcloud", "preview", "managed-instance-groups", "--project="+testContext.CloudConfig.ProjectID,
 			"--zone="+testContext.CloudConfig.Zone, "resize", testContext.CloudConfig.NodeInstanceGroup, fmt.Sprintf("--new-size=%v", size)).CombinedOutput()
@@ -60,7 +60,7 @@ func resizeGroup(size int) error {
 
 func groupSize() (int, error) {
 	if testContext.Provider == "gce" || testContext.Provider == "gke" {
-		// TODO: make this hit the compute API directly instread of shelling out to gcloud.
+		// TODO: make this hit the compute API directly instead of shelling out to gcloud.
 		// TODO: make gce/gke implement InstanceGroups, so we can eliminate the per-provider logic
 		output, err := exec.Command("gcloud", "preview", "managed-instance-groups", "--project="+testContext.CloudConfig.ProjectID,
 			"--zone="+testContext.CloudConfig.Zone, "describe", testContext.CloudConfig.NodeInstanceGroup).CombinedOutput()
@@ -381,7 +381,7 @@ func performTemporaryNetworkFailure(c *client.Client, ns, rcName string, replica
 	// The command will block all outgoing network traffic from the node to the master
 	// When multi-master is implemented, this test will have to be improved to block
 	// network traffic to all masters.
-	// We could also block network traffic from the master(s)s to this node,
+	// We could also block network traffic from the master(s) to this node,
 	// but blocking it one way is sufficient for this test.
 	dropCmd := fmt.Sprintf("sudo iptables --insert %s", iptablesRule)
 	if _, _, code, err := SSH(dropCmd, host, testContext.Provider); code != 0 || err != nil {
@@ -399,7 +399,7 @@ func performTemporaryNetworkFailure(c *client.Client, ns, rcName string, replica
 	err := verifyPods(c, ns, rcName, true, replicas)
 	Expect(err).NotTo(HaveOccurred())
 
-	// network traffic is unblocked in a defered function
+	// network traffic is unblocked in a deferred function
 }
 
 var _ = Describe("Nodes", func() {
@@ -455,7 +455,7 @@ var _ = Describe("Nodes", func() {
 
 	It("should be able to delete nodes", func() {
 		// Create a replication controller for a service that serves its hostname.
-		// The source for the Docker containter kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
+		// The source for the Docker container kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
 		name := "my-hostname-delete-node"
 		replicas := testContext.CloudConfig.NumNodes
 		newRCByName(c, ns, name, replicas)
@@ -478,7 +478,7 @@ var _ = Describe("Nodes", func() {
 	// TODO: Bug here - testName is not correct
 	It("should be able to add nodes", func() {
 		// Create a replication controller for a service that serves its hostname.
-		// The source for the Docker containter kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
+		// The source for the Docker container kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
 		name := "my-hostname-add-node"
 		newSVCByName(c, ns, name)
 		replicas := testContext.CloudConfig.NumNodes
@@ -519,7 +519,7 @@ var _ = Describe("Nodes", func() {
 			"AND allows scheduling of pods on a minion after it rejoins the cluster", func() {
 
 			// Create a replication controller for a service that serves its hostname.
-			// The source for the Docker containter kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
+			// The source for the Docker container kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
 			name := "my-hostname-net"
 			newSVCByName(c, ns, name)
 			replicas := testContext.CloudConfig.NumNodes

From 9cd746e74df2661e44a46b110258cb088a75d301 Mon Sep 17 00:00:00 2001
From: Justin Santa Barbara
Date: Mon, 6 Jul 2015 08:33:46 -0400
Subject: [PATCH 3/4] e2e: check return values in resize_nodes

We weren't actually testing the return values in resize_nodes; and I don't
believe that was deliberate.
---
 test/e2e/resize_nodes.go | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/test/e2e/resize_nodes.go b/test/e2e/resize_nodes.go
index 683259db0b2..3cbcd6ee2e7 100644
--- a/test/e2e/resize_nodes.go
+++ b/test/e2e/resize_nodes.go
@@ -376,7 +376,9 @@ func performTemporaryNetworkFailure(c *client.Client, ns, rcName string, replica
 	}()
 
 	Logf("Waiting for node %s to be ready", node.Name)
-	waitForNodeToBe(c, node.Name, true, 2*time.Minute)
+	if !waitForNodeToBe(c, node.Name, true, 2*time.Minute) {
+		Failf("Node did not become ready")
+	}
 
 	// The command will block all outgoing network traffic from the node to the master
 	// When multi-master is implemented, this test will have to be improved to block
@@ -390,13 +392,16 @@ func performTemporaryNetworkFailure(c *client.Client, ns, rcName string, replica
 	}
 
 	Logf("Waiting for node %s to be not ready", node.Name)
-	waitForNodeToBe(c, node.Name, false, 2*time.Minute)
+	if !waitForNodeToBe(c, node.Name, false, 2*time.Minute) {
+		Failf("Node did not become not-ready")
+	}
 
 	Logf("Waiting for pod %s to be removed", podNameToDisappear)
-	waitForRCPodToDisappear(c, ns, rcName, podNameToDisappear)
+	err := waitForRCPodToDisappear(c, ns, rcName, podNameToDisappear)
+	Expect(err).NotTo(HaveOccurred())
 
 	By("verifying whether the pod from the unreachable node is recreated")
-	err := verifyPods(c, ns, rcName, true, replicas)
+	err = verifyPods(c, ns, rcName, true, replicas)
 	Expect(err).NotTo(HaveOccurred())
 
 	// network traffic is unblocked in a deferred function
@@ -539,7 +544,9 @@ var _ = Describe("Nodes", func() {
 			By(fmt.Sprintf("block network traffic from node %s", node.Name))
 			performTemporaryNetworkFailure(c, ns, name, replicas, pods.Items[0].Name, node)
 			Logf("Waiting for node %s to be ready", node.Name)
-			waitForNodeToBe(c, node.Name, true, 2*time.Minute)
+			if !waitForNodeToBe(c, node.Name, true, 2*time.Minute) {
+				Failf("Node did not become ready")
+			}
 
 			By("verify whether new pods can be created on the re-attached node")
 			// increasing the RC size is not a valid way to test this

From 62bb38af214a3773294212b9ebd529fd254d23fa Mon Sep 17 00:00:00 2001
From: Justin Santa Barbara
Date: Sun, 26 Jul 2015 22:24:26 -0400
Subject: [PATCH 4/4] e2e: Improve logging while waiting for node readiness

Per suggestions by mbforbes
---
 test/e2e/resize_nodes.go | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/test/e2e/resize_nodes.go b/test/e2e/resize_nodes.go
index 3cbcd6ee2e7..9a2f39566bb 100644
--- a/test/e2e/resize_nodes.go
+++ b/test/e2e/resize_nodes.go
@@ -38,6 +38,9 @@ import (
 
 const serveHostnameImage = "gcr.io/google_containers/serve_hostname:1.1"
 
+const resizeNodeReadyTimeout = 2 * time.Minute
+const resizeNodeNotReadyTimeout = 2 * time.Minute
+
 func resizeGroup(size int) error {
 	if testContext.Provider == "gce" || testContext.Provider == "gke" {
 		// TODO: make this hit the compute API directly instead of shelling out to gcloud.
@@ -101,7 +104,8 @@ func groupSize() (int, error) {
 }
 
 func waitForGroupSize(size int) error {
-	for start := time.Now(); time.Since(start) < 4*time.Minute; time.Sleep(5 * time.Second) {
+	timeout := 4 * time.Minute
+	for start := time.Now(); time.Since(start) < timeout; time.Sleep(5 * time.Second) {
 		currentSize, err := groupSize()
 		if err != nil {
 			Logf("Failed to get node instance group size: %v", err)
@@ -114,7 +118,7 @@ func groupSize() (int, error) {
 		Logf("Node instance group has reached the desired size %d", size)
 		return nil
 	}
-	return fmt.Errorf("timeout waiting for node instance group size to be %d", size)
+	return fmt.Errorf("timeout waiting %v for node instance group size to be %d", timeout, size)
 }
 
 func waitForClusterSize(c *client.Client, size int) error {
@@ -136,7 +140,7 @@ func waitForClusterSize(c *client.Client, size int) error {
 		}
 		Logf("Waiting for cluster size %d, current size %d", size, len(nodes.Items))
 	}
-	return fmt.Errorf("timeout waiting for cluster size to be %d", size)
+	return fmt.Errorf("timeout waiting %v for cluster size to be %d", timeout, size)
 }
 
 func svcByName(name string) *api.Service {
@@ -254,9 +258,10 @@ func resizeRC(c *client.Client, ns, name string, replicas int) error {
 }
 
 func podsCreated(c *client.Client, ns, name string, replicas int) (*api.PodList, error) {
+	timeout := time.Minute
 	// List the pods, making sure we observe all the replicas.
 	label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name}))
-	for start := time.Now(); time.Since(start) < time.Minute; time.Sleep(5 * time.Second) {
+	for start := time.Now(); time.Since(start) < timeout; time.Sleep(5 * time.Second) {
 		pods, err := c.Pods(ns).List(label, fields.Everything())
 		if err != nil {
 			return nil, err
@@ -267,7 +272,7 @@ func podsCreated(c *client.Client, ns, name string, replicas int) (*api.PodList,
 			return pods, nil
 		}
 	}
-	return nil, fmt.Errorf("Pod name %s: Gave up waiting for %d pods to come up", name, replicas)
+	return nil, fmt.Errorf("Pod name %s: Gave up waiting %v for %d pods to come up", name, timeout, replicas)
 }
 
 func podsRunning(c *client.Client, pods *api.PodList) []error {
@@ -375,9 +380,9 @@ func performTemporaryNetworkFailure(c *client.Client, ns, rcName string, replica
 		}
 	}()
 
-	Logf("Waiting for node %s to be ready", node.Name)
-	if !waitForNodeToBe(c, node.Name, true, 2*time.Minute) {
-		Failf("Node did not become ready")
+	Logf("Waiting %v to ensure node %s is ready before beginning test...", resizeNodeReadyTimeout, node.Name)
+	if !waitForNodeToBe(c, node.Name, true, resizeNodeReadyTimeout) {
+		Failf("Node %s did not become ready within %v", node.Name, resizeNodeReadyTimeout)
 	}
 
 	// The command will block all outgoing network traffic from the node to the master
@@ -391,9 +396,9 @@ func performTemporaryNetworkFailure(c *client.Client, ns, rcName string, replica
 			dropCmd, node.Name, code, err)
 	}
 
-	Logf("Waiting for node %s to be not ready", node.Name)
-	if !waitForNodeToBe(c, node.Name, false, 2*time.Minute) {
-		Failf("Node did not become not-ready")
+	Logf("Waiting %v for node %s to be not ready after simulated network failure", resizeNodeNotReadyTimeout, node.Name)
+	if !waitForNodeToBe(c, node.Name, false, resizeNodeNotReadyTimeout) {
+		Failf("Node %s did not become not-ready within %v", node.Name, resizeNodeNotReadyTimeout)
 	}
 
 	Logf("Waiting for pod %s to be removed", podNameToDisappear)
@@ -543,9 +548,9 @@ var _ = Describe("Nodes", func() {
 
 			By(fmt.Sprintf("block network traffic from node %s", node.Name))
 			performTemporaryNetworkFailure(c, ns, name, replicas, pods.Items[0].Name, node)
-			Logf("Waiting for node %s to be ready", node.Name)
-			if !waitForNodeToBe(c, node.Name, true, 2*time.Minute) {
-				Failf("Node did not become ready")
+			Logf("Waiting %v for node %s to be ready once temporary network failure ends", resizeNodeReadyTimeout, node.Name)
+			if !waitForNodeToBe(c, node.Name, true, resizeNodeReadyTimeout) {
+				Failf("Node %s did not become ready within %v", node.Name, resizeNodeReadyTimeout)
 			}
 
 			By("verify whether new pods can be created on the re-attached node")
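
The readiness checks added in this series all reduce to the same poll-until-timeout pattern that waitForGroupSize, waitForClusterSize, and podsCreated already use: loop until the condition holds or the deadline passes, and report the outcome so the caller can fail with a useful message. A minimal, self-contained sketch of that pattern follows; the waitForCondition helper and the nodeIsReady closure are illustrative stand-ins, not the e2e framework's helpers.

package main

import (
	"fmt"
	"time"
)

// waitForCondition polls check every poll interval until it returns true or
// timeout elapses. It mirrors the "for start := time.Now(); time.Since(start) < timeout"
// loops in resize_nodes.go and, like waitForNodeToBe, reports success as a bool
// so the caller can fail with a message that includes the timeout.
func waitForCondition(timeout, poll time.Duration, check func() bool) bool {
	for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
		if check() {
			return true
		}
	}
	return false
}

func main() {
	deadline := time.Now().Add(3 * time.Second)
	// Stand-in for a node readiness probe; a real caller would query the API server.
	nodeIsReady := func() bool { return time.Now().After(deadline) }

	if !waitForCondition(10*time.Second, time.Second, nodeIsReady) {
		fmt.Println("node did not become ready within the timeout")
		return
	}
	fmt.Println("node is ready")
}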
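The partition itself is driven by a single iptables rule inserted on the node over SSH (the dropCmd shown in the diff context) and removed again in a deferred cleanup so connectivity is always restored. The sketch below shows the shape of that block/unblock cycle; the host name, the example rule, the undo command, and the runOverSSH helper are hypothetical stand-ins for the framework's SSH utilities and the real iptablesRule.

package main

import (
	"fmt"
	"os/exec"
)

// runOverSSH is a stand-in for the e2e framework's SSH helper; it simply runs
// the command through the local ssh binary and surfaces its combined output on error.
func runOverSSH(host, cmd string) error {
	out, err := exec.Command("ssh", host, cmd).CombinedOutput()
	if err != nil {
		return fmt.Errorf("ssh %q failed: %v: %s", cmd, err, out)
	}
	return nil
}

func main() {
	host := "node-1.example.com" // hypothetical node address
	// Example rule: drop outbound traffic to the master's IP (illustrative only).
	iptablesRule := "OUTPUT --destination 10.0.0.1 --jump DROP"

	dropCmd := fmt.Sprintf("sudo iptables --insert %s", iptablesRule)
	undoCmd := fmt.Sprintf("sudo iptables --delete %s", iptablesRule)

	if err := runOverSSH(host, dropCmd); err != nil {
		fmt.Println(err)
		return
	}
	// Always restore connectivity, mirroring the deferred cleanup in the test.
	defer func() {
		if err := runOverSSH(host, undoCmd); err != nil {
			fmt.Println(err)
		}
	}()

	fmt.Println("node is now isolated from the master; run the not-ready and rescheduling checks here")
}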