From c458ee9dcbf558f133ee583699243b197c9e6d12 Mon Sep 17 00:00:00 2001
From: Mike Danese
Date: Wed, 3 Jun 2015 16:18:26 -0700
Subject: [PATCH] add e2e test for etcd failure

---
 test/e2e/etcd_failure.go | 143 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)
 create mode 100644 test/e2e/etcd_failure.go

diff --git a/test/e2e/etcd_failure.go b/test/e2e/etcd_failure.go
new file mode 100644
index 00000000000..be123c3605e
--- /dev/null
+++ b/test/e2e/etcd_failure.go
@@ -0,0 +1,143 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package e2e
+
+import (
+	"fmt"
+	"time"
+
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/util/wait"
+
+	. "github.com/onsi/ginkgo"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("Etcd failure", func() {
+
+	framework := Framework{BaseName: "etcd-failure"}
+
+	BeforeEach(func() {
+		framework.beforeEach()
+
+		Expect(RunRC(RCConfig{
+			Client:    framework.Client,
+			Name:      "baz",
+			Namespace: framework.Namespace.Name,
+			Image:     "nginx",
+			Replicas:  1,
+		})).NotTo(HaveOccurred())
+	})
+
+	AfterEach(framework.afterEach)
+
+	It("should recover from network partition with master", func() {
+		etcdFailTest(
+			framework,
+			"sudo iptables -A INPUT -p tcp --destination-port 4001 -j DROP",
+			"sudo iptables -D INPUT -p tcp --destination-port 4001 -j DROP",
+			false,
+		)
+	})
+
+	It("should recover from SIGKILL", func() {
+		etcdFailTest(
+			framework,
+			"pgrep etcd | xargs -I {} sudo kill -9 {}",
+			"echo 'do nothing. monit should restart etcd.'",
+			true,
+		)
+	})
+})
+
+func etcdFailTest(framework Framework, failCommand, fixCommand string, repeat bool) {
+	// This test requires SSH, so the provider check should be identical to
+	// that of the other tests that use SSH.
+	if !providerIs("gce", "gke") {
+		By(fmt.Sprintf("Skipping test, which is not implemented for %s", testContext.Provider))
+		return
+	}
+
+	doEtcdFailure(failCommand, fixCommand, repeat)
+
+	checkExistingRCRecovers(framework)
+
+	ServeImageOrFail(framework.Client, "basic", "gcr.io/google_containers/serve_hostname:1.1")
+}
+
+// For this duration, etcd will be failed by executing a failCommand on the master.
+// If repeat is true, the failCommand will be called at a rate of once per second for
+// the failure duration. If repeat is false, failCommand will only be called once at the
+// beginning of the failure duration. After this duration, we execute a fixCommand on the
+// master and go on to assert that etcd and kubernetes components recover.
+const etcdFailureDuration = 20 * time.Second
+
+func doEtcdFailure(failCommand, fixCommand string, repeat bool) {
+	By("failing etcd")
+
+	if repeat {
+		stop := make(chan struct{}, 1)
+		go util.Until(func() {
+			defer GinkgoRecover()
+			masterExec(failCommand)
+		}, 1*time.Second, stop)
+		time.Sleep(etcdFailureDuration)
+		stop <- struct{}{}
+	} else {
+		masterExec(failCommand)
+		time.Sleep(etcdFailureDuration)
+	}
+	masterExec(fixCommand)
+}
+
+func masterExec(cmd string) {
+	stdout, stderr, code, err := SSH(cmd, getMasterHost()+":22", testContext.Provider)
+	Expect(err).NotTo(HaveOccurred())
+	if code != 0 {
+		Failf("master exec command, '%v' failed with exitcode %v: \n\tstdout: %v\n\tstderr: %v", cmd, code, stdout, stderr)
+	}
+}
+
+func checkExistingRCRecovers(f Framework) {
+	By("assert that the pre-existing replication controller recovers")
+	podClient := f.Client.Pods(f.Namespace.Name)
+	rcSelector := labels.Set{"name": "baz"}.AsSelector()
+
+	By("deleting pods from existing replication controller")
+	pods, err := podClient.List(rcSelector, fields.Everything())
+	Expect(err).NotTo(HaveOccurred())
+	Expect(len(pods.Items) > 0).Should(BeTrue())
+	for _, pod := range pods.Items {
+		err = podClient.Delete(pod.Name, api.NewDeleteOptions(0))
+		Expect(err).NotTo(HaveOccurred())
+	}
+
+	By("waiting for replication controller to recover")
+	expectNoError(wait.Poll(time.Millisecond*500, time.Second*30, func() (bool, error) {
+		pods, err := podClient.List(rcSelector, fields.Everything())
+		Expect(err).NotTo(HaveOccurred())
+		for _, pod := range pods.Items {
+			if api.IsPodReady(&pod) {
+				return true, nil
+			}
+		}
+		return false, nil
+	}))
+}