Workaround KubeProxy failures in test framework

Wojciech Tyczynski 2016-06-24 10:27:50 +02:00
parent eedc438da9
commit 77679b0437
5 changed files with 39 additions and 5 deletions

View File

@@ -125,7 +125,7 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte {
// test pods from running, and tests that ensure all pods are running and
// ready will fail).
podStartupTimeout := framework.TestContext.SystemPodsStartupTimeout
if err := framework.WaitForPodsRunningReady(c, api.NamespaceSystem, int32(framework.TestContext.MinStartupPods), podStartupTimeout, framework.ImagePullerLabels); err != nil {
if err := framework.WaitForPodsRunningReady(c, api.NamespaceSystem, int32(framework.TestContext.MinStartupPods), podStartupTimeout, framework.ImagePullerLabels, true); err != nil {
framework.DumpAllNamespaceInfo(c, api.NamespaceSystem)
framework.LogFailedContainers(c, api.NamespaceSystem)
framework.RunKubernetesServiceTestContainer(c, framework.TestContext.RepoRoot, api.NamespaceDefault)

View File

@@ -554,7 +554,7 @@ func WaitForPodsSuccess(c *client.Client, ns string, successPodLabels map[string
// even if there are minPods pods, some of which are in Running/Ready
// and some in Success. This is to allow the client to decide if "Success"
// means "Ready" or not.
func WaitForPodsRunningReady(c *client.Client, ns string, minPods int32, timeout time.Duration, ignoreLabels map[string]string) error {
func WaitForPodsRunningReady(c *client.Client, ns string, minPods int32, timeout time.Duration, ignoreLabels map[string]string, restartDockerOnFailures bool) error {
ignoreSelector := labels.SelectorFromSet(ignoreLabels)
start := time.Now()
Logf("Waiting up to %v for all pods (need at least %d) in namespace '%s' to be running and ready",
@@ -567,6 +567,10 @@ func WaitForPodsRunningReady(c *client.Client, ns string, minPods int32, timeout
wg.Done()
}()
// We will be restarting all not-ready kube-proxies every 5 minutes,
// to work around issue #25543.
badKubeProxySince := make(map[string]time.Time)
if wait.PollImmediate(Poll, timeout, func() (bool, error) {
// We get the new list of pods and replication controllers in every
// iteration because more pods come online during startup and we want to
@@ -597,6 +601,8 @@ func WaitForPodsRunningReady(c *client.Client, ns string, minPods int32, timeout
if hasReplicationControllersForPod(rcList, pod) {
replicaOk++
}
// If the pod is healthy, remove it from the set of bad kube-proxies.
delete(badKubeProxySince, pod.Name)
} else {
if pod.Status.Phase != api.PodFailed {
Logf("The status of Pod %s is %s, waiting for it to be either Running or Failed", pod.ObjectMeta.Name, pod.Status.Phase)
@@ -609,6 +615,34 @@ func WaitForPodsRunningReady(c *client.Client, ns string, minPods int32, timeout
}
}
// Try to repair any kube-proxy that has been not-ready for long enough by restarting
// Docker on its node; see https://github.com/kubernetes/kubernetes/issues/24295#issuecomment-218920357
// for details.
if restartDockerOnFailures {
for _, badPod := range badPods {
name := badPod.Name
if len(name) > 10 && name[:10] == "kube-proxy" {
if _, ok := badKubeProxySince[name]; !ok {
badKubeProxySince[name] = time.Now()
}
if time.Since(badKubeProxySince[name]) > 5*time.Minute {
node, err := c.Nodes().Get(badPod.Spec.NodeName)
if err != nil {
Logf("Couldn't get node: %v", err)
continue
}
err = IssueSSHCommand("sudo service docker restart", TestContext.Provider, node)
if err != nil {
Logf("Couldn't restart docker on %s: %v", name, err)
continue
}
Logf("Docker on %s node restarted", badPod.Spec.NodeName)
delete(badKubeProxySince, name)
}
}
}
}
Logf("%d / %d pods in namespace '%s' are running and ready (%d seconds elapsed)",
nOk, len(podList.Items), ns, int(time.Since(start).Seconds()))
Logf("expected %d pod replicas in namespace '%s', %d are Running and Ready.", replicas, ns, replicaOk)

View File

@ -69,7 +69,7 @@ var _ = framework.KubeDescribe("Mesos", func() {
const ns = "static-pods"
numpods := int32(len(nodelist.Items))
framework.ExpectNoError(framework.WaitForPodsRunningReady(client, ns, numpods, wait.ForeverTestTimeout, map[string]string{}),
framework.ExpectNoError(framework.WaitForPodsRunningReady(client, ns, numpods, wait.ForeverTestTimeout, map[string]string{}, false),
fmt.Sprintf("number of static pods in namespace %s is %d", ns, numpods))
})

View File

@ -424,7 +424,7 @@ var _ = framework.KubeDescribe("Nodes [Disruptive]", func() {
// Many e2e tests assume that the cluster is fully healthy before they start. Wait until
// the cluster is restored to health.
By("waiting for system pods to successfully restart")
err := framework.WaitForPodsRunningReady(c, api.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout, ignoreLabels)
err := framework.WaitForPodsRunningReady(c, api.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout, ignoreLabels, false)
Expect(err).NotTo(HaveOccurred())
By("waiting for image prepulling pods to complete")
framework.WaitForPodsSuccess(c, api.NamespaceSystem, framework.ImagePullerLabels, imagePrePullingTimeout)

View File

@ -222,7 +222,7 @@ var _ = framework.KubeDescribe("SchedulerPredicates [Serial]", func() {
}
}
err = framework.WaitForPodsRunningReady(c, api.NamespaceSystem, int32(systemPodsNo), framework.PodReadyBeforeTimeout, ignoreLabels)
err = framework.WaitForPodsRunningReady(c, api.NamespaceSystem, int32(systemPodsNo), framework.PodReadyBeforeTimeout, ignoreLabels, false)
Expect(err).NotTo(HaveOccurred())
for _, node := range nodeList.Items {
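
Taken together, the call-site changes make the Docker-restart workaround strictly opt-in: only the main e2e suite passes true, while the Mesos static-pods test, the disruptive node-resize test, and the scheduler-predicates test pass false and keep the previous behavior. A minimal sketch of that opt-in shape, using a hypothetical stub rather than the real framework function:

package main

import (
	"fmt"
	"time"
)

// waitForPodsRunningReady is a hypothetical stub with the same shape as the
// framework function after this change: the trailing boolean opts a caller
// into the kube-proxy/Docker-restart workaround and changes nothing else.
func waitForPodsRunningReady(ns string, minPods int32, timeout time.Duration,
	ignoreLabels map[string]string, restartDockerOnFailures bool) error {
	if restartDockerOnFailures {
		fmt.Printf("will restart Docker for kube-proxy pods stuck not-ready in %q\n", ns)
	}
	// ... polling for Running/Ready pods elided ...
	return nil
}

func main() {
	// Only the top-level e2e suite opts in; every other caller passes false.
	_ = waitForPodsRunningReady("kube-system", 8, 10*time.Minute, nil, true)
	_ = waitForPodsRunningReady("static-pods", 3, 5*time.Minute, nil, false)
}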