Workaround KubeProxy failures in test framework

Wojciech Tyczynski 2016-06-24 10:27:50 +02:00
parent eedc438da9
commit 77679b0437
5 changed files with 39 additions and 5 deletions

View File

@@ -125,7 +125,7 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte {
// test pods from running, and tests that ensure all pods are running and
// ready will fail).
podStartupTimeout := framework.TestContext.SystemPodsStartupTimeout
if err := framework.WaitForPodsRunningReady(c, api.NamespaceSystem, int32(framework.TestContext.MinStartupPods), podStartupTimeout, framework.ImagePullerLabels); err != nil {
if err := framework.WaitForPodsRunningReady(c, api.NamespaceSystem, int32(framework.TestContext.MinStartupPods), podStartupTimeout, framework.ImagePullerLabels, true); err != nil {
framework.DumpAllNamespaceInfo(c, api.NamespaceSystem)
framework.LogFailedContainers(c, api.NamespaceSystem)
framework.RunKubernetesServiceTestContainer(c, framework.TestContext.RepoRoot, api.NamespaceDefault)

View File

@@ -554,7 +554,7 @@ func WaitForPodsSuccess(c *client.Client, ns string, successPodLabels map[string
// even if there are minPods pods, some of which are in Running/Ready
// and some in Success. This is to allow the client to decide if "Success"
// means "Ready" or not.
func WaitForPodsRunningReady(c *client.Client, ns string, minPods int32, timeout time.Duration, ignoreLabels map[string]string) error {
func WaitForPodsRunningReady(c *client.Client, ns string, minPods int32, timeout time.Duration, ignoreLabels map[string]string, restartDockerOnFailures bool) error {
ignoreSelector := labels.SelectorFromSet(ignoreLabels)
start := time.Now()
Logf("Waiting up to %v for all pods (need at least %d) in namespace '%s' to be running and ready",
@@ -567,6 +567,10 @@ func WaitForPodsRunningReady(c *client.Client, ns string, minPods int32, timeout
wg.Done()
}()
// We will be restarting all not-ready kube-proxies every 5 minutes,
// to work around issue #25543.
badKubeProxySince := make(map[string]time.Time)
if wait.PollImmediate(Poll, timeout, func() (bool, error) {
// We get the new list of pods and replication controllers in every
// iteration because more pods come online during startup and we want to
@@ -597,6 +601,8 @@ func WaitForPodsRunningReady(c *client.Client, ns string, minPods int32, timeout
if hasReplicationControllersForPod(rcList, pod) {
replicaOk++
}
// If the pod is healthy, remove it from the set of bad kube-proxies.
delete(badKubeProxySince, pod.Name)
} else {
if pod.Status.Phase != api.PodFailed {
Logf("The status of Pod %s is %s, waiting for it to be either Running or Failed", pod.ObjectMeta.Name, pod.Status.Phase)
@@ -609,6 +615,34 @@ func WaitForPodsRunningReady(c *client.Client, ns string, minPods int32, timeout
}
}
// Try to repair any kube-proxy that has been not-ready for long enough by restarting
// Docker on its node; see https://github.com/kubernetes/kubernetes/issues/24295#issuecomment-218920357
// for details.
if restartDockerOnFailures {
for _, badPod := range badPods {
name := badPod.Name
if len(name) > 10 && name[:10] == "kube-proxy" {
if _, ok := badKubeProxySince[name]; !ok {
badKubeProxySince[name] = time.Now()
}
if time.Since(badKubeProxySince[name]) > 5*time.Minute {
node, err := c.Nodes().Get(badPod.Spec.NodeName)
if err != nil {
Logf("Couldn't get node: %v", err)
continue
}
err = IssueSSHCommand("sudo service docker restart", TestContext.Provider, node)
if err != nil {
Logf("Couldn't restart docker on %s: %v", name, err)
continue
}
Logf("Docker on %s node restarted", badPod.Spec.NodeName)
delete(badKubeProxySince, name)
}
}
}
}
Logf("%d / %d pods in namespace '%s' are running and ready (%d seconds elapsed)",
nOk, len(podList.Items), ns, int(time.Since(start).Seconds()))
Logf("expected %d pod replicas in namespace '%s', %d are Running and Ready.", replicas, ns, replicaOk)

View File

@ -69,7 +69,7 @@ var _ = framework.KubeDescribe("Mesos", func() {
const ns = "static-pods"
numpods := int32(len(nodelist.Items))
framework.ExpectNoError(framework.WaitForPodsRunningReady(client, ns, numpods, wait.ForeverTestTimeout, map[string]string{}),
framework.ExpectNoError(framework.WaitForPodsRunningReady(client, ns, numpods, wait.ForeverTestTimeout, map[string]string{}, false),
fmt.Sprintf("number of static pods in namespace %s is %d", ns, numpods))
})

View File

@ -424,7 +424,7 @@ var _ = framework.KubeDescribe("Nodes [Disruptive]", func() {
// Many e2e tests assume that the cluster is fully healthy before they start. Wait until
// the cluster is restored to health.
By("waiting for system pods to successfully restart")
err := framework.WaitForPodsRunningReady(c, api.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout, ignoreLabels)
err := framework.WaitForPodsRunningReady(c, api.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout, ignoreLabels, false)
Expect(err).NotTo(HaveOccurred())
By("waiting for image prepulling pods to complete")
framework.WaitForPodsSuccess(c, api.NamespaceSystem, framework.ImagePullerLabels, imagePrePullingTimeout)

View File

@ -222,7 +222,7 @@ var _ = framework.KubeDescribe("SchedulerPredicates [Serial]", func() {
}
}
err = framework.WaitForPodsRunningReady(c, api.NamespaceSystem, int32(systemPodsNo), framework.PodReadyBeforeTimeout, ignoreLabels)
err = framework.WaitForPodsRunningReady(c, api.NamespaceSystem, int32(systemPodsNo), framework.PodReadyBeforeTimeout, ignoreLabels, false)
Expect(err).NotTo(HaveOccurred())
for _, node := range nodeList.Items {
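
Taken together, the call-site changes make the Docker-restart workaround strictly opt-in: only the main e2e suite passes true, while the Mesos static-pods test, the disruptive node-resize test, and the scheduler-predicates test pass false and keep the previous behavior. A minimal sketch of that opt-in shape, using a hypothetical stub rather than the real framework function:

package main

import (
	"fmt"
	"time"
)

// waitForPodsRunningReady is a hypothetical stub with the same shape as the
// framework function after this change: the trailing boolean opts a caller
// into the kube-proxy/Docker-restart workaround and changes nothing else.
func waitForPodsRunningReady(ns string, minPods int32, timeout time.Duration,
	ignoreLabels map[string]string, restartDockerOnFailures bool) error {
	if restartDockerOnFailures {
		fmt.Printf("will restart Docker for kube-proxy pods stuck not-ready in %q\n", ns)
	}
	// ... polling for Running/Ready pods elided ...
	return nil
}

func main() {
	// Only the top-level e2e suite opts in; every other caller passes false.
	_ = waitForPodsRunningReady("kube-system", 8, 10*time.Minute, nil, true)
	_ = waitForPodsRunningReady("static-pods", 3, 5*time.Minute, nil, false)
}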