Workaround KubeProxy failures in test framework
This commit is contained in:
parent eedc438da9
commit 77679b0437
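The substance of the change, as shown in the diff below: WaitForPodsRunningReady gains a restartDockerOnFailures parameter. When it is true, the framework tracks how long each kube-proxy pod has been not-ready and, once a pod has been bad for more than five minutes, restarts Docker on that pod's node over SSH (IssueSSHCommand). The suite-level readiness check passes true; all other callers in this diff pass false. Below is a minimal standalone sketch of just the bookkeeping, for illustration only: the helper name shouldRestartDocker is hypothetical, and the SSH restart itself is omitted.

// Illustrative sketch only (not part of the e2e framework): it mirrors the
// 5-minute bookkeeping this commit adds inside WaitForPodsRunningReady.
// The helper name shouldRestartDocker is hypothetical.
package main

import (
	"fmt"
	"time"
)

// badKubeProxySince records when each kube-proxy pod was first seen not-ready.
var badKubeProxySince = map[string]time.Time{}

// shouldRestartDocker reports whether the node hosting the given not-ready pod
// is due for a Docker restart: only kube-proxy pods qualify, and only after
// they have stayed not-ready for more than five minutes.
func shouldRestartDocker(podName string, now time.Time) bool {
	if len(podName) <= 10 || podName[:10] != "kube-proxy" {
		return false
	}
	if _, ok := badKubeProxySince[podName]; !ok {
		badKubeProxySince[podName] = now
	}
	if now.Sub(badKubeProxySince[podName]) > 5*time.Minute {
		// The real code restarts Docker on the pod's node via SSH at this
		// point and then clears the entry so the timer starts over.
		delete(badKubeProxySince, podName)
		return true
	}
	return false
}

func main() {
	start := time.Now()
	fmt.Println(shouldRestartDocker("kube-proxy-node-1", start))                    // false: just recorded as bad
	fmt.Println(shouldRestartDocker("kube-proxy-node-1", start.Add(6*time.Minute))) // true: not-ready for over 5 minutes
}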
@@ -125,7 +125,7 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte {
 	// test pods from running, and tests that ensure all pods are running and
 	// ready will fail).
 	podStartupTimeout := framework.TestContext.SystemPodsStartupTimeout
-	if err := framework.WaitForPodsRunningReady(c, api.NamespaceSystem, int32(framework.TestContext.MinStartupPods), podStartupTimeout, framework.ImagePullerLabels); err != nil {
+	if err := framework.WaitForPodsRunningReady(c, api.NamespaceSystem, int32(framework.TestContext.MinStartupPods), podStartupTimeout, framework.ImagePullerLabels, true); err != nil {
 		framework.DumpAllNamespaceInfo(c, api.NamespaceSystem)
 		framework.LogFailedContainers(c, api.NamespaceSystem)
 		framework.RunKubernetesServiceTestContainer(c, framework.TestContext.RepoRoot, api.NamespaceDefault)
@@ -554,7 +554,7 @@ func WaitForPodsSuccess(c *client.Client, ns string, successPodLabels map[string
 // even if there are minPods pods, some of which are in Running/Ready
 // and some in Success. This is to allow the client to decide if "Success"
 // means "Ready" or not.
-func WaitForPodsRunningReady(c *client.Client, ns string, minPods int32, timeout time.Duration, ignoreLabels map[string]string) error {
+func WaitForPodsRunningReady(c *client.Client, ns string, minPods int32, timeout time.Duration, ignoreLabels map[string]string, restartDockerOnFailures bool) error {
 	ignoreSelector := labels.SelectorFromSet(ignoreLabels)
 	start := time.Now()
 	Logf("Waiting up to %v for all pods (need at least %d) in namespace '%s' to be running and ready",
@@ -567,6 +567,10 @@ func WaitForPodsRunningReady(c *client.Client, ns string, minPods int32, timeout
 		wg.Done()
 	}()
 
+	// We will be restarting all not-ready kubeProxies every 5 minutes,
+	// to workaround #25543 issue.
+	badKubeProxySince := make(map[string]time.Time)
+
 	if wait.PollImmediate(Poll, timeout, func() (bool, error) {
 		// We get the new list of pods and replication controllers in every
 		// iteration because more pods come online during startup and we want to
@@ -597,6 +601,8 @@ func WaitForPodsRunningReady(c *client.Client, ns string, minPods int32, timeout
 			if hasReplicationControllersForPod(rcList, pod) {
 				replicaOk++
 			}
+			// If the pod is healthy, remove it from bad ones.
+			delete(badKubeProxySince, pod.Name)
 		} else {
 			if pod.Status.Phase != api.PodFailed {
 				Logf("The status of Pod %s is %s, waiting for it to be either Running or Failed", pod.ObjectMeta.Name, pod.Status.Phase)
@@ -609,6 +615,34 @@ func WaitForPodsRunningReady(c *client.Client, ns string, minPods int32, timeout
 			}
 		}
 
+		// Try to repair all KubeProxies that are not-ready long enough by restarting Docker:
+		// see https://github.com/kubernetes/kubernetes/issues/24295#issuecomment-218920357
+		// for exact details.
+		if restartDockerOnFailures {
+			for _, badPod := range badPods {
+				name := badPod.Name
+				if len(name) > 10 && name[:10] == "kube-proxy" {
+					if _, ok := badKubeProxySince[name]; !ok {
+						badKubeProxySince[name] = time.Now()
+					}
+					if time.Since(badKubeProxySince[name]) > 5*time.Minute {
+						node, err := c.Nodes().Get(badPod.Spec.NodeName)
+						if err != nil {
+							Logf("Couldn't get node: %v", err)
+							continue
+						}
+						err = IssueSSHCommand("sudo service docker restart", TestContext.Provider, node)
+						if err != nil {
+							Logf("Couldn't restart docker on %s: %v", name, err)
+							continue
+						}
+						Logf("Docker on %s node restarted", badPod.Spec.NodeName)
+						delete(badKubeProxySince, name)
+					}
+				}
+			}
+		}
+
 		Logf("%d / %d pods in namespace '%s' are running and ready (%d seconds elapsed)",
 			nOk, len(podList.Items), ns, int(time.Since(start).Seconds()))
 		Logf("expected %d pod replicas in namespace '%s', %d are Running and Ready.", replicas, ns, replicaOk)
@@ -69,7 +69,7 @@ var _ = framework.KubeDescribe("Mesos", func() {
 
 		const ns = "static-pods"
 		numpods := int32(len(nodelist.Items))
-		framework.ExpectNoError(framework.WaitForPodsRunningReady(client, ns, numpods, wait.ForeverTestTimeout, map[string]string{}),
+		framework.ExpectNoError(framework.WaitForPodsRunningReady(client, ns, numpods, wait.ForeverTestTimeout, map[string]string{}, false),
 			fmt.Sprintf("number of static pods in namespace %s is %d", ns, numpods))
 	})
 
@@ -424,7 +424,7 @@ var _ = framework.KubeDescribe("Nodes [Disruptive]", func() {
 			// Many e2e tests assume that the cluster is fully healthy before they start. Wait until
 			// the cluster is restored to health.
 			By("waiting for system pods to successfully restart")
-			err := framework.WaitForPodsRunningReady(c, api.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout, ignoreLabels)
+			err := framework.WaitForPodsRunningReady(c, api.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout, ignoreLabels, false)
 			Expect(err).NotTo(HaveOccurred())
 			By("waiting for image prepulling pods to complete")
 			framework.WaitForPodsSuccess(c, api.NamespaceSystem, framework.ImagePullerLabels, imagePrePullingTimeout)
@@ -222,7 +222,7 @@ var _ = framework.KubeDescribe("SchedulerPredicates [Serial]", func() {
 			}
 		}
 
-		err = framework.WaitForPodsRunningReady(c, api.NamespaceSystem, int32(systemPodsNo), framework.PodReadyBeforeTimeout, ignoreLabels)
+		err = framework.WaitForPodsRunningReady(c, api.NamespaceSystem, int32(systemPodsNo), framework.PodReadyBeforeTimeout, ignoreLabels, false)
 		Expect(err).NotTo(HaveOccurred())
 
 		for _, node := range nodeList.Items {