Mirror of https://github.com/k3s-io/kubernetes.git
	Allow for not-ready pods in large clusters
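In large clusters, a few system pods sometimes never become ready (see the TODO referencing #41007 below), and until now a single such pod made framework.WaitForPodsRunningReady fail and wasted the whole e2e run. This commit adds an allowedNotReadyPods parameter to that helper and has the suite setup pass the allowed-not-ready-nodes count as the budget, so a bounded number of not-ready pods no longer aborts the run; all other call sites pass 0 and keep their strict behavior.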
@@ -137,7 +137,11 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte {
 	// test pods from running, and tests that ensure all pods are running and
 	// ready will fail).
 	podStartupTimeout := framework.TestContext.SystemPodsStartupTimeout
-	if err := framework.WaitForPodsRunningReady(c, metav1.NamespaceSystem, int32(framework.TestContext.MinStartupPods), podStartupTimeout, framework.ImagePullerLabels, true); err != nil {
+	// TODO: In large clusters, we often observe non-starting pods due to
+	// #41007. To keep those pods from blocking the whole test run (and just
+	// wasting it), we allow for some not-ready pods (with the number equal
+	// to the number of allowed not-ready nodes).
+	if err := framework.WaitForPodsRunningReady(c, metav1.NamespaceSystem, int32(framework.TestContext.MinStartupPods), int32(framework.TestContext.AllowedNotReadyNodes), podStartupTimeout, framework.ImagePullerLabels, true); err != nil {
 		framework.DumpAllNamespaceInfo(c, metav1.NamespaceSystem)
 		framework.LogFailedContainers(c, metav1.NamespaceSystem, framework.Logf)
 		runKubernetesServiceTestContainer(c, metav1.NamespaceDefault)
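The budget passed here is framework.TestContext.AllowedNotReadyNodes, so the suite tolerates at most as many not-ready system pods as it already tolerates not-ready nodes. A standalone sketch of the tolerance logic this enables appears after the framework hunks below.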
@@ -495,8 +495,7 @@ func WaitForPodsSuccess(c clientset.Interface, ns string, successPodLabels map[s
 // and some in Success. This is to allow the client to decide if "Success"
 // means "Ready" or not.
 // If skipSucceeded is true, any pods that are Succeeded are not counted.
-func WaitForPodsRunningReady(c clientset.Interface, ns string, minPods int32, timeout time.Duration, ignoreLabels map[string]string, skipSucceeded bool) error {
-
+func WaitForPodsRunningReady(c clientset.Interface, ns string, minPods, allowedNotReadyPods int32, timeout time.Duration, ignoreLabels map[string]string, skipSucceeded bool) error {
 	ignoreSelector := labels.SelectorFromSet(ignoreLabels)
 	start := time.Now()
 	Logf("Waiting up to %v for all pods (need at least %d) in namespace '%s' to be running and ready",
@@ -504,6 +503,7 @@ func WaitForPodsRunningReady(c clientset.Interface, ns string, minPods int32, ti
 	wg := sync.WaitGroup{}
 	wg.Add(1)
 	var waitForSuccessError error
+	var ignoreNotReady bool
 	badPods := []v1.Pod{}
 	desiredPods := 0
 	go func() {
@@ -544,6 +544,7 @@ func WaitForPodsRunningReady(c clientset.Interface, ns string, minPods int32, ti
 			return false, nil
 		}
 		nOk := int32(0)
+		notReady := int32(0)
 		badPods = []v1.Pod{}
 		desiredPods = len(podList.Items)
 		for _, pod := range podList.Items {
@@ -564,6 +565,7 @@ func WaitForPodsRunningReady(c clientset.Interface, ns string, minPods int32, ti
 				return false, errors.New("unexpected Succeeded pod state")
 			case pod.Status.Phase != v1.PodFailed:
 				Logf("The status of Pod %s is %s (Ready = false), waiting for it to be either Running (with Ready = true) or Failed", pod.ObjectMeta.Name, pod.Status.Phase)
+				notReady++
 				badPods = append(badPods, pod)
 			default:
 				if _, ok := pod.Annotations[v1.CreatedByAnnotation]; !ok {
@@ -581,11 +583,15 @@ func WaitForPodsRunningReady(c clientset.Interface, ns string, minPods int32, ti
 		if replicaOk == replicas && nOk >= minPods && len(badPods) == 0 {
 			return true, nil
 		}
+		ignoreNotReady = (notReady <= allowedNotReadyPods)
 		logPodStates(badPods)
 		return false, nil
 	}) != nil {
+		if !ignoreNotReady {
 			return errors.New(errorBadPodsStates(badPods, desiredPods, ns, "RUNNING and READY", timeout))
 		}
+		Logf("Number of not-ready pods is allowed.")
+	}
 	wg.Wait()
 	if waitForSuccessError != nil {
 		return waitForSuccessError
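For readers skimming the hunks above, here is a minimal, self-contained sketch of the decision the patched WaitForPodsRunningReady makes once its polling gives up: count the running-and-ready pods, treat everything else (Failed aside) as not ready, and downgrade the failure to a log line as long as the not-ready count stays within the allowedNotReadyPods budget. The types, the helper name checkPodsRunningReady, and the simplifications (no polling or timeout, no replica bookkeeping, no skipSucceeded handling) are illustrative assumptions, not the framework code.

package main

import "fmt"

// podPhase stands in for v1.PodPhase; only the cases the wait loop distinguishes are modeled.
type podPhase string

const (
	running podPhase = "Running"
	pending podPhase = "Pending"
	failed  podPhase = "Failed"
)

// pod is a stripped-down stand-in for v1.Pod.
type pod struct {
	name  string
	phase podPhase
	ready bool // Ready condition; only meaningful for Running pods
}

// checkPodsRunningReady sketches the final decision in the patched function:
// accept the cluster state if every pod is healthy, or if the number of
// not-ready pods is within the allowedNotReadyPods budget.
func checkPodsRunningReady(pods []pod, minPods, allowedNotReadyPods int32) error {
	var nOk, notReady int32
	var badPods []string
	for _, p := range pods {
		switch {
		case p.phase == running && p.ready:
			nOk++
		case p.phase == failed:
			// The real framework reports Failed pods separately; in this
			// sketch they count as bad without raising notReady.
			badPods = append(badPods, p.name)
		default:
			notReady++
			badPods = append(badPods, p.name)
		}
	}
	if nOk >= minPods && len(badPods) == 0 {
		return nil
	}
	if notReady <= allowedNotReadyPods {
		fmt.Println("Number of not-ready pods is allowed.")
		return nil
	}
	return fmt.Errorf("%d pods are not RUNNING and READY: %v", len(badPods), badPods)
}

func main() {
	pods := []pod{
		{name: "kube-dns", phase: running, ready: true},
		{name: "fluentd-node-3", phase: pending}, // a stuck pod, e.g. the #41007 case
	}
	// Strict (budget 0): the stuck pod fails the check.
	fmt.Println(checkPodsRunningReady(pods, 1, 0))
	// Tolerant (budget 1): the stuck pod is within the allowance.
	fmt.Println(checkPodsRunningReady(pods, 1, 1))
}

Running the two calls in main shows the same pod set failing the strict check (budget 0) and passing the tolerant one (budget 1), which is the behavior change the suite setup opts into.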
@@ -68,7 +68,7 @@ var _ = framework.KubeDescribe("Mesos", func() {
 		nodelist := framework.GetReadySchedulableNodesOrDie(client)
 		const ns = "static-pods"
 		numpods := int32(len(nodelist.Items))
-		framework.ExpectNoError(framework.WaitForPodsRunningReady(client, ns, numpods, wait.ForeverTestTimeout, map[string]string{}, false),
+		framework.ExpectNoError(framework.WaitForPodsRunningReady(client, ns, numpods, 0, wait.ForeverTestTimeout, map[string]string{}, false),
 			fmt.Sprintf("number of static pods in namespace %s is %d", ns, numpods))
 	})
 
@@ -237,7 +237,7 @@ var _ = framework.KubeDescribe("Nodes [Disruptive]", func() {
 			// Many e2e tests assume that the cluster is fully healthy before they start.  Wait until
 			// the cluster is restored to health.
 			By("waiting for system pods to successfully restart")
-			err := framework.WaitForPodsRunningReady(c, metav1.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout, ignoreLabels, true)
+			err := framework.WaitForPodsRunningReady(c, metav1.NamespaceSystem, systemPodsNo, 0, framework.PodReadyBeforeTimeout, ignoreLabels, true)
 			Expect(err).NotTo(HaveOccurred())
 			By("waiting for image prepulling pods to complete")
 			framework.WaitForPodsSuccess(c, metav1.NamespaceSystem, framework.ImagePullerLabels, imagePrePullingTimeout)
@@ -90,7 +90,7 @@ var _ = framework.KubeDescribe("SchedulerPredicates [Serial]", func() {
 			}
 		}
 
-		err = framework.WaitForPodsRunningReady(cs, metav1.NamespaceSystem, int32(systemPodsNo), framework.PodReadyBeforeTimeout, ignoreLabels, true)
+		err = framework.WaitForPodsRunningReady(cs, metav1.NamespaceSystem, int32(systemPodsNo), 0, framework.PodReadyBeforeTimeout, ignoreLabels, true)
 		Expect(err).NotTo(HaveOccurred())
 
 		for _, node := range nodeList.Items {
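The three updated call sites above (the Mesos static-pods check, the disruptive Nodes suite, and the SchedulerPredicates suite) do not opt into the new tolerance: each passes 0 for allowedNotReadyPods, so any not-ready pod still fails their wait, exactly as before.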