diff --git a/test/e2e/cloud/gcp/node_lease.go b/test/e2e/cloud/gcp/node_lease.go
index 8f74902f47c..d85a901365b 100644
--- a/test/e2e/cloud/gcp/node_lease.go
+++ b/test/e2e/cloud/gcp/node_lease.go
@@ -38,7 +38,7 @@ import (
 var _ = SIGDescribe(framework.WithDisruptive(), "NodeLease", func() {
 	f := framework.NewDefaultFramework("node-lease-test")
 	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
-	var systemPodsNo int32
+	var systemPodsNo int
 	var c clientset.Interface
 	var ns string
 	var group string
@@ -49,7 +49,7 @@ var _ = SIGDescribe(framework.WithDisruptive(), "NodeLease", func() {
 		ns = f.Namespace.Name
 		systemPods, err := e2epod.GetPodsInNamespace(ctx, c, ns, map[string]string{})
 		framework.ExpectNoError(err)
-		systemPodsNo = int32(len(systemPods))
+		systemPodsNo = len(systemPods)
 		if strings.Contains(framework.TestContext.CloudConfig.NodeInstanceGroup, ",") {
 			framework.Failf("Test dose not support cluster setup with more than one MIG: %s", framework.TestContext.CloudConfig.NodeInstanceGroup)
 		} else {
@@ -98,7 +98,7 @@ var _ = SIGDescribe(framework.WithDisruptive(), "NodeLease", func() {
 		// Many e2e tests assume that the cluster is fully healthy before they start. Wait until
 		// the cluster is restored to health.
 		ginkgo.By("waiting for system pods to successfully restart")
-		err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, systemPodsNo, 0, framework.PodReadyBeforeTimeout)
+		err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout)
 		framework.ExpectNoError(err)
 	})
diff --git a/test/e2e/cloud/gcp/resize_nodes.go b/test/e2e/cloud/gcp/resize_nodes.go
index 916d761fbcd..0d32f93f719 100644
--- a/test/e2e/cloud/gcp/resize_nodes.go
+++ b/test/e2e/cloud/gcp/resize_nodes.go
@@ -47,7 +47,7 @@ func resizeRC(ctx context.Context, c clientset.Interface, ns, name string, repli
 var _ = SIGDescribe("Nodes", framework.WithDisruptive(), func() {
 	f := framework.NewDefaultFramework("resize-nodes")
 	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
-	var systemPodsNo int32
+	var systemPodsNo int
 	var c clientset.Interface
 	var ns string
 	var group string
@@ -57,7 +57,7 @@ var _ = SIGDescribe("Nodes", framework.WithDisruptive(), func() {
 		ns = f.Namespace.Name
 		systemPods, err := e2epod.GetPodsInNamespace(ctx, c, ns, map[string]string{})
 		framework.ExpectNoError(err)
-		systemPodsNo = int32(len(systemPods))
+		systemPodsNo = len(systemPods)
 		if strings.Contains(framework.TestContext.CloudConfig.NodeInstanceGroup, ",") {
 			framework.Failf("Test dose not support cluster setup with more than one MIG: %s", framework.TestContext.CloudConfig.NodeInstanceGroup)
 		} else {
@@ -99,7 +99,7 @@ var _ = SIGDescribe("Nodes", framework.WithDisruptive(), func() {
 		// Many e2e tests assume that the cluster is fully healthy before they start. Wait until
 		// the cluster is restored to health.
 		ginkgo.By("waiting for system pods to successfully restart")
-		err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, systemPodsNo, 0, framework.PodReadyBeforeTimeout)
+		err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout)
 		framework.ExpectNoError(err)
 	})
 })
diff --git a/test/e2e/common/node/container_probe.go b/test/e2e/common/node/container_probe.go
index 31122ccc581..534fc26bf97 100644
--- a/test/e2e/common/node/container_probe.go
+++ b/test/e2e/common/node/container_probe.go
@@ -612,7 +612,7 @@ done
 		})
 
 		// verify pods are running and ready
-		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, f.Timeouts.PodStart)
+		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, f.Timeouts.PodStart)
 		framework.ExpectNoError(err)
 
 		// Shutdown pod. Readiness should change to false
@@ -694,7 +694,7 @@ done
 		})
 
 		// verify pods are running and ready
-		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, f.Timeouts.PodStart)
+		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, f.Timeouts.PodStart)
 		framework.ExpectNoError(err)
 
 		// Shutdown pod. Readiness should change to false
@@ -1359,7 +1359,7 @@ done
 		})
 
 		// verify pods are running and ready
-		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, f.Timeouts.PodStart)
+		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, f.Timeouts.PodStart)
 		framework.ExpectNoError(err)
 
 		// Shutdown pod. Readiness should change to false
@@ -1452,7 +1452,7 @@ done
 		})
 
 		// verify pods are running and ready
-		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, f.Timeouts.PodStart)
+		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, f.Timeouts.PodStart)
 		framework.ExpectNoError(err)
 
 		// Shutdown pod. Readiness should change to false
diff --git a/test/e2e/common/node/pods.go b/test/e2e/common/node/pods.go
index 69037c6e65f..7aa1521b81c 100644
--- a/test/e2e/common/node/pods.go
+++ b/test/e2e/common/node/pods.go
@@ -873,7 +873,7 @@ var _ = SIGDescribe("Pods", func() {
 
 		// wait as required for all 3 pods to be running
 		ginkgo.By("waiting for all 3 pods to be running")
-		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 3, 0, f.Timeouts.PodStart)
+		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 3, f.Timeouts.PodStart)
 		framework.ExpectNoError(err, "3 pods not found running.")
 
 		// delete Collection of pods with a label in the current namespace
diff --git a/test/e2e/e2e.go b/test/e2e/e2e.go
index a37a6f13d14..262069829a8 100644
--- a/test/e2e/e2e.go
+++ b/test/e2e/e2e.go
@@ -226,7 +226,7 @@ func setupSuite(ctx context.Context) {
 	// #41007. To avoid those pods preventing the whole test runs (and just
 	// wasting the whole run), we allow for some not-ready pods (with the
 	// number equal to the number of allowed not-ready nodes).
-	if err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, int32(framework.TestContext.MinStartupPods), int32(framework.TestContext.AllowedNotReadyNodes), timeouts.SystemPodsStartup); err != nil {
+	if err := e2epod.WaitForAlmostAllPodsReady(ctx, c, metav1.NamespaceSystem, framework.TestContext.MinStartupPods, framework.TestContext.AllowedNotReadyNodes, timeouts.SystemPodsStartup); err != nil {
 		e2edebug.DumpAllNamespaceInfo(ctx, c, metav1.NamespaceSystem)
 		e2ekubectl.LogFailedContainers(ctx, c, metav1.NamespaceSystem, framework.Logf)
 		framework.Failf("Error waiting for all pods to be running and ready: %v", err)
diff --git a/test/e2e/framework/pod/wait.go b/test/e2e/framework/pod/wait.go
index 4c7bab772e9..6d2cfe9865a 100644
--- a/test/e2e/framework/pod/wait.go
+++ b/test/e2e/framework/pod/wait.go
@@ -99,17 +99,22 @@ func BeInPhase(phase v1.PodPhase) types.GomegaMatcher {
 	}).WithTemplate("Expected Pod {{.To}} be in {{format .Data}}\nGot instead:\n{{.FormattedActual}}").WithTemplateData(phase)
 }
 
-// WaitForPodsRunningReady waits up to timeout to ensure that all pods in
-// namespace ns are either running and ready, or failed but controlled by a
-// controller. Also, it ensures that at least minPods are running and
-// ready. It has separate behavior from other 'wait for' pods functions in
-// that it requests the list of pods on every iteration. This is useful, for
-// example, in cluster startup, because the number of pods increases while
-// waiting. All pods that are in SUCCESS state are not counted.
+// WaitForAlmostAllPodsReady waits up to timeout for the following conditions:
+// 1. At least minPods Pods in Namespace ns are Running and Ready
+// 2. All Pods in Namespace ns are either Ready or Succeeded
+// 3. All Pods part of a ReplicaSet or ReplicationController in Namespace ns are Ready
+//
+// After the timeout has elapsed, an error is returned if the number of Pods in a Pending Phase
+// is greater than allowedNotReadyPods.
+//
+// It is generally recommended to use WaitForPodsRunningReady instead of this function
+// whenever possible, because its behavior is more intuitive. Similar to WaitForPodsRunningReady,
+// this function requests the list of pods on every iteration, making it useful for situations
+// where the set of Pods is likely changing, such as during cluster startup.
 //
 // If minPods or allowedNotReadyPods are -1, this method returns immediately
 // without waiting.
-func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns string, minPods, allowedNotReadyPods int32, timeout time.Duration) error {
+func WaitForAlmostAllPodsReady(ctx context.Context, c clientset.Interface, ns string, minPods, allowedNotReadyPods int, timeout time.Duration) error {
 	if minPods == -1 || allowedNotReadyPods == -1 {
 		return nil
 	}
@@ -126,14 +131,12 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri
 		Pods []v1.Pod
 	}
 
-	// notReady is -1 for any failure other than a timeout.
-	// Otherwise it is the number of pods that we were still
-	// waiting for.
-	notReady := int32(-1)
+	nOk := 0
+	badPods := []v1.Pod{}
+	otherPods := []v1.Pod{}
+	succeededPods := []string{}
 
 	err := framework.Gomega().Eventually(ctx, framework.HandleRetry(func(ctx context.Context) (*state, error) {
-		// Reset notReady at the start of a poll attempt.
-		notReady = -1
 
 		rcList, err := c.CoreV1().ReplicationControllers(ns).List(ctx, metav1.ListOptions{})
 		if err != nil {
@@ -163,11 +166,10 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri
 			replicaOk += rs.Status.ReadyReplicas
 		}
 
-		nOk := int32(0)
-		notReady = int32(0)
-		failedPods := []v1.Pod{}
-		otherPods := []v1.Pod{}
-		succeededPods := []string{}
+		nOk = 0
+		badPods = []v1.Pod{}
+		otherPods = []v1.Pod{}
+		succeededPods = []string{}
 		for _, pod := range s.Pods {
 			res, err := testutils.PodRunningReady(&pod)
 			switch {
@@ -179,14 +181,13 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri
 			case pod.Status.Phase == v1.PodFailed:
 				// ignore failed pods that are controlled by some controller
 				if metav1.GetControllerOf(&pod) == nil {
-					failedPods = append(failedPods, pod)
+					badPods = append(badPods, pod)
 				}
 			default:
-				notReady++
 				otherPods = append(otherPods, pod)
 			}
 		}
-		done := replicaOk == replicas && nOk >= minPods && (len(failedPods)+len(otherPods)) == 0
+		done := replicaOk == replicas && nOk >= minPods && (len(badPods)+len(otherPods)) == 0
 		if done {
 			return nil, nil
 		}
@@ -200,8 +201,8 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri
 		if len(succeededPods) > 0 {
 			buffer.WriteString(fmt.Sprintf("Pods that completed successfully:\n%s", format.Object(succeededPods, 1)))
 		}
-		if len(failedPods) > 0 {
-			buffer.WriteString(fmt.Sprintf("Pods that failed and were not controlled by some controller:\n%s", format.Object(failedPods, 1)))
+		if len(badPods) > 0 {
+			buffer.WriteString(fmt.Sprintf("Pods that failed and were not controlled by some controller:\n%s", format.Object(badPods, 1)))
 		}
 		if len(otherPods) > 0 {
 			buffer.WriteString(fmt.Sprintf("Pods that were neither completed nor running:\n%s", format.Object(otherPods, 1)))
@@ -211,13 +212,79 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri
 	}))
 
 	// An error might not be fatal.
-	if err != nil && notReady >= 0 && notReady <= allowedNotReadyPods {
-		framework.Logf("Number of not-ready pods (%d) is below the allowed threshold (%d).", notReady, allowedNotReadyPods)
+	if len(otherPods) <= allowedNotReadyPods {
 		return nil
 	}
 	return err
 }
 
+// WaitForPodsRunningReady waits up to timeout for the following conditions:
+// 1. At least minPods Pods in Namespace ns are Running and Ready
+// 2. No Pods in Namespace ns are Failed and not owned by a controller or Pending
+//
+// An error is returned if either of these conditions are not met within the timeout.
+//
+// It has separate behavior from other 'wait for' pods functions in
+// that it requests the list of pods on every iteration. This is useful, for
+// example, in cluster startup, because the number of pods increases while
+// waiting. All pods that are in SUCCESS state are not counted.
+func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns string, minPods int, timeout time.Duration) error {
+
+	return framework.Gomega().Eventually(ctx, framework.HandleRetry(func(ctx context.Context) ([]v1.Pod, error) {
+
+		podList, err := c.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{})
+		if err != nil {
+			return nil, fmt.Errorf("listing pods in namespace %s: %w", ns, err)
+		}
+		return podList.Items, nil
+	})).WithTimeout(timeout).Should(framework.MakeMatcher(func(pods []v1.Pod) (func() string, error) {
+
+		nOk := 0
+		badPods := []v1.Pod{}
+		otherPods := []v1.Pod{}
+		succeededPods := []string{}
+
+		for _, pod := range pods {
+			res, err := testutils.PodRunningReady(&pod)
+			switch {
+			case res && err == nil:
+				nOk++
+			case pod.Status.Phase == v1.PodSucceeded:
+				// ignore succeeded pods
+				succeededPods = append(succeededPods, pod.Name)
+			case pod.Status.Phase == v1.PodFailed:
+				// ignore failed pods that are controlled by some controller
+				if metav1.GetControllerOf(&pod) == nil {
+					badPods = append(badPods, pod)
+				}
+			default:
+				otherPods = append(otherPods, pod)
+			}
+		}
+		if nOk >= minPods && len(badPods)+len(otherPods) == 0 {
+			return nil, nil
+		}
+
+		// Delayed formatting of a failure message.
+		return func() string {
+			var buffer strings.Builder
+			buffer.WriteString(fmt.Sprintf("Expected all pods (need at least %d) in namespace %q to be running and ready \n", minPods, ns))
+			buffer.WriteString(fmt.Sprintf("%d / %d pods were running and ready.\n", nOk, len(pods)))
+			if len(succeededPods) > 0 {
+				buffer.WriteString(fmt.Sprintf("Pods that completed successfully:\n%s", format.Object(succeededPods, 1)))
+			}
+			if len(badPods) > 0 {
+				buffer.WriteString(fmt.Sprintf("Pods that failed and were not controlled by some controller:\n%s", format.Object(badPods, 1)))
+			}
+			if len(otherPods) > 0 {
+				buffer.WriteString(fmt.Sprintf("Pods that were neither completed nor running:\n%s", format.Object(otherPods, 1)))
+			}
+			return buffer.String()
+		}, nil
+	}))
+
+}
+
 // WaitForPodCondition waits a pods to be matched to the given condition.
 // The condition callback may use gomega.StopTrying to abort early.
 func WaitForPodCondition(ctx context.Context, c clientset.Interface, ns, podName, conditionDesc string, timeout time.Duration, condition podCondition) error {
diff --git a/test/e2e/scheduling/priorities.go b/test/e2e/scheduling/priorities.go
index 21f83d39784..636ec4da34b 100644
--- a/test/e2e/scheduling/priorities.go
+++ b/test/e2e/scheduling/priorities.go
@@ -109,7 +109,7 @@ var _ = SIGDescribe("SchedulerPriorities", framework.WithSerial(), func() {
 		err = framework.CheckTestingNSDeletedExcept(ctx, cs, ns)
 		framework.ExpectNoError(err)
 
-		err = e2epod.WaitForPodsRunningReady(ctx, cs, metav1.NamespaceSystem, int32(systemPodsNo), 0, framework.PodReadyBeforeTimeout)
+		err = e2epod.WaitForPodsRunningReady(ctx, cs, metav1.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout)
 		framework.ExpectNoError(err)
 
 		// skip if the most utilized node has less than the cri-o minMemLimit available
diff --git a/test/e2e/windows/host_process.go b/test/e2e/windows/host_process.go
index 1d794313ba4..98a3c6c45c0 100644
--- a/test/e2e/windows/host_process.go
+++ b/test/e2e/windows/host_process.go
@@ -657,7 +657,8 @@ var _ = sigDescribe(feature.WindowsHostProcessContainers, "[MinimumKubeletVersio
 		ginkgo.By("Waiting for the pod to start running")
 		timeout := 3 * time.Minute
-		e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, timeout)
+		err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, timeout)
+		framework.ExpectNoError(err)
 
 		ginkgo.By("Getting container stats for pod")
 		statsChecked := false
@@ -711,7 +712,8 @@ var _ = sigDescribe(feature.WindowsHostProcessContainers, "[MinimumKubeletVersio
 		pc.Create(ctx, pod)
 
 		ginkgo.By("Waiting for pod to run")
-		e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, 3*time.Minute)
+		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 3*time.Minute)
+		framework.ExpectNoError(err)
 
 		ginkgo.By("Waiting for 60 seconds")
 		// We wait an additional 60 seconds after the pod is Running because the
diff --git a/test/e2e/windows/hyperv.go b/test/e2e/windows/hyperv.go
index f066e24dedb..3744d6e48fa 100644
--- a/test/e2e/windows/hyperv.go
+++ b/test/e2e/windows/hyperv.go
@@ -95,7 +95,8 @@ var _ = sigDescribe(feature.WindowsHyperVContainers, "HyperV containers", skipUn
 		pc.Create(ctx, hypervPod)
 
 		ginkgo.By("waiting for the pod to be running")
 		timeout := 3 * time.Minute
-		e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, timeout)
+		err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, timeout)
+		framework.ExpectNoError(err)
 
 		ginkgo.By("creating a host process container in another pod to verify the pod is running hyperv isolated containers")
diff --git a/test/e2e/windows/kubelet_stats.go b/test/e2e/windows/kubelet_stats.go
index 3570bb01daf..64ac0a8e59e 100644
--- a/test/e2e/windows/kubelet_stats.go
+++ b/test/e2e/windows/kubelet_stats.go
@@ -60,7 +60,7 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", framework.WithSerial(), sk
 			ginkgo.By("Waiting up to 3 minutes for pods to be running")
 			timeout := 3 * time.Minute
-			err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 10, 0, timeout)
+			err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 10, timeout)
 			framework.ExpectNoError(err)
 
 			ginkgo.By("Getting kubelet stats 5 times and checking average duration")
@@ -152,7 +152,7 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", skipUnlessWindows(func() {
 			ginkgo.By("Waiting up to 3 minutes for pods to be running")
 			timeout := 3 * time.Minute
-			err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 3, 0, timeout)
+			err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 3, timeout)
 			framework.ExpectNoError(err)
 
 			ginkgo.By("Getting kubelet stats 1 time")
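
Reviewer note: the sketch below is not part of the diff. It illustrates how a call site migrates from the old six-argument WaitForPodsRunningReady to the two helpers introduced above; the function name waitExamples, the namespace, the pod counts, the not-ready budget, and the timeout values are assumptions made up for the example.

    package example

    import (
    	"context"
    	"time"

    	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    	clientset "k8s.io/client-go/kubernetes"
    	"k8s.io/kubernetes/test/e2e/framework"
    	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
    )

    // waitExamples shows both replacement helpers side by side.
    func waitExamples(ctx context.Context, c clientset.Interface) {
    	// Strict variant: fails if any pod in the namespace is Pending, or is
    	// Failed without a controlling owner; there is no not-ready budget.
    	err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, 3, 5*time.Minute)
    	framework.ExpectNoError(err)

    	// Tolerant variant, kept for suite startup: still accepts an
    	// allowedNotReadyPods budget, mirroring the old signature's behavior.
    	err = e2epod.WaitForAlmostAllPodsReady(ctx, c, metav1.NamespaceSystem, 3, 1, 5*time.Minute)
    	framework.ExpectNoError(err)
    }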