From 273cd03c017c091f95ae563c9b83c777cb9089fd Mon Sep 17 00:00:00 2001 From: Matt Karrmann Date: Tue, 2 Apr 2024 23:40:33 -0500 Subject: [PATCH 1/9] Cleanup WaitForPodsRunningReady: fail for bad pods and reword log message --- test/e2e/framework/pod/wait.go | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/test/e2e/framework/pod/wait.go b/test/e2e/framework/pod/wait.go index 4c7bab772e9..9f013156cd5 100644 --- a/test/e2e/framework/pod/wait.go +++ b/test/e2e/framework/pod/wait.go @@ -126,14 +126,12 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri Pods []v1.Pod } - // notReady is -1 for any failure other than a timeout. - // Otherwise it is the number of pods that we were still - // waiting for. - notReady := int32(-1) + nOk := int32(0) + badPods := []v1.Pod{} + otherPods := []v1.Pod{} + succeededPods := []string{} err := framework.Gomega().Eventually(ctx, framework.HandleRetry(func(ctx context.Context) (*state, error) { - // Reset notReady at the start of a poll attempt. - notReady = -1 rcList, err := c.CoreV1().ReplicationControllers(ns).List(ctx, metav1.ListOptions{}) if err != nil { @@ -163,11 +161,10 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri replicaOk += rs.Status.ReadyReplicas } - nOk := int32(0) - notReady = int32(0) - failedPods := []v1.Pod{} - otherPods := []v1.Pod{} - succeededPods := []string{} + nOk = 0 + badPods = []v1.Pod{} + otherPods = []v1.Pod{} + succeededPods = []string{} for _, pod := range s.Pods { res, err := testutils.PodRunningReady(&pod) switch { @@ -179,14 +176,13 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri case pod.Status.Phase == v1.PodFailed: // ignore failed pods that are controlled by some controller if metav1.GetControllerOf(&pod) == nil { - failedPods = append(failedPods, pod) + badPods = append(badPods, pod) } default: - notReady++ otherPods = append(otherPods, pod) } } - done := replicaOk == replicas && nOk >= minPods && (len(failedPods)+len(otherPods)) == 0 + done := replicaOk == replicas && nOk >= minPods && (len(badPods)+len(otherPods)) == 0 if done { return nil, nil } @@ -200,8 +196,8 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri if len(succeededPods) > 0 { buffer.WriteString(fmt.Sprintf("Pods that completed successfully:\n%s", format.Object(succeededPods, 1))) } - if len(failedPods) > 0 { - buffer.WriteString(fmt.Sprintf("Pods that failed and were not controlled by some controller:\n%s", format.Object(failedPods, 1))) + if len(badPods) > 0 { + buffer.WriteString(fmt.Sprintf("Pods that failed and were not controlled by some controller:\n%s", format.Object(badPods, 1))) } if len(otherPods) > 0 { buffer.WriteString(fmt.Sprintf("Pods that were neither completed nor running:\n%s", format.Object(otherPods, 1))) @@ -211,8 +207,10 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri })) // An error might not be fatal. 
- if err != nil && notReady >= 0 && notReady <= allowedNotReadyPods { - framework.Logf("Number of not-ready pods (%d) is below the allowed threshold (%d).", notReady, allowedNotReadyPods) + if len(badPods) == 0 && nOk < minPods && nOk+allowedNotReadyPods >= minPods { + framework.Logf( + "Only %d Pods, instead of the expected %d, are Ready, but this exceeds the minimum threshold of %d - %d = %d", + nOk, minPods, minPods-allowedNotReadyPods, allowedNotReadyPods, minPods) return nil } return err From 2537c10453eb705eb858b8d9ebc86e00193916dc Mon Sep 17 00:00:00 2001 From: Matt Karrmann Date: Fri, 5 Apr 2024 22:10:07 -0500 Subject: [PATCH 2/9] Improve logging and comments in WaitForPodsRunningReady --- test/e2e/framework/pod/wait.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test/e2e/framework/pod/wait.go b/test/e2e/framework/pod/wait.go index 9f013156cd5..dcf82705d70 100644 --- a/test/e2e/framework/pod/wait.go +++ b/test/e2e/framework/pod/wait.go @@ -171,10 +171,11 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri case res && err == nil: nOk++ case pod.Status.Phase == v1.PodSucceeded: - // it doesn't make sense to wait for this pod + // ignore succeeded pods succeededPods = append(succeededPods, pod.Name) case pod.Status.Phase == v1.PodFailed: // ignore failed pods that are controlled by some controller + // failed pods without a controller result in an error if metav1.GetControllerOf(&pod) == nil { badPods = append(badPods, pod) } @@ -209,10 +210,13 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri // An error might not be fatal. if len(badPods) == 0 && nOk < minPods && nOk+allowedNotReadyPods >= minPods { framework.Logf( - "Only %d Pods, instead of the expected %d, are Ready, but this exceeds the minimum threshold of %d - %d = %d", + "Only %d Pods, instead of the expected %d, are Ready, but this exceeds the minimum threshold of %d - %d = %d, so we continue without error.", nOk, minPods, minPods-allowedNotReadyPods, allowedNotReadyPods, minPods) return nil + } else if err != nil && len(badPods) != 0 { + framework.Logf("Error: %d pods failed and were not controlled by some controller.", len(badPods)) } + return err } From fe251cb7373e68fa9997ab5d6188ebf307babff7 Mon Sep 17 00:00:00 2001 From: Matt Karrmann Date: Fri, 5 Apr 2024 23:47:10 -0500 Subject: [PATCH 3/9] Explicitity set contract for WaitForPodsRunningReady, and rewrite accordingly --- test/e2e/framework/pod/wait.go | 41 ++++++++++------------------------ 1 file changed, 12 insertions(+), 29 deletions(-) diff --git a/test/e2e/framework/pod/wait.go b/test/e2e/framework/pod/wait.go index dcf82705d70..33a624c9d61 100644 --- a/test/e2e/framework/pod/wait.go +++ b/test/e2e/framework/pod/wait.go @@ -99,10 +99,14 @@ func BeInPhase(phase v1.PodPhase) types.GomegaMatcher { }).WithTemplate("Expected Pod {{.To}} be in {{format .Data}}\nGot instead:\n{{.FormattedActual}}").WithTemplateData(phase) } -// WaitForPodsRunningReady waits up to timeout to ensure that all pods in -// namespace ns are either running and ready, or failed but controlled by a -// controller. Also, it ensures that at least minPods are running and -// ready. It has separate behavior from other 'wait for' pods functions in +// WaitForPodsRunningReady waits up to timeout for the following conditions: +// 1. At least minPods Pods in Namespace ns are Running and Ready +// 2. 
No more than allowedNotReadyPods Pods in Namespace ns are not either +// Ready, Succeeded, or Failed with a Controller. +// +// # An error is returned if either of these conditions are not met within the timeout +// +// It has separate behavior from other 'wait for' pods functions in // that it requests the list of pods on every iteration. This is useful, for // example, in cluster startup, because the number of pods increases while // waiting. All pods that are in SUCCESS state are not counted. @@ -131,7 +135,7 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri otherPods := []v1.Pod{} succeededPods := []string{} - err := framework.Gomega().Eventually(ctx, framework.HandleRetry(func(ctx context.Context) (*state, error) { + return framework.Gomega().Eventually(ctx, framework.HandleRetry(func(ctx context.Context) (*state, error) { rcList, err := c.CoreV1().ReplicationControllers(ns).List(ctx, metav1.ListOptions{}) if err != nil { @@ -151,15 +155,6 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri Pods: podList.Items, }, nil })).WithTimeout(timeout).Should(framework.MakeMatcher(func(s *state) (func() string, error) { - replicas, replicaOk := int32(0), int32(0) - for _, rc := range s.ReplicationControllers { - replicas += *rc.Spec.Replicas - replicaOk += rc.Status.ReadyReplicas - } - for _, rs := range s.ReplicaSets { - replicas += *rs.Spec.Replicas - replicaOk += rs.Status.ReadyReplicas - } nOk = 0 badPods = []v1.Pod{} @@ -175,7 +170,8 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri succeededPods = append(succeededPods, pod.Name) case pod.Status.Phase == v1.PodFailed: // ignore failed pods that are controlled by some controller - // failed pods without a controller result in an error + // TODO either document why failures with controllers are allowed while + // failures without controllers are not, or remove this check if metav1.GetControllerOf(&pod) == nil { badPods = append(badPods, pod) } @@ -183,8 +179,7 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri otherPods = append(otherPods, pod) } } - done := replicaOk == replicas && nOk >= minPods && (len(badPods)+len(otherPods)) == 0 - if done { + if nOk >= minPods && len(badPods)+len(otherPods) <= int(allowedNotReadyPods) { return nil, nil } @@ -193,7 +188,6 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri var buffer strings.Builder buffer.WriteString(fmt.Sprintf("Expected all pods (need at least %d) in namespace %q to be running and ready (except for %d).\n", minPods, ns, allowedNotReadyPods)) buffer.WriteString(fmt.Sprintf("%d / %d pods were running and ready.\n", nOk, len(s.Pods))) - buffer.WriteString(fmt.Sprintf("Expected %d pod replicas, %d are Running and Ready.\n", replicas, replicaOk)) if len(succeededPods) > 0 { buffer.WriteString(fmt.Sprintf("Pods that completed successfully:\n%s", format.Object(succeededPods, 1))) } @@ -207,17 +201,6 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri }, nil })) - // An error might not be fatal. 
- if len(badPods) == 0 && nOk < minPods && nOk+allowedNotReadyPods >= minPods { - framework.Logf( - "Only %d Pods, instead of the expected %d, are Ready, but this exceeds the minimum threshold of %d - %d = %d, so we continue without error.", - nOk, minPods, minPods-allowedNotReadyPods, allowedNotReadyPods, minPods) - return nil - } else if err != nil && len(badPods) != 0 { - framework.Logf("Error: %d pods failed and were not controlled by some controller.", len(badPods)) - } - - return err } // WaitForPodCondition waits a pods to be matched to the given condition. From fcdf67a815c9ac4f14413df16cffb6f16346fd44 Mon Sep 17 00:00:00 2001 From: Matt Karrmann Date: Sat, 13 Apr 2024 18:07:09 -0500 Subject: [PATCH 4/9] Add WaitForAlmostAllPodsReady function, similar to previous WaitForPodsRunningReady function --- test/e2e/framework/pod/wait.go | 125 +++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/test/e2e/framework/pod/wait.go b/test/e2e/framework/pod/wait.go index 33a624c9d61..924f6058605 100644 --- a/test/e2e/framework/pod/wait.go +++ b/test/e2e/framework/pod/wait.go @@ -99,6 +99,131 @@ func BeInPhase(phase v1.PodPhase) types.GomegaMatcher { }).WithTemplate("Expected Pod {{.To}} be in {{format .Data}}\nGot instead:\n{{.FormattedActual}}").WithTemplateData(phase) } +// WaitForAlmostAllReady waits up to timeout for the following conditions: +// 1. At least minPods Pods in Namespace ns are Running and Ready +// 2. All Pods in Namespace ns are either Ready or Succeeded +// 3. All Pods part of a ReplicaSet or ReplicationController in Namespace ns are Ready +// +// After the timeout has elapsed, an error is returned if and only if either of +// the following conditions hold: +// 1. The number of Pods in Namespace ns that are not Succeeded or Failed is less than minPods +// 2. The number of Pods in Namespace ns that are not Ready, Succeeded, or Failed +// is greater than to allowedNotReadyPods +// +// Roughly speaking, this means that not "too many" Pods are not Ready, where the tolerance +// for "too many" is controlled by allowedNotReadyPods. +// +// It is generally recommended to use WaitForPodsRunningReady instead of this function +// whenever possible, because it is easier to understand. Similar to WaitForPodsRunningReady, +// this function requests the list of pods on every iteration, making it useful for situations +// where the set of Pods is likely changing, such as during cluster startup. +// +// If minPods or allowedNotReadyPods are -1, this method returns immediately +// without waiting. +func WaitForAlmostAllPodsReady(ctx context.Context, c clientset.Interface, ns string, minPods, allowedNotReadyPods int, timeout time.Duration) error { + if minPods == -1 || allowedNotReadyPods == -1 { + return nil + } + + // We get the new list of pods, replication controllers, and replica + // sets in every iteration because more pods come online during startup + // and we want to ensure they are also checked. + // + // This struct gets populated while polling, then gets checked, and in + // case of a timeout is included in the failure message. 
+ type state struct { + ReplicationControllers []v1.ReplicationController + ReplicaSets []appsv1.ReplicaSet + Pods []v1.Pod + } + + nOk := 0 + badPods := []v1.Pod{} + otherPods := []v1.Pod{} + succeededPods := []string{} + + err := framework.Gomega().Eventually(ctx, framework.HandleRetry(func(ctx context.Context) (*state, error) { + + rcList, err := c.CoreV1().ReplicationControllers(ns).List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, fmt.Errorf("listing replication controllers in namespace %s: %w", ns, err) + } + rsList, err := c.AppsV1().ReplicaSets(ns).List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, fmt.Errorf("listing replication sets in namespace %s: %w", ns, err) + } + podList, err := c.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, fmt.Errorf("listing pods in namespace %s: %w", ns, err) + } + return &state{ + ReplicationControllers: rcList.Items, + ReplicaSets: rsList.Items, + Pods: podList.Items, + }, nil + })).WithTimeout(timeout).Should(framework.MakeMatcher(func(s *state) (func() string, error) { + replicas, replicaOk := int32(0), int32(0) + for _, rc := range s.ReplicationControllers { + replicas += *rc.Spec.Replicas + replicaOk += rc.Status.ReadyReplicas + } + for _, rs := range s.ReplicaSets { + replicas += *rs.Spec.Replicas + replicaOk += rs.Status.ReadyReplicas + } + + nOk = 0 + badPods = []v1.Pod{} + otherPods = []v1.Pod{} + succeededPods = []string{} + for _, pod := range s.Pods { + res, err := testutils.PodRunningReady(&pod) + switch { + case res && err == nil: + nOk++ + case pod.Status.Phase == v1.PodSucceeded: + // it doesn't make sense to wait for this pod + succeededPods = append(succeededPods, pod.Name) + case pod.Status.Phase == v1.PodFailed: + // ignore failed pods that are controlled by some controller + if metav1.GetControllerOf(&pod) == nil { + badPods = append(badPods, pod) + } + default: + otherPods = append(otherPods, pod) + } + } + done := replicaOk == replicas && nOk >= minPods && (len(badPods)+len(otherPods)) == 0 + if done { + return nil, nil + } + + // Delayed formatting of a failure message. + return func() string { + var buffer strings.Builder + buffer.WriteString(fmt.Sprintf("Expected all pods (need at least %d) in namespace %q to be running and ready (except for %d).\n", minPods, ns, allowedNotReadyPods)) + buffer.WriteString(fmt.Sprintf("%d / %d pods were running and ready.\n", nOk, len(s.Pods))) + buffer.WriteString(fmt.Sprintf("Expected %d pod replicas, %d are Running and Ready.\n", replicas, replicaOk)) + if len(succeededPods) > 0 { + buffer.WriteString(fmt.Sprintf("Pods that completed successfully:\n%s", format.Object(succeededPods, 1))) + } + if len(badPods) > 0 { + buffer.WriteString(fmt.Sprintf("Pods that failed and were not controlled by some controller:\n%s", format.Object(badPods, 1))) + } + if len(otherPods) > 0 { + buffer.WriteString(fmt.Sprintf("Pods that were neither completed nor running:\n%s", format.Object(otherPods, 1))) + } + return buffer.String() + }, nil + })) + + // An error might not be fatal. + if nOk+len(otherPods) >= minPods && len(otherPods) <= allowedNotReadyPods { + return nil + } + return err +} + // WaitForPodsRunningReady waits up to timeout for the following conditions: // 1. At least minPods Pods in Namespace ns are Running and Ready // 2. 
No more than allowedNotReadyPods Pods in Namespace ns are not either From 272a055a46bf8a707f1658a76b722a0165e663a3 Mon Sep 17 00:00:00 2001 From: Matt Karrmann Date: Sat, 13 Apr 2024 18:08:05 -0500 Subject: [PATCH 5/9] Use new WaitForAlmostAllPodsReady function ins e2e setup --- test/e2e/e2e.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/e2e.go b/test/e2e/e2e.go index a37a6f13d14..262069829a8 100644 --- a/test/e2e/e2e.go +++ b/test/e2e/e2e.go @@ -226,7 +226,7 @@ func setupSuite(ctx context.Context) { // #41007. To avoid those pods preventing the whole test runs (and just // wasting the whole run), we allow for some not-ready pods (with the // number equal to the number of allowed not-ready nodes). - if err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, int32(framework.TestContext.MinStartupPods), int32(framework.TestContext.AllowedNotReadyNodes), timeouts.SystemPodsStartup); err != nil { + if err := e2epod.WaitForAlmostAllPodsReady(ctx, c, metav1.NamespaceSystem, framework.TestContext.MinStartupPods, framework.TestContext.AllowedNotReadyNodes, timeouts.SystemPodsStartup); err != nil { e2edebug.DumpAllNamespaceInfo(ctx, c, metav1.NamespaceSystem) e2ekubectl.LogFailedContainers(ctx, c, metav1.NamespaceSystem, framework.Logf) framework.Failf("Error waiting for all pods to be running and ready: %v", err) From bcf42255bbc09ce7d37dfef3b18000c238107e15 Mon Sep 17 00:00:00 2001 From: Matt Karrmann Date: Sat, 13 Apr 2024 18:52:27 -0500 Subject: [PATCH 6/9] Refactor and simplify WaitForPodsRunningReady; update callers to use new interface --- test/e2e/cloud/gcp/node_lease.go | 6 +-- test/e2e/cloud/gcp/resize_nodes.go | 6 +-- test/e2e/common/node/container_probe.go | 8 ++-- test/e2e/common/node/pods.go | 2 +- test/e2e/framework/pod/wait.go | 55 +++++-------------------- test/e2e/scheduling/priorities.go | 2 +- test/e2e/windows/host_process.go | 4 +- test/e2e/windows/hyperv.go | 2 +- test/e2e/windows/kubelet_stats.go | 4 +- 9 files changed, 28 insertions(+), 61 deletions(-) diff --git a/test/e2e/cloud/gcp/node_lease.go b/test/e2e/cloud/gcp/node_lease.go index 58aab52d89c..b0d6d8a90bf 100644 --- a/test/e2e/cloud/gcp/node_lease.go +++ b/test/e2e/cloud/gcp/node_lease.go @@ -38,7 +38,7 @@ import ( var _ = SIGDescribe(framework.WithDisruptive(), "NodeLease", func() { f := framework.NewDefaultFramework("node-lease-test") f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged - var systemPodsNo int32 + var systemPodsNo int var c clientset.Interface var ns string var group string @@ -48,7 +48,7 @@ var _ = SIGDescribe(framework.WithDisruptive(), "NodeLease", func() { ns = f.Namespace.Name systemPods, err := e2epod.GetPodsInNamespace(ctx, c, ns, map[string]string{}) framework.ExpectNoError(err) - systemPodsNo = int32(len(systemPods)) + systemPodsNo = len(systemPods) if strings.Contains(framework.TestContext.CloudConfig.NodeInstanceGroup, ",") { framework.Failf("Test dose not support cluster setup with more than one MIG: %s", framework.TestContext.CloudConfig.NodeInstanceGroup) } else { @@ -97,7 +97,7 @@ var _ = SIGDescribe(framework.WithDisruptive(), "NodeLease", func() { // Many e2e tests assume that the cluster is fully healthy before they start. Wait until // the cluster is restored to health. 
ginkgo.By("waiting for system pods to successfully restart") - err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, systemPodsNo, 0, framework.PodReadyBeforeTimeout) + err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout) framework.ExpectNoError(err) }) diff --git a/test/e2e/cloud/gcp/resize_nodes.go b/test/e2e/cloud/gcp/resize_nodes.go index 916d761fbcd..0d32f93f719 100644 --- a/test/e2e/cloud/gcp/resize_nodes.go +++ b/test/e2e/cloud/gcp/resize_nodes.go @@ -47,7 +47,7 @@ func resizeRC(ctx context.Context, c clientset.Interface, ns, name string, repli var _ = SIGDescribe("Nodes", framework.WithDisruptive(), func() { f := framework.NewDefaultFramework("resize-nodes") f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged - var systemPodsNo int32 + var systemPodsNo int var c clientset.Interface var ns string var group string @@ -57,7 +57,7 @@ var _ = SIGDescribe("Nodes", framework.WithDisruptive(), func() { ns = f.Namespace.Name systemPods, err := e2epod.GetPodsInNamespace(ctx, c, ns, map[string]string{}) framework.ExpectNoError(err) - systemPodsNo = int32(len(systemPods)) + systemPodsNo = len(systemPods) if strings.Contains(framework.TestContext.CloudConfig.NodeInstanceGroup, ",") { framework.Failf("Test dose not support cluster setup with more than one MIG: %s", framework.TestContext.CloudConfig.NodeInstanceGroup) } else { @@ -99,7 +99,7 @@ var _ = SIGDescribe("Nodes", framework.WithDisruptive(), func() { // Many e2e tests assume that the cluster is fully healthy before they start. Wait until // the cluster is restored to health. ginkgo.By("waiting for system pods to successfully restart") - err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, systemPodsNo, 0, framework.PodReadyBeforeTimeout) + err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout) framework.ExpectNoError(err) }) }) diff --git a/test/e2e/common/node/container_probe.go b/test/e2e/common/node/container_probe.go index 31122ccc581..534fc26bf97 100644 --- a/test/e2e/common/node/container_probe.go +++ b/test/e2e/common/node/container_probe.go @@ -612,7 +612,7 @@ done }) // verify pods are running and ready - err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, f.Timeouts.PodStart) + err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, f.Timeouts.PodStart) framework.ExpectNoError(err) // Shutdown pod. Readiness should change to false @@ -694,7 +694,7 @@ done }) // verify pods are running and ready - err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, f.Timeouts.PodStart) + err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, f.Timeouts.PodStart) framework.ExpectNoError(err) // Shutdown pod. Readiness should change to false @@ -1359,7 +1359,7 @@ done }) // verify pods are running and ready - err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, f.Timeouts.PodStart) + err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, f.Timeouts.PodStart) framework.ExpectNoError(err) // Shutdown pod. 
Readiness should change to false @@ -1452,7 +1452,7 @@ done }) // verify pods are running and ready - err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, f.Timeouts.PodStart) + err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, f.Timeouts.PodStart) framework.ExpectNoError(err) // Shutdown pod. Readiness should change to false diff --git a/test/e2e/common/node/pods.go b/test/e2e/common/node/pods.go index 69037c6e65f..7aa1521b81c 100644 --- a/test/e2e/common/node/pods.go +++ b/test/e2e/common/node/pods.go @@ -873,7 +873,7 @@ var _ = SIGDescribe("Pods", func() { // wait as required for all 3 pods to be running ginkgo.By("waiting for all 3 pods to be running") - err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 3, 0, f.Timeouts.PodStart) + err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 3, f.Timeouts.PodStart) framework.ExpectNoError(err, "3 pods not found running.") // delete Collection of pods with a label in the current namespace diff --git a/test/e2e/framework/pod/wait.go b/test/e2e/framework/pod/wait.go index 924f6058605..f4b43ff523a 100644 --- a/test/e2e/framework/pod/wait.go +++ b/test/e2e/framework/pod/wait.go @@ -226,66 +226,35 @@ func WaitForAlmostAllPodsReady(ctx context.Context, c clientset.Interface, ns st // WaitForPodsRunningReady waits up to timeout for the following conditions: // 1. At least minPods Pods in Namespace ns are Running and Ready -// 2. No more than allowedNotReadyPods Pods in Namespace ns are not either -// Ready, Succeeded, or Failed with a Controller. +// 2. No Pods in Namespace ns are not either Ready, Succeeded, or Failed with a Controller // -// # An error is returned if either of these conditions are not met within the timeout +// An error is returned if either of these conditions are not met within the timeout. // // It has separate behavior from other 'wait for' pods functions in // that it requests the list of pods on every iteration. This is useful, for // example, in cluster startup, because the number of pods increases while // waiting. All pods that are in SUCCESS state are not counted. -// -// If minPods or allowedNotReadyPods are -1, this method returns immediately -// without waiting. -func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns string, minPods, allowedNotReadyPods int32, timeout time.Duration) error { - if minPods == -1 || allowedNotReadyPods == -1 { - return nil - } +func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns string, minPods int, timeout time.Duration) error { - // We get the new list of pods, replication controllers, and replica - // sets in every iteration because more pods come online during startup - // and we want to ensure they are also checked. - // - // This struct gets populated while polling, then gets checked, and in - // case of a timeout is included in the failure message. 
- type state struct { - ReplicationControllers []v1.ReplicationController - ReplicaSets []appsv1.ReplicaSet - Pods []v1.Pod - } - - nOk := int32(0) + nOk := 0 badPods := []v1.Pod{} otherPods := []v1.Pod{} succeededPods := []string{} - return framework.Gomega().Eventually(ctx, framework.HandleRetry(func(ctx context.Context) (*state, error) { + return framework.Gomega().Eventually(ctx, framework.HandleRetry(func(ctx context.Context) ([]v1.Pod, error) { - rcList, err := c.CoreV1().ReplicationControllers(ns).List(ctx, metav1.ListOptions{}) - if err != nil { - return nil, fmt.Errorf("listing replication controllers in namespace %s: %w", ns, err) - } - rsList, err := c.AppsV1().ReplicaSets(ns).List(ctx, metav1.ListOptions{}) - if err != nil { - return nil, fmt.Errorf("listing replication sets in namespace %s: %w", ns, err) - } podList, err := c.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{}) if err != nil { return nil, fmt.Errorf("listing pods in namespace %s: %w", ns, err) } - return &state{ - ReplicationControllers: rcList.Items, - ReplicaSets: rsList.Items, - Pods: podList.Items, - }, nil - })).WithTimeout(timeout).Should(framework.MakeMatcher(func(s *state) (func() string, error) { + return podList.Items, nil + })).WithTimeout(timeout).Should(framework.MakeMatcher(func(pods []v1.Pod) (func() string, error) { nOk = 0 badPods = []v1.Pod{} otherPods = []v1.Pod{} succeededPods = []string{} - for _, pod := range s.Pods { + for _, pod := range pods { res, err := testutils.PodRunningReady(&pod) switch { case res && err == nil: @@ -295,8 +264,6 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri succeededPods = append(succeededPods, pod.Name) case pod.Status.Phase == v1.PodFailed: // ignore failed pods that are controlled by some controller - // TODO either document why failures with controllers are allowed while - // failures without controllers are not, or remove this check if metav1.GetControllerOf(&pod) == nil { badPods = append(badPods, pod) } @@ -304,15 +271,15 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri otherPods = append(otherPods, pod) } } - if nOk >= minPods && len(badPods)+len(otherPods) <= int(allowedNotReadyPods) { + if nOk >= minPods && len(badPods)+len(otherPods) <= 0 { return nil, nil } // Delayed formatting of a failure message. 
return func() string { var buffer strings.Builder - buffer.WriteString(fmt.Sprintf("Expected all pods (need at least %d) in namespace %q to be running and ready (except for %d).\n", minPods, ns, allowedNotReadyPods)) - buffer.WriteString(fmt.Sprintf("%d / %d pods were running and ready.\n", nOk, len(s.Pods))) + buffer.WriteString(fmt.Sprintf("Expected all pods (need at least %d) in namespace %q to be running and ready \n", minPods, ns)) + buffer.WriteString(fmt.Sprintf("%d / %d pods were running and ready.\n", nOk, len(pods))) if len(succeededPods) > 0 { buffer.WriteString(fmt.Sprintf("Pods that completed successfully:\n%s", format.Object(succeededPods, 1))) } diff --git a/test/e2e/scheduling/priorities.go b/test/e2e/scheduling/priorities.go index 21f83d39784..636ec4da34b 100644 --- a/test/e2e/scheduling/priorities.go +++ b/test/e2e/scheduling/priorities.go @@ -109,7 +109,7 @@ var _ = SIGDescribe("SchedulerPriorities", framework.WithSerial(), func() { err = framework.CheckTestingNSDeletedExcept(ctx, cs, ns) framework.ExpectNoError(err) - err = e2epod.WaitForPodsRunningReady(ctx, cs, metav1.NamespaceSystem, int32(systemPodsNo), 0, framework.PodReadyBeforeTimeout) + err = e2epod.WaitForPodsRunningReady(ctx, cs, metav1.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout) framework.ExpectNoError(err) // skip if the most utilized node has less than the cri-o minMemLimit available diff --git a/test/e2e/windows/host_process.go b/test/e2e/windows/host_process.go index 1d794313ba4..4af3c2c79c2 100644 --- a/test/e2e/windows/host_process.go +++ b/test/e2e/windows/host_process.go @@ -657,7 +657,7 @@ var _ = sigDescribe(feature.WindowsHostProcessContainers, "[MinimumKubeletVersio ginkgo.By("Waiting for the pod to start running") timeout := 3 * time.Minute - e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, timeout) + e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, timeout) ginkgo.By("Getting container stats for pod") statsChecked := false @@ -711,7 +711,7 @@ var _ = sigDescribe(feature.WindowsHostProcessContainers, "[MinimumKubeletVersio pc.Create(ctx, pod) ginkgo.By("Waiting for pod to run") - e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, 3*time.Minute) + e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 3*time.Minute) ginkgo.By("Waiting for 60 seconds") // We wait an additional 60 seconds after the pod is Running because the diff --git a/test/e2e/windows/hyperv.go b/test/e2e/windows/hyperv.go index f066e24dedb..1b304936f35 100644 --- a/test/e2e/windows/hyperv.go +++ b/test/e2e/windows/hyperv.go @@ -95,7 +95,7 @@ var _ = sigDescribe(feature.WindowsHyperVContainers, "HyperV containers", skipUn pc.Create(ctx, hypervPod) ginkgo.By("waiting for the pod to be running") timeout := 3 * time.Minute - e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, timeout) + e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, timeout) ginkgo.By("creating a host process container in another pod to verify the pod is running hyperv isolated containers") diff --git a/test/e2e/windows/kubelet_stats.go b/test/e2e/windows/kubelet_stats.go index 3570bb01daf..64ac0a8e59e 100644 --- a/test/e2e/windows/kubelet_stats.go +++ b/test/e2e/windows/kubelet_stats.go @@ -60,7 +60,7 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", framework.WithSerial(), sk ginkgo.By("Waiting up to 3 minutes for pods to be running") timeout := 3 * time.Minute - err = e2epod.WaitForPodsRunningReady(ctx, 
f.ClientSet, f.Namespace.Name, 10, 0, timeout) + err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 10, timeout) framework.ExpectNoError(err) ginkgo.By("Getting kubelet stats 5 times and checking average duration") @@ -152,7 +152,7 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", skipUnlessWindows(func() { ginkgo.By("Waiting up to 3 minutes for pods to be running") timeout := 3 * time.Minute - err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 3, 0, timeout) + err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 3, timeout) framework.ExpectNoError(err) ginkgo.By("Getting kubelet stats 1 time") From 3476833367a7507e5d86a71fb6ea77df1b81055e Mon Sep 17 00:00:00 2001 From: Matt Karrmann Date: Sat, 13 Apr 2024 19:52:12 -0500 Subject: [PATCH 7/9] Check err in Windows e2e tests --- test/e2e/windows/host_process.go | 6 ++++-- test/e2e/windows/hyperv.go | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/test/e2e/windows/host_process.go b/test/e2e/windows/host_process.go index 4af3c2c79c2..98a3c6c45c0 100644 --- a/test/e2e/windows/host_process.go +++ b/test/e2e/windows/host_process.go @@ -657,7 +657,8 @@ var _ = sigDescribe(feature.WindowsHostProcessContainers, "[MinimumKubeletVersio ginkgo.By("Waiting for the pod to start running") timeout := 3 * time.Minute - e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, timeout) + err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, timeout) + framework.ExpectNoError(err) ginkgo.By("Getting container stats for pod") statsChecked := false @@ -711,7 +712,8 @@ var _ = sigDescribe(feature.WindowsHostProcessContainers, "[MinimumKubeletVersio pc.Create(ctx, pod) ginkgo.By("Waiting for pod to run") - e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 3*time.Minute) + err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 3*time.Minute) + framework.ExpectNoError(err) ginkgo.By("Waiting for 60 seconds") // We wait an additional 60 seconds after the pod is Running because the diff --git a/test/e2e/windows/hyperv.go b/test/e2e/windows/hyperv.go index 1b304936f35..3744d6e48fa 100644 --- a/test/e2e/windows/hyperv.go +++ b/test/e2e/windows/hyperv.go @@ -95,7 +95,8 @@ var _ = sigDescribe(feature.WindowsHyperVContainers, "HyperV containers", skipUn pc.Create(ctx, hypervPod) ginkgo.By("waiting for the pod to be running") timeout := 3 * time.Minute - e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, timeout) + err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, timeout) + framework.ExpectNoError(err) ginkgo.By("creating a host process container in another pod to verify the pod is running hyperv isolated containers") From 62b9e832ccd814e8ec47b0baa895fa7b21137729 Mon Sep 17 00:00:00 2001 From: Matt Karrmann Date: Tue, 16 Apr 2024 22:43:09 -0500 Subject: [PATCH 8/9] Small clean and comment rewording of WaitForPodsRunningReady --- test/e2e/framework/pod/wait.go | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/test/e2e/framework/pod/wait.go b/test/e2e/framework/pod/wait.go index f4b43ff523a..bdb68513883 100644 --- a/test/e2e/framework/pod/wait.go +++ b/test/e2e/framework/pod/wait.go @@ -226,7 +226,7 @@ func WaitForAlmostAllPodsReady(ctx context.Context, c clientset.Interface, ns st // WaitForPodsRunningReady waits up to timeout for the following conditions: // 1. At least minPods Pods in Namespace ns are Running and Ready -// 2. 
No Pods in Namespace ns are not either Ready, Succeeded, or Failed with a Controller +// 2. No Pods in Namespace ns are Failed and not owned by a controller or Pending // // An error is returned if either of these conditions are not met within the timeout. // @@ -236,11 +236,6 @@ func WaitForAlmostAllPodsReady(ctx context.Context, c clientset.Interface, ns st // waiting. All pods that are in SUCCESS state are not counted. func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns string, minPods int, timeout time.Duration) error { - nOk := 0 - badPods := []v1.Pod{} - otherPods := []v1.Pod{} - succeededPods := []string{} - return framework.Gomega().Eventually(ctx, framework.HandleRetry(func(ctx context.Context) ([]v1.Pod, error) { podList, err := c.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{}) @@ -250,10 +245,11 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri return podList.Items, nil })).WithTimeout(timeout).Should(framework.MakeMatcher(func(pods []v1.Pod) (func() string, error) { - nOk = 0 - badPods = []v1.Pod{} - otherPods = []v1.Pod{} - succeededPods = []string{} + nOk := 0 + badPods := []v1.Pod{} + otherPods := []v1.Pod{} + succeededPods := []string{} + for _, pod := range pods { res, err := testutils.PodRunningReady(&pod) switch { @@ -271,7 +267,7 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri otherPods = append(otherPods, pod) } } - if nOk >= minPods && len(badPods)+len(otherPods) <= 0 { + if nOk >= minPods && len(badPods)+len(otherPods) == 0 { return nil, nil } From 5b3f48d263246f9b7a445b3066422b3ef69a2817 Mon Sep 17 00:00:00 2001 From: Matt Karrmann Date: Wed, 17 Apr 2024 18:38:16 -0500 Subject: [PATCH 9/9] Revert check at end of WaitForAlmostAllPodsReady to only consider Pending pods --- test/e2e/framework/pod/wait.go | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/test/e2e/framework/pod/wait.go b/test/e2e/framework/pod/wait.go index bdb68513883..6d2cfe9865a 100644 --- a/test/e2e/framework/pod/wait.go +++ b/test/e2e/framework/pod/wait.go @@ -104,17 +104,11 @@ func BeInPhase(phase v1.PodPhase) types.GomegaMatcher { // 2. All Pods in Namespace ns are either Ready or Succeeded // 3. All Pods part of a ReplicaSet or ReplicationController in Namespace ns are Ready // -// After the timeout has elapsed, an error is returned if and only if either of -// the following conditions hold: -// 1. The number of Pods in Namespace ns that are not Succeeded or Failed is less than minPods -// 2. The number of Pods in Namespace ns that are not Ready, Succeeded, or Failed -// is greater than to allowedNotReadyPods -// -// Roughly speaking, this means that not "too many" Pods are not Ready, where the tolerance -// for "too many" is controlled by allowedNotReadyPods. +// After the timeout has elapsed, an error is returned if the number of Pods in a Pending Phase +// is greater than allowedNotReadyPods. // // It is generally recommended to use WaitForPodsRunningReady instead of this function -// whenever possible, because it is easier to understand. Similar to WaitForPodsRunningReady, +// whenever possible, because its behavior is more intuitive. Similar to WaitForPodsRunningReady, // this function requests the list of pods on every iteration, making it useful for situations // where the set of Pods is likely changing, such as during cluster startup. 
// @@ -218,7 +212,7 @@ func WaitForAlmostAllPodsReady(ctx context.Context, c clientset.Interface, ns st })) // An error might not be fatal. - if nOk+len(otherPods) >= minPods && len(otherPods) <= allowedNotReadyPods { + if len(otherPods) <= allowedNotReadyPods { return nil } return err
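
For reference, the net effect of this series on callers can be summarized in a short sketch. This is a minimal illustration, not part of the patches: the helper names and signatures are taken from the diffs above, while the package name and wrapper functions are assumptions added only for readability.

package example

import (
	"context"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	clientset "k8s.io/client-go/kubernetes"

	"k8s.io/kubernetes/test/e2e/framework"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
)

// waitForSystemPods mirrors the setupSuite usage from patch 5: during cluster
// startup a bounded number of not-ready system pods is tolerated, so the
// lenient WaitForAlmostAllPodsReady helper keeps the allowedNotReadyPods knob.
func waitForSystemPods(ctx context.Context, c clientset.Interface, minStartupPods, allowedNotReadyPods int, timeout time.Duration) error {
	return e2epod.WaitForAlmostAllPodsReady(ctx, c, metav1.NamespaceSystem, minStartupPods, allowedNotReadyPods, timeout)
}

// waitForTestPods mirrors the per-test usage from patches 6 and 7: every pod in
// the test namespace must become Running and Ready (or Succeeded, or Failed
// under a controller), with no tolerance for stragglers, using the simplified
// signature that drops allowedNotReadyPods.
func waitForTestPods(ctx context.Context, c clientset.Interface, ns string, minPods int, timeout time.Duration) {
	err := e2epod.WaitForPodsRunningReady(ctx, c, ns, minPods, timeout)
	framework.ExpectNoError(err)
}

The lenient helper is retained for the startup path only; every other call site in the series switches to the simpler contract documented in patch 8.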