From 273cd03c017c091f95ae563c9b83c777cb9089fd Mon Sep 17 00:00:00 2001 From: Matt Karrmann Date: Tue, 2 Apr 2024 23:40:33 -0500 Subject: [PATCH 1/9] Cleanup WaitForPodsRunningReady: fail for bad pods and reword log message --- test/e2e/framework/pod/wait.go | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/test/e2e/framework/pod/wait.go b/test/e2e/framework/pod/wait.go index 4c7bab772e9..9f013156cd5 100644 --- a/test/e2e/framework/pod/wait.go +++ b/test/e2e/framework/pod/wait.go @@ -126,14 +126,12 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri Pods []v1.Pod } - // notReady is -1 for any failure other than a timeout. - // Otherwise it is the number of pods that we were still - // waiting for. - notReady := int32(-1) + nOk := int32(0) + badPods := []v1.Pod{} + otherPods := []v1.Pod{} + succeededPods := []string{} err := framework.Gomega().Eventually(ctx, framework.HandleRetry(func(ctx context.Context) (*state, error) { - // Reset notReady at the start of a poll attempt. - notReady = -1 rcList, err := c.CoreV1().ReplicationControllers(ns).List(ctx, metav1.ListOptions{}) if err != nil { @@ -163,11 +161,10 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri replicaOk += rs.Status.ReadyReplicas } - nOk := int32(0) - notReady = int32(0) - failedPods := []v1.Pod{} - otherPods := []v1.Pod{} - succeededPods := []string{} + nOk = 0 + badPods = []v1.Pod{} + otherPods = []v1.Pod{} + succeededPods = []string{} for _, pod := range s.Pods { res, err := testutils.PodRunningReady(&pod) switch { @@ -179,14 +176,13 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri case pod.Status.Phase == v1.PodFailed: // ignore failed pods that are controlled by some controller if metav1.GetControllerOf(&pod) == nil { - failedPods = append(failedPods, pod) + badPods = append(badPods, pod) } default: - notReady++ otherPods = append(otherPods, pod) } } - done := replicaOk == replicas && nOk >= minPods && (len(failedPods)+len(otherPods)) == 0 + done := replicaOk == replicas && nOk >= minPods && (len(badPods)+len(otherPods)) == 0 if done { return nil, nil } @@ -200,8 +196,8 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri if len(succeededPods) > 0 { buffer.WriteString(fmt.Sprintf("Pods that completed successfully:\n%s", format.Object(succeededPods, 1))) } - if len(failedPods) > 0 { - buffer.WriteString(fmt.Sprintf("Pods that failed and were not controlled by some controller:\n%s", format.Object(failedPods, 1))) + if len(badPods) > 0 { + buffer.WriteString(fmt.Sprintf("Pods that failed and were not controlled by some controller:\n%s", format.Object(badPods, 1))) } if len(otherPods) > 0 { buffer.WriteString(fmt.Sprintf("Pods that were neither completed nor running:\n%s", format.Object(otherPods, 1))) @@ -211,8 +207,10 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri })) // An error might not be fatal. 
- if err != nil && notReady >= 0 && notReady <= allowedNotReadyPods { - framework.Logf("Number of not-ready pods (%d) is below the allowed threshold (%d).", notReady, allowedNotReadyPods) + if len(badPods) == 0 && nOk < minPods && nOk+allowedNotReadyPods >= minPods { + framework.Logf( + "Only %d Pods, instead of the expected %d, are Ready, but this exceeds the minimum threshold of %d - %d = %d", + nOk, minPods, minPods-allowedNotReadyPods, allowedNotReadyPods, minPods) return nil } return err From 2537c10453eb705eb858b8d9ebc86e00193916dc Mon Sep 17 00:00:00 2001 From: Matt Karrmann Date: Fri, 5 Apr 2024 22:10:07 -0500 Subject: [PATCH 2/9] Improve logging and comments in WaitForPodsRunningReady --- test/e2e/framework/pod/wait.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test/e2e/framework/pod/wait.go b/test/e2e/framework/pod/wait.go index 9f013156cd5..dcf82705d70 100644 --- a/test/e2e/framework/pod/wait.go +++ b/test/e2e/framework/pod/wait.go @@ -171,10 +171,11 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri case res && err == nil: nOk++ case pod.Status.Phase == v1.PodSucceeded: - // it doesn't make sense to wait for this pod + // ignore succeeded pods succeededPods = append(succeededPods, pod.Name) case pod.Status.Phase == v1.PodFailed: // ignore failed pods that are controlled by some controller + // failed pods without a controller result in an error if metav1.GetControllerOf(&pod) == nil { badPods = append(badPods, pod) } @@ -209,10 +210,13 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri // An error might not be fatal. if len(badPods) == 0 && nOk < minPods && nOk+allowedNotReadyPods >= minPods { framework.Logf( - "Only %d Pods, instead of the expected %d, are Ready, but this exceeds the minimum threshold of %d - %d = %d", + "Only %d Pods, instead of the expected %d, are Ready, but this exceeds the minimum threshold of %d - %d = %d, so we continue without error.", nOk, minPods, minPods-allowedNotReadyPods, allowedNotReadyPods, minPods) return nil + } else if err != nil && len(badPods) != 0 { + framework.Logf("Error: %d pods failed and were not controlled by some controller.", len(badPods)) } + return err } From fe251cb7373e68fa9997ab5d6188ebf307babff7 Mon Sep 17 00:00:00 2001 From: Matt Karrmann Date: Fri, 5 Apr 2024 23:47:10 -0500 Subject: [PATCH 3/9] Explicitity set contract for WaitForPodsRunningReady, and rewrite accordingly --- test/e2e/framework/pod/wait.go | 41 ++++++++++------------------------ 1 file changed, 12 insertions(+), 29 deletions(-) diff --git a/test/e2e/framework/pod/wait.go b/test/e2e/framework/pod/wait.go index dcf82705d70..33a624c9d61 100644 --- a/test/e2e/framework/pod/wait.go +++ b/test/e2e/framework/pod/wait.go @@ -99,10 +99,14 @@ func BeInPhase(phase v1.PodPhase) types.GomegaMatcher { }).WithTemplate("Expected Pod {{.To}} be in {{format .Data}}\nGot instead:\n{{.FormattedActual}}").WithTemplateData(phase) } -// WaitForPodsRunningReady waits up to timeout to ensure that all pods in -// namespace ns are either running and ready, or failed but controlled by a -// controller. Also, it ensures that at least minPods are running and -// ready. It has separate behavior from other 'wait for' pods functions in +// WaitForPodsRunningReady waits up to timeout for the following conditions: +// 1. At least minPods Pods in Namespace ns are Running and Ready +// 2. 
No more than allowedNotReadyPods Pods in Namespace ns are not either +// Ready, Succeeded, or Failed with a Controller. +// +// # An error is returned if either of these conditions are not met within the timeout +// +// It has separate behavior from other 'wait for' pods functions in // that it requests the list of pods on every iteration. This is useful, for // example, in cluster startup, because the number of pods increases while // waiting. All pods that are in SUCCESS state are not counted. @@ -131,7 +135,7 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri otherPods := []v1.Pod{} succeededPods := []string{} - err := framework.Gomega().Eventually(ctx, framework.HandleRetry(func(ctx context.Context) (*state, error) { + return framework.Gomega().Eventually(ctx, framework.HandleRetry(func(ctx context.Context) (*state, error) { rcList, err := c.CoreV1().ReplicationControllers(ns).List(ctx, metav1.ListOptions{}) if err != nil { @@ -151,15 +155,6 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri Pods: podList.Items, }, nil })).WithTimeout(timeout).Should(framework.MakeMatcher(func(s *state) (func() string, error) { - replicas, replicaOk := int32(0), int32(0) - for _, rc := range s.ReplicationControllers { - replicas += *rc.Spec.Replicas - replicaOk += rc.Status.ReadyReplicas - } - for _, rs := range s.ReplicaSets { - replicas += *rs.Spec.Replicas - replicaOk += rs.Status.ReadyReplicas - } nOk = 0 badPods = []v1.Pod{} @@ -175,7 +170,8 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri succeededPods = append(succeededPods, pod.Name) case pod.Status.Phase == v1.PodFailed: // ignore failed pods that are controlled by some controller - // failed pods without a controller result in an error + // TODO either document why failures with controllers are allowed while + // failures without controllers are not, or remove this check if metav1.GetControllerOf(&pod) == nil { badPods = append(badPods, pod) } @@ -183,8 +179,7 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri otherPods = append(otherPods, pod) } } - done := replicaOk == replicas && nOk >= minPods && (len(badPods)+len(otherPods)) == 0 - if done { + if nOk >= minPods && len(badPods)+len(otherPods) <= int(allowedNotReadyPods) { return nil, nil } @@ -193,7 +188,6 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri var buffer strings.Builder buffer.WriteString(fmt.Sprintf("Expected all pods (need at least %d) in namespace %q to be running and ready (except for %d).\n", minPods, ns, allowedNotReadyPods)) buffer.WriteString(fmt.Sprintf("%d / %d pods were running and ready.\n", nOk, len(s.Pods))) - buffer.WriteString(fmt.Sprintf("Expected %d pod replicas, %d are Running and Ready.\n", replicas, replicaOk)) if len(succeededPods) > 0 { buffer.WriteString(fmt.Sprintf("Pods that completed successfully:\n%s", format.Object(succeededPods, 1))) } @@ -207,17 +201,6 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri }, nil })) - // An error might not be fatal. 
- if len(badPods) == 0 && nOk < minPods && nOk+allowedNotReadyPods >= minPods { - framework.Logf( - "Only %d Pods, instead of the expected %d, are Ready, but this exceeds the minimum threshold of %d - %d = %d, so we continue without error.", - nOk, minPods, minPods-allowedNotReadyPods, allowedNotReadyPods, minPods) - return nil - } else if err != nil && len(badPods) != 0 { - framework.Logf("Error: %d pods failed and were not controlled by some controller.", len(badPods)) - } - - return err } // WaitForPodCondition waits a pods to be matched to the given condition. From fcdf67a815c9ac4f14413df16cffb6f16346fd44 Mon Sep 17 00:00:00 2001 From: Matt Karrmann Date: Sat, 13 Apr 2024 18:07:09 -0500 Subject: [PATCH 4/9] Add WaitForAlmostAllPodsReady function, similar to previous WaitForPodsRunningReady function --- test/e2e/framework/pod/wait.go | 125 +++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/test/e2e/framework/pod/wait.go b/test/e2e/framework/pod/wait.go index 33a624c9d61..924f6058605 100644 --- a/test/e2e/framework/pod/wait.go +++ b/test/e2e/framework/pod/wait.go @@ -99,6 +99,131 @@ func BeInPhase(phase v1.PodPhase) types.GomegaMatcher { }).WithTemplate("Expected Pod {{.To}} be in {{format .Data}}\nGot instead:\n{{.FormattedActual}}").WithTemplateData(phase) } +// WaitForAlmostAllReady waits up to timeout for the following conditions: +// 1. At least minPods Pods in Namespace ns are Running and Ready +// 2. All Pods in Namespace ns are either Ready or Succeeded +// 3. All Pods part of a ReplicaSet or ReplicationController in Namespace ns are Ready +// +// After the timeout has elapsed, an error is returned if and only if either of +// the following conditions hold: +// 1. The number of Pods in Namespace ns that are not Succeeded or Failed is less than minPods +// 2. The number of Pods in Namespace ns that are not Ready, Succeeded, or Failed +// is greater than to allowedNotReadyPods +// +// Roughly speaking, this means that not "too many" Pods are not Ready, where the tolerance +// for "too many" is controlled by allowedNotReadyPods. +// +// It is generally recommended to use WaitForPodsRunningReady instead of this function +// whenever possible, because it is easier to understand. Similar to WaitForPodsRunningReady, +// this function requests the list of pods on every iteration, making it useful for situations +// where the set of Pods is likely changing, such as during cluster startup. +// +// If minPods or allowedNotReadyPods are -1, this method returns immediately +// without waiting. +func WaitForAlmostAllPodsReady(ctx context.Context, c clientset.Interface, ns string, minPods, allowedNotReadyPods int, timeout time.Duration) error { + if minPods == -1 || allowedNotReadyPods == -1 { + return nil + } + + // We get the new list of pods, replication controllers, and replica + // sets in every iteration because more pods come online during startup + // and we want to ensure they are also checked. + // + // This struct gets populated while polling, then gets checked, and in + // case of a timeout is included in the failure message. 
+ type state struct { + ReplicationControllers []v1.ReplicationController + ReplicaSets []appsv1.ReplicaSet + Pods []v1.Pod + } + + nOk := 0 + badPods := []v1.Pod{} + otherPods := []v1.Pod{} + succeededPods := []string{} + + err := framework.Gomega().Eventually(ctx, framework.HandleRetry(func(ctx context.Context) (*state, error) { + + rcList, err := c.CoreV1().ReplicationControllers(ns).List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, fmt.Errorf("listing replication controllers in namespace %s: %w", ns, err) + } + rsList, err := c.AppsV1().ReplicaSets(ns).List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, fmt.Errorf("listing replication sets in namespace %s: %w", ns, err) + } + podList, err := c.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, fmt.Errorf("listing pods in namespace %s: %w", ns, err) + } + return &state{ + ReplicationControllers: rcList.Items, + ReplicaSets: rsList.Items, + Pods: podList.Items, + }, nil + })).WithTimeout(timeout).Should(framework.MakeMatcher(func(s *state) (func() string, error) { + replicas, replicaOk := int32(0), int32(0) + for _, rc := range s.ReplicationControllers { + replicas += *rc.Spec.Replicas + replicaOk += rc.Status.ReadyReplicas + } + for _, rs := range s.ReplicaSets { + replicas += *rs.Spec.Replicas + replicaOk += rs.Status.ReadyReplicas + } + + nOk = 0 + badPods = []v1.Pod{} + otherPods = []v1.Pod{} + succeededPods = []string{} + for _, pod := range s.Pods { + res, err := testutils.PodRunningReady(&pod) + switch { + case res && err == nil: + nOk++ + case pod.Status.Phase == v1.PodSucceeded: + // it doesn't make sense to wait for this pod + succeededPods = append(succeededPods, pod.Name) + case pod.Status.Phase == v1.PodFailed: + // ignore failed pods that are controlled by some controller + if metav1.GetControllerOf(&pod) == nil { + badPods = append(badPods, pod) + } + default: + otherPods = append(otherPods, pod) + } + } + done := replicaOk == replicas && nOk >= minPods && (len(badPods)+len(otherPods)) == 0 + if done { + return nil, nil + } + + // Delayed formatting of a failure message. + return func() string { + var buffer strings.Builder + buffer.WriteString(fmt.Sprintf("Expected all pods (need at least %d) in namespace %q to be running and ready (except for %d).\n", minPods, ns, allowedNotReadyPods)) + buffer.WriteString(fmt.Sprintf("%d / %d pods were running and ready.\n", nOk, len(s.Pods))) + buffer.WriteString(fmt.Sprintf("Expected %d pod replicas, %d are Running and Ready.\n", replicas, replicaOk)) + if len(succeededPods) > 0 { + buffer.WriteString(fmt.Sprintf("Pods that completed successfully:\n%s", format.Object(succeededPods, 1))) + } + if len(badPods) > 0 { + buffer.WriteString(fmt.Sprintf("Pods that failed and were not controlled by some controller:\n%s", format.Object(badPods, 1))) + } + if len(otherPods) > 0 { + buffer.WriteString(fmt.Sprintf("Pods that were neither completed nor running:\n%s", format.Object(otherPods, 1))) + } + return buffer.String() + }, nil + })) + + // An error might not be fatal. + if nOk+len(otherPods) >= minPods && len(otherPods) <= allowedNotReadyPods { + return nil + } + return err +} + // WaitForPodsRunningReady waits up to timeout for the following conditions: // 1. At least minPods Pods in Namespace ns are Running and Ready // 2. 
No more than allowedNotReadyPods Pods in Namespace ns are not either From 272a055a46bf8a707f1658a76b722a0165e663a3 Mon Sep 17 00:00:00 2001 From: Matt Karrmann Date: Sat, 13 Apr 2024 18:08:05 -0500 Subject: [PATCH 5/9] Use new WaitForAlmostAllPodsReady function ins e2e setup --- test/e2e/e2e.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/e2e.go b/test/e2e/e2e.go index a37a6f13d14..262069829a8 100644 --- a/test/e2e/e2e.go +++ b/test/e2e/e2e.go @@ -226,7 +226,7 @@ func setupSuite(ctx context.Context) { // #41007. To avoid those pods preventing the whole test runs (and just // wasting the whole run), we allow for some not-ready pods (with the // number equal to the number of allowed not-ready nodes). - if err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, int32(framework.TestContext.MinStartupPods), int32(framework.TestContext.AllowedNotReadyNodes), timeouts.SystemPodsStartup); err != nil { + if err := e2epod.WaitForAlmostAllPodsReady(ctx, c, metav1.NamespaceSystem, framework.TestContext.MinStartupPods, framework.TestContext.AllowedNotReadyNodes, timeouts.SystemPodsStartup); err != nil { e2edebug.DumpAllNamespaceInfo(ctx, c, metav1.NamespaceSystem) e2ekubectl.LogFailedContainers(ctx, c, metav1.NamespaceSystem, framework.Logf) framework.Failf("Error waiting for all pods to be running and ready: %v", err) From bcf42255bbc09ce7d37dfef3b18000c238107e15 Mon Sep 17 00:00:00 2001 From: Matt Karrmann Date: Sat, 13 Apr 2024 18:52:27 -0500 Subject: [PATCH 6/9] Refactor and simplify WaitForPodsRunningReady; update callers to use new interface --- test/e2e/cloud/gcp/node_lease.go | 6 +-- test/e2e/cloud/gcp/resize_nodes.go | 6 +-- test/e2e/common/node/container_probe.go | 8 ++-- test/e2e/common/node/pods.go | 2 +- test/e2e/framework/pod/wait.go | 55 +++++-------------------- test/e2e/scheduling/priorities.go | 2 +- test/e2e/windows/host_process.go | 4 +- test/e2e/windows/hyperv.go | 2 +- test/e2e/windows/kubelet_stats.go | 4 +- 9 files changed, 28 insertions(+), 61 deletions(-) diff --git a/test/e2e/cloud/gcp/node_lease.go b/test/e2e/cloud/gcp/node_lease.go index 58aab52d89c..b0d6d8a90bf 100644 --- a/test/e2e/cloud/gcp/node_lease.go +++ b/test/e2e/cloud/gcp/node_lease.go @@ -38,7 +38,7 @@ import ( var _ = SIGDescribe(framework.WithDisruptive(), "NodeLease", func() { f := framework.NewDefaultFramework("node-lease-test") f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged - var systemPodsNo int32 + var systemPodsNo int var c clientset.Interface var ns string var group string @@ -48,7 +48,7 @@ var _ = SIGDescribe(framework.WithDisruptive(), "NodeLease", func() { ns = f.Namespace.Name systemPods, err := e2epod.GetPodsInNamespace(ctx, c, ns, map[string]string{}) framework.ExpectNoError(err) - systemPodsNo = int32(len(systemPods)) + systemPodsNo = len(systemPods) if strings.Contains(framework.TestContext.CloudConfig.NodeInstanceGroup, ",") { framework.Failf("Test dose not support cluster setup with more than one MIG: %s", framework.TestContext.CloudConfig.NodeInstanceGroup) } else { @@ -97,7 +97,7 @@ var _ = SIGDescribe(framework.WithDisruptive(), "NodeLease", func() { // Many e2e tests assume that the cluster is fully healthy before they start. Wait until // the cluster is restored to health. 
ginkgo.By("waiting for system pods to successfully restart") - err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, systemPodsNo, 0, framework.PodReadyBeforeTimeout) + err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout) framework.ExpectNoError(err) }) diff --git a/test/e2e/cloud/gcp/resize_nodes.go b/test/e2e/cloud/gcp/resize_nodes.go index 916d761fbcd..0d32f93f719 100644 --- a/test/e2e/cloud/gcp/resize_nodes.go +++ b/test/e2e/cloud/gcp/resize_nodes.go @@ -47,7 +47,7 @@ func resizeRC(ctx context.Context, c clientset.Interface, ns, name string, repli var _ = SIGDescribe("Nodes", framework.WithDisruptive(), func() { f := framework.NewDefaultFramework("resize-nodes") f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged - var systemPodsNo int32 + var systemPodsNo int var c clientset.Interface var ns string var group string @@ -57,7 +57,7 @@ var _ = SIGDescribe("Nodes", framework.WithDisruptive(), func() { ns = f.Namespace.Name systemPods, err := e2epod.GetPodsInNamespace(ctx, c, ns, map[string]string{}) framework.ExpectNoError(err) - systemPodsNo = int32(len(systemPods)) + systemPodsNo = len(systemPods) if strings.Contains(framework.TestContext.CloudConfig.NodeInstanceGroup, ",") { framework.Failf("Test dose not support cluster setup with more than one MIG: %s", framework.TestContext.CloudConfig.NodeInstanceGroup) } else { @@ -99,7 +99,7 @@ var _ = SIGDescribe("Nodes", framework.WithDisruptive(), func() { // Many e2e tests assume that the cluster is fully healthy before they start. Wait until // the cluster is restored to health. ginkgo.By("waiting for system pods to successfully restart") - err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, systemPodsNo, 0, framework.PodReadyBeforeTimeout) + err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout) framework.ExpectNoError(err) }) }) diff --git a/test/e2e/common/node/container_probe.go b/test/e2e/common/node/container_probe.go index 31122ccc581..534fc26bf97 100644 --- a/test/e2e/common/node/container_probe.go +++ b/test/e2e/common/node/container_probe.go @@ -612,7 +612,7 @@ done }) // verify pods are running and ready - err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, f.Timeouts.PodStart) + err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, f.Timeouts.PodStart) framework.ExpectNoError(err) // Shutdown pod. Readiness should change to false @@ -694,7 +694,7 @@ done }) // verify pods are running and ready - err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, f.Timeouts.PodStart) + err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, f.Timeouts.PodStart) framework.ExpectNoError(err) // Shutdown pod. Readiness should change to false @@ -1359,7 +1359,7 @@ done }) // verify pods are running and ready - err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, f.Timeouts.PodStart) + err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, f.Timeouts.PodStart) framework.ExpectNoError(err) // Shutdown pod. 
Readiness should change to false @@ -1452,7 +1452,7 @@ done }) // verify pods are running and ready - err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, f.Timeouts.PodStart) + err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, f.Timeouts.PodStart) framework.ExpectNoError(err) // Shutdown pod. Readiness should change to false diff --git a/test/e2e/common/node/pods.go b/test/e2e/common/node/pods.go index 69037c6e65f..7aa1521b81c 100644 --- a/test/e2e/common/node/pods.go +++ b/test/e2e/common/node/pods.go @@ -873,7 +873,7 @@ var _ = SIGDescribe("Pods", func() { // wait as required for all 3 pods to be running ginkgo.By("waiting for all 3 pods to be running") - err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 3, 0, f.Timeouts.PodStart) + err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 3, f.Timeouts.PodStart) framework.ExpectNoError(err, "3 pods not found running.") // delete Collection of pods with a label in the current namespace diff --git a/test/e2e/framework/pod/wait.go b/test/e2e/framework/pod/wait.go index 924f6058605..f4b43ff523a 100644 --- a/test/e2e/framework/pod/wait.go +++ b/test/e2e/framework/pod/wait.go @@ -226,66 +226,35 @@ func WaitForAlmostAllPodsReady(ctx context.Context, c clientset.Interface, ns st // WaitForPodsRunningReady waits up to timeout for the following conditions: // 1. At least minPods Pods in Namespace ns are Running and Ready -// 2. No more than allowedNotReadyPods Pods in Namespace ns are not either -// Ready, Succeeded, or Failed with a Controller. +// 2. No Pods in Namespace ns are not either Ready, Succeeded, or Failed with a Controller // -// # An error is returned if either of these conditions are not met within the timeout +// An error is returned if either of these conditions are not met within the timeout. // // It has separate behavior from other 'wait for' pods functions in // that it requests the list of pods on every iteration. This is useful, for // example, in cluster startup, because the number of pods increases while // waiting. All pods that are in SUCCESS state are not counted. -// -// If minPods or allowedNotReadyPods are -1, this method returns immediately -// without waiting. -func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns string, minPods, allowedNotReadyPods int32, timeout time.Duration) error { - if minPods == -1 || allowedNotReadyPods == -1 { - return nil - } +func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns string, minPods int, timeout time.Duration) error { - // We get the new list of pods, replication controllers, and replica - // sets in every iteration because more pods come online during startup - // and we want to ensure they are also checked. - // - // This struct gets populated while polling, then gets checked, and in - // case of a timeout is included in the failure message. 
- type state struct { - ReplicationControllers []v1.ReplicationController - ReplicaSets []appsv1.ReplicaSet - Pods []v1.Pod - } - - nOk := int32(0) + nOk := 0 badPods := []v1.Pod{} otherPods := []v1.Pod{} succeededPods := []string{} - return framework.Gomega().Eventually(ctx, framework.HandleRetry(func(ctx context.Context) (*state, error) { + return framework.Gomega().Eventually(ctx, framework.HandleRetry(func(ctx context.Context) ([]v1.Pod, error) { - rcList, err := c.CoreV1().ReplicationControllers(ns).List(ctx, metav1.ListOptions{}) - if err != nil { - return nil, fmt.Errorf("listing replication controllers in namespace %s: %w", ns, err) - } - rsList, err := c.AppsV1().ReplicaSets(ns).List(ctx, metav1.ListOptions{}) - if err != nil { - return nil, fmt.Errorf("listing replication sets in namespace %s: %w", ns, err) - } podList, err := c.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{}) if err != nil { return nil, fmt.Errorf("listing pods in namespace %s: %w", ns, err) } - return &state{ - ReplicationControllers: rcList.Items, - ReplicaSets: rsList.Items, - Pods: podList.Items, - }, nil - })).WithTimeout(timeout).Should(framework.MakeMatcher(func(s *state) (func() string, error) { + return podList.Items, nil + })).WithTimeout(timeout).Should(framework.MakeMatcher(func(pods []v1.Pod) (func() string, error) { nOk = 0 badPods = []v1.Pod{} otherPods = []v1.Pod{} succeededPods = []string{} - for _, pod := range s.Pods { + for _, pod := range pods { res, err := testutils.PodRunningReady(&pod) switch { case res && err == nil: @@ -295,8 +264,6 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri succeededPods = append(succeededPods, pod.Name) case pod.Status.Phase == v1.PodFailed: // ignore failed pods that are controlled by some controller - // TODO either document why failures with controllers are allowed while - // failures without controllers are not, or remove this check if metav1.GetControllerOf(&pod) == nil { badPods = append(badPods, pod) } @@ -304,15 +271,15 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri otherPods = append(otherPods, pod) } } - if nOk >= minPods && len(badPods)+len(otherPods) <= int(allowedNotReadyPods) { + if nOk >= minPods && len(badPods)+len(otherPods) <= 0 { return nil, nil } // Delayed formatting of a failure message. 
return func() string { var buffer strings.Builder - buffer.WriteString(fmt.Sprintf("Expected all pods (need at least %d) in namespace %q to be running and ready (except for %d).\n", minPods, ns, allowedNotReadyPods)) - buffer.WriteString(fmt.Sprintf("%d / %d pods were running and ready.\n", nOk, len(s.Pods))) + buffer.WriteString(fmt.Sprintf("Expected all pods (need at least %d) in namespace %q to be running and ready \n", minPods, ns)) + buffer.WriteString(fmt.Sprintf("%d / %d pods were running and ready.\n", nOk, len(pods))) if len(succeededPods) > 0 { buffer.WriteString(fmt.Sprintf("Pods that completed successfully:\n%s", format.Object(succeededPods, 1))) } diff --git a/test/e2e/scheduling/priorities.go b/test/e2e/scheduling/priorities.go index 21f83d39784..636ec4da34b 100644 --- a/test/e2e/scheduling/priorities.go +++ b/test/e2e/scheduling/priorities.go @@ -109,7 +109,7 @@ var _ = SIGDescribe("SchedulerPriorities", framework.WithSerial(), func() { err = framework.CheckTestingNSDeletedExcept(ctx, cs, ns) framework.ExpectNoError(err) - err = e2epod.WaitForPodsRunningReady(ctx, cs, metav1.NamespaceSystem, int32(systemPodsNo), 0, framework.PodReadyBeforeTimeout) + err = e2epod.WaitForPodsRunningReady(ctx, cs, metav1.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout) framework.ExpectNoError(err) // skip if the most utilized node has less than the cri-o minMemLimit available diff --git a/test/e2e/windows/host_process.go b/test/e2e/windows/host_process.go index 1d794313ba4..4af3c2c79c2 100644 --- a/test/e2e/windows/host_process.go +++ b/test/e2e/windows/host_process.go @@ -657,7 +657,7 @@ var _ = sigDescribe(feature.WindowsHostProcessContainers, "[MinimumKubeletVersio ginkgo.By("Waiting for the pod to start running") timeout := 3 * time.Minute - e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, timeout) + e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, timeout) ginkgo.By("Getting container stats for pod") statsChecked := false @@ -711,7 +711,7 @@ var _ = sigDescribe(feature.WindowsHostProcessContainers, "[MinimumKubeletVersio pc.Create(ctx, pod) ginkgo.By("Waiting for pod to run") - e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, 3*time.Minute) + e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 3*time.Minute) ginkgo.By("Waiting for 60 seconds") // We wait an additional 60 seconds after the pod is Running because the diff --git a/test/e2e/windows/hyperv.go b/test/e2e/windows/hyperv.go index f066e24dedb..1b304936f35 100644 --- a/test/e2e/windows/hyperv.go +++ b/test/e2e/windows/hyperv.go @@ -95,7 +95,7 @@ var _ = sigDescribe(feature.WindowsHyperVContainers, "HyperV containers", skipUn pc.Create(ctx, hypervPod) ginkgo.By("waiting for the pod to be running") timeout := 3 * time.Minute - e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, timeout) + e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, timeout) ginkgo.By("creating a host process container in another pod to verify the pod is running hyperv isolated containers") diff --git a/test/e2e/windows/kubelet_stats.go b/test/e2e/windows/kubelet_stats.go index 3570bb01daf..64ac0a8e59e 100644 --- a/test/e2e/windows/kubelet_stats.go +++ b/test/e2e/windows/kubelet_stats.go @@ -60,7 +60,7 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", framework.WithSerial(), sk ginkgo.By("Waiting up to 3 minutes for pods to be running") timeout := 3 * time.Minute - err = e2epod.WaitForPodsRunningReady(ctx, 
f.ClientSet, f.Namespace.Name, 10, 0, timeout) + err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 10, timeout) framework.ExpectNoError(err) ginkgo.By("Getting kubelet stats 5 times and checking average duration") @@ -152,7 +152,7 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", skipUnlessWindows(func() { ginkgo.By("Waiting up to 3 minutes for pods to be running") timeout := 3 * time.Minute - err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 3, 0, timeout) + err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 3, timeout) framework.ExpectNoError(err) ginkgo.By("Getting kubelet stats 1 time") From 3476833367a7507e5d86a71fb6ea77df1b81055e Mon Sep 17 00:00:00 2001 From: Matt Karrmann Date: Sat, 13 Apr 2024 19:52:12 -0500 Subject: [PATCH 7/9] Check err in Windows e2e tests --- test/e2e/windows/host_process.go | 6 ++++-- test/e2e/windows/hyperv.go | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/test/e2e/windows/host_process.go b/test/e2e/windows/host_process.go index 4af3c2c79c2..98a3c6c45c0 100644 --- a/test/e2e/windows/host_process.go +++ b/test/e2e/windows/host_process.go @@ -657,7 +657,8 @@ var _ = sigDescribe(feature.WindowsHostProcessContainers, "[MinimumKubeletVersio ginkgo.By("Waiting for the pod to start running") timeout := 3 * time.Minute - e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, timeout) + err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, timeout) + framework.ExpectNoError(err) ginkgo.By("Getting container stats for pod") statsChecked := false @@ -711,7 +712,8 @@ var _ = sigDescribe(feature.WindowsHostProcessContainers, "[MinimumKubeletVersio pc.Create(ctx, pod) ginkgo.By("Waiting for pod to run") - e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 3*time.Minute) + err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 3*time.Minute) + framework.ExpectNoError(err) ginkgo.By("Waiting for 60 seconds") // We wait an additional 60 seconds after the pod is Running because the diff --git a/test/e2e/windows/hyperv.go b/test/e2e/windows/hyperv.go index 1b304936f35..3744d6e48fa 100644 --- a/test/e2e/windows/hyperv.go +++ b/test/e2e/windows/hyperv.go @@ -95,7 +95,8 @@ var _ = sigDescribe(feature.WindowsHyperVContainers, "HyperV containers", skipUn pc.Create(ctx, hypervPod) ginkgo.By("waiting for the pod to be running") timeout := 3 * time.Minute - e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, timeout) + err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, timeout) + framework.ExpectNoError(err) ginkgo.By("creating a host process container in another pod to verify the pod is running hyperv isolated containers") From 62b9e832ccd814e8ec47b0baa895fa7b21137729 Mon Sep 17 00:00:00 2001 From: Matt Karrmann Date: Tue, 16 Apr 2024 22:43:09 -0500 Subject: [PATCH 8/9] Small clean and comment rewording of WaitForPodsRunningReady --- test/e2e/framework/pod/wait.go | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/test/e2e/framework/pod/wait.go b/test/e2e/framework/pod/wait.go index f4b43ff523a..bdb68513883 100644 --- a/test/e2e/framework/pod/wait.go +++ b/test/e2e/framework/pod/wait.go @@ -226,7 +226,7 @@ func WaitForAlmostAllPodsReady(ctx context.Context, c clientset.Interface, ns st // WaitForPodsRunningReady waits up to timeout for the following conditions: // 1. At least minPods Pods in Namespace ns are Running and Ready -// 2. 
No Pods in Namespace ns are not either Ready, Succeeded, or Failed with a Controller +// 2. No Pods in Namespace ns are Failed and not owned by a controller or Pending // // An error is returned if either of these conditions are not met within the timeout. // @@ -236,11 +236,6 @@ func WaitForAlmostAllPodsReady(ctx context.Context, c clientset.Interface, ns st // waiting. All pods that are in SUCCESS state are not counted. func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns string, minPods int, timeout time.Duration) error { - nOk := 0 - badPods := []v1.Pod{} - otherPods := []v1.Pod{} - succeededPods := []string{} - return framework.Gomega().Eventually(ctx, framework.HandleRetry(func(ctx context.Context) ([]v1.Pod, error) { podList, err := c.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{}) @@ -250,10 +245,11 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri return podList.Items, nil })).WithTimeout(timeout).Should(framework.MakeMatcher(func(pods []v1.Pod) (func() string, error) { - nOk = 0 - badPods = []v1.Pod{} - otherPods = []v1.Pod{} - succeededPods = []string{} + nOk := 0 + badPods := []v1.Pod{} + otherPods := []v1.Pod{} + succeededPods := []string{} + for _, pod := range pods { res, err := testutils.PodRunningReady(&pod) switch { @@ -271,7 +267,7 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri otherPods = append(otherPods, pod) } } - if nOk >= minPods && len(badPods)+len(otherPods) <= 0 { + if nOk >= minPods && len(badPods)+len(otherPods) == 0 { return nil, nil } From 5b3f48d263246f9b7a445b3066422b3ef69a2817 Mon Sep 17 00:00:00 2001 From: Matt Karrmann Date: Wed, 17 Apr 2024 18:38:16 -0500 Subject: [PATCH 9/9] Revert check at end of WaitForAlmostAllPodsReady to only consider Pending pods --- test/e2e/framework/pod/wait.go | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/test/e2e/framework/pod/wait.go b/test/e2e/framework/pod/wait.go index bdb68513883..6d2cfe9865a 100644 --- a/test/e2e/framework/pod/wait.go +++ b/test/e2e/framework/pod/wait.go @@ -104,17 +104,11 @@ func BeInPhase(phase v1.PodPhase) types.GomegaMatcher { // 2. All Pods in Namespace ns are either Ready or Succeeded // 3. All Pods part of a ReplicaSet or ReplicationController in Namespace ns are Ready // -// After the timeout has elapsed, an error is returned if and only if either of -// the following conditions hold: -// 1. The number of Pods in Namespace ns that are not Succeeded or Failed is less than minPods -// 2. The number of Pods in Namespace ns that are not Ready, Succeeded, or Failed -// is greater than to allowedNotReadyPods -// -// Roughly speaking, this means that not "too many" Pods are not Ready, where the tolerance -// for "too many" is controlled by allowedNotReadyPods. +// After the timeout has elapsed, an error is returned if the number of Pods in a Pending Phase +// is greater than allowedNotReadyPods. // // It is generally recommended to use WaitForPodsRunningReady instead of this function -// whenever possible, because it is easier to understand. Similar to WaitForPodsRunningReady, +// whenever possible, because its behavior is more intuitive. Similar to WaitForPodsRunningReady, // this function requests the list of pods on every iteration, making it useful for situations // where the set of Pods is likely changing, such as during cluster startup. 
// @@ -218,7 +212,7 @@ func WaitForAlmostAllPodsReady(ctx context.Context, c clientset.Interface, ns st })) // An error might not be fatal. - if nOk+len(otherPods) >= minPods && len(otherPods) <= allowedNotReadyPods { + if len(otherPods) <= allowedNotReadyPods { return nil } return err
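
For reference, the net effect of this series on callers can be summarized in a short sketch. This is a minimal illustration, not part of the patches: the helper names and signatures are taken from the diffs above, while the package name and wrapper functions are assumptions added only for readability.

package example

import (
	"context"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	clientset "k8s.io/client-go/kubernetes"

	"k8s.io/kubernetes/test/e2e/framework"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
)

// waitForSystemPods mirrors the setupSuite usage from patch 5: during cluster
// startup a bounded number of not-ready system pods is tolerated, so the
// lenient WaitForAlmostAllPodsReady helper keeps the allowedNotReadyPods knob.
func waitForSystemPods(ctx context.Context, c clientset.Interface, minStartupPods, allowedNotReadyPods int, timeout time.Duration) error {
	return e2epod.WaitForAlmostAllPodsReady(ctx, c, metav1.NamespaceSystem, minStartupPods, allowedNotReadyPods, timeout)
}

// waitForTestPods mirrors the per-test usage from patches 6 and 7: every pod in
// the test namespace must become Running and Ready (or Succeeded, or Failed
// under a controller), with no tolerance for stragglers, using the simplified
// signature that drops allowedNotReadyPods.
func waitForTestPods(ctx context.Context, c clientset.Interface, ns string, minPods int, timeout time.Duration) {
	err := e2epod.WaitForPodsRunningReady(ctx, c, ns, minPods, timeout)
	framework.ExpectNoError(err)
}

The lenient helper is retained for the startup path only; every other call site in the series switches to the simpler contract documented in patch 8.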