Merge pull request #43090 from foxish/fix-network-partition-flake

Automatic merge from submit-queue (batch tested with PRs 42854, 43105, 43090) Add a timeout to allow replacement pod to become ready Hopefully fixes https://github.com/kubernetes/kubernetes/issues/37259 ``` I0314 04:26:02.562] Mar 14 04:26:02.562: INFO: Pod my-hostname-net-1bgrj still exists I0314 04:26:22.491] Mar 14 04:26:22.491: INFO: Waiting for pod my-hostname-net-1bgrj to disappear I0314 04:26:22.496] Mar 14 04:26:22.495: INFO: Pod my-hostname-net-1bgrj no longer exists I0314 04:26:22.496] STEP: verifying whether the pod from the unreachable node is recreated I0314 04:26:22.498] Mar 14 04:26:22.498: INFO: Pod name my-hostname-net: Found 3 pods out of 3 I0314 04:26:22.499] STEP: ensuring each pod is running I0314 04:26:22.499] STEP: trying to dial each unique pod I0314 04:26:22.579] Mar 14 04:26:22.579: INFO: Controller my-hostname-net: Got expected result from replica 1 [my-hostname-net-5jrdb]: "my-hostname-net-5jrdb", 1 of 3 required successes so far I0314 04:26:22.642] Mar 14 04:26:22.642: INFO: Controller my-hostname-net: Got expected result from replica 2 [my-hostname-net-mjf3c]: "my-hostname-net-mjf3c", 2 of 3 required successes so far I0314 04:31:22.645] Mar 14 04:31:22.644: INFO: Controller my-hostname-net: Failed to Get from replica 3 [my-hostname-net-rf46s]: Get https://35.184.87.178/api/v1/namespaces/e2e-tests-network-partition-s5gqt/pods/my-hostname-net-rf46s/proxy/: context deadline exceeded ``` The issue appears to be that we have a race between the pod being "running + ready" and being accessible via the APIServer proxy. cc @kow3ns @bowei @davidopp
2025-08-05 10:19:50 +00:00 · 2017-03-14 18:44:22 -07:00 · 2017-03-14 18:44:22 -07:00 · 586fd3374f
commit 586fd3374f
parent 8f9cba87a9 7698196fa5
1 changed files with 5 additions and 3 deletions
--- a/test/e2e/framework/util.go
+++ b/test/e2e/framework/util.go
@ -136,7 +136,7 @@ const (
 	// How long pods have to become scheduled onto nodes
 	podScheduledBeforeTimeout = PodListTimeout + (20 * time.Second)

-	podRespondingTimeout     = 2 * time.Minute
+	podRespondingTimeout     = 15 * time.Minute
 	ServiceRespondingTimeout = 2 * time.Minute
 	EndpointRegisterTimeout  = time.Minute

@ -1634,8 +1634,10 @@ func (r podProxyResponseChecker) CheckAllResponses() (done bool, err error) {
 		}
 		if err != nil {
 			if ctx.Err() != nil {
-				Failf("Controller %s: Failed to Get from replica %d [%s]: %v\n pod status: %#v", r.controllerName, i+1, pod.Name, err, pod.Status)
-				return false, err
+				// We may encounter errors here because of a race between the pod readiness and apiserver
+				// proxy. So, we log the error and retry if this occurs.
+				Logf("Controller %s: Failed to Get from replica %d [%s]: %v\n pod status: %#v", r.controllerName, i+1, pod.Name, err, pod.Status)
+				return false, nil
 			}
 			Logf("Controller %s: Failed to GET from replica %d [%s]: %v\npod status: %#v", r.controllerName, i+1, pod.Name, err, pod.Status)
 			continue