mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-23 11:50:44 +00:00
Merge pull request #43090 from foxish/fix-network-partition-flake
Automatic merge from submit-queue (batch tested with PRs 42854, 43105, 43090)

Add a timeout to allow replacement pod to become ready

Hopefully fixes https://github.com/kubernetes/kubernetes/issues/37259

```
I0314 04:26:02.562] Mar 14 04:26:02.562: INFO: Pod my-hostname-net-1bgrj still exists
I0314 04:26:22.491] Mar 14 04:26:22.491: INFO: Waiting for pod my-hostname-net-1bgrj to disappear
I0314 04:26:22.496] Mar 14 04:26:22.495: INFO: Pod my-hostname-net-1bgrj no longer exists
I0314 04:26:22.496] STEP: verifying whether the pod from the unreachable node is recreated
I0314 04:26:22.498] Mar 14 04:26:22.498: INFO: Pod name my-hostname-net: Found 3 pods out of 3
I0314 04:26:22.499] STEP: ensuring each pod is running
I0314 04:26:22.499] STEP: trying to dial each unique pod
I0314 04:26:22.579] Mar 14 04:26:22.579: INFO: Controller my-hostname-net: Got expected result from replica 1 [my-hostname-net-5jrdb]: "my-hostname-net-5jrdb", 1 of 3 required successes so far
I0314 04:26:22.642] Mar 14 04:26:22.642: INFO: Controller my-hostname-net: Got expected result from replica 2 [my-hostname-net-mjf3c]: "my-hostname-net-mjf3c", 2 of 3 required successes so far
I0314 04:31:22.645] Mar 14 04:31:22.644: INFO: Controller my-hostname-net: Failed to Get from replica 3 [my-hostname-net-rf46s]: Get https://35.184.87.178/api/v1/namespaces/e2e-tests-network-partition-s5gqt/pods/my-hostname-net-rf46s/proxy/: context deadline exceeded
```

The issue appears to be that we have a race between the pod being "running + ready" and being accessible via the APIServer proxy.

cc @kow3ns @bowei @davidopp
This commit is contained in:
commit
586fd3374f
@ -136,7 +136,7 @@ const (
|
||||
// How long pods have to become scheduled onto nodes
|
||||
podScheduledBeforeTimeout = PodListTimeout + (20 * time.Second)
|
||||
|
||||
podRespondingTimeout = 2 * time.Minute
|
||||
podRespondingTimeout = 15 * time.Minute
|
||||
ServiceRespondingTimeout = 2 * time.Minute
|
||||
EndpointRegisterTimeout = time.Minute
|
||||
|
||||
@ -1634,8 +1634,10 @@ func (r podProxyResponseChecker) CheckAllResponses() (done bool, err error) {
|
||||
}
|
||||
if err != nil {
|
||||
if ctx.Err() != nil {
|
||||
Failf("Controller %s: Failed to Get from replica %d [%s]: %v\n pod status: %#v", r.controllerName, i+1, pod.Name, err, pod.Status)
|
||||
return false, err
|
||||
// We may encounter errors here because of a race between the pod readiness and apiserver
|
||||
// proxy. So, we log the error and retry if this occurs.
|
||||
Logf("Controller %s: Failed to Get from replica %d [%s]: %v\n pod status: %#v", r.controllerName, i+1, pod.Name, err, pod.Status)
|
||||
return false, nil
|
||||
}
|
||||
Logf("Controller %s: Failed to GET from replica %d [%s]: %v\npod status: %#v", r.controllerName, i+1, pod.Name, err, pod.Status)
|
||||
continue
|
||||
|
Loading…
Reference in New Issue
Block a user