Merge pull request #43090 from foxish/fix-network-partition-flake

Automatic merge from submit-queue (batch tested with PRs 42854, 43105, 43090)

Add a timeout to allow replacement pod to become ready

Hopefully fixes https://github.com/kubernetes/kubernetes/issues/37259

```
I0314 04:26:02.562] Mar 14 04:26:02.562: INFO: Pod my-hostname-net-1bgrj still exists
I0314 04:26:22.491] Mar 14 04:26:22.491: INFO: Waiting for pod my-hostname-net-1bgrj to disappear
I0314 04:26:22.496] Mar 14 04:26:22.495: INFO: Pod my-hostname-net-1bgrj no longer exists
I0314 04:26:22.496] STEP: verifying whether the pod from the unreachable node is recreated
I0314 04:26:22.498] Mar 14 04:26:22.498: INFO: Pod name my-hostname-net: Found 3 pods out of 3
I0314 04:26:22.499] STEP: ensuring each pod is running
I0314 04:26:22.499] STEP: trying to dial each unique pod
I0314 04:26:22.579] Mar 14 04:26:22.579: INFO: Controller my-hostname-net: Got expected result from replica 1 [my-hostname-net-5jrdb]: "my-hostname-net-5jrdb", 1 of 3 required successes so far
I0314 04:26:22.642] Mar 14 04:26:22.642: INFO: Controller my-hostname-net: Got expected result from replica 2 [my-hostname-net-mjf3c]: "my-hostname-net-mjf3c", 2 of 3 required successes so far
I0314 04:31:22.645] Mar 14 04:31:22.644: INFO: Controller my-hostname-net: Failed to Get from replica 3 [my-hostname-net-rf46s]: Get https://35.184.87.178/api/v1/namespaces/e2e-tests-network-partition-s5gqt/pods/my-hostname-net-rf46s/proxy/: context deadline exceeded
```

The issue appears to be that we have a race between the pod being "running + ready" and being accessible via the APIServer proxy.


cc @kow3ns @bowei @davidopp
This commit is contained in:
Kubernetes Submit Queue 2017-03-14 18:44:22 -07:00 committed by GitHub
commit 586fd3374f

View File

@ -136,7 +136,7 @@ const (
// How long pods have to become scheduled onto nodes
podScheduledBeforeTimeout = PodListTimeout + (20 * time.Second)
podRespondingTimeout = 2 * time.Minute
podRespondingTimeout = 15 * time.Minute
ServiceRespondingTimeout = 2 * time.Minute
EndpointRegisterTimeout = time.Minute
@ -1634,8 +1634,10 @@ func (r podProxyResponseChecker) CheckAllResponses() (done bool, err error) {
}
if err != nil {
if ctx.Err() != nil {
Failf("Controller %s: Failed to Get from replica %d [%s]: %v\n pod status: %#v", r.controllerName, i+1, pod.Name, err, pod.Status)
return false, err
// We may encounter errors here because of a race between the pod readiness and apiserver
// proxy. So, we log the error and retry if this occurs.
Logf("Controller %s: Failed to Get from replica %d [%s]: %v\n pod status: %#v", r.controllerName, i+1, pod.Name, err, pod.Status)
return false, nil
}
Logf("Controller %s: Failed to GET from replica %d [%s]: %v\npod status: %#v", r.controllerName, i+1, pod.Name, err, pod.Status)
continue