tests: Solve backoff tests flakiness

The container status is not constant, and can change over time in the
following order:

- Running: When kubelet reports the Pod as running. This state can be missed if
  the container finishes its command before kubelet gets a chance to report
  it.
- Terminated: After the container finishes its command, it enters the Terminated
  state, in which it remains for a short period of time before kubelet tries
  to restart it.
- Waiting: When kubelet has to wait for the backoff period to expire before actually
  restarting the container.

Handling each of these states explicitly when calculating the backoff period between
container restarts makes the tests more reliable.
This commit is contained in:
Claudiu Belu 2019-03-28 02:02:02 -07:00
parent 5716127cff
commit e44961e47d

View File

@ -109,6 +109,8 @@ func startPodAndGetBackOffs(podClient *framework.PodClient, pod *v1.Pod, sleepAm
func getRestartDelay(podClient *framework.PodClient, podName string, containerName string) (time.Duration, error) {
beginTime := time.Now()
var previousRestartCount int32 = -1
var previousFinishedAt time.Time
for time.Since(beginTime) < (2 * maxBackOffTolerance) { // may just miss the 1st MaxContainerBackOff delay
time.Sleep(time.Second)
pod, err := podClient.Get(podName, metav1.GetOptions{})
@ -119,11 +121,37 @@ func getRestartDelay(podClient *framework.PodClient, podName string, containerNa
continue
}
if status.State.Waiting == nil && status.State.Terminated != nil && status.LastTerminationState.Terminated != nil && status.State.Terminated.StartedAt.Time.After(beginTime) {
startedAt := status.State.Terminated.StartedAt.Time
finishedAt := status.LastTerminationState.Terminated.FinishedAt.Time
framework.Logf("getRestartDelay: restartCount = %d, finishedAt=%s restartedAt=%s (%s)", status.RestartCount, finishedAt, startedAt, startedAt.Sub(finishedAt))
return startedAt.Sub(finishedAt), nil
// the only case this happens is if this is the first time the Pod is running and there is no "Last State".
if status.LastTerminationState.Terminated == nil {
framework.Logf("Container's last state is not \"Terminated\".")
continue
}
if previousRestartCount == -1 {
if status.State.Running != nil {
// container is still Running, there is no "FinishedAt" time.
continue
} else if status.State.Terminated != nil {
previousFinishedAt = status.State.Terminated.FinishedAt.Time
} else {
previousFinishedAt = status.LastTerminationState.Terminated.FinishedAt.Time
}
previousRestartCount = status.RestartCount
}
// when the RestartCount is changed, the Containers will be in one of the following states:
//Running, Terminated, Waiting (it already is waiting for the backoff period to expire, and the last state details have been stored into status.LastTerminationState).
if status.RestartCount > previousRestartCount {
var startedAt time.Time
if status.State.Running != nil {
startedAt = status.State.Running.StartedAt.Time
} else if status.State.Terminated != nil {
startedAt = status.State.Terminated.StartedAt.Time
} else {
startedAt = status.LastTerminationState.Terminated.StartedAt.Time
}
framework.Logf("getRestartDelay: restartCount = %d, finishedAt=%s restartedAt=%s (%s)", status.RestartCount, previousFinishedAt, startedAt, startedAt.Sub(previousFinishedAt))
return startedAt.Sub(previousFinishedAt), nil
}
}
return 0, fmt.Errorf("timeout getting pod restart delay")