diff --git a/test/e2e/framework/statefulset_utils.go b/test/e2e/framework/statefulset_utils.go
index 65345fe493b..51a6e869f18 100644
--- a/test/e2e/framework/statefulset_utils.go
+++ b/test/e2e/framework/statefulset_utils.go
@@ -135,7 +135,7 @@ func (s *StatefulSetTester) CheckMount(ss *apps.StatefulSet, mountPath string) e
 func (s *StatefulSetTester) ExecInStatefulPods(ss *apps.StatefulSet, cmd string) error {
 	podList := s.GetPodList(ss)
 	for _, statefulPod := range podList.Items {
-		stdout, err := RunHostCmdWithRetries(statefulPod.Namespace, statefulPod.Name, cmd, StatefulSetPoll, 3)
+		stdout, err := RunHostCmdWithRetries(statefulPod.Namespace, statefulPod.Name, cmd, StatefulSetPoll, StatefulPodTimeout)
 		Logf("stdout of %v on %v: %v", cmd, statefulPod.Name, stdout)
 		if err != nil {
 			return err
@@ -149,7 +149,7 @@ func (s *StatefulSetTester) CheckHostname(ss *apps.StatefulSet) error {
 	cmd := "printf $(hostname)"
 	podList := s.GetPodList(ss)
 	for _, statefulPod := range podList.Items {
-		hostname, err := RunHostCmdWithRetries(statefulPod.Namespace, statefulPod.Name, cmd, StatefulSetPoll, 3)
+		hostname, err := RunHostCmdWithRetries(statefulPod.Namespace, statefulPod.Name, cmd, StatefulSetPoll, StatefulPodTimeout)
 		if err != nil {
 			return err
 		}
@@ -530,7 +530,7 @@ func (s *StatefulSetTester) BreakPodHttpProbe(ss *apps.StatefulSet, pod *v1.Pod)
 	}
 	// Ignore 'mv' errors to make this idempotent.
 	cmd := fmt.Sprintf("mv -v /usr/share/nginx/html%v /tmp/ || true", path)
-	stdout, err := RunHostCmdWithRetries(pod.Namespace, pod.Name, cmd, StatefulSetPoll, 3)
+	stdout, err := RunHostCmdWithRetries(pod.Namespace, pod.Name, cmd, StatefulSetPoll, StatefulPodTimeout)
 	Logf("stdout of %v on %v: %v", cmd, pod.Name, stdout)
 	return err
 }
@@ -554,7 +554,7 @@ func (s *StatefulSetTester) RestorePodHttpProbe(ss *apps.StatefulSet, pod *v1.Po
 	}
 	// Ignore 'mv' errors to make this idempotent.
 	cmd := fmt.Sprintf("mv -v /tmp%v /usr/share/nginx/html/ || true", path)
-	stdout, err := RunHostCmdWithRetries(pod.Namespace, pod.Name, cmd, StatefulSetPoll, 3)
+	stdout, err := RunHostCmdWithRetries(pod.Namespace, pod.Name, cmd, StatefulSetPoll, StatefulPodTimeout)
 	Logf("stdout of %v on %v: %v", cmd, pod.Name, stdout)
 	return err
 }
@@ -599,7 +599,7 @@ func (s *StatefulSetTester) ResumeNextPod(ss *apps.StatefulSet) {
 		if resumedPod != "" {
 			Failf("Found multiple paused stateful pods: %v and %v", pod.Name, resumedPod)
 		}
-		_, err := RunHostCmdWithRetries(pod.Namespace, pod.Name, "touch /data/statefulset-continue", StatefulSetPoll, 3)
+		_, err := RunHostCmdWithRetries(pod.Namespace, pod.Name, "touch /data/statefulset-continue; sync", StatefulSetPoll, StatefulPodTimeout)
 		ExpectNoError(err)
 		Logf("Resumed pod %v", pod.Name)
 		resumedPod = pod.Name
diff --git a/test/e2e/framework/util.go b/test/e2e/framework/util.go
index 0530be56994..7fdb068545b 100644
--- a/test/e2e/framework/util.go
+++ b/test/e2e/framework/util.go
@@ -3311,20 +3311,23 @@ func RunHostCmdOrDie(ns, name, cmd string) string {
 	return stdout
 }
 
-// RunHostCmdWithRetries calls RunHostCmd until it succeeds or a built-in timeout expires.
-// This can be used with idempotent commands to deflake transient connection issues.
-func RunHostCmdWithRetries(ns, name, cmd string, interval time.Duration, maxTries int) (string, error) {
-	tries := 0
+// RunHostCmdWithRetries calls RunHostCmd and retries errors it thinks may be transient
+// until it succeeds or the specified timeout expires.
+// This can be used with idempotent commands to deflake transient Node issues.
+func RunHostCmdWithRetries(ns, name, cmd string, interval, timeout time.Duration) (string, error) {
+	start := time.Now()
 	for {
 		out, err := RunHostCmd(ns, name, cmd)
 		if err == nil {
 			return out, nil
 		}
-		tries++
-		if tries >= maxTries {
-			return out, fmt.Errorf("RunHostCmd still failed after %d tries: %v", tries, err)
+		if elapsed := time.Since(start); elapsed > timeout {
+			return out, fmt.Errorf("RunHostCmd still failed after %v: %v", elapsed, err)
 		}
-		Logf("Waiting %v to retry failed RunHostCmd (attempt %d): %v", interval, tries, err)
+		if !strings.Contains(err.Error(), "Error from server") {
+			return out, fmt.Errorf("Non-retryable RunHostCmd error: %v", err)
+		}
+		Logf("Waiting %v to retry failed RunHostCmd: %v", interval, err)
 		time.Sleep(interval)
 	}
 }