mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-31 23:37:01 +00:00
Merge pull request #52057 from enisoc/sts-deflake
Automatic merge from submit-queue StatefulSet: Deflake e2e RunHostCmd. The initial retry up to 20s was giving up too soon. I'm seeing this test flake because the Node rebooted and it takes ~2min to recover. Now StatefulSet RunHostCmd calls will use the same 5min timeout as with other Pod state checks. ref #48031
This commit is contained in:
commit
507af4b9c2
@ -135,7 +135,7 @@ func (s *StatefulSetTester) CheckMount(ss *apps.StatefulSet, mountPath string) e
|
||||
func (s *StatefulSetTester) ExecInStatefulPods(ss *apps.StatefulSet, cmd string) error {
|
||||
podList := s.GetPodList(ss)
|
||||
for _, statefulPod := range podList.Items {
|
||||
stdout, err := RunHostCmdWithRetries(statefulPod.Namespace, statefulPod.Name, cmd, StatefulSetPoll, 3)
|
||||
stdout, err := RunHostCmdWithRetries(statefulPod.Namespace, statefulPod.Name, cmd, StatefulSetPoll, StatefulPodTimeout)
|
||||
Logf("stdout of %v on %v: %v", cmd, statefulPod.Name, stdout)
|
||||
if err != nil {
|
||||
return err
|
||||
@ -149,7 +149,7 @@ func (s *StatefulSetTester) CheckHostname(ss *apps.StatefulSet) error {
|
||||
cmd := "printf $(hostname)"
|
||||
podList := s.GetPodList(ss)
|
||||
for _, statefulPod := range podList.Items {
|
||||
hostname, err := RunHostCmdWithRetries(statefulPod.Namespace, statefulPod.Name, cmd, StatefulSetPoll, 3)
|
||||
hostname, err := RunHostCmdWithRetries(statefulPod.Namespace, statefulPod.Name, cmd, StatefulSetPoll, StatefulPodTimeout)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@ -530,7 +530,7 @@ func (s *StatefulSetTester) BreakPodHttpProbe(ss *apps.StatefulSet, pod *v1.Pod)
|
||||
}
|
||||
// Ignore 'mv' errors to make this idempotent.
|
||||
cmd := fmt.Sprintf("mv -v /usr/share/nginx/html%v /tmp/ || true", path)
|
||||
stdout, err := RunHostCmdWithRetries(pod.Namespace, pod.Name, cmd, StatefulSetPoll, 3)
|
||||
stdout, err := RunHostCmdWithRetries(pod.Namespace, pod.Name, cmd, StatefulSetPoll, StatefulPodTimeout)
|
||||
Logf("stdout of %v on %v: %v", cmd, pod.Name, stdout)
|
||||
return err
|
||||
}
|
||||
@ -554,7 +554,7 @@ func (s *StatefulSetTester) RestorePodHttpProbe(ss *apps.StatefulSet, pod *v1.Po
|
||||
}
|
||||
// Ignore 'mv' errors to make this idempotent.
|
||||
cmd := fmt.Sprintf("mv -v /tmp%v /usr/share/nginx/html/ || true", path)
|
||||
stdout, err := RunHostCmdWithRetries(pod.Namespace, pod.Name, cmd, StatefulSetPoll, 3)
|
||||
stdout, err := RunHostCmdWithRetries(pod.Namespace, pod.Name, cmd, StatefulSetPoll, StatefulPodTimeout)
|
||||
Logf("stdout of %v on %v: %v", cmd, pod.Name, stdout)
|
||||
return err
|
||||
}
|
||||
@ -599,7 +599,7 @@ func (s *StatefulSetTester) ResumeNextPod(ss *apps.StatefulSet) {
|
||||
if resumedPod != "" {
|
||||
Failf("Found multiple paused stateful pods: %v and %v", pod.Name, resumedPod)
|
||||
}
|
||||
_, err := RunHostCmdWithRetries(pod.Namespace, pod.Name, "touch /data/statefulset-continue", StatefulSetPoll, 3)
|
||||
_, err := RunHostCmdWithRetries(pod.Namespace, pod.Name, "touch /data/statefulset-continue; sync", StatefulSetPoll, StatefulPodTimeout)
|
||||
ExpectNoError(err)
|
||||
Logf("Resumed pod %v", pod.Name)
|
||||
resumedPod = pod.Name
|
||||
|
@ -3311,20 +3311,23 @@ func RunHostCmdOrDie(ns, name, cmd string) string {
|
||||
return stdout
|
||||
}
|
||||
|
||||
// RunHostCmdWithRetries calls RunHostCmd until it succeeds or a built-in timeout expires.
|
||||
// This can be used with idempotent commands to deflake transient connection issues.
|
||||
func RunHostCmdWithRetries(ns, name, cmd string, interval time.Duration, maxTries int) (string, error) {
|
||||
tries := 0
|
||||
// RunHostCmdWithRetries calls RunHostCmd and retries errors it thinks may be transient
|
||||
// until it succeeds or the specified timeout expires.
|
||||
// This can be used with idempotent commands to deflake transient Node issues.
|
||||
func RunHostCmdWithRetries(ns, name, cmd string, interval, timeout time.Duration) (string, error) {
|
||||
start := time.Now()
|
||||
for {
|
||||
out, err := RunHostCmd(ns, name, cmd)
|
||||
if err == nil {
|
||||
return out, nil
|
||||
}
|
||||
tries++
|
||||
if tries >= maxTries {
|
||||
return out, fmt.Errorf("RunHostCmd still failed after %d tries: %v", tries, err)
|
||||
if elapsed := time.Since(start); elapsed > timeout {
|
||||
return out, fmt.Errorf("RunHostCmd still failed after %v: %v", elapsed, err)
|
||||
}
|
||||
Logf("Waiting %v to retry failed RunHostCmd (attempt %d): %v", interval, tries, err)
|
||||
if !strings.Contains(err.Error(), "Error from server") {
|
||||
return out, fmt.Errorf("Non-retryable RunHostCmd error: %v", err)
|
||||
}
|
||||
Logf("Waiting %v to retry failed RunHostCmd: %v", interval, err)
|
||||
time.Sleep(interval)
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user