From 0bd901afeaa55ebe8d634a97b97c84026fc270ae Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Fri, 16 Sep 2022 08:58:07 +0200 Subject: [PATCH 1/2] e2e framework: better error messages when pod has failed or completed Getting a message about "pod ran to completion" is confusing when the pod hasn't been able to start at all. The failed state now has a different message. To address the previous ambiguity, the success state is described as "ran to completion successfully". --- test/e2e/framework/pod/resource.go | 6 +++++- test/e2e/framework/pod/wait.go | 9 +++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/test/e2e/framework/pod/resource.go b/test/e2e/framework/pod/resource.go index 4f3765dfb09..0adc985f3ca 100644 --- a/test/e2e/framework/pod/resource.go +++ b/test/e2e/framework/pod/resource.go @@ -41,7 +41,11 @@ import ( // errPodCompleted is returned by PodRunning or PodContainerRunning to indicate that // the pod has already reached completed state. -var errPodCompleted = fmt.Errorf("pod ran to completion") +var errPodCompleted = fmt.Errorf("pod ran to completion successfully") + +// errPodFailed is returned by PodRunning or PodContainerRunning to indicate that +// the pod has already reached a permanent failue state. +var errPodFailed = fmt.Errorf("pod failed permanently") // LabelLogOnPodFailure can be used to mark which Pods will have their logs logged in the case of // a test failure. By default, if there are no Pods with this label, only the first 5 Pods will diff --git a/test/e2e/framework/pod/wait.go b/test/e2e/framework/pod/wait.go index da490106f0c..5f97f7c0e61 100644 --- a/test/e2e/framework/pod/wait.go +++ b/test/e2e/framework/pod/wait.go @@ -403,7 +403,9 @@ func WaitTimeoutForPodRunningInNamespace(c clientset.Interface, podName, namespa switch pod.Status.Phase { case v1.PodRunning: return true, nil - case v1.PodFailed, v1.PodSucceeded: + case v1.PodFailed: + return false, errPodFailed + case v1.PodSucceeded: return false, errPodCompleted } return false, nil @@ -441,7 +443,10 @@ func WaitForPodNoLongerRunningInNamespace(c clientset.Interface, podName, namesp func WaitTimeoutForPodReadyInNamespace(c clientset.Interface, podName, namespace string, timeout time.Duration) error { return WaitForPodCondition(c, namespace, podName, "running and ready", timeout, func(pod *v1.Pod) (bool, error) { switch pod.Status.Phase { - case v1.PodFailed, v1.PodSucceeded: + case v1.PodFailed: + e2elog.Logf("The phase of Pod %s is %s which is unexpected, pod status: %#v", pod.Name, pod.Status.Phase, pod.Status) + return false, errPodFailed + case v1.PodSucceeded: e2elog.Logf("The phase of Pod %s is %s which is unexpected, pod status: %#v", pod.Name, pod.Status.Phase, pod.Status) return false, errPodCompleted case v1.PodRunning: From 004c1b644164601c99f51ce5a80b9fe9f5fd3534 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Fri, 16 Sep 2022 09:00:08 +0200 Subject: [PATCH 2/2] e2e framework: abort polling for running pod early When the pod state is failed or completed, the pod will never again enter the running state and polling can stop. --- test/e2e/framework/pod/resource.go | 5 ++-- test/e2e/framework/pod/wait.go | 39 +++++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/test/e2e/framework/pod/resource.go b/test/e2e/framework/pod/resource.go index 0adc985f3ca..e99b4e12da7 100644 --- a/test/e2e/framework/pod/resource.go +++ b/test/e2e/framework/pod/resource.go @@ -18,6 +18,7 @@ package pod import ( "context" + "errors" "fmt" "os" "path/filepath" @@ -41,11 +42,11 @@ import ( // errPodCompleted is returned by PodRunning or PodContainerRunning to indicate that // the pod has already reached completed state. -var errPodCompleted = fmt.Errorf("pod ran to completion successfully") +var errPodCompleted = FinalError(errors.New("pod ran to completion successfully")) // errPodFailed is returned by PodRunning or PodContainerRunning to indicate that // the pod has already reached a permanent failue state. -var errPodFailed = fmt.Errorf("pod failed permanently") +var errPodFailed = FinalError(errors.New("pod failed permanently")) // LabelLogOnPodFailure can be used to mark which Pods will have their logs logged in the case of // a test failure. By default, if there are no Pods with this label, only the first 5 Pods will diff --git a/test/e2e/framework/pod/wait.go b/test/e2e/framework/pod/wait.go index 5f97f7c0e61..8ffa969e0a2 100644 --- a/test/e2e/framework/pod/wait.go +++ b/test/e2e/framework/pod/wait.go @@ -81,6 +81,39 @@ func TimeoutError(msg string, observedObjects ...interface{}) *timeoutError { } } +// FinalError constructs an error that indicates to a poll function that +// polling can be stopped immediately because some permanent error has been +// encountered that is not going to go away. +// +// TODO (@pohly): move this into framework once the refactoring from +// https://github.com/kubernetes/kubernetes/pull/112043 allows it. Right now it +// leads to circular dependencies. +func FinalError(err error) error { + return &FinalErr{Err: err} +} + +type FinalErr struct { + Err error +} + +func (err *FinalErr) Error() string { + if err.Err != nil { + return fmt.Sprintf("final error: %s", err.Err.Error()) + } + return "final error, exact problem unknown" +} + +func (err *FinalErr) Unwrap() error { + return err.Err +} + +// IsFinal checks whether the error was marked as final by wrapping some error +// with FinalError. +func IsFinal(err error) bool { + var finalErr *FinalErr + return errors.As(err, &finalErr) +} + // maybeTimeoutError returns a TimeoutError if err is a timeout. Otherwise, wrap err. // taskFormat and taskArgs should be the task being performed when the error occurred, // e.g. "waiting for pod to be running". @@ -244,6 +277,8 @@ func WaitForPodsRunningReady(c clientset.Interface, ns string, minPods, allowedN } // WaitForPodCondition waits a pods to be matched to the given condition. +// If the condition callback returns an error that matches FinalErr (checked with IsFinal), +// then polling aborts early. func WaitForPodCondition(c clientset.Interface, ns, podName, conditionDesc string, timeout time.Duration, condition podCondition) error { e2elog.Logf("Waiting up to %v for pod %q in namespace %q to be %q", timeout, podName, ns, conditionDesc) var ( @@ -268,8 +303,10 @@ func WaitForPodCondition(c clientset.Interface, ns, podName, conditionDesc strin } return true, err } else if err != nil { - // TODO(#109732): stop polling and return the error in this case. e2elog.Logf("Error evaluating pod condition %s: %v", conditionDesc, err) + if IsFinal(err) { + return false, err + } } return false, nil })