Merge pull request #113273 from bobbypage/restart_test_fix

test: Fix e2e_node restart_test flake
Commit: 6fe5429969
Author: Kubernetes Prow Robot (committed by GitHub)
Date:   2022-11-04 05:14:14 -07:00
2 changed files with 18 additions and 12 deletions

File 1 of 2:

@@ -39,9 +39,11 @@ import (
     "github.com/onsi/gomega"
 )
 
-// waitForPods waits for timeout duration, for podCount.
+type podCondition func(pod *v1.Pod) (bool, error)
+
+// waitForPodsCondition waits for `podCount` number of pods to match a specific pod condition within a timeout duration.
 // If the timeout is hit, it returns the list of currently running pods.
-func waitForPods(f *framework.Framework, podCount int, timeout time.Duration) (runningPods []*v1.Pod) {
+func waitForPodsCondition(f *framework.Framework, podCount int, timeout time.Duration, condition podCondition) (runningPods []*v1.Pod) {
     for start := time.Now(); time.Since(start) < timeout; time.Sleep(10 * time.Second) {
         podList, err := e2epod.NewPodClient(f).List(context.TODO(), metav1.ListOptions{})
         if err != nil {
@@ -52,7 +54,7 @@ func waitForPods(f *framework.Framework, podCount int, timeout time.Duration) (runningPods []*v1.Pod) {
         runningPods = []*v1.Pod{}
         for i := range podList.Items {
             pod := podList.Items[i]
-            if r, err := testutils.PodRunningReadyOrSucceeded(&pod); err != nil || !r {
+            if r, err := condition(&pod); err != nil || !r {
                 continue
             }
             runningPods = append(runningPods, &pod)
@@ -94,7 +96,7 @@ var _ = SIGDescribe("Restart [Serial] [Slow] [Disruptive]", func() {
         // Give the node some time to stabilize, assume pods that enter RunningReady within
         // startTimeout fit on the node and the node is now saturated.
-        runningPods := waitForPods(f, podCount, startTimeout)
+        runningPods := waitForPodsCondition(f, podCount, startTimeout, testutils.PodRunningReadyOrSucceeded)
         if len(runningPods) < minPods {
             framework.Failf("Failed to start %d pods, cannot test that restarting container runtime doesn't leak IPs", minPods)
         }
@@ -126,7 +128,7 @@ var _ = SIGDescribe("Restart [Serial] [Slow] [Disruptive]", func() {
         }
         ginkgo.By("Checking currently Running/Ready pods")
-        postRestartRunningPods := waitForPods(f, len(runningPods), recoverTimeout)
+        postRestartRunningPods := waitForPodsCondition(f, len(runningPods), recoverTimeout, testutils.PodRunningReadyOrSucceeded)
         if len(postRestartRunningPods) == 0 {
             framework.Failf("Failed to start *any* pods after container runtime restart, this might indicate an IP leak")
         }
@@ -157,7 +159,7 @@ var _ = SIGDescribe("Restart [Serial] [Slow] [Disruptive]", func() {
         createBatchPodWithRateControl(f, restartAlwaysPods, podCreationInterval)
         defer deletePodsSync(f, restartAlwaysPods)
-        allPods := waitForPods(f, preRestartPodCount, startTimeout)
+        allPods := waitForPodsCondition(f, preRestartPodCount, startTimeout, testutils.PodRunningReadyOrSucceeded)
         if len(allPods) < preRestartPodCount {
             framework.Failf("Failed to run sufficient restartAlways pods, got %d but expected %d", len(allPods), preRestartPodCount)
         }
@@ -175,7 +177,7 @@ var _ = SIGDescribe("Restart [Serial] [Slow] [Disruptive]", func() {
         ginkgo.By("verifying restartAlways pods stay running", func() {
             for start := time.Now(); time.Since(start) < startTimeout; time.Sleep(10 * time.Second) {
-                postRestartRunningPods := waitForPods(f, preRestartPodCount, recoverTimeout)
+                postRestartRunningPods := waitForPodsCondition(f, preRestartPodCount, recoverTimeout, testutils.PodRunningReadyOrSucceeded)
                 if len(postRestartRunningPods) < preRestartPodCount {
                     framework.Failf("fewer pods are running after systemd restart, got %d but expected %d", len(postRestartRunningPods), preRestartPodCount)
                 }
@@ -188,7 +190,7 @@ var _ = SIGDescribe("Restart [Serial] [Slow] [Disruptive]", func() {
         createBatchPodWithRateControl(f, postRestartPods, podCreationInterval)
         defer deletePodsSync(f, postRestartPods)
-        allPods = waitForPods(f, preRestartPodCount+postRestartPodCount, startTimeout)
+        allPods = waitForPodsCondition(f, preRestartPodCount+postRestartPodCount, startTimeout, testutils.PodRunningReadyOrSucceeded)
         if len(allPods) < preRestartPodCount+postRestartPodCount {
             framework.Failf("Failed to run pods after restarting dbus, got %d but expected %d", len(allPods), preRestartPodCount+postRestartPodCount)
         }
@@ -223,8 +225,8 @@ var _ = SIGDescribe("Restart [Serial] [Slow] [Disruptive]", func() {
         }
         createBatchPodWithRateControl(f, restartNeverPods, podCreationInterval)
         defer deletePodsSync(f, restartNeverPods)
-        completedPods := waitForPods(f, podCountRestartNever, startTimeout)
+        completedPods := waitForPodsCondition(f, podCountRestartNever, startTimeout, testutils.PodSucceeded)
 
         if len(completedPods) < podCountRestartNever {
             framework.Failf("Failed to run sufficient restartNever pods, got %d but expected %d", len(completedPods), podCountRestartNever)
         }
@@ -241,7 +243,7 @@ var _ = SIGDescribe("Restart [Serial] [Slow] [Disruptive]", func() {
         defer deletePodsSync(f, restartAlwaysPods)
         numAllPods := podCountRestartNever + podCountRestartAlways
-        allPods := waitForPods(f, numAllPods, startTimeout)
+        allPods := waitForPodsCondition(f, numAllPods, startTimeout, testutils.PodRunningReadyOrSucceeded)
         if len(allPods) < numAllPods {
             framework.Failf("Failed to run sufficient restartAlways pods, got %d but expected %d", len(allPods), numAllPods)
         }
@@ -257,7 +259,7 @@ var _ = SIGDescribe("Restart [Serial] [Slow] [Disruptive]", func() {
         // will get an OutOfCpu error.
         ginkgo.By("verifying restartNever pods succeed and restartAlways pods stay running")
         for start := time.Now(); time.Since(start) < startTimeout; time.Sleep(10 * time.Second) {
-            postRestartRunningPods := waitForPods(f, numAllPods, recoverTimeout)
+            postRestartRunningPods := waitForPodsCondition(f, numAllPods, recoverTimeout, testutils.PodRunningReadyOrSucceeded)
             if len(postRestartRunningPods) < numAllPods {
                 framework.Failf("less pods are running after node restart, got %d but expected %d", len(postRestartRunningPods), numAllPods)
             }

File 2 of 2:

@@ -19,7 +19,7 @@ package utils
 import (
     "fmt"
 
-    "k8s.io/api/core/v1"
+    v1 "k8s.io/api/core/v1"
     podutil "k8s.io/kubernetes/pkg/api/v1/pod"
 )
@ -52,6 +52,10 @@ func PodRunningReadyOrSucceeded(p *v1.Pod) (bool, error) {
return PodRunningReady(p) return PodRunningReady(p)
} }
func PodSucceeded(p *v1.Pod) (bool, error) {
return p.Status.Phase == v1.PodSucceeded, nil
}
// FailedContainers inspects all containers in a pod and returns failure // FailedContainers inspects all containers in a pod and returns failure
// information for containers that have failed or been restarted. // information for containers that have failed or been restarted.
// A map is returned where the key is the containerID and the value is a // A map is returned where the key is the containerID and the value is a
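
Similarly illustrative and not part of this change: a tiny self-contained check of how the new PodSucceeded condition evaluates different pod phases. The pod literals are made up for the example; only k8s.io/api is assumed as a dependency.

package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
)

// PodSucceeded mirrors the condition added in this diff: a pod matches only
// once its phase is PodSucceeded.
func PodSucceeded(p *v1.Pod) (bool, error) {
    return p.Status.Phase == v1.PodSucceeded, nil
}

func main() {
    running := &v1.Pod{Status: v1.PodStatus{Phase: v1.PodRunning}}
    done := &v1.Pod{Status: v1.PodStatus{Phase: v1.PodSucceeded}}

    ok, _ := PodSucceeded(running)
    fmt.Println(ok) // false: a Running pod does not satisfy PodSucceeded
    ok, _ = PodSucceeded(done)
    fmt.Println(ok) // true
}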