From e03d0f60ef27a777de278bb8969acf6c134b68f3 Mon Sep 17 00:00:00 2001
From: Laura Lorenz
Date: Tue, 12 Nov 2024 21:48:28 +0000
Subject: [PATCH] Orient tests to run faster, but tolerate infra slowdowns up to 5 minutes

Signed-off-by: Laura Lorenz
---
 test/e2e_node/container_restart_test.go | 14 ++++++++------
 test/e2e_node/image_pull_test.go        |  9 ++++-----
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/test/e2e_node/container_restart_test.go b/test/e2e_node/container_restart_test.go
index b7b5eacc4d1..5a2d530f3f2 100644
--- a/test/e2e_node/container_restart_test.go
+++ b/test/e2e_node/container_restart_test.go
@@ -27,6 +27,7 @@ import (
 
 	"github.com/onsi/ginkgo/v2"
 	"github.com/onsi/gomega"
+	"github.com/pkg/errors"
 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
 
 	v1 "k8s.io/api/core/v1"
@@ -59,7 +60,7 @@ var _ = SIGDescribe("Container Restart", feature.CriProxy, framework.WithSerial(
 
 		ginkgo.It("Container restart backs off.", func(ctx context.Context) {
 			// 0s, 0s, 10s, 30s, 70s, 150s, 310s
-			doTest(ctx, f, 5, containerName, 7)
+			doTest(ctx, f, 3, containerName, 7)
 		})
 	})
 
@@ -82,8 +83,8 @@ var _ = SIGDescribe("Container Restart", feature.CriProxy, framework.WithSerial(
 		})
 
 		ginkgo.It("Alternate restart backs off.", func(ctx context.Context) {
-			// 0s, 0s, 10s, 30s, 60s, 90s, 120s, 150, 180, 210)
-			doTest(ctx, f, 7, containerName, 10)
+			// 0s, 0s, 10s, 30s, 60s, 90s, 120s, 150s, 180s, 210s, 240s, 270s, 300s
+			doTest(ctx, f, 3, containerName, 13)
 		})
 	})
 })
@@ -94,8 +95,9 @@ func doTest(ctx context.Context, f *framework.Framework, targetRestarts int, con
 	podErr := e2epod.WaitForPodContainerToFail(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 0, "CrashLoopBackOff", 1*time.Minute)
 	gomega.Expect(podErr).To(gomega.HaveOccurred())
 
-	// Wait for 210s worth of backoffs to occur so we can confirm the backoff growth.
-	podErr = e2epod.WaitForContainerRestartedNTimes(ctx, f.ClientSet, f.Namespace.Name, pod.Name, containerName, 210*time.Second, targetRestarts)
+	// Hard wait 30 seconds, enough for targetRestarts in the best case; the longer timeout below tolerates slow infra.
+	time.Sleep(30 * time.Second)
+	podErr = e2epod.WaitForContainerRestartedNTimes(ctx, f.ClientSet, f.Namespace.Name, pod.Name, containerName, 5*time.Minute, targetRestarts)
 	gomega.Expect(podErr).ShouldNot(gomega.HaveOccurred(), "Expected container to repeatedly back off container failures")
 
 	r, err := extractObservedBackoff(ctx, f, pod.Name, containerName)
@@ -117,7 +119,7 @@
 			}
 		}
 	}
-	return r, nil
+	return r, errors.Errorf("Could not find container status for container %s in pod %s", containerName, podName)
 }
 
 func newFailAlwaysPod() *v1.Pod {
diff --git a/test/e2e_node/image_pull_test.go b/test/e2e_node/image_pull_test.go
index d759a883143..fce685e5264 100644
--- a/test/e2e_node/image_pull_test.go
+++ b/test/e2e_node/image_pull_test.go
@@ -257,16 +257,15 @@ var _ = SIGDescribe("Pull Image", feature.CriProxy, framework.WithSerial(), func
 			isExpectedErrMsg := strings.Contains(eventMsg, expectedErr.Error())
 			gomega.Expect(isExpectedErrMsg).To(gomega.BeTrueBecause("we injected an exception into the PullImage interface of the cri proxy"))
 
-			// Wait for ~60s worth of backoffs to occur so we can confirm the backoff growth.
-			podErr = e2epod.WaitForPodContainerStarted(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 0, 1*time.Minute)
+			podErr = e2epod.WaitForPodContainerStarted(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 0, 30*time.Second)
 			gomega.Expect(podErr).To(gomega.HaveOccurred(), "Expected container not to start from repeatedly backing off image pulls")
 
 			e, err := getImagePullAttempts(ctx, f, pod.Name)
 			framework.ExpectNoError(err)
-			// 3 would take 10s best case
+			// 3 would take 10s best case.
 			gomega.Expect(e.Count).Should(gomega.BeNumerically(">", 3))
-			// 6 would take 150s best case
-			gomega.Expect(e.Count).Should(gomega.BeNumerically("<=", 6))
+			// 7 would take 310s best case, so allow up to 7 in case the infra went slow.
+			gomega.Expect(e.Count).Should(gomega.BeNumerically("<=", 7))
 		})
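
For context on the schedules cited in the comments: both tests assume kubelet-style capped exponential backoff, starting at 10s and doubling up to a cap (300s by default; apparently 30s under the alternate kubelet config this file exercises). Below is a minimal sketch of that model. It is illustrative only; cumulativeRestarts is a hypothetical helper, not kubelet or e2e framework code, and the immediate first restart is an assumption read off the "0s, 0s, ..." comments above.

package main

import (
	"fmt"
	"time"
)

// cumulativeRestarts models capped exponential backoff for a crashing
// container: one start at 0s, one immediate restart, then waits of
// initial, 2*initial, ... capped at maxBackoff before each later restart.
// It returns the cumulative time of each of the n starts (n >= 2 assumed).
func cumulativeRestarts(n int, initial, maxBackoff time.Duration) []time.Duration {
	times := []time.Duration{0, 0} // first start plus the immediate restart
	backoff, elapsed := initial, time.Duration(0)
	for len(times) < n {
		elapsed += backoff
		times = append(times, elapsed)
		backoff = min(2*backoff, maxBackoff) // min builtin requires Go 1.21+
	}
	return times
}

func main() {
	// Default 300s cap: prints [0s 0s 10s 30s 1m10s 2m30s 5m10s],
	// i.e. the 0s, 0s, 10s, 30s, 70s, 150s, 310s schedule above.
	fmt.Println(cumulativeRestarts(7, 10*time.Second, 300*time.Second))
	// 30s cap (alternate config): 13 starts ending at 5m0s, matching
	// the 0s ... 300s schedule the alternate test now asserts.
	fmt.Println(cumulativeRestarts(13, 10*time.Second, 30*time.Second))
}

The same capped-doubling model is behind the image pull bounds: per the comments, more than 3 pull attempts fit in the shortened 30s wait, while a 7th attempt would take roughly 310s even in the best case, which is why the upper bound is relaxed to 7 to absorb slow infra.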