From b4a8861af306b8b4feb8723254cf10af4206a076 Mon Sep 17 00:00:00 2001
From: Elana Hashman
Date: Thu, 18 Nov 2021 14:57:22 -0800
Subject: [PATCH 1/2] Tweak resource requests for Kubelet restart test

---
 test/e2e_node/restart_test.go | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/test/e2e_node/restart_test.go b/test/e2e_node/restart_test.go
index 505ec85a4b2..d1e7af884eb 100644
--- a/test/e2e_node/restart_test.go
+++ b/test/e2e_node/restart_test.go
@@ -150,6 +150,10 @@ var _ = SIGDescribe("Restart [Serial] [Slow] [Disruptive]", func() {
 		if numCpus < 1 {
 			e2eskipper.Skipf("insufficient CPU available for kubelet restart test")
 		}
+		if numCpus > 18 {
+			// 950m * 19 = 18050m (18.05 CPU) -> not enough to block the scheduling of another 950m pod
+			e2eskipper.Skipf("test will return false positives on a machine with >18 cores")
+		}
 
 		// create as many restartNever pods as there are allocatable CPU
 		// nodes; if they are not correctly accounted for as terminated
@@ -161,7 +165,7 @@ var _ = SIGDescribe("Restart [Serial] [Slow] [Disruptive]", func() {
 			pod.Spec.RestartPolicy = "Never"
 			pod.Spec.Containers[0].Command = []string{"echo", "hi"}
 			pod.Spec.Containers[0].Resources.Limits = v1.ResourceList{
-				v1.ResourceCPU: resource.MustParse("1"),
+				v1.ResourceCPU: resource.MustParse("950m"), // leave a little room for other workloads
 			}
 		}
 		createBatchPodWithRateControl(f, restartNeverPods, podCreationInterval)

From 6ddf86d422a3ca2d6547bd2b0fe03202e23f8817 Mon Sep 17 00:00:00 2001
From: Elana Hashman
Date: Thu, 18 Nov 2021 14:59:01 -0800
Subject: [PATCH 2/2] Set startTimeout back to 3m, restore wait loop at end of
 test

---
 test/e2e_node/restart_test.go | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/test/e2e_node/restart_test.go b/test/e2e_node/restart_test.go
index d1e7af884eb..f3ee800678d 100644
--- a/test/e2e_node/restart_test.go
+++ b/test/e2e_node/restart_test.go
@@ -73,7 +73,7 @@ var _ = SIGDescribe("Restart [Serial] [Slow] [Disruptive]", func() {
 		podCount            = 100
 		podCreationInterval = 100 * time.Millisecond
 		recoverTimeout      = 5 * time.Minute
-		startTimeout        = 5 * time.Minute
+		startTimeout        = 3 * time.Minute
 		// restartCount is chosen so even with minPods we exhaust the default
 		// allocation of a /24.
 		minPods = 50
@@ -165,7 +165,7 @@ var _ = SIGDescribe("Restart [Serial] [Slow] [Disruptive]", func() {
 			pod.Spec.RestartPolicy = "Never"
 			pod.Spec.Containers[0].Command = []string{"echo", "hi"}
 			pod.Spec.Containers[0].Resources.Limits = v1.ResourceList{
-				v1.ResourceCPU: resource.MustParse("950m"), // leave a little room for other workloads
+				v1.ResourceCPU: resource.MustParse("950m"), // leave a little room for other workloads
 			}
 		}
 		createBatchPodWithRateControl(f, restartNeverPods, podCreationInterval)
@@ -203,9 +203,11 @@ var _ = SIGDescribe("Restart [Serial] [Slow] [Disruptive]", func() {
 		// restart may think these old pods are consuming CPU and we
 		// will get an OutOfCpu error.
 		ginkgo.By("verifying restartNever pods succeed and restartAlways pods stay running")
-		postRestartRunningPods := waitForPods(f, numAllPods, recoverTimeout)
-		if len(postRestartRunningPods) < numAllPods {
-			framework.Failf("less pods are running after node restart, got %d but expected %d", len(postRestartRunningPods), numAllPods)
+		for start := time.Now(); time.Since(start) < startTimeout; time.Sleep(10 * time.Second) {
+			postRestartRunningPods := waitForPods(f, numAllPods, recoverTimeout)
+			if len(postRestartRunningPods) < numAllPods {
+				framework.Failf("less pods are running after node restart, got %d but expected %d", len(postRestartRunningPods), numAllPods)
+			}
 		}
 	})
 })
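
For context, the millicore arithmetic behind the new 950m limit and the >18-core skip in patch 1 can be sketched as a small standalone Go program. This is an illustration, not code from the patch: it assumes the node's allocatable CPU is numCpus whole cores, as the test's numCpus variable implies, and only reuses resource.MustParse and Quantity.MilliValue from k8s.io/apimachinery.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	perPod := resource.MustParse("950m") // limit requested by each restartNever pod

	for _, numCpus := range []int64{2, 18, 19} {
		allocatable := numCpus * 1000         // allocatable CPU in millicores
		used := perPod.MilliValue() * numCpus // numCpus pods at 950m each
		free := allocatable - used

		// The test only detects the terminated-pod accounting bug if the node
		// is too full to admit one more 950m pod; at 19+ cores, 950m stays free
		// and the test would pass regardless, hence the skip above 18 cores.
		blocked := free < perPod.MilliValue()
		fmt.Printf("cores=%2d used=%6dm free=%5dm blocks another 950m pod: %v\n",
			numCpus, used, free, blocked)
	}
}

Running this prints blocked=true for 2 and 18 cores but false for 19, which is the boundary the numCpus > 18 guard encodes.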