e2e_node: Fix LocalStorage and PriorityLocalStorage eviction tests

Currently the storage eviction tests fail for a few reasons:
- They re-enter storage exhaustion when the test images are re-pulled
  during cleanup. This change increases the tests' storage requirements
  and adds verification after cleanup to aid future diagnosis.
- They were timing out: in practice, eviction often takes just over
  10 minutes on an n1-standard instance. I'm raising these timeouts to
  15 minutes to provide some padding.

Ideally this should get these tests passing on CI, as they've now
passed locally for me several times with the remote GCE env.

Follow-up work involves diagnosing why these tests take so long and
restructuring them to be less finicky.
Danielle Lancashire 2021-08-11 16:49:47 +02:00
parent 9f7e079a5b
commit a8168ed543
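
For context, here is a minimal standalone sketch (not the test framework's code) of the threshold arithmetic the diff below applies: the nodefs.available hard-eviction threshold is set to the node's available bytes minus the ~4Gi the test pods are expected to consume, and the test is skipped when less than 4Gi is free. The 20Gi free-space figure is an assumed example value, not a real stats-summary reading.

// Sketch only: mirrors the arithmetic in the diff below, using the same
// k8s.io/apimachinery resource package; values are illustrative.
package main

import (
	"fmt"
	"strconv"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// In the real test this comes from the node's stats summary
	// (summary.Node.Fs.AvailableBytes); 20Gi is an assumed example value.
	availableBytesOnSystem := uint64(20 * 1024 * 1024 * 1024)

	diskConsumedByTest := resource.MustParse("4Gi")
	if availableBytesOnSystem <= uint64(diskConsumedByTest.Value()) {
		fmt.Println("skip: too little disk free on the host for the eviction test to run")
		return
	}

	// The hard threshold sits just below current availability, so eviction
	// should fire once the test pods have written roughly 4Gi.
	evictionThreshold := strconv.FormatUint(availableBytesOnSystem-uint64(diskConsumedByTest.Value()), 10)
	fmt.Printf("EvictionHard[%q] = %s\n", "nodefs.available", evictionThreshold)
}

In the real tests the available-bytes figure comes from eventuallyGetSummary(), and the resulting string is written into the kubelet config's EvictionHard map before the eviction test runs.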


@@ -167,17 +167,26 @@ var _ = SIGDescribe("MemoryAllocatableEviction [Slow] [Serial] [Disruptive][Node
 // Disk pressure is induced by running pods which consume disk space.
 var _ = SIGDescribe("LocalStorageEviction [Slow] [Serial] [Disruptive][NodeFeature:Eviction]", func() {
 	f := framework.NewDefaultFramework("localstorage-eviction-test")
-	pressureTimeout := 10 * time.Minute
+	pressureTimeout := 15 * time.Minute
 	expectedNodeCondition := v1.NodeDiskPressure
 	expectedStarvedResource := v1.ResourceEphemeralStorage
 	ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() {
 		tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
-			diskConsumed := resource.MustParse("200Mi")
 			summary := eventuallyGetSummary()
-			availableBytes := *(summary.Node.Fs.AvailableBytes)
-			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsAvailable): fmt.Sprintf("%d", availableBytes-uint64(diskConsumed.Value()))}
+			diskConsumedByTest := resource.MustParse("4Gi")
+			availableBytesOnSystem := *(summary.Node.Fs.AvailableBytes)
+			evictionThreshold := strconv.FormatUint(availableBytesOnSystem-uint64(diskConsumedByTest.Value()), 10)
+			if availableBytesOnSystem <= uint64(diskConsumedByTest.Value()) {
+				e2eskipper.Skipf("Too little disk free on the host for the LocalStorageEviction test to run")
+			}
+			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsAvailable): evictionThreshold}
 			initialConfig.EvictionMinimumReclaim = map[string]string{}
 		})
 		runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logDiskMetrics, []podEvictSpec{
 			{
 				evictionPriority: 1,
@@ -201,7 +210,7 @@ var _ = SIGDescribe("LocalStorageSoftEviction [Slow] [Serial] [Disruptive][NodeF
 	expectedStarvedResource := v1.ResourceEphemeralStorage
 	ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() {
 		tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
-			diskConsumed := resource.MustParse("200Mi")
+			diskConsumed := resource.MustParse("4Gi")
 			summary := eventuallyGetSummary()
 			availableBytes := *(summary.Node.Fs.AvailableBytes)
 			if availableBytes <= uint64(diskConsumed.Value()) {
@@ -343,14 +352,14 @@ var _ = SIGDescribe("PriorityLocalStorageEvictionOrdering [Slow] [Serial] [Disru
 	f := framework.NewDefaultFramework("priority-disk-eviction-ordering-test")
 	expectedNodeCondition := v1.NodeDiskPressure
 	expectedStarvedResource := v1.ResourceEphemeralStorage
-	pressureTimeout := 10 * time.Minute
+	pressureTimeout := 15 * time.Minute
 	highPriorityClassName := f.BaseName + "-high-priority"
 	highPriority := int32(999999999)
 	ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() {
 		tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
-			diskConsumed := resource.MustParse("350Mi")
+			diskConsumed := resource.MustParse("4Gi")
 			summary := eventuallyGetSummary()
 			availableBytes := *(summary.Node.Fs.AvailableBytes)
 			if availableBytes <= uint64(diskConsumed.Value()) {
@@ -545,7 +554,7 @@ func runEvictionTest(f *framework.Framework, pressureTimeout time.Duration, expe
 			// In case a test fails before verifying that NodeCondition no longer exist on the node,
 			// we should wait for the NodeCondition to disappear
-			ginkgo.By(fmt.Sprintf("making sure NodeCondition %s no longer exist on the node", expectedNodeCondition))
+			ginkgo.By(fmt.Sprintf("making sure NodeCondition %s no longer exists on the node", expectedNodeCondition))
 			gomega.Eventually(func() error {
 				if expectedNodeCondition != noPressure && hasNodeCondition(f, expectedNodeCondition) {
 					return fmt.Errorf("Conditions haven't returned to normal, node still has %s", expectedNodeCondition)
@@ -557,6 +566,15 @@ func runEvictionTest(f *framework.Framework, pressureTimeout time.Duration, expe
 			ginkgo.By("making sure we have all the required images for testing")
 			prePullImagesIfNeccecary()
+			// Ensure that the NodeCondition hasn't returned after pulling images
+			ginkgo.By(fmt.Sprintf("making sure NodeCondition %s doesn't exist again after pulling images", expectedNodeCondition))
+			gomega.Eventually(func() error {
+				if expectedNodeCondition != noPressure && hasNodeCondition(f, expectedNodeCondition) {
+					return fmt.Errorf("Conditions haven't returned to normal, node still has %s", expectedNodeCondition)
+				}
+				return nil
+			}, pressureDisappearTimeout, evictionPollInterval).Should(gomega.BeNil())
+
 			ginkgo.By("making sure we can start a new pod after the test")
 			podName := "test-admit-pod"
 			f.PodClient().CreateSync(&v1.Pod{