From a8168ed543565bf255a9ea3deb790530f01a3879 Mon Sep 17 00:00:00 2001
From: Danielle Lancashire
Date: Wed, 11 Aug 2021 16:49:47 +0200
Subject: [PATCH] e2e_node: Fix LocalStorage and PriorityLocalStorage eviction tests

Currently the storage eviction tests fail for a few reasons:

- They re-enter storage exhaustion after pulling the images back during
  cleanup. This change increases the storage the tests reserve and adds
  a verification step to aid future diagnosis.
- They were timing out, as in practice eviction often seems to take
  just over 10 minutes on an n1-standard instance. I'm raising the
  timeouts to 15 minutes to provide some padding.

This should bring these tests to passing on CI, as they've now passed
locally for me several times with the remote GCE env.

Follow-up work involves diagnosing why these tests take so long and
restructuring them to be less finicky.
---
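Note for reviewers: the new hard-eviction threshold is computed as
"bytes currently free minus bytes the test intends to consume", so the
kubelet should report DiskPressure once the workload has written
roughly 4Gi. A minimal standalone sketch of that arithmetic follows;
the 20Gi free-space figure is an assumption for illustration (in the
test it comes from eventuallyGetSummary()):

    package main

    import (
    	"fmt"
    	"strconv"

    	"k8s.io/apimachinery/pkg/api/resource"
    )

    func main() {
    	// Assume the node summary API reported ~20Gi free on the node fs.
    	availableBytesOnSystem := uint64(20 << 30)

    	// The test plans to write 4Gi of ephemeral storage.
    	diskConsumedByTest := resource.MustParse("4Gi")

    	// nodefs.available hard-eviction threshold: once free space drops
    	// below (free - 4Gi), i.e. once the test has consumed its 4Gi,
    	// the kubelet reports DiskPressure and starts evicting.
    	evictionThreshold := strconv.FormatUint(
    		availableBytesOnSystem-uint64(diskConsumedByTest.Value()), 10)

    	fmt.Printf("evictionHard[nodefs.available] = %s\n", evictionThreshold)
    }

With those numbers this prints 17179869184, i.e. eviction begins once
free space falls below 16Gi.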
 test/e2e_node/eviction_test.go | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/test/e2e_node/eviction_test.go b/test/e2e_node/eviction_test.go
index a190f737e72..c17fc08549d 100644
--- a/test/e2e_node/eviction_test.go
+++ b/test/e2e_node/eviction_test.go
@@ -167,17 +167,26 @@ var _ = SIGDescribe("MemoryAllocatableEviction [Slow] [Serial] [Disruptive][Node
 // Disk pressure is induced by running pods which consume disk space.
 var _ = SIGDescribe("LocalStorageEviction [Slow] [Serial] [Disruptive][NodeFeature:Eviction]", func() {
 	f := framework.NewDefaultFramework("localstorage-eviction-test")
-	pressureTimeout := 10 * time.Minute
+	pressureTimeout := 15 * time.Minute
 	expectedNodeCondition := v1.NodeDiskPressure
 	expectedStarvedResource := v1.ResourceEphemeralStorage
 	ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() {
+
 		tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
-			diskConsumed := resource.MustParse("200Mi")
 			summary := eventuallyGetSummary()
-			availableBytes := *(summary.Node.Fs.AvailableBytes)
-			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsAvailable): fmt.Sprintf("%d", availableBytes-uint64(diskConsumed.Value()))}
+
+			diskConsumedByTest := resource.MustParse("4Gi")
+			availableBytesOnSystem := *(summary.Node.Fs.AvailableBytes)
+			evictionThreshold := strconv.FormatUint(availableBytesOnSystem-uint64(diskConsumedByTest.Value()), 10)
+
+			if availableBytesOnSystem <= uint64(diskConsumedByTest.Value()) {
+				e2eskipper.Skipf("Too little disk free on the host for the LocalStorageEviction test to run")
+			}
+
+			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsAvailable): evictionThreshold}
 			initialConfig.EvictionMinimumReclaim = map[string]string{}
 		})
+
 		runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logDiskMetrics, []podEvictSpec{
 			{
 				evictionPriority: 1,
@@ -201,7 +210,7 @@ var _ = SIGDescribe("LocalStorageSoftEviction [Slow] [Serial] [Disruptive][NodeF
 	expectedStarvedResource := v1.ResourceEphemeralStorage
 	ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() {
 		tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
-			diskConsumed := resource.MustParse("200Mi")
+			diskConsumed := resource.MustParse("4Gi")
 			summary := eventuallyGetSummary()
 			availableBytes := *(summary.Node.Fs.AvailableBytes)
 			if availableBytes <= uint64(diskConsumed.Value()) {
@@ -343,14 +352,14 @@ var _ = SIGDescribe("PriorityLocalStorageEvictionOrdering [Slow] [Serial] [Disru
 	f := framework.NewDefaultFramework("priority-disk-eviction-ordering-test")
 	expectedNodeCondition := v1.NodeDiskPressure
 	expectedStarvedResource := v1.ResourceEphemeralStorage
-	pressureTimeout := 10 * time.Minute
+	pressureTimeout := 15 * time.Minute
 
 	highPriorityClassName := f.BaseName + "-high-priority"
 	highPriority := int32(999999999)
 
 	ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() {
 		tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
-			diskConsumed := resource.MustParse("350Mi")
+			diskConsumed := resource.MustParse("4Gi")
 			summary := eventuallyGetSummary()
 			availableBytes := *(summary.Node.Fs.AvailableBytes)
 			if availableBytes <= uint64(diskConsumed.Value()) {
@@ -545,7 +554,7 @@ func runEvictionTest(f *framework.Framework, pressureTimeout time.Duration, expe
 
 		// In case a test fails before verifying that NodeCondition no longer exist on the node,
 		// we should wait for the NodeCondition to disappear
-		ginkgo.By(fmt.Sprintf("making sure NodeCondition %s no longer exist on the node", expectedNodeCondition))
+		ginkgo.By(fmt.Sprintf("making sure NodeCondition %s no longer exists on the node", expectedNodeCondition))
 		gomega.Eventually(func() error {
 			if expectedNodeCondition != noPressure && hasNodeCondition(f, expectedNodeCondition) {
 				return fmt.Errorf("Conditions haven't returned to normal, node still has %s", expectedNodeCondition)
@@ -557,6 +566,15 @@ func runEvictionTest(f *framework.Framework, pressureTimeout time.Duration, expe
 		ginkgo.By("making sure we have all the required images for testing")
 		prePullImagesIfNeccecary()
 
+		// Ensure that the NodeCondition hasn't returned after pulling images
+		ginkgo.By(fmt.Sprintf("making sure NodeCondition %s doesn't exist again after pulling images", expectedNodeCondition))
+		gomega.Eventually(func() error {
+			if expectedNodeCondition != noPressure && hasNodeCondition(f, expectedNodeCondition) {
+				return fmt.Errorf("Conditions haven't returned to normal, node still has %s", expectedNodeCondition)
+			}
+			return nil
+		}, pressureDisappearTimeout, evictionPollInterval).Should(gomega.BeNil())
+
 		ginkgo.By("making sure we can start a new pod after the test")
 		podName := "test-admit-pod"
 		f.PodClient().CreateSync(&v1.Pod{