From a8168ed543565bf255a9ea3deb790530f01a3879 Mon Sep 17 00:00:00 2001
From: Danielle Lancashire
Date: Wed, 11 Aug 2021 16:49:47 +0200
Subject: [PATCH] e2e_node: Fix LocalStorage and PriorityLocalStorage eviction tests

Currently the storage eviction tests fail for a few reasons:

- They re-enter storage exhaustion after pulling the images back during
  cleanup. This change increases the storage the tests reserve and adds
  a verification step to aid future diagnosis.
- They were timing out, as in practice eviction often seems to take
  just over 10 minutes on an n1-standard instance. I'm raising the
  timeouts to 15 minutes to provide some padding.

This should bring these tests to passing on CI, as they've now passed
locally for me several times with the remote GCE env.

Follow-up work involves diagnosing why these tests take so long and
restructuring them to be less finicky.
---
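Note for reviewers: the new hard-eviction threshold is computed as
"bytes currently free minus bytes the test intends to consume", so the
kubelet should report DiskPressure once the workload has written
roughly 4Gi. A minimal standalone sketch of that arithmetic follows;
the 20Gi free-space figure is an assumption for illustration (in the
test it comes from eventuallyGetSummary()):

    package main

    import (
    	"fmt"
    	"strconv"

    	"k8s.io/apimachinery/pkg/api/resource"
    )

    func main() {
    	// Assume the node summary API reported ~20Gi free on the node fs.
    	availableBytesOnSystem := uint64(20 << 30)

    	// The test plans to write 4Gi of ephemeral storage.
    	diskConsumedByTest := resource.MustParse("4Gi")

    	// nodefs.available hard-eviction threshold: once free space drops
    	// below (free - 4Gi), i.e. once the test has consumed its 4Gi,
    	// the kubelet reports DiskPressure and starts evicting.
    	evictionThreshold := strconv.FormatUint(
    		availableBytesOnSystem-uint64(diskConsumedByTest.Value()), 10)

    	fmt.Printf("evictionHard[nodefs.available] = %s\n", evictionThreshold)
    }

With those numbers this prints 17179869184, i.e. eviction begins once
free space falls below 16Gi.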
 test/e2e_node/eviction_test.go | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/test/e2e_node/eviction_test.go b/test/e2e_node/eviction_test.go
index a190f737e72..c17fc08549d 100644
--- a/test/e2e_node/eviction_test.go
+++ b/test/e2e_node/eviction_test.go
@@ -167,17 +167,26 @@ var _ = SIGDescribe("MemoryAllocatableEviction [Slow] [Serial] [Disruptive][Node
 // Disk pressure is induced by running pods which consume disk space.
 var _ = SIGDescribe("LocalStorageEviction [Slow] [Serial] [Disruptive][NodeFeature:Eviction]", func() {
 	f := framework.NewDefaultFramework("localstorage-eviction-test")
-	pressureTimeout := 10 * time.Minute
+	pressureTimeout := 15 * time.Minute
 	expectedNodeCondition := v1.NodeDiskPressure
 	expectedStarvedResource := v1.ResourceEphemeralStorage
 	ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() {
+
 		tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
-			diskConsumed := resource.MustParse("200Mi")
 			summary := eventuallyGetSummary()
-			availableBytes := *(summary.Node.Fs.AvailableBytes)
-			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsAvailable): fmt.Sprintf("%d", availableBytes-uint64(diskConsumed.Value()))}
+
+			diskConsumedByTest := resource.MustParse("4Gi")
+			availableBytesOnSystem := *(summary.Node.Fs.AvailableBytes)
+			evictionThreshold := strconv.FormatUint(availableBytesOnSystem-uint64(diskConsumedByTest.Value()), 10)
+
+			if availableBytesOnSystem <= uint64(diskConsumedByTest.Value()) {
+				e2eskipper.Skipf("Too little disk free on the host for the LocalStorageEviction test to run")
+			}
+
+			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsAvailable): evictionThreshold}
 			initialConfig.EvictionMinimumReclaim = map[string]string{}
 		})
+
 		runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logDiskMetrics, []podEvictSpec{
 			{
 				evictionPriority: 1,
@@ -201,7 +210,7 @@ var _ = SIGDescribe("LocalStorageSoftEviction [Slow] [Serial] [Disruptive][NodeF
 	expectedStarvedResource := v1.ResourceEphemeralStorage
 	ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() {
 		tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
-			diskConsumed := resource.MustParse("200Mi")
+			diskConsumed := resource.MustParse("4Gi")
 			summary := eventuallyGetSummary()
 			availableBytes := *(summary.Node.Fs.AvailableBytes)
 			if availableBytes <= uint64(diskConsumed.Value()) {
@@ -343,14 +352,14 @@ var _ = SIGDescribe("PriorityLocalStorageEvictionOrdering [Slow] [Serial] [Disru
 	f := framework.NewDefaultFramework("priority-disk-eviction-ordering-test")
 	expectedNodeCondition := v1.NodeDiskPressure
 	expectedStarvedResource := v1.ResourceEphemeralStorage
-	pressureTimeout := 10 * time.Minute
+	pressureTimeout := 15 * time.Minute
 
 	highPriorityClassName := f.BaseName + "-high-priority"
 	highPriority := int32(999999999)
 
 	ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() {
 		tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
-			diskConsumed := resource.MustParse("350Mi")
+			diskConsumed := resource.MustParse("4Gi")
 			summary := eventuallyGetSummary()
 			availableBytes := *(summary.Node.Fs.AvailableBytes)
 			if availableBytes <= uint64(diskConsumed.Value()) {
@@ -545,7 +554,7 @@ func runEvictionTest(f *framework.Framework, pressureTimeout time.Duration, expe
 
 		// In case a test fails before verifying that NodeCondition no longer exist on the node,
 		// we should wait for the NodeCondition to disappear
-		ginkgo.By(fmt.Sprintf("making sure NodeCondition %s no longer exist on the node", expectedNodeCondition))
+		ginkgo.By(fmt.Sprintf("making sure NodeCondition %s no longer exists on the node", expectedNodeCondition))
 		gomega.Eventually(func() error {
 			if expectedNodeCondition != noPressure && hasNodeCondition(f, expectedNodeCondition) {
 				return fmt.Errorf("Conditions haven't returned to normal, node still has %s", expectedNodeCondition)
@@ -557,6 +566,15 @@ func runEvictionTest(f *framework.Framework, pressureTimeout time.Duration, expe
 		ginkgo.By("making sure we have all the required images for testing")
 		prePullImagesIfNeccecary()
 
+		// Ensure that the NodeCondition hasn't returned after pulling images
+		ginkgo.By(fmt.Sprintf("making sure NodeCondition %s doesn't exist again after pulling images", expectedNodeCondition))
+		gomega.Eventually(func() error {
+			if expectedNodeCondition != noPressure && hasNodeCondition(f, expectedNodeCondition) {
+				return fmt.Errorf("Conditions haven't returned to normal, node still has %s", expectedNodeCondition)
+			}
+			return nil
+		}, pressureDisappearTimeout, evictionPollInterval).Should(gomega.BeNil())
+
 		ginkgo.By("making sure we can start a new pod after the test")
 		podName := "test-admit-pod"
 		f.PodClient().CreateSync(&v1.Pod{