From a8168ed543565bf255a9ea3deb790530f01a3879 Mon Sep 17 00:00:00 2001
From: Danielle Lancashire
Date: Wed, 11 Aug 2021 16:49:47 +0200
Subject: [PATCH 1/3] e2e_node: Fix LocalStorage and PriorityLocalStorage
 eviction tests

Currently the storage eviction tests fail for a few reasons:

- They re-enter storage exhaustion after pulling the images during
  cleanup, which increases the tests' storage requirements; verification
  is added here to aid future diagnosis.
- They were timing out: in practice, eviction often takes just over
  10 minutes on an n1-standard instance. I'm raising the timeouts to
  15 minutes to provide some padding.

This should bring these tests back to passing on CI, as they have now
passed locally for me several times with the remote GCE env.

Follow-up work involves diagnosing why these evictions take so long and
restructuring the tests to be less finicky.
---
 test/e2e_node/eviction_test.go | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/test/e2e_node/eviction_test.go b/test/e2e_node/eviction_test.go
index a190f737e72..c17fc08549d 100644
--- a/test/e2e_node/eviction_test.go
+++ b/test/e2e_node/eviction_test.go
@@ -167,17 +167,26 @@ var _ = SIGDescribe("MemoryAllocatableEviction [Slow] [Serial] [Disruptive][Node
 // Disk pressure is induced by running pods which consume disk space.
 var _ = SIGDescribe("LocalStorageEviction [Slow] [Serial] [Disruptive][NodeFeature:Eviction]", func() {
 	f := framework.NewDefaultFramework("localstorage-eviction-test")
-	pressureTimeout := 10 * time.Minute
+	pressureTimeout := 15 * time.Minute
 	expectedNodeCondition := v1.NodeDiskPressure
 	expectedStarvedResource := v1.ResourceEphemeralStorage
 	ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() {
+
 		tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
-			diskConsumed := resource.MustParse("200Mi")
 			summary := eventuallyGetSummary()
-			availableBytes := *(summary.Node.Fs.AvailableBytes)
-			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsAvailable): fmt.Sprintf("%d", availableBytes-uint64(diskConsumed.Value()))}
+
+			diskConsumedByTest := resource.MustParse("4Gi")
+			availableBytesOnSystem := *(summary.Node.Fs.AvailableBytes)
+			evictionThreshold := strconv.FormatUint(availableBytesOnSystem-uint64(diskConsumedByTest.Value()), 10)
+
+			if availableBytesOnSystem <= uint64(diskConsumedByTest.Value()) {
+				e2eskipper.Skipf("Too little disk free on the host for the LocalStorageEviction test to run")
+			}
+
+			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsAvailable): evictionThreshold}
 			initialConfig.EvictionMinimumReclaim = map[string]string{}
 		})
+
 		runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logDiskMetrics, []podEvictSpec{
 			{
 				evictionPriority: 1,
@@ -201,7 +210,7 @@ var _ = SIGDescribe("LocalStorageSoftEviction [Slow] [Serial] [Disruptive][NodeF
 	expectedStarvedResource := v1.ResourceEphemeralStorage
 	ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() {
 		tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
-			diskConsumed := resource.MustParse("200Mi")
+			diskConsumed := resource.MustParse("4Gi")
 			summary := eventuallyGetSummary()
 			availableBytes := *(summary.Node.Fs.AvailableBytes)
 			if availableBytes <= uint64(diskConsumed.Value()) {
@@ -343,14 +352,14 @@ var _ = SIGDescribe("PriorityLocalStorageEvictionOrdering [Slow] [Serial] [Disru
 	f := framework.NewDefaultFramework("priority-disk-eviction-ordering-test")
framework.NewDefaultFramework("priority-disk-eviction-ordering-test") expectedNodeCondition := v1.NodeDiskPressure expectedStarvedResource := v1.ResourceEphemeralStorage - pressureTimeout := 10 * time.Minute + pressureTimeout := 15 * time.Minute highPriorityClassName := f.BaseName + "-high-priority" highPriority := int32(999999999) ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() { tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) { - diskConsumed := resource.MustParse("350Mi") + diskConsumed := resource.MustParse("4Gi") summary := eventuallyGetSummary() availableBytes := *(summary.Node.Fs.AvailableBytes) if availableBytes <= uint64(diskConsumed.Value()) { @@ -545,7 +554,7 @@ func runEvictionTest(f *framework.Framework, pressureTimeout time.Duration, expe // In case a test fails before verifying that NodeCondition no longer exist on the node, // we should wait for the NodeCondition to disappear - ginkgo.By(fmt.Sprintf("making sure NodeCondition %s no longer exist on the node", expectedNodeCondition)) + ginkgo.By(fmt.Sprintf("making sure NodeCondition %s no longer exists on the node", expectedNodeCondition)) gomega.Eventually(func() error { if expectedNodeCondition != noPressure && hasNodeCondition(f, expectedNodeCondition) { return fmt.Errorf("Conditions haven't returned to normal, node still has %s", expectedNodeCondition) @@ -557,6 +566,15 @@ func runEvictionTest(f *framework.Framework, pressureTimeout time.Duration, expe ginkgo.By("making sure we have all the required images for testing") prePullImagesIfNeccecary() + // Ensure that the NodeCondition hasn't returned after pulling images + ginkgo.By(fmt.Sprintf("making sure NodeCondition %s doesn't exist again after pulling images", expectedNodeCondition)) + gomega.Eventually(func() error { + if expectedNodeCondition != noPressure && hasNodeCondition(f, expectedNodeCondition) { + return fmt.Errorf("Conditions haven't returned to normal, node still has %s", expectedNodeCondition) + } + return nil + }, pressureDisappearTimeout, evictionPollInterval).Should(gomega.BeNil()) + ginkgo.By("making sure we can start a new pod after the test") podName := "test-admit-pod" f.PodClient().CreateSync(&v1.Pod{ From b5c2d3b389605a6ac978029a1a1922973d1a6606 Mon Sep 17 00:00:00 2001 From: Danielle Lancashire Date: Thu, 12 Aug 2021 17:40:08 +0200 Subject: [PATCH 2/3] e2e_node: eviction: Memory-backed Volumes seperation This commit fixes the LocalStorageCapacityIsolationEviction test by acknowledging that in its default configuration kubelet will no-longer evict memory-backed volume pods as they cannot use more than their assigned limit with SizeMemoryBackedVolumes enabled. To account for the old behaviour, we also add a test that explicitly disables the feature to test the behaviour of memory backed local volumes in those scenarios. That test can be removed when/if the feature gate is removed. --- test/e2e_node/eviction_test.go | 53 ++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/test/e2e_node/eviction_test.go b/test/e2e_node/eviction_test.go index c17fc08549d..a0389a50492 100644 --- a/test/e2e_node/eviction_test.go +++ b/test/e2e_node/eviction_test.go @@ -238,6 +238,47 @@ var _ = SIGDescribe("LocalStorageSoftEviction [Slow] [Serial] [Disruptive][NodeF }) }) +// This test validates that in-memory EmptyDir's are evicted when the Kubelet does +// not have Sized Memory Volumes enabled. 
+// not have Sized Memory Volumes enabled. When Sized volumes are enabled, it's
+// not possible to exhaust the quota.
+var _ = SIGDescribe("LocalStorageCapacityIsolationMemoryBackedVolumeEviction [Slow] [Serial] [Disruptive] [Feature:LocalStorageCapacityIsolation][NodeFeature:Eviction]", func() {
+	f := framework.NewDefaultFramework("localstorage-eviction-test")
+	evictionTestTimeout := 7 * time.Minute
+	ginkgo.Context(fmt.Sprintf(testContextFmt, "evictions due to pod local storage violations"), func() {
+		tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
+			// setting a threshold to 0% disables; non-empty map overrides default value (necessary due to omitempty)
+			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalMemoryAvailable): "0%"}
+			initialConfig.FeatureGates["SizeMemoryBackedVolumes"] = false
+		})
+
+		sizeLimit := resource.MustParse("100Mi")
+		useOverLimit := 200 /* Mb */
+		useUnderLimit := 80 /* Mb */
+		containerLimit := v1.ResourceList{v1.ResourceEphemeralStorage: sizeLimit}
+
+		runEvictionTest(f, evictionTestTimeout, noPressure, noStarvedResource, logDiskMetrics, []podEvictSpec{
+			{
+				evictionPriority: 1, // Should be evicted due to disk limit
+				pod: diskConsumingPod("emptydir-memory-over-volume-sizelimit", useOverLimit, &v1.VolumeSource{
+					EmptyDir: &v1.EmptyDirVolumeSource{Medium: "Memory", SizeLimit: &sizeLimit},
+				}, v1.ResourceRequirements{}),
+			},
+			{
+				evictionPriority: 0, // Should not be evicted, as container limits do not account for memory backed volumes
+				pod: diskConsumingPod("emptydir-memory-over-container-sizelimit", useOverLimit, &v1.VolumeSource{
+					EmptyDir: &v1.EmptyDirVolumeSource{Medium: "Memory"},
+				}, v1.ResourceRequirements{Limits: containerLimit}),
+			},
+			{
+				evictionPriority: 0,
+				pod: diskConsumingPod("emptydir-memory-innocent", useUnderLimit, &v1.VolumeSource{
+					EmptyDir: &v1.EmptyDirVolumeSource{Medium: "Memory", SizeLimit: &sizeLimit},
+				}, v1.ResourceRequirements{}),
+			},
+		})
+	})
+})
+
 // LocalStorageCapacityIsolationEviction tests that container and volume local storage limits are enforced through evictions
 var _ = SIGDescribe("LocalStorageCapacityIsolationEviction [Slow] [Serial] [Disruptive] [Feature:LocalStorageCapacityIsolation][NodeFeature:Eviction]", func() {
 	f := framework.NewDefaultFramework("localstorage-eviction-test")
@@ -259,12 +300,6 @@ var _ = SIGDescribe("LocalStorageCapacityIsolationEviction [Slow] [Serial] [Disr
 					EmptyDir: &v1.EmptyDirVolumeSource{SizeLimit: &sizeLimit},
 				}, v1.ResourceRequirements{}),
 			},
-			{
-				evictionPriority: 1, // This pod should be evicted because of memory emptyDir usage violation
-				pod: diskConsumingPod("emptydir-memory-sizelimit", useOverLimit, &v1.VolumeSource{
-					EmptyDir: &v1.EmptyDirVolumeSource{Medium: "Memory", SizeLimit: &sizeLimit},
-				}, v1.ResourceRequirements{}),
-			},
 			{
 				evictionPriority: 1, // This pod should cross the container limit by writing to its writable layer.
pod: diskConsumingPod("container-disk-limit", useOverLimit, nil, v1.ResourceRequirements{Limits: containerLimit}), @@ -274,6 +309,12 @@ var _ = SIGDescribe("LocalStorageCapacityIsolationEviction [Slow] [Serial] [Disr pod: diskConsumingPod("container-emptydir-disk-limit", useOverLimit, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, v1.ResourceRequirements{Limits: containerLimit}), }, + { + evictionPriority: 0, // This pod should not be evicted because MemoryBackedVolumes cannot use more space than is allocated to them since SizeMemoryBackedVolumes was enabled + pod: diskConsumingPod("emptydir-memory-sizelimit", useOverLimit, &v1.VolumeSource{ + EmptyDir: &v1.EmptyDirVolumeSource{Medium: "Memory", SizeLimit: &sizeLimit}, + }, v1.ResourceRequirements{}), + }, { evictionPriority: 0, // This pod should not be evicted because it uses less than its limit pod: diskConsumingPod("emptydir-disk-below-sizelimit", useUnderLimit, &v1.VolumeSource{ From 7b913370681993a76bc4bf3670eb13a7146f5cc3 Mon Sep 17 00:00:00 2001 From: Danielle Lancashire Date: Thu, 12 Aug 2021 17:44:57 +0200 Subject: [PATCH 3/3] e2e_node: eviction: Include names of pending-eviction pods in error --- test/e2e_node/eviction_test.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/e2e_node/eviction_test.go b/test/e2e_node/eviction_test.go index a0389a50492..068b48f0a8e 100644 --- a/test/e2e_node/eviction_test.go +++ b/test/e2e_node/eviction_test.go @@ -658,6 +658,7 @@ func verifyEvictionOrdering(f *framework.Framework, testSpecs []podEvictSpec) er ginkgo.By("checking eviction ordering and ensuring important pods don't fail") done := true + pendingPods := []string{} for _, priorityPodSpec := range testSpecs { var priorityPod v1.Pod for _, p := range updatedPods { @@ -700,13 +701,14 @@ func verifyEvictionOrdering(f *framework.Framework, testSpecs []podEvictSpec) er // If a pod that is not evictionPriority 0 has not been evicted, we are not done if priorityPodSpec.evictionPriority != 0 && priorityPod.Status.Phase != v1.PodFailed { + pendingPods = append(pendingPods, priorityPod.ObjectMeta.Name) done = false } } if done { return nil } - return fmt.Errorf("pods that should be evicted are still running") + return fmt.Errorf("pods that should be evicted are still running: %#v", pendingPods) } func verifyEvictionEvents(f *framework.Framework, testSpecs []podEvictSpec, expectedStarvedResource v1.ResourceName) {