diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go
index b8f763712cb..90b1038cf15 100644
--- a/pkg/kubelet/eviction/eviction_manager.go
+++ b/pkg/kubelet/eviction/eviction_manager.go
@@ -233,7 +233,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 
 	activePods := podFunc()
 	// make observations and get a function to derive pod usage stats relative to those observations.
-	observations, statsFunc, err := makeSignalObservations(m.summaryProvider, capacityProvider, activePods, *m.dedicatedImageFs)
+	observations, statsFunc, err := makeSignalObservations(m.summaryProvider, capacityProvider, activePods)
 	if err != nil {
 		glog.Errorf("eviction manager: unexpected err: %v", err)
 		return nil
diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go
index 488e54d2e9e..f33aa0ca530 100644
--- a/pkg/kubelet/eviction/eviction_manager_test.go
+++ b/pkg/kubelet/eviction/eviction_manager_test.go
@@ -1436,297 +1436,3 @@ func TestAllocatableMemoryPressure(t *testing.T) {
 		}
 	}
 }
-
-// TestAllocatableNodeFsPressure
-func TestAllocatableNodeFsPressure(t *testing.T) {
-	utilfeature.DefaultFeatureGate.Set("LocalStorageCapacityIsolation=True")
-	enablePodPriority(true)
-	podMaker := makePodWithDiskStats
-	summaryStatsMaker := makeDiskStats
-
-	podsToMake := []podToMake{
-		{name: "low-priority-high-usage", priority: lowPriority, requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), rootFsUsed: "900Mi"},
-		{name: "below-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi"), logsFsUsed: "50Mi"},
-		{name: "above-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "2Gi"), rootFsUsed: "1750Mi"},
-		{name: "high-priority-high-usage", priority: highPriority, requests: newResourceList("", ""), limits: newResourceList("", ""), rootFsUsed: "400Mi"},
-		{name: "low-priority-low-usage", priority: lowPriority, requests: newResourceList("", ""), limits: newResourceList("", ""), rootFsUsed: "100Mi"},
-	}
-	pods := []*v1.Pod{}
-	podStats := map[*v1.Pod]statsapi.PodStats{}
-	for _, podToMake := range podsToMake {
-		pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed)
-		pods = append(pods, pod)
-		podStats[pod] = podStat
-	}
-	podToEvict := pods[0]
-	activePodsFunc := func() []*v1.Pod {
-		return pods
-	}
-
-	fakeClock := clock.NewFakeClock(time.Now())
-	podKiller := &mockPodKiller{}
-	diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
-	capacityProvider := newMockCapacityProvider(newEphemeralStorageResourceList("6Gi", "1000m", "10Gi"), newEphemeralStorageResourceList("1Gi", "1000m", "10Gi"))
-	diskGC := &mockDiskGC{imageBytesFreed: int64(0), err: nil}
-	nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
-
-	config := Config{
-		MaxPodGracePeriodSeconds: 5,
-		PressureTransitionPeriod: time.Minute * 5,
-		Thresholds: []evictionapi.Threshold{
-			{
-				Signal:   evictionapi.SignalAllocatableNodeFsAvailable,
-				Operator: evictionapi.OpLessThan,
-				Value: evictionapi.ThresholdValue{
-					Quantity: quantityMustParse("1Ki"),
-				},
-			},
-		},
-	}
-	summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("3Gi", "6Gi", podStats)}
-	manager := &managerImpl{
-		clock:                        fakeClock,
-		killPodFunc:                  podKiller.killPodNow,
-		imageGC:                      diskGC,
-		containerGC:                  diskGC,
-		config:                       config,
-		recorder:                     &record.FakeRecorder{},
-		summaryProvider:              summaryProvider,
-		nodeRef:                      nodeRef,
-		nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
-		thresholdsFirstObservedAt:    thresholdsObservedAt{},
-	}
-
-	// create a best effort pod to test admission
-	bestEffortPodToAdmit, _ := podMaker("best-admit", defaultPriority, newEphemeralStorageResourceList("", "", ""), newEphemeralStorageResourceList("", "", ""), "0Gi", "", "")
-	burstablePodToAdmit, _ := podMaker("burst-admit", defaultPriority, newEphemeralStorageResourceList("1Gi", "", ""), newEphemeralStorageResourceList("1Gi", "", ""), "1Gi", "", "")
-
-	// synchronize
-	manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider)
-
-	// we should not have disk pressure
-	if manager.IsUnderDiskPressure() {
-		t.Fatalf("Manager should not report disk pressure")
-	}
-
-	// try to admit our pods (they should succeed)
-	expected := []bool{true, true}
-	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
-		if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
-			t.Fatalf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
-		}
-	}
-
-	// induce disk pressure!
-	fakeClock.Step(1 * time.Minute)
-	pod, podStat := podMaker("guaranteed-high-2", defaultPriority, newEphemeralStorageResourceList("2000Mi", "100m", "1Gi"), newEphemeralStorageResourceList("2000Mi", "100m", "1Gi"), "2000Mi", "", "")
-	podStats[pod] = podStat
-	pods = append(pods, pod)
-	summaryProvider.result = summaryStatsMaker("6Gi", "6Gi", podStats)
-	manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider)
-
-	// we should have disk pressure
-	if !manager.IsUnderDiskPressure() {
-		t.Fatalf("Manager should report disk pressure")
-	}
-
-	// check the right pod was killed
-	if podKiller.pod != podToEvict {
-		t.Fatalf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, podToEvict.Name)
-	}
-	observedGracePeriod := *podKiller.gracePeriodOverride
-	if observedGracePeriod != int64(0) {
-		t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
-	}
-	// reset state
-	podKiller.pod = nil
-	podKiller.gracePeriodOverride = nil
-
-	// try to admit our pod (should fail)
-	expected = []bool{false, false}
-	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
-		if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
-			t.Fatalf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
-		}
-	}
-
-	// reduce disk pressure
-	fakeClock.Step(1 * time.Minute)
-	pods[5] = pods[len(pods)-1]
-	pods = pods[:len(pods)-1]
-
-	// we should have disk pressure (because transition period not yet met)
-	if !manager.IsUnderDiskPressure() {
-		t.Fatalf("Manager should report disk pressure")
-	}
-
-	// try to admit our pod (should fail)
-	expected = []bool{false, false}
-	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
-		if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
-			t.Fatalf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
-		}
-	}
-
-	// move the clock past transition period to ensure that we stop reporting pressure
-	fakeClock.Step(5 * time.Minute)
-	manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider)
-
-	// we should not have disk pressure (because transition period met)
-	if manager.IsUnderDiskPressure() {
-		t.Fatalf("Manager should not report disk pressure")
-	}
-
-	// no pod should have been killed
-	if podKiller.pod != nil {
-		t.Fatalf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name)
-	}
-
-	// all pods should admit now
-	expected = []bool{true, true}
-	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
-		if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
-			t.Fatalf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
-		}
-	}
-}
-
-func TestNodeReclaimForAllocatableFuncs(t *testing.T) {
-	utilfeature.DefaultFeatureGate.Set("LocalStorageCapacityIsolation=True")
-	enablePodPriority(true)
-	podMaker := makePodWithDiskStats
-	summaryStatsMaker := makeDiskStats
-	podsToMake := []podToMake{
-		{name: "low-priority-high-usage", priority: lowPriority, requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), rootFsUsed: "900Mi"},
-		{name: "below-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi"), logsFsUsed: "50Mi"},
-		{name: "above-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "2Gi"), rootFsUsed: "1750Mi"},
-		{name: "high-priority-high-usage", priority: highPriority, requests: newResourceList("", ""), limits: newResourceList("", ""), rootFsUsed: "400Mi"},
-		{name: "low-priority-low-usage", priority: lowPriority, requests: newResourceList("", ""), limits: newResourceList("", ""), rootFsUsed: "100Mi"},
-	}
-	pods := []*v1.Pod{}
-	podStats := map[*v1.Pod]statsapi.PodStats{}
-	for _, podToMake := range podsToMake {
-		pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed)
-		pods = append(pods, pod)
-		podStats[pod] = podStat
-	}
-	podToEvict := pods[0]
-	activePodsFunc := func() []*v1.Pod {
-		return pods
-	}
-
-	fakeClock := clock.NewFakeClock(time.Now())
-	podKiller := &mockPodKiller{}
-	diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
-	capacityProvider := newMockCapacityProvider(newEphemeralStorageResourceList("6Gi", "1000m", "10Gi"), newEphemeralStorageResourceList("1Gi", "1000m", "10Gi"))
-	imageGcFree := resource.MustParse("800Mi")
-	diskGC := &mockDiskGC{imageBytesFreed: imageGcFree.Value(), err: nil}
-	nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
-
-	config := Config{
-		MaxPodGracePeriodSeconds: 5,
-		PressureTransitionPeriod: time.Minute * 5,
-		Thresholds: []evictionapi.Threshold{
-			{
-				Signal:   evictionapi.SignalAllocatableNodeFsAvailable,
-				Operator: evictionapi.OpLessThan,
-				Value: evictionapi.ThresholdValue{
-					Quantity: quantityMustParse("10Mi"),
-				},
-			},
-		},
-	}
-	summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("6Gi", "6Gi", podStats)}
-	manager := &managerImpl{
-		clock:                        fakeClock,
-		killPodFunc:                  podKiller.killPodNow,
-		imageGC:                      diskGC,
-		containerGC:                  diskGC,
-		config:                       config,
-		recorder:                     &record.FakeRecorder{},
-		summaryProvider:              summaryProvider,
-		nodeRef:                      nodeRef,
-		nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
-		thresholdsFirstObservedAt:    thresholdsObservedAt{},
-	}
-
-	// synchronize
-	manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider)
-
-	// we should not have disk pressure
-	if manager.IsUnderDiskPressure() {
-		t.Errorf("Manager should not report disk pressure")
-	}
-
-	// induce hard threshold
-	fakeClock.Step(1 * time.Minute)
-
-	pod, podStat := podMaker("guaranteed-high-2", defaultPriority, newEphemeralStorageResourceList("2000Mi", "100m", "1Gi"), newEphemeralStorageResourceList("2000Mi", "100m", "1Gi"), "2000Mi", "", "")
-	podStats[pod] = podStat
-	pods = append(pods, pod)
-	summaryProvider.result = summaryStatsMaker("6Gi", "6Gi", podStats)
-	manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider)
-
-	// we should have disk pressure
-	if !manager.IsUnderDiskPressure() {
-		t.Fatalf("Manager should report disk pressure since soft threshold was met")
-	}
-
-	// verify image gc was invoked
-	if !diskGC.imageGCInvoked || !diskGC.containerGCInvoked {
-		t.Fatalf("Manager should have invoked image gc")
-	}
-
-	// verify no pod was killed because image gc was sufficient
-	if podKiller.pod == nil {
-		t.Fatalf("Manager should have killed a pod, but not killed")
-	}
-	// check the right pod was killed
-	if podKiller.pod != podToEvict {
-		t.Fatalf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, podToEvict.Name)
-	}
-	observedGracePeriod := *podKiller.gracePeriodOverride
-	if observedGracePeriod != int64(0) {
-		t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
-	}
-
-	// reset state
-	diskGC.imageGCInvoked = false
-	diskGC.containerGCInvoked = false
-	podKiller.pod = nil
-	podKiller.gracePeriodOverride = nil
-
-	// reduce disk pressure
-	fakeClock.Step(1 * time.Minute)
-	pods[5] = pods[len(pods)-1]
-	pods = pods[:len(pods)-1]
-
-	// we should have disk pressure (because transition period not yet met)
-	if !manager.IsUnderDiskPressure() {
-		t.Fatalf("Manager should report disk pressure")
-	}
-
-	// move the clock past transition period to ensure that we stop reporting pressure
-	fakeClock.Step(5 * time.Minute)
-	manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider)
-
-	// we should not have disk pressure (because transition period met)
-	if manager.IsUnderDiskPressure() {
-		t.Fatalf("Manager should not report disk pressure")
-	}
-
-	// no pod should have been killed
-	if podKiller.pod != nil {
-		t.Fatalf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name)
-	}
-
-	// no image gc should have occurred
-	if diskGC.imageGCInvoked || diskGC.containerGCInvoked {
-		t.Errorf("Manager chose to perform image gc when it was not neeed")
-	}
-
-	// no pod should have been killed
-	if podKiller.pod != nil {
-		t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name)
-	}
-}
diff --git a/pkg/kubelet/eviction/helpers.go b/pkg/kubelet/eviction/helpers.go
index 50d388a5af4..f364a9ca130 100644
--- a/pkg/kubelet/eviction/helpers.go
+++ b/pkg/kubelet/eviction/helpers.go
@@ -73,13 +73,11 @@ func init() {
 	signalToNodeCondition[evictionapi.SignalNodeFsAvailable] = v1.NodeDiskPressure
 	signalToNodeCondition[evictionapi.SignalImageFsInodesFree] = v1.NodeDiskPressure
 	signalToNodeCondition[evictionapi.SignalNodeFsInodesFree] = v1.NodeDiskPressure
-	signalToNodeCondition[evictionapi.SignalAllocatableNodeFsAvailable] = v1.NodeDiskPressure
 
 	// map signals to resources (and vice-versa)
 	signalToResource = map[evictionapi.Signal]v1.ResourceName{}
 	signalToResource[evictionapi.SignalMemoryAvailable] = v1.ResourceMemory
 	signalToResource[evictionapi.SignalAllocatableMemoryAvailable] = v1.ResourceMemory
-	signalToResource[evictionapi.SignalAllocatableNodeFsAvailable] = resourceNodeFs
 	signalToResource[evictionapi.SignalImageFsAvailable] = resourceImageFs
 	signalToResource[evictionapi.SignalImageFsInodesFree] = resourceImageFsInodes
 	signalToResource[evictionapi.SignalNodeFsAvailable] = resourceNodeFs
@@ -212,16 +210,6 @@ func getAllocatableThreshold(allocatableConfig []string) []evictionapi.Threshold
 					Quantity: resource.NewQuantity(int64(0), resource.BinarySI),
 				},
 			},
-			{
-				Signal:   evictionapi.SignalAllocatableNodeFsAvailable,
-				Operator: evictionapi.OpLessThan,
-				Value: evictionapi.ThresholdValue{
-					Quantity: resource.NewQuantity(int64(0), resource.BinarySI),
-				},
-				MinReclaim: &evictionapi.ThresholdValue{
-					Quantity: resource.NewQuantity(int64(0), resource.BinarySI),
-				},
-			},
 		}
 	}
 }
@@ -704,7 +692,7 @@ func (a byEvictionPriority) Less(i, j int) bool {
 }
 
 // makeSignalObservations derives observations using the specified summary provider.
-func makeSignalObservations(summaryProvider stats.SummaryProvider, capacityProvider CapacityProvider, pods []*v1.Pod, withImageFs bool) (signalObservations, statsFunc, error) {
+func makeSignalObservations(summaryProvider stats.SummaryProvider, capacityProvider CapacityProvider, pods []*v1.Pod) (signalObservations, statsFunc, error) {
 	summary, err := summaryProvider.Get()
 	if err != nil {
 		return nil, nil, err
@@ -756,11 +744,11 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider, capacityProvi
 		}
 	}
 
-	nodeCapacity := capacityProvider.GetCapacity()
-	allocatableReservation := capacityProvider.GetNodeAllocatableReservation()
-
-	memoryAllocatableCapacity, memoryAllocatableAvailable, exist := getResourceAllocatable(nodeCapacity, allocatableReservation, v1.ResourceMemory)
-	if exist {
+	if memoryAllocatableCapacity, ok := capacityProvider.GetCapacity()[v1.ResourceMemory]; ok {
+		memoryAllocatableAvailable := memoryAllocatableCapacity.Copy()
+		if reserved, exists := capacityProvider.GetNodeAllocatableReservation()[v1.ResourceMemory]; exists {
+			memoryAllocatableAvailable.Sub(reserved)
+		}
 		for _, pod := range summary.Pods {
 			mu, err := podMemoryUsage(pod)
 			if err == nil {
@@ -769,55 +757,15 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider, capacityProvi
 			}
 		}
 		result[evictionapi.SignalAllocatableMemoryAvailable] = signalObservation{
 			available: memoryAllocatableAvailable,
-			capacity:  memoryAllocatableCapacity,
-		}
-	}
-
-	if utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) {
-		ephemeralStorageCapacity, ephemeralStorageAllocatable, exist := getResourceAllocatable(nodeCapacity, allocatableReservation, v1.ResourceEphemeralStorage)
-		if exist {
-			for _, pod := range pods {
-				podStat, ok := statsFunc(pod)
-				if !ok {
-					continue
-				}
-
-				fsStatsSet := []fsStatsType{}
-				if withImageFs {
-					fsStatsSet = []fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}
-				} else {
-					fsStatsSet = []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}
-				}
-
-				usage, err := podDiskUsage(podStat, pod, fsStatsSet)
-				if err != nil {
-					glog.Warningf("eviction manager: error getting pod disk usage %v", err)
-					continue
-				}
-				ephemeralStorageAllocatable.Sub(usage[resourceDisk])
-			}
-			result[evictionapi.SignalAllocatableNodeFsAvailable] = signalObservation{
-				available: ephemeralStorageAllocatable,
-				capacity:  ephemeralStorageCapacity,
-			}
+			capacity:  &memoryAllocatableCapacity,
 		}
+	} else {
+		glog.Errorf("Could not find capacity information for resource %v", v1.ResourceMemory)
 	}
 	return result, statsFunc, nil
 }
 
-func getResourceAllocatable(capacity v1.ResourceList, reservation v1.ResourceList, resourceName v1.ResourceName) (*resource.Quantity, *resource.Quantity, bool) {
-	if capacity, ok := capacity[resourceName]; ok {
-		allocate := capacity.Copy()
-		if reserved, exists := reservation[resourceName]; exists {
-			allocate.Sub(reserved)
-		}
-		return capacity.Copy(), allocate, true
-	}
-	glog.Errorf("Could not find capacity information for resource %v", resourceName)
-	return nil, nil, false
-}
-
 // thresholdsMet returns the set of thresholds that were met independent of grace period
 func thresholdsMet(thresholds []evictionapi.Threshold, observations signalObservations, enforceMinReclaim bool) []evictionapi.Threshold {
 	results := []evictionapi.Threshold{}
diff --git a/pkg/kubelet/eviction/helpers_test.go b/pkg/kubelet/eviction/helpers_test.go
index d5403be1498..2b2d5f98026 100644
--- a/pkg/kubelet/eviction/helpers_test.go
+++ b/pkg/kubelet/eviction/helpers_test.go
@@ -78,16 +78,6 @@ func TestParseThresholdConfig(t *testing.T) {
 						Quantity: quantityMustParse("0"),
 					},
 				},
-				{
-					Signal:   evictionapi.SignalAllocatableNodeFsAvailable,
-					Operator: evictionapi.OpLessThan,
-					Value: evictionapi.ThresholdValue{
-						Quantity: quantityMustParse("0"),
-					},
-					MinReclaim: &evictionapi.ThresholdValue{
-						Quantity: quantityMustParse("0"),
-					},
-				},
 				{
 					Signal:   evictionapi.SignalMemoryAvailable,
 					Operator: evictionapi.OpLessThan,
@@ -793,7 +783,7 @@ func TestMakeSignalObservations(t *testing.T) {
 	if res.CmpInt64(int64(allocatableMemoryCapacity)) != 0 {
 		t.Errorf("Expected Threshold %v to be equal to value %v", res.Value(), allocatableMemoryCapacity)
 	}
-	actualObservations, statsFunc, err := makeSignalObservations(provider, capacityProvider, pods, false)
+	actualObservations, statsFunc, err := makeSignalObservations(provider, capacityProvider, pods)
 	if err != nil {
 		t.Errorf("Unexpected err: %v", err)
 	}
diff --git a/test/e2e_node/eviction_test.go b/test/e2e_node/eviction_test.go
index 067c911e121..3e23280081d 100644
--- a/test/e2e_node/eviction_test.go
+++ b/test/e2e_node/eviction_test.go
@@ -119,42 +119,6 @@ var _ = framework.KubeDescribe("MemoryAllocatableEviction [Slow] [Serial] [Disru
 	})
 })
 
-// LocalStorageAllocatableEviction tests that the node responds to node disk pressure by evicting only responsible pods.
-// Node disk pressure is only encountered because we reserve the majority of the node's capacity via kube-reserved.
-var _ = framework.KubeDescribe("LocalStorageAllocatableEviction [Slow] [Serial] [Disruptive] [Flaky]", func() {
-	f := framework.NewDefaultFramework("localstorageallocatable-eviction-test")
-	pressureTimeout := 10 * time.Minute
-	expectedNodeCondition := v1.NodeDiskPressure
-	Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() {
-		// Set up --kube-reserved for scratch storage
-		tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
-			diskConsumed := uint64(200000000) // At least 200 Mb for pods to consume
-			summary := eventuallyGetSummary()
-			availableBytes := *(summary.Node.Fs.AvailableBytes)
-			initialConfig.KubeReserved = map[string]string{
-				string(v1.ResourceEphemeralStorage): fmt.Sprintf("%d", availableBytes-diskConsumed),
-			}
-			initialConfig.EnforceNodeAllocatable = []string{cm.NodeAllocatableEnforcementKey}
-			initialConfig.CgroupsPerQOS = true
-			initialConfig.FeatureGates[string(features.LocalStorageCapacityIsolation)] = true
-			// set evictionHard to be very small, so that only the allocatable eviction threshold triggers
-			initialConfig.EvictionHard = map[string]string{"nodefs.available": "1"}
-			initialConfig.EvictionMinimumReclaim = map[string]string{}
-			framework.Logf("KubeReserved: %+v", initialConfig.KubeReserved)
-		})
-		runEvictionTest(f, pressureTimeout, expectedNodeCondition, logDiskMetrics, []podEvictSpec{
-			{
-				evictionPriority: 1,
-				pod:              diskConsumingPod("container-disk-hog", 10000, nil, v1.ResourceRequirements{}),
-			},
-			{
-				evictionPriority: 0,
-				pod:              innocentPod(),
-			},
-		})
-	})
-})
-
 // LocalStorageEviction tests that the node responds to node disk pressure by evicting only responsible pods
 // Disk pressure is induced by running pods which consume disk space.
 var _ = framework.KubeDescribe("LocalStorageEviction [Slow] [Serial] [Disruptive] [Flaky]", func() {