diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go
index d5fc2a49578..14596188f6d 100644
--- a/pkg/kubelet/eviction/eviction_manager.go
+++ b/pkg/kubelet/eviction/eviction_manager.go
@@ -436,14 +436,14 @@ func (m *managerImpl) reclaimNodeLevelResources(resourceToReclaim v1.ResourceNam
 	}
 	// update our local observations based on the amount reported to have been reclaimed.
 	// note: this is optimistic, other things could have been still consuming the pressured resource in the interim.
-	signal := resourceToSignal[resourceToReclaim]
-	value, ok := observations[signal]
-	if !ok {
-		glog.Errorf("eviction manager: unable to find value associated with signal %v", signal)
-		continue
+	for _, signal := range resourceClaimToSignal[resourceToReclaim] {
+		value, ok := observations[signal]
+		if !ok {
+			glog.Errorf("eviction manager: unable to find value associated with signal %v", signal)
+			continue
+		}
+		value.available.Add(*reclaimed)
 	}
-	value.available.Add(*reclaimed)
-
 	// evaluate all current thresholds to see if with adjusted observations, we think we have met min reclaim goals
 	if len(thresholdsMet(m.thresholdsMet, observations, true)) == 0 {
 		return true
diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go
index 70689b40302..057eeb7b749 100644
--- a/pkg/kubelet/eviction/eviction_manager_test.go
+++ b/pkg/kubelet/eviction/eviction_manager_test.go
@@ -1428,3 +1428,295 @@ func TestAllocatableMemoryPressure(t *testing.T) {
 		}
 	}
 }
+
+// TestAllocatableNodeFsPressure verifies eviction and admission behavior when the allocatable nodefs signal is under pressure.
+func TestAllocatableNodeFsPressure(t *testing.T) {
+	podMaker := makePodWithDiskStats
+	summaryStatsMaker := makeDiskStats
+
+	podsToMake := []podToMake{
+		{name: "guaranteed-low", requests: newEphemeralStorageResourceList("200Mi", "100m", "1Gi"), limits: newEphemeralStorageResourceList("200Mi", "100m", "1Gi"), rootFsUsed: "200Mi"},
+		{name: "guaranteed-high", requests: newEphemeralStorageResourceList("800Mi", "100m", "1Gi"), limits: newEphemeralStorageResourceList("800Mi", "100m", "1Gi"), rootFsUsed: "800Mi"},
+		{name: "burstable-low", requests: newEphemeralStorageResourceList("300Mi", "100m", "100Mi"), limits: newEphemeralStorageResourceList("300Mi", "200m", "1Gi"), logsFsUsed: "300Mi"},
+		{name: "burstable-high", requests: newEphemeralStorageResourceList("800Mi", "100m", "100Mi"), limits: newEphemeralStorageResourceList("800Mi", "200m", "1Gi"), rootFsUsed: "800Mi"},
+		{name: "best-effort-low", requests: newEphemeralStorageResourceList("300Mi", "", ""), limits: newEphemeralStorageResourceList("300Mi", "", ""), logsFsUsed: "300Mi"},
+		{name: "best-effort-high", requests: newEphemeralStorageResourceList("800Mi", "", ""), limits: newEphemeralStorageResourceList("800Mi", "", ""), rootFsUsed: "800Mi"},
+	}
+	pods := []*v1.Pod{}
+	podStats := map[*v1.Pod]statsapi.PodStats{}
+	for _, podToMake := range podsToMake {
+		pod, podStat := podMaker(podToMake.name, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed)
+		pods = append(pods, pod)
+		podStats[pod] = podStat
+	}
+	podToEvict := pods[5]
+	activePodsFunc := func() []*v1.Pod {
+		return pods
+	}
+
+	fakeClock := clock.NewFakeClock(time.Now())
+	podKiller := &mockPodKiller{}
+	diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
+	capacityProvider := newMockCapacityProvider(newEphemeralStorageResourceList("6Gi", "1000m", "10Gi"), newEphemeralStorageResourceList("1Gi", "1000m", "10Gi"))
+	diskGC := &mockDiskGC{imageBytesFreed: int64(0), err: nil}
+	nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
+
+	config := Config{
+		MaxPodGracePeriodSeconds: 5,
+		PressureTransitionPeriod: time.Minute * 5,
+		Thresholds: []evictionapi.Threshold{
+			{
+				Signal:   evictionapi.SignalAllocatableNodeFsAvailable,
+				Operator: evictionapi.OpLessThan,
+				Value: evictionapi.ThresholdValue{
+					Quantity: quantityMustParse("1Ki"),
+				},
+			},
+		},
+	}
+	summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("3Gi", "6Gi", podStats)}
+	manager := &managerImpl{
+		clock:                        fakeClock,
+		killPodFunc:                  podKiller.killPodNow,
+		imageGC:                      diskGC,
+		containerGC:                  diskGC,
+		config:                       config,
+		recorder:                     &record.FakeRecorder{},
+		summaryProvider:              summaryProvider,
+		nodeRef:                      nodeRef,
+		nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
+		thresholdsFirstObservedAt:    thresholdsObservedAt{},
+	}
+
+	// create a best-effort and a burstable pod to test admission
+	bestEffortPodToAdmit, _ := podMaker("best-admit", newEphemeralStorageResourceList("", "", ""), newEphemeralStorageResourceList("", "", ""), "0Gi", "", "")
+	burstablePodToAdmit, _ := podMaker("burst-admit", newEphemeralStorageResourceList("1Gi", "", ""), newEphemeralStorageResourceList("1Gi", "", ""), "1Gi", "", "")
+
+	// synchronize
+	manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider)
+
+	// we should not have disk pressure
+	if manager.IsUnderDiskPressure() {
+		t.Fatalf("Manager should not report disk pressure")
+	}
+
+	// try to admit our pods (they should succeed)
+	expected := []bool{true, true}
+	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
+		if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
+			t.Fatalf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
+		}
+	}
+
+	// induce disk pressure!
+	fakeClock.Step(1 * time.Minute)
+	pod, podStat := podMaker("guaranteed-high-2", newEphemeralStorageResourceList("2000Mi", "100m", "1Gi"), newEphemeralStorageResourceList("2000Mi", "100m", "1Gi"), "2000Mi", "", "")
+	podStats[pod] = podStat
+	pods = append(pods, pod)
+	summaryProvider.result = summaryStatsMaker("6Gi", "6Gi", podStats)
+	manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider)
+
+	// we should have disk pressure
+	if !manager.IsUnderDiskPressure() {
+		t.Fatalf("Manager should report disk pressure")
+	}
+
+	// check the right pod was killed
+	if podKiller.pod != podToEvict {
+		t.Fatalf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, podToEvict.Name)
+	}
+	observedGracePeriod := *podKiller.gracePeriodOverride
+	if observedGracePeriod != int64(0) {
+		t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
Expected: %d, actual: %d", 0, observedGracePeriod) + } + // reset state + podKiller.pod = nil + podKiller.gracePeriodOverride = nil + + // try to admit our pod (should fail) + expected = []bool{false, false} + for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} { + if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit { + t.Fatalf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit) + } + } + + // reduce disk pressure + fakeClock.Step(1 * time.Minute) + pods[5] = pods[len(pods)-1] + pods = pods[:len(pods)-1] + + // we should have disk pressure (because transition period not yet met) + if !manager.IsUnderDiskPressure() { + t.Fatalf("Manager should report disk pressure") + } + + // try to admit our pod (should fail) + expected = []bool{false, false} + for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} { + if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit { + t.Fatalf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit) + } + } + + // move the clock past transition period to ensure that we stop reporting pressure + fakeClock.Step(5 * time.Minute) + manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + + // we should not have disk pressure (because transition period met) + if manager.IsUnderDiskPressure() { + t.Fatalf("Manager should not report disk pressure") + } + + // no pod should have been killed + if podKiller.pod != nil { + t.Fatalf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) + } + + // all pods should admit now + expected = []bool{true, true} + for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} { + if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit { + t.Fatalf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit) + } + } +} + +func TestNodeReclaimForAllocatableFuncs(t *testing.T) { + podMaker := makePodWithDiskStats + summaryStatsMaker := makeDiskStats + podsToMake := []podToMake{ + {name: "guaranteed-low", requests: newEphemeralStorageResourceList("200Mi", "100m", "1Gi"), limits: newEphemeralStorageResourceList("200Mi", "100m", "1Gi"), rootFsUsed: "200Mi"}, + {name: "guaranteed-high", requests: newEphemeralStorageResourceList("800Mi", "100m", "1Gi"), limits: newEphemeralStorageResourceList("800Mi", "100m", "1Gi"), rootFsUsed: "800Mi"}, + {name: "burstable-low", requests: newEphemeralStorageResourceList("300Mi", "100m", "100Mi"), limits: newEphemeralStorageResourceList("300Mi", "200m", "1Gi"), logsFsUsed: "300Mi"}, + {name: "burstable-high", requests: newEphemeralStorageResourceList("800Mi", "100m", "100Mi"), limits: newEphemeralStorageResourceList("800Mi", "200m", "1Gi"), rootFsUsed: "800Mi"}, + {name: "best-effort-low", requests: newEphemeralStorageResourceList("300Mi", "", ""), limits: newEphemeralStorageResourceList("300Mi", "", ""), logsFsUsed: "300Mi"}, + {name: "best-effort-high", requests: newEphemeralStorageResourceList("800Mi", "", ""), limits: newEphemeralStorageResourceList("800Mi", "", ""), rootFsUsed: "800Mi"}, + } + pods := []*v1.Pod{} + podStats := map[*v1.Pod]statsapi.PodStats{} + for _, podToMake := range podsToMake { + pod, podStat := podMaker(podToMake.name, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed) + pods = append(pods, pod) + podStats[pod] = podStat + } + podToEvict := 
+	activePodsFunc := func() []*v1.Pod {
+		return pods
+	}
+
+	fakeClock := clock.NewFakeClock(time.Now())
+	podKiller := &mockPodKiller{}
+	diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
+	capacityProvider := newMockCapacityProvider(newEphemeralStorageResourceList("6Gi", "1000m", "10Gi"), newEphemeralStorageResourceList("1Gi", "1000m", "10Gi"))
+	imageGcFree := resource.MustParse("800Mi")
+	diskGC := &mockDiskGC{imageBytesFreed: imageGcFree.Value(), err: nil}
+	nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
+
+	config := Config{
+		MaxPodGracePeriodSeconds: 5,
+		PressureTransitionPeriod: time.Minute * 5,
+		Thresholds: []evictionapi.Threshold{
+			{
+				Signal:   evictionapi.SignalAllocatableNodeFsAvailable,
+				Operator: evictionapi.OpLessThan,
+				Value: evictionapi.ThresholdValue{
+					Quantity: quantityMustParse("10Mi"),
+				},
+			},
+		},
+	}
+	summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("6Gi", "6Gi", podStats)}
+	manager := &managerImpl{
+		clock:                        fakeClock,
+		killPodFunc:                  podKiller.killPodNow,
+		imageGC:                      diskGC,
+		containerGC:                  diskGC,
+		config:                       config,
+		recorder:                     &record.FakeRecorder{},
+		summaryProvider:              summaryProvider,
+		nodeRef:                      nodeRef,
+		nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
+		thresholdsFirstObservedAt:    thresholdsObservedAt{},
+	}
+
+	// synchronize
+	manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider)
+
+	// we should not have disk pressure
+	if manager.IsUnderDiskPressure() {
+		t.Errorf("Manager should not report disk pressure")
+	}
+
+	// induce hard threshold
+	fakeClock.Step(1 * time.Minute)
+
+	pod, podStat := podMaker("guaranteed-high-2", newEphemeralStorageResourceList("2000Mi", "100m", "1Gi"), newEphemeralStorageResourceList("2000Mi", "100m", "1Gi"), "2000Mi", "", "")
+	podStats[pod] = podStat
+	pods = append(pods, pod)
+	summaryProvider.result = summaryStatsMaker("6Gi", "6Gi", podStats)
+	manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider)
+
+	// we should have disk pressure
+	if !manager.IsUnderDiskPressure() {
+		t.Fatalf("Manager should report disk pressure since hard threshold was met")
+	}
+
+	// verify image gc was invoked
+	if !diskGC.imageGCInvoked || !diskGC.containerGCInvoked {
+		t.Fatalf("Manager should have invoked image gc")
+	}
+
+	// verify a pod was still killed, since image gc does not relieve allocatable pressure
+	if podKiller.pod == nil {
+		t.Fatalf("Manager should have killed a pod, but it did not")
+	}
+	// check the right pod was killed
+	if podKiller.pod != podToEvict {
+		t.Fatalf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, podToEvict.Name)
+	}
+	observedGracePeriod := *podKiller.gracePeriodOverride
+	if observedGracePeriod != int64(0) {
+		t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
Expected: %d, actual: %d", 0, observedGracePeriod) + } + + // reset state + diskGC.imageGCInvoked = false + diskGC.containerGCInvoked = false + podKiller.pod = nil + podKiller.gracePeriodOverride = nil + + // reduce disk pressure + fakeClock.Step(1 * time.Minute) + pods[5] = pods[len(pods)-1] + pods = pods[:len(pods)-1] + + // we should have disk pressure (because transition period not yet met) + if !manager.IsUnderDiskPressure() { + t.Fatalf("Manager should report disk pressure") + } + + // move the clock past transition period to ensure that we stop reporting pressure + fakeClock.Step(5 * time.Minute) + manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + + // we should not have disk pressure (because transition period met) + if manager.IsUnderDiskPressure() { + t.Fatalf("Manager should not report disk pressure") + } + + // no pod should have been killed + if podKiller.pod != nil { + t.Fatalf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) + } + + // no image gc should have occurred + if diskGC.imageGCInvoked || diskGC.containerGCInvoked { + t.Errorf("Manager chose to perform image gc when it was not neeed") + } + + // no pod should have been killed + if podKiller.pod != nil { + t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) + } +} diff --git a/pkg/kubelet/eviction/helpers.go b/pkg/kubelet/eviction/helpers.go index dd04e9ff625..9c639d5064f 100644 --- a/pkg/kubelet/eviction/helpers.go +++ b/pkg/kubelet/eviction/helpers.go @@ -59,8 +59,8 @@ var ( signalToNodeCondition map[evictionapi.Signal]v1.NodeConditionType // signalToResource maps a Signal to its associated Resource. signalToResource map[evictionapi.Signal]v1.ResourceName - // resourceToSignal maps a Resource to its associated Signal - resourceToSignal map[v1.ResourceName]evictionapi.Signal + // resourceClaimToSignal maps a Resource that can be reclaimed to its associated Signal + resourceClaimToSignal map[v1.ResourceName][]evictionapi.Signal ) func init() { @@ -84,13 +84,12 @@ func init() { signalToResource[evictionapi.SignalNodeFsAvailable] = resourceNodeFs signalToResource[evictionapi.SignalNodeFsInodesFree] = resourceNodeFsInodes - resourceToSignal = map[v1.ResourceName]evictionapi.Signal{} - for key, value := range signalToResource { - resourceToSignal[value] = key - } - // Hard-code here to make sure resourceNodeFs maps to evictionapi.SignalNodeFsAvailable - // (TODO) resourceToSignal is a map from resource name to a list of signals - resourceToSignal[resourceNodeFs] = evictionapi.SignalNodeFsAvailable + // maps resource to signals (the following resource could be reclaimed) + resourceClaimToSignal = map[v1.ResourceName][]evictionapi.Signal{} + resourceClaimToSignal[resourceNodeFs] = []evictionapi.Signal{evictionapi.SignalNodeFsAvailable} + resourceClaimToSignal[resourceImageFs] = []evictionapi.Signal{evictionapi.SignalImageFsAvailable} + resourceClaimToSignal[resourceNodeFsInodes] = []evictionapi.Signal{evictionapi.SignalNodeFsInodesFree} + resourceClaimToSignal[resourceImageFsInodes] = []evictionapi.Signal{evictionapi.SignalImageFsInodesFree} } // validSignal returns true if the signal is supported. 
diff --git a/pkg/kubelet/eviction/helpers_test.go b/pkg/kubelet/eviction/helpers_test.go
index 49363965a25..eb0c1b1dcb5 100644
--- a/pkg/kubelet/eviction/helpers_test.go
+++ b/pkg/kubelet/eviction/helpers_test.go
@@ -1604,6 +1604,20 @@ func newResourceList(cpu, memory string) v1.ResourceList {
 	return res
 }
 
+func newEphemeralStorageResourceList(ephemeral, cpu, memory string) v1.ResourceList {
+	res := v1.ResourceList{}
+	if ephemeral != "" {
+		res[v1.ResourceEphemeralStorage] = resource.MustParse(ephemeral)
+	}
+	if cpu != "" {
+		res[v1.ResourceCPU] = resource.MustParse(cpu)
+	}
+	if memory != "" {
+		res[v1.ResourceMemory] = resource.MustParse(memory)
+	}
+	return res
+}
+
 func newResourceRequirements(requests, limits v1.ResourceList) v1.ResourceRequirements {
 	res := v1.ResourceRequirements{}
 	res.Requests = requests