diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go
index adb4ee85bd4..3ee12a03650 100644
--- a/pkg/kubelet/eviction/eviction_manager.go
+++ b/pkg/kubelet/eviction/eviction_manager.go
@@ -54,6 +54,8 @@ type managerImpl struct {
 	summaryProvider stats.SummaryProvider
 	// records when a threshold was first observed
 	thresholdsFirstObservedAt thresholdsObservedAt
+	// records the set of thresholds that have been met (including graceperiod) but not yet resolved
+	thresholdsMet []Threshold
 	// resourceToRankFunc maps a resource to ranking function for that resource.
 	resourceToRankFunc map[api.ResourceName]rankFunc
 }
@@ -158,7 +160,13 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) {
 	now := m.clock.Now()
 
 	// determine the set of thresholds met independent of grace period
-	thresholds = thresholdsMet(thresholds, observations)
+	thresholds = thresholdsMet(thresholds, observations, false)
+
+	// determine the set of thresholds previously met that have not yet satisfied the associated min-reclaim
+	if len(m.thresholdsMet) > 0 {
+		thresholdsNotYetResolved := thresholdsMet(m.thresholdsMet, observations, true)
+		thresholds = mergeThresholds(thresholds, thresholdsNotYetResolved)
+	}
 
 	// track when a threshold was first observed
 	thresholdsFirstObservedAt := thresholdsFirstObservedAt(thresholds, m.thresholdsFirstObservedAt, now)
@@ -180,6 +188,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) {
 	m.nodeConditions = nodeConditions
 	m.thresholdsFirstObservedAt = thresholdsFirstObservedAt
 	m.nodeConditionsLastObservedAt = nodeConditionsLastObservedAt
+	m.thresholdsMet = thresholds
 	m.Unlock()
 
 	// determine the set of resources under starvation
diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go
index 0d8c365838e..e7933ecdae6 100644
--- a/pkg/kubelet/eviction/eviction_manager_test.go
+++ b/pkg/kubelet/eviction/eviction_manager_test.go
@@ -514,3 +514,161 @@ func TestDiskPressureNodeFs(t *testing.T) {
 		t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit)
 	}
 }
+
+// TestMinReclaim verifies that min-reclaim works as desired.
+func TestMinReclaim(t *testing.T) {
+	podMaker := func(name string, requests api.ResourceList, limits api.ResourceList, memoryWorkingSet string) (*api.Pod, statsapi.PodStats) {
+		pod := newPod(name, []api.Container{
+			newContainer(name, requests, limits),
+		}, nil)
+		podStats := newPodMemoryStats(pod, resource.MustParse(memoryWorkingSet))
+		return pod, podStats
+	}
+	summaryStatsMaker := func(nodeAvailableBytes string, podStats map[*api.Pod]statsapi.PodStats) *statsapi.Summary {
+		val := resource.MustParse(nodeAvailableBytes)
+		availableBytes := uint64(val.Value())
+		result := &statsapi.Summary{
+			Node: statsapi.NodeStats{
+				Memory: &statsapi.MemoryStats{
+					AvailableBytes: &availableBytes,
+				},
+			},
+			Pods: []statsapi.PodStats{},
+		}
+		for _, podStat := range podStats {
+			result.Pods = append(result.Pods, podStat)
+		}
+		return result
+	}
+	podsToMake := []struct {
+		name             string
+		requests         api.ResourceList
+		limits           api.ResourceList
+		memoryWorkingSet string
+	}{
+		{name: "best-effort-high", requests: newResourceList("", ""), limits: newResourceList("", ""), memoryWorkingSet: "500Mi"},
+		{name: "best-effort-low", requests: newResourceList("", ""), limits: newResourceList("", ""), memoryWorkingSet: "300Mi"},
+		{name: "burstable-high", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi"), memoryWorkingSet: "800Mi"},
+		{name: "burstable-low", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi"), memoryWorkingSet: "300Mi"},
+		{name: "guaranteed-high", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), memoryWorkingSet: "800Mi"},
+		{name: "guaranteed-low", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), memoryWorkingSet: "200Mi"},
+	}
+	pods := []*api.Pod{}
+	podStats := map[*api.Pod]statsapi.PodStats{}
+	for _, podToMake := range podsToMake {
+		pod, podStat := podMaker(podToMake.name, podToMake.requests, podToMake.limits, podToMake.memoryWorkingSet)
+		pods = append(pods, pod)
+		podStats[pod] = podStat
+	}
+	activePodsFunc := func() []*api.Pod {
+		return pods
+	}
+
+	fakeClock := clock.NewFakeClock(time.Now())
+	podKiller := &mockPodKiller{}
+	diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
+	nodeRef := &api.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
+
+	config := Config{
+		MaxPodGracePeriodSeconds: 5,
+		PressureTransitionPeriod: time.Minute * 5,
+		Thresholds: []Threshold{
+			{
+				Signal:     SignalMemoryAvailable,
+				Operator:   OpLessThan,
+				Value:      quantityMustParse("1Gi"),
+				MinReclaim: quantityMustParse("500Mi"),
+			},
+		},
+	}
+	summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("2Gi", podStats)}
+	manager := &managerImpl{
+		clock:                        fakeClock,
+		killPodFunc:                  podKiller.killPodNow,
+		config:                       config,
+		recorder:                     &record.FakeRecorder{},
+		summaryProvider:              summaryProvider,
+		nodeRef:                      nodeRef,
+		nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
+		thresholdsFirstObservedAt:    thresholdsObservedAt{},
+	}
+
+	// synchronize
+	manager.synchronize(diskInfoProvider, activePodsFunc)
+
+	// we should not have memory pressure
+	if manager.IsUnderMemoryPressure() {
+		t.Errorf("Manager should not report memory pressure")
+	}
+
+	// induce memory pressure!
+	fakeClock.Step(1 * time.Minute)
+	summaryProvider.result = summaryStatsMaker("500Mi", podStats)
+	manager.synchronize(diskInfoProvider, activePodsFunc)
+
+	// we should have memory pressure
+	if !manager.IsUnderMemoryPressure() {
+		t.Errorf("Manager should report memory pressure")
+	}
+
+	// check the right pod was killed
+	if podKiller.pod != pods[0] {
+		t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, pods[0])
+	}
+	observedGracePeriod := *podKiller.gracePeriodOverride
+	if observedGracePeriod != int64(0) {
+		t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
+	}
+
+	// reduce memory pressure, but not below the min-reclaim amount
+	fakeClock.Step(1 * time.Minute)
+	summaryProvider.result = summaryStatsMaker("1.2Gi", podStats)
+	podKiller.pod = nil // reset state
+	manager.synchronize(diskInfoProvider, activePodsFunc)
+
+	// we should have memory pressure (because transition period not yet met)
+	if !manager.IsUnderMemoryPressure() {
+		t.Errorf("Manager should report memory pressure")
+	}
+
+	// check the right pod was killed
+	if podKiller.pod != pods[0] {
+		t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, pods[0])
+	}
+	observedGracePeriod = *podKiller.gracePeriodOverride
+	if observedGracePeriod != int64(0) {
+		t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
+	}
+
+	// reduce memory pressure and ensure the min-reclaim amount
+	fakeClock.Step(1 * time.Minute)
+	summaryProvider.result = summaryStatsMaker("2Gi", podStats)
+	podKiller.pod = nil // reset state
+	manager.synchronize(diskInfoProvider, activePodsFunc)
+
+	// we should have memory pressure (because transition period not yet met)
+	if !manager.IsUnderMemoryPressure() {
+		t.Errorf("Manager should report memory pressure")
+	}
+
+	// no pod should have been killed
+	if podKiller.pod != nil {
+		t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod)
+	}
+
+	// move the clock past transition period to ensure that we stop reporting pressure
+	fakeClock.Step(5 * time.Minute)
+	summaryProvider.result = summaryStatsMaker("2Gi", podStats)
+	podKiller.pod = nil // reset state
+	manager.synchronize(diskInfoProvider, activePodsFunc)
+
+	// we should not have memory pressure (because transition period met)
+	if manager.IsUnderMemoryPressure() {
+		t.Errorf("Manager should not report memory pressure")
+	}
+
+	// no pod should have been killed
+	if podKiller.pod != nil {
+		t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod)
+	}
+}
diff --git a/pkg/kubelet/eviction/helpers.go b/pkg/kubelet/eviction/helpers.go
index 2709153cd8c..1e2f3dd1a0f 100644
--- a/pkg/kubelet/eviction/helpers.go
+++ b/pkg/kubelet/eviction/helpers.go
@@ -534,7 +534,7 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObservations, error) {
 }
 
 // thresholdsMet returns the set of thresholds that were met independent of grace period
-func thresholdsMet(thresholds []Threshold, observations signalObservations) []Threshold {
+func thresholdsMet(thresholds []Threshold, observations signalObservations, enforceMinReclaim bool) []Threshold {
 	results := []Threshold{}
 	for i := range thresholds {
 		threshold := thresholds[i]
@@ -545,7 +545,12 @@ func thresholdsMet(thresholds []Threshold, observations signalObservations) []Threshold {
 		}
 		// determine if we have met the specified threshold
 		thresholdMet := false
-		thresholdResult := threshold.Value.Cmp(*observed)
+		quantity := threshold.Value.Copy()
+		// if enforceMinReclaim is specified, we compare relative to value - minreclaim
+		if enforceMinReclaim && threshold.MinReclaim != nil {
+			quantity.Add(*threshold.MinReclaim)
+		}
+		thresholdResult := quantity.Cmp(*observed)
 		switch threshold.Operator {
 		case OpLessThan:
 			thresholdMet = thresholdResult > 0
@@ -589,7 +594,9 @@ func nodeConditions(thresholds []Threshold) []api.NodeConditionType {
 	results := []api.NodeConditionType{}
 	for _, threshold := range thresholds {
 		if nodeCondition, found := signalToNodeCondition[threshold.Signal]; found {
-			results = append(results, nodeCondition)
+			if !hasNodeCondition(results, nodeCondition) {
+				results = append(results, nodeCondition)
+			}
 		}
 	}
 	return results
@@ -644,7 +651,18 @@ func hasNodeCondition(inputs []api.NodeConditionType, item api.NodeConditionType) bool {
 	return false
 }
 
-// hasThreshold returns true if the node condition is in the input list
+// mergeThresholds will merge both threshold lists eliminating duplicates.
+func mergeThresholds(inputsA []Threshold, inputsB []Threshold) []Threshold {
+	results := inputsA
+	for _, threshold := range inputsB {
+		if !hasThreshold(results, threshold) {
+			results = append(results, threshold)
+		}
+	}
+	return results
+}
+
+// hasThreshold returns true if the threshold is in the input list
 func hasThreshold(inputs []Threshold, item Threshold) bool {
 	for _, input := range inputs {
 		if input.GracePeriod == item.GracePeriod && input.Operator == item.Operator && input.Signal == item.Signal && input.Value.Cmp(*item.Value) == 0 {
diff --git a/pkg/kubelet/eviction/helpers_test.go b/pkg/kubelet/eviction/helpers_test.go
index 93713ddb7f1..b66eab879e7 100644
--- a/pkg/kubelet/eviction/helpers_test.go
+++ b/pkg/kubelet/eviction/helpers_test.go
@@ -585,29 +585,50 @@ func TestMakeSignalObservations(t *testing.T) {
 
 func TestThresholdsMet(t *testing.T) {
 	hardThreshold := Threshold{
-		Signal:   SignalMemoryAvailable,
-		Operator: OpLessThan,
-		Value:    quantityMustParse("1Gi"),
+		Signal:     SignalMemoryAvailable,
+		Operator:   OpLessThan,
+		Value:      quantityMustParse("1Gi"),
+		MinReclaim: quantityMustParse("500Mi"),
 	}
 	testCases := map[string]struct {
-		thresholds   []Threshold
-		observations signalObservations
-		result       []Threshold
+		enforceMinReclaim bool
+		thresholds        []Threshold
+		observations      signalObservations
+		result            []Threshold
 	}{
 		"empty": {
-			thresholds:   []Threshold{},
-			observations: signalObservations{},
-			result:       []Threshold{},
+			enforceMinReclaim: false,
+			thresholds:        []Threshold{},
+			observations:      signalObservations{},
+			result:            []Threshold{},
 		},
 		"threshold-met": {
-			thresholds: []Threshold{hardThreshold},
+			enforceMinReclaim: false,
+			thresholds:        []Threshold{hardThreshold},
 			observations: signalObservations{
 				SignalMemoryAvailable: quantityMustParse("500Mi"),
 			},
 			result: []Threshold{hardThreshold},
 		},
 		"threshold-not-met": {
-			thresholds: []Threshold{hardThreshold},
+			enforceMinReclaim: false,
+			thresholds:        []Threshold{hardThreshold},
+			observations: signalObservations{
+				SignalMemoryAvailable: quantityMustParse("2Gi"),
+			},
+			result: []Threshold{},
+		},
+		"threshold-met-with-min-reclaim": {
+			enforceMinReclaim: true,
+			thresholds:        []Threshold{hardThreshold},
+			observations: signalObservations{
+				SignalMemoryAvailable: quantityMustParse("1.05Gi"),
+			},
+			result: []Threshold{hardThreshold},
+		},
+		"threshold-not-met-with-min-reclaim": {
+			enforceMinReclaim: true,
+			thresholds:        []Threshold{hardThreshold},
 			observations: signalObservations{
 				SignalMemoryAvailable: quantityMustParse("2Gi"),
 			},
@@ -615,7 +636,7 @@ func TestThresholdsMet(t *testing.T) {
 		},
 	}
 	for testName, testCase := range testCases {
-		actual := thresholdsMet(testCase.thresholds, testCase.observations)
+		actual := thresholdsMet(testCase.thresholds, testCase.observations, testCase.enforceMinReclaim)
 		if !thresholdList(actual).Equal(thresholdList(testCase.result)) {
 			t.Errorf("Test case: %s, expected: %v, actual: %v", testName, testCase.result, actual)
 		}
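
The behavioral core of this patch is that thresholdsMet, when called with enforceMinReclaim=true, compares the observed signal against threshold.Value plus threshold.MinReclaim rather than threshold.Value alone, so a previously met threshold stays unresolved until the extra min-reclaim headroom is recovered. Below is a minimal standalone sketch (not part of the patch) illustrating that comparison; the resource import path is the current apimachinery location and is an assumption, and DeepCopy stands in for the older Copy helper used in the patched tree.

package main

import (
	"fmt"

	// assumed import path; the patched tree vendors Quantity under pkg/api/resource
	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	value := resource.MustParse("1Gi")        // eviction threshold (memory.available < 1Gi)
	minReclaim := resource.MustParse("500Mi") // minimum amount to reclaim once evicting
	observed := resource.MustParse("1.2Gi")   // current memory.available observation

	// Plain check: observed (1.2Gi) is above the 1Gi threshold, so it is not met.
	fmt.Println("met without min-reclaim:", value.Cmp(observed) > 0) // false

	// Min-reclaim check: compare against value + minReclaim (1.5Gi), so the
	// previously met threshold remains unresolved until memory.available exceeds 1.5Gi.
	target := value.DeepCopy()
	target.Add(minReclaim)
	fmt.Println("met with min-reclaim:", target.Cmp(observed) > 0) // true
}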