kubelet eviction manager support for min-reclaim

derekwaynecarr 2016-08-02 22:58:48 -04:00
parent 8ab06a3f86
commit 611c127f0d
4 changed files with 223 additions and 17 deletions
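
For context: with this change, a threshold that specifies MinReclaim stays in effect after it first fires until the observed signal has recovered past the threshold value by at least the min-reclaim amount, so the eviction manager keeps reclaiming instead of stopping the moment the signal crosses back over the raw threshold. The snippet below is an illustrative sketch only, using the Threshold fields and the quantityMustParse test helper that appear in this diff:

threshold := Threshold{
	Signal:     SignalMemoryAvailable,
	Operator:   OpLessThan,
	Value:      quantityMustParse("1Gi"),
	MinReclaim: quantityMustParse("500Mi"),
}
// Eviction starts when memory.available drops below 1Gi and, with this commit,
// continues until at least 1.5Gi (Value + MinReclaim) is available again.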


@@ -54,6 +54,8 @@ type managerImpl struct {
summaryProvider stats.SummaryProvider
// records when a threshold was first observed
thresholdsFirstObservedAt thresholdsObservedAt
// records the set of thresholds that have been met (including grace period) but not yet resolved
thresholdsMet []Threshold
// resourceToRankFunc maps a resource to ranking function for that resource.
resourceToRankFunc map[api.ResourceName]rankFunc
}
@@ -158,7 +160,13 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
now := m.clock.Now()
// determine the set of thresholds met independent of grace period
thresholds = thresholdsMet(thresholds, observations)
thresholds = thresholdsMet(thresholds, observations, false)
// determine the set of thresholds previously met that have not yet satisfied the associated min-reclaim
if len(m.thresholdsMet) > 0 {
thresholdsNotYetResolved := thresholdsMet(m.thresholdsMet, observations, true)
thresholds = mergeThresholds(thresholds, thresholdsNotYetResolved)
}
// track when a threshold was first observed
thresholdsFirstObservedAt := thresholdsFirstObservedAt(thresholds, m.thresholdsFirstObservedAt, now)
@@ -180,6 +188,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
m.nodeConditions = nodeConditions
m.thresholdsFirstObservedAt = thresholdsFirstObservedAt
m.nodeConditionsLastObservedAt = nodeConditionsLastObservedAt
m.thresholdsMet = thresholds
m.Unlock()
// determine the set of resources under starvation

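The synchronize() change above reads as: compute the thresholds freshly met this pass, merge in any previously met thresholds that have not yet recovered past value plus min-reclaim, and remember the merged set for the next pass. The following standalone sketch (simplified to plain byte counts, not the real Threshold and signalObservations types) walks the same sequence of availability values used by TestMinReclaim further down:

// Simplified carry-forward sketch: a previously met threshold stays active
// until availability recovers past value + min-reclaim.
package main

import "fmt"

const mi = int64(1) << 20

func main() {
	value := 1024 * mi     // threshold: memory.available < 1Gi
	minReclaim := 500 * mi // min-reclaim: 500Mi
	previouslyMet := false // stands in for m.thresholdsMet

	for _, available := range []int64{2048 * mi, 500 * mi, 1229 * mi, 2048 * mi} {
		freshlyMet := available < value
		unresolved := previouslyMet && available < value+minReclaim
		met := freshlyMet || unresolved // mergeThresholds() in the real code
		fmt.Printf("available=%4dMi met=%v\n", available/mi, met)
		previouslyMet = met
	}
	// prints false, true, true, false: at ~1.2Gi the threshold is still met
	// because not enough has been reclaimed, matching the eviction decisions
	// in TestMinReclaim.
}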

@@ -514,3 +514,161 @@ func TestDiskPressureNodeFs(t *testing.T) {
t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit)
}
}
// TestMinReclaim verifies that min-reclaim works as desired.
func TestMinReclaim(t *testing.T) {
podMaker := func(name string, requests api.ResourceList, limits api.ResourceList, memoryWorkingSet string) (*api.Pod, statsapi.PodStats) {
pod := newPod(name, []api.Container{
newContainer(name, requests, limits),
}, nil)
podStats := newPodMemoryStats(pod, resource.MustParse(memoryWorkingSet))
return pod, podStats
}
summaryStatsMaker := func(nodeAvailableBytes string, podStats map[*api.Pod]statsapi.PodStats) *statsapi.Summary {
val := resource.MustParse(nodeAvailableBytes)
availableBytes := uint64(val.Value())
result := &statsapi.Summary{
Node: statsapi.NodeStats{
Memory: &statsapi.MemoryStats{
AvailableBytes: &availableBytes,
},
},
Pods: []statsapi.PodStats{},
}
for _, podStat := range podStats {
result.Pods = append(result.Pods, podStat)
}
return result
}
podsToMake := []struct {
name string
requests api.ResourceList
limits api.ResourceList
memoryWorkingSet string
}{
{name: "best-effort-high", requests: newResourceList("", ""), limits: newResourceList("", ""), memoryWorkingSet: "500Mi"},
{name: "best-effort-low", requests: newResourceList("", ""), limits: newResourceList("", ""), memoryWorkingSet: "300Mi"},
{name: "burstable-high", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi"), memoryWorkingSet: "800Mi"},
{name: "burstable-low", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi"), memoryWorkingSet: "300Mi"},
{name: "guaranteed-high", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), memoryWorkingSet: "800Mi"},
{name: "guaranteed-low", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), memoryWorkingSet: "200Mi"},
}
pods := []*api.Pod{}
podStats := map[*api.Pod]statsapi.PodStats{}
for _, podToMake := range podsToMake {
pod, podStat := podMaker(podToMake.name, podToMake.requests, podToMake.limits, podToMake.memoryWorkingSet)
pods = append(pods, pod)
podStats[pod] = podStat
}
activePodsFunc := func() []*api.Pod {
return pods
}
fakeClock := clock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeRef := &api.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
MaxPodGracePeriodSeconds: 5,
PressureTransitionPeriod: time.Minute * 5,
Thresholds: []Threshold{
{
Signal: SignalMemoryAvailable,
Operator: OpLessThan,
Value: quantityMustParse("1Gi"),
MinReclaim: quantityMustParse("500Mi"),
},
},
}
summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("2Gi", podStats)}
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
}
// synchronize
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should not have memory pressure
if manager.IsUnderMemoryPressure() {
t.Errorf("Manager should not report memory pressure")
}
// induce memory pressure!
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("500Mi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should have memory pressure
if !manager.IsUnderMemoryPressure() {
t.Errorf("Manager should report memory pressure")
}
// check the right pod was killed
if podKiller.pod != pods[0] {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, pods[0])
}
observedGracePeriod := *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
}
// reduce memory pressure, but not enough to satisfy the min-reclaim amount
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("1.2Gi", podStats)
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should still have memory pressure (because the min-reclaim amount has not yet been reclaimed)
if !manager.IsUnderMemoryPressure() {
t.Errorf("Manager should report memory pressure")
}
// check the right pod was killed
if podKiller.pod != pods[0] {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, pods[0])
}
observedGracePeriod = *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
}
// reduce memory pressure past the min-reclaim amount
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("2Gi", podStats)
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should have memory pressure (because transition period not yet met)
if !manager.IsUnderMemoryPressure() {
t.Errorf("Manager should report memory pressure")
}
// no pod should have been killed
if podKiller.pod != nil {
t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod)
}
// move the clock past transition period to ensure that we stop reporting pressure
fakeClock.Step(5 * time.Minute)
summaryProvider.result = summaryStatsMaker("2Gi", podStats)
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should not have memory pressure (because transition period met)
if manager.IsUnderMemoryPressure() {
t.Errorf("Manager should not report memory pressure")
}
// no pod should have been killed
if podKiller.pod != nil {
t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod)
}
}


@@ -534,7 +534,7 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObserv
}
// thresholdsMet returns the set of thresholds that were met independent of grace period
func thresholdsMet(thresholds []Threshold, observations signalObservations) []Threshold {
func thresholdsMet(thresholds []Threshold, observations signalObservations, enforceMinReclaim bool) []Threshold {
results := []Threshold{}
for i := range thresholds {
threshold := thresholds[i]
@@ -545,7 +545,12 @@ func thresholdsMet(thresholds []Threshold, observations signalObservations) []Th
}
// determine if we have met the specified threshold
thresholdMet := false
thresholdResult := threshold.Value.Cmp(*observed)
quantity := threshold.Value.Copy()
// if enforceMinReclaim is specified, we compare relative to value + min-reclaim
if enforceMinReclaim && threshold.MinReclaim != nil {
quantity.Add(*threshold.MinReclaim)
}
thresholdResult := quantity.Cmp(*observed)
switch threshold.Operator {
case OpLessThan:
thresholdMet = thresholdResult > 0
@@ -589,7 +594,9 @@ func nodeConditions(thresholds []Threshold) []api.NodeConditionType {
results := []api.NodeConditionType{}
for _, threshold := range thresholds {
if nodeCondition, found := signalToNodeCondition[threshold.Signal]; found {
results = append(results, nodeCondition)
if !hasNodeCondition(results, nodeCondition) {
results = append(results, nodeCondition)
}
}
}
return results
@@ -644,7 +651,18 @@ func hasNodeCondition(inputs []api.NodeConditionType, item api.NodeConditionType
return false
}
// hasThreshold returns true if the node condition is in the input list
// mergeThresholds will merge both threshold lists eliminating duplicates.
func mergeThresholds(inputsA []Threshold, inputsB []Threshold) []Threshold {
results := inputsA
for _, threshold := range inputsB {
if !hasThreshold(results, threshold) {
results = append(results, threshold)
}
}
return results
}
// hasThreshold returns true if the threshold is in the input list
func hasThreshold(inputs []Threshold, item Threshold) bool {
for _, input := range inputs {
if input.GracePeriod == item.GracePeriod && input.Operator == item.Operator && input.Signal == item.Signal && input.Value.Cmp(*item.Value) == 0 {

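In the thresholdsMet() change above, the enforceMinReclaim path simply shifts the comparison point from Value to Value + MinReclaim before calling Cmp. A minimal sketch of that arithmetic, assuming the resource.Quantity API this package already relies on (imported here from its current apimachinery location; at the time of this commit the package lived under pkg/api/resource):

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// With OpLessThan, a 1Gi threshold with 500Mi MinReclaim stays met
	// until the observed value reaches 1.5Gi.
	quantity := resource.MustParse("1Gi")
	quantity.Add(resource.MustParse("500Mi"))

	for _, s := range []string{"1.05Gi", "1.2Gi", "2Gi"} {
		observed := resource.MustParse(s)
		stillMet := quantity.Cmp(observed) > 0 // quantity > observed
		fmt.Printf("observed=%s stillMet=%v\n", s, stillMet)
	}
	// prints true, true, false; the 1.05Gi and 2Gi cases mirror the
	// min-reclaim test cases in TestThresholdsMet below.
}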

@@ -585,29 +585,50 @@ func TestMakeSignalObservations(t *testing.T) {
func TestThresholdsMet(t *testing.T) {
hardThreshold := Threshold{
Signal: SignalMemoryAvailable,
Operator: OpLessThan,
Value: quantityMustParse("1Gi"),
Signal: SignalMemoryAvailable,
Operator: OpLessThan,
Value: quantityMustParse("1Gi"),
MinReclaim: quantityMustParse("500Mi"),
}
testCases := map[string]struct {
thresholds []Threshold
observations signalObservations
result []Threshold
enforceMinReclaim bool
thresholds []Threshold
observations signalObservations
result []Threshold
}{
"empty": {
thresholds: []Threshold{},
observations: signalObservations{},
result: []Threshold{},
enforceMinReclaim: false,
thresholds: []Threshold{},
observations: signalObservations{},
result: []Threshold{},
},
"threshold-met": {
thresholds: []Threshold{hardThreshold},
enforceMinReclaim: false,
thresholds: []Threshold{hardThreshold},
observations: signalObservations{
SignalMemoryAvailable: quantityMustParse("500Mi"),
},
result: []Threshold{hardThreshold},
},
"threshold-not-met": {
thresholds: []Threshold{hardThreshold},
enforceMinReclaim: false,
thresholds: []Threshold{hardThreshold},
observations: signalObservations{
SignalMemoryAvailable: quantityMustParse("2Gi"),
},
result: []Threshold{},
},
"threshold-met-with-min-reclaim": {
enforceMinReclaim: true,
thresholds: []Threshold{hardThreshold},
observations: signalObservations{
SignalMemoryAvailable: quantityMustParse("1.05Gi"),
},
result: []Threshold{hardThreshold},
},
"threshold-not-met-with-min-reclaim": {
enforceMinReclaim: true,
thresholds: []Threshold{hardThreshold},
observations: signalObservations{
SignalMemoryAvailable: quantityMustParse("2Gi"),
},
@@ -615,7 +636,7 @@ func TestThresholdsMet(t *testing.T) {
},
}
for testName, testCase := range testCases {
actual := thresholdsMet(testCase.thresholds, testCase.observations)
actual := thresholdsMet(testCase.thresholds, testCase.observations, testCase.enforceMinReclaim)
if !thresholdList(actual).Equal(thresholdList(testCase.result)) {
t.Errorf("Test case: %s, expected: %v, actual: %v", testName, testCase.result, actual)
}