Merge pull request #25772 from derekwaynecarr/eviction-max-grace

Automatic merge from submit-queue

Add support for limiting grace period during soft eviction

Adds eviction manager support in the kubelet for capping a pod's graceful termination period when a soft eviction threshold is met.

```release-note
Kubelet evicts pods when available memory falls below configured eviction thresholds
```
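
For orientation, the heart of the change is that the grace period handed to the pod killer now depends on whether the starved resource crossed only soft thresholds. A minimal, self-contained sketch of that decision (the function here is an illustrative stand-in, not the kubelet's actual code):

```go
package main

import "fmt"

// gracePeriodOverrideFor sketches the decision this PR adds: when only soft
// thresholds were crossed, the kill uses the configured cap; otherwise the
// pod is killed immediately. (Illustrative stand-in, not the kubelet's code.)
func gracePeriodOverrideFor(softEviction bool, maxPodGracePeriodSeconds int64) int64 {
	if softEviction {
		return maxPodGracePeriodSeconds
	}
	return 0
}

func main() {
	fmt.Println(gracePeriodOverrideFor(true, 30))  // soft eviction: up to 30s of graceful shutdown
	fmt.Println(gracePeriodOverrideFor(false, 30)) // hard eviction: immediate kill
}
```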

/cc @vishh
k8s-merge-robot 2016-05-21 12:43:45 -07:00
commit 4bb085c927
5 changed files with 84 additions and 2 deletions


@@ -191,6 +191,7 @@ func UnsecuredKubeletConfig(s *options.KubeletServer) (*KubeletConfig, error) {
}
evictionConfig := eviction.Config{
PressureTransitionPeriod: s.EvictionPressureTransitionPeriod.Duration,
MaxPodGracePeriodSeconds: int64(s.EvictionMaxPodGracePeriod),
Thresholds: thresholds,
}


@@ -551,3 +551,16 @@ func reclaimResources(thresholds []Threshold) []api.ResourceName {
}
return results
}
// isSoftEviction returns true if the thresholds met for the starved resource are only soft thresholds
func isSoftEviction(thresholds []Threshold, starvedResource api.ResourceName) bool {
for _, threshold := range thresholds {
if resourceToCheck := signalToResource[threshold.Signal]; resourceToCheck != starvedResource {
continue
}
if threshold.GracePeriod == time.Duration(0) {
return false
}
}
return true
}
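
A hedged usage sketch of the semantics: only thresholds whose signal maps to the starved resource are consulted, and a single hard (zero grace period) threshold among them is enough to make the eviction hard. The stub types and the contents of signalToResource below are illustrative stand-ins, not the package's actual definitions:

```go
package main

import (
	"fmt"
	"time"
)

// Minimal stand-ins for the eviction package's types (illustrative only).
type signal string
type resourceName string

type thresholdStub struct {
	signal      signal
	gracePeriod time.Duration
}

// Assumed mapping from pressure signals to the resource they starve.
var signalToResource = map[signal]resourceName{
	"memory.available": "memory",
}

// isSoftEviction mirrors the function added above: it returns false as soon
// as any threshold for the starved resource has no grace period.
func isSoftEviction(thresholds []thresholdStub, starved resourceName) bool {
	for _, t := range thresholds {
		if signalToResource[t.signal] != starved {
			continue
		}
		if t.gracePeriod == 0 {
			return false
		}
	}
	return true
}

func main() {
	soft := thresholdStub{signal: "memory.available", gracePeriod: 2 * time.Minute}
	hard := thresholdStub{signal: "memory.available"}

	fmt.Println(isSoftEviction([]thresholdStub{soft}, "memory"))       // true: only soft thresholds matched
	fmt.Println(isSoftEviction([]thresholdStub{soft, hard}, "memory")) // false: a hard threshold matched
	fmt.Println(isSoftEviction([]thresholdStub{hard}, "unrelated"))    // true: no thresholds match the resource
}
```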


@@ -158,7 +158,7 @@ func (m *managerImpl) synchronize(podFunc ActivePodsFunc) {
// determine the set of resources under starvation
starvedResources := reclaimResources(thresholds)
if len(starvedResources) == 0 {
glog.V(3).Infof("eviction manager: no resources are starved")
return
}
@@ -167,6 +167,9 @@ func (m *managerImpl) synchronize(podFunc ActivePodsFunc) {
resourceToReclaim := starvedResources[0]
glog.Warningf("eviction manager: attempting to reclaim %v", resourceToReclaim)
// determine if this is a soft or hard eviction associated with the resource
softEviction := isSoftEviction(thresholds, resourceToReclaim)
// record an event about the resources we are now attempting to reclaim via eviction
m.recorder.Eventf(m.nodeRef, api.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim)
@@ -199,8 +202,10 @@ func (m *managerImpl) synchronize(podFunc ActivePodsFunc) {
}
// record that we are evicting the pod
m.recorder.Eventf(pod, api.EventTypeWarning, reason, message)
gracePeriodOverride := int64(0)
if softEviction {
gracePeriodOverride = m.config.MaxPodGracePeriodSeconds
}
// this is a blocking call and should only return when the pod and its containers are killed.
err := m.killPodFunc(pod, status, &gracePeriodOverride)
if err != nil {


@@ -98,6 +98,7 @@ func TestMemoryPressure(t *testing.T) {
nodeRef := &api.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
MaxPodGracePeriodSeconds: 5,
PressureTransitionPeriod: time.Minute * 5,
Thresholds: []Threshold{
{
@@ -105,6 +106,12 @@ func TestMemoryPressure(t *testing.T) {
Operator: OpLessThan,
Value: quantityMustParse("1Gi"),
},
{
Signal: SignalMemoryAvailable,
Operator: OpLessThan,
Value: quantityMustParse("2Gi"),
GracePeriod: time.Minute * 2,
},
},
}
summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("2Gi", podStats)}
@@ -139,6 +146,56 @@ func TestMemoryPressure(t *testing.T) {
}
}
// induce soft threshold
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("1500Mi", podStats)
manager.synchronize(activePodsFunc)
// we should have memory pressure
if !manager.IsUnderMemoryPressure() {
t.Errorf("Manager should report memory pressure since soft threshold was met")
}
// verify no pod was killed yet, because the soft threshold's grace period has not yet elapsed.
if podKiller.pod != nil {
t.Errorf("Manager should not have killed a pod yet, but killed: %v", podKiller.pod)
}
// step forward in time past the grace period
fakeClock.Step(3 * time.Minute)
summaryProvider.result = summaryStatsMaker("1500Mi", podStats)
manager.synchronize(activePodsFunc)
// we should have memory pressure
if !manager.IsUnderMemoryPressure() {
t.Errorf("Manager should report memory pressure since soft threshold was met")
}
// verify the right pod was killed with the right grace period.
if podKiller.pod != pods[0] {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, pods[0])
}
if podKiller.gracePeriodOverride == nil {
t.Errorf("Manager chose to kill pod but should have had a grace period override.")
}
observedGracePeriod := *podKiller.gracePeriodOverride
if observedGracePeriod != manager.config.MaxPodGracePeriodSeconds {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", manager.config.MaxPodGracePeriodSeconds, observedGracePeriod)
}
// reset state
podKiller.pod = nil
podKiller.gracePeriodOverride = nil
// remove memory pressure
fakeClock.Step(20 * time.Minute)
summaryProvider.result = summaryStatsMaker("3Gi", podStats)
manager.synchronize(activePodsFunc)
// we should not have memory pressure
if manager.IsUnderMemoryPressure() {
t.Errorf("Manager should not report memory pressure")
}
// induce memory pressure!
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("500Mi", podStats)
@@ -153,6 +210,10 @@ func TestMemoryPressure(t *testing.T) {
if podKiller.pod != pods[0] {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, pods[0])
}
observedGracePeriod = *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
}
// the best-effort pod should not be admitted, the burstable pod should
expected = []bool{false, true}


@@ -44,6 +44,8 @@ const (
type Config struct {
// PressureTransitionPeriod is the duration the kubelet has to wait before transitioning out of a pressure condition.
PressureTransitionPeriod time.Duration
// MaxPodGracePeriodSeconds is the maximum allowed grace period (in seconds) to use when terminating pods in response to a soft eviction threshold being met.
MaxPodGracePeriodSeconds int64
// Thresholds define the set of conditions monitored to trigger eviction.
Thresholds []Threshold
}
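
Pulling the pieces together, a hedged sketch of a Config that combines a hard and a soft memory threshold, modeled on the values in the test above. This is not a standalone program: it assumes it sits alongside the eviction package and its quantityMustParse test helper, and the 30-second cap is an arbitrary illustrative value.

```go
// exampleConfig is a hypothetical helper mirroring TestMemoryPressure above;
// it reuses the package's exported identifiers and the quantityMustParse
// test helper shown earlier in this diff.
func exampleConfig() Config {
	return Config{
		// Wait five minutes before transitioning out of a pressure condition.
		PressureTransitionPeriod: 5 * time.Minute,
		// Cap graceful shutdown at 30 seconds when a soft threshold triggers eviction.
		MaxPodGracePeriodSeconds: 30,
		Thresholds: []Threshold{
			// Hard threshold: under 1Gi of available memory, evict immediately.
			{Signal: SignalMemoryAvailable, Operator: OpLessThan, Value: quantityMustParse("1Gi")},
			// Soft threshold: under 2Gi for 2 minutes, evict with up to the capped grace period.
			{Signal: SignalMemoryAvailable, Operator: OpLessThan, Value: quantityMustParse("2Gi"), GracePeriod: 2 * time.Minute},
		},
	}
}
```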