Mirror of https://github.com/k3s-io/kubernetes.git, synced 2025-07-26 05:03:09 +00:00
Merge pull request #99095 from maxlaverse/fix_kubelet_stuck_in_diskpressure

Prevent Kubelet from getting stuck in DiskPressure when imagefs minReclaim is set

Commit: 8b057cdfa4
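For context, the PR title refers to the combination of a hard imagefs eviction threshold with a minimum-reclaim amount. A minimal sketch of that configuration using the kubelet's internal evictionapi types follows; the 1Gi/500Mi quantities are illustrative and not taken from this commit:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
)

func main() {
	// Roughly what --eviction-hard=imagefs.available<1Gi combined with
	// --eviction-minimum-reclaim=imagefs.available=500Mi becomes inside the
	// kubelet (quantities chosen for illustration only).
	hard := resource.MustParse("1Gi")
	minReclaim := resource.MustParse("500Mi")

	threshold := evictionapi.Threshold{
		Signal:     evictionapi.SignalImageFsAvailable,
		Operator:   evictionapi.OpLessThan,
		Value:      evictionapi.ThresholdValue{Quantity: &hard},
		MinReclaim: &evictionapi.ThresholdValue{Quantity: &minReclaim},
	}
	fmt.Printf("%s %s %s (minReclaim %s)\n",
		threshold.Signal, threshold.Operator, threshold.Value.Quantity, threshold.MinReclaim.Quantity)
}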
@@ -437,8 +437,9 @@ func (m *managerImpl) reclaimNodeLevelResources(signalToReclaim evictionapi.Sign
 		observations, _ := makeSignalObservations(summary)
 		debugLogObservations("observations after resource reclaim", observations)
 
-		// determine the set of thresholds met independent of grace period
-		thresholds := thresholdsMet(m.config.Thresholds, observations, false)
+		// evaluate all thresholds independently of their grace period to see if with
+		// the new observations, we think we have met min reclaim goals
+		thresholds := thresholdsMet(m.config.Thresholds, observations, true)
 		debugLogThresholdsWithObservation("thresholds after resource reclaim - ignoring grace period", thresholds, observations)
 
 		if len(thresholds) == 0 {
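The single-word change above (false to true) makes reclaimNodeLevelResources enforce min reclaim when it re-checks the thresholds after image and container GC. Before the change, recovering just past the raw threshold was treated as a successful node-level reclaim, so pod eviction was skipped even though min reclaim had not been satisfied, which is how the kubelet could get stuck in DiskPressure. A simplified, hypothetical re-statement of the comparison (thresholdStillMet and the concrete quantities are mine, not part of this change; "available"-style signals use the less-than operator):

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

// thresholdStillMet is a simplified, hypothetical re-statement of what
// thresholdsMet(..., enforceMinReclaim=true) checks for an "available"-style
// signal: with min reclaim enforced the effective threshold is value+minReclaim,
// so freeing just enough to cross the raw threshold does not count as done.
func thresholdStillMet(available, value, minReclaim resource.Quantity, enforceMinReclaim bool) bool {
	effective := value.DeepCopy()
	if enforceMinReclaim {
		effective.Add(minReclaim)
	}
	return available.Cmp(effective) < 0
}

func main() {
	// Quantities mirror the test hunk below; the 1Gi threshold and 500Mi
	// minReclaim are assumptions about that test's fixture, not shown in this diff.
	afterGC := resource.MustParse("1.1Gi")
	value := resource.MustParse("1Gi")
	minReclaim := resource.MustParse("500Mi")

	fmt.Println(thresholdStillMet(afterGC, value, minReclaim, false)) // false: the old call considered reclaim sufficient
	fmt.Println(thresholdStillMet(afterGC, value, minReclaim, true))  // true: min reclaim not yet satisfied, pods get evicted
}

Run as-is the sketch prints false then true: with min reclaim enforced, the 1.1Gi recovered by GC is still treated as a met threshold, so synchronize falls through to pod eviction instead of returning early.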
@@ -886,6 +886,51 @@ func TestNodeReclaimFuncs(t *testing.T) {
 		t.Errorf("Manager should not report disk pressure")
 	}
 
+	// synchronize
+	manager.synchronize(diskInfoProvider, activePodsFunc)
+
+	// we should not have disk pressure
+	if manager.IsUnderDiskPressure() {
+		t.Errorf("Manager should not report disk pressure")
+	}
+
+	// induce hard threshold
+	fakeClock.Step(1 * time.Minute)
+	summaryProvider.result = summaryStatsMaker(".9Gi", "200Gi", podStats)
+	// make GC return disk usage below the threshold, but not satisfying minReclaim
+	diskGC.summaryAfterGC = summaryStatsMaker("1.1Gi", "200Gi", podStats)
+	manager.synchronize(diskInfoProvider, activePodsFunc)
+
+	// we should have disk pressure
+	if !manager.IsUnderDiskPressure() {
+		t.Errorf("Manager should report disk pressure since soft threshold was met")
+	}
+
+	// verify image gc was invoked
+	if !diskGC.imageGCInvoked || !diskGC.containerGCInvoked {
+		t.Errorf("Manager should have invoked image gc")
+	}
+
+	// verify a pod was killed because image gc was not enough to satisfy minReclaim
+	if podKiller.pod == nil {
+		t.Errorf("Manager should have killed a pod, but didn't")
+	}
+
+	// reset state
+	diskGC.imageGCInvoked = false
+	diskGC.containerGCInvoked = false
+	podKiller.pod = nil
+
+	// remove disk pressure
+	fakeClock.Step(20 * time.Minute)
+	summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
+	manager.synchronize(diskInfoProvider, activePodsFunc)
+
+	// we should not have disk pressure
+	if manager.IsUnderDiskPressure() {
+		t.Errorf("Manager should not report disk pressure")
+	}
+
 	// induce disk pressure!
 	fakeClock.Step(1 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("400Mi", "200Gi", podStats)
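The added assertions extend the existing sequence of synchronize calls in TestNodeReclaimFuncs: pressure is induced at .9Gi free, the fake GC recovers only to 1.1Gi, and the manager is expected to keep reporting disk pressure, invoke image and container GC, and kill a pod because the reclaim was not enough to satisfy minReclaim; once 16Gi is free the pressure clears. Assuming the usual kubernetes repo layout (the eviction manager lives under pkg/kubelet/eviction), the scenario can be run on its own with:

go test ./pkg/kubelet/eviction/ -run TestNodeReclaimFuncs -v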