Mirror of https://github.com/k3s-io/kubernetes.git, synced 2025-07-26 05:03:09 +00:00
Merge pull request #99095 from maxlaverse/fix_kubelet_stuck_in_diskpressure

Prevent Kubelet from getting stuck in DiskPressure when imagefs minReclaim is set

Commit: 8b057cdfa4
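For context, the PR title refers to the combination of a hard imagefs eviction threshold with a minimum-reclaim amount. A minimal sketch of that configuration using the kubelet's internal evictionapi types follows; the 1Gi/500Mi quantities are illustrative and not taken from this commit:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
)

func main() {
	// Roughly what --eviction-hard=imagefs.available<1Gi combined with
	// --eviction-minimum-reclaim=imagefs.available=500Mi becomes inside the
	// kubelet (quantities chosen for illustration only).
	hard := resource.MustParse("1Gi")
	minReclaim := resource.MustParse("500Mi")

	threshold := evictionapi.Threshold{
		Signal:     evictionapi.SignalImageFsAvailable,
		Operator:   evictionapi.OpLessThan,
		Value:      evictionapi.ThresholdValue{Quantity: &hard},
		MinReclaim: &evictionapi.ThresholdValue{Quantity: &minReclaim},
	}
	fmt.Printf("%s %s %s (minReclaim %s)\n",
		threshold.Signal, threshold.Operator, threshold.Value.Quantity, threshold.MinReclaim.Quantity)
}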
@@ -437,8 +437,9 @@ func (m *managerImpl) reclaimNodeLevelResources(signalToReclaim evictionapi.Sign
 		observations, _ := makeSignalObservations(summary)
 		debugLogObservations("observations after resource reclaim", observations)
 
-		// determine the set of thresholds met independent of grace period
-		thresholds := thresholdsMet(m.config.Thresholds, observations, false)
+		// evaluate all thresholds independently of their grace period to see if with
+		// the new observations, we think we have met min reclaim goals
+		thresholds := thresholdsMet(m.config.Thresholds, observations, true)
 		debugLogThresholdsWithObservation("thresholds after resource reclaim - ignoring grace period", thresholds, observations)
 
 		if len(thresholds) == 0 {
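The single-word change above (false to true) makes reclaimNodeLevelResources enforce min reclaim when it re-checks the thresholds after image and container GC. Before the change, recovering just past the raw threshold was treated as a successful node-level reclaim, so pod eviction was skipped even though min reclaim had not been satisfied, which is how the kubelet could get stuck in DiskPressure. A simplified, hypothetical re-statement of the comparison (thresholdStillMet and the concrete quantities are mine, not part of this change; "available"-style signals use the less-than operator):

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

// thresholdStillMet is a simplified, hypothetical re-statement of what
// thresholdsMet(..., enforceMinReclaim=true) checks for an "available"-style
// signal: with min reclaim enforced the effective threshold is value+minReclaim,
// so freeing just enough to cross the raw threshold does not count as done.
func thresholdStillMet(available, value, minReclaim resource.Quantity, enforceMinReclaim bool) bool {
	effective := value.DeepCopy()
	if enforceMinReclaim {
		effective.Add(minReclaim)
	}
	return available.Cmp(effective) < 0
}

func main() {
	// Quantities mirror the test hunk below; the 1Gi threshold and 500Mi
	// minReclaim are assumptions about that test's fixture, not shown in this diff.
	afterGC := resource.MustParse("1.1Gi")
	value := resource.MustParse("1Gi")
	minReclaim := resource.MustParse("500Mi")

	fmt.Println(thresholdStillMet(afterGC, value, minReclaim, false)) // false: the old call considered reclaim sufficient
	fmt.Println(thresholdStillMet(afterGC, value, minReclaim, true))  // true: min reclaim not yet satisfied, pods get evicted
}

Run as-is the sketch prints false then true: with min reclaim enforced, the 1.1Gi recovered by GC is still treated as a met threshold, so synchronize falls through to pod eviction instead of returning early.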
@@ -886,6 +886,51 @@ func TestNodeReclaimFuncs(t *testing.T) {
 		t.Errorf("Manager should not report disk pressure")
 	}
 
+	// synchronize
+	manager.synchronize(diskInfoProvider, activePodsFunc)
+
+	// we should not have disk pressure
+	if manager.IsUnderDiskPressure() {
+		t.Errorf("Manager should not report disk pressure")
+	}
+
+	// induce hard threshold
+	fakeClock.Step(1 * time.Minute)
+	summaryProvider.result = summaryStatsMaker(".9Gi", "200Gi", podStats)
+	// make GC return disk usage below the threshold, but not satisfying minReclaim
+	diskGC.summaryAfterGC = summaryStatsMaker("1.1Gi", "200Gi", podStats)
+	manager.synchronize(diskInfoProvider, activePodsFunc)
+
+	// we should have disk pressure
+	if !manager.IsUnderDiskPressure() {
+		t.Errorf("Manager should report disk pressure since soft threshold was met")
+	}
+
+	// verify image gc was invoked
+	if !diskGC.imageGCInvoked || !diskGC.containerGCInvoked {
+		t.Errorf("Manager should have invoked image gc")
+	}
+
+	// verify a pod was killed because image gc was not enough to satisfy minReclaim
+	if podKiller.pod == nil {
+		t.Errorf("Manager should have killed a pod, but didn't")
+	}
+
+	// reset state
+	diskGC.imageGCInvoked = false
+	diskGC.containerGCInvoked = false
+	podKiller.pod = nil
+
+	// remove disk pressure
+	fakeClock.Step(20 * time.Minute)
+	summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
+	manager.synchronize(diskInfoProvider, activePodsFunc)
+
+	// we should not have disk pressure
+	if manager.IsUnderDiskPressure() {
+		t.Errorf("Manager should not report disk pressure")
+	}
+
 	// induce disk pressure!
 	fakeClock.Step(1 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("400Mi", "200Gi", podStats)
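The added assertions extend the existing sequence of synchronize calls in TestNodeReclaimFuncs: pressure is induced at .9Gi free, the fake GC recovers only to 1.1Gi, and the manager is expected to keep reporting disk pressure, invoke image and container GC, and kill a pod because the reclaim was not enough to satisfy minReclaim; once 16Gi is free the pressure clears. Assuming the usual kubernetes repo layout (the eviction manager lives under pkg/kubelet/eviction), the scenario can be run on its own with:

go test ./pkg/kubelet/eviction/ -run TestNodeReclaimFuncs -v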