Merge pull request #99095 from maxlaverse/fix_kubelet_stuck_in_diskpressure

Prevent Kubelet from getting stuck in DiskPressure when imagefs minReclaim is set
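For context, the problem only appears when the kubelet has an imagefs eviction threshold configured together with a minimum-reclaim amount. The sketch below shows roughly what such a configuration looks like once it is translated into the `evictionapi` types from `pkg/kubelet/eviction/api`; the quantities and flag values are illustrative, not taken from the PR.

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
)

func main() {
	// Roughly what --eviction-hard=imagefs.available<1Gi combined with
	// --eviction-minimum-reclaim=imagefs.available=2Gi becomes inside the
	// eviction manager's config (values here are only illustrative).
	hard := resource.MustParse("1Gi")
	minReclaim := resource.MustParse("2Gi")

	threshold := evictionapi.Threshold{
		Signal:   evictionapi.SignalImageFsAvailable,
		Operator: evictionapi.OpLessThan,
		Value:    evictionapi.ThresholdValue{Quantity: &hard},
		// MinReclaim asks the kubelet to keep reclaiming until usage has
		// dropped by at least this much past the threshold.
		MinReclaim: &evictionapi.ThresholdValue{Quantity: &minReclaim},
	}
	fmt.Printf("%+v\n", threshold)
}
```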
Commit 8b057cdfa4 by Kubernetes Prow Robot, 2021-04-23 18:23:14 -07:00 (committed by GitHub)
2 changed files with 48 additions and 2 deletions


@@ -437,8 +437,9 @@ func (m *managerImpl) reclaimNodeLevelResources(signalToReclaim evictionapi.Signal
 	observations, _ := makeSignalObservations(summary)
 	debugLogObservations("observations after resource reclaim", observations)
-	// determine the set of thresholds met independent of grace period
-	thresholds := thresholdsMet(m.config.Thresholds, observations, false)
+	// evaluate all thresholds independently of their grace period to see if with
+	// the new observations, we think we have met min reclaim goals
+	thresholds := thresholdsMet(m.config.Thresholds, observations, true)
 	debugLogThresholdsWithObservation("thresholds after resource reclaim - ignoring grace period", thresholds, observations)
 	if len(thresholds) == 0 {
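The substance of the fix is the third argument to `thresholdsMet`. With min reclaim enforced, a threshold only counts as cleared once availability has risen past the threshold value by the configured minimum-reclaim amount, so node-level reclaim (image and container GC) that frees just barely enough to dip under the raw threshold no longer makes the manager report success and stall; it proceeds to pod eviction instead. The following is a simplified sketch of that comparison for an `OpLessThan` signal, not the actual helper in pkg/kubelet/eviction, and the quantities in `main` are illustrative.

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

// thresholdStillMet sketches the per-signal check that
// thresholdsMet performs for an OpLessThan threshold: when
// enforceMinReclaim is true, the effective threshold grows by
// MinReclaim, so reclaiming just past the raw threshold leaves the
// threshold "met" and the eviction manager keeps reclaiming.
func thresholdStillMet(available, threshold, minReclaim resource.Quantity, enforceMinReclaim bool) bool {
	effective := threshold.DeepCopy()
	if enforceMinReclaim {
		effective.Add(minReclaim)
	}
	return available.Cmp(effective) < 0
}

func main() {
	available := resource.MustParse("1100Mi") // what image/container GC managed to free up to
	threshold := resource.MustParse("1Gi")    // illustrative hard eviction threshold
	minReclaim := resource.MustParse("500Mi") // illustrative minimum reclaim

	// Old behaviour (enforceMinReclaim=false): 1100Mi >= 1Gi, so the
	// threshold looks cleared even though min reclaim was not satisfied.
	fmt.Println(thresholdStillMet(available, threshold, minReclaim, false)) // false
	// New behaviour (enforceMinReclaim=true): the effective threshold is
	// 1Gi + 500Mi, so the threshold is still met and reclaim continues.
	fmt.Println(thresholdStillMet(available, threshold, minReclaim, true)) // true
}
```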


@@ -886,6 +886,51 @@ func TestNodeReclaimFuncs(t *testing.T) {
t.Errorf("Manager should not report disk pressure")
}
// synchronize
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should not have disk pressure
if manager.IsUnderDiskPressure() {
t.Errorf("Manager should not report disk pressure")
}
// induce hard threshold
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker(".9Gi", "200Gi", podStats)
// make GC return disk usage below the threshold, but not satisfying minReclaim
diskGC.summaryAfterGC = summaryStatsMaker("1.1Gi", "200Gi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should have disk pressure
if !manager.IsUnderDiskPressure() {
t.Errorf("Manager should report disk pressure since soft threshold was met")
}
// verify image gc was invoked
if !diskGC.imageGCInvoked || !diskGC.containerGCInvoked {
t.Errorf("Manager should have invoked image gc")
}
// verify a pod was killed because image gc was not enough to satisfy minReclaim
if podKiller.pod == nil {
t.Errorf("Manager should have killed a pod, but didn't")
}
// reset state
diskGC.imageGCInvoked = false
diskGC.containerGCInvoked = false
podKiller.pod = nil
// remove disk pressure
fakeClock.Step(20 * time.Minute)
summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should not have disk pressure
if manager.IsUnderDiskPressure() {
t.Errorf("Manager should not report disk pressure")
}
// induce disk pressure!
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("400Mi", "200Gi", podStats)