Merge pull request #44986 from dashpole/fix_image_gc

Automatic merge from submit-queue. Allow Partial Success for ImageGC. Fixes #44951. When the eviction manager is under disk pressure, it first attempts to reclaim disk space by deleting images. However, if there are any errors during the image deletion process, the eviction manager treats that as a failed attempt to delete images--even if some were successfully deleted. This change essentially makes the eviction manager ignore errors during image garbage collection, and instead rely solely on the quantity of resources reclaimed. If image deletion completely fails, for example, then this should still work, as it would return 0 bytes freed. This allows for partial success, because any resources freed are counted, regardless of whether some images fail to be deleted. This does not require any changes to the image manager, as the current behavior is already to return the disk space freed along with any errors. ```release-note Fixes a bug where pods were evicted even after images are successfully deleted. ``` cc @dchen1107 @vishh @kubernetes/kubernetes-release-managers note to reviewers: this is mostly whitespace changes, so it will make more sense in reviewable
2025-07-22 11:21:47 +00:00 · 2017-04-26 20:52:18 -07:00 · 2017-04-26 20:52:18 -07:00 · e885c77ffd
commit e885c77ffd
parent 2e7cc0222d 958e290c8d
2 changed files with 19 additions and 17 deletions
--- a/pkg/kubelet/eviction/eviction_manager.go
+++ b/pkg/kubelet/eviction/eviction_manager.go
@ -380,23 +380,22 @@ func (m *managerImpl) reclaimNodeLevelResources(resourceToReclaim v1.ResourceNam
 	for _, nodeReclaimFunc := range nodeReclaimFuncs {
 		// attempt to reclaim the pressured resource.
 		reclaimed, err := nodeReclaimFunc()
-		if err == nil {
-			// update our local observations based on the amount reported to have been reclaimed.
-			// note: this is optimistic, other things could have been still consuming the pressured resource in the interim.
-			signal := resourceToSignal[resourceToReclaim]
-			value, ok := observations[signal]
-			if !ok {
-				glog.Errorf("eviction manager: unable to find value associated with signal %v", signal)
-				continue
-			}
-			value.available.Add(*reclaimed)
+		if err != nil {
+			glog.Warningf("eviction manager: unexpected error when attempting to reduce %v pressure: %v", resourceToReclaim, err)
+		}
+		// update our local observations based on the amount reported to have been reclaimed.
+		// note: this is optimistic, other things could have been still consuming the pressured resource in the interim.
+		signal := resourceToSignal[resourceToReclaim]
+		value, ok := observations[signal]
+		if !ok {
+			glog.Errorf("eviction manager: unable to find value associated with signal %v", signal)
+			continue
+		}
+		value.available.Add(*reclaimed)

-			// evaluate all current thresholds to see if with adjusted observations, we think we have met min reclaim goals
-			if len(thresholdsMet(m.thresholdsMet, observations, true)) == 0 {
-				return true
-			}
-		} else {
-			glog.Errorf("eviction manager: unexpected error when attempting to reduce %v pressure: %v", resourceToReclaim, err)
+		// evaluate all current thresholds to see if with adjusted observations, we think we have met min reclaim goals
+		if len(thresholdsMet(m.thresholdsMet, observations, true)) == 0 {
+			return true
 		}
 	}
 	return false
--- a/pkg/kubelet/eviction/types.go
+++ b/pkg/kubelet/eviction/types.go
@ -76,7 +76,8 @@ type NodeProvider interface {

 // ImageGC is responsible for performing garbage collection of unused images.
 type ImageGC interface {
-	// DeleteUnusedImages deletes unused images and returns the number of bytes freed, or an error.
+	// DeleteUnusedImages deletes unused images and returns the number of bytes freed, and an error.
+	// This returns the bytes freed even if an error is returned.
 	DeleteUnusedImages() (int64, error)
 }

@ -118,6 +119,8 @@ type thresholdsObservedAt map[evictionapi.Threshold]time.Time
 type nodeConditionsObservedAt map[v1.NodeConditionType]time.Time

 // nodeReclaimFunc is a function that knows how to reclaim a resource from the node without impacting pods.
+// Returns the quantity of resources reclaimed and an error, if applicable.
+// nodeReclaimFunc returns the resources reclaimed even if an error occurs.
 type nodeReclaimFunc func() (*resource.Quantity, error)

 // nodeReclaimFuncs is an ordered list of nodeReclaimFunc