mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-22 11:21:47 +00:00
Merge pull request #44986 from dashpole/fix_image_gc
Automatic merge from submit-queue Allow Partial Success for ImageGC Fixes #44951. When the eviction manager is under disk pressure, it first attempts to reclaim disk space by deleting images. However, if there are any errors during the image deletion process, the eviction manager treats that as a failed attempt delete images--even if some were successfully deleted. This change essentially makes the eviction manager ignore errors during image garbage collection, and instead rely solely on the quantity of resources reclaimed. If image deletion completely fails, for example, then this should still work as it would return 0 bytes freed. This allows for partial success, because any resources freed are counted, regardless of if some images fail to be deleted, for example. This does not require any changes to the image manager, as the current behavior is already to return the disk space freed along with any errors. ```release-note Fixes a bug where pods were evicted even after images are successfully deleted. ``` cc @dchen1107 @vishh @kubernetes/kubernetes-release-managers note to reviewers: this is mostly whitespace changes, so it will make more sense in reviewable
This commit is contained in:
commit
e885c77ffd
@ -380,23 +380,22 @@ func (m *managerImpl) reclaimNodeLevelResources(resourceToReclaim v1.ResourceNam
|
||||
for _, nodeReclaimFunc := range nodeReclaimFuncs {
|
||||
// attempt to reclaim the pressured resource.
|
||||
reclaimed, err := nodeReclaimFunc()
|
||||
if err == nil {
|
||||
// update our local observations based on the amount reported to have been reclaimed.
|
||||
// note: this is optimistic, other things could have been still consuming the pressured resource in the interim.
|
||||
signal := resourceToSignal[resourceToReclaim]
|
||||
value, ok := observations[signal]
|
||||
if !ok {
|
||||
glog.Errorf("eviction manager: unable to find value associated with signal %v", signal)
|
||||
continue
|
||||
}
|
||||
value.available.Add(*reclaimed)
|
||||
if err != nil {
|
||||
glog.Warningf("eviction manager: unexpected error when attempting to reduce %v pressure: %v", resourceToReclaim, err)
|
||||
}
|
||||
// update our local observations based on the amount reported to have been reclaimed.
|
||||
// note: this is optimistic, other things could have been still consuming the pressured resource in the interim.
|
||||
signal := resourceToSignal[resourceToReclaim]
|
||||
value, ok := observations[signal]
|
||||
if !ok {
|
||||
glog.Errorf("eviction manager: unable to find value associated with signal %v", signal)
|
||||
continue
|
||||
}
|
||||
value.available.Add(*reclaimed)
|
||||
|
||||
// evaluate all current thresholds to see if with adjusted observations, we think we have met min reclaim goals
|
||||
if len(thresholdsMet(m.thresholdsMet, observations, true)) == 0 {
|
||||
return true
|
||||
}
|
||||
} else {
|
||||
glog.Errorf("eviction manager: unexpected error when attempting to reduce %v pressure: %v", resourceToReclaim, err)
|
||||
// evaluate all current thresholds to see if with adjusted observations, we think we have met min reclaim goals
|
||||
if len(thresholdsMet(m.thresholdsMet, observations, true)) == 0 {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
|
@ -76,7 +76,8 @@ type NodeProvider interface {
|
||||
|
||||
// ImageGC is responsible for performing garbage collection of unused images.
|
||||
type ImageGC interface {
|
||||
// DeleteUnusedImages deletes unused images and returns the number of bytes freed, or an error.
|
||||
// DeleteUnusedImages deletes unused images and returns the number of bytes freed, and an error.
|
||||
// This returns the bytes freed even if an error is returned.
|
||||
DeleteUnusedImages() (int64, error)
|
||||
}
|
||||
|
||||
@ -118,6 +119,8 @@ type thresholdsObservedAt map[evictionapi.Threshold]time.Time
|
||||
type nodeConditionsObservedAt map[v1.NodeConditionType]time.Time
|
||||
|
||||
// nodeReclaimFunc is a function that knows how to reclaim a resource from the node without impacting pods.
|
||||
// Returns the quantity of resources reclaimed and an error, if applicable.
|
||||
// nodeReclaimFunc return the resources reclaimed even if an error occurs.
|
||||
type nodeReclaimFunc func() (*resource.Quantity, error)
|
||||
|
||||
// nodeReclaimFuncs is an ordered list of nodeReclaimFunc
|
||||
|
Loading…
Reference in New Issue
Block a user