Merge pull request #44986 from dashpole/fix_image_gc

Automatic merge from submit-queue

Allow Partial Success for ImageGC

Fixes #44951.  When the eviction manager is under disk pressure, it first attempts to reclaim disk space by deleting images.  However, if there are any errors during the image deletion process, the eviction manager treats that as a failed attempt delete images--even if some were successfully deleted.

This change essentially makes the eviction manager ignore errors during image garbage collection, and instead rely solely on the quantity of resources reclaimed.  If image deletion completely fails, for example, then this should still work as it would return 0 bytes freed.  This allows for partial success, because any resources freed are counted, regardless of if some images fail to be deleted, for example.

This does not require any changes to the image manager, as the current behavior is already to return the disk space freed along with any errors.

```release-note
Fixes a bug where pods were evicted even after images are successfully deleted.
```

cc @dchen1107 @vishh @kubernetes/kubernetes-release-managers

note to reviewers: this is mostly whitespace changes, so it will make more sense in reviewable
This commit is contained in:
Kubernetes Submit Queue 2017-04-26 20:52:18 -07:00 committed by GitHub
commit e885c77ffd
2 changed files with 19 additions and 17 deletions

View File

@ -380,23 +380,22 @@ func (m *managerImpl) reclaimNodeLevelResources(resourceToReclaim v1.ResourceNam
for _, nodeReclaimFunc := range nodeReclaimFuncs {
// attempt to reclaim the pressured resource.
reclaimed, err := nodeReclaimFunc()
if err == nil {
// update our local observations based on the amount reported to have been reclaimed.
// note: this is optimistic, other things could have been still consuming the pressured resource in the interim.
signal := resourceToSignal[resourceToReclaim]
value, ok := observations[signal]
if !ok {
glog.Errorf("eviction manager: unable to find value associated with signal %v", signal)
continue
}
value.available.Add(*reclaimed)
if err != nil {
glog.Warningf("eviction manager: unexpected error when attempting to reduce %v pressure: %v", resourceToReclaim, err)
}
// update our local observations based on the amount reported to have been reclaimed.
// note: this is optimistic, other things could have been still consuming the pressured resource in the interim.
signal := resourceToSignal[resourceToReclaim]
value, ok := observations[signal]
if !ok {
glog.Errorf("eviction manager: unable to find value associated with signal %v", signal)
continue
}
value.available.Add(*reclaimed)
// evaluate all current thresholds to see if with adjusted observations, we think we have met min reclaim goals
if len(thresholdsMet(m.thresholdsMet, observations, true)) == 0 {
return true
}
} else {
glog.Errorf("eviction manager: unexpected error when attempting to reduce %v pressure: %v", resourceToReclaim, err)
// evaluate all current thresholds to see if with adjusted observations, we think we have met min reclaim goals
if len(thresholdsMet(m.thresholdsMet, observations, true)) == 0 {
return true
}
}
return false

View File

@ -76,7 +76,8 @@ type NodeProvider interface {
// ImageGC is responsible for performing garbage collection of unused images.
type ImageGC interface {
// DeleteUnusedImages deletes unused images and returns the number of bytes freed, or an error.
// DeleteUnusedImages deletes unused images and returns the number of bytes freed, and an error.
// This returns the bytes freed even if an error is returned.
DeleteUnusedImages() (int64, error)
}
@ -118,6 +119,8 @@ type thresholdsObservedAt map[evictionapi.Threshold]time.Time
type nodeConditionsObservedAt map[v1.NodeConditionType]time.Time
// nodeReclaimFunc is a function that knows how to reclaim a resource from the node without impacting pods.
// Returns the quantity of resources reclaimed and an error, if applicable.
// nodeReclaimFunc return the resources reclaimed even if an error occurs.
type nodeReclaimFunc func() (*resource.Quantity, error)
// nodeReclaimFuncs is an ordered list of nodeReclaimFunc