Merge pull request #38339 from gnufied/backoff-on-volume-delete

Automatic merge from submit-queue (batch tested with PRs 38377, 36365, 36648, 37691, 38339)

Exponential back off when volume delete fails

**What this PR does / why we need it**:

This PR implements ability in pv_controller to back off when deleting a volume fails from plugin API. 

**Which issue this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close that issue when PR gets merged)*: 

Partly fixes #38295 , but I think volume delete is most problematic thing happening in pv_controller without any sort of backoff. 

After this change the attempts of volume deletion look like:

```
controller : I1208 00:18:35.532061   16388 aws_util.go:55] Error deleting EBS Disk volume aws://us-east-1d/vol-abcdefg: VolumeInUse: Volume vol-abcdefg is currently attached to i-1234567
controller : I1208 00:20:50.578325   16388 aws_util.go:55] Error deleting EBS Disk volume aws://us-east-1d/vol-abcdefg: VolumeInUse: Volume vol-abcdefg is currently attached to i-1234567
controller : I1208 00:23:05.563488   16388 aws_util.go:55] Error deleting EBS Disk volume aws://us-east-1d/vol-abcdefg: VolumeInUse: Volume vol-abcdefg is currently attached to i-1234567
controller : I1208 00:25:20.599158   16388 aws_util.go:55] Error deleting EBS Disk volume aws://us-east-1d/vol-abcdefg: VolumeInUse: Volume vol-abcdefg is currently attached to i-1234567
controller : I1208 00:27:35.560009   16388 aws_util.go:55] Error deleting EBS Disk volume aws://us-east-1d/vol-abcdefg: VolumeInUse: Volume vol-abcdefg is currently attached to i-1234567
controller : I1208 00:29:50.594967   16388 aws_util.go:55] Error deleting EBS Disk volume aws://us-east-1d/vol-abcdefg: VolumeInUse: Volume vol-abcdefg is currently attached to i-1234567
controller : I1208 00:32:05.539168   16388 aws_util.go:55] Error deleting EBS Disk volume aws://us-east-1d/vol-abcdefg: VolumeInUse: Volume vol-abcdefg is currently attached to i-1234567
controller : I1208 00:34:20.581665   16388 aws_util.go:55] Error deleting EBS Disk volume aws://us-east-1d/vol-abcdefg: VolumeInUse: Volume vol-abcdefg is currently attached to i-1234567
```
This commit is contained in:
Kubernetes Submit Queue 2016-12-08 10:52:03 -08:00 committed by GitHub
commit 8f11cc78a8
2 changed files with 13 additions and 13 deletions

View File

@ -936,8 +936,7 @@ func (ctrl *PersistentVolumeController) reclaimVolume(volume *v1.PersistentVolum
glog.V(4).Infof("reclaimVolume[%s]: policy is Delete", volume.Name)
opName := fmt.Sprintf("delete-%s[%s]", volume.Name, string(volume.UID))
ctrl.scheduleOperation(opName, func() error {
ctrl.deleteVolumeOperation(volume)
return nil
return ctrl.deleteVolumeOperation(volume)
})
default:
@ -1042,12 +1041,13 @@ func (ctrl *PersistentVolumeController) recycleVolumeOperation(arg interface{})
// deleteVolumeOperation deletes a volume. This method is running in standalone
// goroutine and already has all necessary locks.
func (ctrl *PersistentVolumeController) deleteVolumeOperation(arg interface{}) {
func (ctrl *PersistentVolumeController) deleteVolumeOperation(arg interface{}) error {
volume, ok := arg.(*v1.PersistentVolume)
if !ok {
glog.Errorf("Cannot convert deleteVolumeOperation argument to volume, got %#v", arg)
return
return nil
}
glog.V(4).Infof("deleteVolumeOperation [%s] started", volume.Name)
// This method may have been waiting for a volume lock for some time.
@ -1056,16 +1056,16 @@ func (ctrl *PersistentVolumeController) deleteVolumeOperation(arg interface{}) {
newVolume, err := ctrl.kubeClient.Core().PersistentVolumes().Get(volume.Name)
if err != nil {
glog.V(3).Infof("error reading peristent volume %q: %v", volume.Name, err)
return
return nil
}
needsReclaim, err := ctrl.isVolumeReleased(newVolume)
if err != nil {
glog.V(3).Infof("error reading claim for volume %q: %v", volume.Name, err)
return
return nil
}
if !needsReclaim {
glog.V(3).Infof("volume %q no longer needs deletion, skipping", volume.Name)
return
return nil
}
deleted, err := ctrl.doDeleteVolume(volume)
@ -1082,17 +1082,17 @@ func (ctrl *PersistentVolumeController) deleteVolumeOperation(arg interface{}) {
if _, err = ctrl.updateVolumePhaseWithEvent(volume, v1.VolumeFailed, v1.EventTypeWarning, "VolumeFailedDelete", err.Error()); err != nil {
glog.V(4).Infof("deleteVolumeOperation [%s]: failed to mark volume as failed: %v", volume.Name, err)
// Save failed, retry on the next deletion attempt
return
return err
}
}
// Despite the volume being Failed, the controller will retry deleting
// the volume in every syncVolume() call.
return
return err
}
if !deleted {
// The volume waits for deletion by an external plugin. Do nothing.
return
return nil
}
glog.V(4).Infof("deleteVolumeOperation [%s]: success", volume.Name)
@ -1103,9 +1103,9 @@ func (ctrl *PersistentVolumeController) deleteVolumeOperation(arg interface{}) {
// cache of "recently deleted volumes" and avoid unnecessary deletion,
// this is left out as future optimization.
glog.V(3).Infof("failed to delete volume %q from database: %v", volume.Name, err)
return
return nil
}
return
return nil
}
// isVolumeReleased returns true if given volume is released and can be recycled

View File

@ -71,7 +71,7 @@ func NewController(p ControllerParameters) *PersistentVolumeController {
claims: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc),
kubeClient: p.KubeClient,
eventRecorder: eventRecorder,
runningOperations: goroutinemap.NewGoRoutineMap(false /* exponentialBackOffOnError */),
runningOperations: goroutinemap.NewGoRoutineMap(true /* exponentialBackOffOnError */),
cloud: p.Cloud,
enableDynamicProvisioning: p.EnableDynamicProvisioning,
clusterName: p.ClusterName,