Make AttachDisk idempotent again

if DescribeInstance tells us that Volume is attached, we enter
waitForAttach loop without calling AttachDisk. But if DescribeVolume
tell us - device is actually detached, then we don't wait for 20
minutes and return error early.
This commit is contained in:
Hemant Kumar 2020-07-30 07:32:12 -04:00
parent db28b0239a
commit 77e070fb59

View File

@ -257,6 +257,14 @@ const (
filterNodeLimit = 150
)
const (
// represents expected attachment status of a volume after attach
volumeAttachedStatus = "attached"
// represents expected attachment status of a volume after detach
volumeDetachedStatus = "detached"
)
// awsTagNameMasterRoles is a set of well-known AWS tag names that indicate the instance is a master
// The major consequence is that it is then not considered for AWS zone discovery for dynamic volume creation.
var awsTagNameMasterRoles = sets.NewString("kubernetes.io/role/master", "k8s.io/role/master")
@ -1967,7 +1975,6 @@ func (c *Cloud) getMountDevice(
// AWS API returns consistent result next time (i.e. the volume is detached).
status := volumeStatus[mappingVolumeID]
klog.Warningf("Got assignment call for already-assigned volume: %s@%s, volume status: %s", mountDevice, mappingVolumeID, status)
return mountDevice, false, fmt.Errorf("volume is still being detached from the node")
}
return mountDevice, true, nil
}
@ -2168,7 +2175,7 @@ func (c *Cloud) applyUnSchedulableTaint(nodeName types.NodeName, reason string)
// waitForAttachmentStatus polls until the attachment status is the expected value
// On success, it returns the last attachment state.
func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expectedDevice string) (*ec2.VolumeAttachment, error) {
func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expectedDevice string, alreadyAttached bool) (*ec2.VolumeAttachment, error) {
backoff := wait.Backoff{
Duration: volumeAttachmentStatusPollDelay,
Factor: volumeAttachmentStatusFactor,
@ -2193,7 +2200,7 @@ func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expec
if err != nil {
// The VolumeNotFound error is special -- we don't need to wait for it to repeat
if isAWSErrorVolumeNotFound(err) {
if status == "detached" {
if status == volumeDetachedStatus {
// The disk doesn't exist, assume it's detached, log warning and stop waiting
klog.Warningf("Waiting for volume %q to be detached but the volume does not exist", d.awsID)
stateStr := "detached"
@ -2202,7 +2209,7 @@ func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expec
}
return true, nil
}
if status == "attached" {
if status == volumeAttachedStatus {
// The disk doesn't exist, complain, give up waiting and report error
klog.Warningf("Waiting for volume %q to be attached but the volume does not exist", d.awsID)
return false, err
@ -2237,7 +2244,7 @@ func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expec
}
}
if attachmentStatus == "" {
attachmentStatus = "detached"
attachmentStatus = volumeDetachedStatus
}
if attachment != nil {
// AWS eventual consistency can go back in time.
@ -2266,6 +2273,13 @@ func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expec
}
}
// if we expected volume to be attached and it was reported as already attached via DescribeInstance call
// but DescribeVolume told us volume is detached, we will short-circuit this long wait loop and return error
// so as AttachDisk can be retried without waiting for 20 minutes.
if (status == volumeAttachedStatus) && alreadyAttached && (attachmentStatus != status) {
return false, fmt.Errorf("attachment of disk %q failed, expected device to be attached but was %s", d.name, attachmentStatus)
}
if attachmentStatus == status {
// Attachment is in requested state, finish waiting
return true, nil
@ -2411,7 +2425,7 @@ func (c *Cloud) AttachDisk(diskName KubernetesVolumeID, nodeName types.NodeName)
klog.V(2).Infof("AttachVolume volume=%q instance=%q request returned %v", disk.awsID, awsInstance.awsID, attachResponse)
}
attachment, err := disk.waitForAttachmentStatus("attached", awsInstance.awsID, ec2Device)
attachment, err := disk.waitForAttachmentStatus("attached", awsInstance.awsID, ec2Device, alreadyAttached)
if err != nil {
if err == wait.ErrWaitTimeout {
@ -2489,7 +2503,7 @@ func (c *Cloud) DetachDisk(diskName KubernetesVolumeID, nodeName types.NodeName)
return "", errors.New("no response from DetachVolume")
}
attachment, err := diskInfo.disk.waitForAttachmentStatus("detached", awsInstance.awsID, "")
attachment, err := diskInfo.disk.waitForAttachmentStatus("detached", awsInstance.awsID, "", false)
if err != nil {
return "", err
}
@ -4797,7 +4811,7 @@ func setNodeDisk(
}
func getInitialAttachDetachDelay(status string) time.Duration {
if status == "detached" {
if status == volumeDetachedStatus {
return volumeDetachmentStatusInitialDelay
}
return volumeAttachmentStatusInitialDelay