From 77e070fb595a919464e11a61823c495867d18ac5 Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Thu, 30 Jul 2020 07:32:12 -0400 Subject: [PATCH] Make AttachDisk idempotent again if DescribeInstance tells us that Volume is attached, we enter waitForAttach loop without calling AttachDisk. But if DescribeVolume tell us - device is actually detached, then we don't wait for 20 minutes and return error early. --- .../k8s.io/legacy-cloud-providers/aws/aws.go | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/staging/src/k8s.io/legacy-cloud-providers/aws/aws.go b/staging/src/k8s.io/legacy-cloud-providers/aws/aws.go index ece2cf5597c..014b78007d9 100644 --- a/staging/src/k8s.io/legacy-cloud-providers/aws/aws.go +++ b/staging/src/k8s.io/legacy-cloud-providers/aws/aws.go @@ -257,6 +257,14 @@ const ( filterNodeLimit = 150 ) +const ( + // represents expected attachment status of a volume after attach + volumeAttachedStatus = "attached" + + // represents expected attachment status of a volume after detach + volumeDetachedStatus = "detached" +) + // awsTagNameMasterRoles is a set of well-known AWS tag names that indicate the instance is a master // The major consequence is that it is then not considered for AWS zone discovery for dynamic volume creation. var awsTagNameMasterRoles = sets.NewString("kubernetes.io/role/master", "k8s.io/role/master") @@ -1967,7 +1975,6 @@ func (c *Cloud) getMountDevice( // AWS API returns consistent result next time (i.e. the volume is detached). status := volumeStatus[mappingVolumeID] klog.Warningf("Got assignment call for already-assigned volume: %s@%s, volume status: %s", mountDevice, mappingVolumeID, status) - return mountDevice, false, fmt.Errorf("volume is still being detached from the node") } return mountDevice, true, nil } @@ -2168,7 +2175,7 @@ func (c *Cloud) applyUnSchedulableTaint(nodeName types.NodeName, reason string) // waitForAttachmentStatus polls until the attachment status is the expected value // On success, it returns the last attachment state. -func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expectedDevice string) (*ec2.VolumeAttachment, error) { +func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expectedDevice string, alreadyAttached bool) (*ec2.VolumeAttachment, error) { backoff := wait.Backoff{ Duration: volumeAttachmentStatusPollDelay, Factor: volumeAttachmentStatusFactor, @@ -2193,7 +2200,7 @@ func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expec if err != nil { // The VolumeNotFound error is special -- we don't need to wait for it to repeat if isAWSErrorVolumeNotFound(err) { - if status == "detached" { + if status == volumeDetachedStatus { // The disk doesn't exist, assume it's detached, log warning and stop waiting klog.Warningf("Waiting for volume %q to be detached but the volume does not exist", d.awsID) stateStr := "detached" @@ -2202,7 +2209,7 @@ func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expec } return true, nil } - if status == "attached" { + if status == volumeAttachedStatus { // The disk doesn't exist, complain, give up waiting and report error klog.Warningf("Waiting for volume %q to be attached but the volume does not exist", d.awsID) return false, err @@ -2237,7 +2244,7 @@ func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expec } } if attachmentStatus == "" { - attachmentStatus = "detached" + attachmentStatus = volumeDetachedStatus } if attachment != nil { // AWS eventual consistency can go back in time. @@ -2266,6 +2273,13 @@ func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expec } } + // if we expected volume to be attached and it was reported as already attached via DescribeInstance call + // but DescribeVolume told us volume is detached, we will short-circuit this long wait loop and return error + // so as AttachDisk can be retried without waiting for 20 minutes. + if (status == volumeAttachedStatus) && alreadyAttached && (attachmentStatus != status) { + return false, fmt.Errorf("attachment of disk %q failed, expected device to be attached but was %s", d.name, attachmentStatus) + } + if attachmentStatus == status { // Attachment is in requested state, finish waiting return true, nil @@ -2411,7 +2425,7 @@ func (c *Cloud) AttachDisk(diskName KubernetesVolumeID, nodeName types.NodeName) klog.V(2).Infof("AttachVolume volume=%q instance=%q request returned %v", disk.awsID, awsInstance.awsID, attachResponse) } - attachment, err := disk.waitForAttachmentStatus("attached", awsInstance.awsID, ec2Device) + attachment, err := disk.waitForAttachmentStatus("attached", awsInstance.awsID, ec2Device, alreadyAttached) if err != nil { if err == wait.ErrWaitTimeout { @@ -2489,7 +2503,7 @@ func (c *Cloud) DetachDisk(diskName KubernetesVolumeID, nodeName types.NodeName) return "", errors.New("no response from DetachVolume") } - attachment, err := diskInfo.disk.waitForAttachmentStatus("detached", awsInstance.awsID, "") + attachment, err := diskInfo.disk.waitForAttachmentStatus("detached", awsInstance.awsID, "", false) if err != nil { return "", err } @@ -4797,7 +4811,7 @@ func setNodeDisk( } func getInitialAttachDetachDelay(status string) time.Duration { - if status == "detached" { + if status == volumeDetachedStatus { return volumeDetachmentStatusInitialDelay } return volumeAttachmentStatusInitialDelay