From 4c0c97d0db77aee06a654a4113b4e7422111895e Mon Sep 17 00:00:00 2001 From: Jan Safranek Date: Mon, 7 Oct 2019 15:07:11 +0200 Subject: [PATCH] Fix attachment of just detached AWS volumes AWS API has eventual consistency model and it can return stale data. For example, DescribeInstances can return that a volume is still being detached, while it has already been confirmed to be detached. AttachDisk() should return error in this case, allowing A/D controller to retry, and not to assume that the volume is attached. --- .../k8s.io/legacy-cloud-providers/aws/aws.go | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/staging/src/k8s.io/legacy-cloud-providers/aws/aws.go b/staging/src/k8s.io/legacy-cloud-providers/aws/aws.go index b0dc26fc479..52a0d05b5f7 100644 --- a/staging/src/k8s.io/legacy-cloud-providers/aws/aws.go +++ b/staging/src/k8s.io/legacy-cloud-providers/aws/aws.go @@ -47,7 +47,7 @@ import ( "github.com/aws/aws-sdk-go/service/kms" "github.com/aws/aws-sdk-go/service/sts" "gopkg.in/gcfg.v1" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" "k8s.io/klog" "k8s.io/apimachinery/pkg/api/resource" @@ -63,7 +63,7 @@ import ( "k8s.io/client-go/pkg/version" "k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/record" - "k8s.io/cloud-provider" + cloudprovider "k8s.io/cloud-provider" nodehelpers "k8s.io/cloud-provider/node/helpers" servicehelpers "k8s.io/cloud-provider/service/helpers" cloudvolume "k8s.io/cloud-provider/volume" @@ -1861,6 +1861,7 @@ func (c *Cloud) getMountDevice( assign bool) (assigned mountDevice, alreadyAttached bool, err error) { deviceMappings := map[mountDevice]EBSVolumeID{} + volumeStatus := map[EBSVolumeID]string{} // for better logging of volume status for _, blockDevice := range info.BlockDeviceMappings { name := aws.StringValue(blockDevice.DeviceName) if strings.HasPrefix(name, "/dev/sd") { @@ -1872,6 +1873,10 @@ func (c *Cloud) getMountDevice( if len(name) < 1 || len(name) > 2 { klog.Warningf("Unexpected EBS DeviceName: %q", aws.StringValue(blockDevice.DeviceName)) } + if blockDevice.Ebs != nil && blockDevice.Ebs.VolumeId != nil { + volumeStatus[EBSVolumeID(*blockDevice.Ebs.VolumeId)] = aws.StringValue(blockDevice.Ebs.Status) + } + deviceMappings[mountDevice(name)] = EBSVolumeID(aws.StringValue(blockDevice.Ebs.VolumeId)) } @@ -1889,7 +1894,15 @@ func (c *Cloud) getMountDevice( for mountDevice, mappingVolumeID := range deviceMappings { if volumeID == mappingVolumeID { if assign { - klog.Warningf("Got assignment call for already-assigned volume: %s@%s", mountDevice, mappingVolumeID) + // DescribeInstances shows the volume as attached / detaching, while Kubernetes + // cloud provider thinks it's detached. + // This can happened when the volume has just been detached from the same node + // and AWS API returns stale data in this DescribeInstances ("eventual consistency"). + // Fail the attachment and let A/D controller retry in a while, hoping that + // AWS API returns consistent result next time (i.e. the volume is detached). + status := volumeStatus[mappingVolumeID] + klog.Warningf("Got assignment call for already-assigned volume: %s@%s, volume status: %s", mountDevice, mappingVolumeID, status) + return mountDevice, false, fmt.Errorf("volume is still being detached from the node") } return mountDevice, true, nil }