AWS EBS: Remove the attached-volume state cache

There are known issues with the attached-volume state cache that just aren't
possible to fix with the current interface.

Replace it with a map of the in-progress attach jobs (avoiding a nasty
race condition between assigning a device mapping and attaching the
volume was the original reason for the cache).

This costs us an extra DescribeInstances call on each attach/detach, but
that seems worth it if it ends this class of bugs.

Fix #15073
This commit is contained in:
Justin Santa Barbara 2016-03-10 07:47:30 -05:00
parent d7a87c2285
commit 79b2b7edef

View File

@ -932,9 +932,10 @@ type awsInstance struct {
mutex sync.Mutex mutex sync.Mutex
// We must cache because otherwise there is a race condition, // We keep an active list of devices we have assigned but not yet
// where we assign a device mapping and then get a second request before we attach the volume // attached, to avoid a race condition where we assign a device mapping
deviceMappings map[mountDevice]string // and then get a second request before we attach the volume
attaching map[mountDevice]string
} }
// newAWSInstance creates a new awsInstance object // newAWSInstance creates a new awsInstance object
@ -953,8 +954,7 @@ func newAWSInstance(ec2Service EC2, instance *ec2.Instance) *awsInstance {
subnetID: aws.StringValue(instance.SubnetId), subnetID: aws.StringValue(instance.SubnetId),
} }
// We lazy-init deviceMappings self.attaching = make(map[mountDevice]string)
self.deviceMappings = nil
return self return self
} }
@ -1001,31 +1001,31 @@ func (self *awsInstance) getMountDevice(volumeID string, assign bool) (assigned
self.mutex.Lock() self.mutex.Lock()
defer self.mutex.Unlock() defer self.mutex.Unlock()
// We cache both for efficiency and correctness info, err := self.describeInstance()
if self.deviceMappings == nil { if err != nil {
info, err := self.describeInstance() return "", false, err
if err != nil { }
return "", false, err deviceMappings := map[mountDevice]string{}
for _, blockDevice := range info.BlockDeviceMappings {
name := aws.StringValue(blockDevice.DeviceName)
if strings.HasPrefix(name, "/dev/sd") {
name = name[7:]
} }
deviceMappings := map[mountDevice]string{} if strings.HasPrefix(name, "/dev/xvd") {
for _, blockDevice := range info.BlockDeviceMappings { name = name[8:]
name := aws.StringValue(blockDevice.DeviceName)
if strings.HasPrefix(name, "/dev/sd") {
name = name[7:]
}
if strings.HasPrefix(name, "/dev/xvd") {
name = name[8:]
}
if len(name) != 1 {
glog.Warningf("Unexpected EBS DeviceName: %q", aws.StringValue(blockDevice.DeviceName))
}
deviceMappings[mountDevice(name)] = aws.StringValue(blockDevice.Ebs.VolumeId)
} }
self.deviceMappings = deviceMappings if len(name) != 1 {
glog.Warningf("Unexpected EBS DeviceName: %q", aws.StringValue(blockDevice.DeviceName))
}
deviceMappings[mountDevice(name)] = aws.StringValue(blockDevice.Ebs.VolumeId)
}
for mountDevice, volume := range self.attaching {
deviceMappings[mountDevice] = volume
} }
// Check to see if this volume is already assigned a device on this machine // Check to see if this volume is already assigned a device on this machine
for mountDevice, mappingVolumeID := range self.deviceMappings { for mountDevice, mappingVolumeID := range deviceMappings {
if volumeID == mappingVolumeID { if volumeID == mappingVolumeID {
if assign { if assign {
glog.Warningf("Got assignment call for already-assigned volume: %s@%s", mountDevice, mappingVolumeID) glog.Warningf("Got assignment call for already-assigned volume: %s@%s", mountDevice, mappingVolumeID)
@ -1042,7 +1042,7 @@ func (self *awsInstance) getMountDevice(volumeID string, assign bool) (assigned
valid := instanceType.getEBSMountDevices() valid := instanceType.getEBSMountDevices()
chosen := mountDevice("") chosen := mountDevice("")
for _, mountDevice := range valid { for _, mountDevice := range valid {
_, found := self.deviceMappings[mountDevice] _, found := deviceMappings[mountDevice]
if !found { if !found {
chosen = mountDevice chosen = mountDevice
break break
@ -1050,31 +1050,31 @@ func (self *awsInstance) getMountDevice(volumeID string, assign bool) (assigned
} }
if chosen == "" { if chosen == "" {
glog.Warningf("Could not assign a mount device (all in use?). mappings=%v, valid=%v", self.deviceMappings, valid) glog.Warningf("Could not assign a mount device (all in use?). mappings=%v, valid=%v", deviceMappings, valid)
return "", false, nil return "", false, nil
} }
self.deviceMappings[chosen] = volumeID self.attaching[chosen] = volumeID
glog.V(2).Infof("Assigned mount device %s -> volume %s", chosen, volumeID) glog.V(2).Infof("Assigned mount device %s -> volume %s", chosen, volumeID)
return chosen, false, nil return chosen, false, nil
} }
func (self *awsInstance) releaseMountDevice(volumeID string, mountDevice mountDevice) { func (self *awsInstance) endAttaching(volumeID string, mountDevice mountDevice) {
self.mutex.Lock() self.mutex.Lock()
defer self.mutex.Unlock() defer self.mutex.Unlock()
existingVolumeID, found := self.deviceMappings[mountDevice] existingVolumeID, found := self.attaching[mountDevice]
if !found { if !found {
glog.Errorf("releaseMountDevice on non-allocated device") glog.Errorf("endAttaching on non-allocated device")
return return
} }
if volumeID != existingVolumeID { if volumeID != existingVolumeID {
glog.Errorf("releaseMountDevice on device assigned to different volume") glog.Errorf("endAttaching on device assigned to different volume")
return return
} }
glog.V(2).Infof("Releasing mount device mapping: %s -> volume %s", mountDevice, volumeID) glog.V(2).Infof("Releasing mount device mapping: %s -> volume %s", mountDevice, volumeID)
delete(self.deviceMappings, mountDevice) delete(self.attaching, mountDevice)
} }
type awsDisk struct { type awsDisk struct {
@ -1280,8 +1280,8 @@ func (c *AWSCloud) AttachDisk(diskName string, instanceName string, readOnly boo
attached := false attached := false
defer func() { defer func() {
if !attached { if attached {
awsInstance.releaseMountDevice(disk.awsID, mountDevice) awsInstance.endAttaching(disk.awsID, mountDevice)
} }
}() }()
@ -1346,33 +1346,15 @@ func (aws *AWSCloud) DetachDisk(diskName string, instanceName string) (string, e
return "", errors.New("no response from DetachVolume") return "", errors.New("no response from DetachVolume")
} }
// TODO: Fix this - just remove the cache?
// If we don't have a cache; we don't have to wait any more (the driver does it for us)
// Also, maybe we could get the locally connected drivers from the AWS metadata service?
// At this point we are waiting for the volume being detached. This
// releases the volume and invalidates the cache even when there is a timeout.
//
// TODO: A timeout leaves the cache in an inconsistent state. The volume is still
// detaching though the cache shows it as ready to be attached again. Subsequent
// attach operations will fail. The attach is being retried and eventually
// works though. An option would be to completely flush the cache upon timeouts.
//
defer func() {
// TODO: Not thread safe?
for mountDevice, existingVolumeID := range awsInstance.deviceMappings {
if existingVolumeID == disk.awsID {
awsInstance.releaseMountDevice(disk.awsID, mountDevice)
return
}
}
}()
err = disk.waitForAttachmentStatus("detached") err = disk.waitForAttachmentStatus("detached")
if err != nil { if err != nil {
return "", err return "", err
} }
if mountDevice != "" {
awsInstance.endAttaching(disk.awsID, mountDevice)
}
hostDevicePath := "/dev/xvd" + string(mountDevice) hostDevicePath := "/dev/xvd" + string(mountDevice)
return hostDevicePath, err return hostDevicePath, err
} }