From 5297c146c1f4f03e71a0914077ca7a2b234966fb Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Thu, 9 Nov 2017 09:41:31 -0500 Subject: [PATCH] Fix dangling attach errors Detach volumes from shutdown nodes and ensure that dangling volumes are handled correctly in AWS --- pkg/cloudprovider/providers/aws/aws.go | 22 ++++++++-- pkg/volume/util/BUILD | 2 + pkg/volume/util/error.go | 41 +++++++++++++++++++ .../operationexecutor/operation_generator.go | 12 ++++++ 4 files changed, 73 insertions(+), 4 deletions(-) create mode 100644 pkg/volume/util/error.go diff --git a/pkg/cloudprovider/providers/aws/aws.go b/pkg/cloudprovider/providers/aws/aws.go index 88b18e9c1e1..50df25bbbdf 100644 --- a/pkg/cloudprovider/providers/aws/aws.go +++ b/pkg/cloudprovider/providers/aws/aws.go @@ -1717,6 +1717,9 @@ func (c *Cloud) AttachDisk(diskName KubernetesVolumeID, nodeName types.NodeName, if !alreadyAttached { available, err := c.checkIfAvailable(disk, "attaching", awsInstance.awsID) + if err != nil { + glog.Error(err) + } if !available { attachEnded = true @@ -1955,6 +1958,9 @@ func (c *Cloud) DeleteDisk(volumeName KubernetesVolumeID) (bool, error) { return false, err } available, err := c.checkIfAvailable(awsDisk, "deleting", "") + if err != nil { + glog.Error(err) + } if !available { return false, err @@ -1983,13 +1989,21 @@ func (c *Cloud) checkIfAvailable(disk *awsDisk, opName string, instance string) // Volume is attached somewhere else and we can not attach it here if len(info.Attachments) > 0 { attachment := info.Attachments[0] - attachErr := fmt.Errorf("%s since volume is currently attached to %q", opError, aws.StringValue(attachment.InstanceId)) - glog.Error(attachErr) - return false, attachErr + instanceId := aws.StringValue(attachment.InstanceId) + attachedInstance, ierr := c.getInstanceByID(instanceId) + attachErr := fmt.Sprintf("%s since volume is currently attached to %q", opError, instanceId) + if ierr != nil { + glog.Error(attachErr) + return false, errors.New(attachErr) + } + devicePath := aws.StringValue(attachment.Device) + nodeName := mapInstanceToNodeName(attachedInstance) + + danglingErr := volumeutil.NewDanglingError(attachErr, nodeName, devicePath) + return false, danglingErr } attachErr := fmt.Errorf("%s since volume is in %q state", opError, volumeState) - glog.Error(attachErr) return false, attachErr } diff --git a/pkg/volume/util/BUILD b/pkg/volume/util/BUILD index 5cff2bf5568..e79abd2d492 100644 --- a/pkg/volume/util/BUILD +++ b/pkg/volume/util/BUILD @@ -13,6 +13,7 @@ go_library( "device_util.go", "device_util_unsupported.go", "doc.go", + "error.go", "fs_unsupported.go", "io_util.go", "metrics.go", @@ -41,6 +42,7 @@ go_library( "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/labels:go_default_library", "//vendor/k8s.io/apimachinery/pkg/runtime:go_default_library", + "//vendor/k8s.io/apimachinery/pkg/types:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library", "//vendor/k8s.io/client-go/kubernetes:go_default_library", ] + select({ diff --git a/pkg/volume/util/error.go b/pkg/volume/util/error.go new file mode 100644 index 00000000000..2c9655866b0 --- /dev/null +++ b/pkg/volume/util/error.go @@ -0,0 +1,41 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package util + +import ( + k8stypes "k8s.io/apimachinery/pkg/types" +) + +// This error on attach indicates volume is attached to a different node +// than we expected. +type DanglingAttachError struct { + msg string + CurrentNode k8stypes.NodeName + DevicePath string +} + +func (err *DanglingAttachError) Error() string { + return err.msg +} + +func NewDanglingError(msg string, node k8stypes.NodeName, devicePath string) error { + return &DanglingAttachError{ + msg: msg, + CurrentNode: node, + DevicePath: devicePath, + } +} diff --git a/pkg/volume/util/operationexecutor/operation_generator.go b/pkg/volume/util/operationexecutor/operation_generator.go index b09e09ed20e..223a28cc379 100644 --- a/pkg/volume/util/operationexecutor/operation_generator.go +++ b/pkg/volume/util/operationexecutor/operation_generator.go @@ -267,6 +267,18 @@ func (og *operationGenerator) GenerateAttachVolumeFunc( volumeToAttach.VolumeSpec, volumeToAttach.NodeName) if attachErr != nil { + if derr, ok := attachErr.(*util.DanglingAttachError); ok { + addErr := actualStateOfWorld.MarkVolumeAsAttached( + v1.UniqueVolumeName(""), + volumeToAttach.VolumeSpec, + derr.CurrentNode, + derr.DevicePath) + + if addErr != nil { + glog.Errorf("AttachVolume.MarkVolumeAsAttached failed to fix dangling volume error for volume %q with %s", volumeToAttach.VolumeName, addErr) + } + + } // On failure, return error. Caller will log and retry. eventErr, detailedErr := volumeToAttach.GenerateError("AttachVolume.Attach failed", attachErr) for _, pod := range volumeToAttach.ScheduledPods {