From 04f5b203885f57e77d6c70f271a10dda334419fa Mon Sep 17 00:00:00 2001 From: carlory Date: Mon, 21 Oct 2024 15:01:45 +0800 Subject: [PATCH] kubelet: Fix the volume manager not checking the device mount state in the actual state of the world before marking the volume as detached. This may cause a pod to be stuck in the Terminating state when it is deleted. --- .../cache/actual_state_of_world.go | 17 +++++++++++++++++ .../reconciler/reconciler_common.go | 5 +++++ 2 files changed, 22 insertions(+) diff --git a/pkg/kubelet/volumemanager/cache/actual_state_of_world.go b/pkg/kubelet/volumemanager/cache/actual_state_of_world.go index 38017a588f7..426e5b9b8ad 100644 --- a/pkg/kubelet/volumemanager/cache/actual_state_of_world.go +++ b/pkg/kubelet/volumemanager/cache/actual_state_of_world.go @@ -169,6 +169,11 @@ type ActualStateOfWorld interface { // or have a mount/unmount operation pending. GetAttachedVolumes() []AttachedVolume + // GetAttachedVolume returns the volume that is known to be attached to the node + // with the given volume name. If the volume is not found, the second return value + // is false. + GetAttachedVolume(volumeName v1.UniqueVolumeName) (AttachedVolume, bool) + // Add the specified volume to ASW as uncertainly attached. 
AddAttachUncertainReconstructedVolume(volumeName v1.UniqueVolumeName, volumeSpec *volume.Spec, nodeName types.NodeName, devicePath string) error @@ -1125,6 +1130,18 @@ func (asw *actualStateOfWorld) GetAttachedVolumes() []AttachedVolume { return allAttachedVolumes } +func (asw *actualStateOfWorld) GetAttachedVolume(volumeName v1.UniqueVolumeName) (AttachedVolume, bool) { + asw.RLock() + defer asw.RUnlock() + + volumeObj, ok := asw.attachedVolumes[volumeName] + if !ok { + return AttachedVolume{}, false + } + + return asw.newAttachedVolume(&volumeObj), true +} + func (asw *actualStateOfWorld) GetUnmountedVolumes() []AttachedVolume { asw.RLock() defer asw.RUnlock() diff --git a/pkg/kubelet/volumemanager/reconciler/reconciler_common.go b/pkg/kubelet/volumemanager/reconciler/reconciler_common.go index a9e656a7781..f3ff308e414 100644 --- a/pkg/kubelet/volumemanager/reconciler/reconciler_common.go +++ b/pkg/kubelet/volumemanager/reconciler/reconciler_common.go @@ -269,6 +269,11 @@ func (rc *reconciler) unmountDetachDevices() { // Check IsOperationPending to avoid marking a volume as detached if it's in the process of mounting. if !rc.desiredStateOfWorld.VolumeExists(attachedVolume.VolumeName, attachedVolume.SELinuxMountContext) && !rc.operationExecutor.IsOperationPending(attachedVolume.VolumeName, nestedpendingoperations.EmptyUniquePodName, nestedpendingoperations.EmptyNodeName) { + + // Re-read the actual state of the world, maybe the volume got mounted in the meantime. + // This is safe, because there is no pending operation (checked above) and no new operation + // could start in the meantime. The only goroutine that adds new operations is this reconciler. + attachedVolume, _ = rc.actualStateOfWorld.GetAttachedVolume(attachedVolume.VolumeName) if attachedVolume.DeviceMayBeMounted() { // Volume is globally mounted to device, unmount it klog.V(5).InfoS(attachedVolume.GenerateMsgDetailed("Starting operationExecutor.UnmountDevice", ""))