From 780ebfa496c997fec642e16a235d83028ed9cbcc Mon Sep 17 00:00:00 2001 From: carlory Date: Mon, 21 Oct 2024 15:01:45 +0800 Subject: [PATCH] kubelet: Fix the volume manager, which didn't check the device mount state in the actual state of the world before marking the volume as detached. This issue could cause a pod to be stuck in the Terminating state when it was deleted. --- .../cache/actual_state_of_world.go | 17 +++++++++++++++++ .../reconciler/reconciler_common.go | 5 +++++ 2 files changed, 22 insertions(+) diff --git a/pkg/kubelet/volumemanager/cache/actual_state_of_world.go b/pkg/kubelet/volumemanager/cache/actual_state_of_world.go index 2741e459f32..96ceeb82c5e 100644 --- a/pkg/kubelet/volumemanager/cache/actual_state_of_world.go +++ b/pkg/kubelet/volumemanager/cache/actual_state_of_world.go @@ -168,6 +168,11 @@ type ActualStateOfWorld interface { // or have a mount/unmount operation pending. GetAttachedVolumes() []AttachedVolume + // GetAttachedVolume returns the volume that is known to be attached to the node + // with the given volume name. If the volume is not found, the second return value + // is false. + GetAttachedVolume(volumeName v1.UniqueVolumeName) (AttachedVolume, bool) + // SyncReconstructedVolume check the volume.outerVolumeSpecName in asw and // the one populated from dsw, if they do not match, update this field from the value from dsw. 
SyncReconstructedVolume(volumeName v1.UniqueVolumeName, podName volumetypes.UniquePodName, outerVolumeSpecName string) @@ -1104,6 +1109,18 @@ func (asw *actualStateOfWorld) GetAttachedVolumes() []AttachedVolume { return allAttachedVolumes } +func (asw *actualStateOfWorld) GetAttachedVolume(volumeName v1.UniqueVolumeName) (AttachedVolume, bool) { + asw.RLock() + defer asw.RUnlock() + + volumeObj, ok := asw.attachedVolumes[volumeName] + if !ok { + return AttachedVolume{}, false + } + + return asw.newAttachedVolume(&volumeObj), true +} + func (asw *actualStateOfWorld) GetUnmountedVolumes() []AttachedVolume { asw.RLock() defer asw.RUnlock() diff --git a/pkg/kubelet/volumemanager/reconciler/reconciler_common.go b/pkg/kubelet/volumemanager/reconciler/reconciler_common.go index 4129247658b..cd818a32a66 100644 --- a/pkg/kubelet/volumemanager/reconciler/reconciler_common.go +++ b/pkg/kubelet/volumemanager/reconciler/reconciler_common.go @@ -270,6 +270,11 @@ func (rc *reconciler) unmountDetachDevices() { // Check IsOperationPending to avoid marking a volume as detached if it's in the process of mounting. if !rc.desiredStateOfWorld.VolumeExists(attachedVolume.VolumeName, attachedVolume.SELinuxMountContext) && !rc.operationExecutor.IsOperationPending(attachedVolume.VolumeName, nestedpendingoperations.EmptyUniquePodName, nestedpendingoperations.EmptyNodeName) { + + // Re-read the actual state of the world, maybe the volume got mounted in the meantime. + // This is safe, because there is no pending operation (checked above) and no new operation + // could start in the meantime. The only goroutine that adds new operations is this reconciler. + attachedVolume, _ = rc.actualStateOfWorld.GetAttachedVolume(attachedVolume.VolumeName) if attachedVolume.DeviceMayBeMounted() { // Volume is globally mounted to device, unmount it klog.V(5).InfoS(attachedVolume.GenerateMsgDetailed("Starting operationExecutor.UnmountDevice", ""))