From d190fa3e7de7c764bca6ce1dd4e2787b04843a7e Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Wed, 31 Jan 2024 12:23:56 -0500 Subject: [PATCH] Fix race condition between external-resizer and kubelet This fixes the race condition that could happen because resize controller just finished volume expansiona and has only finished marking PV and yet to mark PVC. The workaround proposed here should not be necessary once RecoverVolumeExpansionFailure goes GA/beta. --- .../volumemanager/reconciler/reconciler_test.go | 10 ++++++++++ pkg/volume/csi/expander.go | 2 +- pkg/volume/testing/testing.go | 9 +++++++-- .../operationexecutor/operation_generator.go | 8 ++++++++ pkg/volume/util/types/types.go | 17 +++++++++++++++++ 5 files changed, 43 insertions(+), 3 deletions(-) diff --git a/pkg/kubelet/volumemanager/reconciler/reconciler_test.go b/pkg/kubelet/volumemanager/reconciler/reconciler_test.go index bf909d4871a..8fc21c7d5d7 100644 --- a/pkg/kubelet/volumemanager/reconciler/reconciler_test.go +++ b/pkg/kubelet/volumemanager/reconciler/reconciler_test.go @@ -1279,6 +1279,16 @@ func Test_Run_Positive_VolumeFSResizeControllerAttachEnabled(t *testing.T) { newPVSize: resource.MustParse("15G"), oldPVSize: resource.MustParse("13G"), }, + { + name: "expand-fs-volume with unsupported error", + volumeMode: &fsMode, + expansionFailed: false, + pvName: volumetesting.FailWithUnSupportedVolumeName, + pvcSize: resource.MustParse("10G"), + pvcStatusSize: resource.MustParse("10G"), + newPVSize: resource.MustParse("15G"), + oldPVSize: resource.MustParse("13G"), + }, } for _, tc := range tests { diff --git a/pkg/volume/csi/expander.go b/pkg/volume/csi/expander.go index 408780582f1..d6aae060101 100644 --- a/pkg/volume/csi/expander.go +++ b/pkg/volume/csi/expander.go @@ -72,7 +72,7 @@ func (c *csiPlugin) nodeExpandWithClient( } if !nodeExpandSet { - return false, fmt.Errorf("Expander.NodeExpand found CSI plugin %s/%s to not support node expansion", c.GetPluginName(), driverName) + return false, volumetypes.NewOperationNotSupportedError(fmt.Sprintf("NodeExpand is not supported by the CSI driver %s", driverName)) } pv := resizeOptions.VolumeSpec.PersistentVolume diff --git a/pkg/volume/testing/testing.go b/pkg/volume/testing/testing.go index f0d4cc48175..4607500c70e 100644 --- a/pkg/volume/testing/testing.go +++ b/pkg/volume/testing/testing.go @@ -83,7 +83,8 @@ const ( SuccessAndFailOnMountDeviceName = "success-and-failed-mount-device-name" // FailWithInUseVolumeName will cause NodeExpandVolume to result in FailedPrecondition error - FailWithInUseVolumeName = "fail-expansion-in-use" + FailWithInUseVolumeName = "fail-expansion-in-use" + FailWithUnSupportedVolumeName = "fail-expansion-unsupported" FailVolumeExpansion = "fail-expansion-test" @@ -500,8 +501,12 @@ func (plugin *FakeVolumePlugin) NodeExpand(resizeOptions volume.NodeResizeOption if resizeOptions.VolumeSpec.Name() == FailWithInUseVolumeName { return false, volumetypes.NewFailedPreconditionError("volume-in-use") } + if resizeOptions.VolumeSpec.Name() == FailWithUnSupportedVolumeName { + return false, volumetypes.NewOperationNotSupportedError("volume-unsupported") + } + if resizeOptions.VolumeSpec.Name() == AlwaysFailNodeExpansion { - return false, fmt.Errorf("Test failure: NodeExpand") + return false, fmt.Errorf("test failure: NodeExpand") } if resizeOptions.VolumeSpec.Name() == FailVolumeExpansion { diff --git a/pkg/volume/util/operationexecutor/operation_generator.go b/pkg/volume/util/operationexecutor/operation_generator.go index 22917267510..42fb6c601c3 100644 --- a/pkg/volume/util/operationexecutor/operation_generator.go +++ b/pkg/volume/util/operationexecutor/operation_generator.go @@ -2205,6 +2205,14 @@ func (og *operationGenerator) legacyCallNodeExpandOnPlugin(resizeOp nodeResizeOp _, resizeErr := expandableVolumePlugin.NodeExpand(rsOpts) if resizeErr != nil { + // This is a workaround for now, until RecoverFromVolumeExpansionFailure feature goes GA. + // If RecoverFromVolumeExpansionFailure feature is enabled, we will not ever hit this state, because + // we will wait for VolumeExpansionPendingOnNode before trying to expand volume in kubelet. + if volumetypes.IsOperationNotSupportedError(resizeErr) { + klog.V(4).InfoS(volumeToMount.GenerateMsgDetailed("MountVolume.NodeExpandVolume failed", "NodeExpandVolume not supported"), "pod", klog.KObj(volumeToMount.Pod)) + return true, nil + } + // if driver returned FailedPrecondition error that means // volume expansion should not be retried on this node but // expansion operation should not block mounting diff --git a/pkg/volume/util/types/types.go b/pkg/volume/util/types/types.go index af309353ba7..238e919b331 100644 --- a/pkg/volume/util/types/types.go +++ b/pkg/volume/util/types/types.go @@ -102,6 +102,23 @@ func IsFailedPreconditionError(err error) bool { return errors.As(err, &failedPreconditionError) } +type OperationNotSupported struct { + msg string +} + +func (err *OperationNotSupported) Error() string { + return err.msg +} + +func NewOperationNotSupportedError(msg string) *OperationNotSupported { + return &OperationNotSupported{msg: msg} +} + +func IsOperationNotSupportedError(err error) bool { + var operationNotSupportedError *OperationNotSupported + return errors.As(err, &operationNotSupportedError) +} + // TransientOperationFailure indicates operation failed with a transient error // and may fix itself when retried. type TransientOperationFailure struct {