Fix race condition between external-resizer and kubelet

This fixes a race condition that can occur when the
resize controller has just finished volume expansion and has only
marked the PV but has yet to mark the PVC.

The workaround proposed here should not be necessary once
RecoverVolumeExpansionFailure goes GA/beta.
This commit is contained in:
Hemant Kumar 2024-01-31 12:23:56 -05:00
parent a9e4f5b786
commit d190fa3e7d
5 changed files with 43 additions and 3 deletions

View File

@ -1279,6 +1279,16 @@ func Test_Run_Positive_VolumeFSResizeControllerAttachEnabled(t *testing.T) {
newPVSize: resource.MustParse("15G"),
oldPVSize: resource.MustParse("13G"),
},
{
name: "expand-fs-volume with unsupported error",
volumeMode: &fsMode,
expansionFailed: false,
pvName: volumetesting.FailWithUnSupportedVolumeName,
pvcSize: resource.MustParse("10G"),
pvcStatusSize: resource.MustParse("10G"),
newPVSize: resource.MustParse("15G"),
oldPVSize: resource.MustParse("13G"),
},
}
for _, tc := range tests {

View File

@ -72,7 +72,7 @@ func (c *csiPlugin) nodeExpandWithClient(
}
if !nodeExpandSet {
return false, fmt.Errorf("Expander.NodeExpand found CSI plugin %s/%s to not support node expansion", c.GetPluginName(), driverName)
return false, volumetypes.NewOperationNotSupportedError(fmt.Sprintf("NodeExpand is not supported by the CSI driver %s", driverName))
}
pv := resizeOptions.VolumeSpec.PersistentVolume

View File

@ -84,6 +84,7 @@ const (
// FailWithInUseVolumeName will cause NodeExpandVolume to result in FailedPrecondition error
FailWithInUseVolumeName = "fail-expansion-in-use"
FailWithUnSupportedVolumeName = "fail-expansion-unsupported"
FailVolumeExpansion = "fail-expansion-test"
@ -500,8 +501,12 @@ func (plugin *FakeVolumePlugin) NodeExpand(resizeOptions volume.NodeResizeOption
if resizeOptions.VolumeSpec.Name() == FailWithInUseVolumeName {
return false, volumetypes.NewFailedPreconditionError("volume-in-use")
}
if resizeOptions.VolumeSpec.Name() == FailWithUnSupportedVolumeName {
return false, volumetypes.NewOperationNotSupportedError("volume-unsupported")
}
if resizeOptions.VolumeSpec.Name() == AlwaysFailNodeExpansion {
return false, fmt.Errorf("Test failure: NodeExpand")
return false, fmt.Errorf("test failure: NodeExpand")
}
if resizeOptions.VolumeSpec.Name() == FailVolumeExpansion {

View File

@ -2205,6 +2205,14 @@ func (og *operationGenerator) legacyCallNodeExpandOnPlugin(resizeOp nodeResizeOp
_, resizeErr := expandableVolumePlugin.NodeExpand(rsOpts)
if resizeErr != nil {
// This is a workaround for now, until RecoverFromVolumeExpansionFailure feature goes GA.
// If RecoverFromVolumeExpansionFailure feature is enabled, we will not ever hit this state, because
// we will wait for VolumeExpansionPendingOnNode before trying to expand volume in kubelet.
if volumetypes.IsOperationNotSupportedError(resizeErr) {
klog.V(4).InfoS(volumeToMount.GenerateMsgDetailed("MountVolume.NodeExpandVolume failed", "NodeExpandVolume not supported"), "pod", klog.KObj(volumeToMount.Pod))
return true, nil
}
// if driver returned FailedPrecondition error that means
// volume expansion should not be retried on this node but
// expansion operation should not block mounting

View File

@ -102,6 +102,23 @@ func IsFailedPreconditionError(err error) bool {
return errors.As(err, &failedPreconditionError)
}
// OperationNotSupported is an error type signalling that the requested
// operation is not supported. It carries only a human-readable message.
type OperationNotSupported struct {
	msg string
}

// Error implements the error interface and returns the stored message.
func (e *OperationNotSupported) Error() string {
	return e.msg
}

// NewOperationNotSupportedError returns a pointer to a new
// OperationNotSupported error whose message is msg.
func NewOperationNotSupportedError(msg string) *OperationNotSupported {
	notSupported := new(OperationNotSupported)
	notSupported.msg = msg
	return notSupported
}

// IsOperationNotSupportedError reports whether err, or any error in its
// unwrap chain, is an *OperationNotSupported.
func IsOperationNotSupportedError(err error) bool {
	target := (*OperationNotSupported)(nil)
	return errors.As(err, &target)
}
// TransientOperationFailure indicates operation failed with a transient error
// and may fix itself when retried.
type TransientOperationFailure struct {