Mark volume mount as uncertain in case of volume expansion fails

should mark volume mount in actual state even if volume expansion fails so that
reconciler can tear down the volume when needed. To avoid pods start
using it, mark volume as uncertain instead of mounted.

Will add unit test after the logic is reviewed.

Change-Id: I5aebfa11ec93235a87af8f17bea7f7b1570b603d
This commit is contained in:
Jing Xu 2021-03-25 12:32:17 -07:00
parent 671251a8b9
commit 0fa01c371c
4 changed files with 60 additions and 8 deletions

View File

@ -386,7 +386,7 @@ func (rc *reconciler) syncStates() {
// Get volumes information by reading the pod's directory
podVolumes, err := getVolumesFromPodDir(rc.kubeletPodsDir)
if err != nil {
klog.ErrorS(err, "Cannot get volumes from disk")
klog.ErrorS(err, "Cannot get volumes from disk, skip sync states for volume reconstruction")
return
}
volumesNeedUpdate := make(map[v1.UniqueVolumeName]*reconstructedVolume)

View File

@ -43,6 +43,7 @@ import (
"k8s.io/kubernetes/pkg/volume/util"
"k8s.io/kubernetes/pkg/volume/util/hostutil"
"k8s.io/kubernetes/pkg/volume/util/operationexecutor"
"k8s.io/kubernetes/pkg/volume/util/types"
)
const (
@ -1004,6 +1005,7 @@ func Test_Run_Positive_VolumeFSResizeControllerAttachEnabled(t *testing.T) {
name string
volumeMode *v1.PersistentVolumeMode
expansionFailed bool
uncertainTest bool
pvName string
pvcSize resource.Quantity
pvcStatusSize resource.Quantity
@ -1164,9 +1166,6 @@ func Test_Run_Positive_VolumeFSResizeControllerAttachEnabled(t *testing.T) {
if !cache.IsFSResizeRequiredError(podExistErr) {
t.Fatalf("Volume should be marked as fsResizeRequired, but receive unexpected error: %v", podExistErr)
}
// Start the reconciler again, we hope reconciler will perform the
// resize operation and clear the fsResizeRequired flag for volume.
go reconciler.Run(wait.NeverStop)
waitErr := retryWithExponentialBackOff(testOperationBackOffDuration, func() (done bool, err error) {
@ -1174,7 +1173,7 @@ func Test_Run_Positive_VolumeFSResizeControllerAttachEnabled(t *testing.T) {
return mounted && err == nil, nil
})
if waitErr != nil {
t.Fatal("Volume resize should succeeded")
t.Fatalf("Volume resize should succeeded %v", waitErr)
}
}
@ -1371,6 +1370,8 @@ func Test_UncertainVolumeMountState(t *testing.T) {
unmountVolumeCount int
volumeName string
supportRemount bool
pvcStatusSize resource.Quantity
pvSize resource.Quantity
}{
{
name: "timed out operations should result in volume marked as uncertain",
@ -1409,6 +1410,16 @@ func Test_UncertainVolumeMountState(t *testing.T) {
volumeName: volumetesting.SuccessAndFailOnSetupVolumeName,
supportRemount: true,
},
{
name: "mount success but fail to expand filesystem",
volumeState: operationexecutor.VolumeMountUncertain,
unmountDeviceCallCount: 1,
unmountVolumeCount: 1,
volumeName: volumetesting.FailVolumeExpansion,
supportRemount: true,
pvSize: resource.MustParse("10G"),
pvcStatusSize: resource.MustParse("2G"),
},
}
modes := []v1.PersistentVolumeMode{v1.PersistentVolumeBlock, v1.PersistentVolumeFilesystem}
@ -1431,6 +1442,11 @@ func Test_UncertainVolumeMountState(t *testing.T) {
VolumeMode: &mode,
},
}
if tc.pvSize.CmpInt64(0) > 0 {
pv.Spec.Capacity = v1.ResourceList{
v1.ResourceStorage: tc.pvSize,
}
}
pvc := &v1.PersistentVolumeClaim{
ObjectMeta: metav1.ObjectMeta{
Name: "pvc",
@ -1441,6 +1457,13 @@ func Test_UncertainVolumeMountState(t *testing.T) {
VolumeMode: &mode,
},
}
if tc.pvcStatusSize.CmpInt64(0) > 0 {
pvc.Status = v1.PersistentVolumeClaimStatus{
Capacity: v1.ResourceList{
v1.ResourceStorage: tc.pvcStatusSize,
},
}
}
pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "pod1",
@ -1525,7 +1548,7 @@ func Test_UncertainVolumeMountState(t *testing.T) {
}
if tc.volumeState == operationexecutor.VolumeMountUncertain {
waitForUncertainPodMount(t, volumeName, asw)
waitForUncertainPodMount(t, volumeName, podName, asw)
}
if tc.volumeState == operationexecutor.VolumeMounted {
@ -1607,11 +1630,15 @@ func waitForGlobalMount(t *testing.T, volumeName v1.UniqueVolumeName, asw cache.
}
}
func waitForUncertainPodMount(t *testing.T, volumeName v1.UniqueVolumeName, asw cache.ActualStateOfWorld) {
func waitForUncertainPodMount(t *testing.T, volumeName v1.UniqueVolumeName, podName types.UniquePodName, asw cache.ActualStateOfWorld) {
// check if volume is locally pod mounted in uncertain state
err := retryWithExponentialBackOff(
testOperationBackOffDuration,
func() (bool, error) {
mounted, _, err := asw.PodExistsInVolume(podName, volumeName)
if mounted || err != nil {
return false, nil
}
allMountedVolumes := asw.GetAllMountedVolumes()
for _, v := range allMountedVolumes {
if v.VolumeName == volumeName {

View File

@ -81,6 +81,8 @@ const (
// FailWithInUseVolumeName will cause NodeExpandVolume to result in FailedPrecondition error
FailWithInUseVolumeName = "fail-expansion-in-use"
FailVolumeExpansion = "fail-expansion-test"
deviceNotMounted = "deviceNotMounted"
deviceMountUncertain = "deviceMountUncertain"
deviceMounted = "deviceMounted"
@ -467,6 +469,14 @@ func (plugin *FakeVolumePlugin) NodeExpand(resizeOptions NodeResizeOptions) (boo
if resizeOptions.VolumeSpec.Name() == FailWithInUseVolumeName {
return false, volumetypes.NewFailedPreconditionError("volume-in-use")
}
// Set up fakeVolumePlugin not support STAGE_UNSTAGE for testing the behavior
// so as volume can be node published before we can resize
if resizeOptions.CSIVolumePhase == volume.CSIVolumeStaged {
return false, nil
}
if resizeOptions.CSIVolumePhase == volume.CSIVolumePublished && resizeOptions.VolumeSpec.Name() == FailVolumeExpansion {
return false, fmt.Errorf("fail volume expansion for volume: %s", FailVolumeExpansion)
}
return true, nil
}

View File

@ -708,6 +708,14 @@ func (og *operationGenerator) GenerateMountVolumeFunc(
if resizeError != nil {
klog.Errorf("MountVolume.NodeExpandVolume failed with %v", resizeError)
eventErr, detailedErr := volumeToMount.GenerateError("MountVolume.Setup failed while expanding volume", resizeError)
// At this point, MountVolume.Setup already succeeded, we should add volume into actual state
// so that reconciler can clean up volume when needed. However, volume resize failed,
// we should not mark the volume as mounted to avoid pod starts using it.
// Considering the above situations, we mark volume as uncertain here so that reconciler will tigger
// volume tear down when pod is deleted, and also makes sure pod will not start using it.
if err := actualStateOfWorld.MarkVolumeMountAsUncertain(markOpts); err != nil {
klog.Errorf(volumeToMount.GenerateErrorDetailed("MountVolume.MarkVolumeMountAsUncertain failed", err).Error())
}
return volumetypes.NewOperationContext(eventErr, detailedErr, migrated)
}
}
@ -718,7 +726,6 @@ func (og *operationGenerator) GenerateMountVolumeFunc(
eventErr, detailedErr := volumeToMount.GenerateError("MountVolume.MarkVolumeAsMounted failed", markVolMountedErr)
return volumetypes.NewOperationContext(eventErr, detailedErr, migrated)
}
return volumetypes.NewOperationContext(nil, nil, migrated)
}
@ -1172,6 +1179,14 @@ func (og *operationGenerator) GenerateMapVolumeFunc(
if resizeError != nil {
klog.Errorf("MapVolume.NodeExpandVolume failed with %v", resizeError)
eventErr, detailedErr := volumeToMount.GenerateError("MapVolume.MarkVolumeAsMounted failed while expanding volume", resizeError)
// At this point, MountVolume.Setup already succeeded, we should add volume into actual state
// so that reconciler can clean up volume when needed. However, if nodeExpandVolume failed,
// we should not mark the volume as mounted to avoid pod starts using it.
// Considering the above situations, we mark volume as uncertain here so that reconciler will tigger
// volume tear down when pod is deleted, and also makes sure pod will not start using it.
if err := actualStateOfWorld.MarkVolumeMountAsUncertain(markVolumeOpts); err != nil {
klog.Errorf(volumeToMount.GenerateErrorDetailed("MountVolume.MarkVolumeMountAsUncertain failed", err).Error())
}
return volumetypes.NewOperationContext(eventErr, detailedErr, migrated)
}