From 0fa01c371c9d1ef9530db06444229a007c3c8fdc Mon Sep 17 00:00:00 2001 From: Jing Xu Date: Thu, 25 Mar 2021 12:32:17 -0700 Subject: [PATCH] Mark volume mount as uncertain in case of volume expansion fails should mark volume mount in actual state even if volume expansion fails so that reconciler can tear down the volume when needed. To avoid pods start using it, mark volume as uncertain instead of mounted. Will add unit test after the logic is reviewed. Change-Id: I5aebfa11ec93235a87af8f17bea7f7b1570b603d --- .../volumemanager/reconciler/reconciler.go | 2 +- .../reconciler/reconciler_test.go | 39 ++++++++++++++++--- pkg/volume/testing/testing.go | 10 +++++ .../operationexecutor/operation_generator.go | 17 +++++++- 4 files changed, 60 insertions(+), 8 deletions(-) diff --git a/pkg/kubelet/volumemanager/reconciler/reconciler.go b/pkg/kubelet/volumemanager/reconciler/reconciler.go index 5966cd969d9..2e8ad5e7802 100644 --- a/pkg/kubelet/volumemanager/reconciler/reconciler.go +++ b/pkg/kubelet/volumemanager/reconciler/reconciler.go @@ -386,7 +386,7 @@ func (rc *reconciler) syncStates() { // Get volumes information by reading the pod's directory podVolumes, err := getVolumesFromPodDir(rc.kubeletPodsDir) if err != nil { - klog.ErrorS(err, "Cannot get volumes from disk") + klog.ErrorS(err, "Cannot get volumes from disk, skip sync states for volume reconstruction") return } volumesNeedUpdate := make(map[v1.UniqueVolumeName]*reconstructedVolume) diff --git a/pkg/kubelet/volumemanager/reconciler/reconciler_test.go b/pkg/kubelet/volumemanager/reconciler/reconciler_test.go index 10bb3080610..02fc8fe9926 100644 --- a/pkg/kubelet/volumemanager/reconciler/reconciler_test.go +++ b/pkg/kubelet/volumemanager/reconciler/reconciler_test.go @@ -43,6 +43,7 @@ import ( "k8s.io/kubernetes/pkg/volume/util" "k8s.io/kubernetes/pkg/volume/util/hostutil" "k8s.io/kubernetes/pkg/volume/util/operationexecutor" + "k8s.io/kubernetes/pkg/volume/util/types" ) const ( @@ -1004,6 +1005,7 @@ func Test_Run_Positive_VolumeFSResizeControllerAttachEnabled(t *testing.T) { name string volumeMode *v1.PersistentVolumeMode expansionFailed bool + uncertainTest bool pvName string pvcSize resource.Quantity pvcStatusSize resource.Quantity @@ -1164,9 +1166,6 @@ func Test_Run_Positive_VolumeFSResizeControllerAttachEnabled(t *testing.T) { if !cache.IsFSResizeRequiredError(podExistErr) { t.Fatalf("Volume should be marked as fsResizeRequired, but receive unexpected error: %v", podExistErr) } - - // Start the reconciler again, we hope reconciler will perform the - // resize operation and clear the fsResizeRequired flag for volume. go reconciler.Run(wait.NeverStop) waitErr := retryWithExponentialBackOff(testOperationBackOffDuration, func() (done bool, err error) { @@ -1174,7 +1173,7 @@ func Test_Run_Positive_VolumeFSResizeControllerAttachEnabled(t *testing.T) { return mounted && err == nil, nil }) if waitErr != nil { - t.Fatal("Volume resize should succeeded") + t.Fatalf("Volume resize should succeeded %v", waitErr) } } @@ -1371,6 +1370,8 @@ func Test_UncertainVolumeMountState(t *testing.T) { unmountVolumeCount int volumeName string supportRemount bool + pvcStatusSize resource.Quantity + pvSize resource.Quantity }{ { name: "timed out operations should result in volume marked as uncertain", @@ -1409,6 +1410,16 @@ func Test_UncertainVolumeMountState(t *testing.T) { volumeName: volumetesting.SuccessAndFailOnSetupVolumeName, supportRemount: true, }, + { + name: "mount success but fail to expand filesystem", + volumeState: operationexecutor.VolumeMountUncertain, + unmountDeviceCallCount: 1, + unmountVolumeCount: 1, + volumeName: volumetesting.FailVolumeExpansion, + supportRemount: true, + pvSize: resource.MustParse("10G"), + pvcStatusSize: resource.MustParse("2G"), + }, } modes := []v1.PersistentVolumeMode{v1.PersistentVolumeBlock, v1.PersistentVolumeFilesystem} @@ -1431,6 +1442,11 @@ func Test_UncertainVolumeMountState(t *testing.T) { VolumeMode: &mode, }, } + if tc.pvSize.CmpInt64(0) > 0 { + pv.Spec.Capacity = v1.ResourceList{ + v1.ResourceStorage: tc.pvSize, + } + } pvc := &v1.PersistentVolumeClaim{ ObjectMeta: metav1.ObjectMeta{ Name: "pvc", @@ -1441,6 +1457,13 @@ func Test_UncertainVolumeMountState(t *testing.T) { VolumeMode: &mode, }, } + if tc.pvcStatusSize.CmpInt64(0) > 0 { + pvc.Status = v1.PersistentVolumeClaimStatus{ + Capacity: v1.ResourceList{ + v1.ResourceStorage: tc.pvcStatusSize, + }, + } + } pod := &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: "pod1", @@ -1525,7 +1548,7 @@ func Test_UncertainVolumeMountState(t *testing.T) { } if tc.volumeState == operationexecutor.VolumeMountUncertain { - waitForUncertainPodMount(t, volumeName, asw) + waitForUncertainPodMount(t, volumeName, podName, asw) } if tc.volumeState == operationexecutor.VolumeMounted { @@ -1607,11 +1630,15 @@ func waitForGlobalMount(t *testing.T, volumeName v1.UniqueVolumeName, asw cache. } } -func waitForUncertainPodMount(t *testing.T, volumeName v1.UniqueVolumeName, asw cache.ActualStateOfWorld) { +func waitForUncertainPodMount(t *testing.T, volumeName v1.UniqueVolumeName, podName types.UniquePodName, asw cache.ActualStateOfWorld) { // check if volume is locally pod mounted in uncertain state err := retryWithExponentialBackOff( testOperationBackOffDuration, func() (bool, error) { + mounted, _, err := asw.PodExistsInVolume(podName, volumeName) + if mounted || err != nil { + return false, nil + } allMountedVolumes := asw.GetAllMountedVolumes() for _, v := range allMountedVolumes { if v.VolumeName == volumeName { diff --git a/pkg/volume/testing/testing.go b/pkg/volume/testing/testing.go index 4f297781619..c21ffc72bb3 100644 --- a/pkg/volume/testing/testing.go +++ b/pkg/volume/testing/testing.go @@ -81,6 +81,8 @@ const ( // FailWithInUseVolumeName will cause NodeExpandVolume to result in FailedPrecondition error FailWithInUseVolumeName = "fail-expansion-in-use" + FailVolumeExpansion = "fail-expansion-test" + deviceNotMounted = "deviceNotMounted" deviceMountUncertain = "deviceMountUncertain" deviceMounted = "deviceMounted" @@ -467,6 +469,14 @@ func (plugin *FakeVolumePlugin) NodeExpand(resizeOptions NodeResizeOptions) (boo if resizeOptions.VolumeSpec.Name() == FailWithInUseVolumeName { return false, volumetypes.NewFailedPreconditionError("volume-in-use") } + // Set up fakeVolumePlugin not support STAGE_UNSTAGE for testing the behavior + // so as volume can be node published before we can resize + if resizeOptions.CSIVolumePhase == volume.CSIVolumeStaged { + return false, nil + } + if resizeOptions.CSIVolumePhase == volume.CSIVolumePublished && resizeOptions.VolumeSpec.Name() == FailVolumeExpansion { + return false, fmt.Errorf("fail volume expansion for volume: %s", FailVolumeExpansion) + } return true, nil } diff --git a/pkg/volume/util/operationexecutor/operation_generator.go b/pkg/volume/util/operationexecutor/operation_generator.go index 13bcfd2a567..cf1dcefdccd 100644 --- a/pkg/volume/util/operationexecutor/operation_generator.go +++ b/pkg/volume/util/operationexecutor/operation_generator.go @@ -708,6 +708,14 @@ func (og *operationGenerator) GenerateMountVolumeFunc( if resizeError != nil { klog.Errorf("MountVolume.NodeExpandVolume failed with %v", resizeError) eventErr, detailedErr := volumeToMount.GenerateError("MountVolume.Setup failed while expanding volume", resizeError) + // At this point, MountVolume.Setup already succeeded, we should add volume into actual state + // so that reconciler can clean up volume when needed. However, volume resize failed, + // we should not mark the volume as mounted to avoid pod starts using it. + // Considering the above situations, we mark volume as uncertain here so that reconciler will tigger + // volume tear down when pod is deleted, and also makes sure pod will not start using it. + if err := actualStateOfWorld.MarkVolumeMountAsUncertain(markOpts); err != nil { + klog.Errorf(volumeToMount.GenerateErrorDetailed("MountVolume.MarkVolumeMountAsUncertain failed", err).Error()) + } return volumetypes.NewOperationContext(eventErr, detailedErr, migrated) } } @@ -718,7 +726,6 @@ func (og *operationGenerator) GenerateMountVolumeFunc( eventErr, detailedErr := volumeToMount.GenerateError("MountVolume.MarkVolumeAsMounted failed", markVolMountedErr) return volumetypes.NewOperationContext(eventErr, detailedErr, migrated) } - return volumetypes.NewOperationContext(nil, nil, migrated) } @@ -1172,6 +1179,14 @@ func (og *operationGenerator) GenerateMapVolumeFunc( if resizeError != nil { klog.Errorf("MapVolume.NodeExpandVolume failed with %v", resizeError) eventErr, detailedErr := volumeToMount.GenerateError("MapVolume.MarkVolumeAsMounted failed while expanding volume", resizeError) + // At this point, MountVolume.Setup already succeeded, we should add volume into actual state + // so that reconciler can clean up volume when needed. However, if nodeExpandVolume failed, + // we should not mark the volume as mounted to avoid pod starts using it. + // Considering the above situations, we mark volume as uncertain here so that reconciler will tigger + // volume tear down when pod is deleted, and also makes sure pod will not start using it. + if err := actualStateOfWorld.MarkVolumeMountAsUncertain(markVolumeOpts); err != nil { + klog.Errorf(volumeToMount.GenerateErrorDetailed("MountVolume.MarkVolumeMountAsUncertain failed", err).Error()) + } return volumetypes.NewOperationContext(eventErr, detailedErr, migrated) }