diff --git a/pkg/kubelet/volumemanager/reconciler/reconciler.go b/pkg/kubelet/volumemanager/reconciler/reconciler.go
index 5966cd969d9..2e8ad5e7802 100644
--- a/pkg/kubelet/volumemanager/reconciler/reconciler.go
+++ b/pkg/kubelet/volumemanager/reconciler/reconciler.go
@@ -386,7 +386,7 @@ func (rc *reconciler) syncStates() {
 	// Get volumes information by reading the pod's directory
 	podVolumes, err := getVolumesFromPodDir(rc.kubeletPodsDir)
 	if err != nil {
-		klog.ErrorS(err, "Cannot get volumes from disk")
+		klog.ErrorS(err, "Cannot get volumes from disk, skip sync states for volume reconstruction")
 		return
 	}
 	volumesNeedUpdate := make(map[v1.UniqueVolumeName]*reconstructedVolume)
diff --git a/pkg/kubelet/volumemanager/reconciler/reconciler_test.go b/pkg/kubelet/volumemanager/reconciler/reconciler_test.go
index 10bb3080610..02fc8fe9926 100644
--- a/pkg/kubelet/volumemanager/reconciler/reconciler_test.go
+++ b/pkg/kubelet/volumemanager/reconciler/reconciler_test.go
@@ -43,6 +43,7 @@ import (
 	"k8s.io/kubernetes/pkg/volume/util"
 	"k8s.io/kubernetes/pkg/volume/util/hostutil"
 	"k8s.io/kubernetes/pkg/volume/util/operationexecutor"
+	"k8s.io/kubernetes/pkg/volume/util/types"
 )

 const (
@@ -1004,6 +1005,7 @@ func Test_Run_Positive_VolumeFSResizeControllerAttachEnabled(t *testing.T) {
 		name            string
 		volumeMode      *v1.PersistentVolumeMode
 		expansionFailed bool
+		uncertainTest   bool
 		pvName          string
 		pvcSize         resource.Quantity
 		pvcStatusSize   resource.Quantity
@@ -1164,9 +1166,6 @@ func Test_Run_Positive_VolumeFSResizeControllerAttachEnabled(t *testing.T) {
 			if !cache.IsFSResizeRequiredError(podExistErr) {
 				t.Fatalf("Volume should be marked as fsResizeRequired, but receive unexpected error: %v", podExistErr)
 			}
-
-			// Start the reconciler again, we hope reconciler will perform the
-			// resize operation and clear the fsResizeRequired flag for volume.
 			go reconciler.Run(wait.NeverStop)

 			waitErr := retryWithExponentialBackOff(testOperationBackOffDuration, func() (done bool, err error) {
@@ -1174,7 +1173,7 @@ func Test_Run_Positive_VolumeFSResizeControllerAttachEnabled(t *testing.T) {
 				return mounted && err == nil, nil
 			})
 			if waitErr != nil {
-				t.Fatal("Volume resize should succeeded")
+				t.Fatalf("Volume resize should have succeeded: %v", waitErr)
 			}
 		}
@@ -1371,6 +1370,8 @@ func Test_UncertainVolumeMountState(t *testing.T) {
 		unmountVolumeCount     int
 		volumeName             string
 		supportRemount         bool
+		pvcStatusSize          resource.Quantity
+		pvSize                 resource.Quantity
 	}{
 		{
 			name:       "timed out operations should result in volume marked as uncertain",
@@ -1409,6 +1410,16 @@ func Test_UncertainVolumeMountState(t *testing.T) {
 			volumeName:       volumetesting.SuccessAndFailOnSetupVolumeName,
 			supportRemount:   true,
 		},
+		{
+			name:                   "mount success but fail to expand filesystem",
+			volumeState:            operationexecutor.VolumeMountUncertain,
+			unmountDeviceCallCount: 1,
+			unmountVolumeCount:     1,
+			volumeName:             volumetesting.FailVolumeExpansion,
+			supportRemount:         true,
+			pvSize:                 resource.MustParse("10G"),
+			pvcStatusSize:          resource.MustParse("2G"),
+		},
 	}

 	modes := []v1.PersistentVolumeMode{v1.PersistentVolumeBlock, v1.PersistentVolumeFilesystem}
@@ -1431,6 +1442,11 @@ func Test_UncertainVolumeMountState(t *testing.T) {
 				VolumeMode: &mode,
 			},
 		}
+		if tc.pvSize.CmpInt64(0) > 0 {
+			pv.Spec.Capacity = v1.ResourceList{
+				v1.ResourceStorage: tc.pvSize,
+			}
+		}
 		pvc := &v1.PersistentVolumeClaim{
 			ObjectMeta: metav1.ObjectMeta{
 				Name: "pvc",
@@ -1441,6 +1457,13 @@ func Test_UncertainVolumeMountState(t *testing.T) {
 				VolumeMode: &mode,
 			},
 		}
+		if tc.pvcStatusSize.CmpInt64(0) > 0 {
+			pvc.Status = v1.PersistentVolumeClaimStatus{
+				Capacity: v1.ResourceList{
+					v1.ResourceStorage: tc.pvcStatusSize,
+				},
+			}
+		}
 		pod := &v1.Pod{
 			ObjectMeta: metav1.ObjectMeta{
 				Name: "pod1",
@@ -1525,7 +1548,7 @@ func Test_UncertainVolumeMountState(t *testing.T) {
 			}

 			if tc.volumeState == operationexecutor.VolumeMountUncertain {
-				waitForUncertainPodMount(t, volumeName, asw)
+				waitForUncertainPodMount(t, volumeName, podName, asw)
 			}

 			if tc.volumeState == operationexecutor.VolumeMounted {
@@ -1607,11 +1630,15 @@ func waitForGlobalMount(t *testing.T, volumeName v1.UniqueVolumeName, asw cache.
 	}
 }

-func waitForUncertainPodMount(t *testing.T, volumeName v1.UniqueVolumeName, asw cache.ActualStateOfWorld) {
+func waitForUncertainPodMount(t *testing.T, volumeName v1.UniqueVolumeName, podName types.UniquePodName, asw cache.ActualStateOfWorld) {
 	// check if volume is locally pod mounted in uncertain state
 	err := retryWithExponentialBackOff(
 		testOperationBackOffDuration,
 		func() (bool, error) {
+			mounted, _, err := asw.PodExistsInVolume(podName, volumeName)
+			if mounted || err != nil {
+				return false, nil
+			}
 			allMountedVolumes := asw.GetAllMountedVolumes()
 			for _, v := range allMountedVolumes {
 				if v.VolumeName == volumeName {
diff --git a/pkg/volume/testing/testing.go b/pkg/volume/testing/testing.go
index 34bb7ce1085..8ca324ec925 100644
--- a/pkg/volume/testing/testing.go
+++ b/pkg/volume/testing/testing.go
@@ -81,6 +81,8 @@ const (
 	// FailWithInUseVolumeName will cause NodeExpandVolume to result in FailedPrecondition error
 	FailWithInUseVolumeName = "fail-expansion-in-use"

+	FailVolumeExpansion = "fail-expansion-test"
+
 	deviceNotMounted     = "deviceNotMounted"
 	deviceMountUncertain = "deviceMountUncertain"
 	deviceMounted        = "deviceMounted"
@@ -467,6 +469,14 @@ func (plugin *FakeVolumePlugin) NodeExpand(resizeOptions NodeResizeOptions) (boo
 	if resizeOptions.VolumeSpec.Name() == FailWithInUseVolumeName {
 		return false, volumetypes.NewFailedPreconditionError("volume-in-use")
 	}
+	// Pretend the fakeVolumePlugin does not support STAGE_UNSTAGE, for testing the behavior
+	// where a volume must be node published before it can be resized.
+	if resizeOptions.CSIVolumePhase == volume.CSIVolumeStaged {
+		return false, nil
+	}
+	if resizeOptions.CSIVolumePhase == volume.CSIVolumePublished && resizeOptions.VolumeSpec.Name() == FailVolumeExpansion {
+		return false, fmt.Errorf("fail volume expansion for volume: %s", FailVolumeExpansion)
+	}
 	return true, nil
 }
diff --git a/pkg/volume/util/operationexecutor/operation_generator.go b/pkg/volume/util/operationexecutor/operation_generator.go
index 487c8a79bb9..74addd92928 100644
--- a/pkg/volume/util/operationexecutor/operation_generator.go
+++ b/pkg/volume/util/operationexecutor/operation_generator.go
@@ -722,6 +722,14 @@ func (og *operationGenerator) GenerateMountVolumeFunc(
 		if resizeError != nil {
 			klog.Errorf("MountVolume.NodeExpandVolume failed with %v", resizeError)
 			eventErr, detailedErr := volumeToMount.GenerateError("MountVolume.Setup failed while expanding volume", resizeError)
+			// At this point, MountVolume.Setup has already succeeded, so the volume should be added to the actual
+			// state of the world so that the reconciler can tear it down when needed. However, because the volume
+			// resize failed, the volume must not be marked as mounted, or the pod could start using it.
+			// To satisfy both, mark the volume mount as uncertain here: the reconciler will then trigger
+			// a volume tear down when the pod is deleted, and the pod never starts using the volume.
+			if err := actualStateOfWorld.MarkVolumeMountAsUncertain(markOpts); err != nil {
+				klog.Errorf(volumeToMount.GenerateErrorDetailed("MountVolume.MarkVolumeMountAsUncertain failed", err).Error())
+			}
 			return volumetypes.NewOperationContext(eventErr, detailedErr, migrated)
 		}
 	}
@@ -732,7 +740,6 @@ func (og *operationGenerator) GenerateMountVolumeFunc(
 			eventErr, detailedErr := volumeToMount.GenerateError("MountVolume.MarkVolumeAsMounted failed", markVolMountedErr)
 			return volumetypes.NewOperationContext(eventErr, detailedErr, migrated)
 		}
-
 		return volumetypes.NewOperationContext(nil, nil, migrated)
 	}
@@ -1198,6 +1205,14 @@ func (og *operationGenerator) GenerateMapVolumeFunc(
 		if resizeError != nil {
 			klog.Errorf("MapVolume.NodeExpandVolume failed with %v", resizeError)
 			eventErr, detailedErr := volumeToMount.GenerateError("MapVolume.MarkVolumeAsMounted failed while expanding volume", resizeError)
+			// At this point, the map operation has already succeeded, so the volume should be added to the actual
+			// state of the world so that the reconciler can tear it down when needed. However, because NodeExpandVolume
+			// failed, the volume must not be marked as mounted, or the pod could start using it.
+			// To satisfy both, mark the volume mount as uncertain here: the reconciler will then trigger
+			// a volume tear down when the pod is deleted, and the pod never starts using the volume.
+			if err := actualStateOfWorld.MarkVolumeMountAsUncertain(markVolumeOpts); err != nil {
+				klog.Errorf(volumeToMount.GenerateErrorDetailed("MapVolume.MarkVolumeMountAsUncertain failed", err).Error())
+			}
 			return volumetypes.NewOperationContext(eventErr, detailedErr, migrated)
 		}
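For readers outside the volume manager codebase, the ordering this patch enforces can be summarized in isolation. The sketch below is hypothetical, simplified Go (none of these types or functions are the real Kubernetes volumemanager API); it only illustrates the pattern the diff establishes: when setup succeeds but the follow-up expansion fails, the mount is recorded as uncertain rather than mounted, so a cleanup loop can still tear it down while the pod never sees the volume as ready.

// Hypothetical, simplified sketch (not the real volumemanager API) of the
// ordering enforced above: setup succeeds, expansion fails, and the mount is
// therefore recorded as "uncertain" instead of "mounted".
package main

import "fmt"

type mountState int

const (
	notMounted mountState = iota
	mountUncertain
	mounted
)

// actualState stands in for the actual-state-of-the-world cache.
type actualState struct{ state mountState }

func (a *actualState) markUncertain() { a.state = mountUncertain }
func (a *actualState) markMounted()   { a.state = mounted }

// mountAndExpand mirrors the mount-then-expand sequence: a failure after a
// successful setup must still leave a record behind for cleanup.
func mountAndExpand(setUp, expand func() error, asw *actualState) error {
	if err := setUp(); err != nil {
		return err // nothing was mounted, nothing to record
	}
	if err := expand(); err != nil {
		// Setup succeeded but expansion failed: record the mount as uncertain so
		// a reconciler can tear it down later, while the pod never treats the
		// volume as ready to use.
		asw.markUncertain()
		return err
	}
	asw.markMounted()
	return nil
}

func main() {
	asw := &actualState{}
	err := mountAndExpand(
		func() error { return nil },                                 // setup succeeds
		func() error { return fmt.Errorf("fail volume expansion") }, // expansion fails
		asw,
	)
	fmt.Println(err, asw.state == mountUncertain) // fail volume expansion true
}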