diff --git a/pkg/volume/util/operationexecutor/node_expander.go b/pkg/volume/util/operationexecutor/node_expander.go
index 19547c94e09..fb68ad8831b 100644
--- a/pkg/volume/util/operationexecutor/node_expander.go
+++ b/pkg/volume/util/operationexecutor/node_expander.go
@@ -36,7 +36,6 @@ type NodeExpander struct {
 
 	// computed via precheck
 	pvcStatusCap resource.Quantity
-	pvCap        resource.Quantity
 	resizeStatus v1.ClaimResourceStatus
 
 	// indicates that if volume expansion failed on the node, then current expansion should be marked
@@ -76,7 +75,6 @@ type testResponseData struct {
 // it returns false.
 func (ne *NodeExpander) runPreCheck() bool {
 	ne.pvcStatusCap = ne.pvc.Status.Capacity[v1.ResourceStorage]
-	ne.pvCap = ne.pv.Spec.Capacity[v1.ResourceStorage]
 
 	allocatedResourceStatus := ne.pvc.Status.AllocatedResourceStatuses
 	if currentStatus, ok := allocatedResourceStatus[v1.ResourceStorage]; ok {
@@ -120,6 +118,7 @@ func (ne *NodeExpander) runPreCheck() bool {
 func (ne *NodeExpander) expandOnPlugin() (bool, error, testResponseData) {
 	allowExpansion := ne.runPreCheck()
 	if !allowExpansion {
+		klog.V(3).Infof("NodeExpandVolume is not allowed to proceed for volume %s with resizeStatus %s", ne.vmt.VolumeName, ne.resizeStatus)
 		return false, nil, testResponseData{false, true}
 	}
 
diff --git a/pkg/volume/util/operationexecutor/operation_generator.go b/pkg/volume/util/operationexecutor/operation_generator.go
index e246719c23e..d54de2cc2f0 100644
--- a/pkg/volume/util/operationexecutor/operation_generator.go
+++ b/pkg/volume/util/operationexecutor/operation_generator.go
@@ -2009,6 +2009,9 @@ func (og *operationGenerator) expandVolumeDuringMount(volumeToMount VolumeToMoun
 		actualStateOfWorld: actualStateOfWorld,
 	}
 	if og.checkForRecoveryFromExpansion(pvc, volumeToMount) {
+		// if recovery feature is enabled, we can use allocated size from PVC status as new size
+		rsOpts.NewSize = pvc.Status.AllocatedResources[v1.ResourceStorage]
+		resizeOp.pluginResizeOpts = rsOpts
 		nodeExpander := newNodeExpander(resizeOp, og.kubeClient, og.recorder)
 		resizeFinished, err, _ := nodeExpander.expandOnPlugin()
 		return resizeFinished, err
@@ -2072,6 +2075,9 @@ func (og *operationGenerator) nodeExpandVolume(
 	}
 
 	if og.checkForRecoveryFromExpansion(pvc, volumeToMount) {
+		// if recovery feature is enabled, we can use allocated size from PVC status as new size
+		rsOpts.NewSize = pvc.Status.AllocatedResources[v1.ResourceStorage]
+		resizeOp.pluginResizeOpts = rsOpts
 		nodeExpander := newNodeExpander(resizeOp, og.kubeClient, og.recorder)
 		resizeFinished, err, _ := nodeExpander.expandOnPlugin()
 		return resizeFinished, err
diff --git a/test/e2e/storage/csimock/base.go b/test/e2e/storage/csimock/base.go
index 71cfeb8bcbd..dcad86bd06a 100644
--- a/test/e2e/storage/csimock/base.go
+++ b/test/e2e/storage/csimock/base.go
@@ -96,6 +96,7 @@ type testParameters struct {
 	enableNodeExpansion bool // enable node expansion for CSI mock driver
 	// just disable resizing on driver it overrides enableResizing flag for CSI mock driver
 	disableResizingOnDriver    bool
+	disableControllerExpansion bool
 	enableSnapshot             bool
 	enableVolumeMountGroup     bool // enable the VOLUME_MOUNT_GROUP node capability in the CSI mock driver.
 	enableNodeVolumeCondition  bool
@@ -172,6 +173,7 @@ func (m *mockDriverSetup) init(ctx context.Context, tp testParameters) {
 		EnableResizing:             tp.enableResizing,
 		EnableNodeExpansion:        tp.enableNodeExpansion,
 		EnableNodeVolumeCondition:  tp.enableNodeVolumeCondition,
+		DisableControllerExpansion: tp.disableControllerExpansion,
 		EnableSnapshot:             tp.enableSnapshot,
 		EnableVolumeMountGroup:     tp.enableVolumeMountGroup,
 		TokenRequests:              tp.tokenRequests,
diff --git a/test/e2e/storage/csimock/csi_volume_expansion.go b/test/e2e/storage/csimock/csi_volume_expansion.go
index 47ae8be9884..2460b778702 100644
--- a/test/e2e/storage/csimock/csi_volume_expansion.go
+++ b/test/e2e/storage/csimock/csi_volume_expansion.go
@@ -45,8 +45,10 @@ type expansionStatus int
 
 const (
 	expansionSuccess = iota
-	expansionFailedOnController
-	expansionFailedOnNode
+	expansionFailedOnControllerWithInfeasibleError
+	expansionFailedOnControllerWithFinalError
+	expansionFailedOnNodeWithInfeasibleError
+	expansionFailedOnNodeWithFinalError
 	expansionFailedMissingStagingPath
 )
 
@@ -61,12 +63,13 @@ var (
 )
 
 type recoveryTest struct {
-	name                    string
-	pvcRequestSize          string
-	allocatedResource       string
-	simulatedCSIDriverError expansionStatus
-	expectedResizeStatus    v1.ClaimResourceStatus
-	recoverySize            resource.Quantity
+	name                       string
+	pvcRequestSize             string
+	allocatedResource          string
+	simulatedCSIDriverError    expansionStatus
+	disableControllerExpansion bool
+	expectedResizeStatus       v1.ClaimResourceStatus
+	recoverySize               resource.Quantity
 }
 
 var _ = utils.SIGDescribe("CSI Mock volume expansion", func() {
@@ -400,27 +403,57 @@ var _ = utils.SIGDescribe("CSI Mock volume expansion", func() {
 	f.Context("Expansion with recovery", feature.RecoverVolumeExpansionFailure, func() {
 		tests := []recoveryTest{
 			{
-				name:                    "should record target size in allocated resources",
-				pvcRequestSize:          "4Gi",
-				allocatedResource:       "4Gi",
-				simulatedCSIDriverError: expansionSuccess,
-				expectedResizeStatus:    "",
+				name:                       "should record target size in allocated resources",
+				pvcRequestSize:             "4Gi",
+				allocatedResource:          "4Gi",
+				disableControllerExpansion: false,
+				simulatedCSIDriverError:    expansionSuccess,
+				expectedResizeStatus:       "",
 			},
 			{
-				name:                    "should allow recovery if controller expansion fails with final error",
-				pvcRequestSize:          "11Gi", // expansion to 11Gi will cause expansion to fail on controller
-				allocatedResource:       "11Gi",
-				simulatedCSIDriverError: expansionFailedOnController,
-				expectedResizeStatus:    v1.PersistentVolumeClaimControllerResizeInfeasible,
-				recoverySize:            resource.MustParse("4Gi"),
+				name:                       "should allow recovery if controller expansion fails with infeasible error",
+				pvcRequestSize:             "11Gi", // expansion to 11Gi will cause expansion to fail on controller
+				allocatedResource:          "11Gi",
+				disableControllerExpansion: false,
+				simulatedCSIDriverError:    expansionFailedOnControllerWithInfeasibleError,
+				expectedResizeStatus:       v1.PersistentVolumeClaimControllerResizeInfeasible,
+				recoverySize:               resource.MustParse("4Gi"),
 			},
 			{
-				name:                    "recovery should not be possible in partially expanded volumes",
-				pvcRequestSize:          "9Gi", // expansion to 9Gi will cause expansion to fail on node
-				allocatedResource:       "9Gi",
-				simulatedCSIDriverError: expansionFailedOnNode,
-				expectedResizeStatus:    v1.PersistentVolumeClaimNodeResizeInfeasible,
-				recoverySize:            resource.MustParse("5Gi"),
+				name:                       "should allow recovery if controller expansion fails with final error",
+				pvcRequestSize:             "11Gi", // expansion to 11Gi will cause expansion to fail on controller
+				allocatedResource:          "11Gi",
+				disableControllerExpansion: false,
+				simulatedCSIDriverError:    expansionFailedOnControllerWithFinalError,
+				expectedResizeStatus:       v1.PersistentVolumeClaimControllerResizeInProgress,
+				recoverySize:               resource.MustParse("4Gi"),
+			},
+			{
+				name:                       "recovery should not be possible in partially expanded volumes",
+				pvcRequestSize:             "9Gi", // expansion to 9Gi will cause expansion to fail on node
+				allocatedResource:          "9Gi",
+				disableControllerExpansion: false,
+				simulatedCSIDriverError:    expansionFailedOnNodeWithInfeasibleError,
+				expectedResizeStatus:       v1.PersistentVolumeClaimNodeResizeInfeasible,
+				recoverySize:               resource.MustParse("5Gi"),
+			},
+			{
+				name:                       "recovery should be possible for node-only expanded volumes with infeasible error",
+				pvcRequestSize:             "9Gi", // expansion to 9Gi will cause expansion to fail on node
+				allocatedResource:          "9Gi",
+				disableControllerExpansion: true,
+				simulatedCSIDriverError:    expansionFailedOnNodeWithInfeasibleError,
+				expectedResizeStatus:       v1.PersistentVolumeClaimNodeResizeInfeasible,
+				recoverySize:               resource.MustParse("5Gi"),
+			},
+			{
+				name:                       "recovery should be possible for node-only expanded volumes with final error",
+				pvcRequestSize:             "9Gi", // expansion to 9Gi will cause expansion to fail on node
+				allocatedResource:          "9Gi",
+				disableControllerExpansion: true,
+				simulatedCSIDriverError:    expansionFailedOnNodeWithFinalError,
+				expectedResizeStatus:       v1.PersistentVolumeClaimNodeResizeInProgress,
+				recoverySize:               resource.MustParse("5Gi"),
 			},
 		}
 
@@ -428,7 +461,7 @@ var _ = utils.SIGDescribe("CSI Mock volume expansion", func() {
 		test := t
 		ginkgo.It(test.name, func(ctx context.Context) {
 			var err error
-			params := testParameters{enableResizing: true, enableNodeExpansion: true, enableRecoverExpansionFailure: true}
+			params := testParameters{enableResizing: true, enableNodeExpansion: true, enableRecoverExpansionFailure: true, disableControllerExpansion: test.disableControllerExpansion}
 
 			if test.simulatedCSIDriverError != expansionSuccess {
 				params.hooks = createExpansionHook(test.simulatedCSIDriverError)
@@ -476,9 +509,15 @@ func validateRecoveryBehaviour(ctx context.Context, pvc *v1.PersistentVolumeClai
 	err = waitForAllocatedResource(ctx, pvc, m, test.allocatedResource)
 	framework.ExpectNoError(err, "While waiting for allocated resource to be updated")
 
-	ginkgo.By("Waiting for resizer to set resize status")
-	err = waitForResizeStatus(ctx, pvc, m.cs, test.expectedResizeStatus)
-	framework.ExpectNoError(err, "While waiting for resize status to be set")
+	if test.expectedResizeStatus == v1.PersistentVolumeClaimNodeResizeInfeasible {
+		ginkgo.By("Waiting for kubelet to fail expansion on the node")
+		err = waitForResizeToFailOnNode(ctx, pvc, m.cs)
+		framework.ExpectNoError(err, "While waiting for resize status to be set")
+	} else {
+		ginkgo.By("Waiting for resizer to set resize status")
+		err = waitForResizeStatus(ctx, pvc, m.cs, test.expectedResizeStatus)
+		framework.ExpectNoError(err, "While waiting for resize status to be set")
+	}
 
 	ginkgo.By("Recover pvc size")
 	newPVC, err := testsuites.ExpandPVCSize(ctx, pvc, test.recoverySize, m.cs)
@@ -492,16 +531,25 @@ func validateRecoveryBehaviour(ctx context.Context, pvc *v1.PersistentVolumeClai
 	}
 
 	// if expansion failed on controller with final error, then recovery should be possible
-	if test.simulatedCSIDriverError == expansionFailedOnController {
+	if test.simulatedCSIDriverError == expansionFailedOnControllerWithInfeasibleError {
+		validateExpansionSuccess(ctx, pvc, m, test, test.recoverySize.String())
+		return
+	}
+
+	// if expansion failed on node with final error but volume was only expanded on the node
+	// then recovery should be possible
+	if test.disableControllerExpansion &&
+		(test.simulatedCSIDriverError == expansionFailedOnNodeWithInfeasibleError ||
+			test.simulatedCSIDriverError == expansionFailedOnNodeWithFinalError) {
 		validateExpansionSuccess(ctx, pvc, m, test, test.recoverySize.String())
 		return
 	}
 
 	// if expansion succeeded on controller but failed on the node
-	if test.simulatedCSIDriverError == expansionFailedOnNode {
+	if test.simulatedCSIDriverError == expansionFailedOnNodeWithInfeasibleError {
 		ginkgo.By("Wait for expansion to fail on node again")
-		err = waitForResizeStatus(ctx, pvc, m.cs, v1.PersistentVolumeClaimNodeResizeInfeasible)
-		framework.ExpectNoError(err, "While waiting for resize status to be set to expansion-failed-on-node")
+		err = waitForResizeToFailOnNode(ctx, pvc, m.cs)
+		framework.ExpectNoError(err, "While waiting for resize status to be set")
 
 		ginkgo.By("verify allocated resources after recovery")
 		pvc, err = m.cs.CoreV1().PersistentVolumeClaims(pvc.Namespace).Get(context.TODO(), pvc.Name, metav1.GetOptions{})
@@ -520,11 +568,11 @@ func validateRecoveryBehaviour(ctx context.Context, pvc *v1.PersistentVolumeClai
 
 func validateExpansionSuccess(ctx context.Context, pvc *v1.PersistentVolumeClaim, m *mockDriverSetup, test recoveryTest, expectedAllocatedSize string) {
 	var err error
-	ginkgo.By("Waiting for persistent volume resize to finish")
-	err = testsuites.WaitForControllerVolumeResize(ctx, pvc, m.cs, csiResizeWaitPeriod)
+	ginkgo.By(fmt.Sprintf("Waiting for PV %s to be expanded to %s", pvc.Spec.VolumeName, test.recoverySize.String()))
+	err = testsuites.WaitForRecoveryPVSize(pvc, m.cs, csiResizeWaitPeriod)
 	framework.ExpectNoError(err, "While waiting for PV resize to finish")
 
-	ginkgo.By("Waiting for PVC resize to finish")
+	ginkgo.By(fmt.Sprintf("Waiting for PVC %s to be expanded to %s", pvc.Name, test.recoverySize.String()))
 	pvc, err = testsuites.WaitForFSResize(ctx, pvc, m.cs)
 	framework.ExpectNoError(err, "while waiting for PVC to finish")
 
@@ -561,6 +609,31 @@ func waitForResizeStatus(ctx context.Context, pvc *v1.PersistentVolumeClaim, c c
 	return nil
 }
 
+func waitForResizeToFailOnNode(ctx context.Context, pvc *v1.PersistentVolumeClaim, c clientset.Interface) error {
+	var finalConditions []v1.PersistentVolumeClaimCondition
+	waitErr := wait.PollUntilContextTimeout(ctx, resizePollInterval, csiResizeWaitPeriod, true, func(pollContext context.Context) (bool, error) {
+		var err error
+		updatedPVC, err := c.CoreV1().PersistentVolumeClaims(pvc.Namespace).Get(pollContext, pvc.Name, metav1.GetOptions{})
+
+		if err != nil {
+			return false, fmt.Errorf("error fetching pvc %q for checking for resize status: %w", pvc.Name, err)
+		}
+		pvcConditions := updatedPVC.Status.Conditions
+		for _, cond := range pvcConditions {
+			if cond.Type == v1.PersistentVolumeClaimNodeResizeError {
+				return true, nil
+			}
+		}
+		finalConditions = pvcConditions
+		return false, nil
+	})
+
+	if waitErr != nil {
+		return fmt.Errorf("error while waiting for resize condition sync to NodeResizeError, actualStatus %+v: %w", finalConditions, waitErr)
+	}
+	return nil
+}
+
 func waitForAllocatedResource(ctx context.Context, pvc *v1.PersistentVolumeClaim, m *mockDriverSetup, expectedSize string) error {
 	expectedQuantity := resource.MustParse(expectedSize)
 	waitErr := wait.PollUntilContextTimeout(ctx, resizePollInterval, csiResizeWaitPeriod, true, func(pollContext context.Context) (bool, error) {
@@ -596,7 +669,7 @@ func createExpansionHook(expectedExpansionStatus expansionStatus) *drivers.Hooks
 				}
 			}
-		case expansionFailedOnController:
+		case expansionFailedOnControllerWithInfeasibleError:
 			expansionRequest, ok := request.(*csipbv1.ControllerExpandVolumeRequest)
 			if ok {
 				requestedSize := resource.NewQuantity(expansionRequest.CapacityRange.RequiredBytes, resource.BinarySI)
@@ -604,7 +677,16 @@ func createExpansionHook(expectedExpansionStatus expansionStatus) *drivers.Hooks
 					return nil, status.Error(codes.InvalidArgument, "invalid expansion request")
 				}
 			}
-		case expansionFailedOnNode:
+		case expansionFailedOnControllerWithFinalError:
+			// This simulates a condition that a final, but not infeasible error is returned when expansion fails in the controller.
+			expansionRequest, ok := request.(*csipbv1.ControllerExpandVolumeRequest)
+			if ok {
+				requestedSize := resource.NewQuantity(expansionRequest.CapacityRange.RequiredBytes, resource.BinarySI)
+				if requestedSize.Cmp(maxControllerSizeLimit) > 0 {
+					return nil, status.Error(codes.PermissionDenied, "permission denied for expansion")
+				}
+			}
+		case expansionFailedOnNodeWithInfeasibleError:
 			expansionRequest, ok := request.(*csipbv1.NodeExpandVolumeRequest)
 			if ok {
 				requestedSize := resource.NewQuantity(expansionRequest.CapacityRange.RequiredBytes, resource.BinarySI)
@@ -613,6 +695,14 @@ func createExpansionHook(expectedExpansionStatus expansionStatus) *drivers.Hooks
 				}
 			}
+		case expansionFailedOnNodeWithFinalError:
+			expansionRequest, ok := request.(*csipbv1.NodeExpandVolumeRequest)
+			if ok {
+				requestedSize := resource.NewQuantity(expansionRequest.CapacityRange.RequiredBytes, resource.BinarySI)
+				if requestedSize.Cmp(maxNodeExpansionLimit) > 0 {
+					return nil, status.Error(codes.PermissionDenied, "permission denied for expansion")
+				}
+			}
 		}
 
 		return nil, nil
diff --git a/test/e2e/storage/drivers/csi.go b/test/e2e/storage/drivers/csi.go
index f11964a0bb3..c7ce8a7663e 100644
--- a/test/e2e/storage/drivers/csi.go
+++ b/test/e2e/storage/drivers/csi.go
@@ -350,6 +350,7 @@ type mockCSIDriver struct {
 	embeddedCSIDriver             *mockdriver.CSIDriver
 	enableSELinuxMount            *bool
 	enableRecoverExpansionFailure bool
+	disableControllerExpansion    bool
 	enableHonorPVReclaimPolicy    bool
 
 	// Additional values set during PrepareTest
@@ -392,6 +393,7 @@ type CSIMockDriverOpts struct {
 	EnableTopology             bool
 	EnableResizing             bool
 	EnableNodeExpansion        bool
+	DisableControllerExpansion bool
 	EnableSnapshot             bool
 	EnableVolumeMountGroup     bool
 	EnableNodeVolumeCondition  bool
@@ -550,6 +552,7 @@ func InitMockCSIDriver(driverOpts CSIMockDriverOpts) MockCSITestDriver {
 		attachLimit:                driverOpts.AttachLimit,
 		enableNodeExpansion:        driverOpts.EnableNodeExpansion,
 		enableNodeVolumeCondition:  driverOpts.EnableNodeVolumeCondition,
+		disableControllerExpansion: driverOpts.DisableControllerExpansion,
 		tokenRequests:              driverOpts.TokenRequests,
 		requiresRepublish:          driverOpts.RequiresRepublish,
 		fsGroupPolicy:              driverOpts.FSGroupPolicy,
@@ -628,6 +631,7 @@ func (m *mockCSIDriver) PrepareTest(ctx context.Context, f *framework.Framework)
 		DriverName:                  "csi-mock-" + f.UniqueName,
 		AttachLimit:                 int64(m.attachLimit),
 		NodeExpansionRequired:       m.enableNodeExpansion,
+		DisableControllerExpansion:  m.disableControllerExpansion,
 		NodeVolumeConditionRequired: m.enableNodeVolumeCondition,
 		VolumeMountGroupRequired:    m.enableVolumeMountGroup,
 		EnableTopology:              m.enableTopology,
diff --git a/test/e2e/storage/testsuites/volume_expand.go b/test/e2e/storage/testsuites/volume_expand.go
index 8f64ec4bd49..92df4f36481 100644
--- a/test/e2e/storage/testsuites/volume_expand.go
+++ b/test/e2e/storage/testsuites/volume_expand.go
@@ -318,15 +318,15 @@ func ExpandPVCSize(ctx context.Context, origPVC *v1.PersistentVolumeClaim, size
 	// Retry the update on error, until we hit a timeout.
 	// TODO: Determine whether "retry with timeout" is appropriate here. Maybe we should only retry on version conflict.
 	var lastUpdateError error
-	waitErr := wait.PollImmediate(resizePollInterval, 30*time.Second, func() (bool, error) {
+	waitErr := wait.PollUntilContextTimeout(ctx, resizePollInterval, 30*time.Second, true, func(pollContext context.Context) (bool, error) {
 		var err error
-		updatedPVC, err = c.CoreV1().PersistentVolumeClaims(origPVC.Namespace).Get(ctx, pvcName, metav1.GetOptions{})
+		updatedPVC, err = c.CoreV1().PersistentVolumeClaims(origPVC.Namespace).Get(pollContext, pvcName, metav1.GetOptions{})
 		if err != nil {
 			return false, fmt.Errorf("error fetching pvc %q for resizing: %w", pvcName, err)
 		}
 
 		updatedPVC.Spec.Resources.Requests[v1.ResourceStorage] = size
-		updatedPVC, err = c.CoreV1().PersistentVolumeClaims(origPVC.Namespace).Update(ctx, updatedPVC, metav1.UpdateOptions{})
+		updatedPVC, err = c.CoreV1().PersistentVolumeClaims(origPVC.Namespace).Update(pollContext, updatedPVC, metav1.UpdateOptions{})
 		if err != nil {
 			framework.Logf("Error updating pvc %s: %v", pvcName, err)
 			lastUpdateError = err
@@ -392,6 +392,34 @@ func WaitForControllerVolumeResize(ctx context.Context, pvc *v1.PersistentVolume
 	return nil
 }
 
+// WaitForRecoveryPVSize waits for the controller resize to finish when we are reducing the requested volume size.
+// This is different from WaitForControllerVolumeResize, which only waits for the PV to report a size at least as
+// large as the requested size.
+func WaitForRecoveryPVSize(pvc *v1.PersistentVolumeClaim, c clientset.Interface, timeout time.Duration) error {
+	pvName := pvc.Spec.VolumeName
+	waitErr := wait.PollImmediate(resizePollInterval, timeout, func() (bool, error) {
+		pvcSpecSize := pvc.Spec.Resources.Requests.Storage()
+
+		pv, err := c.CoreV1().PersistentVolumes().Get(context.TODO(), pvName, metav1.GetOptions{})
+		if err != nil {
+			return false, fmt.Errorf("error fetching pv %q for resizing %v", pvName, err)
+		}
+
+		pvSize := pv.Spec.Capacity[v1.ResourceStorage]
+
+		// If pv size matches the size requested in pvc's spec, that means control-plane
+		// volume expansion is finished.
+		if pvSize.Cmp(*pvcSpecSize) == 0 {
+			return true, nil
+		}
+		return false, nil
+	})
+	if waitErr != nil {
+		return fmt.Errorf("error while waiting for controller resize to finish: %v", waitErr)
+	}
+	return nil
+}
+
 // WaitForPendingFSResizeCondition waits for pvc to have resize condition
 func WaitForPendingFSResizeCondition(ctx context.Context, pvc *v1.PersistentVolumeClaim, c clientset.Interface) (*v1.PersistentVolumeClaim, error) {
 	var updatedPVC *v1.PersistentVolumeClaim
@@ -423,9 +451,9 @@ func WaitForPendingFSResizeCondition(ctx context.Context, pvc *v1.PersistentVolu
 // WaitForFSResize waits for the filesystem in the pv to be resized
 func WaitForFSResize(ctx context.Context, pvc *v1.PersistentVolumeClaim, c clientset.Interface) (*v1.PersistentVolumeClaim, error) {
 	var updatedPVC *v1.PersistentVolumeClaim
-	waitErr := wait.PollUntilContextTimeout(ctx, resizePollInterval, totalResizeWaitPeriod, true, func(ctx context.Context) (bool, error) {
+	waitErr := wait.PollImmediate(resizePollInterval, totalResizeWaitPeriod, func() (bool, error) {
 		var err error
-		updatedPVC, err = c.CoreV1().PersistentVolumeClaims(pvc.Namespace).Get(ctx, pvc.Name, metav1.GetOptions{})
+		updatedPVC, err = c.CoreV1().PersistentVolumeClaims(pvc.Namespace).Get(context.TODO(), pvc.Name, metav1.GetOptions{})
 		if err != nil {
 			return false, fmt.Errorf("error fetching pvc %q for checking for resize status : %w", pvc.Name, err)
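
Note (not part of the patch): the expansion hook distinguishes an "infeasible" failure from a merely "final" one purely by the gRPC status code it returns, and the test table encodes the expected PVC status for each case (InvalidArgument leads to ControllerResizeInfeasible/NodeResizeInfeasible, PermissionDenied leaves the claim in ControllerResizeInProgress/NodeResizeInProgress). The following is a minimal, self-contained Go sketch of the two error shapes; the mapping described in the comments is taken from the test expectations above, not from resizer or kubelet source.

package main

import (
	"fmt"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

func main() {
	// Returned by the *WithInfeasibleError hook cases; the tests above expect the
	// PVC to end up with resize status ControllerResizeInfeasible / NodeResizeInfeasible.
	infeasible := status.Error(codes.InvalidArgument, "invalid expansion request")

	// Returned by the *WithFinalError hook cases; the tests above expect the PVC to
	// stay in ControllerResizeInProgress / NodeResizeInProgress, while recovery to a
	// smaller size still succeeds when only node expansion is involved.
	final := status.Error(codes.PermissionDenied, "permission denied for expansion")

	fmt.Println(status.Code(infeasible), status.Code(final)) // InvalidArgument PermissionDenied
}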