Use allocatedResources as new size for finishing volume expansion on the node.

Also add new tests for node only expansion modes
This commit is contained in:
Hemant Kumar 2024-07-22 09:59:07 -04:00
parent 201bdaa2fe
commit de5e4963c1
6 changed files with 174 additions and 45 deletions

View File

@ -36,7 +36,6 @@ type NodeExpander struct {
// computed via precheck
pvcStatusCap resource.Quantity
pvCap resource.Quantity
resizeStatus v1.ClaimResourceStatus
// indicates that if volume expansion failed on the node, then current expansion should be marked
@ -76,7 +75,6 @@ type testResponseData struct {
// it returns false.
func (ne *NodeExpander) runPreCheck() bool {
ne.pvcStatusCap = ne.pvc.Status.Capacity[v1.ResourceStorage]
ne.pvCap = ne.pv.Spec.Capacity[v1.ResourceStorage]
allocatedResourceStatus := ne.pvc.Status.AllocatedResourceStatuses
if currentStatus, ok := allocatedResourceStatus[v1.ResourceStorage]; ok {
@ -120,6 +118,7 @@ func (ne *NodeExpander) runPreCheck() bool {
func (ne *NodeExpander) expandOnPlugin() (bool, error, testResponseData) {
allowExpansion := ne.runPreCheck()
if !allowExpansion {
klog.V(3).Infof("NodeExpandVolume is not allowed to proceed for volume %s with resizeStatus %s", ne.vmt.VolumeName, ne.resizeStatus)
return false, nil, testResponseData{false, true}
}

View File

@ -2009,6 +2009,9 @@ func (og *operationGenerator) expandVolumeDuringMount(volumeToMount VolumeToMoun
actualStateOfWorld: actualStateOfWorld,
}
if og.checkForRecoveryFromExpansion(pvc, volumeToMount) {
// if recovery feature is enabled, we can use allocated size from PVC status as new size
rsOpts.NewSize = pvc.Status.AllocatedResources[v1.ResourceStorage]
resizeOp.pluginResizeOpts = rsOpts
nodeExpander := newNodeExpander(resizeOp, og.kubeClient, og.recorder)
resizeFinished, err, _ := nodeExpander.expandOnPlugin()
return resizeFinished, err
@ -2072,6 +2075,9 @@ func (og *operationGenerator) nodeExpandVolume(
}
if og.checkForRecoveryFromExpansion(pvc, volumeToMount) {
// if recovery feature is enabled, we can use allocated size from PVC status as new size
rsOpts.NewSize = pvc.Status.AllocatedResources[v1.ResourceStorage]
resizeOp.pluginResizeOpts = rsOpts
nodeExpander := newNodeExpander(resizeOp, og.kubeClient, og.recorder)
resizeFinished, err, _ := nodeExpander.expandOnPlugin()
return resizeFinished, err

View File

@ -96,6 +96,7 @@ type testParameters struct {
enableNodeExpansion bool // enable node expansion for CSI mock driver
// just disable resizing on driver it overrides enableResizing flag for CSI mock driver
disableResizingOnDriver bool
disableControllerExpansion bool
enableSnapshot bool
enableVolumeMountGroup bool // enable the VOLUME_MOUNT_GROUP node capability in the CSI mock driver.
enableNodeVolumeCondition bool
@ -172,6 +173,7 @@ func (m *mockDriverSetup) init(ctx context.Context, tp testParameters) {
EnableResizing: tp.enableResizing,
EnableNodeExpansion: tp.enableNodeExpansion,
EnableNodeVolumeCondition: tp.enableNodeVolumeCondition,
DisableControllerExpansion: tp.disableControllerExpansion,
EnableSnapshot: tp.enableSnapshot,
EnableVolumeMountGroup: tp.enableVolumeMountGroup,
TokenRequests: tp.tokenRequests,

View File

@ -45,8 +45,10 @@ type expansionStatus int
const (
expansionSuccess = iota
expansionFailedOnController
expansionFailedOnNode
expansionFailedOnControllerWithInfeasibleError
expansionFailedOnControllerWithFinalError
expansionFailedOnNodeWithInfeasibleError
expansionFailedOnNodeWithFinalError
expansionFailedMissingStagingPath
)
@ -61,12 +63,13 @@ var (
)
type recoveryTest struct {
name string
pvcRequestSize string
allocatedResource string
simulatedCSIDriverError expansionStatus
expectedResizeStatus v1.ClaimResourceStatus
recoverySize resource.Quantity
name string
pvcRequestSize string
allocatedResource string
simulatedCSIDriverError expansionStatus
disableControllerExpansion bool
expectedResizeStatus v1.ClaimResourceStatus
recoverySize resource.Quantity
}
var _ = utils.SIGDescribe("CSI Mock volume expansion", func() {
@ -400,27 +403,57 @@ var _ = utils.SIGDescribe("CSI Mock volume expansion", func() {
f.Context("Expansion with recovery", feature.RecoverVolumeExpansionFailure, func() {
tests := []recoveryTest{
{
name: "should record target size in allocated resources",
pvcRequestSize: "4Gi",
allocatedResource: "4Gi",
simulatedCSIDriverError: expansionSuccess,
expectedResizeStatus: "",
name: "should record target size in allocated resources",
pvcRequestSize: "4Gi",
allocatedResource: "4Gi",
disableControllerExpansion: false,
simulatedCSIDriverError: expansionSuccess,
expectedResizeStatus: "",
},
{
name: "should allow recovery if controller expansion fails with final error",
pvcRequestSize: "11Gi", // expansion to 11Gi will cause expansion to fail on controller
allocatedResource: "11Gi",
simulatedCSIDriverError: expansionFailedOnController,
expectedResizeStatus: v1.PersistentVolumeClaimControllerResizeInfeasible,
recoverySize: resource.MustParse("4Gi"),
name: "should allow recovery if controller expansion fails with infeasible error",
pvcRequestSize: "11Gi", // expansion to 11Gi will cause expansion to fail on controller
allocatedResource: "11Gi",
disableControllerExpansion: false,
simulatedCSIDriverError: expansionFailedOnControllerWithInfeasibleError,
expectedResizeStatus: v1.PersistentVolumeClaimControllerResizeInfeasible,
recoverySize: resource.MustParse("4Gi"),
},
{
name: "recovery should not be possible in partially expanded volumes",
pvcRequestSize: "9Gi", // expansion to 9Gi will cause expansion to fail on node
allocatedResource: "9Gi",
simulatedCSIDriverError: expansionFailedOnNode,
expectedResizeStatus: v1.PersistentVolumeClaimNodeResizeInfeasible,
recoverySize: resource.MustParse("5Gi"),
name: "should allow recovery if controller expansion fails with final error",
pvcRequestSize: "11Gi", // expansion to 11Gi will cause expansion to fail on controller
allocatedResource: "11Gi",
disableControllerExpansion: false,
simulatedCSIDriverError: expansionFailedOnControllerWithFinalError,
expectedResizeStatus: v1.PersistentVolumeClaimControllerResizeInProgress,
recoverySize: resource.MustParse("4Gi"),
},
{
name: "recovery should not be possible in partially expanded volumes",
pvcRequestSize: "9Gi", // expansion to 9Gi will cause expansion to fail on node
allocatedResource: "9Gi",
disableControllerExpansion: false,
simulatedCSIDriverError: expansionFailedOnNodeWithInfeasibleError,
expectedResizeStatus: v1.PersistentVolumeClaimNodeResizeInfeasible,
recoverySize: resource.MustParse("5Gi"),
},
{
name: "recovery should be possible for node-only expanded volumes with infeasible error",
pvcRequestSize: "9Gi", // expansion to 9Gi will cause expansion to fail on node
allocatedResource: "9Gi",
disableControllerExpansion: true,
simulatedCSIDriverError: expansionFailedOnNodeWithInfeasibleError,
expectedResizeStatus: v1.PersistentVolumeClaimNodeResizeInfeasible,
recoverySize: resource.MustParse("5Gi"),
},
{
name: "recovery should be possible for node-only expanded volumes with final error",
pvcRequestSize: "9Gi", // expansion to 9Gi will cause expansion to fail on node
allocatedResource: "9Gi",
disableControllerExpansion: true,
simulatedCSIDriverError: expansionFailedOnNodeWithFinalError,
expectedResizeStatus: v1.PersistentVolumeClaimNodeResizeInProgress,
recoverySize: resource.MustParse("5Gi"),
},
}
@ -428,7 +461,7 @@ var _ = utils.SIGDescribe("CSI Mock volume expansion", func() {
test := t
ginkgo.It(test.name, func(ctx context.Context) {
var err error
params := testParameters{enableResizing: true, enableNodeExpansion: true, enableRecoverExpansionFailure: true}
params := testParameters{enableResizing: true, enableNodeExpansion: true, enableRecoverExpansionFailure: true, disableControllerExpansion: test.disableControllerExpansion}
if test.simulatedCSIDriverError != expansionSuccess {
params.hooks = createExpansionHook(test.simulatedCSIDriverError)
@ -476,9 +509,15 @@ func validateRecoveryBehaviour(ctx context.Context, pvc *v1.PersistentVolumeClai
err = waitForAllocatedResource(ctx, pvc, m, test.allocatedResource)
framework.ExpectNoError(err, "While waiting for allocated resource to be updated")
ginkgo.By("Waiting for resizer to set resize status")
err = waitForResizeStatus(ctx, pvc, m.cs, test.expectedResizeStatus)
framework.ExpectNoError(err, "While waiting for resize status to be set")
if test.expectedResizeStatus == v1.PersistentVolumeClaimNodeResizeInfeasible {
ginkgo.By("Waiting for kubelet to fail expansion on the node")
err = waitForResizeToFailOnNode(ctx, pvc, m.cs)
framework.ExpectNoError(err, "While waiting for resize status to be set")
} else {
ginkgo.By("Waiting for resizer to set resize status")
err = waitForResizeStatus(ctx, pvc, m.cs, test.expectedResizeStatus)
framework.ExpectNoError(err, "While waiting for resize status to be set")
}
ginkgo.By("Recover pvc size")
newPVC, err := testsuites.ExpandPVCSize(ctx, pvc, test.recoverySize, m.cs)
@ -492,16 +531,25 @@ func validateRecoveryBehaviour(ctx context.Context, pvc *v1.PersistentVolumeClai
}
// if expansion failed on controller with final error, then recovery should be possible
if test.simulatedCSIDriverError == expansionFailedOnController {
if test.simulatedCSIDriverError == expansionFailedOnControllerWithInfeasibleError {
validateExpansionSuccess(ctx, pvc, m, test, test.recoverySize.String())
return
}
// if expansion failed on node with final error but volume was only expanded on the node
// then recovery should be possible
if test.disableControllerExpansion &&
(test.simulatedCSIDriverError == expansionFailedOnNodeWithInfeasibleError ||
test.simulatedCSIDriverError == expansionFailedOnNodeWithFinalError) {
validateExpansionSuccess(ctx, pvc, m, test, test.recoverySize.String())
return
}
// if expansion succeeded on controller but failed on the node
if test.simulatedCSIDriverError == expansionFailedOnNode {
if test.simulatedCSIDriverError == expansionFailedOnNodeWithInfeasibleError {
ginkgo.By("Wait for expansion to fail on node again")
err = waitForResizeStatus(ctx, pvc, m.cs, v1.PersistentVolumeClaimNodeResizeInfeasible)
framework.ExpectNoError(err, "While waiting for resize status to be set to expansion-failed-on-node")
err = waitForResizeToFailOnNode(ctx, pvc, m.cs)
framework.ExpectNoError(err, "While waiting for resize status to be set")
ginkgo.By("verify allocated resources after recovery")
pvc, err = m.cs.CoreV1().PersistentVolumeClaims(pvc.Namespace).Get(context.TODO(), pvc.Name, metav1.GetOptions{})
@ -520,11 +568,11 @@ func validateRecoveryBehaviour(ctx context.Context, pvc *v1.PersistentVolumeClai
func validateExpansionSuccess(ctx context.Context, pvc *v1.PersistentVolumeClaim, m *mockDriverSetup, test recoveryTest, expectedAllocatedSize string) {
var err error
ginkgo.By("Waiting for persistent volume resize to finish")
err = testsuites.WaitForControllerVolumeResize(ctx, pvc, m.cs, csiResizeWaitPeriod)
ginkgo.By(fmt.Sprintf("Waiting for PV %s to be expanded to %s", pvc.Spec.VolumeName, test.recoverySize.String()))
err = testsuites.WaitForRecoveryPVSize(pvc, m.cs, csiResizeWaitPeriod)
framework.ExpectNoError(err, "While waiting for PV resize to finish")
ginkgo.By("Waiting for PVC resize to finish")
ginkgo.By(fmt.Sprintf("Waiting for PVC %s to be expanded to %s", pvc.Name, test.recoverySize.String()))
pvc, err = testsuites.WaitForFSResize(ctx, pvc, m.cs)
framework.ExpectNoError(err, "while waiting for PVC to finish")
@ -561,6 +609,31 @@ func waitForResizeStatus(ctx context.Context, pvc *v1.PersistentVolumeClaim, c c
return nil
}
// waitForResizeToFailOnNode polls the PVC until its status carries a
// NodeResizeError condition, i.e. until kubelet has reported a failed
// node-side expansion. If the condition never appears within
// csiResizeWaitPeriod, the returned error includes the last observed
// set of PVC conditions to aid debugging.
func waitForResizeToFailOnNode(ctx context.Context, pvc *v1.PersistentVolumeClaim, c clientset.Interface) error {
	var lastSeenConditions []v1.PersistentVolumeClaimCondition

	waitErr := wait.PollUntilContextTimeout(ctx, resizePollInterval, csiResizeWaitPeriod, true, func(pollContext context.Context) (bool, error) {
		updatedPVC, err := c.CoreV1().PersistentVolumeClaims(pvc.Namespace).Get(pollContext, pvc.Name, metav1.GetOptions{})
		if err != nil {
			return false, fmt.Errorf("error fetching pvc %q for checking for resize status: %w", pvc.Name, err)
		}
		for _, cond := range updatedPVC.Status.Conditions {
			if cond.Type == v1.PersistentVolumeClaimNodeResizeError {
				return true, nil
			}
		}
		// Remember what we saw so a timeout error can report the actual state.
		lastSeenConditions = updatedPVC.Status.Conditions
		return false, nil
	})
	if waitErr != nil {
		return fmt.Errorf("error while waiting for resize condition sync to NodeResizeError, actualStatus %+v: %w", lastSeenConditions, waitErr)
	}
	return nil
}
func waitForAllocatedResource(ctx context.Context, pvc *v1.PersistentVolumeClaim, m *mockDriverSetup, expectedSize string) error {
expectedQuantity := resource.MustParse(expectedSize)
waitErr := wait.PollUntilContextTimeout(ctx, resizePollInterval, csiResizeWaitPeriod, true, func(pollContext context.Context) (bool, error) {
@ -596,7 +669,7 @@ func createExpansionHook(expectedExpansionStatus expansionStatus) *drivers.Hooks
}
}
case expansionFailedOnController:
case expansionFailedOnControllerWithInfeasibleError:
expansionRequest, ok := request.(*csipbv1.ControllerExpandVolumeRequest)
if ok {
requestedSize := resource.NewQuantity(expansionRequest.CapacityRange.RequiredBytes, resource.BinarySI)
@ -604,7 +677,16 @@ func createExpansionHook(expectedExpansionStatus expansionStatus) *drivers.Hooks
return nil, status.Error(codes.InvalidArgument, "invalid expansion request")
}
}
case expansionFailedOnNode:
case expansionFailedOnControllerWithFinalError:
// This simulates a condition that a final, but not infeasible error is returned when expansion fails in the controller.
expansionRequest, ok := request.(*csipbv1.ControllerExpandVolumeRequest)
if ok {
requestedSize := resource.NewQuantity(expansionRequest.CapacityRange.RequiredBytes, resource.BinarySI)
if requestedSize.Cmp(maxControllerSizeLimit) > 0 {
return nil, status.Error(codes.PermissionDenied, "permission denied for expansion")
}
}
case expansionFailedOnNodeWithInfeasibleError:
expansionRequest, ok := request.(*csipbv1.NodeExpandVolumeRequest)
if ok {
requestedSize := resource.NewQuantity(expansionRequest.CapacityRange.RequiredBytes, resource.BinarySI)
@ -613,6 +695,14 @@ func createExpansionHook(expectedExpansionStatus expansionStatus) *drivers.Hooks
}
}
case expansionFailedOnNodeWithFinalError:
expansionRequest, ok := request.(*csipbv1.NodeExpandVolumeRequest)
if ok {
requestedSize := resource.NewQuantity(expansionRequest.CapacityRange.RequiredBytes, resource.BinarySI)
if requestedSize.Cmp(maxNodeExpansionLimit) > 0 {
return nil, status.Error(codes.PermissionDenied, "permission denied for expansion")
}
}
}
return nil, nil

View File

@ -350,6 +350,7 @@ type mockCSIDriver struct {
embeddedCSIDriver *mockdriver.CSIDriver
enableSELinuxMount *bool
enableRecoverExpansionFailure bool
disableControllerExpansion bool
enableHonorPVReclaimPolicy bool
// Additional values set during PrepareTest
@ -392,6 +393,7 @@ type CSIMockDriverOpts struct {
EnableTopology bool
EnableResizing bool
EnableNodeExpansion bool
DisableControllerExpansion bool
EnableSnapshot bool
EnableVolumeMountGroup bool
EnableNodeVolumeCondition bool
@ -550,6 +552,7 @@ func InitMockCSIDriver(driverOpts CSIMockDriverOpts) MockCSITestDriver {
attachLimit: driverOpts.AttachLimit,
enableNodeExpansion: driverOpts.EnableNodeExpansion,
enableNodeVolumeCondition: driverOpts.EnableNodeVolumeCondition,
disableControllerExpansion: driverOpts.DisableControllerExpansion,
tokenRequests: driverOpts.TokenRequests,
requiresRepublish: driverOpts.RequiresRepublish,
fsGroupPolicy: driverOpts.FSGroupPolicy,
@ -628,6 +631,7 @@ func (m *mockCSIDriver) PrepareTest(ctx context.Context, f *framework.Framework)
DriverName: "csi-mock-" + f.UniqueName,
AttachLimit: int64(m.attachLimit),
NodeExpansionRequired: m.enableNodeExpansion,
DisableControllerExpansion: m.disableControllerExpansion,
NodeVolumeConditionRequired: m.enableNodeVolumeCondition,
VolumeMountGroupRequired: m.enableVolumeMountGroup,
EnableTopology: m.enableTopology,

View File

@ -318,15 +318,15 @@ func ExpandPVCSize(ctx context.Context, origPVC *v1.PersistentVolumeClaim, size
// Retry the update on error, until we hit a timeout.
// TODO: Determine whether "retry with timeout" is appropriate here. Maybe we should only retry on version conflict.
var lastUpdateError error
waitErr := wait.PollImmediate(resizePollInterval, 30*time.Second, func() (bool, error) {
waitErr := wait.PollUntilContextTimeout(ctx, resizePollInterval, 30*time.Second, true, func(pollContext context.Context) (bool, error) {
var err error
updatedPVC, err = c.CoreV1().PersistentVolumeClaims(origPVC.Namespace).Get(ctx, pvcName, metav1.GetOptions{})
updatedPVC, err = c.CoreV1().PersistentVolumeClaims(origPVC.Namespace).Get(pollContext, pvcName, metav1.GetOptions{})
if err != nil {
return false, fmt.Errorf("error fetching pvc %q for resizing: %w", pvcName, err)
}
updatedPVC.Spec.Resources.Requests[v1.ResourceStorage] = size
updatedPVC, err = c.CoreV1().PersistentVolumeClaims(origPVC.Namespace).Update(ctx, updatedPVC, metav1.UpdateOptions{})
updatedPVC, err = c.CoreV1().PersistentVolumeClaims(origPVC.Namespace).Update(pollContext, updatedPVC, metav1.UpdateOptions{})
if err != nil {
framework.Logf("Error updating pvc %s: %v", pvcName, err)
lastUpdateError = err
@ -392,6 +392,34 @@ func WaitForControllerVolumeResize(ctx context.Context, pvc *v1.PersistentVolume
return nil
}
// WaitForRecoveryPVSize waits until the PV's capacity exactly equals the size
// requested in the PVC spec. This is different from WaitForControllerVolumeResize,
// which only waits for the PV to report a size at least as big as requested;
// during recovery the requested size is being reduced, so equality is required.
//
// Parameters:
//   - pvc: claim whose spec request is the expected final size; pvc.Spec.VolumeName
//     must identify the bound PV.
//   - c: client used to poll the PV.
//   - timeout: maximum time to wait for the resize to finish.
func WaitForRecoveryPVSize(pvc *v1.PersistentVolumeClaim, c clientset.Interface, timeout time.Duration) error {
	pvName := pvc.Spec.VolumeName
	// The expected size is loop-invariant; read it once outside the poll.
	pvcSpecSize := pvc.Spec.Resources.Requests.Storage()
	waitErr := wait.PollUntilContextTimeout(context.Background(), resizePollInterval, timeout, true, func(pollContext context.Context) (bool, error) {
		pv, err := c.CoreV1().PersistentVolumes().Get(pollContext, pvName, metav1.GetOptions{})
		if err != nil {
			return false, fmt.Errorf("error fetching pv %q for resizing: %w", pvName, err)
		}
		pvSize := pv.Spec.Capacity[v1.ResourceStorage]
		// Control-plane expansion (or recovery shrink) is finished once the PV
		// capacity equals the size requested in the PVC spec.
		if pvSize.Cmp(*pvcSpecSize) == 0 {
			return true, nil
		}
		return false, nil
	})
	if waitErr != nil {
		return fmt.Errorf("error while waiting for controller resize to finish: %w", waitErr)
	}
	return nil
}
// WaitForPendingFSResizeCondition waits for pvc to have resize condition
func WaitForPendingFSResizeCondition(ctx context.Context, pvc *v1.PersistentVolumeClaim, c clientset.Interface) (*v1.PersistentVolumeClaim, error) {
var updatedPVC *v1.PersistentVolumeClaim
@ -423,9 +451,9 @@ func WaitForPendingFSResizeCondition(ctx context.Context, pvc *v1.PersistentVolu
// WaitForFSResize waits for the filesystem in the pv to be resized
func WaitForFSResize(ctx context.Context, pvc *v1.PersistentVolumeClaim, c clientset.Interface) (*v1.PersistentVolumeClaim, error) {
var updatedPVC *v1.PersistentVolumeClaim
waitErr := wait.PollUntilContextTimeout(ctx, resizePollInterval, totalResizeWaitPeriod, true, func(ctx context.Context) (bool, error) {
waitErr := wait.PollImmediate(resizePollInterval, totalResizeWaitPeriod, func() (bool, error) {
var err error
updatedPVC, err = c.CoreV1().PersistentVolumeClaims(pvc.Namespace).Get(ctx, pvc.Name, metav1.GetOptions{})
updatedPVC, err = c.CoreV1().PersistentVolumeClaims(pvc.Namespace).Get(context.TODO(), pvc.Name, metav1.GetOptions{})
if err != nil {
return false, fmt.Errorf("error fetching pvc %q for checking for resize status : %w", pvc.Name, err)