From b92c68b61f99cfc8ee0b3e6737e7e53c9abb842b Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Tue, 9 Jul 2024 12:37:52 -0400 Subject: [PATCH 01/15] Add new conditions for indicating node and controller resize failures --- pkg/apis/core/types.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pkg/apis/core/types.go b/pkg/apis/core/types.go index 49f36e1a735..064cbdeb12b 100644 --- a/pkg/apis/core/types.go +++ b/pkg/apis/core/types.go @@ -544,6 +544,11 @@ const ( // PersistentVolumeClaimFileSystemResizePending - controller resize is finished and a file system resize is pending on node PersistentVolumeClaimFileSystemResizePending PersistentVolumeClaimConditionType = "FileSystemResizePending" + // PersistentVolumeClaimControllerResizeError indicates an error while resizing volume for size in the controller + PersistentVolumeClaimControllerResizeError PersistentVolumeClaimConditionType = "ControllerResizeError" + // PersistentVolumeClaimNodeResizeError indicates an error while resizing volume for size in the node. + PersistentVolumeClaimNodeResizeError PersistentVolumeClaimConditionType = "NodeResizeError" + // Applying the target VolumeAttributesClass encountered an error PersistentVolumeClaimVolumeModifyVolumeError PersistentVolumeClaimConditionType = "ModifyVolumeError" // Volume is being modified From cb1fa3fd730a7f016576ff170217777d64959d95 Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Tue, 9 Jul 2024 12:47:51 -0400 Subject: [PATCH 02/15] Add new values for reporting expansion errors via conditions --- pkg/apis/core/types.go | 11 +++++++++-- staging/src/k8s.io/api/core/v1/types.go | 16 +++++++++++++++- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/pkg/apis/core/types.go b/pkg/apis/core/types.go index 064cbdeb12b..41f435ba0bf 100644 --- a/pkg/apis/core/types.go +++ b/pkg/apis/core/types.go @@ -534,10 +534,17 @@ type TypedObjectReference struct { } // PersistentVolumeClaimConditionType defines the condition of PV claim. -// Valid values are either "Resizing" or "FileSystemResizePending". +// Valid values are: +// - "Resizing", "FileSystemResizePending" +// +// If RecoverVolumeExpansionFailure feature gate is enabled, then following additional values can be expected: +// - "ControllerResizeError", "NodeResizeError" +// +// If VolumeAttributesClass feature gate is enabled, then following additional values can be expected: +// - "ModifyVolumeError", "ModifyingVolume" type PersistentVolumeClaimConditionType string -// These are valid conditions of Pvc +// These are valid conditions of PVC const ( // An user trigger resize of pvc has been started PersistentVolumeClaimResizing PersistentVolumeClaimConditionType = "Resizing" diff --git a/staging/src/k8s.io/api/core/v1/types.go b/staging/src/k8s.io/api/core/v1/types.go index 0ef5fe6d62c..ea60ddf2c86 100644 --- a/staging/src/k8s.io/api/core/v1/types.go +++ b/staging/src/k8s.io/api/core/v1/types.go @@ -585,15 +585,29 @@ type TypedObjectReference struct { Namespace *string `json:"namespace,omitempty" protobuf:"bytes,4,opt,name=namespace"` } -// PersistentVolumeClaimConditionType is a valid value of PersistentVolumeClaimCondition.Type +// PersistentVolumeClaimConditionType defines the condition of PV claim. +// Valid values are: +// - "Resizing", "FileSystemResizePending" +// +// If RecoverVolumeExpansionFailure feature gate is enabled, then following additional values can be expected: +// - "ControllerResizeError", "NodeResizeError" +// +// If VolumeAttributesClass feature gate is enabled, then following additional values can be expected: +// - "ModifyVolumeError", "ModifyingVolume" type PersistentVolumeClaimConditionType string +// These are valid conditions of PVC const ( // PersistentVolumeClaimResizing - a user trigger resize of pvc has been started PersistentVolumeClaimResizing PersistentVolumeClaimConditionType = "Resizing" // PersistentVolumeClaimFileSystemResizePending - controller resize is finished and a file system resize is pending on node PersistentVolumeClaimFileSystemResizePending PersistentVolumeClaimConditionType = "FileSystemResizePending" + // PersistentVolumeClaimControllerResizeError indicates an error while resizing volume for size in the controller + PersistentVolumeClaimControllerResizeError PersistentVolumeClaimConditionType = "ControllerResizeError" + // PersistentVolumeClaimNodeResizeError indicates an error while resizing volume for size in the node. + PersistentVolumeClaimNodeResizeError PersistentVolumeClaimConditionType = "NodeResizeError" + // Applying the target VolumeAttributesClass encountered an error PersistentVolumeClaimVolumeModifyVolumeError PersistentVolumeClaimConditionType = "ModifyVolumeError" // Volume is being modified From 49e82fd12012c9e57411ded2f12d3461acf93994 Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Wed, 10 Jul 2024 12:02:54 -0400 Subject: [PATCH 03/15] Rename ReizeFailed conditions to ResizeInfeasible --- pkg/apis/core/types.go | 9 +++++---- pkg/apis/core/validation/validation.go | 4 ++-- pkg/apis/core/validation/validation_test.go | 6 +++--- pkg/volume/util/operationexecutor/operation_generator.go | 6 +++--- pkg/volume/util/resize_util.go | 2 +- pkg/volume/util/resize_util_test.go | 8 ++++---- staging/src/k8s.io/api/core/v1/types.go | 9 +++++---- test/e2e/storage/csimock/csi_volume_expansion.go | 6 +++--- 8 files changed, 26 insertions(+), 24 deletions(-) diff --git a/pkg/apis/core/types.go b/pkg/apis/core/types.go index 41f435ba0bf..9e1a1b709e3 100644 --- a/pkg/apis/core/types.go +++ b/pkg/apis/core/types.go @@ -572,18 +572,19 @@ const ( // State set when resize controller starts resizing the volume in control-plane PersistentVolumeClaimControllerResizeInProgress ClaimResourceStatus = "ControllerResizeInProgress" - // State set when resize has failed in resize controller with a terminal error. + // State set when resize has failed in resize controller with a terminal unrecoverable error. // Transient errors such as timeout should not set this status and should leave allocatedResourceStatus // unmodified, so as resize controller can resume the volume expansion. - PersistentVolumeClaimControllerResizeFailed ClaimResourceStatus = "ControllerResizeFailed" + PersistentVolumeClaimControllerResizeInfeasible ClaimResourceStatus = "ControllerResizeInfeasible" // State set when resize controller has finished resizing the volume but further resizing of volume // is needed on the node. PersistentVolumeClaimNodeResizePending ClaimResourceStatus = "NodeResizePending" // State set when kubelet starts resizing the volume. PersistentVolumeClaimNodeResizeInProgress ClaimResourceStatus = "NodeResizeInProgress" - // State set when resizing has failed in kubelet with a terminal error. Transient errors don't set NodeResizeFailed - PersistentVolumeClaimNodeResizeFailed ClaimResourceStatus = "NodeResizeFailed" + // State set when resizing has failed in kubelet with a terminal unrecoverable error. Transient errors + // shouldn't set this status + PersistentVolumeClaimNodeResizeInfeasible ClaimResourceStatus = "NodeResizeInfeasible" ) // +enum diff --git a/pkg/apis/core/validation/validation.go b/pkg/apis/core/validation/validation.go index e0841d0c21b..864df20738e 100644 --- a/pkg/apis/core/validation/validation.go +++ b/pkg/apis/core/validation/validation.go @@ -2481,10 +2481,10 @@ func validatePersistentVolumeClaimResourceKey(value string, fldPath *field.Path) } var resizeStatusSet = sets.New(core.PersistentVolumeClaimControllerResizeInProgress, - core.PersistentVolumeClaimControllerResizeFailed, + core.PersistentVolumeClaimControllerResizeInfeasible, core.PersistentVolumeClaimNodeResizePending, core.PersistentVolumeClaimNodeResizeInProgress, - core.PersistentVolumeClaimNodeResizeFailed) + core.PersistentVolumeClaimNodeResizeInfeasible) // ValidatePersistentVolumeClaimStatusUpdate validates an update to status of a PersistentVolumeClaim func ValidatePersistentVolumeClaimStatusUpdate(newPvc, oldPvc *core.PersistentVolumeClaim, validationOpts PersistentVolumeClaimSpecValidationOptions) field.ErrorList { diff --git a/pkg/apis/core/validation/validation_test.go b/pkg/apis/core/validation/validation_test.go index 65d5a2236e3..6f22c96b814 100644 --- a/pkg/apis/core/validation/validation_test.go +++ b/pkg/apis/core/validation/validation_test.go @@ -18971,7 +18971,7 @@ func TestValidatePersistentVolumeClaimStatusUpdate(t *testing.T) { }, }, core.PersistentVolumeClaimStatus{ AllocatedResourceStatuses: map[core.ResourceName]core.ClaimResourceStatus{ - core.ResourceStorage: core.PersistentVolumeClaimControllerResizeFailed, + core.ResourceStorage: core.PersistentVolumeClaimControllerResizeInfeasible, }, }) @@ -19001,7 +19001,7 @@ func TestValidatePersistentVolumeClaimStatusUpdate(t *testing.T) { }, }, core.PersistentVolumeClaimStatus{ AllocatedResourceStatuses: map[core.ResourceName]core.ClaimResourceStatus{ - core.ResourceStorage: core.PersistentVolumeClaimNodeResizeFailed, + core.ResourceStorage: core.PersistentVolumeClaimNodeResizeInfeasible, }, }) @@ -19045,7 +19045,7 @@ func TestValidatePersistentVolumeClaimStatusUpdate(t *testing.T) { validResizeKeyCustom: resource.MustParse("10Gi"), }, AllocatedResourceStatuses: map[core.ResourceName]core.ClaimResourceStatus{ - core.ResourceStorage: core.PersistentVolumeClaimControllerResizeFailed, + core.ResourceStorage: core.PersistentVolumeClaimControllerResizeInfeasible, validResizeKeyCustom: core.PersistentVolumeClaimControllerResizeInProgress, }, }) diff --git a/pkg/volume/util/operationexecutor/operation_generator.go b/pkg/volume/util/operationexecutor/operation_generator.go index 06f96024c33..83566f4c5bf 100644 --- a/pkg/volume/util/operationexecutor/operation_generator.go +++ b/pkg/volume/util/operationexecutor/operation_generator.go @@ -1718,7 +1718,7 @@ func (og *operationGenerator) expandAndRecoverFunction(resizeOpts inTreeResizeOp case v1.PersistentVolumeClaimControllerResizeInProgress, v1.PersistentVolumeClaimNodeResizePending, v1.PersistentVolumeClaimNodeResizeInProgress, - v1.PersistentVolumeClaimNodeResizeFailed: + v1.PersistentVolumeClaimNodeResizeInfeasible: if allocatedSize != nil { newSize = *allocatedSize } @@ -1742,14 +1742,14 @@ func (og *operationGenerator) expandAndRecoverFunction(resizeOpts inTreeResizeOp // we don't need to do any work. We could be here because of a spurious update event. // This is case #1 return resizeResponse - case v1.PersistentVolumeClaimNodeResizeFailed: + case v1.PersistentVolumeClaimNodeResizeInfeasible: // This is case#3 pvc, err = og.markForPendingNodeExpansion(pvc, pv) resizeResponse.pvc = pvc resizeResponse.err = err return resizeResponse case v1.PersistentVolumeClaimControllerResizeInProgress, - v1.PersistentVolumeClaimControllerResizeFailed: + v1.PersistentVolumeClaimControllerResizeInfeasible: // This is case#2 or it could also be case#4 when user manually shrunk the PVC // after expanding it. if allocatedSize != nil { diff --git a/pkg/volume/util/resize_util.go b/pkg/volume/util/resize_util.go index 0f1495f7b34..aaa93d7b1e5 100644 --- a/pkg/volume/util/resize_util.go +++ b/pkg/volume/util/resize_util.go @@ -238,7 +238,7 @@ func MarkFSResizeFinished( // of volumes which are in failed state. func MarkNodeExpansionFailed(pvc *v1.PersistentVolumeClaim, kubeClient clientset.Interface) (*v1.PersistentVolumeClaim, error) { newPVC := pvc.DeepCopy() - newPVC = mergeStorageResourceStatus(newPVC, v1.PersistentVolumeClaimNodeResizeFailed) + newPVC = mergeStorageResourceStatus(newPVC, v1.PersistentVolumeClaimNodeResizeInfeasible) patchBytes, err := createPVCPatch(pvc, newPVC, false /* addResourceVersionCheck */) if err != nil { diff --git a/pkg/volume/util/resize_util_test.go b/pkg/volume/util/resize_util_test.go index 2292ee91513..7b244c96cfd 100644 --- a/pkg/volume/util/resize_util_test.go +++ b/pkg/volume/util/resize_util_test.go @@ -166,8 +166,8 @@ func TestResizeFunctions(t *testing.T) { }, { name: "mark fs resize, when other resource statuses are present", - pvc: basePVC.withResourceStatus(v1.ResourceCPU, v1.PersistentVolumeClaimControllerResizeFailed).get(), - expectedPVC: basePVC.withResourceStatus(v1.ResourceCPU, v1.PersistentVolumeClaimControllerResizeFailed). + pvc: basePVC.withResourceStatus(v1.ResourceCPU, v1.PersistentVolumeClaimControllerResizeInfeasible).get(), + expectedPVC: basePVC.withResourceStatus(v1.ResourceCPU, v1.PersistentVolumeClaimControllerResizeInfeasible). withStorageResourceStatus(v1.PersistentVolumeClaimNodeResizePending).get(), testFunc: func(pvc *v1.PersistentVolumeClaim, c clientset.Interface, _ resource.Quantity) (*v1.PersistentVolumeClaim, error) { return MarkForFSResize(pvc, c) @@ -183,9 +183,9 @@ func TestResizeFunctions(t *testing.T) { }, { name: "mark resize finished", - pvc: basePVC.withResourceStatus(v1.ResourceCPU, v1.PersistentVolumeClaimControllerResizeFailed). + pvc: basePVC.withResourceStatus(v1.ResourceCPU, v1.PersistentVolumeClaimControllerResizeInfeasible). withStorageResourceStatus(v1.PersistentVolumeClaimNodeResizePending).get(), - expectedPVC: basePVC.withResourceStatus(v1.ResourceCPU, v1.PersistentVolumeClaimControllerResizeFailed). + expectedPVC: basePVC.withResourceStatus(v1.ResourceCPU, v1.PersistentVolumeClaimControllerResizeInfeasible). withStorageResourceStatus("").get(), testFunc: func(pvc *v1.PersistentVolumeClaim, i clientset.Interface, q resource.Quantity) (*v1.PersistentVolumeClaim, error) { return MarkFSResizeFinished(pvc, q, i) diff --git a/staging/src/k8s.io/api/core/v1/types.go b/staging/src/k8s.io/api/core/v1/types.go index ea60ddf2c86..0adf0b4679c 100644 --- a/staging/src/k8s.io/api/core/v1/types.go +++ b/staging/src/k8s.io/api/core/v1/types.go @@ -624,18 +624,19 @@ const ( // State set when resize controller starts resizing the volume in control-plane. PersistentVolumeClaimControllerResizeInProgress ClaimResourceStatus = "ControllerResizeInProgress" - // State set when resize has failed in resize controller with a terminal error. + // State set when resize has failed in resize controller with a terminal unrecoverable error. // Transient errors such as timeout should not set this status and should leave allocatedResourceStatus // unmodified, so as resize controller can resume the volume expansion. - PersistentVolumeClaimControllerResizeFailed ClaimResourceStatus = "ControllerResizeFailed" + PersistentVolumeClaimControllerResizeInfeasible ClaimResourceStatus = "ControllerResizeInfeasible" // State set when resize controller has finished resizing the volume but further resizing of volume // is needed on the node. PersistentVolumeClaimNodeResizePending ClaimResourceStatus = "NodeResizePending" // State set when kubelet starts resizing the volume. PersistentVolumeClaimNodeResizeInProgress ClaimResourceStatus = "NodeResizeInProgress" - // State set when resizing has failed in kubelet with a terminal error. Transient errors don't set NodeResizeFailed - PersistentVolumeClaimNodeResizeFailed ClaimResourceStatus = "NodeResizeFailed" + // State set when resizing has failed in kubelet with a terminal unrecoverable error. Transient errors + // shouldn't set this status + PersistentVolumeClaimNodeResizeInfeasible ClaimResourceStatus = "NodeResizeInfeasible" ) // +enum diff --git a/test/e2e/storage/csimock/csi_volume_expansion.go b/test/e2e/storage/csimock/csi_volume_expansion.go index bf55eec8404..3edf98736fd 100644 --- a/test/e2e/storage/csimock/csi_volume_expansion.go +++ b/test/e2e/storage/csimock/csi_volume_expansion.go @@ -411,7 +411,7 @@ var _ = utils.SIGDescribe("CSI Mock volume expansion", func() { pvcRequestSize: "11Gi", // expansion to 11Gi will cause expansion to fail on controller allocatedResource: "11Gi", simulatedCSIDriverError: expansionFailedOnController, - expectedResizeStatus: v1.PersistentVolumeClaimControllerResizeFailed, + expectedResizeStatus: v1.PersistentVolumeClaimControllerResizeInfeasible, recoverySize: resource.MustParse("4Gi"), }, { @@ -419,7 +419,7 @@ var _ = utils.SIGDescribe("CSI Mock volume expansion", func() { pvcRequestSize: "9Gi", // expansion to 9Gi will cause expansion to fail on node allocatedResource: "9Gi", simulatedCSIDriverError: expansionFailedOnNode, - expectedResizeStatus: v1.PersistentVolumeClaimNodeResizeFailed, + expectedResizeStatus: v1.PersistentVolumeClaimNodeResizeInfeasible, recoverySize: resource.MustParse("5Gi"), }, } @@ -500,7 +500,7 @@ func validateRecoveryBehaviour(ctx context.Context, pvc *v1.PersistentVolumeClai // if expansion succeeded on controller but failed on the node if test.simulatedCSIDriverError == expansionFailedOnNode { ginkgo.By("Wait for expansion to fail on node again") - err = waitForResizeStatus(pvc, m.cs, v1.PersistentVolumeClaimNodeResizeFailed) + err = waitForResizeStatus(pvc, m.cs, v1.PersistentVolumeClaimNodeResizeInfeasible) framework.ExpectNoError(err, "While waiting for resize status to be set to expansion-failed-on-node") ginkgo.By("verify allocated resources after recovery") From dac308bdceb3ad79b37826942d1ad19012f395b3 Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Wed, 10 Jul 2024 16:18:11 +0000 Subject: [PATCH 04/15] Update generated files --- pkg/generated/openapi/zz_generated.openapi.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/generated/openapi/zz_generated.openapi.go b/pkg/generated/openapi/zz_generated.openapi.go index b9f01a3a04d..21b3bb56590 100644 --- a/pkg/generated/openapi/zz_generated.openapi.go +++ b/pkg/generated/openapi/zz_generated.openapi.go @@ -25574,7 +25574,7 @@ func schema_k8sio_api_core_v1_PersistentVolumeClaimStatus(ref common.ReferenceCa Default: "", Type: []string{"string"}, Format: "", - Enum: []interface{}{"ControllerResizeFailed", "ControllerResizeInProgress", "NodeResizeFailed", "NodeResizeInProgress", "NodeResizePending"}, + Enum: []interface{}{"ControllerResizeInProgress", "ControllerResizeInfeasible", "NodeResizeInProgress", "NodeResizeInfeasible", "NodeResizePending"}, }, }, }, From ad28dfb42ec516bd70003c682379369e8614c493 Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Fri, 12 Jul 2024 14:37:35 -0400 Subject: [PATCH 05/15] Fix code that uses old values for resize errors --- pkg/api/persistentvolumeclaim/util_test.go | 18 +++++++++--------- .../operationexecutor/node_expander_test.go | 2 +- .../operation_generator_test.go | 10 +++++----- .../noderestriction/admission_test.go | 2 +- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/pkg/api/persistentvolumeclaim/util_test.go b/pkg/api/persistentvolumeclaim/util_test.go index d75aadde912..f24d3f61b22 100644 --- a/pkg/api/persistentvolumeclaim/util_test.go +++ b/pkg/api/persistentvolumeclaim/util_test.go @@ -498,7 +498,7 @@ func TestDropDisabledFieldsFromStatus(t *testing.T) { name: "for:newPVC=hasResizeStatus,oldPVC=nil, featuregate=false should drop field", enableRecoverVolumeExpansionFailure: false, enableVolumeAttributesClass: false, - pvc: withResizeStatus(core.PersistentVolumeClaimNodeResizeFailed), + pvc: withResizeStatus(core.PersistentVolumeClaimNodeResizeInfeasible), oldPVC: nil, expected: getPVC(), }, @@ -506,25 +506,25 @@ func TestDropDisabledFieldsFromStatus(t *testing.T) { name: "for:newPVC=hasResizeStatus,oldPVC=doesnot,featuregate=RecoverVolumeExpansionFailure=true; should keep field", enableRecoverVolumeExpansionFailure: true, enableVolumeAttributesClass: false, - pvc: withResizeStatus(core.PersistentVolumeClaimNodeResizeFailed), + pvc: withResizeStatus(core.PersistentVolumeClaimNodeResizeInfeasible), oldPVC: getPVC(), - expected: withResizeStatus(core.PersistentVolumeClaimNodeResizeFailed), + expected: withResizeStatus(core.PersistentVolumeClaimNodeResizeInfeasible), }, { name: "for:newPVC=hasResizeStatus,oldPVC=hasResizeStatus,featuregate=RecoverVolumeExpansionFailure=true; should keep field", enableRecoverVolumeExpansionFailure: true, enableVolumeAttributesClass: false, - pvc: withResizeStatus(core.PersistentVolumeClaimNodeResizeFailed), - oldPVC: withResizeStatus(core.PersistentVolumeClaimNodeResizeFailed), - expected: withResizeStatus(core.PersistentVolumeClaimNodeResizeFailed), + pvc: withResizeStatus(core.PersistentVolumeClaimNodeResizeInfeasible), + oldPVC: withResizeStatus(core.PersistentVolumeClaimNodeResizeInfeasible), + expected: withResizeStatus(core.PersistentVolumeClaimNodeResizeInfeasible), }, { name: "for:newPVC=hasResizeStatus,oldPVC=hasResizeStatus,featuregate=false; should keep field", enableRecoverVolumeExpansionFailure: false, enableVolumeAttributesClass: false, - pvc: withResizeStatus(core.PersistentVolumeClaimNodeResizeFailed), - oldPVC: withResizeStatus(core.PersistentVolumeClaimNodeResizeFailed), - expected: withResizeStatus(core.PersistentVolumeClaimNodeResizeFailed), + pvc: withResizeStatus(core.PersistentVolumeClaimNodeResizeInfeasible), + oldPVC: withResizeStatus(core.PersistentVolumeClaimNodeResizeInfeasible), + expected: withResizeStatus(core.PersistentVolumeClaimNodeResizeInfeasible), }, { name: "for:newPVC=hasVolumeAttributeClass,oldPVC=nil, featuregate=false should drop field", diff --git a/pkg/volume/util/operationexecutor/node_expander_test.go b/pkg/volume/util/operationexecutor/node_expander_test.go index 189663359a3..09d1620fd52 100644 --- a/pkg/volume/util/operationexecutor/node_expander_test.go +++ b/pkg/volume/util/operationexecutor/node_expander_test.go @@ -29,7 +29,7 @@ import ( ) func TestNodeExpander(t *testing.T) { - nodeResizeFailed := v1.PersistentVolumeClaimNodeResizeFailed + nodeResizeFailed := v1.PersistentVolumeClaimNodeResizeInfeasible nodeResizePending := v1.PersistentVolumeClaimNodeResizePending var tests = []struct { diff --git a/pkg/volume/util/operationexecutor/operation_generator_test.go b/pkg/volume/util/operationexecutor/operation_generator_test.go index f0527c39c15..6fc4497361d 100644 --- a/pkg/volume/util/operationexecutor/operation_generator_test.go +++ b/pkg/volume/util/operationexecutor/operation_generator_test.go @@ -94,7 +94,7 @@ func TestOperationGenerator_GenerateUnmapVolumeFunc_PluginName(t *testing.T) { func TestOperationGenerator_GenerateExpandAndRecoverVolumeFunc(t *testing.T) { nodeResizePending := v1.PersistentVolumeClaimNodeResizePending - nodeResizeFailed := v1.PersistentVolumeClaimNodeResizeFailed + nodeResizeFailed := v1.PersistentVolumeClaimNodeResizeInfeasible var tests = []struct { name string pvc *v1.PersistentVolumeClaim @@ -194,7 +194,7 @@ func TestOperationGenerator_nodeExpandVolume(t *testing.T) { return &x } - nodeResizeFailed := v1.PersistentVolumeClaimNodeResizeFailed + nodeResizeFailed := v1.PersistentVolumeClaimNodeResizeInfeasible nodeResizePending := v1.PersistentVolumeClaimNodeResizePending var tests = []struct { name string @@ -217,7 +217,7 @@ func TestOperationGenerator_nodeExpandVolume(t *testing.T) { pvc: getTestPVC("test-vol0", "2G", "1G", "", &nodeResizeFailed), pv: getTestPV("test-vol0", "2G"), - expectedResizeStatus: v1.PersistentVolumeClaimNodeResizeFailed, + expectedResizeStatus: v1.PersistentVolumeClaimNodeResizeInfeasible, resizeCallCount: 0, expectedStatusSize: resource.MustParse("1G"), }, @@ -234,7 +234,7 @@ func TestOperationGenerator_nodeExpandVolume(t *testing.T) { pvc: getTestPVC(volumetesting.AlwaysFailNodeExpansion, "2G", "1G", "2G", &nodeResizePending), pv: getTestPV(volumetesting.AlwaysFailNodeExpansion, "2G"), expectError: true, - expectedResizeStatus: v1.PersistentVolumeClaimNodeResizeFailed, + expectedResizeStatus: v1.PersistentVolumeClaimNodeResizeInfeasible, resizeCallCount: 1, expectedStatusSize: resource.MustParse("1G"), }, @@ -265,7 +265,7 @@ func TestOperationGenerator_nodeExpandVolume(t *testing.T) { desiredSize: getSizeFunc("2G"), actualSize: getSizeFunc("1G"), - expectedResizeStatus: v1.PersistentVolumeClaimNodeResizeFailed, + expectedResizeStatus: v1.PersistentVolumeClaimNodeResizeInfeasible, resizeCallCount: 0, expectedStatusSize: resource.MustParse("2G"), }, diff --git a/plugin/pkg/admission/noderestriction/admission_test.go b/plugin/pkg/admission/noderestriction/admission_test.go index d4eb6d8fef4..63aa1c5fb68 100644 --- a/plugin/pkg/admission/noderestriction/admission_test.go +++ b/plugin/pkg/admission/noderestriction/admission_test.go @@ -1448,7 +1448,7 @@ func TestAdmitPVCStatus(t *testing.T) { noExistingPods := corev1lister.NewPodLister(noExistingPodsIndex) mynode := &user.DefaultInfo{Name: "system:node:mynode", Groups: []string{"system:nodes"}} - nodeExpansionFailed := api.PersistentVolumeClaimNodeResizeFailed + nodeExpansionFailed := api.PersistentVolumeClaimNodeResizeInfeasible tests := []struct { name string From 7a51999ddf2a467cb3ee7b9c0c14f9020cd0344d Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Fri, 12 Jul 2024 14:42:04 -0400 Subject: [PATCH 06/15] Deprecate intree Volume Expansion controller --- pkg/controller/volume/expand/expand_controller.go | 2 ++ pkg/volume/util/operationexecutor/operation_generator.go | 2 ++ 2 files changed, 4 insertions(+) diff --git a/pkg/controller/volume/expand/expand_controller.go b/pkg/controller/volume/expand/expand_controller.go index 735bc388bdc..27ee243906c 100644 --- a/pkg/controller/volume/expand/expand_controller.go +++ b/pkg/controller/volume/expand/expand_controller.go @@ -68,6 +68,8 @@ type CSINameTranslator interface { GetCSINameFromInTreeName(pluginName string) (string, error) } +// Deprecated: This controller is deprecated and for now exists for the sole purpose of adding +// necessary annotations if necessary, so as volume can be expanded externally in the control-plane type expandController struct { // kubeClient is the kube API client used by volumehost to communicate with // the API server. diff --git a/pkg/volume/util/operationexecutor/operation_generator.go b/pkg/volume/util/operationexecutor/operation_generator.go index 83566f4c5bf..6accbb53fd1 100644 --- a/pkg/volume/util/operationexecutor/operation_generator.go +++ b/pkg/volume/util/operationexecutor/operation_generator.go @@ -1679,6 +1679,8 @@ func (og *operationGenerator) GenerateExpandAndRecoverVolumeFunc( }, nil } +// Deprecated: This function should not called by any controller code in future and should be removed +// from kubernetes code func (og *operationGenerator) expandAndRecoverFunction(resizeOpts inTreeResizeOpts) inTreeResizeResponse { pvc := resizeOpts.pvc pv := resizeOpts.pv From cbda0889056b121798019fb36c8d7be09ac37e8b Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Fri, 12 Jul 2024 15:14:46 -0400 Subject: [PATCH 07/15] Add functions for storing volume as failed with final error --- .../cache/actual_state_of_world.go | 33 ++++++++++++++++--- .../operationexecutor/operation_executor.go | 12 +++++++ 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/pkg/kubelet/volumemanager/cache/actual_state_of_world.go b/pkg/kubelet/volumemanager/cache/actual_state_of_world.go index 91e2d3436ac..a9898326dbf 100644 --- a/pkg/kubelet/volumemanager/cache/actual_state_of_world.go +++ b/pkg/kubelet/volumemanager/cache/actual_state_of_world.go @@ -27,6 +27,7 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/sets" utilfeature "k8s.io/apiserver/pkg/util/feature" "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/features" @@ -211,10 +212,11 @@ func NewActualStateOfWorld( nodeName types.NodeName, volumePluginMgr *volume.VolumePluginMgr) ActualStateOfWorld { return &actualStateOfWorld{ - nodeName: nodeName, - attachedVolumes: make(map[v1.UniqueVolumeName]attachedVolume), - foundDuringReconstruction: make(map[v1.UniqueVolumeName]map[volumetypes.UniquePodName]types.UID), - volumePluginMgr: volumePluginMgr, + nodeName: nodeName, + attachedVolumes: make(map[v1.UniqueVolumeName]attachedVolume), + foundDuringReconstruction: make(map[v1.UniqueVolumeName]map[volumetypes.UniquePodName]types.UID), + volumePluginMgr: volumePluginMgr, + volumesWithFinalExpansionErrors: sets.New[string](), } } @@ -247,6 +249,8 @@ type actualStateOfWorld struct { // from kubelet root directory when kubelet was restarted. foundDuringReconstruction map[v1.UniqueVolumeName]map[volumetypes.UniquePodName]types.UID + volumesWithFinalExpansionErrors sets.Set[v1.UniqueVolumeName] + // volumePluginMgr is the volume plugin manager used to create volume // plugin objects. volumePluginMgr *volume.VolumePluginMgr @@ -396,6 +400,27 @@ func (asw *actualStateOfWorld) MarkVolumeAsDetached( asw.DeleteVolume(volumeName) } +func (asw *actualStateOfWorld) MarkVolumeExpansionFailedWithFinalError(volumeName v1.UniqueVolumeName) { + asw.Lock() + defer asw.Unlock() + + asw.volumesWithFinalExpansionErrors.Insert(volumeName) +} + +func (asw *actualStateOfWorld) RemoveVolumeFromFailedWithFinalErrors(volumeName v1.UniqueVolumeName) { + asw.Lock() + defer asw.Unlock() + + asw.volumesWithFinalExpansionErrors.Delete(volumeName) +} + +func (asw *actualStateOfWorld) CheckVolumeInFailedExpansionWithFinalErrors(volumeName v1.UniqueVolumeName) bool { + asw.RLock() + defer asw.RUnlock() + + return asw.volumesWithFinalExpansionErrors.Has(volumeName) +} + func (asw *actualStateOfWorld) IsVolumeReconstructed(volumeName v1.UniqueVolumeName, podName volumetypes.UniquePodName) bool { volumeState := asw.GetVolumeMountState(volumeName, podName) diff --git a/pkg/volume/util/operationexecutor/operation_executor.go b/pkg/volume/util/operationexecutor/operation_executor.go index 85646de50b2..eaca8ca2282 100644 --- a/pkg/volume/util/operationexecutor/operation_executor.go +++ b/pkg/volume/util/operationexecutor/operation_executor.go @@ -233,6 +233,18 @@ type ActualStateOfWorldMounterUpdater interface { // IsVolumeDeviceReconstructed returns true if volume device identified by volumeName has been // found during reconstruction. IsVolumeDeviceReconstructed(volumeName v1.UniqueVolumeName) bool + + // MarkVolumeExpansionFailedWithFinalError marks volume as failed with a final error, so as + // this state doesn't have to be recorded in the API server + MarkVolumeExpansionFailedWithFinalError(volumeName v1.UniqueVolumeName) + + // RemoveVolumeFromFailedWithFinalErrors removes volume from list that indicates that volume + // has failed expansion with a final error + RemoveVolumeFromFailedWithFinalErrors(volumeName v1.UniqueVolumeName) + + // CheckVolumeInFailedExpansionWithFinalErrors verifies if volume expansion has failed with a final + // error + CheckVolumeInFailedExpansionWithFinalErrors(volumeName v1.UniqueVolumeName) bool } // ActualStateOfWorldAttacherUpdater defines a set of operations updating the From c8d9863a3e6815a00ab34d377942a696c7d99848 Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Mon, 15 Jul 2024 10:14:57 -0400 Subject: [PATCH 08/15] Add new type for infeasible errors --- .../cache/actual_state_of_world.go | 2 +- pkg/volume/csi/expander.go | 27 +++++++++++++++++++ pkg/volume/util/types/types.go | 21 +++++++++++++++ 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/pkg/kubelet/volumemanager/cache/actual_state_of_world.go b/pkg/kubelet/volumemanager/cache/actual_state_of_world.go index a9898326dbf..38017a588f7 100644 --- a/pkg/kubelet/volumemanager/cache/actual_state_of_world.go +++ b/pkg/kubelet/volumemanager/cache/actual_state_of_world.go @@ -216,7 +216,7 @@ func NewActualStateOfWorld( attachedVolumes: make(map[v1.UniqueVolumeName]attachedVolume), foundDuringReconstruction: make(map[v1.UniqueVolumeName]map[volumetypes.UniquePodName]types.UID), volumePluginMgr: volumePluginMgr, - volumesWithFinalExpansionErrors: sets.New[string](), + volumesWithFinalExpansionErrors: sets.New[v1.UniqueVolumeName](), } } diff --git a/pkg/volume/csi/expander.go b/pkg/volume/csi/expander.go index d6aae060101..f5e6738434c 100644 --- a/pkg/volume/csi/expander.go +++ b/pkg/volume/csi/expander.go @@ -119,6 +119,11 @@ func (c *csiPlugin) nodeExpandWithClient( failedConditionErr := fmt.Errorf("Expander.NodeExpand failed to expand the volume : %w", volumetypes.NewFailedPreconditionError(err.Error())) return false, failedConditionErr } + + if isInfeasibleError(err) { + infeasibleError := fmt.Errorf("Expander.NodeExpand failed to expand the volume: %w", volumetypes.NewInfeasibleError(err.Error())) + return false, infeasibleError + } return false, fmt.Errorf("Expander.NodeExpand failed to expand the volume : %w", err) } return true, nil @@ -135,3 +140,25 @@ func inUseError(err error) bool { // More info - https://github.com/container-storage-interface/spec/blob/master/spec.md#controllerexpandvolume-errors return st.Code() == codes.FailedPrecondition } + +// IsInfeasibleError returns true for grpc errors that are considered terminal in a way +// that they indicate CSI operation as infeasible. +// This function returns a subset of final errors. All infeasible errors are also final errors. +func isInfeasibleError(err error) bool { + st, ok := status.FromError(err) + if !ok { + // This is not gRPC error. The operation must have failed before gRPC + // method was called, otherwise we would get gRPC error. + // We don't know if any previous volume operation is in progress, be on the safe side. + return false + } + switch st.Code() { + case codes.InvalidArgument, + codes.OutOfRange, + codes.NotFound: + return true + } + // All other errors mean that operation either did not + // even start or failed. It is for sure are not infeasible errors + return false +} diff --git a/pkg/volume/util/types/types.go b/pkg/volume/util/types/types.go index 238e919b331..316962f8f5d 100644 --- a/pkg/volume/util/types/types.go +++ b/pkg/volume/util/types/types.go @@ -102,6 +102,27 @@ func IsFailedPreconditionError(err error) bool { return errors.As(err, &failedPreconditionError) } +// InfeasibleError errors are a subset of OperationFinished or final error +// codes. In terms of CSI - this usually means that, the operation is not possible +// in current state with given arguments. +type InfeasibleError struct { + msg string +} + +func (err *InfeasibleError) Error() string { + return err.msg +} + +// NewInfeasibleError returns a new instance of InfeasibleError +func NewInfeasibleError(msg string) *InfeasibleError { + return &InfeasibleError{msg: msg} +} + +func IsInfeasibleError(err error) bool { + var infeasibleError *InfeasibleError + return errors.As(err, &infeasibleError) +} + type OperationNotSupported struct { msg string } From 785e68a626634e01e15f3864d3fe92c7ec2b64bb Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Mon, 15 Jul 2024 15:20:35 -0400 Subject: [PATCH 09/15] Add code for testing final errors --- pkg/volume/testing/testing.go | 11 +- .../util/operationexecutor/node_expander.go | 27 +++- .../operationexecutor/node_expander_test.go | 146 +++++++++++++++++- .../operation_generator_test.go | 7 +- pkg/volume/util/resize_util.go | 35 ++++- 5 files changed, 211 insertions(+), 15 deletions(-) diff --git a/pkg/volume/testing/testing.go b/pkg/volume/testing/testing.go index 1710dc63f32..cd7c654610d 100644 --- a/pkg/volume/testing/testing.go +++ b/pkg/volume/testing/testing.go @@ -88,7 +88,8 @@ const ( FailVolumeExpansion = "fail-expansion-test" - AlwaysFailNodeExpansion = "always-fail-node-expansion" + InfeasibleNodeExpansion = "infeasible-fail-node-expansion" + OtherFinalNodeExpansionError = "other-final-node-expansion-error" deviceNotMounted = "deviceNotMounted" deviceMountUncertain = "deviceMountUncertain" @@ -500,8 +501,12 @@ func (plugin *FakeVolumePlugin) NodeExpand(resizeOptions volume.NodeResizeOption return false, volumetypes.NewOperationNotSupportedError("volume-unsupported") } - if resizeOptions.VolumeSpec.Name() == AlwaysFailNodeExpansion { - return false, fmt.Errorf("test failure: NodeExpand") + if resizeOptions.VolumeSpec.Name() == InfeasibleNodeExpansion { + return false, volumetypes.NewInfeasibleError("infeasible-expansion") + } + + if resizeOptions.VolumeSpec.Name() == OtherFinalNodeExpansionError { + return false, fmt.Errorf("other-final-node-expansion-error") } if resizeOptions.VolumeSpec.Name() == FailVolumeExpansion { diff --git a/pkg/volume/util/operationexecutor/node_expander.go b/pkg/volume/util/operationexecutor/node_expander.go index cf9c57504e8..8e55d0b771c 100644 --- a/pkg/volume/util/operationexecutor/node_expander.go +++ b/pkg/volume/util/operationexecutor/node_expander.go @@ -39,6 +39,10 @@ type NodeExpander struct { pvCap resource.Quantity resizeStatus v1.ClaimResourceStatus + // indicates that pvc spec size has actually changed since last we observerd size + // on the kubelet + markExpansionInfeasible bool + // pvcAlreadyUpdated if true indicates that although we are calling NodeExpandVolume on the kubelet // PVC has already been updated - possibly because expansion already succeeded on different node. // This can happen when a RWX PVC is expanded. @@ -79,6 +83,15 @@ func (ne *NodeExpander) runPreCheck() bool { ne.resizeStatus = currentStatus } + pvcSpecCap := ne.pvc.Spec.Resources.Requests[v1.ResourceStorage] + // usually when are performing node expansion, we expect pv size and pvc spec size + // to be the same, but if user has edited pvc since then and volume expansion failed + // with final error, then we should let controller reconcile this state. + if pvcSpecCap.Cmp(ne.pluginResizeOpts.NewSize) != 0 && + ne.actualStateOfWorld.CheckVolumeInFailedExpansionWithFinalErrors(ne.vmt.VolumeName) { + ne.markExpansionInfeasible = true + } + // PVC is already expanded but we are still trying to expand the volume because // last recorded size in ASOW is older. This can happen for RWX volume types. if ne.pvcStatusCap.Cmp(ne.pluginResizeOpts.NewSize) >= 0 && ne.resizeStatus == "" { @@ -124,9 +137,17 @@ func (ne *NodeExpander) expandOnPlugin() (bool, error, testResponseData) { if resizeErr != nil { if volumetypes.IsOperationFinishedError(resizeErr) { var markFailedError error - ne.pvc, markFailedError = util.MarkNodeExpansionFailed(ne.pvc, ne.kubeClient) - if markFailedError != nil { - klog.Errorf(ne.vmt.GenerateErrorDetailed("MountMount.NodeExpandVolume failed to mark node expansion as failed: %v", err).Error()) + ne.actualStateOfWorld.MarkVolumeExpansionFailedWithFinalError(ne.vmt.VolumeName) + if volumetypes.IsInfeasibleError(resizeErr) || ne.markExpansionInfeasible { + ne.pvc, markFailedError = util.MarkNodeExpansionInfeasible(ne.pvc, ne.kubeClient, resizeErr) + if markFailedError != nil { + klog.Errorf(ne.vmt.GenerateErrorDetailed("MountMount.NodeExpandVolume failed to mark node expansion as failed: %v", err).Error()) + } + } else { + ne.pvc, markFailedError = util.MarkNodeExpansionFailedCondition(ne.pvc, ne.kubeClient, resizeErr) + if markFailedError != nil { + klog.Errorf(ne.vmt.GenerateErrorDetailed("MountMount.NodeExpandVolume failed to mark node expansion as failed: %v", err).Error()) + } } } diff --git a/pkg/volume/util/operationexecutor/node_expander_test.go b/pkg/volume/util/operationexecutor/node_expander_test.go index 09d1620fd52..87d029004bc 100644 --- a/pkg/volume/util/operationexecutor/node_expander_test.go +++ b/pkg/volume/util/operationexecutor/node_expander_test.go @@ -17,17 +17,37 @@ limitations under the License. package operationexecutor import ( + "sync" "testing" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/sets" utilfeature "k8s.io/apiserver/pkg/util/feature" featuregatetesting "k8s.io/component-base/featuregate/testing" "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/volume" volumetesting "k8s.io/kubernetes/pkg/volume/testing" + volumetypes "k8s.io/kubernetes/pkg/volume/util/types" ) +type fakeActualStateOfWorld struct { + // nodeName is the name of this node. This value is passed to Attach/Detach + nodeName types.NodeName + + volumesWithFinalExpansionErrors sets.Set[v1.UniqueVolumeName] + sync.RWMutex +} + +var _ ActualStateOfWorldMounterUpdater = &fakeActualStateOfWorld{} + +func newFakeActualStateOfWorld() *fakeActualStateOfWorld { + return &fakeActualStateOfWorld{ + volumesWithFinalExpansionErrors: sets.New[v1.UniqueVolumeName](), + } +} + func TestNodeExpander(t *testing.T) { nodeResizeFailed := v1.PersistentVolumeClaimNodeResizeInfeasible @@ -46,6 +66,7 @@ func TestNodeExpander(t *testing.T) { expectedResizeStatus v1.ClaimResourceStatus expectedStatusSize resource.Quantity expectResizeCall bool + expectFinalErrors bool assumeResizeOpAsFinished bool expectError bool }{ @@ -57,6 +78,7 @@ func TestNodeExpander(t *testing.T) { expectedResizeStatus: nodeResizeFailed, expectResizeCall: false, assumeResizeOpAsFinished: true, + expectFinalErrors: false, expectedStatusSize: resource.MustParse("1G"), }, { @@ -66,16 +88,29 @@ func TestNodeExpander(t *testing.T) { expectedResizeStatus: "", expectResizeCall: true, assumeResizeOpAsFinished: true, + expectFinalErrors: false, expectedStatusSize: resource.MustParse("2G"), }, { - name: "pv.spec.cap > pvc.status.cap, resizeStatus=node_expansion_pending, reize_op=failing", - pvc: getTestPVC(volumetesting.AlwaysFailNodeExpansion, "2G", "1G", "2G", &nodeResizePending), - pv: getTestPV(volumetesting.AlwaysFailNodeExpansion, "2G"), + name: "pv.spec.cap > pvc.status.cap, resizeStatus=node_expansion_pending, reize_op=infeasible", + pvc: getTestPVC(volumetesting.InfeasibleNodeExpansion, "2G", "1G", "2G", &nodeResizePending), + pv: getTestPV(volumetesting.InfeasibleNodeExpansion, "2G"), expectError: true, expectedResizeStatus: nodeResizeFailed, expectResizeCall: true, assumeResizeOpAsFinished: true, + expectFinalErrors: true, + expectedStatusSize: resource.MustParse("1G"), + }, + { + name: "pv.spec.cap > pvc.status.cap, resizeStatus=node_expansion_pending, reize_op=failing", + pvc: getTestPVC(volumetesting.OtherFinalNodeExpansionError, "2G", "1G", "2G", &nodeResizePending), + pv: getTestPV(volumetesting.OtherFinalNodeExpansionError, "2G"), + expectError: true, + expectedResizeStatus: v1.PersistentVolumeClaimNodeResizeInProgress, + expectResizeCall: true, + assumeResizeOpAsFinished: true, + expectFinalErrors: true, expectedStatusSize: resource.MustParse("1G"), }, { @@ -86,6 +121,7 @@ func TestNodeExpander(t *testing.T) { expectedResizeStatus: "", expectResizeCall: true, assumeResizeOpAsFinished: true, + expectFinalErrors: false, expectedStatusSize: resource.MustParse("2G"), }, } @@ -114,12 +150,13 @@ func TestNodeExpander(t *testing.T) { if actualSize == nil { actualSize = pvc.Status.Capacity.Storage() } + asow := newFakeActualStateOfWorld() resizeOp := nodeResizeOperationOpts{ pvc: pvc, pv: pv, volumePlugin: fakePlugin, vmt: vmt, - actualStateOfWorld: nil, + actualStateOfWorld: asow, pluginResizeOpts: volume.NodeResizeOptions{ VolumeSpec: vmt.VolumeSpec, NewSize: *desiredSize, @@ -153,9 +190,110 @@ func TestNodeExpander(t *testing.T) { if test.expectedResizeStatus != resizeStatus { t.Errorf("For test %s, expected resizeStatus %v, got %v", test.name, test.expectedResizeStatus, resizeStatus) } + + if test.expectFinalErrors != asow.CheckVolumeInFailedExpansionWithFinalErrors(vmt.VolumeName) { + t.Errorf("For test %s, expected final errors %t, got %t", test.name, test.expectFinalErrors, !test.expectFinalErrors) + } if pvcStatusCap.Cmp(test.expectedStatusSize) != 0 { t.Errorf("For test %s, expected status size %s, got %s", test.name, test.expectedStatusSize.String(), pvcStatusCap.String()) } }) } } + +// CheckAndMarkDeviceUncertainViaReconstruction implements ActualStateOfWorldMounterUpdater. +func (f *fakeActualStateOfWorld) CheckAndMarkDeviceUncertainViaReconstruction(volumeName v1.UniqueVolumeName, deviceMountPath string) bool { + panic("unimplemented") +} + +// CheckAndMarkVolumeAsUncertainViaReconstruction implements ActualStateOfWorldMounterUpdater. +func (f *fakeActualStateOfWorld) CheckAndMarkVolumeAsUncertainViaReconstruction(opts MarkVolumeOpts) (bool, error) { + panic("unimplemented") +} + +// CheckVolumeInFailedExpansionWithFinalErrors implements ActualStateOfWorldMounterUpdater. +func (f *fakeActualStateOfWorld) CheckVolumeInFailedExpansionWithFinalErrors(volumeName v1.UniqueVolumeName) bool { + f.RLock() + defer f.RUnlock() + return f.volumesWithFinalExpansionErrors.Has(volumeName) +} + +// GetDeviceMountState implements ActualStateOfWorldMounterUpdater. +func (f *fakeActualStateOfWorld) GetDeviceMountState(volumeName v1.UniqueVolumeName) DeviceMountState { + panic("unimplemented") +} + +// GetVolumeMountState implements ActualStateOfWorldMounterUpdater. +func (f *fakeActualStateOfWorld) GetVolumeMountState(volumName v1.UniqueVolumeName, podName volumetypes.UniquePodName) VolumeMountState { + panic("unimplemented") +} + +// IsVolumeDeviceReconstructed implements ActualStateOfWorldMounterUpdater. +func (f *fakeActualStateOfWorld) IsVolumeDeviceReconstructed(volumeName v1.UniqueVolumeName) bool { + panic("unimplemented") +} + +// IsVolumeMountedElsewhere implements ActualStateOfWorldMounterUpdater. +func (f *fakeActualStateOfWorld) IsVolumeMountedElsewhere(volumeName v1.UniqueVolumeName, podName volumetypes.UniquePodName) bool { + panic("unimplemented") +} + +// IsVolumeReconstructed implements ActualStateOfWorldMounterUpdater. +func (f *fakeActualStateOfWorld) IsVolumeReconstructed(volumeName v1.UniqueVolumeName, podName volumetypes.UniquePodName) bool { + panic("unimplemented") +} + +// MarkDeviceAsMounted implements ActualStateOfWorldMounterUpdater. +func (f *fakeActualStateOfWorld) MarkDeviceAsMounted(volumeName v1.UniqueVolumeName, devicePath string, deviceMountPath string, seLinuxMountContext string) error { + panic("unimplemented") +} + +// MarkDeviceAsUncertain implements ActualStateOfWorldMounterUpdater. +func (f *fakeActualStateOfWorld) MarkDeviceAsUncertain(volumeName v1.UniqueVolumeName, devicePath string, deviceMountPath string, seLinuxMountContext string) error { + panic("unimplemented") +} + +// MarkDeviceAsUnmounted implements ActualStateOfWorldMounterUpdater. +func (f *fakeActualStateOfWorld) MarkDeviceAsUnmounted(volumeName v1.UniqueVolumeName) error { + panic("unimplemented") +} + +// MarkForInUseExpansionError implements ActualStateOfWorldMounterUpdater. +func (f *fakeActualStateOfWorld) MarkForInUseExpansionError(volumeName v1.UniqueVolumeName) { + panic("unimplemented") +} + +// MarkVolumeAsMounted implements ActualStateOfWorldMounterUpdater. +func (f *fakeActualStateOfWorld) MarkVolumeAsMounted(markVolumeOpts MarkVolumeOpts) error { + panic("unimplemented") +} + +// MarkVolumeAsResized implements ActualStateOfWorldMounterUpdater. +func (f *fakeActualStateOfWorld) MarkVolumeAsResized(volumeName v1.UniqueVolumeName, claimSize *resource.Quantity) bool { + panic("unimplemented") +} + +// MarkVolumeAsUnmounted implements ActualStateOfWorldMounterUpdater. +func (f *fakeActualStateOfWorld) MarkVolumeAsUnmounted(podName volumetypes.UniquePodName, volumeName v1.UniqueVolumeName) error { + panic("unimplemented") +} + +// MarkVolumeExpansionFailedWithFinalError implements ActualStateOfWorldMounterUpdater. +func (f *fakeActualStateOfWorld) MarkVolumeExpansionFailedWithFinalError(volumeName v1.UniqueVolumeName) { + f.Lock() + defer f.Unlock() + + f.volumesWithFinalExpansionErrors.Insert(volumeName) +} + +// MarkVolumeMountAsUncertain implements ActualStateOfWorldMounterUpdater. +func (f *fakeActualStateOfWorld) MarkVolumeMountAsUncertain(markVolumeOpts MarkVolumeOpts) error { + panic("unimplemented") +} + +// RemoveVolumeFromFailedWithFinalErrors implements ActualStateOfWorldMounterUpdater. +func (f *fakeActualStateOfWorld) RemoveVolumeFromFailedWithFinalErrors(volumeName v1.UniqueVolumeName) { + f.Lock() + defer f.Unlock() + f.volumesWithFinalExpansionErrors.Delete(volumeName) +} diff --git a/pkg/volume/util/operationexecutor/operation_generator_test.go b/pkg/volume/util/operationexecutor/operation_generator_test.go index 6fc4497361d..2835941cb30 100644 --- a/pkg/volume/util/operationexecutor/operation_generator_test.go +++ b/pkg/volume/util/operationexecutor/operation_generator_test.go @@ -231,8 +231,8 @@ func TestOperationGenerator_nodeExpandVolume(t *testing.T) { }, { name: "pv.spec.cap > pvc.status.cap, resizeStatus=node_expansion_pending, reize_op=failing", - pvc: getTestPVC(volumetesting.AlwaysFailNodeExpansion, "2G", "1G", "2G", &nodeResizePending), - pv: getTestPV(volumetesting.AlwaysFailNodeExpansion, "2G"), + pvc: getTestPVC(volumetesting.InfeasibleNodeExpansion, "2G", "1G", "2G", &nodeResizePending), + pv: getTestPV(volumetesting.InfeasibleNodeExpansion, "2G"), expectError: true, expectedResizeStatus: v1.PersistentVolumeClaimNodeResizeInfeasible, resizeCallCount: 1, @@ -302,9 +302,10 @@ func TestOperationGenerator_nodeExpandVolume(t *testing.T) { NewSize: *desiredSize, OldSize: *actualSize, } + asow := newFakeActualStateOfWorld() ogInstance, _ := og.(*operationGenerator) - _, err := ogInstance.nodeExpandVolume(vmt, nil, pluginResizeOpts) + _, err := ogInstance.nodeExpandVolume(vmt, asow, pluginResizeOpts) if !test.expectError && err != nil { t.Errorf("For test %s, expected no error got: %v", test.name, err) diff --git a/pkg/volume/util/resize_util.go b/pkg/volume/util/resize_util.go index aaa93d7b1e5..6279ef9a39a 100644 --- a/pkg/volume/util/resize_util.go +++ b/pkg/volume/util/resize_util.go @@ -40,6 +40,8 @@ var ( knownResizeConditions = map[v1.PersistentVolumeClaimConditionType]bool{ v1.PersistentVolumeClaimFileSystemResizePending: true, v1.PersistentVolumeClaimResizing: true, + v1.PersistentVolumeClaimControllerResizeError: true, + v1.PersistentVolumeClaimNodeResizeError: true, } // AnnPreResizeCapacity annotation is added to a PV when expanding volume. @@ -234,11 +236,18 @@ func MarkFSResizeFinished( return updatedPVC, err } -// MarkNodeExpansionFailed marks a PVC for node expansion as failed. Kubelet should not retry expansion +// MarkNodeExpansionInfeasible marks a PVC for node expansion as failed. Kubelet should not retry expansion // of volumes which are in failed state. -func MarkNodeExpansionFailed(pvc *v1.PersistentVolumeClaim, kubeClient clientset.Interface) (*v1.PersistentVolumeClaim, error) { +func MarkNodeExpansionInfeasible(pvc *v1.PersistentVolumeClaim, kubeClient clientset.Interface, err error) (*v1.PersistentVolumeClaim, error) { newPVC := pvc.DeepCopy() newPVC = mergeStorageResourceStatus(newPVC, v1.PersistentVolumeClaimNodeResizeInfeasible) + errorCondition := v1.PersistentVolumeClaimCondition{ + Type: v1.PersistentVolumeClaimNodeResizeError, + Status: v1.ConditionTrue, + LastTransitionTime: metav1.Now(), + Message: fmt.Sprintf("failed to expand pvc with %v", err), + } + newPVC = MergeResizeConditionOnPVC(newPVC, []v1.PersistentVolumeClaimCondition{errorCondition}) patchBytes, err := createPVCPatch(pvc, newPVC, false /* addResourceVersionCheck */) if err != nil { @@ -253,6 +262,28 @@ func MarkNodeExpansionFailed(pvc *v1.PersistentVolumeClaim, kubeClient clientset return updatedClaim, nil } +func MarkNodeExpansionFailedCondition(pvc *v1.PersistentVolumeClaim, kubeClient clientset.Interface, err error) (*v1.PersistentVolumeClaim, error) { + newPVC := pvc.DeepCopy() + errorCondition := v1.PersistentVolumeClaimCondition{ + Type: v1.PersistentVolumeClaimNodeResizeError, + Status: v1.ConditionTrue, + LastTransitionTime: metav1.Now(), + Message: fmt.Sprintf("failed to expand pvc with %v", err), + } + newPVC = MergeResizeConditionOnPVC(newPVC, []v1.PersistentVolumeClaimCondition{errorCondition}) + patchBytes, err := createPVCPatch(pvc, newPVC, false /* addResourceVersionCheck */) + if err != nil { + return pvc, fmt.Errorf("patchPVCStatus failed to patch PVC %q: %v", pvc.Name, err) + } + + updatedClaim, updateErr := kubeClient.CoreV1().PersistentVolumeClaims(pvc.Namespace). + Patch(context.TODO(), pvc.Name, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{}, "status") + if updateErr != nil { + return pvc, fmt.Errorf("patchPVCStatus failed to patch PVC %q: %v", pvc.Name, updateErr) + } + return updatedClaim, nil +} + // MarkNodeExpansionInProgress marks pvc expansion in progress on node func MarkNodeExpansionInProgress(pvc *v1.PersistentVolumeClaim, kubeClient clientset.Interface) (*v1.PersistentVolumeClaim, error) { newPVC := pvc.DeepCopy() From 099cb71a53b8b123da7758b96b8357cecd8d826a Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Mon, 15 Jul 2024 17:36:30 -0400 Subject: [PATCH 10/15] Ensure that all options are correctly set when calling node-expand-during-mount --- pkg/volume/testing/testing.go | 2 + .../util/operationexecutor/node_expander.go | 2 +- .../operationexecutor/node_expander_test.go | 4 - .../operationexecutor/operation_generator.go | 1 + .../operation_generator_test.go | 84 +++++++++++++++++++ 5 files changed, 88 insertions(+), 5 deletions(-) diff --git a/pkg/volume/testing/testing.go b/pkg/volume/testing/testing.go index cd7c654610d..f0ebe444811 100644 --- a/pkg/volume/testing/testing.go +++ b/pkg/volume/testing/testing.go @@ -180,6 +180,7 @@ type FakeVolumePlugin struct { Host volume.VolumeHost Config volume.VolumeConfig LastProvisionerOptions volume.VolumeOptions + LastResizeOptions volume.NodeResizeOptions NewAttacherCallCount int NewDetacherCallCount int NodeExpandCallCount int @@ -494,6 +495,7 @@ func (plugin *FakeVolumePlugin) RequiresFSResize() bool { func (plugin *FakeVolumePlugin) NodeExpand(resizeOptions volume.NodeResizeOptions) (bool, error) { plugin.NodeExpandCallCount++ + plugin.LastResizeOptions = resizeOptions if resizeOptions.VolumeSpec.Name() == FailWithInUseVolumeName { return false, volumetypes.NewFailedPreconditionError("volume-in-use") } diff --git a/pkg/volume/util/operationexecutor/node_expander.go b/pkg/volume/util/operationexecutor/node_expander.go index 8e55d0b771c..743b5837497 100644 --- a/pkg/volume/util/operationexecutor/node_expander.go +++ b/pkg/volume/util/operationexecutor/node_expander.go @@ -39,7 +39,7 @@ type NodeExpander struct { pvCap resource.Quantity resizeStatus v1.ClaimResourceStatus - // indicates that pvc spec size has actually changed since last we observerd size + // indicates that pvc spec size has actually changed since last we observed size // on the kubelet markExpansionInfeasible bool diff --git a/pkg/volume/util/operationexecutor/node_expander_test.go b/pkg/volume/util/operationexecutor/node_expander_test.go index 87d029004bc..40d09e15c26 100644 --- a/pkg/volume/util/operationexecutor/node_expander_test.go +++ b/pkg/volume/util/operationexecutor/node_expander_test.go @@ -22,7 +22,6 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" - "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/sets" utilfeature "k8s.io/apiserver/pkg/util/feature" featuregatetesting "k8s.io/component-base/featuregate/testing" @@ -33,9 +32,6 @@ import ( ) type fakeActualStateOfWorld struct { - // nodeName is the name of this node. This value is passed to Attach/Detach - nodeName types.NodeName - volumesWithFinalExpansionErrors sets.Set[v1.UniqueVolumeName] sync.RWMutex } diff --git a/pkg/volume/util/operationexecutor/operation_generator.go b/pkg/volume/util/operationexecutor/operation_generator.go index 6accbb53fd1..2e92b30fd20 100644 --- a/pkg/volume/util/operationexecutor/operation_generator.go +++ b/pkg/volume/util/operationexecutor/operation_generator.go @@ -2003,6 +2003,7 @@ func (og *operationGenerator) expandVolumeDuringMount(volumeToMount VolumeToMoun rsOpts.NewSize = pvSpecCap rsOpts.OldSize = pvcStatusCap + // rsOpts.VolumeSpec = volumeToMount.VolumeSpec resizeOp := nodeResizeOperationOpts{ vmt: volumeToMount, pvc: pvc, diff --git a/pkg/volume/util/operationexecutor/operation_generator_test.go b/pkg/volume/util/operationexecutor/operation_generator_test.go index 2835941cb30..b9ca558be0c 100644 --- a/pkg/volume/util/operationexecutor/operation_generator_test.go +++ b/pkg/volume/util/operationexecutor/operation_generator_test.go @@ -320,6 +320,90 @@ func TestOperationGenerator_nodeExpandVolume(t *testing.T) { } } +func TestExpandDuringMount(t *testing.T) { + nodeResizePending := v1.PersistentVolumeClaimNodeResizePending + var tests = []struct { + name string + pvc *v1.PersistentVolumeClaim + pv *v1.PersistentVolume + + // desired size, defaults to pv.Spec.Capacity + desiredSize *resource.Quantity + // actualSize, defaults to pvc.Status.Capacity + actualSize *resource.Quantity + + // expectations of test + expectedResizeStatus v1.ClaimResourceStatus + expectedStatusSize resource.Quantity + resizeCallCount int + expectError bool + }{ + { + name: "pv.spec.cap > pvc.status.cap, resizeStatus=node_expansion_pending", + pvc: getTestPVC("test-vol0", "2G", "1G", "2G", &nodeResizePending), + pv: getTestPV("test-vol0", "2G"), + expectedResizeStatus: "", + resizeCallCount: 1, + expectedStatusSize: resource.MustParse("2G"), + }, + } + for i := range tests { + test := tests[i] + t.Run(test.name, func(t *testing.T) { + featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.RecoverVolumeExpansionFailure, true) + volumePluginMgr, fakePlugin := volumetesting.GetTestKubeletVolumePluginMgr(t) + test.pv.Spec.ClaimRef = &v1.ObjectReference{ + Namespace: test.pvc.Namespace, + Name: test.pvc.Name, + } + + pvc := test.pvc + pv := test.pv + pod := getTestPod("test-pod", pvc.Name) + og := getTestOperatorGeneratorWithPVPVC(volumePluginMgr, pvc, pv) + vmt := VolumeToMount{ + Pod: pod, + VolumeName: v1.UniqueVolumeName(pv.Name), + VolumeSpec: volume.NewSpecFromPersistentVolume(pv, false), + } + desiredSize := test.desiredSize + if desiredSize == nil { + desiredSize = pv.Spec.Capacity.Storage() + } + actualSize := test.actualSize + if actualSize == nil { + actualSize = pvc.Status.Capacity.Storage() + } + pluginResizeOpts := volume.NodeResizeOptions{ + VolumeSpec: vmt.VolumeSpec, + NewSize: *desiredSize, + OldSize: *actualSize, + } + asow := newFakeActualStateOfWorld() + + ogInstance, _ := og.(*operationGenerator) + _, err := ogInstance.expandVolumeDuringMount(vmt, asow, pluginResizeOpts) + + if !test.expectError && err != nil { + t.Errorf("For test %s, expected no error got: %v", test.name, err) + } + if test.expectError && err == nil { + t.Errorf("For test %s, expected error but got none", test.name) + } + if test.resizeCallCount != fakePlugin.NodeExpandCallCount { + t.Errorf("for test %s, expected node-expand call count to be %d, got %d", test.name, test.resizeCallCount, fakePlugin.NodeExpandCallCount) + } + + if test.resizeCallCount > 0 { + resizeOptions := fakePlugin.LastResizeOptions + if resizeOptions.VolumeSpec == nil { + t.Errorf("for test %s, expected volume spec to be set", test.name) + } + } + }) + } +} + func getTestPod(podName, pvcName string) *v1.Pod { return &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ From b3db0ba04c8fcc8de96affd3af1c6a54fd9f3bd1 Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Tue, 16 Jul 2024 12:07:46 -0400 Subject: [PATCH 11/15] Fix error about missing volumeSpec for expansion during mount --- pkg/volume/util/operationexecutor/operation_generator.go | 2 +- .../util/operationexecutor/operation_generator_test.go | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pkg/volume/util/operationexecutor/operation_generator.go b/pkg/volume/util/operationexecutor/operation_generator.go index 2e92b30fd20..30788ee2844 100644 --- a/pkg/volume/util/operationexecutor/operation_generator.go +++ b/pkg/volume/util/operationexecutor/operation_generator.go @@ -2003,7 +2003,7 @@ func (og *operationGenerator) expandVolumeDuringMount(volumeToMount VolumeToMoun rsOpts.NewSize = pvSpecCap rsOpts.OldSize = pvcStatusCap - // rsOpts.VolumeSpec = volumeToMount.VolumeSpec + rsOpts.VolumeSpec = volumeToMount.VolumeSpec resizeOp := nodeResizeOperationOpts{ vmt: volumeToMount, pvc: pvc, diff --git a/pkg/volume/util/operationexecutor/operation_generator_test.go b/pkg/volume/util/operationexecutor/operation_generator_test.go index b9ca558be0c..0c267222e89 100644 --- a/pkg/volume/util/operationexecutor/operation_generator_test.go +++ b/pkg/volume/util/operationexecutor/operation_generator_test.go @@ -375,9 +375,8 @@ func TestExpandDuringMount(t *testing.T) { actualSize = pvc.Status.Capacity.Storage() } pluginResizeOpts := volume.NodeResizeOptions{ - VolumeSpec: vmt.VolumeSpec, - NewSize: *desiredSize, - OldSize: *actualSize, + NewSize: *desiredSize, + OldSize: *actualSize, } asow := newFakeActualStateOfWorld() From b59c3c5d3d4ddcc126e886efd2e401300c74f14d Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Tue, 16 Jul 2024 13:16:48 -0400 Subject: [PATCH 12/15] Preserve conditions in case we are retrying expansion in some cases When marking pvc expansion for failed condition, we should try and preserve old resizing conditions with different name. --- pkg/volume/util/resize_util.go | 26 +++++++++++++-------- pkg/volume/util/resize_util_test.go | 35 ++++++++++++++++++++++++----- 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/pkg/volume/util/resize_util.go b/pkg/volume/util/resize_util.go index 6279ef9a39a..2bf54b4b85c 100644 --- a/pkg/volume/util/resize_util.go +++ b/pkg/volume/util/resize_util.go @@ -142,7 +142,7 @@ func MarkResizeInProgressWithResizer( } conditions := []v1.PersistentVolumeClaimCondition{progressCondition} newPVC := pvc.DeepCopy() - newPVC = MergeResizeConditionOnPVC(newPVC, conditions) + newPVC = MergeResizeConditionOnPVC(newPVC, conditions, false /* keepOldResizeConditions */) newPVC = setResizer(newPVC, resizerName) return PatchPVCStatus(pvc /*oldPVC*/, newPVC, kubeClient) } @@ -156,7 +156,7 @@ func MarkControllerReisizeInProgress(pvc *v1.PersistentVolumeClaim, resizerName } conditions := []v1.PersistentVolumeClaimCondition{progressCondition} newPVC := pvc.DeepCopy() - newPVC = MergeResizeConditionOnPVC(newPVC, conditions) + newPVC = MergeResizeConditionOnPVC(newPVC, conditions, false /* keepOldResizeConditions */) newPVC = mergeStorageResourceStatus(newPVC, v1.PersistentVolumeClaimControllerResizeInProgress) newPVC = mergeStorageAllocatedResources(newPVC, newSize) newPVC = setResizer(newPVC, resizerName) @@ -198,7 +198,7 @@ func MarkForFSResize( newPVC = mergeStorageResourceStatus(newPVC, v1.PersistentVolumeClaimNodeResizePending) } - newPVC = MergeResizeConditionOnPVC(newPVC, conditions) + newPVC = MergeResizeConditionOnPVC(newPVC, conditions, true /* keepOldResizeConditions */) updatedPVC, err := PatchPVCStatus(pvc /*oldPVC*/, newPVC, kubeClient) return updatedPVC, err } @@ -231,7 +231,7 @@ func MarkFSResizeFinished( } } - newPVC = MergeResizeConditionOnPVC(newPVC, []v1.PersistentVolumeClaimCondition{}) + newPVC = MergeResizeConditionOnPVC(newPVC, []v1.PersistentVolumeClaimCondition{}, false /* keepOldResizeConditions */) updatedPVC, err := PatchPVCStatus(pvc /*oldPVC*/, newPVC, kubeClient) return updatedPVC, err } @@ -247,7 +247,9 @@ func MarkNodeExpansionInfeasible(pvc *v1.PersistentVolumeClaim, kubeClient clien LastTransitionTime: metav1.Now(), Message: fmt.Sprintf("failed to expand pvc with %v", err), } - newPVC = MergeResizeConditionOnPVC(newPVC, []v1.PersistentVolumeClaimCondition{errorCondition}) + newPVC = MergeResizeConditionOnPVC(newPVC, + []v1.PersistentVolumeClaimCondition{errorCondition}, + true /* keepOldResizeConditions */) patchBytes, err := createPVCPatch(pvc, newPVC, false /* addResourceVersionCheck */) if err != nil { @@ -270,16 +272,18 @@ func MarkNodeExpansionFailedCondition(pvc *v1.PersistentVolumeClaim, kubeClient LastTransitionTime: metav1.Now(), Message: fmt.Sprintf("failed to expand pvc with %v", err), } - newPVC = MergeResizeConditionOnPVC(newPVC, []v1.PersistentVolumeClaimCondition{errorCondition}) + newPVC = MergeResizeConditionOnPVC(newPVC, + []v1.PersistentVolumeClaimCondition{errorCondition}, + true /* keepOldResizeConditions */) patchBytes, err := createPVCPatch(pvc, newPVC, false /* addResourceVersionCheck */) if err != nil { - return pvc, fmt.Errorf("patchPVCStatus failed to patch PVC %q: %v", pvc.Name, err) + return pvc, fmt.Errorf("patchPVCStatus failed to patch PVC %q: %w", pvc.Name, err) } updatedClaim, updateErr := kubeClient.CoreV1().PersistentVolumeClaims(pvc.Namespace). Patch(context.TODO(), pvc.Name, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{}, "status") if updateErr != nil { - return pvc, fmt.Errorf("patchPVCStatus failed to patch PVC %q: %v", pvc.Name, updateErr) + return pvc, fmt.Errorf("patchPVCStatus failed to patch PVC %q: %w", pvc.Name, updateErr) } return updatedClaim, nil } @@ -364,7 +368,7 @@ func addResourceVersion(patchBytes []byte, resourceVersion string) ([]byte, erro // leaving other conditions untouched. func MergeResizeConditionOnPVC( pvc *v1.PersistentVolumeClaim, - resizeConditions []v1.PersistentVolumeClaimCondition) *v1.PersistentVolumeClaim { + resizeConditions []v1.PersistentVolumeClaimCondition, keepOldResizeConditions bool) *v1.PersistentVolumeClaim { resizeConditionMap := map[v1.PersistentVolumeClaimConditionType]*resizeProcessStatus{} for _, condition := range resizeConditions { @@ -387,6 +391,10 @@ func MergeResizeConditionOnPVC( newConditions = append(newConditions, condition) } newCondition.processed = true + } else if keepOldResizeConditions { + // if keepOldResizeConditions is true, we keep the old resize conditions that were present in the + // existing pvc.Status.Conditions field. + newConditions = append(newConditions, condition) } } diff --git a/pkg/volume/util/resize_util_test.go b/pkg/volume/util/resize_util_test.go index 7b244c96cfd..b10b04ceb02 100644 --- a/pkg/volume/util/resize_util_test.go +++ b/pkg/volume/util/resize_util_test.go @@ -33,10 +33,11 @@ import ( ) type conditionMergeTestCase struct { - description string - pvc *v1.PersistentVolumeClaim - newConditions []v1.PersistentVolumeClaimCondition - finalConditions []v1.PersistentVolumeClaimCondition + description string + pvc *v1.PersistentVolumeClaim + keepOldResizeConditions bool + newConditions []v1.PersistentVolumeClaimCondition + finalConditions []v1.PersistentVolumeClaimCondition } func TestMergeResizeCondition(t *testing.T) { @@ -132,10 +133,34 @@ func TestMergeResizeCondition(t *testing.T) { }, }, }, + { + description: "when adding new condition with existing resize conditions", + pvc: pvc.DeepCopy(), + keepOldResizeConditions: true, + newConditions: []v1.PersistentVolumeClaimCondition{ + { + Type: v1.PersistentVolumeClaimNodeResizeError, + Status: v1.ConditionTrue, + LastTransitionTime: currentTime, + }, + }, + finalConditions: []v1.PersistentVolumeClaimCondition{ + { + Type: v1.PersistentVolumeClaimResizing, + Status: v1.ConditionTrue, + LastTransitionTime: currentTime, + }, + { + Type: v1.PersistentVolumeClaimNodeResizeError, + Status: v1.ConditionTrue, + LastTransitionTime: currentTime, + }, + }, + }, } for _, testcase := range testCases { - updatePVC := MergeResizeConditionOnPVC(testcase.pvc, testcase.newConditions) + updatePVC := MergeResizeConditionOnPVC(testcase.pvc, testcase.newConditions, testcase.keepOldResizeConditions) updateConditions := updatePVC.Status.Conditions if !reflect.DeepEqual(updateConditions, testcase.finalConditions) { From 2115c3e7d875a00b75e175170bbfa8a8a4ded041 Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Wed, 17 Jul 2024 10:43:36 -0400 Subject: [PATCH 13/15] Fix e2e test with new resizer --- .../storage/csimock/csi_volume_expansion.go | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/test/e2e/storage/csimock/csi_volume_expansion.go b/test/e2e/storage/csimock/csi_volume_expansion.go index 3edf98736fd..670e7012c9a 100644 --- a/test/e2e/storage/csimock/csi_volume_expansion.go +++ b/test/e2e/storage/csimock/csi_volume_expansion.go @@ -473,11 +473,11 @@ var _ = utils.SIGDescribe("CSI Mock volume expansion", func() { func validateRecoveryBehaviour(ctx context.Context, pvc *v1.PersistentVolumeClaim, m *mockDriverSetup, test recoveryTest) { var err error ginkgo.By("Waiting for resizer to set allocated resource") - err = waitForAllocatedResource(pvc, m, test.allocatedResource) + err = waitForAllocatedResource(ctx, pvc, m, test.allocatedResource) framework.ExpectNoError(err, "While waiting for allocated resource to be updated") ginkgo.By("Waiting for resizer to set resize status") - err = waitForResizeStatus(pvc, m.cs, test.expectedResizeStatus) + err = waitForResizeStatus(ctx, pvc, m.cs, test.expectedResizeStatus) framework.ExpectNoError(err, "While waiting for resize status to be set") ginkgo.By("Recover pvc size") @@ -500,7 +500,7 @@ func validateRecoveryBehaviour(ctx context.Context, pvc *v1.PersistentVolumeClai // if expansion succeeded on controller but failed on the node if test.simulatedCSIDriverError == expansionFailedOnNode { ginkgo.By("Wait for expansion to fail on node again") - err = waitForResizeStatus(pvc, m.cs, v1.PersistentVolumeClaimNodeResizeInfeasible) + err = waitForResizeStatus(ctx, pvc, m.cs, v1.PersistentVolumeClaimNodeResizeInfeasible) framework.ExpectNoError(err, "While waiting for resize status to be set to expansion-failed-on-node") ginkgo.By("verify allocated resources after recovery") @@ -541,10 +541,10 @@ func validateExpansionSuccess(ctx context.Context, pvc *v1.PersistentVolumeClaim gomega.Expect(resizeStatus).To(gomega.BeZero(), "resize status should be empty") } -func waitForResizeStatus(pvc *v1.PersistentVolumeClaim, c clientset.Interface, expectedState v1.ClaimResourceStatus) error { - var actualResizeStatus *v1.ClaimResourceStatus +func waitForResizeStatus(ctx context.Context, pvc *v1.PersistentVolumeClaim, c clientset.Interface, expectedState v1.ClaimResourceStatus) error { + var actualResizeStatus v1.ClaimResourceStatus - waitErr := wait.PollImmediate(resizePollInterval, csiResizeWaitPeriod, func() (bool, error) { + waitErr := wait.PollUntilContextTimeout(ctx, resizePollInterval, csiResizeWaitPeriod, true, func(ctx context.Context) (bool, error) { var err error updatedPVC, err := c.CoreV1().PersistentVolumeClaims(pvc.Namespace).Get(context.TODO(), pvc.Name, metav1.GetOptions{}) @@ -552,18 +552,18 @@ func waitForResizeStatus(pvc *v1.PersistentVolumeClaim, c clientset.Interface, e return false, fmt.Errorf("error fetching pvc %q for checking for resize status: %w", pvc.Name, err) } - actualResizeStatus := updatedPVC.Status.AllocatedResourceStatuses[v1.ResourceStorage] + actualResizeStatus = updatedPVC.Status.AllocatedResourceStatuses[v1.ResourceStorage] return (actualResizeStatus == expectedState), nil }) if waitErr != nil { - return fmt.Errorf("error while waiting for resize status to sync to %v, actualStatus %s: %v", expectedState, *actualResizeStatus, waitErr) + return fmt.Errorf("error while waiting for resize status to sync to %v, actualStatus %s: %v", expectedState, actualResizeStatus, waitErr) } return nil } -func waitForAllocatedResource(pvc *v1.PersistentVolumeClaim, m *mockDriverSetup, expectedSize string) error { +func waitForAllocatedResource(ctx context.Context, pvc *v1.PersistentVolumeClaim, m *mockDriverSetup, expectedSize string) error { expectedQuantity := resource.MustParse(expectedSize) - waitErr := wait.PollImmediate(resizePollInterval, csiResizeWaitPeriod, func() (bool, error) { + waitErr := wait.PollUntilContextTimeout(ctx, resizePollInterval, csiResizeWaitPeriod, true, func(ctx context.Context) (bool, error) { var err error updatedPVC, err := m.cs.CoreV1().PersistentVolumeClaims(pvc.Namespace).Get(context.TODO(), pvc.Name, metav1.GetOptions{}) From 68df11f4f80c6e89a540ee9bbc199b28fb344b9d Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Wed, 17 Jul 2024 10:59:55 -0400 Subject: [PATCH 14/15] Use context of polling function for API calls in e2e --- test/e2e/storage/csimock/csi_volume_expansion.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/e2e/storage/csimock/csi_volume_expansion.go b/test/e2e/storage/csimock/csi_volume_expansion.go index 670e7012c9a..835fd43484d 100644 --- a/test/e2e/storage/csimock/csi_volume_expansion.go +++ b/test/e2e/storage/csimock/csi_volume_expansion.go @@ -544,9 +544,9 @@ func validateExpansionSuccess(ctx context.Context, pvc *v1.PersistentVolumeClaim func waitForResizeStatus(ctx context.Context, pvc *v1.PersistentVolumeClaim, c clientset.Interface, expectedState v1.ClaimResourceStatus) error { var actualResizeStatus v1.ClaimResourceStatus - waitErr := wait.PollUntilContextTimeout(ctx, resizePollInterval, csiResizeWaitPeriod, true, func(ctx context.Context) (bool, error) { + waitErr := wait.PollUntilContextTimeout(ctx, resizePollInterval, csiResizeWaitPeriod, true, func(pollContext context.Context) (bool, error) { var err error - updatedPVC, err := c.CoreV1().PersistentVolumeClaims(pvc.Namespace).Get(context.TODO(), pvc.Name, metav1.GetOptions{}) + updatedPVC, err := c.CoreV1().PersistentVolumeClaims(pvc.Namespace).Get(pollContext, pvc.Name, metav1.GetOptions{}) if err != nil { return false, fmt.Errorf("error fetching pvc %q for checking for resize status: %w", pvc.Name, err) @@ -563,9 +563,9 @@ func waitForResizeStatus(ctx context.Context, pvc *v1.PersistentVolumeClaim, c c func waitForAllocatedResource(ctx context.Context, pvc *v1.PersistentVolumeClaim, m *mockDriverSetup, expectedSize string) error { expectedQuantity := resource.MustParse(expectedSize) - waitErr := wait.PollUntilContextTimeout(ctx, resizePollInterval, csiResizeWaitPeriod, true, func(ctx context.Context) (bool, error) { + waitErr := wait.PollUntilContextTimeout(ctx, resizePollInterval, csiResizeWaitPeriod, true, func(pollContext context.Context) (bool, error) { var err error - updatedPVC, err := m.cs.CoreV1().PersistentVolumeClaims(pvc.Namespace).Get(context.TODO(), pvc.Name, metav1.GetOptions{}) + updatedPVC, err := m.cs.CoreV1().PersistentVolumeClaims(pvc.Namespace).Get(pollContext, pvc.Name, metav1.GetOptions{}) if err != nil { return false, fmt.Errorf("error fetching pvc %q for checking for resize status: %w", pvc.Name, err) From f7f1a6c81a004c106d97593a08b041b450df171e Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Mon, 22 Jul 2024 10:43:38 -0400 Subject: [PATCH 15/15] Address review comments and return nicer errors --- pkg/volume/csi/expander.go | 2 +- pkg/volume/util/operationexecutor/node_expander.go | 14 ++++++++------ test/e2e/storage/csimock/csi_volume_expansion.go | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/pkg/volume/csi/expander.go b/pkg/volume/csi/expander.go index f5e6738434c..e56627203af 100644 --- a/pkg/volume/csi/expander.go +++ b/pkg/volume/csi/expander.go @@ -121,7 +121,7 @@ func (c *csiPlugin) nodeExpandWithClient( } if isInfeasibleError(err) { - infeasibleError := fmt.Errorf("Expander.NodeExpand failed to expand the volume: %w", volumetypes.NewInfeasibleError(err.Error())) + infeasibleError := volumetypes.NewInfeasibleError(fmt.Sprintf("Expander.NodeExpand failed to expand the volume %s", err.Error())) return false, infeasibleError } return false, fmt.Errorf("Expander.NodeExpand failed to expand the volume : %w", err) diff --git a/pkg/volume/util/operationexecutor/node_expander.go b/pkg/volume/util/operationexecutor/node_expander.go index 743b5837497..19547c94e09 100644 --- a/pkg/volume/util/operationexecutor/node_expander.go +++ b/pkg/volume/util/operationexecutor/node_expander.go @@ -39,9 +39,9 @@ type NodeExpander struct { pvCap resource.Quantity resizeStatus v1.ClaimResourceStatus - // indicates that pvc spec size has actually changed since last we observed size - // on the kubelet - markExpansionInfeasible bool + // indicates that if volume expansion failed on the node, then current expansion should be marked + // as infeasible so as controller can reconcile the resizing operation by using new user requested size. + markExpansionInfeasibleOnFailure bool // pvcAlreadyUpdated if true indicates that although we are calling NodeExpandVolume on the kubelet // PVC has already been updated - possibly because expansion already succeeded on different node. @@ -84,12 +84,14 @@ func (ne *NodeExpander) runPreCheck() bool { } pvcSpecCap := ne.pvc.Spec.Resources.Requests[v1.ResourceStorage] + // usually when are performing node expansion, we expect pv size and pvc spec size // to be the same, but if user has edited pvc since then and volume expansion failed - // with final error, then we should let controller reconcile this state. + // with final error, then we should let controller reconcile this state, by marking entire + // node expansion as infeasible. if pvcSpecCap.Cmp(ne.pluginResizeOpts.NewSize) != 0 && ne.actualStateOfWorld.CheckVolumeInFailedExpansionWithFinalErrors(ne.vmt.VolumeName) { - ne.markExpansionInfeasible = true + ne.markExpansionInfeasibleOnFailure = true } // PVC is already expanded but we are still trying to expand the volume because @@ -138,7 +140,7 @@ func (ne *NodeExpander) expandOnPlugin() (bool, error, testResponseData) { if volumetypes.IsOperationFinishedError(resizeErr) { var markFailedError error ne.actualStateOfWorld.MarkVolumeExpansionFailedWithFinalError(ne.vmt.VolumeName) - if volumetypes.IsInfeasibleError(resizeErr) || ne.markExpansionInfeasible { + if volumetypes.IsInfeasibleError(resizeErr) || ne.markExpansionInfeasibleOnFailure { ne.pvc, markFailedError = util.MarkNodeExpansionInfeasible(ne.pvc, ne.kubeClient, resizeErr) if markFailedError != nil { klog.Errorf(ne.vmt.GenerateErrorDetailed("MountMount.NodeExpandVolume failed to mark node expansion as failed: %v", err).Error()) diff --git a/test/e2e/storage/csimock/csi_volume_expansion.go b/test/e2e/storage/csimock/csi_volume_expansion.go index 835fd43484d..47ae8be9884 100644 --- a/test/e2e/storage/csimock/csi_volume_expansion.go +++ b/test/e2e/storage/csimock/csi_volume_expansion.go @@ -556,7 +556,7 @@ func waitForResizeStatus(ctx context.Context, pvc *v1.PersistentVolumeClaim, c c return (actualResizeStatus == expectedState), nil }) if waitErr != nil { - return fmt.Errorf("error while waiting for resize status to sync to %v, actualStatus %s: %v", expectedState, actualResizeStatus, waitErr) + return fmt.Errorf("error while waiting for resize status to sync to %v, actualStatus %s: %w", expectedState, actualResizeStatus, waitErr) } return nil }