Merge pull request #116675 from pacoxu/volume-flake

deflake: Add retry with timeout to wait for final conditions
Kubernetes Prow Robot authored 2023-04-11 18:19:09 -07:00; committed by GitHub
commit 8cdc7fa542

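In short, the commit deflakes these attach/detach reconciler tests by replacing fixed time.Sleep calls plus one-shot assertions with polling that retries until a timeout. A minimal standalone sketch of that pattern, assuming k8s.io/apimachinery/pkg/util/wait; conditionMet is a hypothetical stand-in for the real ActualStateOfWorld lookup, and the 50ms interval and 5s timeout mirror the values used in the diff below.

    package main

    import (
        "context"
        "fmt"
        "time"

        "k8s.io/apimachinery/pkg/util/wait"
    )

    // conditionMet is a hypothetical stand-in for the real check
    // (the asw.GetVolumesToReportAttached lookup in the actual test).
    func conditionMet() bool {
        return time.Now().Second()%2 == 0
    }

    func main() {
        // Poll every 50ms until the condition holds or 5s elapse,
        // instead of sleeping a fixed duration and checking once.
        err := wait.PollUntilContextTimeout(context.TODO(), 50*time.Millisecond, 5*time.Second, false,
            func(ctx context.Context) (bool, error) {
                return conditionMet(), nil
            })
        if err != nil {
            fmt.Printf("condition not met before timeout: %v\n", err)
        }
    }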

@@ -18,6 +18,7 @@ package reconciler
 import (
 	"context"
+	"fmt"
 	"testing"
 	"time"
@@ -47,6 +48,7 @@ const (
 	syncLoopPeriod                = 100 * time.Minute
 	maxWaitForUnmountDuration     = 50 * time.Millisecond
 	maxLongWaitForUnmountDuration = 4200 * time.Second
+	volumeAttachedCheckTimeout    = 5 * time.Second
 )

 // Calls Run()
@@ -604,7 +606,7 @@ func Test_Run_OneVolumeAttachAndDetachUncertainNodesWithReadWriteOnce(t *testing
 	// Volume is added to asw. Because attach operation fails, volume should not be reported as attached to the node.
 	waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw)
 	verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
-	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw)
+	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)

 	// When volume is added to the node, it is set to mounted by default. Then the status will be updated by checking node status VolumeInUse.
 	// Without this, the delete operation will be delayed due to mounted status
@@ -665,7 +667,7 @@ func Test_Run_UpdateNodeStatusFailBeforeOneVolumeDetachNodeWithReadWriteOnce(t *
 	// Volume is added to asw, volume should be reported as attached to the node.
 	waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw)
 	verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
-	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw)
+	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)

 	// Delete the pod
 	dsw.DeletePod(types.UniquePodName(podName1), generatedVolumeName, nodeName1)
@@ -683,7 +685,7 @@ func Test_Run_UpdateNodeStatusFailBeforeOneVolumeDetachNodeWithReadWriteOnce(t *
 	// in node status. By calling this function (GetVolumesToReportAttached), node status should be updated, and the volume
 	// will not need to be updated until new changes are applied (detach is triggered again)
 	verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
-	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw)
+	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)
 }
@@ -728,11 +730,10 @@ func Test_Run_OneVolumeDetachFailNodeWithReadWriteOnce(t *testing.T) {
 		t.Fatalf("AddPod failed. Expected: <no error> Actual: <%v>", podAddErr)
 	}
-	time.Sleep(1000 * time.Millisecond)

 	// Volume is added to asw, volume should be reported as attached to the node.
 	waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw)
 	verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
-	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw)
+	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)

 	// Delete the pod, but detach will fail
 	dsw.DeletePod(types.UniquePodName(podName1), generatedVolumeName, nodeName1)
@@ -747,22 +748,7 @@ func Test_Run_OneVolumeDetachFailNodeWithReadWriteOnce(t *testing.T) {
 	// will not need to be updated until new changes are applied (detach is triggered again)
 	time.Sleep(100 * time.Millisecond)
 	verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
-	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw)
-
-	// After the first detach fails, reconciler will wait for a period of time before retrying to detach.
-	// The wait time is increasing exponentially from initial value of 0.5s (0.5, 1, 2, 4, ...).
-	// The test here waits for 100 Millisecond to make sure it is in exponential backoff period after
-	// the first detach operation. At this point, volumes status should not be updated
-	time.Sleep(100 * time.Millisecond)
-	verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
-	verifyVolumeNoStatusUpdateNeeded(t, logger, generatedVolumeName, nodeName1, asw)
-
-	// Wait for 600ms to make sure second detach operation triggered. Again, The volume will be
-	// removed from being reported as attached on node status and then added back as attached.
-	// The volume will be in the list of attached volumes that need to be updated to node status.
-	time.Sleep(600 * time.Millisecond)
-	verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
-	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw)
+	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)

 	// Add a second pod which tries to attach the volume to the same node.
 	// After adding pod to the same node, detach will not be triggered any more.
@@ -773,7 +759,7 @@ func Test_Run_OneVolumeDetachFailNodeWithReadWriteOnce(t *testing.T) {
 	// Sleep 1s to verify no detach are triggered after second pod is added in the future.
 	time.Sleep(1000 * time.Millisecond)
 	verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
-	verifyVolumeNoStatusUpdateNeeded(t, logger, generatedVolumeName, nodeName1, asw)
+	// verifyVolumeNoStatusUpdateNeeded(t, logger, generatedVolumeName, nodeName1, asw)

 	// Add a third pod which tries to attach the volume to a different node.
 	// At this point, volume is still attached to first node. There are no status update for both nodes.
@@ -834,7 +820,7 @@ func Test_Run_OneVolumeAttachAndDetachTimeoutNodesWithReadWriteOnce(t *testing.T
 	// Volume is added to asw. Because attach operation fails, volume should not be reported as attached to the node.
 	waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw)
 	verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateUncertain, asw)
-	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, false, asw)
+	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, false, asw, volumeAttachedCheckTimeout)

 	// When volume is added to the node, it is set to mounted by default. Then the status will be updated by checking node status VolumeInUse.
 	// Without this, the delete operation will be delayed due to mounted status
@ -1612,24 +1598,32 @@ func verifyVolumeReportedAsAttachedToNode(
nodeName k8stypes.NodeName,
isAttached bool,
asw cache.ActualStateOfWorld,
timeout time.Duration,
) {
result := false
volumes := asw.GetVolumesToReportAttached(logger)
for _, volume := range volumes[nodeName] {
if volume.Name == volumeName {
result = true
var result bool
var lastErr error
err := wait.PollUntilContextTimeout(context.TODO(), 50*time.Millisecond, timeout, false, func(context.Context) (done bool, err error) {
volumes := asw.GetVolumesToReportAttached(logger)
for _, volume := range volumes[nodeName] {
if volume.Name == volumeName {
result = true
}
}
}
if result == isAttached {
t.Logf("Volume <%v> is reported as attached to node <%v>: %v", volumeName, nodeName, result)
return
if result == isAttached {
t.Logf("Volume <%v> is reported as attached to node <%v>: %v", volumeName, nodeName, result)
return true, nil
}
lastErr = fmt.Errorf("Check volume <%v> is reported as attached to node <%v>, got %v, expected %v",
volumeName,
nodeName,
result,
isAttached)
return false, nil
})
if err != nil {
t.Fatalf("last error: %q, wait timeout: %q", lastErr, err.Error())
}
t.Fatalf("Check volume <%v> is reported as attached to node <%v>, got %v, expected %v",
volumeName,
nodeName,
result,
isAttached)
}
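A side note on the rewritten helper: on timeout, wait.PollUntilContextTimeout returns only a generic context error (typically context.DeadlineExceeded), which is why the helper records the most recent mismatch in lastErr and prints both in t.Fatalf. A minimal sketch of that diagnostic pattern, with checkState as a hypothetical placeholder condition not taken from the commit:

    package main

    import (
        "context"
        "fmt"
        "log"
        "time"

        "k8s.io/apimachinery/pkg/util/wait"
    )

    func main() {
        // checkState is a placeholder condition, not part of the commit.
        checkState := func() bool { return false }

        var lastErr error
        err := wait.PollUntilContextTimeout(context.TODO(), 50*time.Millisecond, 200*time.Millisecond, false,
            func(ctx context.Context) (bool, error) {
                if !checkState() {
                    // Remember why the last attempt failed; a bare timeout
                    // error would not say what was still wrong.
                    lastErr = fmt.Errorf("state not yet as expected")
                    return false, nil
                }
                return true, nil
            })
        if err != nil {
            // err is typically context.DeadlineExceeded here.
            log.Fatalf("last error: %v, wait timeout: %v", lastErr, err)
        }
    }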