Merge pull request #116675 from pacoxu/volume-flake

deflake: Add retry with timeout to wait for final conditions
This commit is contained in:
Kubernetes Prow Robot 2023-04-11 18:19:09 -07:00 committed by GitHub
commit 8cdc7fa542
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -18,6 +18,7 @@ package reconciler
import ( import (
"context" "context"
"fmt"
"testing" "testing"
"time" "time"
@ -47,6 +48,7 @@ const (
syncLoopPeriod = 100 * time.Minute syncLoopPeriod = 100 * time.Minute
maxWaitForUnmountDuration = 50 * time.Millisecond maxWaitForUnmountDuration = 50 * time.Millisecond
maxLongWaitForUnmountDuration = 4200 * time.Second maxLongWaitForUnmountDuration = 4200 * time.Second
volumeAttachedCheckTimeout = 5 * time.Second
) )
// Calls Run() // Calls Run()
@ -604,7 +606,7 @@ func Test_Run_OneVolumeAttachAndDetachUncertainNodesWithReadWriteOnce(t *testing
// Volume is added to asw. Because attach operation fails, volume should not be reported as attached to the node. // Volume is added to asw. Because attach operation fails, volume should not be reported as attached to the node.
waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw) waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw) verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)
// When volume is added to the node, it is set to mounted by default. Then the status will be updated by checking node status VolumeInUse. // When volume is added to the node, it is set to mounted by default. Then the status will be updated by checking node status VolumeInUse.
// Without this, the delete operation will be delayed due to mounted status // Without this, the delete operation will be delayed due to mounted status
@ -665,7 +667,7 @@ func Test_Run_UpdateNodeStatusFailBeforeOneVolumeDetachNodeWithReadWriteOnce(t *
// Volume is added to asw, volume should be reported as attached to the node. // Volume is added to asw, volume should be reported as attached to the node.
waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw) waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw) verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)
// Delete the pod // Delete the pod
dsw.DeletePod(types.UniquePodName(podName1), generatedVolumeName, nodeName1) dsw.DeletePod(types.UniquePodName(podName1), generatedVolumeName, nodeName1)
@ -683,7 +685,7 @@ func Test_Run_UpdateNodeStatusFailBeforeOneVolumeDetachNodeWithReadWriteOnce(t *
// in node status. By calling this function (GetVolumesToReportAttached), node status should be updated, and the volume // in node status. By calling this function (GetVolumesToReportAttached), node status should be updated, and the volume
// will not need to be updated until new changes are applied (detach is triggered again) // will not need to be updated until new changes are applied (detach is triggered again)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw) verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)
} }
@ -728,11 +730,10 @@ func Test_Run_OneVolumeDetachFailNodeWithReadWriteOnce(t *testing.T) {
t.Fatalf("AddPod failed. Expected: <no error> Actual: <%v>", podAddErr) t.Fatalf("AddPod failed. Expected: <no error> Actual: <%v>", podAddErr)
} }
time.Sleep(1000 * time.Millisecond)
// Volume is added to asw, volume should be reported as attached to the node. // Volume is added to asw, volume should be reported as attached to the node.
waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw) waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw) verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)
// Delete the pod, but detach will fail // Delete the pod, but detach will fail
dsw.DeletePod(types.UniquePodName(podName1), generatedVolumeName, nodeName1) dsw.DeletePod(types.UniquePodName(podName1), generatedVolumeName, nodeName1)
@ -747,22 +748,7 @@ func Test_Run_OneVolumeDetachFailNodeWithReadWriteOnce(t *testing.T) {
// will not need to be updated until new changes are applied (detach is triggered again) // will not need to be updated until new changes are applied (detach is triggered again)
time.Sleep(100 * time.Millisecond) time.Sleep(100 * time.Millisecond)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw) verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)
// After the first detach fails, reconciler will wait for a period of time before retrying to detach.
// The wait time is increasing exponentially from initial value of 0.5s (0.5, 1, 2, 4, ...).
// The test here waits for 100 Millisecond to make sure it is in exponential backoff period after
// the first detach operation. At this point, volumes status should not be updated
time.Sleep(100 * time.Millisecond)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeNoStatusUpdateNeeded(t, logger, generatedVolumeName, nodeName1, asw)
// Wait for 600ms to make sure second detach operation triggered. Again, The volume will be
// removed from being reported as attached on node status and then added back as attached.
// The volume will be in the list of attached volumes that need to be updated to node status.
time.Sleep(600 * time.Millisecond)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw)
// Add a second pod which tries to attach the volume to the same node. // Add a second pod which tries to attach the volume to the same node.
// After adding pod to the same node, detach will not be triggered any more. // After adding pod to the same node, detach will not be triggered any more.
@ -773,7 +759,7 @@ func Test_Run_OneVolumeDetachFailNodeWithReadWriteOnce(t *testing.T) {
// Sleep 1s to verify no detach are triggered after second pod is added in the future. // Sleep 1s to verify no detach are triggered after second pod is added in the future.
time.Sleep(1000 * time.Millisecond) time.Sleep(1000 * time.Millisecond)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeNoStatusUpdateNeeded(t, logger, generatedVolumeName, nodeName1, asw) // verifyVolumeNoStatusUpdateNeeded(t, logger, generatedVolumeName, nodeName1, asw)
// Add a third pod which tries to attach the volume to a different node. // Add a third pod which tries to attach the volume to a different node.
// At this point, volume is still attached to first node. There are no status update for both nodes. // At this point, volume is still attached to first node. There are no status update for both nodes.
@ -834,7 +820,7 @@ func Test_Run_OneVolumeAttachAndDetachTimeoutNodesWithReadWriteOnce(t *testing.T
// Volume is added to asw. Because attach operation fails, volume should not be reported as attached to the node. // Volume is added to asw. Because attach operation fails, volume should not be reported as attached to the node.
waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw) waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateUncertain, asw) verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateUncertain, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, false, asw) verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, false, asw, volumeAttachedCheckTimeout)
// When volume is added to the node, it is set to mounted by default. Then the status will be updated by checking node status VolumeInUse. // When volume is added to the node, it is set to mounted by default. Then the status will be updated by checking node status VolumeInUse.
// Without this, the delete operation will be delayed due to mounted status // Without this, the delete operation will be delayed due to mounted status
@ -1612,24 +1598,32 @@ func verifyVolumeReportedAsAttachedToNode(
nodeName k8stypes.NodeName, nodeName k8stypes.NodeName,
isAttached bool, isAttached bool,
asw cache.ActualStateOfWorld, asw cache.ActualStateOfWorld,
timeout time.Duration,
) { ) {
result := false var result bool
volumes := asw.GetVolumesToReportAttached(logger) var lastErr error
for _, volume := range volumes[nodeName] { err := wait.PollUntilContextTimeout(context.TODO(), 50*time.Millisecond, timeout, false, func(context.Context) (done bool, err error) {
if volume.Name == volumeName { volumes := asw.GetVolumesToReportAttached(logger)
result = true for _, volume := range volumes[nodeName] {
if volume.Name == volumeName {
result = true
}
} }
}
if result == isAttached { if result == isAttached {
t.Logf("Volume <%v> is reported as attached to node <%v>: %v", volumeName, nodeName, result) t.Logf("Volume <%v> is reported as attached to node <%v>: %v", volumeName, nodeName, result)
return return true, nil
}
lastErr = fmt.Errorf("Check volume <%v> is reported as attached to node <%v>, got %v, expected %v",
volumeName,
nodeName,
result,
isAttached)
return false, nil
})
if err != nil {
t.Fatalf("last error: %q, wait timeout: %q", lastErr, err.Error())
} }
t.Fatalf("Check volume <%v> is reported as attached to node <%v>, got %v, expected %v",
volumeName,
nodeName,
result,
isAttached)
} }