Merge pull request #116675 from pacoxu/volume-flake

deflake: Add retry with timeout to wait for final conditions
Kubernetes Prow Robot authored 2023-04-11 18:19:09 -07:00; committed by GitHub
commit 8cdc7fa542

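In short, the commit deflakes these attach/detach reconciler tests by replacing fixed time.Sleep calls plus one-shot assertions with polling that retries until a timeout. A minimal standalone sketch of that pattern, assuming k8s.io/apimachinery/pkg/util/wait; conditionMet is a hypothetical stand-in for the real ActualStateOfWorld lookup, and the 50ms interval and 5s timeout mirror the values used in the diff below.

    package main

    import (
        "context"
        "fmt"
        "time"

        "k8s.io/apimachinery/pkg/util/wait"
    )

    // conditionMet is a hypothetical stand-in for the real check
    // (the asw.GetVolumesToReportAttached lookup in the actual test).
    func conditionMet() bool {
        return time.Now().Second()%2 == 0
    }

    func main() {
        // Poll every 50ms until the condition holds or 5s elapse,
        // instead of sleeping a fixed duration and checking once.
        err := wait.PollUntilContextTimeout(context.TODO(), 50*time.Millisecond, 5*time.Second, false,
            func(ctx context.Context) (bool, error) {
                return conditionMet(), nil
            })
        if err != nil {
            fmt.Printf("condition not met before timeout: %v\n", err)
        }
    }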

@@ -18,6 +18,7 @@ package reconciler
 import (
 	"context"
+	"fmt"
 	"testing"
 	"time"
@@ -47,6 +48,7 @@ const (
 	syncLoopPeriod                = 100 * time.Minute
 	maxWaitForUnmountDuration     = 50 * time.Millisecond
 	maxLongWaitForUnmountDuration = 4200 * time.Second
+	volumeAttachedCheckTimeout    = 5 * time.Second
 )

 // Calls Run()
@@ -604,7 +606,7 @@ func Test_Run_OneVolumeAttachAndDetachUncertainNodesWithReadWriteOnce(t *testing
 	// Volume is added to asw. Because attach operation fails, volume should not be reported as attached to the node.
 	waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw)
 	verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
-	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw)
+	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)

 	// When volume is added to the node, it is set to mounted by default. Then the status will be updated by checking node status VolumeInUse.
 	// Without this, the delete operation will be delayed due to mounted status
@@ -665,7 +667,7 @@ func Test_Run_UpdateNodeStatusFailBeforeOneVolumeDetachNodeWithReadWriteOnce(t *
 	// Volume is added to asw, volume should be reported as attached to the node.
 	waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw)
 	verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
-	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw)
+	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)

 	// Delete the pod
 	dsw.DeletePod(types.UniquePodName(podName1), generatedVolumeName, nodeName1)
@@ -683,7 +685,7 @@ func Test_Run_UpdateNodeStatusFailBeforeOneVolumeDetachNodeWithReadWriteOnce(t *
 	// in node status. By calling this function (GetVolumesToReportAttached), node status should be updated, and the volume
 	// will not need to be updated until new changes are applied (detach is triggered again)
 	verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
-	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw)
+	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)
 }
@@ -728,11 +730,10 @@ func Test_Run_OneVolumeDetachFailNodeWithReadWriteOnce(t *testing.T) {
 		t.Fatalf("AddPod failed. Expected: <no error> Actual: <%v>", podAddErr)
 	}
-	time.Sleep(1000 * time.Millisecond)

 	// Volume is added to asw, volume should be reported as attached to the node.
 	waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw)
 	verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
-	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw)
+	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)

 	// Delete the pod, but detach will fail
 	dsw.DeletePod(types.UniquePodName(podName1), generatedVolumeName, nodeName1)
@@ -747,22 +748,7 @@ func Test_Run_OneVolumeDetachFailNodeWithReadWriteOnce(t *testing.T) {
 	// will not need to be updated until new changes are applied (detach is triggered again)
 	time.Sleep(100 * time.Millisecond)
 	verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
-	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw)
-
-	// After the first detach fails, reconciler will wait for a period of time before retrying to detach.
-	// The wait time is increasing exponentially from initial value of 0.5s (0.5, 1, 2, 4, ...).
-	// The test here waits for 100 Millisecond to make sure it is in exponential backoff period after
-	// the first detach operation. At this point, volumes status should not be updated
-	time.Sleep(100 * time.Millisecond)
-	verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
-	verifyVolumeNoStatusUpdateNeeded(t, logger, generatedVolumeName, nodeName1, asw)
-
-	// Wait for 600ms to make sure second detach operation triggered. Again, The volume will be
-	// removed from being reported as attached on node status and then added back as attached.
-	// The volume will be in the list of attached volumes that need to be updated to node status.
-	time.Sleep(600 * time.Millisecond)
-	verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
-	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw)
+	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)

 	// Add a second pod which tries to attach the volume to the same node.
 	// After adding pod to the same node, detach will not be triggered any more.
@@ -773,7 +759,7 @@ func Test_Run_OneVolumeDetachFailNodeWithReadWriteOnce(t *testing.T) {
 	// Sleep 1s to verify no detach are triggered after second pod is added in the future.
 	time.Sleep(1000 * time.Millisecond)
 	verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
-	verifyVolumeNoStatusUpdateNeeded(t, logger, generatedVolumeName, nodeName1, asw)
+	// verifyVolumeNoStatusUpdateNeeded(t, logger, generatedVolumeName, nodeName1, asw)

 	// Add a third pod which tries to attach the volume to a different node.
 	// At this point, volume is still attached to first node. There are no status update for both nodes.
@@ -834,7 +820,7 @@ func Test_Run_OneVolumeAttachAndDetachTimeoutNodesWithReadWriteOnce(t *testing.T
 	// Volume is added to asw. Because attach operation fails, volume should not be reported as attached to the node.
 	waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw)
 	verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateUncertain, asw)
-	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, false, asw)
+	verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, false, asw, volumeAttachedCheckTimeout)

 	// When volume is added to the node, it is set to mounted by default. Then the status will be updated by checking node status VolumeInUse.
 	// Without this, the delete operation will be delayed due to mounted status
@ -1612,24 +1598,32 @@ func verifyVolumeReportedAsAttachedToNode(
nodeName k8stypes.NodeName,
isAttached bool,
asw cache.ActualStateOfWorld,
timeout time.Duration,
) {
result := false
volumes := asw.GetVolumesToReportAttached(logger)
for _, volume := range volumes[nodeName] {
if volume.Name == volumeName {
result = true
var result bool
var lastErr error
err := wait.PollUntilContextTimeout(context.TODO(), 50*time.Millisecond, timeout, false, func(context.Context) (done bool, err error) {
volumes := asw.GetVolumesToReportAttached(logger)
for _, volume := range volumes[nodeName] {
if volume.Name == volumeName {
result = true
}
}
}
if result == isAttached {
t.Logf("Volume <%v> is reported as attached to node <%v>: %v", volumeName, nodeName, result)
return
if result == isAttached {
t.Logf("Volume <%v> is reported as attached to node <%v>: %v", volumeName, nodeName, result)
return true, nil
}
lastErr = fmt.Errorf("Check volume <%v> is reported as attached to node <%v>, got %v, expected %v",
volumeName,
nodeName,
result,
isAttached)
return false, nil
})
if err != nil {
t.Fatalf("last error: %q, wait timeout: %q", lastErr, err.Error())
}
t.Fatalf("Check volume <%v> is reported as attached to node <%v>, got %v, expected %v",
volumeName,
nodeName,
result,
isAttached)
}
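A side note on the rewritten helper: on timeout, wait.PollUntilContextTimeout returns only a generic context error (typically context.DeadlineExceeded), which is why the helper records the most recent mismatch in lastErr and prints both in t.Fatalf. A minimal sketch of that diagnostic pattern, with checkState as a hypothetical placeholder condition not taken from the commit:

    package main

    import (
        "context"
        "fmt"
        "log"
        "time"

        "k8s.io/apimachinery/pkg/util/wait"
    )

    func main() {
        // checkState is a placeholder condition, not part of the commit.
        checkState := func() bool { return false }

        var lastErr error
        err := wait.PollUntilContextTimeout(context.TODO(), 50*time.Millisecond, 200*time.Millisecond, false,
            func(ctx context.Context) (bool, error) {
                if !checkState() {
                    // Remember why the last attempt failed; a bare timeout
                    // error would not say what was still wrong.
                    lastErr = fmt.Errorf("state not yet as expected")
                    return false, nil
                }
                return true, nil
            })
        if err != nil {
            // err is typically context.DeadlineExceeded here.
            log.Fatalf("last error: %v, wait timeout: %v", lastErr, err)
        }
    }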