From c143a875edbbe7de544a3ec62873797687cf760d Mon Sep 17 00:00:00 2001
From: Patrick Ohly
Date: Mon, 10 Jul 2023 12:13:41 +0200
Subject: [PATCH] dra e2e: fix "reallocation works" flake

The main problem probably was that
https://github.com/kubernetes/kubernetes/pull/118862 moved creating the
first pod before setting up the callback which blocks allocating one
claim for that pod. This is racy because allocations happen in the
background.

The test also was unnecessarily complex and hard to read:
- The intended effect can be achieved with three instead of four claims.
- It wasn't clear which claim has "external-claim-other" as name. Using
  the claim variable avoids that.
---
 test/e2e/dra/dra.go | 36 ++++++++++++++----------------------
 1 file changed, 14 insertions(+), 22 deletions(-)

diff --git a/test/e2e/dra/dra.go b/test/e2e/dra/dra.go
index 03fb397bfba..1f2e8a258ae 100644
--- a/test/e2e/dra/dra.go
+++ b/test/e2e/dra/dra.go
@@ -430,14 +430,14 @@ var _ = ginkgo.Describe("[sig-node] DRA [Feature:DynamicResourceAllocation]", fu
 			driver := NewDriver(f, nodes, func() app.Resources {
 				return app.Resources{
 					NodeLocal:      true,
-					MaxAllocations: 2,
+					MaxAllocations: 1,
 					Nodes:          nodes.NodeNames,
 				}
 			})
 			driver2 := NewDriver(f, nodes, func() app.Resources {
 				return app.Resources{
 					NodeLocal:      true,
-					MaxAllocations: 2,
+					MaxAllocations: 1,
 					Nodes:          nodes.NodeNames,
 
 					AllocateWrapper: func(
@@ -476,35 +476,27 @@ var _ = ginkgo.Describe("[sig-node] DRA [Feature:DynamicResourceAllocation]", fu
 				parameters1 := b.parameters()
 				parameters2 := b2.parameters()
 
-				pod1 := b.podExternal()
-				pod2 := b2.podExternal()
+				// Order is relevant here: each pod must be matched with its own claim.
 				pod1claim1 := b.externalClaim(resourcev1alpha2.AllocationModeWaitForFirstConsumer)
+				pod1 := b.podExternal()
 				pod2claim1 := b2.externalClaim(resourcev1alpha2.AllocationModeWaitForFirstConsumer)
-				pod1claim2 := b2.externalClaim(resourcev1alpha2.AllocationModeWaitForFirstConsumer)
-				pod1claim3 := b2.externalClaim(resourcev1alpha2.AllocationModeWaitForFirstConsumer)
+				pod2 := b2.podExternal()
+
+				// Add another claim to pod1.
+				pod1claim2 := b2.externalClaim(resourcev1alpha2.AllocationModeWaitForFirstConsumer)
 				pod1.Spec.ResourceClaims = append(pod1.Spec.ResourceClaims,
 					v1.PodResourceClaim{
-						Name: "claim-other1",
+						Name: "claim-other",
 						Source: v1.ClaimSource{
 							ResourceClaimName: &pod1claim2.Name,
 						},
 					},
-					v1.PodResourceClaim{
-						Name: "claim-other2",
-						Source: v1.ClaimSource{
-							ResourceClaimName: &pod1claim3.Name,
-						},
-					},
 				)
 
-				// Block on the second, third external claim from driver2 that is to be allocated.
+				// Allocating the second claim in pod1 has to wait until pod2 has
+				// consumed the available resources on the node.
 				blockClaim, cancelBlockClaim := context.WithCancel(ctx)
 				defer cancelBlockClaim()
-
-				b2.create(ctx, parameters2, pod1claim2, pod1claim3)
 
-				b.create(ctx, parameters1, pod1claim1, pod1)
-
 				allocateWrapper2 = func(ctx context.Context,
 					claimAllocations []*controller.ClaimAllocation,
 					selectedNode string,
@@ -512,15 +504,15 @@ var _ = ginkgo.Describe("[sig-node] DRA [Feature:DynamicResourceAllocation]", fu
 						claimAllocations []*controller.ClaimAllocation,
 						selectedNode string),
 				) {
-					// pod1 will have only external-claim[2-3], it has to wait
-					// for pod2 to consume resources from driver2
-					if claimAllocations[0].Claim.Name != "external-claim-other" {
+					if claimAllocations[0].Claim.Name == pod1claim2.Name {
 						<-blockClaim.Done()
 					}
 					handler(ctx, claimAllocations, selectedNode)
 					return
 				}
 
+				b.create(ctx, parameters1, parameters2, pod1claim1, pod1claim2, pod1)
+
 				ginkgo.By("waiting for one claim from driver1 to be allocated")
 				var nodeSelector *v1.NodeSelector
 				gomega.Eventually(ctx, func(ctx context.Context) (int, error) {
@@ -552,7 +544,7 @@ var _ = ginkgo.Describe("[sig-node] DRA [Feature:DynamicResourceAllocation]", fu
 				b2.create(ctx, pod2claim1, pod2)
 				framework.ExpectNoError(e2epod.WaitForPodRunningInNamespace(ctx, f.ClientSet, pod2), "start pod 2")
 
-				// Allow allocation of pod1 claim2, claim3 to proceed. It should fail now
+				// Allow allocation of second claim in pod1 to proceed. It should fail now
 				// and the other node must be used instead, after deallocating
 				// the first claim.
 				ginkgo.By("move first pod to other node")
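For readers who do not know the e2e helpers, the ordering issue described in
the commit message can be reduced to the self-contained Go sketch below. The
allocator type, the channel-based "create", and the claim names are
illustrative stand-ins, not the test's real APIs; the only point is that the
blocking wrapper has to be installed before the claims are created, because
allocation happens in a background goroutine.

package main

import (
	"context"
	"fmt"
	"time"
)

// allocator stands in for the DRA test driver: every claim sent on the
// channel gets allocated by a background goroutine, optionally routed
// through a wrapper hook first.
type allocator struct {
	claims  chan string
	wrapper func(ctx context.Context, claim string, allocate func(string))
}

func (a *allocator) run(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			return
		case claim := <-a.claims:
			allocate := func(c string) { fmt.Println("allocated", c) }
			if a.wrapper != nil {
				a.wrapper(ctx, claim, allocate)
			} else {
				allocate(claim)
			}
		}
	}
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	a := &allocator{claims: make(chan string, 10)}

	// Install the blocking wrapper *before* any claim is created. Because
	// allocation happens in the background, installing it afterwards would
	// race with the allocator, which is the kind of flake the patch fixes.
	blockClaim, cancelBlockClaim := context.WithCancel(ctx)
	a.wrapper = func(ctx context.Context, claim string, allocate func(string)) {
		if claim == "pod1-claim2" {
			// Hold this one claim back until the test unblocks it.
			<-blockClaim.Done()
		}
		allocate(claim)
	}
	go a.run(ctx)

	// "Create" the claims; only pod1-claim1 is allocated immediately.
	a.claims <- "pod1-claim1"
	a.claims <- "pod1-claim2"
	time.Sleep(100 * time.Millisecond)

	fmt.Println("unblocking the second claim")
	cancelBlockClaim()
	time.Sleep(100 * time.Millisecond)
}

With the wrapper in place up front, "pod1-claim2" is guaranteed to stay
unallocated until cancelBlockClaim() is called, which is the property the
test depends on and which the patch restores by creating the claims and pod1
only after assigning allocateWrapper2.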