From 6f9140e421c09ef0955d45916c51d25eabaa53a6 Mon Sep 17 00:00:00 2001
From: Patrick Ohly
Date: Tue, 5 Sep 2023 15:01:09 +0200
Subject: [PATCH] DRA scheduler: stop allocating before deallocation

This fixes a test flake:

    [sig-node] DRA [Feature:DynamicResourceAllocation] multiple nodes reallocation [It] works
    /nvme/gopath/src/k8s.io/kubernetes/test/e2e/dra/dra.go:552
      [FAILED] number of deallocations
      Expected
          : 2
      to equal
          : 1
      In [It] at: /nvme/gopath/src/k8s.io/kubernetes/test/e2e/dra/dra.go:651 @ 09/05/23 14:01:54.652

This can be reproduced locally with

    stress -p 10 go test ./test/e2e -args -ginkgo.focus=DynamicResourceAllocation.*reallocation.works -ginkgo.no-color -v=4 -ginkgo.v

Log output showed that the sequence of events leading to this was:
- claim gets allocated because of selected node
- a different node has to be used, so PostFilter sets claim.status.deallocationRequested
- the driver deallocates
- before the scheduler can react and select a different node, the driver allocates *again* for the original node
- the scheduler asks for deallocation again
- the driver deallocates again (causing the test failure)
- eventually the pod runs

The fix is to disable allocations first by removing the selected node and then starting to deallocate.
---
 .../plugins/dynamicresources/dynamicresources.go | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go
index 4f70fba83a2..2c0541b1edd 100644
--- a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go
+++ b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go
@@ -42,6 +42,7 @@ import (
 	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
 	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
 	schedutil "k8s.io/kubernetes/pkg/scheduler/util"
+	"k8s.io/utils/ptr"
 )
 
 const (
@@ -762,6 +763,19 @@ func (pl *dynamicResources) PostFilter(ctx context.Context, cs *framework.CycleS
 			claim := state.claims[index]
 			if len(claim.Status.ReservedFor) == 0 ||
 				len(claim.Status.ReservedFor) == 1 && claim.Status.ReservedFor[0].UID == pod.UID {
+				// Before we tell a driver to deallocate a claim, we
+				// have to stop telling it to allocate. Otherwise,
+				// depending on timing, it will deallocate the claim,
+				// see a PodSchedulingContext with selected node, and
+				// allocate again for that same node.
+				if state.podSchedulingState.schedulingCtx != nil &&
+					state.podSchedulingState.schedulingCtx.Spec.SelectedNode != "" {
+					state.podSchedulingState.selectedNode = ptr.To("")
+					if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil {
+						return nil, statusError(logger, err)
+					}
+				}
+
 				claim := state.claims[index].DeepCopy()
 				claim.Status.DeallocationRequested = true
 				claim.Status.ReservedFor = nil
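
For readers outside the scheduler code base, a minimal standalone sketch of the ordering the patch enforces may help. The types and the requestDeallocation helper below are simplified stand-ins invented for illustration, not the scheduler's real podSchedulingState or ResourceClaim APIs.

    // Illustrative sketch only: clear the selected node before asking for
    // deallocation, mirroring the ordering introduced by this patch.
    package main

    import "fmt"

    // schedulingContext stands in for the published PodSchedulingContext.
    type schedulingContext struct {
    	selectedNode string
    }

    // claim stands in for the parts of a ResourceClaim status that the
    // driver reacts to.
    type claim struct {
    	deallocationRequested bool
    }

    // requestDeallocation first clears the selected node, so the driver has
    // no reason to allocate for that node again, and only then requests
    // deallocation.
    func requestDeallocation(sc *schedulingContext, c *claim) {
    	if sc != nil && sc.selectedNode != "" {
    		// Step 1: publish an empty selected node. Until the driver
    		// sees this, it could re-allocate the claim for the old node
    		// right after deallocating it.
    		sc.selectedNode = ""
    	}
    	// Step 2: now it is safe to ask the driver to deallocate.
    	c.deallocationRequested = true
    }

    func main() {
    	sc := &schedulingContext{selectedNode: "node-1"}
    	c := &claim{}
    	requestDeallocation(sc, c)
    	fmt.Println(sc.selectedNode == "", c.deallocationRequested) // true true
    }

In the real plugin, step 1 corresponds to publishing the PodSchedulingContext with an empty selected node via state.podSchedulingState.publish, and step 2 to setting claim.Status.DeallocationRequested on a copy of the claim, as shown in the diff above.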