From 6f9140e421c09ef0955d45916c51d25eabaa53a6 Mon Sep 17 00:00:00 2001
From: Patrick Ohly
Date: Tue, 5 Sep 2023 15:01:09 +0200
Subject: [PATCH] DRA scheduler: stop allocating before deallocation

This fixes a test flake:

    [sig-node] DRA [Feature:DynamicResourceAllocation] multiple nodes reallocation [It] works
    /nvme/gopath/src/k8s.io/kubernetes/test/e2e/dra/dra.go:552
      [FAILED] number of deallocations
      Expected
          : 2
      to equal
          : 1
      In [It] at: /nvme/gopath/src/k8s.io/kubernetes/test/e2e/dra/dra.go:651 @ 09/05/23 14:01:54.652

This can be reproduced locally with

    stress -p 10 go test ./test/e2e -args -ginkgo.focus=DynamicResourceAllocation.*reallocation.works -ginkgo.no-color -v=4 -ginkgo.v

Log output showed that the sequence of events leading to this was:
- claim gets allocated because of selected node
- a different node has to be used, so PostFilter sets claim.status.deallocationRequested
- the driver deallocates
- before the scheduler can react and select a different node, the driver allocates *again* for the original node
- the scheduler asks for deallocation again
- the driver deallocates again (causing the test failure)
- eventually the pod runs

The fix is to disable allocations first by removing the selected node and then starting to deallocate.
---
 .../plugins/dynamicresources/dynamicresources.go | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go
index 4f70fba83a2..2c0541b1edd 100644
--- a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go
+++ b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go
@@ -42,6 +42,7 @@ import (
 	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
 	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
 	schedutil "k8s.io/kubernetes/pkg/scheduler/util"
+	"k8s.io/utils/ptr"
 )
 
 const (
@@ -762,6 +763,19 @@ func (pl *dynamicResources) PostFilter(ctx context.Context, cs *framework.CycleS
 			claim := state.claims[index]
 			if len(claim.Status.ReservedFor) == 0 ||
 				len(claim.Status.ReservedFor) == 1 && claim.Status.ReservedFor[0].UID == pod.UID {
+				// Before we tell a driver to deallocate a claim, we
+				// have to stop telling it to allocate. Otherwise,
+				// depending on timing, it will deallocate the claim,
+				// see a PodSchedulingContext with selected node, and
+				// allocate again for that same node.
+				if state.podSchedulingState.schedulingCtx != nil &&
+					state.podSchedulingState.schedulingCtx.Spec.SelectedNode != "" {
+					state.podSchedulingState.selectedNode = ptr.To("")
+					if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil {
+						return nil, statusError(logger, err)
+					}
+				}
+
 				claim := state.claims[index].DeepCopy()
 				claim.Status.DeallocationRequested = true
 				claim.Status.ReservedFor = nil
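
For readers outside the scheduler code base, a minimal standalone sketch of the ordering the patch enforces may help. The types and the requestDeallocation helper below are simplified stand-ins invented for illustration, not the scheduler's real podSchedulingState or ResourceClaim APIs.

    // Illustrative sketch only: clear the selected node before asking for
    // deallocation, mirroring the ordering introduced by this patch.
    package main

    import "fmt"

    // schedulingContext stands in for the published PodSchedulingContext.
    type schedulingContext struct {
    	selectedNode string
    }

    // claim stands in for the parts of a ResourceClaim status that the
    // driver reacts to.
    type claim struct {
    	deallocationRequested bool
    }

    // requestDeallocation first clears the selected node, so the driver has
    // no reason to allocate for that node again, and only then requests
    // deallocation.
    func requestDeallocation(sc *schedulingContext, c *claim) {
    	if sc != nil && sc.selectedNode != "" {
    		// Step 1: publish an empty selected node. Until the driver
    		// sees this, it could re-allocate the claim for the old node
    		// right after deallocating it.
    		sc.selectedNode = ""
    	}
    	// Step 2: now it is safe to ask the driver to deallocate.
    	c.deallocationRequested = true
    }

    func main() {
    	sc := &schedulingContext{selectedNode: "node-1"}
    	c := &claim{}
    	requestDeallocation(sc, c)
    	fmt.Println(sc.selectedNode == "", c.deallocationRequested) // true true
    }

In the real plugin, step 1 corresponds to publishing the PodSchedulingContext with an empty selected node via state.podSchedulingState.publish, and step 2 to setting claim.Status.DeallocationRequested on a copy of the claim, as shown in the diff above.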