dra delayed allocation: deallocate when a pod is done

This releases the underlying resource sooner and ensures that another consumer can get scheduled without being influenced by a decision that was made for the previous consumer. An alternative would have been to have the apiserver trigger the deallocation whenever it sees `status.reservedFor` being reduced to zero entries. But that would also trigger deallocation when kube-scheduler removes the last reservation after a failed scheduling cycle. In that case we want to keep the claim allocated and let kube-scheduler decide on a case-by-case basis which claim should get deallocated.
2026-01-05 15:37:24 +00:00 · 2023-06-28 15:33:07 +02:00
parent 4a5a242a68
commit 1b47e6433b
2 changed files with 43 additions and 0 deletions
--- a/pkg/controller/resourceclaim/controller.go
+++ b/pkg/controller/resourceclaim/controller.go
@@ -456,6 +456,28 @@ func (ec *Controller) syncClaim(ctx context.Context, namespace, name string) err
 		// TODO (#113700): patch
 		claim := claim.DeepCopy()
 		claim.Status.ReservedFor = valid
+
+		// When a ResourceClaim uses delayed allocation, it makes sense to
+		// deallocate the claim as soon as the last consumer stops using
+		// it. This ensures that the claim can be allocated again as needed by
+		// some future consumer instead of trying to schedule that consumer
+		// onto the node that was chosen for the previous consumer. It also
+		// releases the underlying resources for use by other claims.
+		//
+		// This has to be triggered by the transition from "was being used" to
+		// "is not used anymore": a DRA driver is not required to set
+		// `status.reservedFor` together with `status.allocation`, so a claim
+		// that is allocated but "currently unused" should not get deallocated.
+		//
+		// This does not matter for claims that were created for a pod. For
+		// those, the resource claim controller will trigger deletion when the
+		// pod is done. However, it doesn't hurt to also trigger deallocation
+		// for such claims, and not checking for them keeps this code simpler.
+		if len(valid) == 0 &&
+			claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer {
+			claim.Status.DeallocationRequested = true
+		}
+
 		_, err := ec.kubeClient.ResourceV1alpha2().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{})
 		if err != nil {
 			return err
--- a/test/e2e/dra/dra.go
+++ b/test/e2e/dra/dra.go
@@ -254,6 +254,27 @@ var _ = ginkgo.Describe("[sig-node] DRA [Feature:DynamicResourceAllocation]", fu
 		ginkgo.Context("with immediate allocation", func() {
 			claimTests(resourcev1alpha2.AllocationModeImmediate)
 		})
+
+		ginkgo.It("must deallocate after use when using delayed allocation", func(ctx context.Context) {
+			parameters := b.parameters()
+			pod := b.podExternal()
+			claim := b.externalClaim(resourcev1alpha2.AllocationModeWaitForFirstConsumer)
+			b.create(ctx, parameters, claim, pod)
+
+			gomega.Eventually(ctx, func(ctx context.Context) (*resourcev1alpha2.ResourceClaim, error) {
+				return b.f.ClientSet.ResourceV1alpha2().ResourceClaims(b.f.Namespace.Name).Get(ctx, claim.Name, metav1.GetOptions{})
+			}).WithTimeout(f.Timeouts.PodDelete).ShouldNot(gomega.HaveField("Status.Allocation", (*resourcev1alpha2.AllocationResult)(nil)))
+
+			b.testPod(ctx, f.ClientSet, pod)
+
+			ginkgo.By(fmt.Sprintf("deleting pod %s", klog.KObj(pod)))
+			framework.ExpectNoError(b.f.ClientSet.CoreV1().Pods(b.f.Namespace.Name).Delete(ctx, pod.Name, metav1.DeleteOptions{}))
+
+			ginkgo.By("waiting for claim to get deallocated")
+			gomega.Eventually(ctx, func(ctx context.Context) (*resourcev1alpha2.ResourceClaim, error) {
+				return b.f.ClientSet.ResourceV1alpha2().ResourceClaims(b.f.Namespace.Name).Get(ctx, claim.Name, metav1.GetOptions{})
+			}).WithTimeout(f.Timeouts.PodDelete).Should(gomega.HaveField("Status.Allocation", (*resourcev1alpha2.AllocationResult)(nil)))
+		})
 	})

 	ginkgo.Context("multiple nodes", func() {