dra: handle scheduled pods in kube-controller-manager

When someone decides that a Pod should definitely run on a specific node, they can create the Pod with spec.nodeName already set. Some custom scheduler might do that. Then kubelet starts to check the pod and (if DRA is enabled) will refuse to run it, either because the claims are still waiting for the first consumer or the pod wasn't added to reservedFor. Both are things the scheduler normally does. Also, if a pod got scheduled while the DRA feature was off in the kube-scheduler, a pod can reach the same state. The resource claim controller can handle these two cases by taking over for the kube-scheduler when nodeName is set. Triggering an allocation is simpler than in the scheduler because all it takes is creating the right PodSchedulingContext with spec.selectedNode set. There's no need to list nodes because that choice was already made, permanently. Adding the pod to reservedFor also isn't hard. What's currently missing is triggering de-allocation of claims to re-allocate them for the desired node. This is not important for claims that get created for the pod from a template and then only get used once, but it might be worthwhile to add de-allocation in the future.
2026-01-05 07:27:21 +00:00 · 2023-05-22 19:44:58 +02:00
parent cffbb1f1b2
commit 80ab8f0542
6 changed files with 337 additions and 71 deletions
--- a/cmd/kube-controller-manager/app/core.go
+++ b/cmd/kube-controller-manager/app/core.go
@@ -356,6 +356,7 @@ func startResourceClaimController(ctx context.Context, controllerContext Control
 		klog.FromContext(ctx),
 		controllerContext.ClientBuilder.ClientOrDie("resource-claim-controller"),
 		controllerContext.InformerFactory.Core().V1().Pods(),
+		controllerContext.InformerFactory.Resource().V1alpha2().PodSchedulingContexts(),
 		controllerContext.InformerFactory.Resource().V1alpha2().ResourceClaims(),
 		controllerContext.InformerFactory.Resource().V1alpha2().ResourceClaimTemplates())
 	if err != nil {
--- a/pkg/controller/resourceclaim/controller.go
+++ b/pkg/controller/resourceclaim/controller.go
@@ -87,6 +87,13 @@ type Controller struct {
 	podLister v1listers.PodLister
 	podSynced cache.InformerSynced

+	// podSchedulingList is the shared PodSchedulingContext lister used to
+	// fetch scheduling objects from the API server. It is shared with other
+	// controllers and therefore the objects in its store should be treated
+	// as immutable.
+	podSchedulingLister resourcev1alpha2listers.PodSchedulingContextLister
+	podSchedulingSynced cache.InformerSynced
+
 	// templateLister is the shared ResourceClaimTemplate lister used to
 	// fetch template objects from the API server. It is shared with other
 	// controllers and therefore the objects in its store should be treated
@@ -119,20 +126,23 @@ func NewController(
 	logger klog.Logger,
 	kubeClient clientset.Interface,
 	podInformer v1informers.PodInformer,
+	podSchedulingInformer resourcev1alpha2informers.PodSchedulingContextInformer,
 	claimInformer resourcev1alpha2informers.ResourceClaimInformer,
 	templateInformer resourcev1alpha2informers.ResourceClaimTemplateInformer) (*Controller, error) {

 	ec := &Controller{
-		kubeClient:      kubeClient,
-		podLister:       podInformer.Lister(),
-		podIndexer:      podInformer.Informer().GetIndexer(),
-		podSynced:       podInformer.Informer().HasSynced,
-		claimLister:     claimInformer.Lister(),
-		claimsSynced:    claimInformer.Informer().HasSynced,
-		templateLister:  templateInformer.Lister(),
-		templatesSynced: templateInformer.Informer().HasSynced,
-		queue:           workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "resource_claim"),
-		deletedObjects:  newUIDCache(maxUIDCacheEntries),
+		kubeClient:          kubeClient,
+		podLister:           podInformer.Lister(),
+		podIndexer:          podInformer.Informer().GetIndexer(),
+		podSynced:           podInformer.Informer().HasSynced,
+		podSchedulingLister: podSchedulingInformer.Lister(),
+		podSchedulingSynced: podSchedulingInformer.Informer().HasSynced,
+		claimLister:         claimInformer.Lister(),
+		claimsSynced:        claimInformer.Informer().HasSynced,
+		templateLister:      templateInformer.Lister(),
+		templatesSynced:     templateInformer.Informer().HasSynced,
+		queue:               workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "resource_claim"),
+		deletedObjects:      newUIDCache(maxUIDCacheEntries),
 	}

 	metrics.RegisterMetrics()
@@ -300,9 +310,40 @@ func (ec *Controller) podNeedsWork(pod *v1.Pod) (bool, string) {
 			return false, "conflicting claim needs to be removed by user"
 		}

+		// This check skips over the reasons below that only apply
+		// when a pod has been scheduled already. We need to keep checking
+		// for more claims that might need to be created.
 		if pod.Spec.NodeName == "" {
-			// Scheduler will handle PodSchedulingContext and reservations.
-			return false, "pod not scheduled"
+			continue
+		}
+
+		// Create PodSchedulingContext if the pod got scheduled without triggering
+		// delayed allocation.
+		//
+		// These can happen when:
+		// - a user created a pod with spec.nodeName set, perhaps for testing
+		// - some scheduler was used which is unaware of DRA
+		// - DRA was not enabled in kube-scheduler (version skew, configuration)
+		if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer &&
+			claim.Status.Allocation == nil {
+			scheduling, err := ec.podSchedulingLister.PodSchedulingContexts(pod.Namespace).Get(pod.Name)
+			if apierrors.IsNotFound(err) {
+				return true, "need to create PodSchedulingContext for scheduled pod"
+			}
+			if err != nil {
+				// Shouldn't happen.
+				return true, fmt.Sprintf("internal error while checking for PodSchedulingContext: %v", err)
+			}
+			if scheduling.Spec.SelectedNode != pod.Spec.NodeName {
+				// Need to update PodSchedulingContext.
+				return true, "need to updated PodSchedulingContext for scheduled pod"
+			}
+		}
+		if claim.Status.Allocation != nil &&
+			!resourceclaim.IsReservedForPod(pod, claim) &&
+			resourceclaim.CanBeReserved(claim) {
+			// Need to reserve it.
+			return true, "need to reserve claim for pod"
 		}
 	}

@@ -459,6 +500,49 @@ func (ec *Controller) syncPod(ctx context.Context, namespace, name string) error
 		}
 	}

+	if pod.Spec.NodeName == "" {
+		// Scheduler will handle PodSchedulingContext and reservations.
+		logger.V(5).Info("nothing to do for pod, scheduler will deal with it")
+		return nil
+	}
+
+	for _, podClaim := range pod.Spec.ResourceClaims {
+		claimName, checkOwner, err := resourceclaim.Name(pod, &podClaim)
+		if err != nil {
+			return err
+		}
+		// If nil, then it has been determined that the claim is not needed
+		// and can be skipped.
+		if claimName == nil {
+			continue
+		}
+		claim, err := ec.claimLister.ResourceClaims(pod.Namespace).Get(*claimName)
+		if apierrors.IsNotFound(err) {
+			return nil
+		}
+		if err != nil {
+			return fmt.Errorf("retrieve claim: %v", err)
+		}
+		if checkOwner {
+			if err := resourceclaim.IsForPod(pod, claim); err != nil {
+				return err
+			}
+		}
+		if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer &&
+			claim.Status.Allocation == nil {
+			logger.V(5).Info("create PodSchedulingContext because claim needs to be allocated", "resourceClaim", klog.KObj(claim))
+			return ec.ensurePodSchedulingContext(ctx, pod)
+		}
+		if claim.Status.Allocation != nil &&
+			!resourceclaim.IsReservedForPod(pod, claim) &&
+			resourceclaim.CanBeReserved(claim) {
+			logger.V(5).Info("reserve claim for pod", "resourceClaim", klog.KObj(claim))
+			if err := ec.reserveForPod(ctx, pod, claim); err != nil {
+				return err
+			}
+		}
+	}
+
 	return nil
 }

@@ -618,6 +702,64 @@ func (ec *Controller) findPodResourceClaim(pod *v1.Pod, podClaim v1.PodResourceC
 	return nil, nil
 }

+func (ec *Controller) ensurePodSchedulingContext(ctx context.Context, pod *v1.Pod) error {
+	scheduling, err := ec.podSchedulingLister.PodSchedulingContexts(pod.Namespace).Get(pod.Name)
+	if err != nil && !apierrors.IsNotFound(err) {
+		return fmt.Errorf("retrieve PodSchedulingContext: %v", err)
+	}
+	if scheduling == nil {
+		scheduling = &resourcev1alpha2.PodSchedulingContext{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      pod.Name,
+				Namespace: pod.Namespace,
+				OwnerReferences: []metav1.OwnerReference{
+					{
+						APIVersion: "v1",
+						Kind:       "Pod",
+						Name:       pod.Name,
+						UID:        pod.UID,
+						Controller: pointer.Bool(true),
+					},
+				},
+			},
+			Spec: resourcev1alpha2.PodSchedulingContextSpec{
+				SelectedNode: pod.Spec.NodeName,
+				// There is no need for negotiation about
+				// potential and suitable nodes anymore, so
+				// PotentialNodes can be left empty.
+			},
+		}
+		if _, err := ec.kubeClient.ResourceV1alpha2().PodSchedulingContexts(pod.Namespace).Create(ctx, scheduling, metav1.CreateOptions{}); err != nil {
+			return fmt.Errorf("create PodSchedulingContext: %v", err)
+		}
+		return nil
+	}
+
+	if scheduling.Spec.SelectedNode != pod.Spec.NodeName {
+		scheduling := scheduling.DeepCopy()
+		scheduling.Spec.SelectedNode = pod.Spec.NodeName
+		if _, err := ec.kubeClient.ResourceV1alpha2().PodSchedulingContexts(pod.Namespace).Update(ctx, scheduling, metav1.UpdateOptions{}); err != nil {
+			return fmt.Errorf("update spec.selectedNode in PodSchedulingContext: %v", err)
+		}
+	}
+
+	return nil
+}
+
+func (ec *Controller) reserveForPod(ctx context.Context, pod *v1.Pod, claim *resourcev1alpha2.ResourceClaim) error {
+	claim = claim.DeepCopy()
+	claim.Status.ReservedFor = append(claim.Status.ReservedFor,
+		resourcev1alpha2.ResourceClaimConsumerReference{
+			Resource: "pods",
+			Name:     pod.Name,
+			UID:      pod.UID,
+		})
+	if _, err := ec.kubeClient.ResourceV1alpha2().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{}); err != nil {
+		return fmt.Errorf("reserve claim for pod: %v", err)
+	}
+	return nil
+}
+
 func (ec *Controller) syncClaim(ctx context.Context, namespace, name string) error {
 	logger := klog.LoggerWithValues(klog.FromContext(ctx), "claim", klog.KRef(namespace, name))
 	ctx = klog.NewContext(ctx, logger)
@@ -772,7 +914,7 @@ func owningPod(claim *resourcev1alpha2.ResourceClaim) (string, types.UID) {
 }

 // podResourceClaimIndexFunc is an index function that returns ResourceClaim keys (=
-// namespace/name) for ResourceClaimTemplates in a given pod.
+// namespace/name) for ResourceClaim or ResourceClaimTemplates in a given pod.
 func podResourceClaimIndexFunc(obj interface{}) ([]string, error) {
 	pod, ok := obj.(*v1.Pod)
 	if !ok {
@@ -780,16 +922,14 @@ func podResourceClaimIndexFunc(obj interface{}) ([]string, error) {
 	}
 	keys := []string{}
 	for _, podClaim := range pod.Spec.ResourceClaims {
-		if podClaim.Source.ResourceClaimTemplateName != nil {
-			claimName, _, err := resourceclaim.Name(pod, &podClaim)
-			if err != nil || claimName == nil {
-				// Index functions are not supposed to fail, the caller will panic.
-				// For both error reasons (claim not created yet, unknown API)
-				// we simply don't index.
-				continue
-			}
-			keys = append(keys, fmt.Sprintf("%s/%s", pod.Namespace, *claimName))
+		claimName, _, err := resourceclaim.Name(pod, &podClaim)
+		if err != nil || claimName == nil {
+			// Index functions are not supposed to fail, the caller will panic.
+			// For both error reasons (claim not created yet, unknown API)
+			// we simply don't index.
+			continue
 		}
+		keys = append(keys, fmt.Sprintf("%s/%s", pod.Namespace, *claimName))
 	}
 	return keys, nil
 }
--- a/pkg/controller/resourceclaim/controller_test.go
+++ b/pkg/controller/resourceclaim/controller_test.go
@@ -40,6 +40,7 @@ import (
 	"k8s.io/klog/v2"
 	"k8s.io/kubernetes/pkg/controller"
 	ephemeralvolumemetrics "k8s.io/kubernetes/pkg/controller/resourceclaim/metrics"
+	"k8s.io/utils/pointer"
 )

 var (
@@ -50,44 +51,53 @@ var (
 	podResourceClaimName = "acme-resource"
 	templateName         = "my-template"
 	className            = "my-resource-class"
+	nodeName             = "worker"

 	testPod             = makePod(testPodName, testNamespace, testPodUID)
 	testPodWithResource = makePod(testPodName, testNamespace, testPodUID, *makePodResourceClaim(podResourceClaimName, templateName))
 	otherTestPod        = makePod(testPodName+"-II", testNamespace, testPodUID+"-II")
-	testClaim           = makeClaim(testPodName+"-"+podResourceClaimName, testNamespace, className, makeOwnerReference(testPodWithResource, true))
-	generatedTestClaim  = makeGeneratedClaim(podResourceClaimName, testPodName+"-"+podResourceClaimName, testNamespace, className, 1, makeOwnerReference(testPodWithResource, true))
-	testClaimAllocated  = func() *resourcev1alpha2.ResourceClaim {
-		claim := testClaim.DeepCopy()
-		claim.Status.Allocation = &resourcev1alpha2.AllocationResult{
-			Shareable: true,
-		}
-		return claim
-	}()
-	testClaimReserved = func() *resourcev1alpha2.ResourceClaim {
-		claim := testClaimAllocated.DeepCopy()
-		claim.Status.ReservedFor = append(claim.Status.ReservedFor,
-			resourcev1alpha2.ResourceClaimConsumerReference{
-				Resource: "pods",
-				Name:     testPodWithResource.Name,
-				UID:      testPodWithResource.UID,
-			},
-		)
-		return claim
-	}()
-	testClaimReservedTwice = func() *resourcev1alpha2.ResourceClaim {
-		claim := testClaimReserved.DeepCopy()
-		claim.Status.ReservedFor = append(claim.Status.ReservedFor,
-			resourcev1alpha2.ResourceClaimConsumerReference{
-				Resource: "pods",
-				Name:     otherTestPod.Name,
-				UID:      otherTestPod.UID,
-			},
-		)
-		return claim
-	}()
+
+	testClaim              = makeClaim(testPodName+"-"+podResourceClaimName, testNamespace, className, makeOwnerReference(testPodWithResource, true))
+	testClaimAllocated     = allocateClaim(testClaim)
+	testClaimReserved      = reserveClaim(testClaimAllocated, testPodWithResource)
+	testClaimReservedTwice = reserveClaim(testClaimReserved, otherTestPod)
+
+	generatedTestClaim          = makeGeneratedClaim(podResourceClaimName, testPodName+"-"+podResourceClaimName, testNamespace, className, 1, makeOwnerReference(testPodWithResource, true))
+	generatedTestClaimAllocated = allocateClaim(generatedTestClaim)
+	generatedTestClaimReserved  = reserveClaim(generatedTestClaimAllocated, testPodWithResource)
+
 	conflictingClaim    = makeClaim(testPodName+"-"+podResourceClaimName, testNamespace, className, nil)
 	otherNamespaceClaim = makeClaim(testPodName+"-"+podResourceClaimName, otherNamespace, className, nil)
 	template            = makeTemplate(templateName, testNamespace, className)
+
+	testPodWithNodeName = func() *v1.Pod {
+		pod := testPodWithResource.DeepCopy()
+		pod.Spec.NodeName = nodeName
+		pod.Status.ResourceClaimStatuses = append(pod.Status.ResourceClaimStatuses, v1.PodResourceClaimStatus{
+			Name:              pod.Spec.ResourceClaims[0].Name,
+			ResourceClaimName: &generatedTestClaim.Name,
+		})
+		return pod
+	}()
+
+	podSchedulingContext = resourcev1alpha2.PodSchedulingContext{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      testPodName,
+			Namespace: testNamespace,
+			OwnerReferences: []metav1.OwnerReference{
+				{
+					APIVersion: "v1",
+					Kind:       "Pod",
+					Name:       testPodName,
+					UID:        testPodUID,
+					Controller: pointer.Bool(true),
+				},
+			},
+		},
+		Spec: resourcev1alpha2.PodSchedulingContextSpec{
+			SelectedNode: nodeName,
+		},
+	}
 )

 func init() {
@@ -96,17 +106,18 @@ func init() {

 func TestSyncHandler(t *testing.T) {
 	tests := []struct {
-		name             string
-		key              string
-		claims           []*resourcev1alpha2.ResourceClaim
-		claimsInCache    []*resourcev1alpha2.ResourceClaim
-		pods             []*v1.Pod
-		podsLater        []*v1.Pod
-		templates        []*resourcev1alpha2.ResourceClaimTemplate
-		expectedClaims   []resourcev1alpha2.ResourceClaim
-		expectedStatuses map[string][]v1.PodResourceClaimStatus
-		expectedError    bool
-		expectedMetrics  expectedMetrics
+		name                          string
+		key                           string
+		claims                        []*resourcev1alpha2.ResourceClaim
+		claimsInCache                 []*resourcev1alpha2.ResourceClaim
+		pods                          []*v1.Pod
+		podsLater                     []*v1.Pod
+		templates                     []*resourcev1alpha2.ResourceClaimTemplate
+		expectedClaims                []resourcev1alpha2.ResourceClaim
+		expectedPodSchedulingContexts []resourcev1alpha2.PodSchedulingContext
+		expectedStatuses              map[string][]v1.PodResourceClaimStatus
+		expectedError                 bool
+		expectedMetrics               expectedMetrics
 	}{
 		{
 			name:           "create",
@@ -361,6 +372,35 @@ func TestSyncHandler(t *testing.T) {
 			expectedClaims:  nil,
 			expectedMetrics: expectedMetrics{0, 0},
 		},
+		{
+			name:           "trigger-allocation",
+			pods:           []*v1.Pod{testPodWithNodeName},
+			key:            podKey(testPodWithNodeName),
+			templates:      []*resourcev1alpha2.ResourceClaimTemplate{template},
+			claims:         []*resourcev1alpha2.ResourceClaim{generatedTestClaim},
+			expectedClaims: []resourcev1alpha2.ResourceClaim{*generatedTestClaim},
+			expectedStatuses: map[string][]v1.PodResourceClaimStatus{
+				testPodWithNodeName.Name: {
+					{Name: testPodWithNodeName.Spec.ResourceClaims[0].Name, ResourceClaimName: &generatedTestClaim.Name},
+				},
+			},
+			expectedPodSchedulingContexts: []resourcev1alpha2.PodSchedulingContext{podSchedulingContext},
+			expectedMetrics:               expectedMetrics{0, 0},
+		},
+		{
+			name:           "add-reserved",
+			pods:           []*v1.Pod{testPodWithNodeName},
+			key:            podKey(testPodWithNodeName),
+			templates:      []*resourcev1alpha2.ResourceClaimTemplate{template},
+			claims:         []*resourcev1alpha2.ResourceClaim{generatedTestClaimAllocated},
+			expectedClaims: []resourcev1alpha2.ResourceClaim{*generatedTestClaimReserved},
+			expectedStatuses: map[string][]v1.PodResourceClaimStatus{
+				testPodWithNodeName.Name: {
+					{Name: testPodWithNodeName.Spec.ResourceClaims[0].Name, ResourceClaimName: &generatedTestClaim.Name},
+				},
+			},
+			expectedMetrics: expectedMetrics{0, 0},
+		},
 	}

 	for _, tc := range tests {
@@ -389,10 +429,11 @@ func TestSyncHandler(t *testing.T) {
 			setupMetrics()
 			informerFactory := informers.NewSharedInformerFactory(fakeKubeClient, controller.NoResyncPeriodFunc())
 			podInformer := informerFactory.Core().V1().Pods()
+			podSchedulingInformer := informerFactory.Resource().V1alpha2().PodSchedulingContexts()
 			claimInformer := informerFactory.Resource().V1alpha2().ResourceClaims()
 			templateInformer := informerFactory.Resource().V1alpha2().ResourceClaimTemplates()

-			ec, err := NewController(klog.FromContext(ctx), fakeKubeClient, podInformer, claimInformer, templateInformer)
+			ec, err := NewController(klog.FromContext(ctx), fakeKubeClient, podInformer, podSchedulingInformer, claimInformer, templateInformer)
 			if err != nil {
 				t.Fatalf("error creating ephemeral controller : %v", err)
 			}
@@ -451,6 +492,12 @@ func TestSyncHandler(t *testing.T) {
 			}
 			assert.Equal(t, tc.expectedStatuses, actualStatuses, "pod resource claim statuses")

+			scheduling, err := fakeKubeClient.ResourceV1alpha2().PodSchedulingContexts("").List(ctx, metav1.ListOptions{})
+			if err != nil {
+				t.Fatalf("unexpected error while listing claims: %v", err)
+			}
+			assert.Equal(t, normalizeScheduling(tc.expectedPodSchedulingContexts), normalizeScheduling(scheduling.Items))
+
 			expectMetrics(t, tc.expectedMetrics)
 		})
 	}
@@ -481,6 +528,7 @@ func makeGeneratedClaim(podClaimName, generateName, namespace, classname string,
 		},
 		Spec: resourcev1alpha2.ResourceClaimSpec{
 			ResourceClassName: classname,
+			AllocationMode:    resourcev1alpha2.AllocationModeWaitForFirstConsumer,
 		},
 	}
 	if owner != nil {
@@ -490,6 +538,26 @@ func makeGeneratedClaim(podClaimName, generateName, namespace, classname string,
 	return claim
 }

+func allocateClaim(claim *resourcev1alpha2.ResourceClaim) *resourcev1alpha2.ResourceClaim {
+	claim = claim.DeepCopy()
+	claim.Status.Allocation = &resourcev1alpha2.AllocationResult{
+		Shareable: true,
+	}
+	return claim
+}
+
+func reserveClaim(claim *resourcev1alpha2.ResourceClaim, pod *v1.Pod) *resourcev1alpha2.ResourceClaim {
+	claim = claim.DeepCopy()
+	claim.Status.ReservedFor = append(claim.Status.ReservedFor,
+		resourcev1alpha2.ResourceClaimConsumerReference{
+			Resource: "pods",
+			Name:     pod.Name,
+			UID:      pod.UID,
+		},
+	)
+	return claim
+}
+
 func makePodResourceClaim(name, templateName string) *v1.PodResourceClaim {
 	return &v1.PodResourceClaim{
 		Name: name,
@@ -564,6 +632,14 @@ func normalizeClaims(claims []resourcev1alpha2.ResourceClaim) []resourcev1alpha2
 	return claims
 }

+func normalizeScheduling(scheduling []resourcev1alpha2.PodSchedulingContext) []resourcev1alpha2.PodSchedulingContext {
+	sort.Slice(scheduling, func(i, j int) bool {
+		return scheduling[i].Namespace < scheduling[j].Namespace ||
+			scheduling[i].Name < scheduling[j].Name
+	})
+	return scheduling
+}
+
 func createTestClient(objects ...runtime.Object) *fake.Clientset {
 	fakeClient := fake.NewSimpleClientset(objects...)
 	fakeClient.PrependReactor("create", "resourceclaims", createResourceClaimReactor())
--- a/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/controller_policy.go
+++ b/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/controller_policy.go
@@ -213,6 +213,7 @@ func buildControllerRoles() ([]rbacv1.ClusterRole, []rbacv1.ClusterRoleBinding)
 				rbacv1helpers.NewRule("get", "list", "watch").Groups(legacyGroup).Resources("pods").RuleOrDie(),
 				rbacv1helpers.NewRule("update").Groups(legacyGroup).Resources("pods/finalizers").RuleOrDie(),
 				rbacv1helpers.NewRule("get", "list", "watch", "create", "delete").Groups(resourceGroup).Resources("resourceclaims").RuleOrDie(),
+				rbacv1helpers.NewRule("get", "list", "watch", "create", "update", "patch").Groups(resourceGroup).Resources("podschedulingcontexts").RuleOrDie(),
 				rbacv1helpers.NewRule("update", "patch").Groups(resourceGroup).Resources("resourceclaims/status").RuleOrDie(),
 				rbacv1helpers.NewRule("update", "patch").Groups(legacyGroup).Resources("pods/status").RuleOrDie(),
 				eventsRule(),
--- a/test/e2e/dra/dra.go
+++ b/test/e2e/dra/dra.go
@@ -98,16 +98,31 @@ var _ = ginkgo.Describe("[sig-node] DRA [Feature:DynamicResourceAllocation]", fu
 		})

 		ginkgo.It("must not run a pod if a claim is not reserved for it", func(ctx context.Context) {
-			parameters := b.parameters()
-			claim := b.externalClaim(resourcev1alpha2.AllocationModeImmediate)
+			// Pretend that the resource is allocated and reserved for some other entity.
+			// Until the resourceclaim controller learns to remove reservations for
+			// arbitrary types we can simply fake somthing here.
+			claim := b.externalClaim(resourcev1alpha2.AllocationModeWaitForFirstConsumer)
+			b.create(ctx, claim)
+			claim, err := f.ClientSet.ResourceV1alpha2().ResourceClaims(f.Namespace.Name).Get(ctx, claim.Name, metav1.GetOptions{})
+			framework.ExpectNoError(err, "get claim")
+			claim.Status.Allocation = &resourcev1alpha2.AllocationResult{}
+			claim.Status.DriverName = driver.Name
+			claim.Status.ReservedFor = append(claim.Status.ReservedFor, resourcev1alpha2.ResourceClaimConsumerReference{
+				APIGroup: "example.com",
+				Resource: "some",
+				Name:     "thing",
+				UID:      "12345",
+			})
+			_, err = f.ClientSet.ResourceV1alpha2().ResourceClaims(f.Namespace.Name).UpdateStatus(ctx, claim, metav1.UpdateOptions{})
+			framework.ExpectNoError(err, "update claim")
+
 			pod := b.podExternal()

 			// This bypasses scheduling and therefore the pod gets
 			// to run on the node although it never gets added to
 			// the `ReservedFor` field of the claim.
 			pod.Spec.NodeName = nodes.NodeNames[0]
-
-			b.create(ctx, parameters, claim, pod)
+			b.create(ctx, pod)

 			gomega.Consistently(func() error {
 				testPod, err := b.f.ClientSet.CoreV1().Pods(pod.Namespace).Get(context.TODO(), pod.Name, metav1.GetOptions{})
@@ -178,7 +193,9 @@ var _ = ginkgo.Describe("[sig-node] DRA [Feature:DynamicResourceAllocation]", fu
 	})

 	ginkgo.Context("cluster", func() {
-		nodes := NewNodes(f, 1, 4)
+		nodes := NewNodes(f, 1, 1)
+		driver := NewDriver(f, nodes, networkResources)
+		b := newBuilder(f, driver)

 		ginkgo.It("truncates the name of a generated resource claim", func(ctx context.Context) {
 			parameters := b.parameters()
@@ -208,6 +225,10 @@ var _ = ginkgo.Describe("[sig-node] DRA [Feature:DynamicResourceAllocation]", fu
 			framework.ExpectNoError(err)
 			framework.ExpectNoError(e2epod.WaitForPodRunningInNamespace(ctx, f.ClientSet, pod))
 		})
+	})
+
+	ginkgo.Context("cluster", func() {
+		nodes := NewNodes(f, 1, 4)

 		// claimTests tries out several different combinations of pods with
 		// claims, both inline and external.
@@ -341,6 +362,32 @@ var _ = ginkgo.Describe("[sig-node] DRA [Feature:DynamicResourceAllocation]", fu
 					return b.f.ClientSet.ResourceV1alpha2().ResourceClaims(b.f.Namespace.Name).Get(ctx, claim.Name, metav1.GetOptions{})
 				}).WithTimeout(f.Timeouts.PodDelete).Should(gomega.HaveField("Status.Allocation", (*resourcev1alpha2.AllocationResult)(nil)))
 			})
+
+			// kube-controller-manager can trigger delayed allocation for pods where the
+			// node name was already selected when creating the pod. For immediate
+			// allocation, the creator has to ensure that the node matches the claims.
+			// This does not work for resource claim templates and only isn't
+			// a problem here because the resource is network-attached and available
+			// on all nodes.
+
+			ginkgo.It("supports scheduled pod referencing inline resource claim", func(ctx context.Context) {
+				parameters := b.parameters()
+				pod, template := b.podInline(allocationMode)
+				pod.Spec.NodeName = nodes.NodeNames[0]
+				b.create(ctx, parameters, pod, template)
+
+				b.testPod(ctx, f.ClientSet, pod)
+			})
+
+			ginkgo.It("supports scheduled pod referencing external resource claim", func(ctx context.Context) {
+				parameters := b.parameters()
+				claim := b.externalClaim(allocationMode)
+				pod := b.podExternal()
+				pod.Spec.NodeName = nodes.NodeNames[0]
+				b.create(ctx, parameters, claim, pod)
+
+				b.testPod(ctx, f.ClientSet, pod)
+			})
 		}

 		ginkgo.Context("with delayed allocation and setting ReservedFor", func() {
--- a/test/integration/util/util.go
+++ b/test/integration/util/util.go
@@ -121,9 +121,10 @@ func StartScheduler(ctx context.Context, clientSet clientset.Interface, kubeConf

 func CreateResourceClaimController(ctx context.Context, tb testing.TB, clientSet clientset.Interface, informerFactory informers.SharedInformerFactory) func() {
 	podInformer := informerFactory.Core().V1().Pods()
+	schedulingInformer := informerFactory.Resource().V1alpha2().PodSchedulingContexts()
 	claimInformer := informerFactory.Resource().V1alpha2().ResourceClaims()
 	claimTemplateInformer := informerFactory.Resource().V1alpha2().ResourceClaimTemplates()
-	claimController, err := resourceclaim.NewController(klog.FromContext(ctx), clientSet, podInformer, claimInformer, claimTemplateInformer)
+	claimController, err := resourceclaim.NewController(klog.FromContext(ctx), clientSet, podInformer, schedulingInformer, claimInformer, claimTemplateInformer)
 	if err != nil {
 		tb.Fatalf("Error creating claim controller: %v", err)
 	}