Merge pull request #136480 from rogowski-piotr/automated-cherry-pick-of-#135919-upstream-release-1.34

Automated cherry pick of #135919: kubelet(dra): fix handling of multiple ResourceClaims when one is already prepared
Kubernetes Prow Robot
2026-02-06 05:38:30 +05:30
committed by GitHub
3 changed files with 179 additions and 39 deletions

View File

@@ -359,10 +359,10 @@ func (m *Manager) prepareResources(ctx context.Context, pod *v1.Pod) error {
return fmt.Errorf("checkpoint ResourceClaim cache: %w", err)
}
-// If this claim is already prepared, there is no need to prepare it again.
+// If this claim is already prepared, continue preparing for any remaining claims.
if claimInfo.isPrepared() {
logger.V(5).Info("Resources already prepared", "pod", klog.KObj(pod), "podClaim", podClaim.Name, "claim", klog.KObj(resourceClaim))
-return nil
+continue
}
// This saved claim will be used to update ClaimInfo cache
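For context, here is a minimal, hypothetical sketch (simplified names; not the kubelet's actual types or manager code) of the loop pattern this hunk fixes: an early return inside the per-claim loop silently skips preparation of every claim after the first already-prepared one, whereas continue skips only that claim and lets the loop handle the rest.

package main

import "fmt"

// claim is a stand-in for a pod's ResourceClaim; only the fields the sketch needs.
type claim struct {
	name     string
	prepared bool
}

// prepareAll mimics the shape of the per-claim loop in prepareResources.
func prepareAll(claims []claim) error {
	for _, c := range claims {
		if c.prepared {
			// Before the fix: "return nil" here ended the whole loop,
			// leaving any later, not-yet-prepared claims unprepared.
			// After the fix: skip just this claim and keep iterating.
			continue
		}
		fmt.Printf("preparing claim %q\n", c.name)
	}
	return nil
}

func main() {
	// The first claim was prepared by an earlier pod; the second one is new.
	_ = prepareAll([]claim{
		{name: "first", prepared: true},
		{name: "second"},
	})
}

Running the sketch prints a prepare line only for the second claim, which mirrors what the new unit and e2e tests below assert against the real manager.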

View File

@@ -61,6 +61,10 @@ const (
containerName = "test-container"
)
var (
testPodCounter atomic.Uint32
)
type fakeDRADriverGRPCServer struct {
drapb.UnimplementedDRAPluginServer
drahealthv1alpha1.UnimplementedDRAResourceHealthServer
@@ -276,6 +280,23 @@ func setupFakeDRADriverGRPCServer(ctx context.Context, shouldTimeout bool, plugi
}, nil
}
func genPrepareResourcesResponse(claimUID types.UID) *drapb.NodePrepareResourcesResponse {
return &drapb.NodePrepareResourcesResponse{
Claims: map[string]*drapb.NodePrepareResourceResponse{
string(claimUID): {
Devices: []*drapb.Device{
{
PoolName: poolName,
DeviceName: deviceName,
RequestNames: []string{requestName},
CDIDeviceIDs: []string{cdiID},
},
},
},
},
}
}
func TestNewManagerImpl(t *testing.T) {
kubeClient := fake.NewSimpleClientset()
for _, test := range []struct {
@@ -339,6 +360,45 @@ func genTestPod() *v1.Pod {
}
}
func genTestPodWithClaims(claimNames ...string) *v1.Pod {
podCounter := testPodCounter.Add(1)
podName := fmt.Sprintf("test-pod-%d", podCounter)
podUID := types.UID(fmt.Sprintf("test-pod-uid-%d", podCounter))
resourceClaims := make([]v1.PodResourceClaim, 0, len(claimNames))
containerClaims := make([]v1.ResourceClaim, 0, len(claimNames))
for _, claimName := range claimNames {
cn := claimName
resourceClaims = append(resourceClaims, v1.PodResourceClaim{
Name: cn,
ResourceClaimName: &cn,
})
containerClaims = append(containerClaims, v1.ResourceClaim{
Name: cn,
})
}
return &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: podName,
Namespace: namespace,
UID: podUID,
},
Spec: v1.PodSpec{
ResourceClaims: resourceClaims,
Containers: []v1.Container{
{
Resources: v1.ResourceRequirements{
Claims: containerClaims,
},
},
},
},
}
}
// genTestPodWithExtendedResource generates pod object
func genTestPodWithExtendedResource() *v1.Pod {
return &v1.Pod{
@@ -692,18 +752,7 @@ func TestPrepareResources(t *testing.T) {
claim: genTestClaim(claimName, driverName, deviceName, podUID),
expectedPrepareCalls: 1,
expectedClaimInfoState: genClaimInfoState(cdiID),
-resp: &drapb.NodePrepareResourcesResponse{Claims: map[string]*drapb.NodePrepareResourceResponse{
-string(claimUID): {
-Devices: []*drapb.Device{
-{
-PoolName: poolName,
-DeviceName: deviceName,
-RequestNames: []string{requestName},
-CDIDeviceIDs: []string{cdiID},
-},
-},
-},
-}},
+resp: genPrepareResourcesResponse(claimUID),
},
{
description: "resource already prepared",
@@ -712,18 +761,7 @@ func TestPrepareResources(t *testing.T) {
claim: genTestClaim(claimName, driverName, deviceName, podUID),
claimInfo: genTestClaimInfo(claimUID, []string{podUID}, true),
expectedClaimInfoState: genClaimInfoState(cdiID),
-resp: &drapb.NodePrepareResourcesResponse{Claims: map[string]*drapb.NodePrepareResourceResponse{
-string(claimUID): {
-Devices: []*drapb.Device{
-{
-PoolName: poolName,
-DeviceName: deviceName,
-RequestNames: []string{requestName},
-CDIDeviceIDs: []string{cdiID},
-},
-},
-},
-}},
+resp: genPrepareResourcesResponse(claimUID),
},
{
description: "should timeout",
@@ -740,19 +778,8 @@ func TestPrepareResources(t *testing.T) {
pod: genTestPod(),
claim: genTestClaim(claimName, driverName, deviceName, podUID),
expectedClaimInfoState: genClaimInfoState(cdiID),
-resp: &drapb.NodePrepareResourcesResponse{Claims: map[string]*drapb.NodePrepareResourceResponse{
-string(claimUID): {
-Devices: []*drapb.Device{
-{
-PoolName: poolName,
-DeviceName: deviceName,
-RequestNames: []string{requestName},
-CDIDeviceIDs: []string{cdiID},
-},
-},
-},
-}},
-expectedPrepareCalls: 1,
+resp: genPrepareResourcesResponse(claimUID),
+expectedPrepareCalls: 1,
},
{
description: "should prepare extended resource claim backed by DRA",
@@ -863,6 +890,74 @@ func TestPrepareResources(t *testing.T) {
}
}
// TestPrepareResourcesWithPreparedAndNewClaim verifies that PrepareResources
// correctly handles a pod that references a mix of ResourceClaims:
// - first claim already prepared by a previous pod
// - second claim is new and needs to be prepared
func TestPrepareResourcesWithPreparedAndNewClaim(t *testing.T) {
logger, tCtx := ktesting.NewTestContext(t)
fakeKubeClient := fake.NewClientset()
manager, err := NewManager(logger, fakeKubeClient, t.TempDir())
require.NoError(t, err)
manager.initDRAPluginManager(tCtx, getFakeNode, time.Second)
secondClaimName := fmt.Sprintf("%s-second", claimName)
// Generate two pods where the second pod reuses an existing claim and adds a new one
firstPod := genTestPodWithClaims(claimName)
secondPod := genTestPodWithClaims(claimName, secondClaimName)
firstClaim := genTestClaim(claimName, driverName, deviceName, string(firstPod.UID))
secondClaim := genTestClaim(secondClaimName, driverName, deviceName, string(secondPod.UID))
// Make firstClaim reserved for first and second pod
firstClaim.Status.ReservedFor = append(
firstClaim.Status.ReservedFor,
resourceapi.ResourceClaimConsumerReference{UID: secondPod.UID},
)
_, err = fakeKubeClient.ResourceV1().ResourceClaims(namespace).Create(tCtx, firstClaim, metav1.CreateOptions{})
require.NoError(t, err)
_, err = fakeKubeClient.ResourceV1().ResourceClaims(namespace).Create(tCtx, secondClaim, metav1.CreateOptions{})
require.NoError(t, err)
respFirst := genPrepareResourcesResponse(firstClaim.UID)
draServerInfo, err := setupFakeDRADriverGRPCServer(tCtx, false, nil, respFirst, nil, nil)
require.NoError(t, err)
defer draServerInfo.teardownFn()
plg := manager.GetWatcherHandler()
require.NoError(t, plg.RegisterPlugin(driverName, draServerInfo.socketName, []string{drapb.DRAPluginService}, nil))
err = manager.PrepareResources(tCtx, firstPod)
require.NoError(t, err)
assert.Equal(t, uint32(1),
draServerInfo.server.prepareResourceCalls.Load(),
"first pod should trigger one prepare call",
)
respSecond := genPrepareResourcesResponse(secondClaim.UID)
draServerInfo.server.prepareResourcesResponse = respSecond
err = manager.PrepareResources(tCtx, secondPod)
require.NoError(t, err)
// The second pod should trigger exactly one additional prepare call (for the new claim only), on top of the first pod's single call.
assert.Equal(t, uint32(2),
draServerInfo.server.prepareResourceCalls.Load(),
"second pod should trigger one prepare call for the new claim",
)
for _, claimName := range []string{firstClaim.Name, secondClaim.Name} {
claimInfo, exists := manager.cache.get(claimName, namespace)
require.True(t, exists, "claim %s should exist in cache", claimName)
assert.True(t, claimInfo.prepared, "claim %s should be marked as prepared", claimName)
}
}
func TestUnprepareResources(t *testing.T) {
fakeKubeClient := fake.NewSimpleClientset()
for _, test := range []struct {

View File

@@ -942,6 +942,50 @@ var _ = framework.SIGDescribe("node")(framework.WithLabel("DRA"), func() {
}
}
singleNodeMultipleClaimsTests := func() {
nodes := drautils.NewNodes(f, 1, 1)
// Allow allocating more than one device so that multiple claims can be prepared on the same node.
maxAllocations := 2
driver := drautils.NewDriver(f, nodes, drautils.DriverResources(maxAllocations)) // All tests get their own driver instance.
driver.WithKubelet = true
b := drautils.NewBuilder(f, driver)
ginkgo.It("requests an already allocated and a new claim for a pod", func(ctx context.Context) {
// This test covers the situation where a pod references a mix of already-prepared and new claims.
firstClaim := b.ExternalClaim()
secondClaim := b.ExternalClaim()
b.Create(ctx, firstClaim, secondClaim)
// First pod uses only firstClaim
firstPod := b.PodExternal()
b.Create(ctx, firstPod)
b.TestPod(ctx, f, firstPod)
// Second pod uses firstClaim (already prepared) + secondClaim (new)
secondPod := b.PodExternal()
secondPod.Spec.ResourceClaims = []v1.PodResourceClaim{
{
Name: "first",
ResourceClaimName: &firstClaim.Name,
},
{
Name: "second",
ResourceClaimName: &secondClaim.Name,
},
}
secondPod.Spec.Containers[0].Resources.Claims = []v1.ResourceClaim{
{Name: "first"},
{Name: "second"},
}
b.Create(ctx, secondPod)
b.TestPod(ctx, f, secondPod)
})
}
// The following tests only make sense when there is more than one node.
// They get skipped when there's only one node.
multiNodeTests := func(withKubelet bool) {
@@ -1806,6 +1850,7 @@ var _ = framework.SIGDescribe("node")(framework.WithLabel("DRA"), func() {
framework.Context("control plane", framework.WithLabel("ConformanceCandidate") /* TODO: replace with framework.WithConformance() */, func() { singleNodeTests(false) })
framework.Context("kubelet", feature.DynamicResourceAllocation, "on single node", func() { singleNodeTests(true) })
framework.Context("kubelet", feature.DynamicResourceAllocation, "on single node with multiple claims allocation", singleNodeMultipleClaimsTests)
framework.Context("control plane", framework.WithLabel("ConformanceCandidate") /* TODO: replace with framework.WithConformance() */, func() { multiNodeTests(false) })
framework.Context("kubelet", feature.DynamicResourceAllocation, "on multiple nodes", func() { multiNodeTests(true) })