scheduler: publish PodSchedulingContext during PreBind

Blocking API calls during a scheduling cycle like the DRA plugin is doing slow down overall scheduling, i.e. also affecting pods which don't use DRA. It is easy to move the blocking calls into a goroutine while the scheduling cycle ends with "pod unschedulable". The hard part is handling an error when those API calls then fail in the background. There is a solution for that (see https://github.com/kubernetes/kubernetes/pull/120963), but it's complex. Instead, publishing the modified PodSchedulingContext can also be done later. In the more common case of a pod which is ready for binding except for its claims, that'll be in PreBind, which runs in a separate goroutine already. In the less common case that a pod cannot be scheduled, that'll be in Unreserve which is still blocking.
2025-09-14 21:53:52 +00:00 · 2023-10-02 13:08:55 +02:00
parent 5d1509126f
commit a809a6353b
2 changed files with 34 additions and 14 deletions
--- a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go
+++ b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go
@@ -958,16 +958,15 @@ func (pl *dynamicResources) Reserve(ctx context.Context, cs *framework.CycleStat
 			state.podSchedulingState.schedulingCtx.Spec.SelectedNode != nodeName {
 			state.podSchedulingState.selectedNode = &nodeName
 			logger.V(5).Info("start allocation", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName})
-			if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil {
-				return statusError(logger, err)
-			}
-			return statusPending(logger, "waiting for resource driver to allocate resource", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName})
+			// The actual publish happens in PreBind or Unreserve.
+			return nil
 		}
 	}

 	// May have been modified earlier in PreScore or above.
-	if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil {
-		return statusError(logger, err)
+	if state.podSchedulingState.isDirty() {
+		// The actual publish happens in PreBind or Unreserve.
+		return nil
 	}

 	// More than one pending claim and not enough information about all of them.
@@ -1004,6 +1003,18 @@ func (pl *dynamicResources) Unreserve(ctx context.Context, cs *framework.CycleSt
 	}

 	logger := klog.FromContext(ctx)
+
+	// Was publishing delayed? If yes, do it now.
+	//
+	// The most common scenario is that a different set of potential nodes
+	// was identified. This revised set needs to be published to enable DRA
+	// drivers to provide better guidance for future scheduling attempts.
+	if state.podSchedulingState.isDirty() {
+		if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil {
+			logger.Error(err, "publish PodSchedulingContext")
+		}
+	}
+
 	for _, claim := range state.claims {
 		if claim.Status.Allocation != nil &&
 			resourceclaim.IsReservedForPod(pod, claim) {
@@ -1042,6 +1053,15 @@ func (pl *dynamicResources) PreBind(ctx context.Context, cs *framework.CycleStat
 	}

 	logger := klog.FromContext(ctx)
+
+	// Was publishing delayed? If yes, do it now and then cause binding to stop.
+	if state.podSchedulingState.isDirty() {
+		if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil {
+			return statusError(logger, err)
+		}
+		return statusPending(logger, "waiting for resource driver", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName})
+	}
+
 	for index, claim := range state.claims {
 		if !resourceclaim.IsReservedForPod(pod, claim) {
 			// The claim might be stale, for example because the claim can get shared and some
--- a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go
+++ b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go
@@ -351,8 +351,8 @@ func TestPlugin(t *testing.T) {
 			claims:  []*resourcev1alpha2.ResourceClaim{pendingDelayedClaim},
 			classes: []*resourcev1alpha2.ResourceClass{resourceClass},
 			want: want{
-				reserve: result{
-					status: framework.NewStatus(framework.Pending, `waiting for resource driver to allocate resource`),
+				prebind: result{
+					status: framework.NewStatus(framework.Pending, `waiting for resource driver`),
 					added:  []metav1.Object{schedulingSelectedPotential},
 				},
 			},
@@ -365,8 +365,8 @@ func TestPlugin(t *testing.T) {
 			claims:  []*resourcev1alpha2.ResourceClaim{pendingDelayedClaim, pendingDelayedClaim2},
 			classes: []*resourcev1alpha2.ResourceClass{resourceClass},
 			want: want{
-				reserve: result{
-					status: framework.NewStatus(framework.Pending, `waiting for resource driver to provide information`),
+				prebind: result{
+					status: framework.NewStatus(framework.Pending, `waiting for resource driver`),
 					added:  []metav1.Object{schedulingPotential},
 				},
 			},
@@ -379,8 +379,8 @@ func TestPlugin(t *testing.T) {
 			schedulings: []*resourcev1alpha2.PodSchedulingContext{schedulingInfo},
 			classes:     []*resourcev1alpha2.ResourceClass{resourceClass},
 			want: want{
-				reserve: result{
-					status: framework.NewStatus(framework.Pending, `waiting for resource driver to allocate resource`),
+				prebind: result{
+					status: framework.NewStatus(framework.Pending, `waiting for resource driver`),
 					changes: change{
 						scheduling: func(in *resourcev1alpha2.PodSchedulingContext) *resourcev1alpha2.PodSchedulingContext {
 							return st.FromPodSchedulingContexts(in).
@@ -399,7 +399,7 @@ func TestPlugin(t *testing.T) {
 			schedulings: []*resourcev1alpha2.PodSchedulingContext{schedulingInfo},
 			classes:     []*resourcev1alpha2.ResourceClass{resourceClass},
 			prepare: prepare{
-				reserve: change{
+				prebind: change{
 					scheduling: func(in *resourcev1alpha2.PodSchedulingContext) *resourcev1alpha2.PodSchedulingContext {
 						// This does not actually conflict with setting the
 						// selected node, but because the plugin is not using
@@ -411,7 +411,7 @@ func TestPlugin(t *testing.T) {
 				},
 			},
 			want: want{
-				reserve: result{
+				prebind: result{
 					status: framework.AsStatus(errors.New(`ResourceVersion must match the object that gets updated`)),
 				},
 			},