From a809a6353b669590c88929f03bb6d388a7ee56af Mon Sep 17 00:00:00 2001
From: Patrick Ohly
Date: Mon, 2 Oct 2023 13:08:55 +0200
Subject: [PATCH] scheduler: publish PodSchedulingContext during PreBind

Blocking API calls during a scheduling cycle, as the DRA plugin is doing,
slow down overall scheduling, i.e. they also affect pods which don't use
DRA. It is easy to move the blocking calls into a goroutine while the
scheduling cycle ends with "pod unschedulable". The hard part is handling
an error when those API calls then fail in the background. There is a
solution for that (see
https://github.com/kubernetes/kubernetes/pull/120963), but it's complex.

Instead, publishing the modified PodSchedulingContext can also be done
later. In the more common case of a pod which is ready for binding except
for its claims, that'll be in PreBind, which runs in a separate goroutine
already. In the less common case that a pod cannot be scheduled, that'll
be in Unreserve, which is still blocking.
---
 .../dynamicresources/dynamicresources.go      | 32 +++++++++++++++----
 .../dynamicresources/dynamicresources_test.go | 16 +++++-----
 2 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go
index d3d3900ba7c..b2f3c4b4c7b 100644
--- a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go
+++ b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go
@@ -958,16 +958,15 @@ func (pl *dynamicResources) Reserve(ctx context.Context, cs *framework.CycleStat
 			state.podSchedulingState.schedulingCtx.Spec.SelectedNode != nodeName {
 			state.podSchedulingState.selectedNode = &nodeName
 			logger.V(5).Info("start allocation", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName})
-			if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil {
-				return statusError(logger, err)
-			}
-			return statusPending(logger, "waiting for resource driver to allocate resource", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName})
+			// The actual publish happens in PreBind or Unreserve.
+			return nil
 		}
 	}
 
 	// May have been modified earlier in PreScore or above.
-	if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil {
-		return statusError(logger, err)
+	if state.podSchedulingState.isDirty() {
+		// The actual publish happens in PreBind or Unreserve.
+		return nil
 	}
 
 	// More than one pending claim and not enough information about all of them.
@@ -1004,6 +1003,18 @@ func (pl *dynamicResources) Unreserve(ctx context.Context, cs *framework.CycleSt
 	}
 
 	logger := klog.FromContext(ctx)
+
+	// Was publishing delayed? If yes, do it now.
+	//
+	// The most common scenario is that a different set of potential nodes
+	// was identified. This revised set needs to be published to enable DRA
+	// drivers to provide better guidance for future scheduling attempts.
+	if state.podSchedulingState.isDirty() {
+		if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil {
+			logger.Error(err, "publish PodSchedulingContext")
+		}
+	}
+
 	for _, claim := range state.claims {
 		if claim.Status.Allocation != nil &&
 			resourceclaim.IsReservedForPod(pod, claim) {
@@ -1042,6 +1053,15 @@ func (pl *dynamicResources) PreBind(ctx context.Context, cs *framework.CycleStat
 	}
 
 	logger := klog.FromContext(ctx)
+
+	// Was publishing delayed? If yes, do it now and then cause binding to stop.
+	if state.podSchedulingState.isDirty() {
+		if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil {
+			return statusError(logger, err)
+		}
+		return statusPending(logger, "waiting for resource driver", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName})
+	}
+
 	for index, claim := range state.claims {
 		if !resourceclaim.IsReservedForPod(pod, claim) {
 			// The claim might be stale, for example because the claim can get shared and some
diff --git a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go
index c6ccfadfb9b..9ca2b407fde 100644
--- a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go
+++ b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go
@@ -351,8 +351,8 @@ func TestPlugin(t *testing.T) {
 			claims:  []*resourcev1alpha2.ResourceClaim{pendingDelayedClaim},
 			classes: []*resourcev1alpha2.ResourceClass{resourceClass},
 			want: want{
-				reserve: result{
-					status: framework.NewStatus(framework.Pending, `waiting for resource driver to allocate resource`),
+				prebind: result{
+					status: framework.NewStatus(framework.Pending, `waiting for resource driver`),
 					added:  []metav1.Object{schedulingSelectedPotential},
 				},
 			},
@@ -365,8 +365,8 @@ func TestPlugin(t *testing.T) {
 			claims:  []*resourcev1alpha2.ResourceClaim{pendingDelayedClaim, pendingDelayedClaim2},
 			classes: []*resourcev1alpha2.ResourceClass{resourceClass},
 			want: want{
-				reserve: result{
-					status: framework.NewStatus(framework.Pending, `waiting for resource driver to provide information`),
+				prebind: result{
+					status: framework.NewStatus(framework.Pending, `waiting for resource driver`),
 					added:  []metav1.Object{schedulingPotential},
 				},
 			},
@@ -379,8 +379,8 @@ func TestPlugin(t *testing.T) {
 			schedulings: []*resourcev1alpha2.PodSchedulingContext{schedulingInfo},
 			classes:     []*resourcev1alpha2.ResourceClass{resourceClass},
 			want: want{
-				reserve: result{
-					status: framework.NewStatus(framework.Pending, `waiting for resource driver to allocate resource`),
+				prebind: result{
+					status: framework.NewStatus(framework.Pending, `waiting for resource driver`),
 					changes: change{
 						scheduling: func(in *resourcev1alpha2.PodSchedulingContext) *resourcev1alpha2.PodSchedulingContext {
 							return st.FromPodSchedulingContexts(in).
@@ -399,7 +399,7 @@ func TestPlugin(t *testing.T) {
 			schedulings: []*resourcev1alpha2.PodSchedulingContext{schedulingInfo},
 			classes:     []*resourcev1alpha2.ResourceClass{resourceClass},
 			prepare: prepare{
-				reserve: change{
+				prebind: change{
 					scheduling: func(in *resourcev1alpha2.PodSchedulingContext) *resourcev1alpha2.PodSchedulingContext {
 						// This does not actually conflict with setting the
 						// selected node, but because the plugin is not using
@@ -411,7 +411,7 @@ func TestPlugin(t *testing.T) {
 				},
 			},
 			want: want{
-				reserve: result{
+				prebind: result{
 					status: framework.AsStatus(errors.New(`ResourceVersion must match the object that gets updated`)),
 				},
 			},
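
For reviewers who want to see the deferred-publish pattern in isolation, here is
a minimal, stand-alone sketch. It is an illustration only: podSchedulingState,
isDirty and publish mirror the names in the plugin code above, but the fields,
signatures and the fake publish step are simplified assumptions rather than the
real scheduler framework API.

package main

import (
	"context"
	"fmt"
)

// podSchedulingState is a simplified stand-in for the plugin's cached
// PodSchedulingContext. Only what is needed to show the deferred-publish
// pattern is modeled here.
type podSchedulingState struct {
	selectedNode   *string   // set in Reserve instead of updating the API object
	potentialNodes *[]string // set in PreScore instead of updating the API object
}

// isDirty reports whether local modifications still have to be written
// back to the API server.
func (s *podSchedulingState) isDirty() bool {
	return s.selectedNode != nil || s.potentialNodes != nil
}

// publish flushes the pending modifications. In the real plugin this is a
// blocking Update or Create call against the API server; here it only
// prints what would be written.
func (s *podSchedulingState) publish(ctx context.Context) error {
	if s.selectedNode != nil {
		fmt.Println("update spec.selectedNode:", *s.selectedNode)
	}
	if s.potentialNodes != nil {
		fmt.Println("update spec.potentialNodes:", *s.potentialNodes)
	}
	// Clear the dirty markers after a successful write.
	s.selectedNode, s.potentialNodes = nil, nil
	return nil
}

func main() {
	ctx := context.Background()
	state := &podSchedulingState{}

	// Reserve: record the decision locally; the scheduling cycle stays
	// fast because no API call happens here.
	node := "worker-1"
	state.selectedNode = &node

	// PreBind (already running in its own goroutine): flush the pending
	// changes, so an API error surfaces as a PreBind failure instead of
	// getting lost in the background.
	if state.isDirty() {
		if err := state.publish(ctx); err != nil {
			fmt.Println("prebind failed:", err)
		}
	}
}

The point of the pattern is that Reserve only mutates local state; the API
write, and therefore the error handling, moves to a hook that either already
runs asynchronously (PreBind) or can tolerate blocking (Unreserve).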