migrate scheduling logic to schedule_one.go for a more reasonable layout (#109045)

* refactor: migrate scheduling logic to schedule_one.go for a more reasonable layout

Signed-off-by: kerthcet <kerthcet@gmail.com>

* remove snapshot() function

Signed-off-by: kerthcet <kerthcet@gmail.com>
Kante 2022-03-27 23:51:20 +08:00 committed by GitHub
parent d796dd7d0f
commit 9bb76ecf00
6 changed files with 4068 additions and 4073 deletions


@@ -1,142 +0,0 @@
/*
Copyright 2014 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
clientset "k8s.io/client-go/kubernetes"
corelisters "k8s.io/client-go/listers/core/v1"
"k8s.io/klog/v2"
schedulerapi "k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources"
internalcache "k8s.io/kubernetes/pkg/scheduler/internal/cache"
internalqueue "k8s.io/kubernetes/pkg/scheduler/internal/queue"
)
func buildExtenders(extenders []schedulerapi.Extender, profiles []schedulerapi.KubeSchedulerProfile) ([]framework.Extender, error) {
var fExtenders []framework.Extender
if len(extenders) == 0 {
return nil, nil
}
var ignoredExtendedResources []string
var ignorableExtenders []framework.Extender
for i := range extenders {
klog.V(2).InfoS("Creating extender", "extender", extenders[i])
extender, err := NewHTTPExtender(&extenders[i])
if err != nil {
return nil, err
}
if !extender.IsIgnorable() {
fExtenders = append(fExtenders, extender)
} else {
ignorableExtenders = append(ignorableExtenders, extender)
}
for _, r := range extenders[i].ManagedResources {
if r.IgnoredByScheduler {
ignoredExtendedResources = append(ignoredExtendedResources, r.Name)
}
}
}
// place ignorable extenders to the tail of extenders
fExtenders = append(fExtenders, ignorableExtenders...)
// If there are any extended resources found from the Extenders, append them to the pluginConfig for each profile.
// This should only have an effect on ComponentConfig, where it is possible to configure Extenders and
// plugin args (and in which case the extender ignored resources take precedence).
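// For example (illustrative values only), an extender configured with
// ManagedResources: [{Name: "example.com/gpu", IgnoredByScheduler: true}]
// results in "example.com/gpu" being appended to NodeResourcesFitArgs.IgnoredResources
// for every profile below, so the in-tree fit plugin no longer accounts for that resource.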
if len(ignoredExtendedResources) == 0 {
return fExtenders, nil
}
for i := range profiles {
prof := &profiles[i]
var found = false
for k := range prof.PluginConfig {
if prof.PluginConfig[k].Name == noderesources.Name {
// Update the existing args
pc := &prof.PluginConfig[k]
args, ok := pc.Args.(*schedulerapi.NodeResourcesFitArgs)
if !ok {
return nil, fmt.Errorf("want args to be of type NodeResourcesFitArgs, got %T", pc.Args)
}
args.IgnoredResources = ignoredExtendedResources
found = true
break
}
}
if !found {
return nil, fmt.Errorf("can't find NodeResourcesFitArgs in plugin config")
}
}
return fExtenders, nil
}
// MakeDefaultErrorFunc constructs a function to handle pod scheduling errors.
func MakeDefaultErrorFunc(client clientset.Interface, podLister corelisters.PodLister, podQueue internalqueue.SchedulingQueue, schedulerCache internalcache.Cache) func(*framework.QueuedPodInfo, error) {
return func(podInfo *framework.QueuedPodInfo, err error) {
pod := podInfo.Pod
if err == ErrNoNodesAvailable {
klog.V(2).InfoS("Unable to schedule pod; no nodes are registered to the cluster; waiting", "pod", klog.KObj(pod))
} else if fitError, ok := err.(*framework.FitError); ok {
// Inject UnschedulablePlugins to PodInfo, which will be used later for moving Pods between queues efficiently.
podInfo.UnschedulablePlugins = fitError.Diagnosis.UnschedulablePlugins
klog.V(2).InfoS("Unable to schedule pod; no fit; waiting", "pod", klog.KObj(pod), "err", err)
} else if apierrors.IsNotFound(err) {
klog.V(2).InfoS("Unable to schedule pod, possibly due to node not found; waiting", "pod", klog.KObj(pod), "err", err)
if errStatus, ok := err.(apierrors.APIStatus); ok && errStatus.Status().Details.Kind == "node" {
nodeName := errStatus.Status().Details.Name
// When the node is not found, we do not remove it from the cache right away. We try to get
// the node again and, only if it is still not found, remove it from the scheduler cache.
_, err := client.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
if err != nil && apierrors.IsNotFound(err) {
node := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}}
if err := schedulerCache.RemoveNode(&node); err != nil {
klog.V(4).InfoS("Node is not found; failed to remove it from the cache", "node", node.Name)
}
}
}
} else {
klog.ErrorS(err, "Error scheduling pod; retrying", "pod", klog.KObj(pod))
}
// Check if the Pod exists in informer cache.
cachedPod, err := podLister.Pods(pod.Namespace).Get(pod.Name)
if err != nil {
klog.InfoS("Pod doesn't exist in informer cache", "pod", klog.KObj(pod), "err", err)
return
}
// In the case of an extender, the pod may have been bound successfully but timed out returning its response to the scheduler.
// This can result in the live version carrying .spec.nodeName, which is inconsistent with the internally queued version.
if len(cachedPod.Spec.NodeName) != 0 {
klog.InfoS("Pod has been assigned to node. Abort adding it back to queue.", "pod", klog.KObj(pod), "node", cachedPod.Spec.NodeName)
return
}
// As <cachedPod> is from SharedInformer, we need to do a DeepCopy() here.
podInfo.PodInfo = framework.NewPodInfo(cachedPod.DeepCopy())
if err := podQueue.AddUnschedulableIfNotPresent(podInfo, podQueue.SchedulingCycle()); err != nil {
klog.ErrorS(err, "Error occurred")
}
}
}
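The handler returned by MakeDefaultErrorFunc is what the scheduler calls at the end of a failed scheduling cycle. A minimal usage sketch follows; the surrounding client, informer factory, queue, and cache setup is assumed to already exist, and the variable names are illustrative (the tests below construct the same pieces with fakes):

    errFunc := MakeDefaultErrorFunc(client, podInformer.Lister(), schedulingQueue, schedulerCache)
    // Re-queues the pod as unschedulable unless it has been deleted or already bound.
    errFunc(queuedPodInfo, schedulingErr)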


@@ -1,322 +0,0 @@
/*
Copyright 2014 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"context"
"errors"
"fmt"
"testing"
"time"
"github.com/google/go-cmp/cmp"
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes/fake"
"k8s.io/client-go/tools/cache"
extenderv1 "k8s.io/kube-scheduler/extender/v1"
"k8s.io/kubernetes/pkg/scheduler/framework"
internalcache "k8s.io/kubernetes/pkg/scheduler/internal/cache"
internalqueue "k8s.io/kubernetes/pkg/scheduler/internal/queue"
testingclock "k8s.io/utils/clock/testing"
)
const (
podInitialBackoffDurationSeconds = 1
podMaxBackoffDurationSeconds = 10
testSchedulerName = "test-scheduler"
)
func TestDefaultErrorFunc(t *testing.T) {
testPod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-pod", Namespace: "default"}}
testPodUpdated := testPod.DeepCopy()
testPodUpdated.Labels = map[string]string{"foo": ""}
tests := []struct {
name string
injectErr error
podUpdatedDuringScheduling bool // pod is updated during a scheduling cycle
podDeletedDuringScheduling bool // pod is deleted during a scheduling cycle
expect *v1.Pod
}{
{
name: "pod is updated during a scheduling cycle",
injectErr: nil,
podUpdatedDuringScheduling: true,
expect: testPodUpdated,
},
{
name: "pod is not updated during a scheduling cycle",
injectErr: nil,
expect: testPod,
},
{
name: "pod is deleted during a scheduling cycle",
injectErr: nil,
podDeletedDuringScheduling: true,
expect: nil,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
stopCh := make(chan struct{})
defer close(stopCh)
client := fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testPod}})
informerFactory := informers.NewSharedInformerFactory(client, 0)
podInformer := informerFactory.Core().V1().Pods()
// Need to add/update/delete testPod to the store.
podInformer.Informer().GetStore().Add(testPod)
queue := internalqueue.NewPriorityQueue(nil, informerFactory, internalqueue.WithClock(testingclock.NewFakeClock(time.Now())))
schedulerCache := internalcache.New(30*time.Second, stopCh)
queue.Add(testPod)
queue.Pop()
if tt.podUpdatedDuringScheduling {
podInformer.Informer().GetStore().Update(testPodUpdated)
queue.Update(testPod, testPodUpdated)
}
if tt.podDeletedDuringScheduling {
podInformer.Informer().GetStore().Delete(testPod)
queue.Delete(testPod)
}
testPodInfo := &framework.QueuedPodInfo{PodInfo: framework.NewPodInfo(testPod)}
errFunc := MakeDefaultErrorFunc(client, podInformer.Lister(), queue, schedulerCache)
errFunc(testPodInfo, tt.injectErr)
var got *v1.Pod
if tt.podUpdatedDuringScheduling {
head, e := queue.Pop()
if e != nil {
t.Fatalf("Cannot pop pod from the activeQ: %v", e)
}
got = head.Pod
} else {
got = getPodFromPriorityQueue(queue, testPod)
}
if diff := cmp.Diff(tt.expect, got); diff != "" {
t.Errorf("Unexpected pod (-want, +got): %s", diff)
}
})
}
}
func TestDefaultErrorFunc_NodeNotFound(t *testing.T) {
nodeFoo := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "foo"}}
nodeBar := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "bar"}}
testPod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-pod", Namespace: "default"}}
tests := []struct {
name string
nodes []v1.Node
nodeNameToDelete string
injectErr error
expectNodeNames sets.String
}{
{
name: "node is deleted during a scheduling cycle",
nodes: []v1.Node{*nodeFoo, *nodeBar},
nodeNameToDelete: "foo",
injectErr: apierrors.NewNotFound(v1.Resource("node"), nodeFoo.Name),
expectNodeNames: sets.NewString("bar"),
},
{
name: "node is not deleted but NodeNotFound is received incorrectly",
nodes: []v1.Node{*nodeFoo, *nodeBar},
injectErr: apierrors.NewNotFound(v1.Resource("node"), nodeFoo.Name),
expectNodeNames: sets.NewString("foo", "bar"),
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
stopCh := make(chan struct{})
defer close(stopCh)
client := fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testPod}}, &v1.NodeList{Items: tt.nodes})
informerFactory := informers.NewSharedInformerFactory(client, 0)
podInformer := informerFactory.Core().V1().Pods()
// Need to add testPod to the store.
podInformer.Informer().GetStore().Add(testPod)
queue := internalqueue.NewPriorityQueue(nil, informerFactory, internalqueue.WithClock(testingclock.NewFakeClock(time.Now())))
schedulerCache := internalcache.New(30*time.Second, stopCh)
for i := range tt.nodes {
node := tt.nodes[i]
// Add the node to schedulerCache regardless of whether it has been deleted from the API server.
schedulerCache.AddNode(&node)
if node.Name == tt.nodeNameToDelete {
client.CoreV1().Nodes().Delete(context.TODO(), node.Name, metav1.DeleteOptions{})
}
}
testPodInfo := &framework.QueuedPodInfo{PodInfo: framework.NewPodInfo(testPod)}
errFunc := MakeDefaultErrorFunc(client, podInformer.Lister(), queue, schedulerCache)
errFunc(testPodInfo, tt.injectErr)
gotNodes := schedulerCache.Dump().Nodes
gotNodeNames := sets.NewString()
for _, nodeInfo := range gotNodes {
gotNodeNames.Insert(nodeInfo.Node().Name)
}
if diff := cmp.Diff(tt.expectNodeNames, gotNodeNames); diff != "" {
t.Errorf("Unexpected nodes (-want, +got): %s", diff)
}
})
}
}
func TestDefaultErrorFunc_PodAlreadyBound(t *testing.T) {
stopCh := make(chan struct{})
defer close(stopCh)
nodeFoo := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "foo"}}
testPod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-pod", Namespace: "default"}, Spec: v1.PodSpec{NodeName: "foo"}}
client := fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testPod}}, &v1.NodeList{Items: []v1.Node{nodeFoo}})
informerFactory := informers.NewSharedInformerFactory(client, 0)
podInformer := informerFactory.Core().V1().Pods()
// Need to add testPod to the store.
podInformer.Informer().GetStore().Add(testPod)
queue := internalqueue.NewPriorityQueue(nil, informerFactory, internalqueue.WithClock(testingclock.NewFakeClock(time.Now())))
schedulerCache := internalcache.New(30*time.Second, stopCh)
// Add the node to schedulerCache regardless of whether it has been deleted from the API server.
schedulerCache.AddNode(&nodeFoo)
testPodInfo := &framework.QueuedPodInfo{PodInfo: framework.NewPodInfo(testPod)}
errFunc := MakeDefaultErrorFunc(client, podInformer.Lister(), queue, schedulerCache)
errFunc(testPodInfo, fmt.Errorf("binding rejected: timeout"))
pod := getPodFromPriorityQueue(queue, testPod)
if pod != nil {
t.Fatalf("Unexpected pod: %v should not be in PriorityQueue when the NodeName of pod is not empty", pod.Name)
}
}
// getPodFromPriorityQueue is used by the TestDefaultErrorFunc tests to look up
// a specific pod in the given priority queue. It returns the pod if found, or nil otherwise.
func getPodFromPriorityQueue(queue *internalqueue.PriorityQueue, pod *v1.Pod) *v1.Pod {
podList := queue.PendingPods()
if len(podList) == 0 {
return nil
}
queryPodKey, err := cache.MetaNamespaceKeyFunc(pod)
if err != nil {
return nil
}
for _, foundPod := range podList {
foundPodKey, err := cache.MetaNamespaceKeyFunc(foundPod)
if err != nil {
return nil
}
if foundPodKey == queryPodKey {
return foundPod
}
}
return nil
}
type fakeExtender struct {
isBinder bool
interestedPodName string
ignorable bool
gotBind bool
}
func (f *fakeExtender) Name() string {
return "fakeExtender"
}
func (f *fakeExtender) IsIgnorable() bool {
return f.ignorable
}
func (f *fakeExtender) ProcessPreemption(
_ *v1.Pod,
_ map[string]*extenderv1.Victims,
_ framework.NodeInfoLister,
) (map[string]*extenderv1.Victims, error) {
return nil, nil
}
func (f *fakeExtender) SupportsPreemption() bool {
return false
}
func (f *fakeExtender) Filter(pod *v1.Pod, nodes []*v1.Node) ([]*v1.Node, extenderv1.FailedNodesMap, extenderv1.FailedNodesMap, error) {
return nil, nil, nil, nil
}
func (f *fakeExtender) Prioritize(
_ *v1.Pod,
_ []*v1.Node,
) (hostPriorities *extenderv1.HostPriorityList, weight int64, err error) {
return nil, 0, nil
}
func (f *fakeExtender) Bind(binding *v1.Binding) error {
if f.isBinder {
f.gotBind = true
return nil
}
return errors.New("not a binder")
}
func (f *fakeExtender) IsBinder() bool {
return f.isBinder
}
func (f *fakeExtender) IsInterested(pod *v1.Pod) bool {
return pod != nil && pod.Name == f.interestedPodName
}
type TestPlugin struct {
name string
}
var _ framework.ScorePlugin = &TestPlugin{}
var _ framework.FilterPlugin = &TestPlugin{}
func (t *TestPlugin) Name() string {
return t.name
}
func (t *TestPlugin) Score(ctx context.Context, state *framework.CycleState, p *v1.Pod, nodeName string) (int64, *framework.Status) {
return 1, nil
}
func (t *TestPlugin) ScoreExtensions() framework.ScoreExtensions {
return nil
}
func (t *TestPlugin) Filter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
return nil
}


@@ -0,0 +1,859 @@
/*
Copyright 2014 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"context"
"fmt"
"math/rand"
"strconv"
"sync"
"sync/atomic"
"time"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/sets"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
extenderv1 "k8s.io/kube-scheduler/extender/v1"
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
"k8s.io/kubernetes/pkg/apis/core/validation"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime"
internalqueue "k8s.io/kubernetes/pkg/scheduler/internal/queue"
"k8s.io/kubernetes/pkg/scheduler/metrics"
"k8s.io/kubernetes/pkg/scheduler/util"
utiltrace "k8s.io/utils/trace"
)
const (
// SchedulerError is the reason recorded for events when an error occurs during scheduling a pod.
SchedulerError = "SchedulerError"
// Percentage of plugin metrics to be sampled.
pluginMetricsSamplePercent = 10
// minFeasibleNodesToFind is the minimum number of nodes that would be scored
// in each scheduling cycle. This is a semi-arbitrary value to ensure that a
// certain minimum of nodes are checked for feasibility. This in turn helps
// ensure a minimum level of spreading.
minFeasibleNodesToFind = 100
// minFeasibleNodesPercentageToFind is the minimum percentage of nodes that
// would be scored in each scheduling cycle. This is a semi-arbitrary value
// to ensure that a certain minimum of nodes are checked for feasibility.
// This in turn helps ensure a minimum level of spreading.
minFeasibleNodesPercentageToFind = 5
)
var clearNominatedNode = &framework.NominatingInfo{NominatingMode: framework.ModeOverride, NominatedNodeName: ""}
// scheduleOne does the entire scheduling workflow for a single pod. Scheduling cycles run serially; only the binding cycle is asynchronous.
func (sched *Scheduler) scheduleOne(ctx context.Context) {
podInfo := sched.NextPod()
// pod could be nil when schedulerQueue is closed
if podInfo == nil || podInfo.Pod == nil {
return
}
pod := podInfo.Pod
fwk, err := sched.frameworkForPod(pod)
if err != nil {
// This shouldn't happen, because we only accept for scheduling the pods
// which specify a scheduler name that matches one of the profiles.
klog.ErrorS(err, "Error occurred")
return
}
if sched.skipPodSchedule(fwk, pod) {
return
}
klog.V(3).InfoS("Attempting to schedule pod", "pod", klog.KObj(pod))
// Synchronously attempt to find a fit for the pod.
start := time.Now()
state := framework.NewCycleState()
state.SetRecordPluginMetrics(rand.Intn(100) < pluginMetricsSamplePercent)
// Initialize an empty podsToActivate struct, which will be filled up by plugins or stay empty.
podsToActivate := framework.NewPodsToActivate()
state.Write(framework.PodsToActivateKey, podsToActivate)
schedulingCycleCtx, cancel := context.WithCancel(ctx)
defer cancel()
scheduleResult, err := sched.SchedulePod(schedulingCycleCtx, fwk, state, pod)
if err != nil {
// SchedulePod() may have failed because the pod would not fit on any host, so we try to
// preempt, with the expectation that the next time the pod is tried for scheduling it
// will fit due to the preemption. It is also possible that a different pod will schedule
// into the resources that were preempted, but this is harmless.
var nominatingInfo *framework.NominatingInfo
if fitError, ok := err.(*framework.FitError); ok {
if !fwk.HasPostFilterPlugins() {
klog.V(3).InfoS("No PostFilter plugins are registered, so no preemption will be performed")
} else {
// Run PostFilter plugins to try to make the pod schedulable in a future scheduling cycle.
result, status := fwk.RunPostFilterPlugins(ctx, state, pod, fitError.Diagnosis.NodeToStatusMap)
if status.Code() == framework.Error {
klog.ErrorS(nil, "Status after running PostFilter plugins for pod", "pod", klog.KObj(pod), "status", status)
} else {
fitError.Diagnosis.PostFilterMsg = status.Message()
klog.V(5).InfoS("Status after running PostFilter plugins for pod", "pod", klog.KObj(pod), "status", status)
}
if result != nil {
nominatingInfo = result.NominatingInfo
}
}
// Pod did not fit anywhere, so it is counted as a failure. If preemption
// succeeds, the pod should get counted as a success the next time we try to
// schedule it. (hopefully)
metrics.PodUnschedulable(fwk.ProfileName(), metrics.SinceInSeconds(start))
} else if err == ErrNoNodesAvailable {
nominatingInfo = clearNominatedNode
// No nodes available is counted as unschedulable rather than an error.
metrics.PodUnschedulable(fwk.ProfileName(), metrics.SinceInSeconds(start))
} else {
nominatingInfo = clearNominatedNode
klog.ErrorS(err, "Error selecting node for pod", "pod", klog.KObj(pod))
metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start))
}
sched.handleSchedulingFailure(fwk, podInfo, err, v1.PodReasonUnschedulable, nominatingInfo)
return
}
metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInSeconds(start))
// Tell the cache to assume that a pod now is running on a given node, even though it hasn't been bound yet.
// This allows us to keep scheduling without waiting on binding to occur.
assumedPodInfo := podInfo.DeepCopy()
assumedPod := assumedPodInfo.Pod
// assume modifies `assumedPod` by setting NodeName=scheduleResult.SuggestedHost
err = sched.assume(assumedPod, scheduleResult.SuggestedHost)
if err != nil {
metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start))
// This is most probably a result of a BUG in the retrying logic.
// We report an error here so that pod scheduling can be retried.
// This relies on the fact that Error will check if the pod has been bound
// to a node and if so will not add it back to the unscheduled pods queue
// (otherwise this would cause an infinite loop).
sched.handleSchedulingFailure(fwk, assumedPodInfo, err, SchedulerError, clearNominatedNode)
return
}
// Run the Reserve method of reserve plugins.
if sts := fwk.RunReservePluginsReserve(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost); !sts.IsSuccess() {
metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start))
// trigger un-reserve to clean up state associated with the reserved Pod
fwk.RunReservePluginsUnreserve(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
if forgetErr := sched.Cache.ForgetPod(assumedPod); forgetErr != nil {
klog.ErrorS(forgetErr, "Scheduler cache ForgetPod failed")
}
sched.handleSchedulingFailure(fwk, assumedPodInfo, sts.AsError(), SchedulerError, clearNominatedNode)
return
}
// Run "permit" plugins.
runPermitStatus := fwk.RunPermitPlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
if runPermitStatus.Code() != framework.Wait && !runPermitStatus.IsSuccess() {
var reason string
if runPermitStatus.IsUnschedulable() {
metrics.PodUnschedulable(fwk.ProfileName(), metrics.SinceInSeconds(start))
reason = v1.PodReasonUnschedulable
} else {
metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start))
reason = SchedulerError
}
// One of the plugins returned status different than success or wait.
fwk.RunReservePluginsUnreserve(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
if forgetErr := sched.Cache.ForgetPod(assumedPod); forgetErr != nil {
klog.ErrorS(forgetErr, "Scheduler cache ForgetPod failed")
}
sched.handleSchedulingFailure(fwk, assumedPodInfo, runPermitStatus.AsError(), reason, clearNominatedNode)
return
}
// At the end of a successful scheduling cycle, pop and move up Pods if needed.
if len(podsToActivate.Map) != 0 {
sched.SchedulingQueue.Activate(podsToActivate.Map)
// Clear the entries after activation.
podsToActivate.Map = make(map[string]*v1.Pod)
}
// bind the pod to its host asynchronously (we can do this because of the assume step above).
go func() {
bindingCycleCtx, cancel := context.WithCancel(ctx)
defer cancel()
metrics.SchedulerGoroutines.WithLabelValues(metrics.Binding).Inc()
defer metrics.SchedulerGoroutines.WithLabelValues(metrics.Binding).Dec()
waitOnPermitStatus := fwk.WaitOnPermit(bindingCycleCtx, assumedPod)
if !waitOnPermitStatus.IsSuccess() {
var reason string
if waitOnPermitStatus.IsUnschedulable() {
metrics.PodUnschedulable(fwk.ProfileName(), metrics.SinceInSeconds(start))
reason = v1.PodReasonUnschedulable
} else {
metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start))
reason = SchedulerError
}
// trigger un-reserve plugins to clean up state associated with the reserved Pod
fwk.RunReservePluginsUnreserve(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
if forgetErr := sched.Cache.ForgetPod(assumedPod); forgetErr != nil {
klog.ErrorS(forgetErr, "scheduler cache ForgetPod failed")
} else {
// "Forget"ing an assumed Pod in binding cycle should be treated as a PodDelete event,
// as the assumed Pod had occupied a certain amount of resources in scheduler cache.
// TODO(#103853): de-duplicate the logic.
// Avoid moving the assumed Pod itself as it's always Unschedulable.
// It's intentional to "defer" this operation; otherwise MoveAllToActiveOrBackoffQueue() would
// update `q.moveRequest` and thus move the assumed pod to backoffQ anyways.
defer sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(internalqueue.AssignedPodDelete, func(pod *v1.Pod) bool {
return assumedPod.UID != pod.UID
})
}
sched.handleSchedulingFailure(fwk, assumedPodInfo, waitOnPermitStatus.AsError(), reason, clearNominatedNode)
return
}
// Run "prebind" plugins.
preBindStatus := fwk.RunPreBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
if !preBindStatus.IsSuccess() {
metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start))
// trigger un-reserve plugins to clean up state associated with the reserved Pod
fwk.RunReservePluginsUnreserve(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
if forgetErr := sched.Cache.ForgetPod(assumedPod); forgetErr != nil {
klog.ErrorS(forgetErr, "scheduler cache ForgetPod failed")
} else {
// "Forget"ing an assumed Pod in binding cycle should be treated as a PodDelete event,
// as the assumed Pod had occupied a certain amount of resources in scheduler cache.
// TODO(#103853): de-duplicate the logic.
sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(internalqueue.AssignedPodDelete, nil)
}
sched.handleSchedulingFailure(fwk, assumedPodInfo, preBindStatus.AsError(), SchedulerError, clearNominatedNode)
return
}
err := sched.bind(bindingCycleCtx, fwk, assumedPod, scheduleResult.SuggestedHost, state)
if err != nil {
metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start))
// trigger un-reserve plugins to clean up state associated with the reserved Pod
fwk.RunReservePluginsUnreserve(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
if err := sched.Cache.ForgetPod(assumedPod); err != nil {
klog.ErrorS(err, "scheduler cache ForgetPod failed")
} else {
// "Forget"ing an assumed Pod in binding cycle should be treated as a PodDelete event,
// as the assumed Pod had occupied a certain amount of resources in scheduler cache.
// TODO(#103853): de-duplicate the logic.
sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(internalqueue.AssignedPodDelete, nil)
}
sched.handleSchedulingFailure(fwk, assumedPodInfo, fmt.Errorf("binding rejected: %w", err), SchedulerError, clearNominatedNode)
return
}
// Calculating nodeResourceString can be heavy. Avoid it if klog verbosity is below 2.
klog.V(2).InfoS("Successfully bound pod to node", "pod", klog.KObj(pod), "node", scheduleResult.SuggestedHost, "evaluatedNodes", scheduleResult.EvaluatedNodes, "feasibleNodes", scheduleResult.FeasibleNodes)
metrics.PodScheduled(fwk.ProfileName(), metrics.SinceInSeconds(start))
metrics.PodSchedulingAttempts.Observe(float64(podInfo.Attempts))
metrics.PodSchedulingDuration.WithLabelValues(getAttemptsLabel(podInfo)).Observe(metrics.SinceInSeconds(podInfo.InitialAttemptTimestamp))
// Run "postbind" plugins.
fwk.RunPostBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
// At the end of a successful binding cycle, move up Pods if needed.
if len(podsToActivate.Map) != 0 {
sched.SchedulingQueue.Activate(podsToActivate.Map)
// Unlike the logic in scheduling cycle, we don't bother deleting the entries
// as `podsToActivate.Map` is no longer consumed.
}
}()
}
func (sched *Scheduler) frameworkForPod(pod *v1.Pod) (framework.Framework, error) {
fwk, ok := sched.Profiles[pod.Spec.SchedulerName]
if !ok {
return nil, fmt.Errorf("profile not found for scheduler name %q", pod.Spec.SchedulerName)
}
return fwk, nil
}
// skipPodSchedule returns true if scheduling the pod can be skipped, which happens in the following cases.
func (sched *Scheduler) skipPodSchedule(fwk framework.Framework, pod *v1.Pod) bool {
// Case 1: pod is being deleted.
if pod.DeletionTimestamp != nil {
fwk.EventRecorder().Eventf(pod, nil, v1.EventTypeWarning, "FailedScheduling", "Scheduling", "skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name)
klog.V(3).InfoS("Skip schedule deleting pod", "pod", klog.KObj(pod))
return true
}
// Case 2: pod that has been assumed could be skipped.
// An assumed pod can be added again to the scheduling queue if it got an update event
// during its previous scheduling cycle but before getting assumed.
isAssumed, err := sched.Cache.IsAssumedPod(pod)
if err != nil {
utilruntime.HandleError(fmt.Errorf("failed to check whether pod %s/%s is assumed: %v", pod.Namespace, pod.Name, err))
return false
}
return isAssumed
}
// schedulePod tries to schedule the given pod to one of the nodes in the node list.
// If it succeeds, it will return the name of the node.
// If it fails, it will return a FitError with reasons.
func (sched *Scheduler) schedulePod(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) (result ScheduleResult, err error) {
trace := utiltrace.New("Scheduling", utiltrace.Field{Key: "namespace", Value: pod.Namespace}, utiltrace.Field{Key: "name", Value: pod.Name})
defer trace.LogIfLong(100 * time.Millisecond)
if err := sched.Cache.UpdateSnapshot(sched.nodeInfoSnapshot); err != nil {
return result, err
}
trace.Step("Snapshotting scheduler cache and node infos done")
if sched.nodeInfoSnapshot.NumNodes() == 0 {
return result, ErrNoNodesAvailable
}
feasibleNodes, diagnosis, err := sched.findNodesThatFitPod(ctx, fwk, state, pod)
if err != nil {
return result, err
}
trace.Step("Computing predicates done")
if len(feasibleNodes) == 0 {
return result, &framework.FitError{
Pod: pod,
NumAllNodes: sched.nodeInfoSnapshot.NumNodes(),
Diagnosis: diagnosis,
}
}
// When only one node remains after filtering, just use it.
if len(feasibleNodes) == 1 {
return ScheduleResult{
SuggestedHost: feasibleNodes[0].Name,
EvaluatedNodes: 1 + len(diagnosis.NodeToStatusMap),
FeasibleNodes: 1,
}, nil
}
priorityList, err := prioritizeNodes(ctx, sched.Extenders, fwk, state, pod, feasibleNodes)
if err != nil {
return result, err
}
host, err := selectHost(priorityList)
trace.Step("Prioritizing done")
return ScheduleResult{
SuggestedHost: host,
EvaluatedNodes: len(feasibleNodes) + len(diagnosis.NodeToStatusMap),
FeasibleNodes: len(feasibleNodes),
}, err
}
// Filters the nodes to find the ones that fit the pod based on the framework
// filter plugins and filter extenders.
func (sched *Scheduler) findNodesThatFitPod(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) ([]*v1.Node, framework.Diagnosis, error) {
diagnosis := framework.Diagnosis{
NodeToStatusMap: make(framework.NodeToStatusMap),
UnschedulablePlugins: sets.NewString(),
}
// Run "prefilter" plugins.
preRes, s := fwk.RunPreFilterPlugins(ctx, state, pod)
allNodes, err := sched.nodeInfoSnapshot.NodeInfos().List()
if err != nil {
return nil, diagnosis, err
}
if !s.IsSuccess() {
if !s.IsUnschedulable() {
return nil, diagnosis, s.AsError()
}
// All nodes will have the same status. Some non-trivial refactoring is
// needed to avoid this copy.
for _, n := range allNodes {
diagnosis.NodeToStatusMap[n.Node().Name] = s
}
// Status satisfying IsUnschedulable() gets injected into diagnosis.UnschedulablePlugins.
if s.FailedPlugin() != "" {
diagnosis.UnschedulablePlugins.Insert(s.FailedPlugin())
}
return nil, diagnosis, nil
}
// "NominatedNodeName" can potentially be set in a previous scheduling cycle as a result of preemption.
// This node is likely the only candidate that will fit the pod, and hence we try it first before iterating over all nodes.
if len(pod.Status.NominatedNodeName) > 0 {
feasibleNodes, err := sched.evaluateNominatedNode(ctx, pod, fwk, state, diagnosis)
if err != nil {
klog.ErrorS(err, "Evaluation failed on nominated node", "pod", klog.KObj(pod), "node", pod.Status.NominatedNodeName)
}
// Nominated node passes all the filters, scheduler is good to assign this node to the pod.
if len(feasibleNodes) != 0 {
return feasibleNodes, diagnosis, nil
}
}
nodes := allNodes
if !preRes.AllNodes() {
nodes = make([]*framework.NodeInfo, 0, len(preRes.NodeNames))
for n := range preRes.NodeNames {
nInfo, err := sched.nodeInfoSnapshot.NodeInfos().Get(n)
if err != nil {
return nil, diagnosis, err
}
nodes = append(nodes, nInfo)
}
}
feasibleNodes, err := sched.findNodesThatPassFilters(ctx, fwk, state, pod, diagnosis, nodes)
if err != nil {
return nil, diagnosis, err
}
feasibleNodes, err = findNodesThatPassExtenders(sched.Extenders, pod, feasibleNodes, diagnosis.NodeToStatusMap)
if err != nil {
return nil, diagnosis, err
}
return feasibleNodes, diagnosis, nil
}
func (sched *Scheduler) evaluateNominatedNode(ctx context.Context, pod *v1.Pod, fwk framework.Framework, state *framework.CycleState, diagnosis framework.Diagnosis) ([]*v1.Node, error) {
nnn := pod.Status.NominatedNodeName
nodeInfo, err := sched.nodeInfoSnapshot.Get(nnn)
if err != nil {
return nil, err
}
node := []*framework.NodeInfo{nodeInfo}
feasibleNodes, err := sched.findNodesThatPassFilters(ctx, fwk, state, pod, diagnosis, node)
if err != nil {
return nil, err
}
feasibleNodes, err = findNodesThatPassExtenders(sched.Extenders, pod, feasibleNodes, diagnosis.NodeToStatusMap)
if err != nil {
return nil, err
}
return feasibleNodes, nil
}
// findNodesThatPassFilters finds the nodes that fit the filter plugins.
func (sched *Scheduler) findNodesThatPassFilters(
ctx context.Context,
fwk framework.Framework,
state *framework.CycleState,
pod *v1.Pod,
diagnosis framework.Diagnosis,
nodes []*framework.NodeInfo) ([]*v1.Node, error) {
numNodesToFind := sched.numFeasibleNodesToFind(int32(len(nodes)))
// Create the feasible list with enough space to avoid growing it
// and to allow assigning by index.
feasibleNodes := make([]*v1.Node, numNodesToFind)
if !fwk.HasFilterPlugins() {
length := len(nodes)
for i := range feasibleNodes {
feasibleNodes[i] = nodes[(sched.nextStartNodeIndex+i)%length].Node()
}
sched.nextStartNodeIndex = (sched.nextStartNodeIndex + len(feasibleNodes)) % length
return feasibleNodes, nil
}
errCh := parallelize.NewErrorChannel()
var statusesLock sync.Mutex
var feasibleNodesLen int32
ctx, cancel := context.WithCancel(ctx)
checkNode := func(i int) {
// We check the nodes starting from where we left off in the previous scheduling cycle,
// this is to make sure all nodes have the same chance of being examined across pods.
nodeInfo := nodes[(sched.nextStartNodeIndex+i)%len(nodes)]
status := fwk.RunFilterPluginsWithNominatedPods(ctx, state, pod, nodeInfo)
if status.Code() == framework.Error {
errCh.SendErrorWithCancel(status.AsError(), cancel)
return
}
if status.IsSuccess() {
length := atomic.AddInt32(&feasibleNodesLen, 1)
if length > numNodesToFind {
cancel()
atomic.AddInt32(&feasibleNodesLen, -1)
} else {
feasibleNodes[length-1] = nodeInfo.Node()
}
} else {
statusesLock.Lock()
diagnosis.NodeToStatusMap[nodeInfo.Node().Name] = status
diagnosis.UnschedulablePlugins.Insert(status.FailedPlugin())
statusesLock.Unlock()
}
}
beginCheckNode := time.Now()
statusCode := framework.Success
defer func() {
// We record Filter extension point latency here instead of in framework.go because framework.RunFilterPlugins
// function is called for each node, whereas we want to have an overall latency for all nodes per scheduling cycle.
// Note that this latency also includes latency for `addNominatedPods`, which calls framework.RunPreFilterAddPod.
metrics.FrameworkExtensionPointDuration.WithLabelValues(frameworkruntime.Filter, statusCode.String(), fwk.ProfileName()).Observe(metrics.SinceInSeconds(beginCheckNode))
}()
// Stops searching for more nodes once the configured number of feasible nodes
// are found.
fwk.Parallelizer().Until(ctx, len(nodes), checkNode)
processedNodes := int(feasibleNodesLen) + len(diagnosis.NodeToStatusMap)
sched.nextStartNodeIndex = (sched.nextStartNodeIndex + processedNodes) % len(nodes)
feasibleNodes = feasibleNodes[:feasibleNodesLen]
if err := errCh.ReceiveError(); err != nil {
statusCode = framework.Error
return nil, err
}
return feasibleNodes, nil
}
// numFeasibleNodesToFind returns the number of feasible nodes that, once found, makes the scheduler
// stop searching for more feasible nodes.
func (sched *Scheduler) numFeasibleNodesToFind(numAllNodes int32) (numNodes int32) {
if numAllNodes < minFeasibleNodesToFind || sched.percentageOfNodesToScore >= 100 {
return numAllNodes
}
adaptivePercentage := sched.percentageOfNodesToScore
if adaptivePercentage <= 0 {
basePercentageOfNodesToScore := int32(50)
adaptivePercentage = basePercentageOfNodesToScore - numAllNodes/125
if adaptivePercentage < minFeasibleNodesPercentageToFind {
adaptivePercentage = minFeasibleNodesPercentageToFind
}
}
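// For example, with numAllNodes = 5000 and percentageOfNodesToScore left at 0:
// adaptivePercentage = 50 - 5000/125 = 10, so numNodes = 5000 * 10 / 100 = 500,
// i.e. filtering stops once 500 feasible nodes have been found.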
numNodes = numAllNodes * adaptivePercentage / 100
if numNodes < minFeasibleNodesToFind {
return minFeasibleNodesToFind
}
return numNodes
}
func findNodesThatPassExtenders(extenders []framework.Extender, pod *v1.Pod, feasibleNodes []*v1.Node, statuses framework.NodeToStatusMap) ([]*v1.Node, error) {
// Extenders are called sequentially.
// Each extender may exclude nodes from feasibleNodes, and the (typically shrinking) remainder
// is passed on to the next extender.
for _, extender := range extenders {
if len(feasibleNodes) == 0 {
break
}
if !extender.IsInterested(pod) {
continue
}
// Status of failed nodes in failedAndUnresolvableMap will be added or overwritten in <statuses>,
// so that the scheduler framework can respect the UnschedulableAndUnresolvable status for
// particular nodes, and this may eventually improve preemption efficiency.
// Note: users are recommended to configure the extenders that may return UnschedulableAndUnresolvable
// status ahead of others.
feasibleList, failedMap, failedAndUnresolvableMap, err := extender.Filter(pod, feasibleNodes)
if err != nil {
if extender.IsIgnorable() {
klog.InfoS("Skipping extender as it returned error and has ignorable flag set", "extender", extender, "err", err)
continue
}
return nil, err
}
for failedNodeName, failedMsg := range failedAndUnresolvableMap {
var aggregatedReasons []string
if _, found := statuses[failedNodeName]; found {
aggregatedReasons = statuses[failedNodeName].Reasons()
}
aggregatedReasons = append(aggregatedReasons, failedMsg)
statuses[failedNodeName] = framework.NewStatus(framework.UnschedulableAndUnresolvable, aggregatedReasons...)
}
for failedNodeName, failedMsg := range failedMap {
if _, found := failedAndUnresolvableMap[failedNodeName]; found {
// failedAndUnresolvableMap takes precedence over failedMap
// note that this only happens if the extender returns the node in both maps
continue
}
if _, found := statuses[failedNodeName]; !found {
statuses[failedNodeName] = framework.NewStatus(framework.Unschedulable, failedMsg)
} else {
statuses[failedNodeName].AppendReason(failedMsg)
}
}
feasibleNodes = feasibleList
}
return feasibleNodes, nil
}
// prioritizeNodes prioritizes the nodes by running the score plugins,
// which return a score for each node from the call to RunScorePlugins().
// The scores from each plugin are added together to make the score for that node, then
// any extenders are run as well.
// All scores are finally combined (added) to get the total weighted scores of all nodes
func prioritizeNodes(
ctx context.Context,
extenders []framework.Extender,
fwk framework.Framework,
state *framework.CycleState,
pod *v1.Pod,
nodes []*v1.Node,
) (framework.NodeScoreList, error) {
// If no priority configs are provided, then all nodes get a score of one.
// This is required to generate the priority list in the expected format.
if len(extenders) == 0 && !fwk.HasScorePlugins() {
result := make(framework.NodeScoreList, 0, len(nodes))
for i := range nodes {
result = append(result, framework.NodeScore{
Name: nodes[i].Name,
Score: 1,
})
}
return result, nil
}
// Run PreScore plugins.
preScoreStatus := fwk.RunPreScorePlugins(ctx, state, pod, nodes)
if !preScoreStatus.IsSuccess() {
return nil, preScoreStatus.AsError()
}
// Run the Score plugins.
scoresMap, scoreStatus := fwk.RunScorePlugins(ctx, state, pod, nodes)
if !scoreStatus.IsSuccess() {
return nil, scoreStatus.AsError()
}
// Additional details logged at level 10 if enabled.
klogV := klog.V(10)
if klogV.Enabled() {
for plugin, nodeScoreList := range scoresMap {
for _, nodeScore := range nodeScoreList {
klogV.InfoS("Plugin scored node for pod", "pod", klog.KObj(pod), "plugin", plugin, "node", nodeScore.Name, "score", nodeScore.Score)
}
}
}
// Summarize all scores.
result := make(framework.NodeScoreList, 0, len(nodes))
for i := range nodes {
result = append(result, framework.NodeScore{Name: nodes[i].Name, Score: 0})
for j := range scoresMap {
result[i].Score += scoresMap[j][i].Score
}
}
if len(extenders) != 0 && nodes != nil {
var mu sync.Mutex
var wg sync.WaitGroup
combinedScores := make(map[string]int64, len(nodes))
for i := range extenders {
if !extenders[i].IsInterested(pod) {
continue
}
wg.Add(1)
go func(extIndex int) {
metrics.SchedulerGoroutines.WithLabelValues(metrics.PrioritizingExtender).Inc()
defer func() {
metrics.SchedulerGoroutines.WithLabelValues(metrics.PrioritizingExtender).Dec()
wg.Done()
}()
prioritizedList, weight, err := extenders[extIndex].Prioritize(pod, nodes)
if err != nil {
// Prioritization errors from extender can be ignored, let k8s/other extenders determine the priorities
klog.V(5).InfoS("Failed to run extender's priority function. No score given by this extender.", "error", err, "pod", klog.KObj(pod), "extender", extenders[extIndex].Name())
return
}
mu.Lock()
for i := range *prioritizedList {
host, score := (*prioritizedList)[i].Host, (*prioritizedList)[i].Score
if klogV.Enabled() {
klogV.InfoS("Extender scored node for pod", "pod", klog.KObj(pod), "extender", extenders[extIndex].Name(), "node", host, "score", score)
}
combinedScores[host] += score * weight
}
mu.Unlock()
}(i)
}
// wait for all goroutines to finish
wg.Wait()
for i := range result {
// MaxExtenderPriority may diverge from the max priority used in the scheduler and defined by MaxNodeScore,
// therefore we need to scale the score returned by extenders to the score range used by the scheduler.
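// With MaxNodeScore = 100 and MaxExtenderPriority = 10, this scales each extender score by a factor of 10.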
result[i].Score += combinedScores[result[i].Name] * (framework.MaxNodeScore / extenderv1.MaxExtenderPriority)
}
}
if klogV.Enabled() {
for i := range result {
klogV.InfoS("Calculated node's final score for pod", "pod", klog.KObj(pod), "node", result[i].Name, "score", result[i].Score)
}
}
return result, nil
}
// selectHost takes a prioritized list of nodes and then picks one
// in a reservoir sampling manner from the nodes that had the highest score.
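// For example, if three nodes tie for the highest score, each is returned with probability 1/3.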
func selectHost(nodeScoreList framework.NodeScoreList) (string, error) {
if len(nodeScoreList) == 0 {
return "", fmt.Errorf("empty priorityList")
}
maxScore := nodeScoreList[0].Score
selected := nodeScoreList[0].Name
cntOfMaxScore := 1
for _, ns := range nodeScoreList[1:] {
if ns.Score > maxScore {
maxScore = ns.Score
selected = ns.Name
cntOfMaxScore = 1
} else if ns.Score == maxScore {
cntOfMaxScore++
if rand.Intn(cntOfMaxScore) == 0 {
// Replace the candidate with probability of 1/cntOfMaxScore
selected = ns.Name
}
}
}
return selected, nil
}
// assume signals to the cache that a pod is already in the cache, so that binding can be asynchronous.
// assume modifies `assumed`.
func (sched *Scheduler) assume(assumed *v1.Pod, host string) error {
// Optimistically assume that the binding will succeed and send it to apiserver
// in the background.
// If the binding fails, scheduler will release resources allocated to assumed pod
// immediately.
assumed.Spec.NodeName = host
if err := sched.Cache.AssumePod(assumed); err != nil {
klog.ErrorS(err, "Scheduler cache AssumePod failed")
return err
}
// if "assumed" is a nominated pod, we should remove it from internal cache
if sched.SchedulingQueue != nil {
sched.SchedulingQueue.DeleteNominatedPodIfExists(assumed)
}
return nil
}
// bind binds a pod to a given node defined in a binding object.
// The precedence for binding is: (1) extenders and (2) framework plugins.
// We expect this to run asynchronously, so we handle binding metrics internally.
func (sched *Scheduler) bind(ctx context.Context, fwk framework.Framework, assumed *v1.Pod, targetNode string, state *framework.CycleState) (err error) {
defer func() {
sched.finishBinding(fwk, assumed, targetNode, err)
}()
bound, err := sched.extendersBinding(assumed, targetNode)
if bound {
return err
}
bindStatus := fwk.RunBindPlugins(ctx, state, assumed, targetNode)
if bindStatus.IsSuccess() {
return nil
}
if bindStatus.Code() == framework.Error {
return bindStatus.AsError()
}
return fmt.Errorf("bind status: %s, %v", bindStatus.Code().String(), bindStatus.Message())
}
// TODO(#87159): Move this to a Plugin.
func (sched *Scheduler) extendersBinding(pod *v1.Pod, node string) (bool, error) {
for _, extender := range sched.Extenders {
if !extender.IsBinder() || !extender.IsInterested(pod) {
continue
}
return true, extender.Bind(&v1.Binding{
ObjectMeta: metav1.ObjectMeta{Namespace: pod.Namespace, Name: pod.Name, UID: pod.UID},
Target: v1.ObjectReference{Kind: "Node", Name: node},
})
}
return false, nil
}
func (sched *Scheduler) finishBinding(fwk framework.Framework, assumed *v1.Pod, targetNode string, err error) {
if finErr := sched.Cache.FinishBinding(assumed); finErr != nil {
klog.ErrorS(finErr, "Scheduler cache FinishBinding failed")
}
if err != nil {
klog.V(1).InfoS("Failed to bind pod", "pod", klog.KObj(assumed))
return
}
fwk.EventRecorder().Eventf(assumed, nil, v1.EventTypeNormal, "Scheduled", "Binding", "Successfully assigned %v/%v to %v", assumed.Namespace, assumed.Name, targetNode)
}
func getAttemptsLabel(p *framework.QueuedPodInfo) string {
// We break down the pod scheduling duration by attempts, capped to a limit,
// to avoid ending up with a high-cardinality metric.
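// e.g. attempts 1 through 14 are reported as "1".."14"; anything at or above 15 is reported as "15+".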
if p.Attempts >= 15 {
return "15+"
}
return strconv.Itoa(p.Attempts)
}
// handleSchedulingFailure records an event for the pod that indicates the
// pod has failed to schedule. It also updates the pod condition and the nominated node name if set.
func (sched *Scheduler) handleSchedulingFailure(fwk framework.Framework, podInfo *framework.QueuedPodInfo, err error, reason string, nominatingInfo *framework.NominatingInfo) {
sched.Error(podInfo, err)
// Update the scheduling queue with the nominated pod information. Without
// this, there would be a race condition between the next scheduling cycle
// and the time the scheduler receives a Pod Update for the nominated pod.
// Here we check for nil only for tests.
if sched.SchedulingQueue != nil {
sched.SchedulingQueue.AddNominatedPod(podInfo.PodInfo, nominatingInfo)
}
pod := podInfo.Pod
msg := truncateMessage(err.Error())
fwk.EventRecorder().Eventf(pod, nil, v1.EventTypeWarning, "FailedScheduling", "Scheduling", msg)
if err := updatePod(sched.client, pod, &v1.PodCondition{
Type: v1.PodScheduled,
Status: v1.ConditionFalse,
Reason: reason,
Message: err.Error(),
}, nominatingInfo); err != nil {
klog.ErrorS(err, "Error updating pod", "pod", klog.KObj(pod))
}
}
// truncateMessage truncates a message if it hits the NoteLengthLimit.
func truncateMessage(message string) string {
max := validation.NoteLengthLimit
if len(message) <= max {
return message
}
suffix := " ..."
return message[:max-len(suffix)] + suffix
}
func updatePod(client clientset.Interface, pod *v1.Pod, condition *v1.PodCondition, nominatingInfo *framework.NominatingInfo) error {
klog.V(3).InfoS("Updating pod condition", "pod", klog.KObj(pod), "conditionType", condition.Type, "conditionStatus", condition.Status, "conditionReason", condition.Reason)
podStatusCopy := pod.Status.DeepCopy()
// NominatedNodeName is updated only if we are trying to set it, and the value is
// different from the existing one.
nnnNeedsUpdate := nominatingInfo.Mode() == framework.ModeOverride && pod.Status.NominatedNodeName != nominatingInfo.NominatedNodeName
if !podutil.UpdatePodCondition(podStatusCopy, condition) && !nnnNeedsUpdate {
return nil
}
if nnnNeedsUpdate {
podStatusCopy.NominatedNodeName = nominatingInfo.NominatedNodeName
}
return util.PatchPodStatus(client, pod, podStatusCopy)
}
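For context, scheduleOne above is the per-pod entry point; the scheduler's main loop in scheduler.go (not part of this diff) drives it repeatedly, roughly as in the sketch below, where sched and ctx come from that surrounding loop:

    // Schedule pods one at a time until the context is cancelled.
    wait.UntilWithContext(ctx, sched.scheduleOne, 0)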

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large