feature(KEP-4832): asynchronous preemption

2025-08-02 00:07:50 +00:00 · 2024-10-18 10:55:46 +10:00 · 2024-10-18 10:55:46 +10:00 · 69a8d0ec0b
commit 69a8d0ec0b
parent 3184eb3d1b
14 changed files with 1401 additions and 663 deletions
--- a/pkg/features/kube_features.go
+++ b/pkg/features/kube_features.go
@ -579,6 +579,14 @@ const (
 	// which benefits to reduce the useless requeueing.
 	SchedulerQueueingHints featuregate.Feature = "SchedulerQueueingHints"

+	// owner: @sanposhiho
+	// kep: http://kep.k8s.io/4832
+	// alpha: v1.32
+	//
+	// Running some expensive operation within the scheduler's preemption asynchronously,
+	// which improves the scheduling latency when the preemption involves in.
+	SchedulerAsyncPreemption featuregate.Feature = "SchedulerAsyncPreemption"
+
 	// owner: @atosatto @yuanchen8911
 	// kep: http://kep.k8s.io/3902
 	//
--- a/pkg/features/versioned_kube_features.go
+++ b/pkg/features/versioned_kube_features.go
@ -639,6 +639,10 @@ var defaultVersionedKubernetesFeatureGates = map[featuregate.Feature]featuregate
 		{Version: version.MustParse("1.28"), Default: false, PreRelease: featuregate.Beta},
 	},

+	SchedulerAsyncPreemption: {
+		{Version: version.MustParse("1.32"), Default: false, PreRelease: featuregate.Alpha},
+	},
+
 	SELinuxChangePolicy: {
 		{Version: version.MustParse("1.32"), Default: false, PreRelease: featuregate.Alpha},
 	},
--- a/pkg/scheduler/apis/config/testing/defaults/defaults.go
+++ b/pkg/scheduler/apis/config/testing/defaults/defaults.go
@ -52,6 +52,7 @@ var ExpandedPluginsV1 = &config.Plugins{
 	PreEnqueue: config.PluginSet{
 		Enabled: []config.Plugin{
 			{Name: names.SchedulingGates},
+			{Name: names.DefaultPreemption},
 		},
 	},
 	QueueSort: config.PluginSet{
--- a/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go
+++ b/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go
@ -53,6 +53,7 @@ type DefaultPreemption struct {
 	args      config.DefaultPreemptionArgs
 	podLister corelisters.PodLister
 	pdbLister policylisters.PodDisruptionBudgetLister
+	Evaluator *preemption.Evaluator
 }

 var _ framework.PostFilterPlugin = &DefaultPreemption{}
@ -71,13 +72,19 @@ func New(_ context.Context, dpArgs runtime.Object, fh framework.Handle, fts feat
 	if err := validation.ValidateDefaultPreemptionArgs(nil, args); err != nil {
 		return nil, err
 	}
+
+	podLister := fh.SharedInformerFactory().Core().V1().Pods().Lister()
+	pdbLister := getPDBLister(fh.SharedInformerFactory())
+
 	pl := DefaultPreemption{
 		fh:        fh,
 		fts:       fts,
 		args:      *args,
-		podLister: fh.SharedInformerFactory().Core().V1().Pods().Lister(),
-		pdbLister: getPDBLister(fh.SharedInformerFactory()),
+		podLister: podLister,
+		pdbLister: pdbLister,
 	}
+	pl.Evaluator = preemption.NewEvaluator(Name, fh, &pl, fts.EnableAsyncPreemption)
+
 	return &pl, nil
 }

@ -87,16 +94,7 @@ func (pl *DefaultPreemption) PostFilter(ctx context.Context, state *framework.Cy
 		metrics.PreemptionAttempts.Inc()
 	}()

-	pe := preemption.Evaluator{
-		PluginName: names.DefaultPreemption,
-		Handler:    pl.fh,
-		PodLister:  pl.podLister,
-		PdbLister:  pl.pdbLister,
-		State:      state,
-		Interface:  pl,
-	}
-
-	result, status := pe.Preempt(ctx, pod, m)
+	result, status := pl.Evaluator.Preempt(ctx, state, pod, m)
 	msg := status.Message()
 	if len(msg) > 0 {
 		return result, framework.NewStatus(status.Code(), "preemption: "+msg)
@ -104,6 +102,22 @@ func (pl *DefaultPreemption) PostFilter(ctx context.Context, state *framework.Cy
 	return result, status
 }

+func (pl *DefaultPreemption) PreEnqueue(ctx context.Context, p *v1.Pod) *framework.Status {
+	if !pl.fts.EnableAsyncPreemption {
+		return nil
+	}
+	if pl.Evaluator.IsPodRunningPreemption(p.GetUID()) {
+		return framework.NewStatus(framework.UnschedulableAndUnresolvable, "waiting for the preemption for this pod to be finished")
+	}
+	return nil
+}
+
+// EventsToRegister returns the possible events that may make a Pod
+// failed by this plugin schedulable.
+func (pl *DefaultPreemption) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
+	return nil, nil
+}
+
 // calculateNumCandidates returns the number of candidates the FindCandidates
 // method must produce from dry running based on the constraints given by
 // <minCandidateNodesPercentage> and <minCandidateNodesAbsolute>. The number of
--- a/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go
+++ b/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go
@ -25,6 +25,7 @@ import (
 	"math/rand"
 	"sort"
 	"strings"
+	"sync"
 	"testing"
 	"time"

@ -37,6 +38,7 @@ import (
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/apimachinery/pkg/util/strategicpatch"
+	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/client-go/informers"
 	clientsetfake "k8s.io/client-go/kubernetes/fake"
 	clienttesting "k8s.io/client-go/testing"
@ -436,6 +438,7 @@ func TestPostFilter(t *testing.T) {
 				pdbLister: getPDBLister(informerFactory),
 				args:      *getDefaultDefaultPreemptionArgs(),
 			}
+			p.Evaluator = preemption.NewEvaluator(names.DefaultPreemption, f, &p, false)

 			state := framework.NewCycleState()
 			// Ensure <state> is populated.
@ -1206,11 +1209,10 @@ func TestDryRunPreemption(t *testing.T) {
 					Handler:    pl.fh,
 					PodLister:  pl.podLister,
 					PdbLister:  pl.pdbLister,
-					State:      state,
 					Interface:  pl,
 				}
 				offset, numCandidates := pl.GetOffsetAndNumCandidates(int32(len(nodeInfos)))
-				got, _, _ := pe.DryRunPreemption(ctx, pod, nodeInfos, tt.pdbs, offset, numCandidates)
+				got, _, _ := pe.DryRunPreemption(ctx, state, pod, nodeInfos, tt.pdbs, offset, numCandidates)
 				// Sort the values (inner victims) and the candidate itself (by its NominatedNodeName).
 				for i := range got {
 					victims := got[i].Victims().Pods
@ -1447,11 +1449,10 @@ func TestSelectBestCandidate(t *testing.T) {
 				Handler:    pl.fh,
 				PodLister:  pl.podLister,
 				PdbLister:  pl.pdbLister,
-				State:      state,
 				Interface:  pl,
 			}
 			offset, numCandidates := pl.GetOffsetAndNumCandidates(int32(len(nodeInfos)))
-			candidates, _, _ := pe.DryRunPreemption(ctx, tt.pod, nodeInfos, nil, offset, numCandidates)
+			candidates, _, _ := pe.DryRunPreemption(ctx, state, tt.pod, nodeInfos, nil, offset, numCandidates)
 			s := pe.SelectCandidate(ctx, candidates)
 			if s == nil || len(s.Name()) == 0 {
 				return
@ -1549,6 +1550,7 @@ func TestPodEligibleToPreemptOthers(t *testing.T) {
 	}
 }
 func TestPreempt(t *testing.T) {
+	metrics.Register()
 	tests := []struct {
 		name           string
 		pod            *v1.Pod
@ -1713,197 +1715,229 @@ func TestPreempt(t *testing.T) {
 	}

 	labelKeys := []string{"hostname", "zone", "region"}
-	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
-			client := clientsetfake.NewClientset()
-			informerFactory := informers.NewSharedInformerFactory(client, 0)
-			podInformer := informerFactory.Core().V1().Pods().Informer()
-			podInformer.GetStore().Add(test.pod)
-			for i := range test.pods {
-				podInformer.GetStore().Add(test.pods[i])
-			}
-
-			deletedPodNames := sets.New[string]()
-			patchedPodNames := sets.New[string]()
-			patchedPods := []*v1.Pod{}
-			client.PrependReactor("patch", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) {
-				patchAction := action.(clienttesting.PatchAction)
-				podName := patchAction.GetName()
-				namespace := patchAction.GetNamespace()
-				patch := patchAction.GetPatch()
-				pod, err := informerFactory.Core().V1().Pods().Lister().Pods(namespace).Get(podName)
-				if err != nil {
-					t.Fatalf("Failed to get the original pod %s/%s before patching: %v\n", namespace, podName, err)
-				}
-				marshalledPod, err := json.Marshal(pod)
-				if err != nil {
-					t.Fatalf("Failed to marshal the original pod %s/%s: %v", namespace, podName, err)
-				}
-				updated, err := strategicpatch.StrategicMergePatch(marshalledPod, patch, v1.Pod{})
-				if err != nil {
-					t.Fatalf("Failed to apply strategic merge patch %q on pod %#v: %v", patch, marshalledPod, err)
-				}
-				updatedPod := &v1.Pod{}
-				if err := json.Unmarshal(updated, updatedPod); err != nil {
-					t.Fatalf("Failed to unmarshal updated pod %q: %v", updated, err)
-				}
-				patchedPods = append(patchedPods, updatedPod)
-				patchedPodNames.Insert(podName)
-				return true, nil, nil
-			})
-			client.PrependReactor("delete", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) {
-				deletedPodNames.Insert(action.(clienttesting.DeleteAction).GetName())
-				return true, nil, nil
-			})
-
-			logger, ctx := ktesting.NewTestContext(t)
-			ctx, cancel := context.WithCancel(ctx)
-			defer cancel()
-
-			waitingPods := frameworkruntime.NewWaitingPodsMap()
-
-			cache := internalcache.New(ctx, time.Duration(0))
-			for _, pod := range test.pods {
-				cache.AddPod(logger, pod)
-			}
-			cachedNodeInfoMap := map[string]*framework.NodeInfo{}
-			nodes := make([]*v1.Node, len(test.nodeNames))
-			for i, name := range test.nodeNames {
-				node := st.MakeNode().Name(name).Capacity(veryLargeRes).Obj()
-				// Split node name by '/' to form labels in a format of
-				// {"hostname": node.Name[0], "zone": node.Name[1], "region": node.Name[2]}
-				node.ObjectMeta.Labels = make(map[string]string)
-				for i, label := range strings.Split(node.Name, "/") {
-					node.ObjectMeta.Labels[labelKeys[i]] = label
-				}
-				node.Name = node.ObjectMeta.Labels["hostname"]
-				cache.AddNode(logger, node)
-				nodes[i] = node
-
-				// Set nodeInfo to extenders to mock extenders' cache for preemption.
-				cachedNodeInfo := framework.NewNodeInfo()
-				cachedNodeInfo.SetNode(node)
-				cachedNodeInfoMap[node.Name] = cachedNodeInfo
-			}
-			var extenders []framework.Extender
-			for _, extender := range test.extenders {
-				// Set nodeInfoMap as extenders cached node information.
-				extender.CachedNodeNameToInfo = cachedNodeInfoMap
-				extenders = append(extenders, extender)
-			}
-			fwk, err := tf.NewFramework(
-				ctx,
-				[]tf.RegisterPluginFunc{
-					test.registerPlugin,
-					tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New),
-					tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New),
-				},
-				"",
-				frameworkruntime.WithClientSet(client),
-				frameworkruntime.WithEventRecorder(&events.FakeRecorder{}),
-				frameworkruntime.WithExtenders(extenders),
-				frameworkruntime.WithPodNominator(internalqueue.NewSchedulingQueue(nil, informerFactory)),
-				frameworkruntime.WithSnapshotSharedLister(internalcache.NewSnapshot(test.pods, nodes)),
-				frameworkruntime.WithInformerFactory(informerFactory),
-				frameworkruntime.WithWaitingPods(waitingPods),
-				frameworkruntime.WithLogger(logger),
-			)
-			if err != nil {
-				t.Fatal(err)
-			}
-
-			state := framework.NewCycleState()
-			// Some tests rely on PreFilter plugin to compute its CycleState.
-			if _, s, _ := fwk.RunPreFilterPlugins(ctx, state, test.pod); !s.IsSuccess() {
-				t.Errorf("Unexpected preFilterStatus: %v", s)
-			}
-			// Call preempt and check the expected results.
-			pl := DefaultPreemption{
-				fh:        fwk,
-				podLister: informerFactory.Core().V1().Pods().Lister(),
-				pdbLister: getPDBLister(informerFactory),
-				args:      *getDefaultDefaultPreemptionArgs(),
-			}
-
-			pe := preemption.Evaluator{
-				PluginName: names.DefaultPreemption,
-				Handler:    pl.fh,
-				PodLister:  pl.podLister,
-				PdbLister:  pl.pdbLister,
-				State:      state,
-				Interface:  &pl,
-			}
-
-			// so that these nodes are eligible for preemption, we set their status
-			// to Unschedulable.
-
-			nodeToStatusMap := framework.NewDefaultNodeToStatus()
-			for _, n := range nodes {
-				nodeToStatusMap.Set(n.Name, framework.NewStatus(framework.Unschedulable))
-			}
-
-			res, status := pe.Preempt(ctx, test.pod, nodeToStatusMap)
-			if !status.IsSuccess() && !status.IsRejected() {
-				t.Errorf("unexpected error in preemption: %v", status.AsError())
-			}
-			if diff := cmp.Diff(test.want, res); diff != "" {
-				t.Errorf("Unexpected status (-want, +got):\n%s", diff)
-			}
-			if len(deletedPodNames) != len(test.expectedPods) {
-				t.Errorf("expected %v pods, got %v.", len(test.expectedPods), len(deletedPodNames))
-			}
-			if diff := cmp.Diff(sets.List(patchedPodNames), sets.List(deletedPodNames)); diff != "" {
-				t.Errorf("unexpected difference in the set of patched and deleted pods: %s", diff)
-			}
-
-			// Make sure that the DisruptionTarget condition has been added to the pod status
-			for _, patchedPod := range patchedPods {
-				expectedPodCondition := &v1.PodCondition{
-					Type:    v1.DisruptionTarget,
-					Status:  v1.ConditionTrue,
-					Reason:  v1.PodReasonPreemptionByScheduler,
-					Message: fmt.Sprintf("%s: preempting to accommodate a higher priority pod", patchedPod.Spec.SchedulerName),
+	for _, asyncPreemptionEnabled := range []bool{true, false} {
+		for _, test := range tests {
+			t.Run(fmt.Sprintf("%s (Async preemption enabled: %v)", test.name, asyncPreemptionEnabled), func(t *testing.T) {
+				client := clientsetfake.NewClientset()
+				informerFactory := informers.NewSharedInformerFactory(client, 0)
+				podInformer := informerFactory.Core().V1().Pods().Informer()
+				testPod := test.pod.DeepCopy()
+				testPods := make([]*v1.Pod, len(test.pods))
+				for i := range test.pods {
+					testPods[i] = test.pods[i].DeepCopy()
 				}

-				_, condition := apipod.GetPodCondition(&patchedPod.Status, v1.DisruptionTarget)
-				if diff := cmp.Diff(condition, expectedPodCondition, cmpopts.IgnoreFields(v1.PodCondition{}, "LastTransitionTime")); diff != "" {
-					t.Fatalf("unexpected difference in the pod %q DisruptionTarget condition: %s", patchedPod.Name, diff)
+				if err := podInformer.GetStore().Add(testPod); err != nil {
+					t.Fatalf("Failed to add test pod %s: %v", testPod.Name, err)
 				}
-			}
-
-			for victimName := range deletedPodNames {
-				found := false
-				for _, expPod := range test.expectedPods {
-					if expPod == victimName {
-						found = true
-						break
+				for i := range testPods {
+					if err := podInformer.GetStore().Add(testPods[i]); err != nil {
+						t.Fatalf("Failed to add test pod %s: %v", testPods[i], err)
 					}
 				}
-				if !found {
-					t.Errorf("pod %v is not expected to be a victim.", victimName)
-				}
-			}
-			if res != nil && res.NominatingInfo != nil {
-				test.pod.Status.NominatedNodeName = res.NominatedNodeName
-			}

-			// Manually set the deleted Pods' deletionTimestamp to non-nil.
-			for _, pod := range test.pods {
-				if deletedPodNames.Has(pod.Name) {
-					now := metav1.Now()
-					pod.DeletionTimestamp = &now
-					deletedPodNames.Delete(pod.Name)
-				}
-			}
+				// Need to protect deletedPodNames and patchedPodNames to prevent DATA RACE panic.
+				var mu sync.RWMutex
+				deletedPodNames := sets.New[string]()
+				patchedPodNames := sets.New[string]()
+				patchedPods := []*v1.Pod{}
+				client.PrependReactor("patch", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) {
+					patchAction := action.(clienttesting.PatchAction)
+					podName := patchAction.GetName()
+					namespace := patchAction.GetNamespace()
+					patch := patchAction.GetPatch()
+					pod, err := informerFactory.Core().V1().Pods().Lister().Pods(namespace).Get(podName)
+					if err != nil {
+						t.Fatalf("Failed to get the original pod %s/%s before patching: %v\n", namespace, podName, err)
+					}
+					marshalledPod, err := json.Marshal(pod)
+					if err != nil {
+						t.Fatalf("Failed to marshal the original pod %s/%s: %v", namespace, podName, err)
+					}
+					updated, err := strategicpatch.StrategicMergePatch(marshalledPod, patch, v1.Pod{})
+					if err != nil {
+						t.Fatalf("Failed to apply strategic merge patch %q on pod %#v: %v", patch, marshalledPod, err)
+					}
+					updatedPod := &v1.Pod{}
+					if err := json.Unmarshal(updated, updatedPod); err != nil {
+						t.Fatalf("Failed to unmarshal updated pod %q: %v", updated, err)
+					}
+					patchedPods = append(patchedPods, updatedPod)
+					mu.Lock()
+					defer mu.Unlock()
+					patchedPodNames.Insert(podName)
+					return true, nil, nil
+				})
+				client.PrependReactor("delete", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) {
+					mu.Lock()
+					defer mu.Unlock()
+					deletedPodNames.Insert(action.(clienttesting.DeleteAction).GetName())
+					return true, nil, nil
+				})

-			// Call preempt again and make sure it doesn't preempt any more pods.
-			res, status = pe.Preempt(ctx, test.pod, framework.NewDefaultNodeToStatus())
-			if !status.IsSuccess() && !status.IsRejected() {
-				t.Errorf("unexpected error in preemption: %v", status.AsError())
-			}
-			if res != nil && res.NominatingInfo != nil && len(deletedPodNames) > 0 {
-				t.Errorf("didn't expect any more preemption. Node %v is selected for preemption.", res.NominatedNodeName)
-			}
-		})
+				logger, ctx := ktesting.NewTestContext(t)
+				ctx, cancel := context.WithCancel(ctx)
+				defer cancel()
+
+				waitingPods := frameworkruntime.NewWaitingPodsMap()
+
+				cache := internalcache.New(ctx, time.Duration(0))
+				for _, pod := range testPods {
+					if err := cache.AddPod(logger, pod.DeepCopy()); err != nil {
+						t.Fatalf("Failed to add pod %s: %v", pod.Name, err)
+					}
+				}
+				cachedNodeInfoMap := map[string]*framework.NodeInfo{}
+				nodes := make([]*v1.Node, len(test.nodeNames))
+				for i, name := range test.nodeNames {
+					node := st.MakeNode().Name(name).Capacity(veryLargeRes).Obj()
+					// Split node name by '/' to form labels in a format of
+					// {"hostname": node.Name[0], "zone": node.Name[1], "region": node.Name[2]}
+					node.ObjectMeta.Labels = make(map[string]string)
+					for i, label := range strings.Split(node.Name, "/") {
+						node.ObjectMeta.Labels[labelKeys[i]] = label
+					}
+					node.Name = node.ObjectMeta.Labels["hostname"]
+					t.Logf("node is added: %v. labels: %#v", node.Name, node.ObjectMeta.Labels)
+					cache.AddNode(logger, node)
+					nodes[i] = node
+
+					// Set nodeInfo to extenders to mock extenders' cache for preemption.
+					cachedNodeInfo := framework.NewNodeInfo()
+					cachedNodeInfo.SetNode(node)
+					cachedNodeInfoMap[node.Name] = cachedNodeInfo
+				}
+				var extenders []framework.Extender
+				for _, extender := range test.extenders {
+					// Set nodeInfoMap as extenders cached node information.
+					extender.CachedNodeNameToInfo = cachedNodeInfoMap
+					extenders = append(extenders, extender)
+				}
+				fwk, err := tf.NewFramework(
+					ctx,
+					[]tf.RegisterPluginFunc{
+						test.registerPlugin,
+						tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New),
+						tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New),
+					},
+					"",
+					frameworkruntime.WithClientSet(client),
+					frameworkruntime.WithEventRecorder(&events.FakeRecorder{}),
+					frameworkruntime.WithExtenders(extenders),
+					frameworkruntime.WithPodNominator(internalqueue.NewSchedulingQueue(nil, informerFactory)),
+					frameworkruntime.WithSnapshotSharedLister(internalcache.NewSnapshot(testPods, nodes)),
+					frameworkruntime.WithInformerFactory(informerFactory),
+					frameworkruntime.WithWaitingPods(waitingPods),
+					frameworkruntime.WithLogger(logger),
+				)
+				if err != nil {
+					t.Fatal(err)
+				}
+
+				state := framework.NewCycleState()
+				// Some tests rely on PreFilter plugin to compute its CycleState.
+				if _, s, _ := fwk.RunPreFilterPlugins(ctx, state, testPod); !s.IsSuccess() {
+					t.Errorf("Unexpected preFilterStatus: %v", s)
+				}
+				// Call preempt and check the expected results.
+				pl := DefaultPreemption{
+					fh:        fwk,
+					podLister: informerFactory.Core().V1().Pods().Lister(),
+					pdbLister: getPDBLister(informerFactory),
+					args:      *getDefaultDefaultPreemptionArgs(),
+				}
+
+				pe := preemption.NewEvaluator(names.DefaultPreemption, pl.fh, &pl, asyncPreemptionEnabled)
+
+				// so that these nodes are eligible for preemption, we set their status
+				// to Unschedulable.
+
+				nodeToStatusMap := framework.NewDefaultNodeToStatus()
+				for _, n := range nodes {
+					nodeToStatusMap.Set(n.Name, framework.NewStatus(framework.Unschedulable))
+				}
+
+				res, status := pe.Preempt(ctx, state, testPod, nodeToStatusMap)
+				if !status.IsSuccess() && !status.IsRejected() {
+					t.Errorf("unexpected error in preemption: %v", status.AsError())
+				}
+				if diff := cmp.Diff(test.want, res); diff != "" {
+					t.Errorf("Unexpected status (-want, +got):\n%s", diff)
+				}
+
+				if asyncPreemptionEnabled {
+					// Wait for the pod to be deleted.
+					if err := wait.PollUntilContextTimeout(ctx, time.Millisecond*200, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
+						mu.RLock()
+						defer mu.RUnlock()
+						return len(deletedPodNames) == len(test.expectedPods), nil
+					}); err != nil {
+						t.Errorf("expected %v pods to be deleted, got %v.", len(test.expectedPods), len(deletedPodNames))
+					}
+				} else {
+					mu.RLock()
+					// If async preemption is disabled, the pod should be deleted immediately.
+					if len(deletedPodNames) != len(test.expectedPods) {
+						t.Errorf("expected %v pods to be deleted, got %v.", len(test.expectedPods), len(deletedPodNames))
+					}
+					mu.RUnlock()
+				}
+
+				mu.RLock()
+				if diff := cmp.Diff(sets.List(patchedPodNames), sets.List(deletedPodNames)); diff != "" {
+					t.Errorf("unexpected difference in the set of patched and deleted pods: %s", diff)
+				}
+
+				// Make sure that the DisruptionTarget condition has been added to the pod status
+				for _, patchedPod := range patchedPods {
+					expectedPodCondition := &v1.PodCondition{
+						Type:    v1.DisruptionTarget,
+						Status:  v1.ConditionTrue,
+						Reason:  v1.PodReasonPreemptionByScheduler,
+						Message: fmt.Sprintf("%s: preempting to accommodate a higher priority pod", patchedPod.Spec.SchedulerName),
+					}
+
+					_, condition := apipod.GetPodCondition(&patchedPod.Status, v1.DisruptionTarget)
+					if diff := cmp.Diff(condition, expectedPodCondition, cmpopts.IgnoreFields(v1.PodCondition{}, "LastTransitionTime")); diff != "" {
+						t.Fatalf("unexpected difference in the pod %q DisruptionTarget condition: %s", patchedPod.Name, diff)
+					}
+				}
+
+				for victimName := range deletedPodNames {
+					found := false
+					for _, expPod := range test.expectedPods {
+						if expPod == victimName {
+							found = true
+							break
+						}
+					}
+					if !found {
+						t.Errorf("pod %v is not expected to be a victim.", victimName)
+					}
+				}
+				if res != nil && res.NominatingInfo != nil {
+					testPod.Status.NominatedNodeName = res.NominatedNodeName
+				}
+
+				// Manually set the deleted Pods' deletionTimestamp to non-nil.
+				for _, pod := range testPods {
+					if deletedPodNames.Has(pod.Name) {
+						now := metav1.Now()
+						pod.DeletionTimestamp = &now
+						deletedPodNames.Delete(pod.Name)
+					}
+				}
+				mu.RUnlock()
+
+				// Call preempt again and make sure it doesn't preempt any more pods.
+				res, status = pe.Preempt(ctx, state, testPod, framework.NewDefaultNodeToStatus())
+				if !status.IsSuccess() && !status.IsRejected() {
+					t.Errorf("unexpected error in preemption: %v", status.AsError())
+				}
+				if res != nil && res.NominatingInfo != nil && len(deletedPodNames) > 0 {
+					t.Errorf("didn't expect any more preemption. Node %v is selected for preemption.", res.NominatedNodeName)
+				}
+			})
+		}
 	}
 }
--- a/pkg/scheduler/framework/plugins/feature/feature.go
+++ b/pkg/scheduler/framework/plugins/feature/feature.go
@ -28,4 +28,5 @@ type Features struct {
 	EnableInPlacePodVerticalScaling              bool
 	EnableSidecarContainers                      bool
 	EnableSchedulingQueueHint                    bool
+	EnableAsyncPreemption                        bool
 }
--- a/pkg/scheduler/framework/plugins/registry.go
+++ b/pkg/scheduler/framework/plugins/registry.go
@ -54,6 +54,7 @@ func NewInTreeRegistry() runtime.Registry {
 		EnableInPlacePodVerticalScaling:              feature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
 		EnableSidecarContainers:                      feature.DefaultFeatureGate.Enabled(features.SidecarContainers),
 		EnableSchedulingQueueHint:                    feature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints),
+		EnableAsyncPreemption:                        feature.DefaultFeatureGate.Enabled(features.SchedulerAsyncPreemption),
 	}

 	registry := runtime.Registry{
--- a/pkg/scheduler/framework/preemption/preemption.go
+++ b/pkg/scheduler/framework/preemption/preemption.go
@ -26,7 +26,9 @@ import (

 	v1 "k8s.io/api/core/v1"
 	policy "k8s.io/api/policy/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/types"
 	utilerrors "k8s.io/apimachinery/pkg/util/errors"
 	corelisters "k8s.io/client-go/listers/core/v1"
 	policylisters "k8s.io/client-go/listers/policy/v1"
@ -36,6 +38,7 @@ import (
 	apipod "k8s.io/kubernetes/pkg/api/v1/pod"
 	"k8s.io/kubernetes/pkg/scheduler/framework"
 	"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
+	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
 	"k8s.io/kubernetes/pkg/scheduler/metrics"
 	"k8s.io/kubernetes/pkg/scheduler/util"
 )
@ -125,10 +128,48 @@ type Evaluator struct {
 	Handler    framework.Handle
 	PodLister  corelisters.PodLister
 	PdbLister  policylisters.PodDisruptionBudgetLister
-	State      *framework.CycleState
+
+	enableAsyncPreemption bool
+	mu                    sync.RWMutex
+	// preempting is a map that records the pods that are currently triggering preemption asynchronously,
+	// which is used to prevent the pods from entering the scheduling cycle meanwhile.
+	preempting map[types.UID]struct{}
+
+	// PreemptPod is a function that actually makes API calls to preempt a specific Pod.
+	// This is exposed to be replaced during tests.
+	PreemptPod func(ctx context.Context, c Candidate, preemptor, victim *v1.Pod, pluginName string) error
+
 	Interface
 }

+func NewEvaluator(pluginName string, fh framework.Handle, i Interface, enableAsyncPreemption bool) *Evaluator {
+	podLister := fh.SharedInformerFactory().Core().V1().Pods().Lister()
+	pdbLister := fh.SharedInformerFactory().Policy().V1().PodDisruptionBudgets().Lister()
+
+	e := &Evaluator{
+		PluginName:            names.DefaultPreemption,
+		Handler:               fh,
+		PodLister:             podLister,
+		PdbLister:             pdbLister,
+		Interface:             i,
+		enableAsyncPreemption: enableAsyncPreemption,
+		preempting:            make(map[types.UID]struct{}),
+	}
+
+	e.PreemptPod = e.preemptPod
+
+	return e
+}
+
+// IsPodRunningPreemption returns true if the pod is currently triggering preemption asynchronously.
+func (ev *Evaluator) IsPodRunningPreemption(podUID types.UID) bool {
+	ev.mu.RLock()
+	defer ev.mu.RUnlock()
+
+	_, ok := ev.preempting[podUID]
+	return ok
+}
+
 // Preempt returns a PostFilterResult carrying suggested nominatedNodeName, along with a Status.
 // The semantics of returned <PostFilterResult, Status> varies on different scenarios:
 //
@ -145,7 +186,7 @@ type Evaluator struct {
 //
 //   - <non-nil PostFilterResult, Success>. It's the regular happy path
 //     and the non-empty nominatedNodeName will be applied to the preemptor pod.
-func (ev *Evaluator) Preempt(ctx context.Context, pod *v1.Pod, m framework.NodeToStatusReader) (*framework.PostFilterResult, *framework.Status) {
+func (ev *Evaluator) Preempt(ctx context.Context, state *framework.CycleState, pod *v1.Pod, m framework.NodeToStatusReader) (*framework.PostFilterResult, *framework.Status) {
 	logger := klog.FromContext(ctx)

 	// 0) Fetch the latest version of <pod>.
@ -171,7 +212,7 @@ func (ev *Evaluator) Preempt(ctx context.Context, pod *v1.Pod, m framework.NodeT
 	if err != nil {
 		return nil, framework.AsStatus(err)
 	}
-	candidates, nodeToStatusMap, err := ev.findCandidates(ctx, allNodes, pod, m)
+	candidates, nodeToStatusMap, err := ev.findCandidates(ctx, state, allNodes, pod, m)
 	if err != nil && len(candidates) == 0 {
 		return nil, framework.AsStatus(err)
 	}
@ -203,9 +244,17 @@ func (ev *Evaluator) Preempt(ctx context.Context, pod *v1.Pod, m framework.NodeT
 		return nil, framework.NewStatus(framework.Unschedulable, "no candidate node for preemption")
 	}

+	logger.V(2).Info("the target node for the preemption is determined", "node", bestCandidate.Name(), "pod", klog.KObj(pod))
+
 	// 5) Perform preparation work before nominating the selected candidate.
-	if status := ev.prepareCandidate(ctx, bestCandidate, pod, ev.PluginName); !status.IsSuccess() {
-		return nil, status
+	if ev.enableAsyncPreemption {
+		if status := ev.prepareCandidateAsync(bestCandidate, pod, ev.PluginName); !status.IsSuccess() {
+			return nil, status
+		}
+	} else {
+		if status := ev.prepareCandidate(ctx, bestCandidate, pod, ev.PluginName); !status.IsSuccess() {
+			return nil, status
+		}
 	}

 	return framework.NewPostFilterResultWithNominatedNode(bestCandidate.Name()), framework.NewStatus(framework.Success)
@ -213,7 +262,7 @@ func (ev *Evaluator) Preempt(ctx context.Context, pod *v1.Pod, m framework.NodeT

 // FindCandidates calculates a slice of preemption candidates.
 // Each candidate is executable to make the given <pod> schedulable.
-func (ev *Evaluator) findCandidates(ctx context.Context, allNodes []*framework.NodeInfo, pod *v1.Pod, m framework.NodeToStatusReader) ([]Candidate, *framework.NodeToStatus, error) {
+func (ev *Evaluator) findCandidates(ctx context.Context, state *framework.CycleState, allNodes []*framework.NodeInfo, pod *v1.Pod, m framework.NodeToStatusReader) ([]Candidate, *framework.NodeToStatus, error) {
 	if len(allNodes) == 0 {
 		return nil, nil, errors.New("no nodes available")
 	}
@ -239,7 +288,7 @@ func (ev *Evaluator) findCandidates(ctx context.Context, allNodes []*framework.N
 	}

 	offset, candidatesNum := ev.GetOffsetAndNumCandidates(int32(len(potentialNodes)))
-	return ev.DryRunPreemption(ctx, pod, potentialNodes, pdbs, offset, candidatesNum)
+	return ev.DryRunPreemption(ctx, state, pod, potentialNodes, pdbs, offset, candidatesNum)
 }

 // callExtenders calls given <extenders> to select the list of feasible candidates.
@ -335,6 +384,46 @@ func (ev *Evaluator) SelectCandidate(ctx context.Context, candidates []Candidate
 	return candidates[0]
 }

+// preemptPod actually makes API calls to preempt a specific Pod.
+func (ev *Evaluator) preemptPod(ctx context.Context, c Candidate, preemptor, victim *v1.Pod, pluginName string) error {
+	logger := klog.FromContext(ctx)
+
+	// If the victim is a WaitingPod, send a reject message to the PermitPlugin.
+	// Otherwise we should delete the victim.
+	if waitingPod := ev.Handler.GetWaitingPod(victim.UID); waitingPod != nil {
+		waitingPod.Reject(pluginName, "preempted")
+		logger.V(2).Info("Preemptor pod rejected a waiting pod", "preemptor", klog.KObj(preemptor), "waitingPod", klog.KObj(victim), "node", c.Name())
+	} else {
+		condition := &v1.PodCondition{
+			Type:    v1.DisruptionTarget,
+			Status:  v1.ConditionTrue,
+			Reason:  v1.PodReasonPreemptionByScheduler,
+			Message: fmt.Sprintf("%s: preempting to accommodate a higher priority pod", preemptor.Spec.SchedulerName),
+		}
+		newStatus := victim.Status.DeepCopy()
+		updated := apipod.UpdatePodCondition(newStatus, condition)
+		if updated {
+			if err := util.PatchPodStatus(ctx, ev.Handler.ClientSet(), victim, newStatus); err != nil {
+				logger.Error(err, "Could not add DisruptionTarget condition due to preemption", "pod", klog.KObj(victim), "preemptor", klog.KObj(preemptor))
+				return err
+			}
+		}
+		if err := util.DeletePod(ctx, ev.Handler.ClientSet(), victim); err != nil {
+			if !apierrors.IsNotFound(err) {
+				logger.Error(err, "Preempted pod", "pod", klog.KObj(victim), "preemptor", klog.KObj(preemptor))
+				return err
+			}
+			logger.V(2).Info("Victim Pod is already deleted", "preemptor", klog.KObj(preemptor), "victim", klog.KObj(victim), "node", c.Name())
+			return nil
+		}
+		logger.V(2).Info("Preemptor Pod preempted victim Pod", "preemptor", klog.KObj(preemptor), "victim", klog.KObj(victim), "node", c.Name())
+	}
+
+	ev.Handler.EventRecorder().Eventf(victim, preemptor, v1.EventTypeNormal, "Preempted", "Preempting", "Preempted by pod %v on node %v", preemptor.UID, c.Name())
+
+	return nil
+}
+
 // prepareCandidate does some preparation work before nominating the selected candidate:
 // - Evict the victim pods
 // - Reject the victim pods if they are in waitingPod map
@ -347,41 +436,11 @@ func (ev *Evaluator) prepareCandidate(ctx context.Context, c Candidate, pod *v1.
 	defer cancel()
 	logger := klog.FromContext(ctx)
 	errCh := parallelize.NewErrorChannel()
-	preemptPod := func(index int) {
-		victim := c.Victims().Pods[index]
-		// If the victim is a WaitingPod, send a reject message to the PermitPlugin.
-		// Otherwise we should delete the victim.
-		if waitingPod := fh.GetWaitingPod(victim.UID); waitingPod != nil {
-			waitingPod.Reject(pluginName, "preempted")
-			logger.V(2).Info("Preemptor pod rejected a waiting pod", "preemptor", klog.KObj(pod), "waitingPod", klog.KObj(victim), "node", c.Name())
-		} else {
-			condition := &v1.PodCondition{
-				Type:    v1.DisruptionTarget,
-				Status:  v1.ConditionTrue,
-				Reason:  v1.PodReasonPreemptionByScheduler,
-				Message: fmt.Sprintf("%s: preempting to accommodate a higher priority pod", pod.Spec.SchedulerName),
-			}
-			newStatus := victim.Status.DeepCopy()
-			updated := apipod.UpdatePodCondition(newStatus, condition)
-			if updated {
-				if err := util.PatchPodStatus(ctx, cs, victim, newStatus); err != nil {
-					logger.Error(err, "Could not add DisruptionTarget condition due to preemption", "pod", klog.KObj(victim), "preemptor", klog.KObj(pod))
-					errCh.SendErrorWithCancel(err, cancel)
-					return
-				}
-			}
-			if err := util.DeletePod(ctx, cs, victim); err != nil {
-				logger.Error(err, "Preempted pod", "pod", klog.KObj(victim), "preemptor", klog.KObj(pod))
-				errCh.SendErrorWithCancel(err, cancel)
-				return
-			}
-			logger.V(2).Info("Preemptor Pod preempted victim Pod", "preemptor", klog.KObj(pod), "victim", klog.KObj(victim), "node", c.Name())
+	fh.Parallelizer().Until(ctx, len(c.Victims().Pods), func(index int) {
+		if err := ev.PreemptPod(ctx, c, pod, c.Victims().Pods[index], pluginName); err != nil {
+			errCh.SendErrorWithCancel(err, cancel)
 		}
-
-		fh.EventRecorder().Eventf(victim, pod, v1.EventTypeNormal, "Preempted", "Preempting", "Preempted by pod %v on node %v", pod.UID, c.Name())
-	}
-
-	fh.Parallelizer().Until(ctx, len(c.Victims().Pods), preemptPod, ev.PluginName)
+	}, ev.PluginName)
 	if err := errCh.ReceiveError(); err != nil {
 		return framework.AsStatus(err)
 	}
@ -401,6 +460,72 @@ func (ev *Evaluator) prepareCandidate(ctx context.Context, c Candidate, pod *v1.
 	return nil
 }

+// prepareCandidateAsync triggers a goroutine for some preparation work:
+// - Evict the victim pods
+// - Reject the victim pods if they are in waitingPod map
+// - Clear the low-priority pods' nominatedNodeName status if needed
+// The Pod won't be retried until the goroutine triggered here completes.
+//
+// See http://kep.k8s.io/4832 for how the async preemption works.
+func (ev *Evaluator) prepareCandidateAsync(c Candidate, pod *v1.Pod, pluginName string) *framework.Status {
+	metrics.PreemptionVictims.Observe(float64(len(c.Victims().Pods)))
+
+	// intentionally create a new context, not using a ctx from the scheduling cycle, to create ctx,
+	// because this process could continue even after this scheduling cycle finishes.
+	ctx, cancel := context.WithCancel(context.Background())
+	errCh := parallelize.NewErrorChannel()
+	preemptPod := func(index int) {
+		victim := c.Victims().Pods[index]
+		if err := ev.PreemptPod(ctx, c, pod, victim, pluginName); err != nil {
+			errCh.SendErrorWithCancel(err, cancel)
+		}
+	}
+
+	ev.mu.Lock()
+	ev.preempting[pod.UID] = struct{}{}
+	ev.mu.Unlock()
+
+	logger := klog.FromContext(ctx)
+	go func() { // TODO: use paralizer
+		defer cancel()
+		logger.V(2).Info("Start the preemption asynchronously", "preemptor", klog.KObj(pod), "node", c.Name(), "numVictims", len(c.Victims().Pods))
+
+		// Lower priority pods nominated to run on this node, may no longer fit on
+		// this node. So, we should remove their nomination. Removing their
+		// nomination updates these pods and moves them to the active queue. It
+		// lets scheduler find another place for them.
+		nominatedPods := getLowerPriorityNominatedPods(logger, ev.Handler, pod, c.Name())
+		if err := util.ClearNominatedNodeName(ctx, ev.Handler.ClientSet(), nominatedPods...); err != nil {
+			logger.Error(err, "Cannot clear 'NominatedNodeName' field")
+			// We do not return as this error is not critical.
+		}
+
+		// We can evict all victims in parallel, but the last one.
+		// We have to remove the pod from the preempting map before the last one is evicted
+		// because, otherwise, the pod removal might be notified to the scheduling queue before
+		// we remove this pod from the preempting map,
+		// and the pod could end up stucking at the unschedulable pod pool
+		// by all the pod removal events being ignored.
+
+		ev.Handler.Parallelizer().Until(ctx, len(c.Victims().Pods)-1, preemptPod, ev.PluginName)
+		if err := errCh.ReceiveError(); err != nil {
+			logger.Error(err, "Error occurred during preemption")
+		}
+
+		ev.mu.Lock()
+		delete(ev.preempting, pod.UID)
+		ev.mu.Unlock()
+
+		if err := ev.PreemptPod(ctx, c, pod, c.Victims().Pods[len(c.Victims().Pods)-1], pluginName); err != nil {
+			logger.Error(err, "Error occurred during preemption")
+		}
+
+		logger.V(2).Info("Async Preemption finished completely", "preemptor", klog.KObj(pod), "node", c.Name())
+	}()
+
+	return nil
+}
+
 func getPodDisruptionBudgets(pdbLister policylisters.PodDisruptionBudgetLister) ([]*policy.PodDisruptionBudget, error) {
 	if pdbLister != nil {
 		return pdbLister.List(labels.Everything())
@ -538,7 +663,7 @@ func getLowerPriorityNominatedPods(logger klog.Logger, pn framework.PodNominator
 // The number of candidates depends on the constraints defined in the plugin's args. In the returned list of
 // candidates, ones that do not violate PDB are preferred over ones that do.
 // NOTE: This method is exported for easier testing in default preemption.
-func (ev *Evaluator) DryRunPreemption(ctx context.Context, pod *v1.Pod, potentialNodes []*framework.NodeInfo,
+func (ev *Evaluator) DryRunPreemption(ctx context.Context, state *framework.CycleState, pod *v1.Pod, potentialNodes []*framework.NodeInfo,
 	pdbs []*policy.PodDisruptionBudget, offset int32, candidatesNum int32) ([]Candidate, *framework.NodeToStatus, error) {

 	fh := ev.Handler
@ -557,7 +682,7 @@ func (ev *Evaluator) DryRunPreemption(ctx context.Context, pod *v1.Pod, potentia
 		nodeInfoCopy := potentialNodes[(int(offset)+i)%len(potentialNodes)].Snapshot()
 		logger.V(5).Info("Check the potential node for preemption", "node", nodeInfoCopy.Node().Name)

-		stateCopy := ev.State.Clone()
+		stateCopy := state.Clone()
 		pods, numPDBViolations, status := ev.SelectVictimsOnNode(ctx, stateCopy, pod, nodeInfoCopy, pdbs)
 		if status.IsSuccess() && len(pods) != 0 {
 			victims := extenderv1.Victims{
--- a/pkg/scheduler/framework/preemption/preemption_test.go
+++ b/pkg/scheduler/framework/preemption/preemption_test.go
@ -243,9 +243,8 @@ func TestDryRunPreemption(t *testing.T) {
 					PluginName: "FakePostFilter",
 					Handler:    fwk,
 					Interface:  fakePostPlugin,
-					State:      state,
 				}
-				got, _, _ := pe.DryRunPreemption(ctx, pod, nodeInfos, nil, 0, int32(len(nodeInfos)))
+				got, _, _ := pe.DryRunPreemption(ctx, state, pod, nodeInfos, nil, 0, int32(len(nodeInfos)))
 				// Sort the values (inner victims) and the candidate itself (by its NominatedNodeName).
 				for i := range got {
 					victims := got[i].Victims().Pods
@ -344,9 +343,8 @@ func TestSelectCandidate(t *testing.T) {
 					PluginName: "FakePreemptionScorePostFilter",
 					Handler:    fwk,
 					Interface:  fakePreemptionScorePostFilterPlugin,
-					State:      state,
 				}
-				candidates, _, _ := pe.DryRunPreemption(ctx, pod, nodeInfos, nil, 0, int32(len(nodeInfos)))
+				candidates, _, _ := pe.DryRunPreemption(ctx, state, pod, nodeInfos, nil, 0, int32(len(nodeInfos)))
 				s := pe.SelectCandidate(ctx, candidates)
 				if s == nil || len(s.Name()) == 0 {
 					t.Errorf("expect any node in %v, but no candidate selected", tt.expected)
--- a/pkg/scheduler/testing/wrappers.go
+++ b/pkg/scheduler/testing/wrappers.go
@ -310,6 +310,12 @@ func (p *PodWrapper) Name(s string) *PodWrapper {
 	return p
 }

+// Name sets `s` as the name of the inner pod.
+func (p *PodWrapper) GenerateName(s string) *PodWrapper {
+	p.SetGenerateName(s)
+	return p
+}
+
 // UID sets `s` as the UID of the inner pod.
 func (p *PodWrapper) UID(s string) *PodWrapper {
 	p.SetUID(types.UID(s))
--- a/test/featuregates_linter/test_data/versioned_feature_list.yaml
+++ b/test/featuregates_linter/test_data/versioned_feature_list.yaml
@ -1060,6 +1060,12 @@
    lockToDefault: false
    preRelease: Alpha
    version: "1.29"
+- name: SchedulerAsyncPreemption
+  versionedSpecs:
+  - default: false
+    lockToDefault: false
+    preRelease: Alpha
+    version: "1.32"
 - name: SchedulerQueueingHints
  versionedSpecs:
  - default: false
--- a/test/integration/scheduler/preemption/preemption_test.go
+++ b/test/integration/scheduler/preemption/preemption_test.go
--- a/test/integration/scheduler_perf/misc/performance-config.yaml
+++ b/test/integration/scheduler_perf/misc/performance-config.yaml
@ -358,6 +358,16 @@
      SchedulerQueueingHints: false
    labels: [performance]
    threshold: 200
+    featureGates:
+      SchedulerAsyncPreemption: false
+    params:
+      initNodes: 5000
+      initPods: 20000
+      measurePods: 5000
+  - name: 5000Nodes_AsyncPreemption
+    labels: [performance]
+    featureGates:
+      SchedulerAsyncPreemption: true
    params:
      initNodes: 5000
      initPods: 20000
--- a/test/integration/scheduler_perf/templates/pod-high-priority.yaml
+++ b/test/integration/scheduler_perf/templates/pod-high-priority.yaml
@ -1,7 +1,7 @@
 apiVersion: v1
 kind: Pod
 metadata:
-  generateName: pod-
+  generateName: pod-high-priority-
 spec:
  priority: 10
  containers: