mirror of https://github.com/k3s-io/kubernetes.git
synced 2025-08-02 08:17:26 +00:00

feature(KEP-4832): asynchronous preemption

This commit is contained in:
parent 3184eb3d1b
commit 69a8d0ec0b
@@ -579,6 +579,14 @@ const (
 	// which benefits to reduce the useless requeueing.
 	SchedulerQueueingHints featuregate.Feature = "SchedulerQueueingHints"
 
+	// owner: @sanposhiho
+	// kep: http://kep.k8s.io/4832
+	// alpha: v1.32
+	//
+	// Running some expensive operation within the scheduler's preemption asynchronously,
+	// which improves the scheduling latency when the preemption involves in.
+	SchedulerAsyncPreemption featuregate.Feature = "SchedulerAsyncPreemption"
+
 	// owner: @atosatto @yuanchen8911
 	// kep: http://kep.k8s.io/3902
 	//
@@ -639,6 +639,10 @@ var defaultVersionedKubernetesFeatureGates = map[featuregate.Feature]featuregate
 		{Version: version.MustParse("1.28"), Default: false, PreRelease: featuregate.Beta},
 	},
 
+	SchedulerAsyncPreemption: {
+		{Version: version.MustParse("1.32"), Default: false, PreRelease: featuregate.Alpha},
+	},
+
 	SELinuxChangePolicy: {
 		{Version: version.MustParse("1.32"), Default: false, PreRelease: featuregate.Alpha},
 	},
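Editor's note: the new gate ships as alpha and defaults to off in v1.32, so anything exercising the asynchronous path has to enable it explicitly. As a hedged illustration (not part of this commit), a unit test in this repository would typically flip the gate for its own duration via the component-base feature-gate test helper; the exact helper signature is assumed here:

	import (
		"testing"

		utilfeature "k8s.io/apiserver/pkg/util/feature"
		featuregatetesting "k8s.io/component-base/featuregate/testing"
		"k8s.io/kubernetes/pkg/features"
	)

	func TestWithAsyncPreemption(t *testing.T) {
		// Enable the alpha SchedulerAsyncPreemption gate only for this test.
		featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerAsyncPreemption, true)

		// ... exercise scheduling/preemption with the gate on ...
	}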
@@ -52,6 +52,7 @@ var ExpandedPluginsV1 = &config.Plugins{
 	PreEnqueue: config.PluginSet{
 		Enabled: []config.Plugin{
 			{Name: names.SchedulingGates},
+			{Name: names.DefaultPreemption},
 		},
 	},
 	QueueSort: config.PluginSet{
@@ -53,6 +53,7 @@ type DefaultPreemption struct {
 	args config.DefaultPreemptionArgs
 	podLister corelisters.PodLister
 	pdbLister policylisters.PodDisruptionBudgetLister
+	Evaluator *preemption.Evaluator
 }
 
 var _ framework.PostFilterPlugin = &DefaultPreemption{}
@@ -71,13 +72,19 @@ func New(_ context.Context, dpArgs runtime.Object, fh framework.Handle, fts feat
 	if err := validation.ValidateDefaultPreemptionArgs(nil, args); err != nil {
 		return nil, err
 	}
 
+	podLister := fh.SharedInformerFactory().Core().V1().Pods().Lister()
+	pdbLister := getPDBLister(fh.SharedInformerFactory())
+
 	pl := DefaultPreemption{
 		fh: fh,
 		fts: fts,
 		args: *args,
-		podLister: fh.SharedInformerFactory().Core().V1().Pods().Lister(),
-		pdbLister: getPDBLister(fh.SharedInformerFactory()),
+		podLister: podLister,
+		pdbLister: pdbLister,
 	}
+	pl.Evaluator = preemption.NewEvaluator(Name, fh, &pl, fts.EnableAsyncPreemption)
 
 	return &pl, nil
 }
 
@@ -87,16 +94,7 @@ func (pl *DefaultPreemption) PostFilter(ctx context.Context, state *framework.Cy
 		metrics.PreemptionAttempts.Inc()
 	}()
 
-	pe := preemption.Evaluator{
-		PluginName: names.DefaultPreemption,
-		Handler: pl.fh,
-		PodLister: pl.podLister,
-		PdbLister: pl.pdbLister,
-		State: state,
-		Interface: pl,
-	}
-
-	result, status := pe.Preempt(ctx, pod, m)
+	result, status := pl.Evaluator.Preempt(ctx, state, pod, m)
 	msg := status.Message()
 	if len(msg) > 0 {
 		return result, framework.NewStatus(status.Code(), "preemption: "+msg)
@@ -104,6 +102,22 @@ func (pl *DefaultPreemption) PostFilter(ctx context.Context, state *framework.Cy
 	return result, status
 }
 
+func (pl *DefaultPreemption) PreEnqueue(ctx context.Context, p *v1.Pod) *framework.Status {
+	if !pl.fts.EnableAsyncPreemption {
+		return nil
+	}
+	if pl.Evaluator.IsPodRunningPreemption(p.GetUID()) {
+		return framework.NewStatus(framework.UnschedulableAndUnresolvable, "waiting for the preemption for this pod to be finished")
+	}
+	return nil
+}
+
+// EventsToRegister returns the possible events that may make a Pod
+// failed by this plugin schedulable.
+func (pl *DefaultPreemption) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
+	return nil, nil
+}
+
 // calculateNumCandidates returns the number of candidates the FindCandidates
 // method must produce from dry running based on the constraints given by
 // <minCandidateNodesPercentage> and <minCandidateNodesAbsolute>. The number of
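Editor's note: the PreEnqueue hook above is what keeps a preemptor out of the scheduling cycle while its asynchronous preemption API calls are still in flight; the pod is rejected from the active queue until the Evaluator reports that no preemption is running for its UID. A minimal, self-contained sketch of that gating pattern with toy names (not the scheduler's real framework types):

	package main

	import (
		"fmt"
		"sync"
	)

	// preemptionTracker records pods whose asynchronous preemption is in flight.
	type preemptionTracker struct {
		mu         sync.RWMutex
		preempting map[string]struct{} // keyed by pod UID
	}

	func newPreemptionTracker() *preemptionTracker {
		return &preemptionTracker{preempting: make(map[string]struct{})}
	}

	func (t *preemptionTracker) start(uid string) {
		t.mu.Lock()
		defer t.mu.Unlock()
		t.preempting[uid] = struct{}{}
	}

	func (t *preemptionTracker) finish(uid string) {
		t.mu.Lock()
		defer t.mu.Unlock()
		delete(t.preempting, uid)
	}

	func (t *preemptionTracker) isRunning(uid string) bool {
		t.mu.RLock()
		defer t.mu.RUnlock()
		_, ok := t.preempting[uid]
		return ok
	}

	// preEnqueue mirrors the gate: a pod with preemption in flight is held back.
	func preEnqueue(t *preemptionTracker, uid string, asyncEnabled bool) error {
		if !asyncEnabled {
			return nil
		}
		if t.isRunning(uid) {
			return fmt.Errorf("waiting for the preemption for this pod to be finished")
		}
		return nil
	}

	func main() {
		tr := newPreemptionTracker()
		tr.start("pod-uid-1")
		fmt.Println(preEnqueue(tr, "pod-uid-1", true)) // held back while preemption runs
		tr.finish("pod-uid-1")
		fmt.Println(preEnqueue(tr, "pod-uid-1", true)) // nil: allowed into the cycle again
	}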
@@ -25,6 +25,7 @@ import (
 	"math/rand"
 	"sort"
 	"strings"
+	"sync"
 	"testing"
 	"time"
 
@@ -37,6 +38,7 @@ import (
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/apimachinery/pkg/util/strategicpatch"
+	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/client-go/informers"
 	clientsetfake "k8s.io/client-go/kubernetes/fake"
 	clienttesting "k8s.io/client-go/testing"
@@ -436,6 +438,7 @@ func TestPostFilter(t *testing.T) {
 				pdbLister: getPDBLister(informerFactory),
 				args: *getDefaultDefaultPreemptionArgs(),
 			}
+			p.Evaluator = preemption.NewEvaluator(names.DefaultPreemption, f, &p, false)
 
 			state := framework.NewCycleState()
 			// Ensure <state> is populated.
@@ -1206,11 +1209,10 @@ func TestDryRunPreemption(t *testing.T) {
 				Handler: pl.fh,
 				PodLister: pl.podLister,
 				PdbLister: pl.pdbLister,
-				State: state,
 				Interface: pl,
 			}
 			offset, numCandidates := pl.GetOffsetAndNumCandidates(int32(len(nodeInfos)))
-			got, _, _ := pe.DryRunPreemption(ctx, pod, nodeInfos, tt.pdbs, offset, numCandidates)
+			got, _, _ := pe.DryRunPreemption(ctx, state, pod, nodeInfos, tt.pdbs, offset, numCandidates)
 			// Sort the values (inner victims) and the candidate itself (by its NominatedNodeName).
 			for i := range got {
 				victims := got[i].Victims().Pods
@@ -1447,11 +1449,10 @@ func TestSelectBestCandidate(t *testing.T) {
 				Handler: pl.fh,
 				PodLister: pl.podLister,
 				PdbLister: pl.pdbLister,
-				State: state,
 				Interface: pl,
 			}
 			offset, numCandidates := pl.GetOffsetAndNumCandidates(int32(len(nodeInfos)))
-			candidates, _, _ := pe.DryRunPreemption(ctx, tt.pod, nodeInfos, nil, offset, numCandidates)
+			candidates, _, _ := pe.DryRunPreemption(ctx, state, tt.pod, nodeInfos, nil, offset, numCandidates)
 			s := pe.SelectCandidate(ctx, candidates)
 			if s == nil || len(s.Name()) == 0 {
 				return
@@ -1549,6 +1550,7 @@ func TestPodEligibleToPreemptOthers(t *testing.T) {
 	}
 }
 func TestPreempt(t *testing.T) {
+	metrics.Register()
 	tests := []struct {
 		name string
 		pod *v1.Pod
@@ -1713,197 +1715,229 @@ func TestPreempt(t *testing.T) {
 	}
 
 	labelKeys := []string{"hostname", "zone", "region"}
-	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
-			client := clientsetfake.NewClientset()
-			informerFactory := informers.NewSharedInformerFactory(client, 0)
-			podInformer := informerFactory.Core().V1().Pods().Informer()
-			podInformer.GetStore().Add(test.pod)
-			for i := range test.pods {
-				podInformer.GetStore().Add(test.pods[i])
-			}
-			deletedPodNames := sets.New[string]()
-			patchedPodNames := sets.New[string]()
-			patchedPods := []*v1.Pod{}
-			client.PrependReactor("patch", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) {
-				patchAction := action.(clienttesting.PatchAction)
-				podName := patchAction.GetName()
-				namespace := patchAction.GetNamespace()
-				patch := patchAction.GetPatch()
-				pod, err := informerFactory.Core().V1().Pods().Lister().Pods(namespace).Get(podName)
-				if err != nil {
-					t.Fatalf("Failed to get the original pod %s/%s before patching: %v\n", namespace, podName, err)
-				}
-				marshalledPod, err := json.Marshal(pod)
-				if err != nil {
-					t.Fatalf("Failed to marshal the original pod %s/%s: %v", namespace, podName, err)
-				}
-				updated, err := strategicpatch.StrategicMergePatch(marshalledPod, patch, v1.Pod{})
-				if err != nil {
-					t.Fatalf("Failed to apply strategic merge patch %q on pod %#v: %v", patch, marshalledPod, err)
-				}
-				updatedPod := &v1.Pod{}
-				if err := json.Unmarshal(updated, updatedPod); err != nil {
-					t.Fatalf("Failed to unmarshal updated pod %q: %v", updated, err)
-				}
-				patchedPods = append(patchedPods, updatedPod)
-				patchedPodNames.Insert(podName)
-				return true, nil, nil
-			})
-			client.PrependReactor("delete", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) {
-				deletedPodNames.Insert(action.(clienttesting.DeleteAction).GetName())
-				return true, nil, nil
-			})
-
-			logger, ctx := ktesting.NewTestContext(t)
-			ctx, cancel := context.WithCancel(ctx)
-			defer cancel()
-
-			waitingPods := frameworkruntime.NewWaitingPodsMap()
-
-			cache := internalcache.New(ctx, time.Duration(0))
-			for _, pod := range test.pods {
-				cache.AddPod(logger, pod)
-			}
-			cachedNodeInfoMap := map[string]*framework.NodeInfo{}
-			nodes := make([]*v1.Node, len(test.nodeNames))
-			for i, name := range test.nodeNames {
-				node := st.MakeNode().Name(name).Capacity(veryLargeRes).Obj()
-				// Split node name by '/' to form labels in a format of
-				// {"hostname": node.Name[0], "zone": node.Name[1], "region": node.Name[2]}
-				node.ObjectMeta.Labels = make(map[string]string)
-				for i, label := range strings.Split(node.Name, "/") {
-					node.ObjectMeta.Labels[labelKeys[i]] = label
-				}
-				node.Name = node.ObjectMeta.Labels["hostname"]
-				cache.AddNode(logger, node)
-				nodes[i] = node
-
-				// Set nodeInfo to extenders to mock extenders' cache for preemption.
-				cachedNodeInfo := framework.NewNodeInfo()
-				cachedNodeInfo.SetNode(node)
-				cachedNodeInfoMap[node.Name] = cachedNodeInfo
-			}
-			var extenders []framework.Extender
-			for _, extender := range test.extenders {
-				// Set nodeInfoMap as extenders cached node information.
-				extender.CachedNodeNameToInfo = cachedNodeInfoMap
-				extenders = append(extenders, extender)
-			}
-			fwk, err := tf.NewFramework(
-				ctx,
-				[]tf.RegisterPluginFunc{
-					test.registerPlugin,
-					tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New),
-					tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New),
-				},
-				"",
-				frameworkruntime.WithClientSet(client),
-				frameworkruntime.WithEventRecorder(&events.FakeRecorder{}),
-				frameworkruntime.WithExtenders(extenders),
-				frameworkruntime.WithPodNominator(internalqueue.NewSchedulingQueue(nil, informerFactory)),
-				frameworkruntime.WithSnapshotSharedLister(internalcache.NewSnapshot(test.pods, nodes)),
-				frameworkruntime.WithInformerFactory(informerFactory),
-				frameworkruntime.WithWaitingPods(waitingPods),
-				frameworkruntime.WithLogger(logger),
-			)
-			if err != nil {
-				t.Fatal(err)
-			}
-
-			state := framework.NewCycleState()
-			// Some tests rely on PreFilter plugin to compute its CycleState.
-			if _, s, _ := fwk.RunPreFilterPlugins(ctx, state, test.pod); !s.IsSuccess() {
-				t.Errorf("Unexpected preFilterStatus: %v", s)
-			}
-			// Call preempt and check the expected results.
-			pl := DefaultPreemption{
-				fh: fwk,
-				podLister: informerFactory.Core().V1().Pods().Lister(),
-				pdbLister: getPDBLister(informerFactory),
-				args: *getDefaultDefaultPreemptionArgs(),
-			}
-
-			pe := preemption.Evaluator{
-				PluginName: names.DefaultPreemption,
-				Handler: pl.fh,
-				PodLister: pl.podLister,
-				PdbLister: pl.pdbLister,
-				State: state,
-				Interface: &pl,
-			}
-
-			// so that these nodes are eligible for preemption, we set their status
-			// to Unschedulable.
-
-			nodeToStatusMap := framework.NewDefaultNodeToStatus()
-			for _, n := range nodes {
-				nodeToStatusMap.Set(n.Name, framework.NewStatus(framework.Unschedulable))
-			}
-
-			res, status := pe.Preempt(ctx, test.pod, nodeToStatusMap)
-			if !status.IsSuccess() && !status.IsRejected() {
-				t.Errorf("unexpected error in preemption: %v", status.AsError())
-			}
-			if diff := cmp.Diff(test.want, res); diff != "" {
-				t.Errorf("Unexpected status (-want, +got):\n%s", diff)
-			}
-			if len(deletedPodNames) != len(test.expectedPods) {
-				t.Errorf("expected %v pods, got %v.", len(test.expectedPods), len(deletedPodNames))
-			}
-			if diff := cmp.Diff(sets.List(patchedPodNames), sets.List(deletedPodNames)); diff != "" {
-				t.Errorf("unexpected difference in the set of patched and deleted pods: %s", diff)
-			}
-
-			// Make sure that the DisruptionTarget condition has been added to the pod status
-			for _, patchedPod := range patchedPods {
-				expectedPodCondition := &v1.PodCondition{
-					Type: v1.DisruptionTarget,
-					Status: v1.ConditionTrue,
-					Reason: v1.PodReasonPreemptionByScheduler,
-					Message: fmt.Sprintf("%s: preempting to accommodate a higher priority pod", patchedPod.Spec.SchedulerName),
-				}
-
-				_, condition := apipod.GetPodCondition(&patchedPod.Status, v1.DisruptionTarget)
-				if diff := cmp.Diff(condition, expectedPodCondition, cmpopts.IgnoreFields(v1.PodCondition{}, "LastTransitionTime")); diff != "" {
-					t.Fatalf("unexpected difference in the pod %q DisruptionTarget condition: %s", patchedPod.Name, diff)
-				}
-			}
-
-			for victimName := range deletedPodNames {
-				found := false
-				for _, expPod := range test.expectedPods {
-					if expPod == victimName {
-						found = true
-						break
-					}
-				}
-				if !found {
-					t.Errorf("pod %v is not expected to be a victim.", victimName)
-				}
-			}
-			if res != nil && res.NominatingInfo != nil {
-				test.pod.Status.NominatedNodeName = res.NominatedNodeName
-			}
-
-			// Manually set the deleted Pods' deletionTimestamp to non-nil.
-			for _, pod := range test.pods {
-				if deletedPodNames.Has(pod.Name) {
-					now := metav1.Now()
-					pod.DeletionTimestamp = &now
-					deletedPodNames.Delete(pod.Name)
-				}
-			}
-
-			// Call preempt again and make sure it doesn't preempt any more pods.
-			res, status = pe.Preempt(ctx, test.pod, framework.NewDefaultNodeToStatus())
-			if !status.IsSuccess() && !status.IsRejected() {
-				t.Errorf("unexpected error in preemption: %v", status.AsError())
-			}
-			if res != nil && res.NominatingInfo != nil && len(deletedPodNames) > 0 {
-				t.Errorf("didn't expect any more preemption. Node %v is selected for preemption.", res.NominatedNodeName)
-			}
-		})
+	for _, asyncPreemptionEnabled := range []bool{true, false} {
+		for _, test := range tests {
+			t.Run(fmt.Sprintf("%s (Async preemption enabled: %v)", test.name, asyncPreemptionEnabled), func(t *testing.T) {
+				client := clientsetfake.NewClientset()
+				informerFactory := informers.NewSharedInformerFactory(client, 0)
+				podInformer := informerFactory.Core().V1().Pods().Informer()
+				testPod := test.pod.DeepCopy()
+				testPods := make([]*v1.Pod, len(test.pods))
+				for i := range test.pods {
+					testPods[i] = test.pods[i].DeepCopy()
+				}
+
+				if err := podInformer.GetStore().Add(testPod); err != nil {
+					t.Fatalf("Failed to add test pod %s: %v", testPod.Name, err)
+				}
+				for i := range testPods {
+					if err := podInformer.GetStore().Add(testPods[i]); err != nil {
+						t.Fatalf("Failed to add test pod %s: %v", testPods[i], err)
+					}
+				}
+
+				// Need to protect deletedPodNames and patchedPodNames to prevent DATA RACE panic.
+				var mu sync.RWMutex
+				deletedPodNames := sets.New[string]()
+				patchedPodNames := sets.New[string]()
+				patchedPods := []*v1.Pod{}
+				client.PrependReactor("patch", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) {
+					patchAction := action.(clienttesting.PatchAction)
+					podName := patchAction.GetName()
+					namespace := patchAction.GetNamespace()
+					patch := patchAction.GetPatch()
+					pod, err := informerFactory.Core().V1().Pods().Lister().Pods(namespace).Get(podName)
+					if err != nil {
+						t.Fatalf("Failed to get the original pod %s/%s before patching: %v\n", namespace, podName, err)
+					}
+					marshalledPod, err := json.Marshal(pod)
+					if err != nil {
+						t.Fatalf("Failed to marshal the original pod %s/%s: %v", namespace, podName, err)
+					}
+					updated, err := strategicpatch.StrategicMergePatch(marshalledPod, patch, v1.Pod{})
+					if err != nil {
+						t.Fatalf("Failed to apply strategic merge patch %q on pod %#v: %v", patch, marshalledPod, err)
+					}
+					updatedPod := &v1.Pod{}
+					if err := json.Unmarshal(updated, updatedPod); err != nil {
+						t.Fatalf("Failed to unmarshal updated pod %q: %v", updated, err)
+					}
+					patchedPods = append(patchedPods, updatedPod)
+					mu.Lock()
+					defer mu.Unlock()
+					patchedPodNames.Insert(podName)
+					return true, nil, nil
+				})
+				client.PrependReactor("delete", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) {
+					mu.Lock()
+					defer mu.Unlock()
+					deletedPodNames.Insert(action.(clienttesting.DeleteAction).GetName())
+					return true, nil, nil
+				})
+
+				logger, ctx := ktesting.NewTestContext(t)
+				ctx, cancel := context.WithCancel(ctx)
+				defer cancel()
+
+				waitingPods := frameworkruntime.NewWaitingPodsMap()
+
+				cache := internalcache.New(ctx, time.Duration(0))
+				for _, pod := range testPods {
+					if err := cache.AddPod(logger, pod.DeepCopy()); err != nil {
+						t.Fatalf("Failed to add pod %s: %v", pod.Name, err)
+					}
+				}
+				cachedNodeInfoMap := map[string]*framework.NodeInfo{}
+				nodes := make([]*v1.Node, len(test.nodeNames))
+				for i, name := range test.nodeNames {
+					node := st.MakeNode().Name(name).Capacity(veryLargeRes).Obj()
+					// Split node name by '/' to form labels in a format of
+					// {"hostname": node.Name[0], "zone": node.Name[1], "region": node.Name[2]}
+					node.ObjectMeta.Labels = make(map[string]string)
+					for i, label := range strings.Split(node.Name, "/") {
+						node.ObjectMeta.Labels[labelKeys[i]] = label
+					}
+					node.Name = node.ObjectMeta.Labels["hostname"]
+					t.Logf("node is added: %v. labels: %#v", node.Name, node.ObjectMeta.Labels)
+					cache.AddNode(logger, node)
+					nodes[i] = node
+
+					// Set nodeInfo to extenders to mock extenders' cache for preemption.
+					cachedNodeInfo := framework.NewNodeInfo()
+					cachedNodeInfo.SetNode(node)
+					cachedNodeInfoMap[node.Name] = cachedNodeInfo
+				}
+				var extenders []framework.Extender
+				for _, extender := range test.extenders {
+					// Set nodeInfoMap as extenders cached node information.
+					extender.CachedNodeNameToInfo = cachedNodeInfoMap
+					extenders = append(extenders, extender)
+				}
+				fwk, err := tf.NewFramework(
+					ctx,
+					[]tf.RegisterPluginFunc{
+						test.registerPlugin,
+						tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New),
+						tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New),
+					},
+					"",
+					frameworkruntime.WithClientSet(client),
+					frameworkruntime.WithEventRecorder(&events.FakeRecorder{}),
+					frameworkruntime.WithExtenders(extenders),
+					frameworkruntime.WithPodNominator(internalqueue.NewSchedulingQueue(nil, informerFactory)),
+					frameworkruntime.WithSnapshotSharedLister(internalcache.NewSnapshot(testPods, nodes)),
+					frameworkruntime.WithInformerFactory(informerFactory),
+					frameworkruntime.WithWaitingPods(waitingPods),
+					frameworkruntime.WithLogger(logger),
+				)
+				if err != nil {
+					t.Fatal(err)
+				}
+
+				state := framework.NewCycleState()
+				// Some tests rely on PreFilter plugin to compute its CycleState.
+				if _, s, _ := fwk.RunPreFilterPlugins(ctx, state, testPod); !s.IsSuccess() {
+					t.Errorf("Unexpected preFilterStatus: %v", s)
+				}
+				// Call preempt and check the expected results.
+				pl := DefaultPreemption{
+					fh: fwk,
+					podLister: informerFactory.Core().V1().Pods().Lister(),
+					pdbLister: getPDBLister(informerFactory),
+					args: *getDefaultDefaultPreemptionArgs(),
+				}
+
+				pe := preemption.NewEvaluator(names.DefaultPreemption, pl.fh, &pl, asyncPreemptionEnabled)
+
+				// so that these nodes are eligible for preemption, we set their status
+				// to Unschedulable.
+
+				nodeToStatusMap := framework.NewDefaultNodeToStatus()
+				for _, n := range nodes {
+					nodeToStatusMap.Set(n.Name, framework.NewStatus(framework.Unschedulable))
+				}
+
+				res, status := pe.Preempt(ctx, state, testPod, nodeToStatusMap)
+				if !status.IsSuccess() && !status.IsRejected() {
+					t.Errorf("unexpected error in preemption: %v", status.AsError())
+				}
+				if diff := cmp.Diff(test.want, res); diff != "" {
+					t.Errorf("Unexpected status (-want, +got):\n%s", diff)
+				}
+
+				if asyncPreemptionEnabled {
+					// Wait for the pod to be deleted.
+					if err := wait.PollUntilContextTimeout(ctx, time.Millisecond*200, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
+						mu.RLock()
+						defer mu.RUnlock()
+						return len(deletedPodNames) == len(test.expectedPods), nil
+					}); err != nil {
+						t.Errorf("expected %v pods to be deleted, got %v.", len(test.expectedPods), len(deletedPodNames))
+					}
+				} else {
+					mu.RLock()
+					// If async preemption is disabled, the pod should be deleted immediately.
+					if len(deletedPodNames) != len(test.expectedPods) {
+						t.Errorf("expected %v pods to be deleted, got %v.", len(test.expectedPods), len(deletedPodNames))
+					}
+					mu.RUnlock()
+				}
+
+				mu.RLock()
+				if diff := cmp.Diff(sets.List(patchedPodNames), sets.List(deletedPodNames)); diff != "" {
+					t.Errorf("unexpected difference in the set of patched and deleted pods: %s", diff)
+				}
+
+				// Make sure that the DisruptionTarget condition has been added to the pod status
+				for _, patchedPod := range patchedPods {
+					expectedPodCondition := &v1.PodCondition{
+						Type: v1.DisruptionTarget,
+						Status: v1.ConditionTrue,
+						Reason: v1.PodReasonPreemptionByScheduler,
+						Message: fmt.Sprintf("%s: preempting to accommodate a higher priority pod", patchedPod.Spec.SchedulerName),
+					}
+
+					_, condition := apipod.GetPodCondition(&patchedPod.Status, v1.DisruptionTarget)
+					if diff := cmp.Diff(condition, expectedPodCondition, cmpopts.IgnoreFields(v1.PodCondition{}, "LastTransitionTime")); diff != "" {
+						t.Fatalf("unexpected difference in the pod %q DisruptionTarget condition: %s", patchedPod.Name, diff)
+					}
+				}
+
+				for victimName := range deletedPodNames {
+					found := false
+					for _, expPod := range test.expectedPods {
+						if expPod == victimName {
+							found = true
+							break
+						}
+					}
+					if !found {
+						t.Errorf("pod %v is not expected to be a victim.", victimName)
+					}
+				}
+				if res != nil && res.NominatingInfo != nil {
+					testPod.Status.NominatedNodeName = res.NominatedNodeName
+				}
+
+				// Manually set the deleted Pods' deletionTimestamp to non-nil.
+				for _, pod := range testPods {
+					if deletedPodNames.Has(pod.Name) {
+						now := metav1.Now()
+						pod.DeletionTimestamp = &now
+						deletedPodNames.Delete(pod.Name)
+					}
+				}
+				mu.RUnlock()
+
+				// Call preempt again and make sure it doesn't preempt any more pods.
+				res, status = pe.Preempt(ctx, state, testPod, framework.NewDefaultNodeToStatus())
+				if !status.IsSuccess() && !status.IsRejected() {
+					t.Errorf("unexpected error in preemption: %v", status.AsError())
+				}
+				if res != nil && res.NominatingInfo != nil && len(deletedPodNames) > 0 {
+					t.Errorf("didn't expect any more preemption. Node %v is selected for preemption.", res.NominatedNodeName)
+				}
+			})
+		}
 	}
 }
@@ -28,4 +28,5 @@ type Features struct {
 	EnableInPlacePodVerticalScaling bool
 	EnableSidecarContainers bool
 	EnableSchedulingQueueHint bool
+	EnableAsyncPreemption bool
 }
@@ -54,6 +54,7 @@ func NewInTreeRegistry() runtime.Registry {
 		EnableInPlacePodVerticalScaling: feature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
 		EnableSidecarContainers: feature.DefaultFeatureGate.Enabled(features.SidecarContainers),
 		EnableSchedulingQueueHint: feature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints),
+		EnableAsyncPreemption: feature.DefaultFeatureGate.Enabled(features.SchedulerAsyncPreemption),
 	}
 
 	registry := runtime.Registry{
@@ -26,7 +26,9 @@ import (
 
 	v1 "k8s.io/api/core/v1"
 	policy "k8s.io/api/policy/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/types"
 	utilerrors "k8s.io/apimachinery/pkg/util/errors"
 	corelisters "k8s.io/client-go/listers/core/v1"
 	policylisters "k8s.io/client-go/listers/policy/v1"
@@ -36,6 +38,7 @@ import (
 	apipod "k8s.io/kubernetes/pkg/api/v1/pod"
 	"k8s.io/kubernetes/pkg/scheduler/framework"
 	"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
+	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
 	"k8s.io/kubernetes/pkg/scheduler/metrics"
 	"k8s.io/kubernetes/pkg/scheduler/util"
 )
@@ -125,10 +128,48 @@ type Evaluator struct {
 	Handler framework.Handle
 	PodLister corelisters.PodLister
 	PdbLister policylisters.PodDisruptionBudgetLister
-	State *framework.CycleState
+	enableAsyncPreemption bool
+	mu sync.RWMutex
+	// preempting is a map that records the pods that are currently triggering preemption asynchronously,
+	// which is used to prevent the pods from entering the scheduling cycle meanwhile.
+	preempting map[types.UID]struct{}
+
+	// PreemptPod is a function that actually makes API calls to preempt a specific Pod.
+	// This is exposed to be replaced during tests.
+	PreemptPod func(ctx context.Context, c Candidate, preemptor, victim *v1.Pod, pluginName string) error
+
 	Interface
 }
 
+func NewEvaluator(pluginName string, fh framework.Handle, i Interface, enableAsyncPreemption bool) *Evaluator {
+	podLister := fh.SharedInformerFactory().Core().V1().Pods().Lister()
+	pdbLister := fh.SharedInformerFactory().Policy().V1().PodDisruptionBudgets().Lister()
+
+	e := &Evaluator{
+		PluginName: names.DefaultPreemption,
+		Handler: fh,
+		PodLister: podLister,
+		PdbLister: pdbLister,
+		Interface: i,
+		enableAsyncPreemption: enableAsyncPreemption,
+		preempting: make(map[types.UID]struct{}),
+	}
+
+	e.PreemptPod = e.preemptPod
+
+	return e
+}
+
+// IsPodRunningPreemption returns true if the pod is currently triggering preemption asynchronously.
+func (ev *Evaluator) IsPodRunningPreemption(podUID types.UID) bool {
+	ev.mu.RLock()
+	defer ev.mu.RUnlock()
+
+	_, ok := ev.preempting[podUID]
+	return ok
+}
+
 // Preempt returns a PostFilterResult carrying suggested nominatedNodeName, along with a Status.
 // The semantics of returned <PostFilterResult, Status> varies on different scenarios:
 //
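Editor's note: two design points in this hunk are worth calling out. The constructor wires PreemptPod to the unexported default implementation so tests can swap in a stub, and the preempting map is what the plugin's PreEnqueue consults through IsPodRunningPreemption. A small, self-contained sketch of the function-field injection pattern with invented toy names (not the scheduler's real Evaluator):

	package main

	import (
		"context"
		"fmt"
	)

	// evaluator is a toy stand-in for the scheduler's preemption Evaluator.
	type evaluator struct {
		// preemptPodFn performs the per-victim API calls. Production code points
		// it at the real implementation; tests may replace it with a stub.
		preemptPodFn func(ctx context.Context, victim string) error
	}

	func newEvaluator() *evaluator {
		e := &evaluator{}
		e.preemptPodFn = e.defaultPreemptPod // default wiring, as in NewEvaluator
		return e
	}

	func (e *evaluator) defaultPreemptPod(ctx context.Context, victim string) error {
		// The real code would patch the DisruptionTarget condition and delete the pod.
		fmt.Println("deleting victim", victim)
		return nil
	}

	func main() {
		e := newEvaluator()
		_ = e.preemptPodFn(context.Background(), "victim-a")

		// A test can observe calls without touching the API server by swapping the hook.
		var recorded []string
		e.preemptPodFn = func(_ context.Context, victim string) error {
			recorded = append(recorded, victim)
			return nil
		}
		_ = e.preemptPodFn(context.Background(), "victim-b")
		fmt.Println("recorded:", recorded)
	}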
@@ -145,7 +186,7 @@ type Evaluator struct {
 //
 // - <non-nil PostFilterResult, Success>. It's the regular happy path
 // and the non-empty nominatedNodeName will be applied to the preemptor pod.
-func (ev *Evaluator) Preempt(ctx context.Context, pod *v1.Pod, m framework.NodeToStatusReader) (*framework.PostFilterResult, *framework.Status) {
+func (ev *Evaluator) Preempt(ctx context.Context, state *framework.CycleState, pod *v1.Pod, m framework.NodeToStatusReader) (*framework.PostFilterResult, *framework.Status) {
 	logger := klog.FromContext(ctx)
 
 	// 0) Fetch the latest version of <pod>.
@@ -171,7 +212,7 @@ func (ev *Evaluator) Preempt(ctx context.Context, pod *v1.Pod, m framework.NodeT
 	if err != nil {
 		return nil, framework.AsStatus(err)
 	}
-	candidates, nodeToStatusMap, err := ev.findCandidates(ctx, allNodes, pod, m)
+	candidates, nodeToStatusMap, err := ev.findCandidates(ctx, state, allNodes, pod, m)
 	if err != nil && len(candidates) == 0 {
 		return nil, framework.AsStatus(err)
 	}
@@ -203,9 +244,17 @@ func (ev *Evaluator) Preempt(ctx context.Context, pod *v1.Pod, m framework.NodeT
 		return nil, framework.NewStatus(framework.Unschedulable, "no candidate node for preemption")
 	}
 
+	logger.V(2).Info("the target node for the preemption is determined", "node", bestCandidate.Name(), "pod", klog.KObj(pod))
+
 	// 5) Perform preparation work before nominating the selected candidate.
-	if status := ev.prepareCandidate(ctx, bestCandidate, pod, ev.PluginName); !status.IsSuccess() {
-		return nil, status
+	if ev.enableAsyncPreemption {
+		if status := ev.prepareCandidateAsync(bestCandidate, pod, ev.PluginName); !status.IsSuccess() {
+			return nil, status
+		}
+	} else {
+		if status := ev.prepareCandidate(ctx, bestCandidate, pod, ev.PluginName); !status.IsSuccess() {
+			return nil, status
+		}
 	}
 
 	return framework.NewPostFilterResultWithNominatedNode(bestCandidate.Name()), framework.NewStatus(framework.Success)
@@ -213,7 +262,7 @@ func (ev *Evaluator) Preempt(ctx context.Context, pod *v1.Pod, m framework.NodeT
 
 // FindCandidates calculates a slice of preemption candidates.
 // Each candidate is executable to make the given <pod> schedulable.
-func (ev *Evaluator) findCandidates(ctx context.Context, allNodes []*framework.NodeInfo, pod *v1.Pod, m framework.NodeToStatusReader) ([]Candidate, *framework.NodeToStatus, error) {
+func (ev *Evaluator) findCandidates(ctx context.Context, state *framework.CycleState, allNodes []*framework.NodeInfo, pod *v1.Pod, m framework.NodeToStatusReader) ([]Candidate, *framework.NodeToStatus, error) {
 	if len(allNodes) == 0 {
 		return nil, nil, errors.New("no nodes available")
 	}
@@ -239,7 +288,7 @@ func (ev *Evaluator) findCandidates(ctx context.Context, allNodes []*framework.N
 	}
 
 	offset, candidatesNum := ev.GetOffsetAndNumCandidates(int32(len(potentialNodes)))
-	return ev.DryRunPreemption(ctx, pod, potentialNodes, pdbs, offset, candidatesNum)
+	return ev.DryRunPreemption(ctx, state, pod, potentialNodes, pdbs, offset, candidatesNum)
 }
 
 // callExtenders calls given <extenders> to select the list of feasible candidates.
@@ -335,6 +384,46 @@ func (ev *Evaluator) SelectCandidate(ctx context.Context, candidates []Candidate
 	return candidates[0]
 }
 
+// preemptPod actually makes API calls to preempt a specific Pod.
+func (ev *Evaluator) preemptPod(ctx context.Context, c Candidate, preemptor, victim *v1.Pod, pluginName string) error {
+	logger := klog.FromContext(ctx)
+
+	// If the victim is a WaitingPod, send a reject message to the PermitPlugin.
+	// Otherwise we should delete the victim.
+	if waitingPod := ev.Handler.GetWaitingPod(victim.UID); waitingPod != nil {
+		waitingPod.Reject(pluginName, "preempted")
+		logger.V(2).Info("Preemptor pod rejected a waiting pod", "preemptor", klog.KObj(preemptor), "waitingPod", klog.KObj(victim), "node", c.Name())
+	} else {
+		condition := &v1.PodCondition{
+			Type: v1.DisruptionTarget,
+			Status: v1.ConditionTrue,
+			Reason: v1.PodReasonPreemptionByScheduler,
+			Message: fmt.Sprintf("%s: preempting to accommodate a higher priority pod", preemptor.Spec.SchedulerName),
+		}
+		newStatus := victim.Status.DeepCopy()
+		updated := apipod.UpdatePodCondition(newStatus, condition)
+		if updated {
+			if err := util.PatchPodStatus(ctx, ev.Handler.ClientSet(), victim, newStatus); err != nil {
+				logger.Error(err, "Could not add DisruptionTarget condition due to preemption", "pod", klog.KObj(victim), "preemptor", klog.KObj(preemptor))
+				return err
+			}
+		}
+		if err := util.DeletePod(ctx, ev.Handler.ClientSet(), victim); err != nil {
+			if !apierrors.IsNotFound(err) {
+				logger.Error(err, "Preempted pod", "pod", klog.KObj(victim), "preemptor", klog.KObj(preemptor))
+				return err
+			}
+			logger.V(2).Info("Victim Pod is already deleted", "preemptor", klog.KObj(preemptor), "victim", klog.KObj(victim), "node", c.Name())
+			return nil
+		}
+		logger.V(2).Info("Preemptor Pod preempted victim Pod", "preemptor", klog.KObj(preemptor), "victim", klog.KObj(victim), "node", c.Name())
+	}
+
+	ev.Handler.EventRecorder().Eventf(victim, preemptor, v1.EventTypeNormal, "Preempted", "Preempting", "Preempted by pod %v on node %v", preemptor.UID, c.Name())
+
+	return nil
+}
+
 // prepareCandidate does some preparation work before nominating the selected candidate:
 // - Evict the victim pods
 // - Reject the victim pods if they are in waitingPod map
@@ -347,41 +436,11 @@ func (ev *Evaluator) prepareCandidate(ctx context.Context, c Candidate, pod *v1.
 	defer cancel()
 	logger := klog.FromContext(ctx)
 	errCh := parallelize.NewErrorChannel()
-	preemptPod := func(index int) {
-		victim := c.Victims().Pods[index]
-		// If the victim is a WaitingPod, send a reject message to the PermitPlugin.
-		// Otherwise we should delete the victim.
-		if waitingPod := fh.GetWaitingPod(victim.UID); waitingPod != nil {
-			waitingPod.Reject(pluginName, "preempted")
-			logger.V(2).Info("Preemptor pod rejected a waiting pod", "preemptor", klog.KObj(pod), "waitingPod", klog.KObj(victim), "node", c.Name())
-		} else {
-			condition := &v1.PodCondition{
-				Type: v1.DisruptionTarget,
-				Status: v1.ConditionTrue,
-				Reason: v1.PodReasonPreemptionByScheduler,
-				Message: fmt.Sprintf("%s: preempting to accommodate a higher priority pod", pod.Spec.SchedulerName),
-			}
-			newStatus := victim.Status.DeepCopy()
-			updated := apipod.UpdatePodCondition(newStatus, condition)
-			if updated {
-				if err := util.PatchPodStatus(ctx, cs, victim, newStatus); err != nil {
-					logger.Error(err, "Could not add DisruptionTarget condition due to preemption", "pod", klog.KObj(victim), "preemptor", klog.KObj(pod))
-					errCh.SendErrorWithCancel(err, cancel)
-					return
-				}
-			}
-			if err := util.DeletePod(ctx, cs, victim); err != nil {
-				logger.Error(err, "Preempted pod", "pod", klog.KObj(victim), "preemptor", klog.KObj(pod))
-				errCh.SendErrorWithCancel(err, cancel)
-				return
-			}
-			logger.V(2).Info("Preemptor Pod preempted victim Pod", "preemptor", klog.KObj(pod), "victim", klog.KObj(victim), "node", c.Name())
-		}
-		fh.EventRecorder().Eventf(victim, pod, v1.EventTypeNormal, "Preempted", "Preempting", "Preempted by pod %v on node %v", pod.UID, c.Name())
-	}
-
-	fh.Parallelizer().Until(ctx, len(c.Victims().Pods), preemptPod, ev.PluginName)
+	fh.Parallelizer().Until(ctx, len(c.Victims().Pods), func(index int) {
+		if err := ev.PreemptPod(ctx, c, pod, c.Victims().Pods[index], pluginName); err != nil {
+			errCh.SendErrorWithCancel(err, cancel)
+		}
+	}, ev.PluginName)
 	if err := errCh.ReceiveError(); err != nil {
 		return framework.AsStatus(err)
 	}
@@ -401,6 +460,72 @@ func (ev *Evaluator) prepareCandidate(ctx context.Context, c Candidate, pod *v1.
 	return nil
 }
 
+// prepareCandidateAsync triggers a goroutine for some preparation work:
+// - Evict the victim pods
+// - Reject the victim pods if they are in waitingPod map
+// - Clear the low-priority pods' nominatedNodeName status if needed
+// The Pod won't be retried until the goroutine triggered here completes.
+//
+// See http://kep.k8s.io/4832 for how the async preemption works.
+func (ev *Evaluator) prepareCandidateAsync(c Candidate, pod *v1.Pod, pluginName string) *framework.Status {
+	metrics.PreemptionVictims.Observe(float64(len(c.Victims().Pods)))
+
+	// intentionally create a new context, not using a ctx from the scheduling cycle, to create ctx,
+	// because this process could continue even after this scheduling cycle finishes.
+	ctx, cancel := context.WithCancel(context.Background())
+	errCh := parallelize.NewErrorChannel()
+	preemptPod := func(index int) {
+		victim := c.Victims().Pods[index]
+		if err := ev.PreemptPod(ctx, c, pod, victim, pluginName); err != nil {
+			errCh.SendErrorWithCancel(err, cancel)
+		}
+	}
+
+	ev.mu.Lock()
+	ev.preempting[pod.UID] = struct{}{}
+	ev.mu.Unlock()
+
+	logger := klog.FromContext(ctx)
+	go func() { // TODO: use paralizer
+		defer cancel()
+		logger.V(2).Info("Start the preemption asynchronously", "preemptor", klog.KObj(pod), "node", c.Name(), "numVictims", len(c.Victims().Pods))
+
+		// Lower priority pods nominated to run on this node, may no longer fit on
+		// this node. So, we should remove their nomination. Removing their
+		// nomination updates these pods and moves them to the active queue. It
+		// lets scheduler find another place for them.
+		nominatedPods := getLowerPriorityNominatedPods(logger, ev.Handler, pod, c.Name())
+		if err := util.ClearNominatedNodeName(ctx, ev.Handler.ClientSet(), nominatedPods...); err != nil {
+			logger.Error(err, "Cannot clear 'NominatedNodeName' field")
+			// We do not return as this error is not critical.
+		}
+
+		// We can evict all victims in parallel, but the last one.
+		// We have to remove the pod from the preempting map before the last one is evicted
+		// because, otherwise, the pod removal might be notified to the scheduling queue before
+		// we remove this pod from the preempting map,
+		// and the pod could end up stucking at the unschedulable pod pool
+		// by all the pod removal events being ignored.
+
+		ev.Handler.Parallelizer().Until(ctx, len(c.Victims().Pods)-1, preemptPod, ev.PluginName)
+		if err := errCh.ReceiveError(); err != nil {
+			logger.Error(err, "Error occurred during preemption")
+		}
+
+		ev.mu.Lock()
+		delete(ev.preempting, pod.UID)
+		ev.mu.Unlock()
+
+		if err := ev.PreemptPod(ctx, c, pod, c.Victims().Pods[len(c.Victims().Pods)-1], pluginName); err != nil {
+			logger.Error(err, "Error occurred during preemption")
+		}
+
+		logger.V(2).Info("Async Preemption finished completely", "preemptor", klog.KObj(pod), "node", c.Name())
+	}()
+
+	return nil
+}
+
 func getPodDisruptionBudgets(pdbLister policylisters.PodDisruptionBudgetLister) ([]*policy.PodDisruptionBudget, error) {
 	if pdbLister != nil {
 		return pdbLister.List(labels.Everything())
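Editor's note: the ordering inside the goroutine above is deliberate. Every victim except the last is evicted in parallel, the preemptor is then removed from the preempting map, and only after that is the final victim deleted, so the delete event that requeues the preemptor cannot be ignored while the pod is still marked as preempting. A self-contained toy illustration of that ordering using plain goroutines and invented names (not the scheduler's Parallelizer):

	package main

	import (
		"fmt"
		"sync"
	)

	func main() {
		victims := []string{"victim-a", "victim-b", "victim-c"}
		preempting := map[string]struct{}{"preemptor": {}}
		var mu sync.Mutex

		evict := func(name string) { fmt.Println("evicted", name) }

		// 1) Evict all victims but the last one in parallel.
		var wg sync.WaitGroup
		for _, v := range victims[:len(victims)-1] {
			wg.Add(1)
			go func(name string) {
				defer wg.Done()
				evict(name)
			}(v)
		}
		wg.Wait()

		// 2) Unregister the preemptor before the final eviction, so the event
		//    generated by that eviction is not filtered out by the PreEnqueue gate.
		mu.Lock()
		delete(preempting, "preemptor")
		mu.Unlock()

		// 3) Evict the last victim; its delete event can now requeue the preemptor.
		evict(victims[len(victims)-1])
	}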
@@ -538,7 +663,7 @@ func getLowerPriorityNominatedPods(logger klog.Logger, pn framework.PodNominator
 // The number of candidates depends on the constraints defined in the plugin's args. In the returned list of
 // candidates, ones that do not violate PDB are preferred over ones that do.
 // NOTE: This method is exported for easier testing in default preemption.
-func (ev *Evaluator) DryRunPreemption(ctx context.Context, pod *v1.Pod, potentialNodes []*framework.NodeInfo,
+func (ev *Evaluator) DryRunPreemption(ctx context.Context, state *framework.CycleState, pod *v1.Pod, potentialNodes []*framework.NodeInfo,
 	pdbs []*policy.PodDisruptionBudget, offset int32, candidatesNum int32) ([]Candidate, *framework.NodeToStatus, error) {
 
 	fh := ev.Handler
@@ -557,7 +682,7 @@ func (ev *Evaluator) DryRunPreemption(ctx context.Context, pod *v1.Pod, potentia
 		nodeInfoCopy := potentialNodes[(int(offset)+i)%len(potentialNodes)].Snapshot()
 		logger.V(5).Info("Check the potential node for preemption", "node", nodeInfoCopy.Node().Name)
 
-		stateCopy := ev.State.Clone()
+		stateCopy := state.Clone()
 		pods, numPDBViolations, status := ev.SelectVictimsOnNode(ctx, stateCopy, pod, nodeInfoCopy, pdbs)
 		if status.IsSuccess() && len(pods) != 0 {
 			victims := extenderv1.Victims{
@@ -243,9 +243,8 @@ func TestDryRunPreemption(t *testing.T) {
 				PluginName: "FakePostFilter",
 				Handler: fwk,
 				Interface: fakePostPlugin,
-				State: state,
 			}
-			got, _, _ := pe.DryRunPreemption(ctx, pod, nodeInfos, nil, 0, int32(len(nodeInfos)))
+			got, _, _ := pe.DryRunPreemption(ctx, state, pod, nodeInfos, nil, 0, int32(len(nodeInfos)))
 			// Sort the values (inner victims) and the candidate itself (by its NominatedNodeName).
 			for i := range got {
 				victims := got[i].Victims().Pods
@@ -344,9 +343,8 @@ func TestSelectCandidate(t *testing.T) {
 				PluginName: "FakePreemptionScorePostFilter",
 				Handler: fwk,
 				Interface: fakePreemptionScorePostFilterPlugin,
-				State: state,
 			}
-			candidates, _, _ := pe.DryRunPreemption(ctx, pod, nodeInfos, nil, 0, int32(len(nodeInfos)))
+			candidates, _, _ := pe.DryRunPreemption(ctx, state, pod, nodeInfos, nil, 0, int32(len(nodeInfos)))
 			s := pe.SelectCandidate(ctx, candidates)
 			if s == nil || len(s.Name()) == 0 {
 				t.Errorf("expect any node in %v, but no candidate selected", tt.expected)
@@ -310,6 +310,12 @@ func (p *PodWrapper) Name(s string) *PodWrapper {
 	return p
 }
 
+// Name sets `s` as the name of the inner pod.
+func (p *PodWrapper) GenerateName(s string) *PodWrapper {
+	p.SetGenerateName(s)
+	return p
+}
+
 // UID sets `s` as the UID of the inner pod.
 func (p *PodWrapper) UID(s string) *PodWrapper {
 	p.SetUID(types.UID(s))
@@ -1060,6 +1060,12 @@
     lockToDefault: false
     preRelease: Alpha
     version: "1.29"
+- name: SchedulerAsyncPreemption
+  versionedSpecs:
+  - default: false
+    lockToDefault: false
+    preRelease: Alpha
+    version: "1.32"
 - name: SchedulerQueueingHints
   versionedSpecs:
   - default: false
(File diff suppressed because it is too large.)
@@ -358,6 +358,16 @@
       SchedulerQueueingHints: false
     labels: [performance]
     threshold: 200
+    featureGates:
+      SchedulerAsyncPreemption: false
+    params:
+      initNodes: 5000
+      initPods: 20000
+      measurePods: 5000
+  - name: 5000Nodes_AsyncPreemption
+    labels: [performance]
+    featureGates:
+      SchedulerAsyncPreemption: true
     params:
       initNodes: 5000
       initPods: 20000
@@ -1,7 +1,7 @@
 apiVersion: v1
 kind: Pod
 metadata:
-  generateName: pod-
+  generateName: pod-high-priority-
 spec:
   priority: 10
   containers: