Enhanced logic to identify eligible preemption node

This commit is contained in:
Wei Huang 2022-12-20 16:31:21 -08:00
parent 7fd0ff5761
commit 91742e2393
No known key found for this signature in database
GPG Key ID: 17AFE05D01EA77B2
6 changed files with 65 additions and 3 deletions

View File

@ -49,6 +49,7 @@ const Name = names.DefaultPreemption
// DefaultPreemption is a PostFilter plugin implements the preemption logic.
type DefaultPreemption struct {
fh framework.Handle
fts feature.Features
args config.DefaultPreemptionArgs
podLister corelisters.PodLister
pdbLister policylisters.PodDisruptionBudgetLister
@ -72,6 +73,7 @@ func New(dpArgs runtime.Object, fh framework.Handle, fts feature.Features) (fram
}
pl := DefaultPreemption{
fh: fh,
fts: fts,
args: *args,
podLister: fh.SharedInformerFactory().Core().V1().Pods().Lister(),
pdbLister: getPDBLister(fh.SharedInformerFactory()),
@ -250,7 +252,7 @@ func (pl *DefaultPreemption) PodEligibleToPreemptOthers(pod *v1.Pod, nominatedNo
if nodeInfo, _ := nodeInfos.Get(nomNodeName); nodeInfo != nil {
podPriority := corev1helpers.PodPriority(pod)
for _, p := range nodeInfo.Pods {
if p.Pod.DeletionTimestamp != nil && corev1helpers.PodPriority(p.Pod) < podPriority {
if corev1helpers.PodPriority(p.Pod) < podPriority && podTerminatingByPreemption(p.Pod, pl.fts.EnablePodDisruptionConditions) {
// There is a terminating pod on the nominated node.
return false, "not eligible due to a terminating pod on the nominated node."
}
@ -260,6 +262,25 @@ func (pl *DefaultPreemption) PodEligibleToPreemptOthers(pod *v1.Pod, nominatedNo
return true, ""
}
// podTerminatingByPreemption returns the pod's terminating state if feature PodDisruptionConditions is not enabled.
// Otherwise, it additionally checks if the termination state is caused by scheduler preemption.
func podTerminatingByPreemption(p *v1.Pod, enablePodDisruptionConditions bool) bool {
if p.DeletionTimestamp == nil {
return false
}
if !enablePodDisruptionConditions {
return true
}
for _, condition := range p.Status.Conditions {
if condition.Type == v1.DisruptionTarget {
return condition.Status == v1.ConditionTrue && condition.Reason == v1.PodReasonPreemptionByKubeScheduler
}
}
return false
}
// filterPodsWithPDBViolation groups the given "pods" into two groups of "violatingPods"
// and "nonViolatingPods" based on whether their PDBs will be violated if they are
// preempted.

View File

@ -1401,6 +1401,7 @@ func TestSelectBestCandidate(t *testing.T) {
func TestPodEligibleToPreemptOthers(t *testing.T) {
tests := []struct {
name string
fts feature.Features
pod *v1.Pod
pods []*v1.Pod
nodes []string
@ -1439,6 +1440,40 @@ func TestPodEligibleToPreemptOthers(t *testing.T) {
nominatedNodeStatus: nil,
expected: false,
},
{
name: "victim Pods terminating, feature PodDisruptionConditions is enabled",
fts: feature.Features{EnablePodDisruptionConditions: true},
pod: st.MakePod().Name("p_with_nominated_node").UID("p").Priority(highPriority).NominatedNodeName("node1").Obj(),
pods: []*v1.Pod{st.MakePod().Name("p1").UID("p1").Priority(lowPriority).Node("node1").Terminating().
Condition(v1.DisruptionTarget, v1.ConditionTrue, v1.PodReasonPreemptionByKubeScheduler).Obj()},
nodes: []string{"node1"},
expected: false,
},
{
name: "non-victim Pods terminating, feature PodDisruptionConditions is enabled",
fts: feature.Features{EnablePodDisruptionConditions: true},
pod: st.MakePod().Name("p_with_nominated_node").UID("p").Priority(highPriority).NominatedNodeName("node1").Obj(),
pods: []*v1.Pod{st.MakePod().Name("p1").UID("p1").Priority(lowPriority).Node("node1").Terminating().Obj()},
nodes: []string{"node1"},
expected: true,
},
{
name: "victim Pods terminating, feature PodDisruptionConditions is disabled",
fts: feature.Features{EnablePodDisruptionConditions: false},
pod: st.MakePod().Name("p_with_nominated_node").UID("p").Priority(highPriority).NominatedNodeName("node1").Obj(),
pods: []*v1.Pod{st.MakePod().Name("p1").UID("p1").Priority(lowPriority).Node("node1").Terminating().
Condition(v1.DisruptionTarget, v1.ConditionTrue, v1.PodReasonPreemptionByKubeScheduler).Obj()},
nodes: []string{"node1"},
expected: false,
},
{
name: "non-victim Pods terminating, feature PodDisruptionConditions is disabled",
fts: feature.Features{EnablePodDisruptionConditions: false},
pod: st.MakePod().Name("p_with_nominated_node").UID("p").Priority(highPriority).NominatedNodeName("node1").Obj(),
pods: []*v1.Pod{st.MakePod().Name("p1").UID("p1").Priority(lowPriority).Node("node1").Terminating().Obj()},
nodes: []string{"node1"},
expected: false,
},
}
for _, test := range tests {
@ -1459,7 +1494,7 @@ func TestPodEligibleToPreemptOthers(t *testing.T) {
if err != nil {
t.Fatal(err)
}
pl := DefaultPreemption{fh: f}
pl := DefaultPreemption{fh: f, fts: test.fts}
if got, _ := pl.PodEligibleToPreemptOthers(test.pod, test.nominatedNodeStatus); got != test.expected {
t.Errorf("expected %t, got %t for pod: %s", test.expected, got, test.pod.Name)
}

View File

@ -27,4 +27,5 @@ type Features struct {
EnableNodeInclusionPolicyInPodTopologySpread bool
EnableMatchLabelKeysInPodTopologySpread bool
EnablePodSchedulingReadiness bool
EnablePodDisruptionConditions bool
}

View File

@ -54,6 +54,7 @@ func NewInTreeRegistry() runtime.Registry {
EnableNodeInclusionPolicyInPodTopologySpread: feature.DefaultFeatureGate.Enabled(features.NodeInclusionPolicyInPodTopologySpread),
EnableMatchLabelKeysInPodTopologySpread: feature.DefaultFeatureGate.Enabled(features.MatchLabelKeysInPodTopologySpread),
EnablePodSchedulingReadiness: feature.DefaultFeatureGate.Enabled(features.PodSchedulingReadiness),
EnablePodDisruptionConditions: feature.DefaultFeatureGate.Enabled(features.PodDisruptionConditions),
}
registry := runtime.Registry{

View File

@ -361,7 +361,7 @@ func (ev *Evaluator) prepareCandidate(ctx context.Context, c Candidate, pod *v1.
victimPodApply.Status.WithConditions(corev1apply.PodCondition().
WithType(v1.DisruptionTarget).
WithStatus(v1.ConditionTrue).
WithReason("PreemptionByKubeScheduler").
WithReason(v1.PodReasonPreemptionByKubeScheduler).
WithMessage(fmt.Sprintf("Kube-scheduler: preempting to accommodate a higher priority pod: %s", klog.KObj(pod))).
WithLastTransitionTime(metav1.Now()),
)

View File

@ -2721,6 +2721,10 @@ const (
// TerminationByKubelet reason in DisruptionTarget pod condition indicates that the termination
// is initiated by kubelet
PodReasonTerminationByKubelet = "TerminationByKubelet"
// PodReasonPreemptionByKubeScheduler reason in DisruptionTarget pod condition indicates that the
// disruption was initiated by scheduler's preemption.
PodReasonPreemptionByKubeScheduler = "PreemptionByKubeScheduler"
)
// PodCondition contains details for the current condition of this pod.