scheduler: start scheduling attempt with clean UnschedulablePlugins

When some plugin was registered as "unschedulable" in some previous scheduling
attempt, it kept that attribute for a pod forever. When that plugin then later
failed with an error that requires backoff, the pod was incorrectly moved to the
"unschedulable" queue where it got stuck until the periodic flushing because
there was no event that the plugin was waiting for.

Here's an example where that happened:

     framework.go:1280: E0831 20:03:47.184243] Reserve/DynamicResources: Plugin failed err="Operation cannot be fulfilled on podschedulingcontexts.resource.k8s.io \"test-dragxd5c\": the object has been modified; please apply your changes to the latest version and try again" node="scheduler-perf-dra-7l2v2" plugin="DynamicResources" pod="test/test-dragxd5c"
    schedule_one.go:1001: E0831 20:03:47.184345] Error scheduling pod; retrying err="running Reserve plugin \"DynamicResources\": Operation cannot be fulfilled on podschedulingcontexts.resource.k8s.io \"test-dragxd5c\": the object has been modified; please apply your changes to the latest version and try again" pod="test/test-dragxd5c"
    ...
    scheduling_queue.go:745: I0831 20:03:47.198968] Pod moved to an internal scheduling queue pod="test/test-dragxd5c" event="ScheduleAttemptFailure" queue="Unschedulable" schedulingCycle=9576 hint="QueueSkip"

Pop still needs the information about unschedulable plugins to update the
UnschedulableReason metric. It can reset that information before returning the
PodInfo for the next scheduling attempt.
This commit is contained in:
Patrick Ohly 2023-09-01 07:55:31 +02:00
parent 04292dd94b
commit 4e73634b53
2 changed files with 96 additions and 11 deletions

View File

@ -840,9 +840,11 @@ func (p *PriorityQueue) Pop() (*framework.QueuedPodInfo, error) {
p.inFlightPods[pInfo.Pod.UID] = p.inFlightEvents.PushBack(pInfo.Pod)
}
// Update metrics and reset the set of unschedulable plugins for the next attempt.
for plugin := range pInfo.UnschedulablePlugins {
metrics.UnschedulableReason(plugin, pInfo.Pod.Spec.SchedulerName).Dec()
}
pInfo.UnschedulablePlugins.Clear()
return pInfo, nil
}

View File

@ -201,17 +201,6 @@ func Test_InFlightPods(t *testing.T) {
callback func(t *testing.T, q *PriorityQueue)
}
popPod := func(t *testing.T, q *PriorityQueue, pod *v1.Pod) *framework.QueuedPodInfo {
p, err := q.Pop()
if err != nil {
t.Fatalf("Pop failed: %v", err)
}
if p.Pod.UID != pod.UID {
t.Errorf("Unexpected popped pod: %v", p)
}
return p
}
tests := []struct {
name string
queueingHintMap QueueingHintMapPerProfile
@ -542,6 +531,47 @@ func Test_InFlightPods(t *testing.T) {
},
},
},
{
name: "popped pod must have empty UnschedulablePlugins",
isSchedulingQueueHintEnabled: true,
initialPods: []*v1.Pod{pod},
actions: []action{
{callback: func(t *testing.T, q *PriorityQueue) { poppedPod = popPod(t, q, pod) }},
{callback: func(t *testing.T, q *PriorityQueue) {
logger, _ := ktesting.NewTestContext(t)
// Unschedulable.
poppedPod.UnschedulablePlugins = sets.New("fooPlugin1")
if err := q.AddUnschedulableIfNotPresent(logger, poppedPod, q.SchedulingCycle()); err != nil {
t.Errorf("Unexpected error from AddUnschedulableIfNotPresent: %v", err)
}
}},
{eventHappens: &PvAdd}, // Active again.
{callback: func(t *testing.T, q *PriorityQueue) {
poppedPod = popPod(t, q, pod)
if len(poppedPod.UnschedulablePlugins) > 0 {
t.Errorf("QueuedPodInfo from Pop should have empty UnschedulablePlugins, got instead: %+v", poppedPod)
}
}},
{callback: func(t *testing.T, q *PriorityQueue) {
logger, _ := ktesting.NewTestContext(t)
// Failed (i.e. no UnschedulablePlugins). Should go to backoff.
if err := q.AddUnschedulableIfNotPresent(logger, poppedPod, q.SchedulingCycle()); err != nil {
t.Errorf("Unexpected error from AddUnschedulableIfNotPresent: %v", err)
}
}},
},
queueingHintMap: QueueingHintMapPerProfile{
"": {
PvAdd: {
{
// The hint fn tells that this event makes a Pod scheudlable immediately.
PluginName: "fooPlugin1",
QueueingHintFn: queueHintReturnQueueImmediately,
},
},
},
},
},
}
for _, test := range tests {
@ -642,6 +672,59 @@ func Test_InFlightPods(t *testing.T) {
}
}
func popPod(t *testing.T, q *PriorityQueue, pod *v1.Pod) *framework.QueuedPodInfo {
p, err := q.Pop()
if err != nil {
t.Fatalf("Pop failed: %v", err)
}
if p.Pod.UID != pod.UID {
t.Errorf("Unexpected popped pod: %v", p)
}
return p
}
func TestPop(t *testing.T) {
pod := st.MakePod().Name("targetpod").UID("pod1").Obj()
queueingHintMap := QueueingHintMapPerProfile{
"": {
PvAdd: {
{
// The hint fn tells that this event makes a Pod scheudlable immediately.
PluginName: "fooPlugin1",
QueueingHintFn: queueHintReturnQueueImmediately,
},
},
},
}
for name, isSchedulingQueueHintEnabled := range map[string]bool{"with-hints": true, "without-hints": false} {
t.Run(name, func(t *testing.T) {
defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerQueueingHints, isSchedulingQueueHintEnabled)()
logger, ctx := ktesting.NewTestContext(t)
ctx, cancel := context.WithCancel(ctx)
defer cancel()
q := NewTestQueueWithObjects(ctx, newDefaultQueueSort(), []runtime.Object{pod}, WithQueueingHintMapPerProfile(queueingHintMap))
q.Add(logger, pod)
// Simulate failed attempt that makes the pod unschedulable.
poppedPod := popPod(t, q, pod)
poppedPod.UnschedulablePlugins = sets.New("fooPlugin1")
if err := q.AddUnschedulableIfNotPresent(logger, poppedPod, q.SchedulingCycle()); err != nil {
t.Errorf("Unexpected error from AddUnschedulableIfNotPresent: %v", err)
}
// Activate it again.
q.MoveAllToActiveOrBackoffQueue(logger, PvAdd, nil, nil, nil)
// Now check result of Pop.
poppedPod = popPod(t, q, pod)
if len(poppedPod.UnschedulablePlugins) > 0 {
t.Errorf("QueuedPodInfo from Pop should have empty UnschedulablePlugins, got instead: %+v", poppedPod)
}
})
}
}
func TestPriorityQueue_AddUnschedulableIfNotPresent(t *testing.T) {
objs := []runtime.Object{highPriNominatedPodInfo.Pod, unschedulablePodInfo.Pod}
logger, ctx := ktesting.NewTestContext(t)