bug(nodeunschedulable): register missing Pod event for NodeUnschedulable plugin

This commit is contained in:
Kensei Nakada 2024-09-18 12:53:42 +09:00
parent 048c8536d6
commit 495981974e
3 changed files with 111 additions and 4 deletions

View File

@ -51,12 +51,56 @@ const (
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (pl *NodeUnschedulable) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
// preCheck is not used when QHint is enabled.
if !pl.enableSchedulingQueueHint {
return []framework.ClusterEventWithHint{
// A note about UpdateNodeLabel event:
// Ideally, it's supposed to register only Add | UpdateNodeTaint because UpdateNodeLabel will never change the result from this plugin.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeTaint | framework.UpdateNodeLabel}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
}, nil
}
return []framework.ClusterEventWithHint{
// When QueueingHint is enabled, we don't use preCheck and we don't need to register UpdateNodeLabel event.
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeTaint}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
// When the QueueingHint feature is enabled,
// the scheduling queue uses Pod/Update Queueing Hint
// to determine whether a Pod's update makes the Pod schedulable or not.
// https://github.com/kubernetes/kubernetes/pull/122234
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodTolerations}, QueueingHintFn: pl.isSchedulableAfterPodTolerationChange},
}, nil
}
// isSchedulableAfterPodTolerationChange is invoked whenever a pod's toleration changed.
func (pl *NodeUnschedulable) isSchedulableAfterPodTolerationChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, modifiedPod, err := util.As[*v1.Pod](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
if pod.UID == modifiedPod.UID {
// Note: we don't need to check oldPod tolerations the taint because:
// - Taint can be added, but can't be modified nor removed.
// - If the Pod already has the toleration, it shouldn't have rejected by this plugin in the first place.
// Meaning, here this Pod has been rejected by this plugin, and hence it shouldn't have the toleration yet.
if v1helper.TolerationsTolerateTaint(modifiedPod.Spec.Tolerations, &v1.Taint{
Key: v1.TaintNodeUnschedulable,
Effect: v1.TaintEffectNoSchedule,
}) {
// This update makes the pod tolerate the unschedulable taint.
logger.V(5).Info("a new toleration is added for the unschedulable Pod, and it may make it schedulable", "pod", klog.KObj(modifiedPod))
return framework.Queue, nil
}
logger.V(5).Info("a new toleration is added for the unschedulable Pod, but it's an unrelated toleration", "pod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
logger.V(5).Info("a new toleration is added for a Pod, but it's an unrelated Pod and wouldn't change the TaintToleration plugin's decision", "pod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
// isSchedulableAfterNodeChange is invoked for all node events reported by
// an informer. It checks whether that change made a previously unschedulable
// pod schedulable.

View File

@ -23,6 +23,7 @@ import (
v1 "k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
st "k8s.io/kubernetes/pkg/scheduler/testing"
"k8s.io/kubernetes/test/utils/ktesting"
)
@ -96,14 +97,14 @@ func TestIsSchedulableAfterNodeChange(t *testing.T) {
expectedErr bool
}{
{
name: "backoff-wrong-new-object",
name: "error-wrong-new-object",
pod: &v1.Pod{},
newObj: "not-a-node",
expectedHint: framework.Queue,
expectedErr: true,
},
{
name: "backoff-wrong-old-object",
name: "error-wrong-old-object",
pod: &v1.Pod{},
newObj: &v1.Node{
Spec: v1.NodeSpec{
@ -186,3 +187,64 @@ func TestIsSchedulableAfterNodeChange(t *testing.T) {
})
}
}
func TestIsSchedulableAfterPodTolerationChange(t *testing.T) {
testCases := []struct {
name string
pod *v1.Pod
oldObj, newObj interface{}
expectedHint framework.QueueingHint
expectedErr bool
}{
{
name: "error-wrong-new-object",
pod: &v1.Pod{},
newObj: "not-a-pod",
expectedHint: framework.Queue,
expectedErr: true,
},
{
name: "error-wrong-old-object",
pod: &v1.Pod{},
newObj: &v1.Pod{},
oldObj: "not-a-pod",
expectedHint: framework.Queue,
expectedErr: true,
},
{
name: "skip-different-pod-update",
pod: st.MakePod().UID("uid").Obj(),
newObj: st.MakePod().UID("different-uid").Toleration(v1.TaintNodeUnschedulable).Obj(),
oldObj: st.MakePod().UID("different-uid").Obj(),
expectedHint: framework.QueueSkip,
},
{
name: "queue-when-the-unsched-pod-gets-toleration",
pod: st.MakePod().UID("uid").Obj(),
newObj: st.MakePod().UID("uid").Toleration(v1.TaintNodeUnschedulable).Obj(),
oldObj: st.MakePod().UID("uid").Obj(),
expectedHint: framework.Queue,
},
{
name: "skip-when-the-unsched-pod-gets-unrelated-toleration",
pod: st.MakePod().UID("uid").Obj(),
newObj: st.MakePod().UID("uid").Toleration("unrelated-key").Obj(),
oldObj: st.MakePod().UID("uid").Obj(),
expectedHint: framework.QueueSkip,
},
}
for _, testCase := range testCases {
t.Run(testCase.name, func(t *testing.T) {
logger, _ := ktesting.NewTestContext(t)
pl := &NodeUnschedulable{}
got, err := pl.isSchedulableAfterPodTolerationChange(logger, testCase.pod, testCase.oldObj, testCase.newObj)
if err != nil && !testCase.expectedErr {
t.Errorf("unexpected error: %v", err)
}
if got != testCase.expectedHint {
t.Errorf("isSchedulableAfterNodeChange() = %v, want %v", got, testCase.expectedHint)
}
})
}
}

View File

@ -69,7 +69,8 @@ const (
UpdatePodLabel
// UpdatePodScaleDown is an update for pod's scale down (i.e., any resource request is reduced).
UpdatePodScaleDown
// UpdatePodTolerations is an update for pod's tolerations.
// UpdatePodTolerations is an addition for pod's tolerations.
// (Due to API validation, we can add, but cannot modify or remove tolerations.)
UpdatePodTolerations
// UpdatePodSchedulingGatesEliminated is an update for pod's scheduling gates, which eliminates all scheduling gates in the Pod.
UpdatePodSchedulingGatesEliminated