Merge pull request #127473 from dom4ha/fine-grain-qhints-fit

feature(scheduler): more fine-grained Node QHint for NodeResourceFit plugin
Kubernetes Prow Robot
2024-09-26 18:34:02 +01:00
committed by GitHub
3 changed files with 283 additions and 16 deletions


@@ -363,21 +363,68 @@ func (f *Fit) isSchedulableAfterPodScaleDown(targetPod, originalPod, modifiedPod
}
// isSchedulableAfterNodeChange is invoked whenever a node is added or changed. It checks whether
// that change made a previously unschedulable pod schedulable.
// that change could make a previously unschedulable pod schedulable.
func (f *Fit) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, modifiedNode, err := schedutil.As[*v1.Node](oldObj, newObj)
originalNode, modifiedNode, err := schedutil.As[*v1.Node](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
// TODO: also check if the original node meets the pod's resource requests once preCheck is completely removed.
// See: https://github.com/kubernetes/kubernetes/issues/110175
if isFit(pod, modifiedNode) {
logger.V(5).Info("node was updated, and may fit with the pod's resource requestments", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
// Leave the pod in the unschedulable queue, since it won't fit the modified node anyway.
if !isFit(pod, modifiedNode) {
logger.V(5).Info("node was created or updated, but it doesn't have enough resource(s) to accommodate this pod", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
}
// The pod will fit; since this is a node add event, unblock scheduling.
if originalNode == nil {
logger.V(5).Info("node was added and it might fit the pod's resource requests", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
// The pod will fit, but since there was no increase in available resources, the change won't make the pod schedulable.
if !haveAnyRequestedResourcesIncreased(pod, originalNode, modifiedNode) {
logger.V(5).Info("node was updated, but haven't changed the pod's resource requestments fit assessment", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
}
logger.V(5).Info("node was created or updated, but it doesn't have enough resource(s) to accommodate this pod", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
logger.V(5).Info("node was updated, and may now fit the pod's resource requests", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
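
A minimal, self-contained sketch of the same decision order, reduced to plain values. The names below (simpleHint, caps, hintForNodeChange) are hypothetical stand-ins for framework.QueueingHint and the NodeInfo allocatable fields; this is an illustration only, not part of the change.

// NOTE: illustrative sketch, not part of this commit; simplified hypothetical types.
package main

import "fmt"

type simpleHint string

const (
	queue     simpleHint = "Queue"
	queueSkip simpleHint = "QueueSkip"
)

// caps stands in for the requested/allocatable CPU and memory of a pod or node.
type caps struct{ milliCPU, memory int64 }

func fits(req, alloc caps) bool {
	return req.milliCPU <= alloc.milliCPU && req.memory <= alloc.memory
}

// hintForNodeChange mirrors the decision order above: skip while the pod still
// doesn't fit, queue on a node add, and otherwise queue only when a resource
// the pod requests has grown on the node.
func hintForNodeChange(req caps, original *caps, modified caps) simpleHint {
	if !fits(req, modified) {
		return queueSkip
	}
	if original == nil { // node add event
		return queue
	}
	increased := (req.milliCPU > 0 && modified.milliCPU > original.milliCPU) ||
		(req.memory > 0 && modified.memory > original.memory)
	if !increased {
		return queueSkip
	}
	return queue
}

func main() {
	req := caps{milliCPU: 1000, memory: 1 << 30}
	old := caps{milliCPU: 2000, memory: 2 << 30}
	// An update that leaves both capacities unchanged (e.g. only a label changed).
	fmt.Println(hintForNodeChange(req, &old, old)) // QueueSkip
}

Only the last branch is new relative to the pre-change behavior: an update that leaves every requested resource unchanged is now skipped instead of requeued.
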
// haveAnyRequestedResourcesIncreased returns true if any of the resources requested by the pod has increased on the node, or if the allowed pod number has increased.
func haveAnyRequestedResourcesIncreased(pod *v1.Pod, originalNode, modifiedNode *v1.Node) bool {
podRequest := computePodResourceRequest(pod)
originalNodeInfo := framework.NewNodeInfo()
originalNodeInfo.SetNode(originalNode)
modifiedNodeInfo := framework.NewNodeInfo()
modifiedNodeInfo.SetNode(modifiedNode)
if modifiedNodeInfo.Allocatable.AllowedPodNumber > originalNodeInfo.Allocatable.AllowedPodNumber {
return true
}
if podRequest.MilliCPU == 0 &&
podRequest.Memory == 0 &&
podRequest.EphemeralStorage == 0 &&
len(podRequest.ScalarResources) == 0 {
return false
}
if (podRequest.MilliCPU > 0 && modifiedNodeInfo.Allocatable.MilliCPU > originalNodeInfo.Allocatable.MilliCPU) ||
(podRequest.Memory > 0 && modifiedNodeInfo.Allocatable.Memory > originalNodeInfo.Allocatable.Memory) ||
(podRequest.EphemeralStorage > 0 && modifiedNodeInfo.Allocatable.EphemeralStorage > originalNodeInfo.Allocatable.EphemeralStorage) {
return true
}
for rName, rQuant := range podRequest.ScalarResources {
// Skip if the requested quantity is zero
if rQuant == 0 {
continue
}
if modifiedNodeInfo.Allocatable.ScalarResources[rName] > originalNodeInfo.Allocatable.ScalarResources[rName] {
return true
}
}
return false
}
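
The scalar-resource loop above only compares resources the pod actually requests. As a standalone illustration over plain maps instead of framework.Resource (the names anyRequestedScalarIncreased, oldAlloc, newAlloc are hypothetical), relying on Go's zero value for missing map keys:

// NOTE: illustrative sketch, not part of this commit.
package main

import "fmt"

// anyRequestedScalarIncreased applies the same rule as the loop above to plain
// maps: a resource is only compared when the pod requests a non-zero amount of
// it, and a missing key reads as zero.
func anyRequestedScalarIncreased(requested, oldAlloc, newAlloc map[string]int64) bool {
	for name, quantity := range requested {
		if quantity == 0 {
			continue // a zero request can never be what blocks scheduling
		}
		if newAlloc[name] > oldAlloc[name] {
			return true
		}
	}
	return false
}

func main() {
	requested := map[string]int64{"example.com/gpu": 1}
	oldAlloc := map[string]int64{"example.com/gpu": 1}
	// Only a resource the pod never asked for grew, so the answer stays false.
	newAlloc := map[string]int64{"example.com/gpu": 1, "example.com/fpga": 2}
	fmt.Println(anyRequestedScalarIncreased(requested, oldAlloc, newAlloc)) // false
}
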
// isFit checks if the pod fits the node. If the node is nil, it returns false.


@@ -1276,14 +1276,37 @@ func Test_isSchedulableAfterNodeChange(t *testing.T) {
}).Obj(),
expectedHint: framework.Queue,
},
// Uncomment this case once isSchedulableAfterNodeChange also checks the
// original node's resources.
// "skip-queue-on-node-unrelated-changes": {
// pod: &v1.Pod{},
// oldObj: st.MakeNode().Obj(),
// newObj: st.MakeNode().Label("foo", "bar").Obj(),
// expectedHint: framework.QueueSkip,
// },
"skip-queue-on-node-unrelated-changes": {
pod: newResourcePod(framework.Resource{
Memory: 2,
ScalarResources: map[v1.ResourceName]int64{extendedResourceA: 1},
}),
oldObj: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceMemory: "2",
extendedResourceA: "2",
}).Obj(),
newObj: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceMemory: "2",
extendedResourceA: "1",
extendedResourceB: "2",
}).Obj(),
expectedHint: framework.QueueSkip,
},
"queue-on-pod-requested-resources-increase": {
pod: newResourcePod(framework.Resource{
Memory: 2,
ScalarResources: map[v1.ResourceName]int64{extendedResourceA: 1},
}),
oldObj: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceMemory: "2",
extendedResourceA: "1",
}).Obj(),
newObj: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceMemory: "2",
extendedResourceA: "2",
}).Obj(),
expectedHint: framework.Queue,
},
"skip-queue-on-node-changes-from-suitable-to-unsuitable": {
pod: newResourcePod(framework.Resource{
Memory: 2,
@@ -1364,3 +1387,136 @@ func TestIsFit(t *testing.T) {
})
}
}
func TestHaveAnyRequestedResourcesIncreased(t *testing.T) {
testCases := map[string]struct {
pod *v1.Pod
originalNode *v1.Node
modifiedNode *v1.Node
expected bool
}{
"no-requested-resources": {
pod: newResourcePod(framework.Resource{}),
originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourcePods: "1",
v1.ResourceCPU: "1",
v1.ResourceMemory: "1",
v1.ResourceEphemeralStorage: "1",
extendedResourceA: "1",
}).Obj(),
modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourcePods: "1",
v1.ResourceCPU: "2",
v1.ResourceMemory: "2",
v1.ResourceEphemeralStorage: "2",
extendedResourceA: "2",
}).Obj(),
expected: false,
},
"no-requested-resources-pods-increased": {
pod: newResourcePod(framework.Resource{}),
originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourcePods: "1",
v1.ResourceCPU: "1",
v1.ResourceMemory: "1",
v1.ResourceEphemeralStorage: "1",
extendedResourceA: "1",
}).Obj(),
modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourcePods: "2",
v1.ResourceCPU: "1",
v1.ResourceMemory: "1",
v1.ResourceEphemeralStorage: "1",
extendedResourceA: "1",
}).Obj(),
expected: true,
},
"requested-resources-decreased": {
pod: newResourcePod(framework.Resource{
MilliCPU: 2,
Memory: 2,
EphemeralStorage: 2,
ScalarResources: map[v1.ResourceName]int64{extendedResourceA: 2},
}),
originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceCPU: "1",
v1.ResourceMemory: "2",
v1.ResourceEphemeralStorage: "3",
extendedResourceA: "4",
}).Obj(),
modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceCPU: "1",
v1.ResourceMemory: "2",
v1.ResourceEphemeralStorage: "1",
extendedResourceA: "1",
}).Obj(),
expected: false,
},
"requested-resources-increased": {
pod: newResourcePod(framework.Resource{
MilliCPU: 2,
Memory: 2,
EphemeralStorage: 2,
ScalarResources: map[v1.ResourceName]int64{extendedResourceA: 2},
}),
originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceCPU: "1",
v1.ResourceMemory: "2",
v1.ResourceEphemeralStorage: "3",
extendedResourceA: "4",
}).Obj(),
modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceCPU: "3",
v1.ResourceMemory: "4",
v1.ResourceEphemeralStorage: "3",
extendedResourceA: "4",
}).Obj(),
expected: true,
},
"non-requested-resources-decreased": {
pod: newResourcePod(framework.Resource{
MilliCPU: 2,
Memory: 2,
}),
originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceCPU: "1",
v1.ResourceMemory: "2",
v1.ResourceEphemeralStorage: "3",
extendedResourceA: "4",
}).Obj(),
modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceCPU: "1",
v1.ResourceMemory: "2",
v1.ResourceEphemeralStorage: "1",
extendedResourceA: "1",
}).Obj(),
expected: false,
},
"non-requested-resources-increased": {
pod: newResourcePod(framework.Resource{
MilliCPU: 2,
Memory: 2,
}),
originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceCPU: "1",
v1.ResourceMemory: "2",
v1.ResourceEphemeralStorage: "3",
extendedResourceA: "4",
}).Obj(),
modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceCPU: "1",
v1.ResourceMemory: "2",
v1.ResourceEphemeralStorage: "5",
extendedResourceA: "6",
}).Obj(),
expected: false,
},
}
for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
if got := haveAnyRequestedResourcesIncreased(tc.pod, tc.originalNode, tc.modifiedNode); got != tc.expected {
t.Errorf("expected: %v, got: %v", tc.expected, got)
}
})
}
}


@@ -370,6 +370,70 @@ func TestCoreResourceEnqueue(t *testing.T) {
},
wantRequeuedPods: sets.New("pod1"),
},
{
name: "Pod rejected by the NodeResourcesFit plugin isn't requeued when a Node is updated without increase in the requested resources",
initialNodes: []*v1.Node{
st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Obj(),
st.MakeNode().Name("fake-node2").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Label("group", "b").Obj(),
},
pods: []*v1.Pod{
// - Pod1 requests the amount of CPU available on fake-node1, but will be rejected by the NodeAffinity plugin. Note that the NodeResourcesFit plugin will register for QHints because it rejected fake-node2.
st.MakePod().Name("pod1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(),
},
triggerFn: func(testCtx *testutils.TestContext) error {
// Trigger a NodeUpdate event that increases the unrelated (not requested) memory capacity of fake-node1, which should not requeue Pod1.
if _, err := testCtx.ClientSet.CoreV1().Nodes().UpdateStatus(testCtx.Ctx, st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "4", v1.ResourceMemory: "4000"}).Obj(), metav1.UpdateOptions{}); err != nil {
return fmt.Errorf("failed to update fake-node: %w", err)
}
return nil
},
wantRequeuedPods: sets.Set[string]{},
enableSchedulingQueueHint: []bool{true},
},
{
name: "Pod rejected by the NodeResourcesFit plugin is requeued when a Node is updated with increase in the requested resources",
initialNodes: []*v1.Node{
st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Obj(),
st.MakeNode().Name("fake-node2").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Label("group", "b").Obj(),
},
pods: []*v1.Pod{
// - Pod1 requests the amount of CPU available on fake-node1, but will be rejected by the NodeAffinity plugin. Note that the NodeResourcesFit plugin will register for QHints because it rejected fake-node2.
st.MakePod().Name("pod1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(),
},
triggerFn: func(testCtx *testutils.TestContext) error {
// Trigger a NodeUpdate event that increases the CPU capacity of fake-node1, a resource Pod1 requests, which should requeue Pod1.
if _, err := testCtx.ClientSet.CoreV1().Nodes().UpdateStatus(testCtx.Ctx, st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "5"}).Obj(), metav1.UpdateOptions{}); err != nil {
return fmt.Errorf("failed to update fake-node: %w", err)
}
return nil
},
wantRequeuedPods: sets.New("pod1"),
enableSchedulingQueueHint: []bool{true},
},
{
name: "Pod rejected by the NodeResourcesFit plugin is requeued when a Node is updated with increase in the allowed pods number",
initialNodes: []*v1.Node{
st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourcePods: "2"}).Obj(),
st.MakeNode().Name("fake-node2").Capacity(map[v1.ResourceName]string{v1.ResourcePods: "1"}).Label("group", "b").Obj(),
},
initialPods: []*v1.Pod{
// - Pod1 will be scheduled on fake-node2 because of the affinity label.
st.MakePod().Name("pod1").NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Node("fake-node2").Obj(),
},
pods: []*v1.Pod{
// - Pod2 is unschedulable because Pod1 saturated the ResourcePods limit on fake-node2. Note that the NodeResourcesFit plugin will register for QHints because it rejected fake-node2.
st.MakePod().Name("pod2").NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(),
},
triggerFn: func(testCtx *testutils.TestContext) error {
// Trigger a NodeUpdate event that increases the number of allowed Pods on fake-node1, which should requeue Pod2.
if _, err := testCtx.ClientSet.CoreV1().Nodes().UpdateStatus(testCtx.Ctx, st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourcePods: "3"}).Obj(), metav1.UpdateOptions{}); err != nil {
return fmt.Errorf("failed to update fake-node: %w", err)
}
return nil
},
wantRequeuedPods: sets.New("pod2"),
enableSchedulingQueueHint: []bool{true},
},
{
name: "Updating pod condition doesn't retry scheduling if the Pod was rejected by TaintToleration",
initialNodes: []*v1.Node{st.MakeNode().Name("fake-node").Taints([]v1.Taint{{Key: v1.TaintNodeNotReady, Effect: v1.TaintEffectNoSchedule}}).Obj()},