Merge pull request #127473 from dom4ha/fine-grain-qhints-fit

feature(scheduler): more fine-grained Node QHint for NodeResourceFit plugin
Authored by Kubernetes Prow Robot on 2024-09-26 18:34:02 +01:00; committed by GitHub.
3 changed files with 283 additions and 16 deletions

View File

@@ -363,21 +363,68 @@ func (f *Fit) isSchedulableAfterPodScaleDown(targetPod, originalPod, modifiedPod
}
// isSchedulableAfterNodeChange is invoked whenever a node is added or changed. It checks whether
// that change made a previously unschedulable pod schedulable.
// that change could make a previously unschedulable pod schedulable.
func (f *Fit) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, modifiedNode, err := schedutil.As[*v1.Node](oldObj, newObj)
originalNode, modifiedNode, err := schedutil.As[*v1.Node](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
// TODO: also check whether the original node meets the pod's resource requests once preCheck is completely removed.
// See: https://github.com/kubernetes/kubernetes/issues/110175
if isFit(pod, modifiedNode) {
logger.V(5).Info("node was updated, and may fit with the pod's resource requestments", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
// Leave the pod in the unschedulable queue, since it won't fit on the modified node anyway.
if !isFit(pod, modifiedNode) {
logger.V(5).Info("node was created or updated, but it doesn't have enough resource(s) to accommodate this pod", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
}
// The pod would fit; since this is a node add event, queue it to unblock scheduling.
if originalNode == nil {
logger.V(5).Info("node was added and it might fit the pod's resource requests", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
// The pod would fit, but since none of the requested resources increased, the update won't make the pod schedulable.
if !haveAnyRequestedResourcesIncreased(pod, originalNode, modifiedNode) {
logger.V(5).Info("node was updated, but haven't changed the pod's resource requestments fit assessment", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
}
logger.V(5).Info("node was updated, and may now fit the pod's resource requests", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
// haveAnyRequestedResourcesIncreased returns true if the node's allocatable amount of any resource requested by the pod has increased, or if the node's allowed pod number has increased.
func haveAnyRequestedResourcesIncreased(pod *v1.Pod, originalNode, modifiedNode *v1.Node) bool {
podRequest := computePodResourceRequest(pod)
originalNodeInfo := framework.NewNodeInfo()
originalNodeInfo.SetNode(originalNode)
modifiedNodeInfo := framework.NewNodeInfo()
modifiedNodeInfo.SetNode(modifiedNode)
if modifiedNodeInfo.Allocatable.AllowedPodNumber > originalNodeInfo.Allocatable.AllowedPodNumber {
return true
}
if podRequest.MilliCPU == 0 &&
podRequest.Memory == 0 &&
podRequest.EphemeralStorage == 0 &&
len(podRequest.ScalarResources) == 0 {
return false
}
if (podRequest.MilliCPU > 0 && modifiedNodeInfo.Allocatable.MilliCPU > originalNodeInfo.Allocatable.MilliCPU) ||
(podRequest.Memory > 0 && modifiedNodeInfo.Allocatable.Memory > originalNodeInfo.Allocatable.Memory) ||
(podRequest.EphemeralStorage > 0 && modifiedNodeInfo.Allocatable.EphemeralStorage > originalNodeInfo.Allocatable.EphemeralStorage) {
return true
}
for rName, rQuant := range podRequest.ScalarResources {
// Skip resources whose requested quantity is zero.
if rQuant == 0 {
continue
}
if modifiedNodeInfo.Allocatable.ScalarResources[rName] > originalNodeInfo.Allocatable.ScalarResources[rName] {
return true
}
}
return false
}
// isFit checks if the pod fits the node. If the node is nil, it returns false.
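
For readers skimming the diff above: the hint now distinguishes three outcomes. A node that still cannot fit the pod yields QueueSkip; a newly added node that fits yields Queue; an updated node that fits yields Queue only if the allocatable amount of some requested resource (or the allowed pod count) actually grew. Below is a minimal, self-contained sketch of that flow; the types and helpers (resources, fits, hintAfterNodeChange) are simplified stand-ins for illustration, not the real framework.Resource/framework.NodeInfo API.

package main

import "fmt"

// Simplified stand-ins for the scheduler framework types; the real plugin
// works with framework.Resource and framework.NodeInfo instead.
type resources struct {
	milliCPU, memory, pods int64
	scalar                 map[string]int64
}

type hint string

const (
	queue     hint = "Queue"
	queueSkip hint = "QueueSkip"
)

// fits is a placeholder for the plugin's isFit check (CPU and memory only here).
func fits(req, alloc resources) bool {
	return req.milliCPU <= alloc.milliCPU && req.memory <= alloc.memory
}

// anyRequestedIncreased mirrors the spirit of haveAnyRequestedResourcesIncreased:
// did the allocatable amount of any requested resource, or the allowed pod
// count, grow between the two node versions?
func anyRequestedIncreased(req, oldAlloc, newAlloc resources) bool {
	if newAlloc.pods > oldAlloc.pods {
		return true
	}
	if req.milliCPU > 0 && newAlloc.milliCPU > oldAlloc.milliCPU {
		return true
	}
	if req.memory > 0 && newAlloc.memory > oldAlloc.memory {
		return true
	}
	for name, q := range req.scalar {
		if q > 0 && newAlloc.scalar[name] > oldAlloc.scalar[name] {
			return true
		}
	}
	return false
}

// hintAfterNodeChange sketches the new flow: skip if the pod still doesn't fit,
// queue on a node add, and on a node update queue only when a requested
// resource actually increased.
func hintAfterNodeChange(req resources, oldAlloc *resources, newAlloc resources) hint {
	if !fits(req, newAlloc) {
		return queueSkip
	}
	if oldAlloc == nil { // node add event
		return queue
	}
	if !anyRequestedIncreased(req, *oldAlloc, newAlloc) {
		return queueSkip
	}
	return queue
}

func main() {
	req := resources{milliCPU: 2000}
	oldNode := resources{milliCPU: 2000, memory: 1 << 30, pods: 110}
	// Unrelated change (more memory, none requested): prints QueueSkip.
	fmt.Println(hintAfterNodeChange(req, &oldNode, resources{milliCPU: 2000, memory: 2 << 30, pods: 110}))
	// A requested resource (CPU) increased: prints Queue.
	fmt.Println(hintAfterNodeChange(req, &oldNode, resources{milliCPU: 4000, memory: 1 << 30, pods: 110}))
}

The output, QueueSkip followed by Queue, matches the new unit-test expectations such as skip-queue-on-node-unrelated-changes and queue-on-pod-requested-resources-increase below.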

View File

@@ -1276,14 +1276,37 @@ func Test_isSchedulableAfterNodeChange(t *testing.T) {
}).Obj(),
expectedHint: framework.Queue,
},
// uncomment this case when isSchedulableAfterNodeChange also checks the
// original node's resources.
// "skip-queue-on-node-unrelated-changes": {
// pod: &v1.Pod{},
// oldObj: st.MakeNode().Obj(),
// newObj: st.MakeNode().Label("foo", "bar").Obj(),
// expectedHint: framework.QueueSkip,
// },
"skip-queue-on-node-unrelated-changes": {
pod: newResourcePod(framework.Resource{
Memory: 2,
ScalarResources: map[v1.ResourceName]int64{extendedResourceA: 1},
}),
oldObj: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceMemory: "2",
extendedResourceA: "2",
}).Obj(),
newObj: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceMemory: "2",
extendedResourceA: "1",
extendedResourceB: "2",
}).Obj(),
expectedHint: framework.QueueSkip,
},
"queue-on-pod-requested-resources-increase": {
pod: newResourcePod(framework.Resource{
Memory: 2,
ScalarResources: map[v1.ResourceName]int64{extendedResourceA: 1},
}),
oldObj: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceMemory: "2",
extendedResourceA: "1",
}).Obj(),
newObj: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceMemory: "2",
extendedResourceA: "2",
}).Obj(),
expectedHint: framework.Queue,
},
"skip-queue-on-node-changes-from-suitable-to-unsuitable": {
pod: newResourcePod(framework.Resource{
Memory: 2,
@@ -1364,3 +1387,136 @@ func TestIsFit(t *testing.T) {
})
}
}
func TestHaveAnyRequestedResourcesIncreased(t *testing.T) {
testCases := map[string]struct {
pod *v1.Pod
originalNode *v1.Node
modifiedNode *v1.Node
expected bool
}{
"no-requested-resources": {
pod: newResourcePod(framework.Resource{}),
originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourcePods: "1",
v1.ResourceCPU: "1",
v1.ResourceMemory: "1",
v1.ResourceEphemeralStorage: "1",
extendedResourceA: "1",
}).Obj(),
modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourcePods: "1",
v1.ResourceCPU: "2",
v1.ResourceMemory: "2",
v1.ResourceEphemeralStorage: "2",
extendedResourceA: "2",
}).Obj(),
expected: false,
},
"no-requested-resources-pods-increased": {
pod: newResourcePod(framework.Resource{}),
originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourcePods: "1",
v1.ResourceCPU: "1",
v1.ResourceMemory: "1",
v1.ResourceEphemeralStorage: "1",
extendedResourceA: "1",
}).Obj(),
modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourcePods: "2",
v1.ResourceCPU: "1",
v1.ResourceMemory: "1",
v1.ResourceEphemeralStorage: "1",
extendedResourceA: "1",
}).Obj(),
expected: true,
},
"requested-resources-decreased": {
pod: newResourcePod(framework.Resource{
MilliCPU: 2,
Memory: 2,
EphemeralStorage: 2,
ScalarResources: map[v1.ResourceName]int64{extendedResourceA: 2},
}),
originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceCPU: "1",
v1.ResourceMemory: "2",
v1.ResourceEphemeralStorage: "3",
extendedResourceA: "4",
}).Obj(),
modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceCPU: "1",
v1.ResourceMemory: "2",
v1.ResourceEphemeralStorage: "1",
extendedResourceA: "1",
}).Obj(),
expected: false,
},
"requested-resources-increased": {
pod: newResourcePod(framework.Resource{
MilliCPU: 2,
Memory: 2,
EphemeralStorage: 2,
ScalarResources: map[v1.ResourceName]int64{extendedResourceA: 2},
}),
originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceCPU: "1",
v1.ResourceMemory: "2",
v1.ResourceEphemeralStorage: "3",
extendedResourceA: "4",
}).Obj(),
modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceCPU: "3",
v1.ResourceMemory: "4",
v1.ResourceEphemeralStorage: "3",
extendedResourceA: "4",
}).Obj(),
expected: true,
},
"non-requested-resources-decreased": {
pod: newResourcePod(framework.Resource{
MilliCPU: 2,
Memory: 2,
}),
originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceCPU: "1",
v1.ResourceMemory: "2",
v1.ResourceEphemeralStorage: "3",
extendedResourceA: "4",
}).Obj(),
modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceCPU: "1",
v1.ResourceMemory: "2",
v1.ResourceEphemeralStorage: "1",
extendedResourceA: "1",
}).Obj(),
expected: false,
},
"non-requested-resources-increased": {
pod: newResourcePod(framework.Resource{
MilliCPU: 2,
Memory: 2,
}),
originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceCPU: "1",
v1.ResourceMemory: "2",
v1.ResourceEphemeralStorage: "3",
extendedResourceA: "4",
}).Obj(),
modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
v1.ResourceCPU: "1",
v1.ResourceMemory: "2",
v1.ResourceEphemeralStorage: "5",
extendedResourceA: "6",
}).Obj(),
expected: false,
},
}
for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
if got := haveAnyRequestedResourcesIncreased(tc.pod, tc.originalNode, tc.modifiedNode); got != tc.expected {
t.Errorf("expected: %v, got: %v", tc.expected, got)
}
})
}
}
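
As context for the integration cases in the next file: a Queue hint eventually moves a rejected pod back through the backoff/active queues for another scheduling attempt, while QueueSkip leaves it in the unschedulable pool. The toy model below is only a hedged illustration of that requeue mechanic; the queue type, its fields, and the hint callback signature are invented stand-ins, not the real scheduling queue implementation.

package main

import "fmt"

type hint string

const (
	queue     hint = "Queue"
	queueSkip hint = "QueueSkip"
)

// nodeEvent carries a node's state before and after a change; an empty
// oldNode could represent a Node add (illustrative only).
type nodeEvent struct {
	oldNode, newNode string
}

// hintFn is a stand-in for a plugin's queueing-hint callback.
type hintFn func(pod string, ev nodeEvent) hint

// unschedQueue is a toy model of the unschedulable pod pool: each pod is
// mapped to the hint callbacks of the plugins that rejected it.
type unschedQueue struct {
	pods map[string][]hintFn
}

// onNodeEvent re-evaluates every unschedulable pod; a single Queue answer from
// any registered callback requeues the pod, while unanimous QueueSkip leaves
// it in the pool.
func (q *unschedQueue) onNodeEvent(ev nodeEvent) (requeued []string) {
	for pod, fns := range q.pods {
		for _, fn := range fns {
			if fn(pod, ev) == queue {
				requeued = append(requeued, pod)
				delete(q.pods, pod)
				break
			}
		}
	}
	return requeued
}

func main() {
	// A callback that only requeues when the node's CPU grew (toy logic).
	cpuGrew := func(pod string, ev nodeEvent) hint {
		if ev.oldNode == "cpu:4" && ev.newNode == "cpu:5" {
			return queue
		}
		return queueSkip
	}
	q := &unschedQueue{pods: map[string][]hintFn{"pod1": {cpuGrew}}}

	fmt.Println(q.onNodeEvent(nodeEvent{oldNode: "cpu:4", newNode: "cpu:4,mem:4000"})) // []
	fmt.Println(q.onNodeEvent(nodeEvent{oldNode: "cpu:4", newNode: "cpu:5"}))          // [pod1]
}

The two calls mirror the first two integration scenarios below: an update touching only unrelated memory requeues nothing, while a CPU increase requeues pod1.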

View File

@@ -370,6 +370,70 @@ func TestCoreResourceEnqueue(t *testing.T) {
},
wantRequeuedPods: sets.New("pod1"),
},
{
name: "Pod rejected by the NodeResourcesFit plugin isn't requeued when a Node is updated without increase in the requested resources",
initialNodes: []*v1.Node{
st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Obj(),
st.MakeNode().Name("fake-node2").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Label("group", "b").Obj(),
},
pods: []*v1.Pod{
// - Pod1 requests the full available CPU of fake-node1, but will be rejected by the NodeAffinity plugin. Note that the NodeResourcesFit plugin is registered as a rejector (it rejected fake-node2), so its QHint will be consulted for this Pod.
st.MakePod().Name("pod1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(),
},
triggerFn: func(testCtx *testutils.TestContext) error {
// Trigger a NodeUpdate event that increases the unrelated (not requested) memory capacity of fake-node1, which should not requeue Pod1.
if _, err := testCtx.ClientSet.CoreV1().Nodes().UpdateStatus(testCtx.Ctx, st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "4", v1.ResourceMemory: "4000"}).Obj(), metav1.UpdateOptions{}); err != nil {
return fmt.Errorf("failed to update fake-node: %w", err)
}
return nil
},
wantRequeuedPods: sets.Set[string]{},
enableSchedulingQueueHint: []bool{true},
},
{
name: "Pod rejected by the NodeResourcesFit plugin is requeued when a Node is updated with increase in the requested resources",
initialNodes: []*v1.Node{
st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Obj(),
st.MakeNode().Name("fake-node2").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Label("group", "b").Obj(),
},
pods: []*v1.Pod{
// - Pod1 requests the full available CPU of fake-node1, but will be rejected by the NodeAffinity plugin. Note that the NodeResourcesFit plugin is registered as a rejector (it rejected fake-node2), so its QHint will be consulted for this Pod.
st.MakePod().Name("pod1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(),
},
triggerFn: func(testCtx *testutils.TestContext) error {
// Trigger a NodeUpdate event that increases the requested CPU capacity of fake-node1, which should requeue Pod1.
if _, err := testCtx.ClientSet.CoreV1().Nodes().UpdateStatus(testCtx.Ctx, st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "5"}).Obj(), metav1.UpdateOptions{}); err != nil {
return fmt.Errorf("failed to update fake-node: %w", err)
}
return nil
},
wantRequeuedPods: sets.New("pod1"),
enableSchedulingQueueHint: []bool{true},
},
{
name: "Pod rejected by the NodeResourcesFit plugin is requeued when a Node is updated with increase in the allowed pods number",
initialNodes: []*v1.Node{
st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourcePods: "2"}).Obj(),
st.MakeNode().Name("fake-node2").Capacity(map[v1.ResourceName]string{v1.ResourcePods: "1"}).Label("group", "b").Obj(),
},
initialPods: []*v1.Pod{
// - Pod1 will be scheduled on fake-node2 because of the affinity label.
st.MakePod().Name("pod1").NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Node("fake-node2").Obj(),
},
pods: []*v1.Pod{
// - Pod2 is unschedulable because Pod1 saturated the Pods capacity of fake-node2. Note that the NodeResourcesFit plugin is registered as a rejector (it rejected fake-node2), so its QHint will be consulted for this Pod.
st.MakePod().Name("pod2").NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(),
},
triggerFn: func(testCtx *testutils.TestContext) error {
// Trigger a NodeUpdate event that increases the allowed Pods number of fake-node1, which should requeue Pod2.
if _, err := testCtx.ClientSet.CoreV1().Nodes().UpdateStatus(testCtx.Ctx, st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourcePods: "3"}).Obj(), metav1.UpdateOptions{}); err != nil {
return fmt.Errorf("failed to update fake-node: %w", err)
}
return nil
},
wantRequeuedPods: sets.New("pod2"),
enableSchedulingQueueHint: []bool{true},
},
{
name: "Updating pod condition doesn't retry scheduling if the Pod was rejected by TaintToleration",
initialNodes: []*v1.Node{st.MakeNode().Name("fake-node").Taints([]v1.Taint{{Key: v1.TaintNodeNotReady, Effect: v1.TaintEffectNoSchedule}}).Obj()},