Mirror of https://github.com/k3s-io/kubernetes.git, synced 2025-09-14 13:45:06 +00:00
Merge pull request #127473 from dom4ha/fine-grain-qhints-fit
feature(scheduler): more fine-grained Node QHint for NodeResourceFit plugin
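At a high level, this change makes the NodeResourcesFit queueing hint fine-grained: a Node update only requeues a rejected Pod when the node now fits the Pod and some resource the Pod actually requests (or the allowed Pod count) has increased, while plain Node additions still requeue. A minimal, self-contained sketch of that decision order follows; the hint, resources, fits, and nodeChangeHint names are illustrative stand-ins, not the plugin's real types or helpers.

package main

import "fmt"

// Illustrative sketch only: these types and helpers are stand-ins, not the
// scheduler's actual API.

// hint stands in for framework.QueueingHint.
type hint string

const (
	queue     hint = "Queue"
	queueSkip hint = "QueueSkip"
)

// resources stands in for a node's allocatable or a pod's requests.
type resources struct{ milliCPU, memory int64 }

func fits(req, alloc resources) bool {
	return req.milliCPU <= alloc.milliCPU && req.memory <= alloc.memory
}

// nodeChangeHint mirrors the decision order introduced here: skip if the
// modified node still cannot hold the pod, queue if the node is newly added,
// skip if no requested resource increased, otherwise queue.
func nodeChangeHint(req resources, original *resources, modified resources) hint {
	if !fits(req, modified) {
		return queueSkip
	}
	if original == nil {
		return queue // node was added
	}
	increased := (req.milliCPU > 0 && modified.milliCPU > original.milliCPU) ||
		(req.memory > 0 && modified.memory > original.memory)
	if !increased {
		return queueSkip
	}
	return queue
}

func main() {
	pod := resources{milliCPU: 4000} // requests 4 CPU, no memory
	old := resources{milliCPU: 4000, memory: 1000}
	memoryOnlyGrew := resources{milliCPU: 4000, memory: 4000}
	cpuGrew := resources{milliCPU: 5000, memory: 1000}

	fmt.Println(nodeChangeHint(pod, &old, memoryOnlyGrew)) // QueueSkip: only an unrequested resource grew
	fmt.Println(nodeChangeHint(pod, &old, cpuGrew))        // Queue: requested CPU grew
	fmt.Println(nodeChangeHint(pod, nil, cpuGrew))         // Queue: node was added
}

With the previous logic in the diff below, the first call would have returned Queue simply because the modified node fits the pod, even though only memory, which the pod does not request, grew.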
@@ -363,21 +363,68 @@ func (f *Fit) isSchedulableAfterPodScaleDown(targetPod, originalPod, modifiedPod
 }
 
 // isSchedulableAfterNodeChange is invoked whenever a node added or changed. It checks whether
-// that change made a previously unschedulable pod schedulable.
+// that change could make a previously unschedulable pod schedulable.
 func (f *Fit) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
-	_, modifiedNode, err := schedutil.As[*v1.Node](oldObj, newObj)
+	originalNode, modifiedNode, err := schedutil.As[*v1.Node](oldObj, newObj)
 	if err != nil {
 		return framework.Queue, err
 	}
-	// TODO: also check if the original node meets the pod's resource requestments once preCheck is completely removed.
-	// See: https://github.com/kubernetes/kubernetes/issues/110175
-	if isFit(pod, modifiedNode) {
-		logger.V(5).Info("node was updated, and may fit with the pod's resource requestments", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
+	// Leaving in the queue, since the pod won't fit into the modified node anyway.
+	if !isFit(pod, modifiedNode) {
+		logger.V(5).Info("node was created or updated, but it doesn't have enough resource(s) to accommodate this pod", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
+		return framework.QueueSkip, nil
+	}
+	// The pod will fit, so since it's add, unblock scheduling.
+	if originalNode == nil {
+		logger.V(5).Info("node was added and it might fit the pod's resource requests", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
 		return framework.Queue, nil
 	}
+	// The pod will fit, but since there was no increase in available resources, the change won't make the pod schedulable.
+	if !haveAnyRequestedResourcesIncreased(pod, originalNode, modifiedNode) {
+		logger.V(5).Info("node was updated, but haven't changed the pod's resource requestments fit assessment", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
+		return framework.QueueSkip, nil
+	}
 
-	logger.V(5).Info("node was created or updated, but it doesn't have enough resource(s) to accommodate this pod", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
-	return framework.QueueSkip, nil
+	logger.V(5).Info("node was updated, and may now fit the pod's resource requests", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
+	return framework.Queue, nil
 }
 
+// haveAnyRequestedResourcesIncreased returns true if any of the resources requested by the pod have increased or if allowed pod number increased.
+func haveAnyRequestedResourcesIncreased(pod *v1.Pod, originalNode, modifiedNode *v1.Node) bool {
+	podRequest := computePodResourceRequest(pod)
+	originalNodeInfo := framework.NewNodeInfo()
+	originalNodeInfo.SetNode(originalNode)
+	modifiedNodeInfo := framework.NewNodeInfo()
+	modifiedNodeInfo.SetNode(modifiedNode)
+
+	if modifiedNodeInfo.Allocatable.AllowedPodNumber > originalNodeInfo.Allocatable.AllowedPodNumber {
+		return true
+	}
+
+	if podRequest.MilliCPU == 0 &&
+		podRequest.Memory == 0 &&
+		podRequest.EphemeralStorage == 0 &&
+		len(podRequest.ScalarResources) == 0 {
+		return false
+	}
+
+	if (podRequest.MilliCPU > 0 && modifiedNodeInfo.Allocatable.MilliCPU > originalNodeInfo.Allocatable.MilliCPU) ||
+		(podRequest.Memory > 0 && modifiedNodeInfo.Allocatable.Memory > originalNodeInfo.Allocatable.Memory) ||
+		(podRequest.EphemeralStorage > 0 && modifiedNodeInfo.Allocatable.EphemeralStorage > originalNodeInfo.Allocatable.EphemeralStorage) {
+		return true
+	}
+
+	for rName, rQuant := range podRequest.ScalarResources {
+		// Skip in case request quantity is zero
+		if rQuant == 0 {
+			continue
+		}
+
+		if modifiedNodeInfo.Allocatable.ScalarResources[rName] > originalNodeInfo.Allocatable.ScalarResources[rName] {
+			return true
+		}
+	}
+	return false
+}
+
 // isFit checks if the pod fits the node. If the node is nil, it returns false.
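haveAnyRequestedResourcesIncreased is unexported and works on the framework's NodeInfo, so it cannot be called directly from outside the plugin package. The sketch below restates the same comparison over plain v1.ResourceList values to show the behavior the test cases further down exercise; anyRequestedIncreased and the example.com resource names are illustrative, not part of the plugin.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// anyRequestedIncreased is an illustrative, simplified restatement of the
// comparison haveAnyRequestedResourcesIncreased performs: it reports whether
// the allowed pod count grew, or whether any resource the pod actually
// requests grew between the two allocatable lists.
func anyRequestedIncreased(requests, oldAlloc, newAlloc v1.ResourceList) bool {
	oldPods := oldAlloc[v1.ResourcePods]
	newPods := newAlloc[v1.ResourcePods]
	if newPods.Cmp(oldPods) > 0 {
		return true
	}
	for name, req := range requests {
		if req.IsZero() || name == v1.ResourcePods {
			continue
		}
		oldQ := oldAlloc[name]
		newQ := newAlloc[name]
		if newQ.Cmp(oldQ) > 0 {
			return true
		}
	}
	return false
}

func main() {
	// Like the "skip-queue-on-node-unrelated-changes" case below: the pod requests
	// memory and extended resource A, but only an unrelated extended resource B
	// appears on the node (and A even shrinks), so nothing requested increased.
	requests := v1.ResourceList{
		v1.ResourceMemory:        resource.MustParse("2"),
		"example.com/extended-a": resource.MustParse("1"),
	}
	oldAlloc := v1.ResourceList{
		v1.ResourceMemory:        resource.MustParse("2"),
		"example.com/extended-a": resource.MustParse("2"),
	}
	newAlloc := v1.ResourceList{
		v1.ResourceMemory:        resource.MustParse("2"),
		"example.com/extended-a": resource.MustParse("1"),
		"example.com/extended-b": resource.MustParse("2"),
	}
	fmt.Println(anyRequestedIncreased(requests, oldAlloc, newAlloc)) // false

	// Like the "queue-on-pod-requested-resources-increase" case below: the
	// requested extended resource A grows, so the pod is worth requeueing.
	newAlloc["example.com/extended-a"] = resource.MustParse("3")
	fmt.Println(anyRequestedIncreased(requests, oldAlloc, newAlloc)) // true
}

The first call returns false because only a resource the pod does not request changed, which is exactly the situation the new hint now classifies as QueueSkip instead of Queue.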
@@ -1276,14 +1276,37 @@ func Test_isSchedulableAfterNodeChange(t *testing.T) {
 			}).Obj(),
 			expectedHint: framework.Queue,
 		},
-		// uncomment this case when the isSchedulableAfterNodeChange also check the
-		// original node's resources.
-		// "skip-queue-on-node-unrelated-changes": {
-		// 	pod: &v1.Pod{},
-		// 	oldObj: st.MakeNode().Obj(),
-		// 	newObj: st.MakeNode().Label("foo", "bar").Obj(),
-		// 	expectedHint: framework.QueueSkip,
-		// },
+		"skip-queue-on-node-unrelated-changes": {
+			pod: newResourcePod(framework.Resource{
+				Memory: 2,
+				ScalarResources: map[v1.ResourceName]int64{extendedResourceA: 1},
+			}),
+			oldObj: st.MakeNode().Capacity(map[v1.ResourceName]string{
+				v1.ResourceMemory: "2",
+				extendedResourceA: "2",
+			}).Obj(),
+			newObj: st.MakeNode().Capacity(map[v1.ResourceName]string{
+				v1.ResourceMemory: "2",
+				extendedResourceA: "1",
+				extendedResourceB: "2",
+			}).Obj(),
+			expectedHint: framework.QueueSkip,
+		},
+		"queue-on-pod-requested-resources-increase": {
+			pod: newResourcePod(framework.Resource{
+				Memory: 2,
+				ScalarResources: map[v1.ResourceName]int64{extendedResourceA: 1},
+			}),
+			oldObj: st.MakeNode().Capacity(map[v1.ResourceName]string{
+				v1.ResourceMemory: "2",
+				extendedResourceA: "1",
+			}).Obj(),
+			newObj: st.MakeNode().Capacity(map[v1.ResourceName]string{
+				v1.ResourceMemory: "2",
+				extendedResourceA: "2",
+			}).Obj(),
+			expectedHint: framework.Queue,
+		},
 		"skip-queue-on-node-changes-from-suitable-to-unsuitable": {
 			pod: newResourcePod(framework.Resource{
 				Memory: 2,
@@ -1364,3 +1387,136 @@ func TestIsFit(t *testing.T) {
 		})
 	}
 }
+
+func TestHaveAnyRequestedResourcesIncreased(t *testing.T) {
+	testCases := map[string]struct {
+		pod          *v1.Pod
+		originalNode *v1.Node
+		modifiedNode *v1.Node
+		expected     bool
+	}{
+		"no-requested-resources": {
+			pod: newResourcePod(framework.Resource{}),
+			originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+				v1.ResourcePods: "1",
+				v1.ResourceCPU: "1",
+				v1.ResourceMemory: "1",
+				v1.ResourceEphemeralStorage: "1",
+				extendedResourceA: "1",
+			}).Obj(),
+			modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+				v1.ResourcePods: "1",
+				v1.ResourceCPU: "2",
+				v1.ResourceMemory: "2",
+				v1.ResourceEphemeralStorage: "2",
+				extendedResourceA: "2",
+			}).Obj(),
+			expected: false,
+		},
+		"no-requested-resources-pods-increased": {
+			pod: newResourcePod(framework.Resource{}),
+			originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+				v1.ResourcePods: "1",
+				v1.ResourceCPU: "1",
+				v1.ResourceMemory: "1",
+				v1.ResourceEphemeralStorage: "1",
+				extendedResourceA: "1",
+			}).Obj(),
+			modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+				v1.ResourcePods: "2",
+				v1.ResourceCPU: "1",
+				v1.ResourceMemory: "1",
+				v1.ResourceEphemeralStorage: "1",
+				extendedResourceA: "1",
+			}).Obj(),
+			expected: true,
+		},
+		"requested-resources-decreased": {
+			pod: newResourcePod(framework.Resource{
+				MilliCPU: 2,
+				Memory: 2,
+				EphemeralStorage: 2,
+				ScalarResources: map[v1.ResourceName]int64{extendedResourceA: 2},
+			}),
+			originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+				v1.ResourceCPU: "1",
+				v1.ResourceMemory: "2",
+				v1.ResourceEphemeralStorage: "3",
+				extendedResourceA: "4",
+			}).Obj(),
+			modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+				v1.ResourceCPU: "1",
+				v1.ResourceMemory: "2",
+				v1.ResourceEphemeralStorage: "1",
+				extendedResourceA: "1",
+			}).Obj(),
+			expected: false,
+		},
+		"requested-resources-increased": {
+			pod: newResourcePod(framework.Resource{
+				MilliCPU: 2,
+				Memory: 2,
+				EphemeralStorage: 2,
+				ScalarResources: map[v1.ResourceName]int64{extendedResourceA: 2},
+			}),
+			originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+				v1.ResourceCPU: "1",
+				v1.ResourceMemory: "2",
+				v1.ResourceEphemeralStorage: "3",
+				extendedResourceA: "4",
+			}).Obj(),
+			modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+				v1.ResourceCPU: "3",
+				v1.ResourceMemory: "4",
+				v1.ResourceEphemeralStorage: "3",
+				extendedResourceA: "4",
+			}).Obj(),
+			expected: true,
+		},
+		"non-requested-resources-decreased": {
+			pod: newResourcePod(framework.Resource{
+				MilliCPU: 2,
+				Memory: 2,
+			}),
+			originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+				v1.ResourceCPU: "1",
+				v1.ResourceMemory: "2",
+				v1.ResourceEphemeralStorage: "3",
+				extendedResourceA: "4",
+			}).Obj(),
+			modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+				v1.ResourceCPU: "1",
+				v1.ResourceMemory: "2",
+				v1.ResourceEphemeralStorage: "1",
+				extendedResourceA: "1",
+			}).Obj(),
+			expected: false,
+		},
+		"non-requested-resources-increased": {
+			pod: newResourcePod(framework.Resource{
+				MilliCPU: 2,
+				Memory: 2,
+			}),
+			originalNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+				v1.ResourceCPU: "1",
+				v1.ResourceMemory: "2",
+				v1.ResourceEphemeralStorage: "3",
+				extendedResourceA: "4",
+			}).Obj(),
+			modifiedNode: st.MakeNode().Capacity(map[v1.ResourceName]string{
+				v1.ResourceCPU: "1",
+				v1.ResourceMemory: "2",
+				v1.ResourceEphemeralStorage: "5",
+				extendedResourceA: "6",
+			}).Obj(),
+			expected: false,
+		},
+	}
+	for name, tc := range testCases {
+		t.Run(name, func(t *testing.T) {
+			if got := haveAnyRequestedResourcesIncreased(tc.pod, tc.originalNode, tc.modifiedNode); got != tc.expected {
+				t.Errorf("expected: %v, got: %v", tc.expected, got)
+			}
+		})
+	}
+}
@@ -370,6 +370,70 @@ func TestCoreResourceEnqueue(t *testing.T) {
 			},
 			wantRequeuedPods: sets.New("pod1"),
 		},
+		{
+			name: "Pod rejected by the NodeResourcesFit plugin isn't requeued when a Node is updated without increase in the requested resources",
+			initialNodes: []*v1.Node{
+				st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Obj(),
+				st.MakeNode().Name("fake-node2").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Label("group", "b").Obj(),
+			},
+			pods: []*v1.Pod{
+				// - Pod1 requests available amount of CPU (in fake-node1), but will be rejected by NodeAffinity plugin. Note that the NodeResourceFit plugin will register for QHints because it rejected fake-node2.
+				st.MakePod().Name("pod1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(),
+			},
+			triggerFn: func(testCtx *testutils.TestContext) error {
+				// Trigger a NodeUpdate event that increases unrealted (not requested) memory capacity of fake-node1, which should not requeue Pod1.
+				if _, err := testCtx.ClientSet.CoreV1().Nodes().UpdateStatus(testCtx.Ctx, st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "4", v1.ResourceMemory: "4000"}).Obj(), metav1.UpdateOptions{}); err != nil {
+					return fmt.Errorf("failed to update fake-node: %w", err)
+				}
+				return nil
+			},
+			wantRequeuedPods: sets.Set[string]{},
+			enableSchedulingQueueHint: []bool{true},
+		},
+		{
+			name: "Pod rejected by the NodeResourcesFit plugin is requeued when a Node is updated with increase in the requested resources",
+			initialNodes: []*v1.Node{
+				st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Obj(),
+				st.MakeNode().Name("fake-node2").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Label("group", "b").Obj(),
+			},
+			pods: []*v1.Pod{
+				// - Pod1 requests available amount of CPU (in fake-node1), but will be rejected by NodeAffinity plugin. Note that the NodeResourceFit plugin will register for QHints because it rejected fake-node2.
+				st.MakePod().Name("pod1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(),
+			},
+			triggerFn: func(testCtx *testutils.TestContext) error {
+				// Trigger a NodeUpdate event that increases the requested CPU capacity of fake-node1, which should requeue Pod1.
+				if _, err := testCtx.ClientSet.CoreV1().Nodes().UpdateStatus(testCtx.Ctx, st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "5"}).Obj(), metav1.UpdateOptions{}); err != nil {
+					return fmt.Errorf("failed to update fake-node: %w", err)
+				}
+				return nil
+			},
+			wantRequeuedPods: sets.New("pod1"),
+			enableSchedulingQueueHint: []bool{true},
+		},
+		{
+			name: "Pod rejected by the NodeResourcesFit plugin is requeued when a Node is updated with increase in the allowed pods number",
+			initialNodes: []*v1.Node{
+				st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourcePods: "2"}).Obj(),
+				st.MakeNode().Name("fake-node2").Capacity(map[v1.ResourceName]string{v1.ResourcePods: "1"}).Label("group", "b").Obj(),
+			},
+			initialPods: []*v1.Pod{
+				// - Pod1 will be scheduled on fake-node2 because of the affinity label.
+				st.MakePod().Name("pod1").NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Node("fake-node2").Obj(),
+			},
+			pods: []*v1.Pod{
+				// - Pod2 is unschedulable because Pod1 saturated ResourcePods limit in fake-node2. Note that the NodeResourceFit plugin will register for QHints because it rejected fake-node2.
+				st.MakePod().Name("pod2").NodeAffinityIn("group", []string{"b"}, st.NodeSelectorTypeMatchExpressions).Container("image").Obj(),
+			},
+			triggerFn: func(testCtx *testutils.TestContext) error {
+				// Trigger a NodeUpdate event that increases the allowed Pods number of fake-node1, which should requeue Pod2.
+				if _, err := testCtx.ClientSet.CoreV1().Nodes().UpdateStatus(testCtx.Ctx, st.MakeNode().Name("fake-node1").Capacity(map[v1.ResourceName]string{v1.ResourcePods: "3"}).Obj(), metav1.UpdateOptions{}); err != nil {
+					return fmt.Errorf("failed to update fake-node: %w", err)
+				}
+				return nil
+			},
+			wantRequeuedPods: sets.New("pod2"),
+			enableSchedulingQueueHint: []bool{true},
+		},
 		{
 			name: "Updating pod condition doesn't retry scheduling if the Pod was rejected by TaintToleration",
 			initialNodes: []*v1.Node{st.MakeNode().Name("fake-node").Taints([]v1.Taint{{Key: v1.TaintNodeNotReady, Effect: v1.TaintEffectNoSchedule}}).Obj()},