k8s.io/dynamic-resource-allocation: fix potential scheduling deadlock

When handling a PodSchedulingContext object, the code first checked for
unsuitable nodes and then tried to allocate if (and only if) the selected node
hadn't been found to be unsuitable.

If for whatever reason the selected node wasn't listed as a potential node, then
scheduling got stuck: the allocation would fail and return an error instead of
updating the list of unsuitable nodes. The operation would then be retried, with
the same result every time.

To avoid this scenario, the selected node now also gets included in the
unsuitable-node check. This is more robust than assuming a certain
kube-scheduler behavior.
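
In essence, the change merges the selected node into the node list before
asking the driver which nodes are unsuitable. A condensed sketch of the
controller hunk further below (hasString is the existing slice-contains
helper):

    selectedNode := schedulingCtx.Spec.SelectedNode
    potentialNodes := schedulingCtx.Spec.PotentialNodes
    if selectedNode != "" && !hasString(potentialNodes, selectedNode) {
        // Ensure the driver also evaluates the selected node, even if
        // the scheduler did not list it among the potential nodes.
        potentialNodes = append(potentialNodes, selectedNode)
    }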

This problem occurred when experimenting with cluster autoscaling:

    spec:
      potentialNodes:
      - gke-cluster-pohly-pool-dra-69b88e1e-bz6c
      - gke-cluster-pohly-pool-dra-69b88e1e-fpvh
      selectedNode: gke-cluster-pohly-default-pool-c9f60a43-6kxh

Why the scheduler wrote a spec like this is unclear. This was with Kubernetes
1.27 and the code has been updated since then, so perhaps the problem is
already resolved.
Patrick Ohly 2023-09-25 15:19:45 +02:00
parent 4eb6b3907a
commit 0ba37e70b6
2 changed files with 90 additions and 6 deletions


@@ -122,6 +122,12 @@ type Driver interface {
// can be allocated for it (for example, two GPUs requested but
// the node only has one).
//
// The potentialNodes slice contains all potential nodes selected
// by the scheduler plus the selected node. The response must
// not contain any other nodes. Implementations do not have to
// care about size limits in the PodSchedulingContext status, the
// caller will handle that.
//
// The result of the check is in ClaimAllocation.UnsuitableNodes.
// An error indicates that the entire check must be repeated.
UnsuitableNodes(ctx context.Context, pod *v1.Pod, claims []*ClaimAllocation, potentialNodes []string) error
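
For illustration, a driver honoring this contract might look roughly like the
following sketch. The exampleDriver type and its gpusPerNode inventory are
hypothetical stand-ins for whatever resource bookkeeping a real driver keeps,
and for simplicity it assumes every claim needs a single GPU and ignores the
claim parameters; only the result in ClaimAllocation.UnsuitableNodes matters to
the caller:

    // Hypothetical driver; only the inventory needed for this sketch.
    type exampleDriver struct {
        gpusPerNode map[string]int // free GPUs per node, maintained elsewhere
    }

    func (d *exampleDriver) UnsuitableNodes(ctx context.Context, pod *v1.Pod, claims []*ClaimAllocation, potentialNodes []string) error {
        for _, claim := range claims {
            // Report only nodes from potentialNodes; start from a clean slate.
            claim.UnsuitableNodes = nil
            for _, node := range potentialNodes {
                if d.gpusPerNode[node] < 1 {
                    claim.UnsuitableNodes = append(claim.UnsuitableNodes, node)
                }
            }
            // No truncation needed here: the caller enforces the
            // PodSchedulingContext status size limit.
        }
        return nil
    }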
@@ -752,12 +758,20 @@ func (ctrl *controller) syncPodSchedulingContexts(ctx context.Context, schedulin
// and shouldn't, because those allocations might have to be undone to
// pick a better node. If we don't need to allocate now, then we'll
// simply report back the gathered information.
//
// We shouldn't assume that the scheduler has included the selected node
// in the list of potential nodes. Usually it does, but let's make sure
// that we check it.
selectedNode := schedulingCtx.Spec.SelectedNode
potentialNodes := schedulingCtx.Spec.PotentialNodes
if selectedNode != "" && !hasString(potentialNodes, selectedNode) {
potentialNodes = append(potentialNodes, selectedNode)
}
if len(schedulingCtx.Spec.PotentialNodes) > 0 {
if err := ctrl.driver.UnsuitableNodes(ctx, pod, claims, schedulingCtx.Spec.PotentialNodes); err != nil {
if err := ctrl.driver.UnsuitableNodes(ctx, pod, claims, potentialNodes); err != nil {
return fmt.Errorf("checking potential nodes: %v", err)
}
}
selectedNode := schedulingCtx.Spec.SelectedNode
logger.V(5).Info("pending pod claims", "claims", claims, "selectedNode", selectedNode)
if selectedNode != "" {
unsuitable := false
@@ -811,12 +825,12 @@ func (ctrl *controller) syncPodSchedulingContexts(ctx context.Context, schedulin
schedulingCtx.Status.ResourceClaims = append(schedulingCtx.Status.ResourceClaims,
resourcev1alpha2.ResourceClaimSchedulingStatus{
Name: delayed.PodClaimName,
UnsuitableNodes: delayed.UnsuitableNodes,
UnsuitableNodes: truncateNodes(delayed.UnsuitableNodes, selectedNode),
})
modified = true
} else if stringsDiffer(schedulingCtx.Status.ResourceClaims[i].UnsuitableNodes, delayed.UnsuitableNodes) {
// Update existing entry.
schedulingCtx.Status.ResourceClaims[i].UnsuitableNodes = delayed.UnsuitableNodes
schedulingCtx.Status.ResourceClaims[i].UnsuitableNodes = truncateNodes(delayed.UnsuitableNodes, selectedNode)
modified = true
}
}
@@ -832,6 +846,23 @@ func (ctrl *controller) syncPodSchedulingContexts(ctx context.Context, schedulin
return errPeriodic
}
func truncateNodes(nodes []string, selectedNode string) []string {
// We might have checked "potential nodes + selected node" above, so
// this list might be too long by one element. When truncating it, make
// sure that the selected node is listed.
lenUnsuitable := len(nodes)
if lenUnsuitable > resourcev1alpha2.PodSchedulingNodeListMaxSize {
if nodes[0] == selectedNode {
// Truncate at the end and keep selected node in the first element.
nodes = nodes[0 : lenUnsuitable-1]
} else {
// Truncate at the front, it's not the selected node.
nodes = nodes[1:lenUnsuitable]
}
}
return nodes
}
type claimAllocations []*ClaimAllocation
// MarshalLog replaces the pointers with the actual structs because


@@ -19,6 +19,7 @@ package controller
import (
"context"
"errors"
"fmt"
"testing"
"github.com/stretchr/testify/assert"
@@ -64,6 +65,10 @@ func TestController(t *testing.T) {
otherNodeName := "worker-2"
unsuitableNodes := []string{otherNodeName}
potentialNodes := []string{nodeName, otherNodeName}
maxNodes := make([]string, resourcev1alpha2.PodSchedulingNodeListMaxSize)
for i := range maxNodes {
maxNodes[i] = fmt.Sprintf("node-%d", i)
}
withDeletionTimestamp := func(claim *resourcev1alpha2.ResourceClaim) *resourcev1alpha2.ResourceClaim {
var deleted metav1.Time
claim = claim.DeepCopy()
@@ -101,18 +106,24 @@ func TestController(t *testing.T) {
podSchedulingCtx.Spec.SelectedNode = nodeName
return podSchedulingCtx
}
withUnsuitableNodes := func(podSchedulingCtx *resourcev1alpha2.PodSchedulingContext) *resourcev1alpha2.PodSchedulingContext {
withSpecificUnsuitableNodes := func(podSchedulingCtx *resourcev1alpha2.PodSchedulingContext, unsuitableNodes []string) *resourcev1alpha2.PodSchedulingContext {
podSchedulingCtx = podSchedulingCtx.DeepCopy()
podSchedulingCtx.Status.ResourceClaims = append(podSchedulingCtx.Status.ResourceClaims,
resourcev1alpha2.ResourceClaimSchedulingStatus{Name: podClaimName, UnsuitableNodes: unsuitableNodes},
)
return podSchedulingCtx
}
withPotentialNodes := func(podSchedulingCtx *resourcev1alpha2.PodSchedulingContext) *resourcev1alpha2.PodSchedulingContext {
withUnsuitableNodes := func(podSchedulingCtx *resourcev1alpha2.PodSchedulingContext) *resourcev1alpha2.PodSchedulingContext {
return withSpecificUnsuitableNodes(podSchedulingCtx, unsuitableNodes)
}
withSpecificPotentialNodes := func(podSchedulingCtx *resourcev1alpha2.PodSchedulingContext, potentialNodes []string) *resourcev1alpha2.PodSchedulingContext {
podSchedulingCtx = podSchedulingCtx.DeepCopy()
podSchedulingCtx.Spec.PotentialNodes = potentialNodes
return podSchedulingCtx
}
withPotentialNodes := func(podSchedulingCtx *resourcev1alpha2.PodSchedulingContext) *resourcev1alpha2.PodSchedulingContext {
return withSpecificPotentialNodes(podSchedulingCtx, potentialNodes)
}
var m mockDriver
@@ -376,6 +387,48 @@ func TestController(t *testing.T) {
expectedSchedulingCtx: withUnsuitableNodes(withSelectedNode(withPotentialNodes(podSchedulingCtx))),
expectedError: errPeriodic.Error(),
},
// pod with delayed allocation, potential nodes, selected node, all unsuitable -> update unsuitable nodes
"pod-selected-is-potential-node": {
key: podKey,
classes: classes,
claim: delayedClaim,
expectedClaim: delayedClaim,
pod: podWithClaim,
schedulingCtx: withPotentialNodes(withSelectedNode(withPotentialNodes(podSchedulingCtx))),
driver: m.expectClassParameters(map[string]interface{}{className: 1}).
expectClaimParameters(map[string]interface{}{claimName: 2}).
expectUnsuitableNodes(map[string][]string{podClaimName: potentialNodes}, nil),
expectedSchedulingCtx: withSpecificUnsuitableNodes(withSelectedNode(withPotentialNodes(podSchedulingCtx)), potentialNodes),
expectedError: errPeriodic.Error(),
},
// pod with delayed allocation, max potential nodes, other selected node, all unsuitable -> update unsuitable nodes with truncation at start
"pod-selected-is-potential-node-truncate-first": {
key: podKey,
classes: classes,
claim: delayedClaim,
expectedClaim: delayedClaim,
pod: podWithClaim,
schedulingCtx: withSpecificPotentialNodes(withSelectedNode(withSpecificPotentialNodes(podSchedulingCtx, maxNodes)), maxNodes),
driver: m.expectClassParameters(map[string]interface{}{className: 1}).
expectClaimParameters(map[string]interface{}{claimName: 2}).
expectUnsuitableNodes(map[string][]string{podClaimName: append(maxNodes, nodeName)}, nil),
expectedSchedulingCtx: withSpecificUnsuitableNodes(withSelectedNode(withSpecificPotentialNodes(podSchedulingCtx, maxNodes)), append(maxNodes[1:], nodeName)),
expectedError: errPeriodic.Error(),
},
// pod with delayed allocation, max potential nodes, other selected node, all unsuitable (but in reverse order) -> update unsuitable nodes with truncation at end
"pod-selected-is-potential-node-truncate-last": {
key: podKey,
classes: classes,
claim: delayedClaim,
expectedClaim: delayedClaim,
pod: podWithClaim,
schedulingCtx: withSpecificPotentialNodes(withSelectedNode(withSpecificPotentialNodes(podSchedulingCtx, maxNodes)), maxNodes),
driver: m.expectClassParameters(map[string]interface{}{className: 1}).
expectClaimParameters(map[string]interface{}{claimName: 2}).
expectUnsuitableNodes(map[string][]string{podClaimName: append([]string{nodeName}, maxNodes...)}, nil),
expectedSchedulingCtx: withSpecificUnsuitableNodes(withSelectedNode(withSpecificPotentialNodes(podSchedulingCtx, maxNodes)), append([]string{nodeName}, maxNodes[:len(maxNodes)-1]...)),
expectedError: errPeriodic.Error(),
},
} {
t.Run(name, func(t *testing.T) {
_, ctx := ktesting.NewTestContext(t)