From d2ab00cf55b8b2d20b77148c7c6e2a54460812ef Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Mon, 23 Sep 2024 16:00:47 +0200 Subject: [PATCH] DRA scheduler: fix aborting of allocation with mode "all" The internal "stop allocation" error was sometimes erroneously (pun intended) returned as result of the allocation and then shown to users. No error and no results should have been returned instead, which then is shown as "allocation not possible". Aborting allocation early is only correct if the device which cannot be allocated is one of the "all" devices which were requested. --- .../structured/allocator.go | 20 +- .../structured/allocator_test.go | 181 +++++++++++++++++- 2 files changed, 192 insertions(+), 9 deletions(-) diff --git a/staging/src/k8s.io/dynamic-resource-allocation/structured/allocator.go b/staging/src/k8s.io/dynamic-resource-allocation/structured/allocator.go index 7d4d3a374fb..5430ec0a08e 100644 --- a/staging/src/k8s.io/dynamic-resource-allocation/structured/allocator.go +++ b/staging/src/k8s.io/dynamic-resource-allocation/structured/allocator.go @@ -294,10 +294,13 @@ func (a *Allocator) Allocate(ctx context.Context, node *v1.Node) (finalResult [] // All errors get created such that they can be returned by Allocate // without further wrapping. done, err := alloc.allocateOne(deviceIndices{}) + if errors.Is(err, errStop) { + return nil, nil + } if err != nil { return nil, err } - if errors.Is(err, errStop) || !done { + if !done { return nil, nil } @@ -538,20 +541,23 @@ func (alloc *allocator) allocateOne(r deviceIndices) (bool, error) { // For "all" devices we already know which ones we need. We // just need to check whether we can use them. deviceWithID := requestData.allDevices[r.deviceIndex] - _, _, err := alloc.allocateDevice(r, deviceWithID.device, deviceWithID.DeviceID, true) + success, _, err := alloc.allocateDevice(r, deviceWithID.device, deviceWithID.DeviceID, true) if err != nil { return false, err } + if !success { + // The order in which we allocate "all" devices doesn't matter, + // so we only try with the one which was up next. If we couldn't + // get all of them, then there is no solution and we have to stop. + return false, errStop + } done, err := alloc.allocateOne(deviceIndices{claimIndex: r.claimIndex, requestIndex: r.requestIndex, deviceIndex: r.deviceIndex + 1}) if err != nil { return false, err } - - // The order in which we allocate "all" devices doesn't matter, - // so we only try with the one which was up next. If we couldn't - // get all of them, then there is no solution and we have to stop. if !done { - return false, errStop + // Backtrack. + return false, nil } return done, nil } diff --git a/staging/src/k8s.io/dynamic-resource-allocation/structured/allocator_test.go b/staging/src/k8s.io/dynamic-resource-allocation/structured/allocator_test.go index cf123b8289a..2acc0865307 100644 --- a/staging/src/k8s.io/dynamic-resource-allocation/structured/allocator_test.go +++ b/staging/src/k8s.io/dynamic-resource-allocation/structured/allocator_test.go @@ -582,11 +582,10 @@ func TestAllocator(t *testing.T) { expectResults: nil, }, - "all-devices": { + "all-devices-single": { claimsToAllocate: objects(claimWithRequests(claim0, nil, resourceapi.DeviceRequest{ Name: req0, AllocationMode: resourceapi.DeviceAllocationModeAll, - Count: 1, DeviceClassName: classA, })), classes: objects(class(classA, driverA)), @@ -598,6 +597,25 @@ func TestAllocator(t *testing.T) { deviceAllocationResult(req0, driverA, pool1, device1), )}, }, + "all-devices-many": { + claimsToAllocate: objects(claimWithRequests(claim0, nil, resourceapi.DeviceRequest{ + Name: req0, + AllocationMode: resourceapi.DeviceAllocationModeAll, + DeviceClassName: classA, + })), + classes: objects(class(classA, driverA)), + slices: objects( + sliceWithOneDevice(slice1, node1, pool1, driverA), + sliceWithOneDevice(slice1, node1, pool2, driverA), + ), + node: node(node1, region1), + + expectResults: []any{allocationResult( + localNodeSelector(node1), + deviceAllocationResult(req0, driverA, pool1, device1), + deviceAllocationResult(req0, driverA, pool2, device1), + )}, + }, "all-devices-of-the-incomplete-pool": { claimsToAllocate: objects(claimWithRequests(claim0, nil, resourceapi.DeviceRequest{ Name: req0, @@ -654,6 +672,165 @@ func TestAllocator(t *testing.T) { ), }, }, + "all-devices-plus-another-reversed": { + claimsToAllocate: objects( + claimWithRequests(claim1, nil, resourceapi.DeviceRequest{ + Name: req0, + AllocationMode: resourceapi.DeviceAllocationModeExactCount, + Count: 1, + DeviceClassName: classB, + }), + claimWithRequests(claim0, nil, resourceapi.DeviceRequest{ + Name: req0, + AllocationMode: resourceapi.DeviceAllocationModeAll, + DeviceClassName: classA, + }), + ), + classes: objects( + class(classA, driverA), + class(classB, driverB), + ), + slices: objects( + sliceWithOneDevice(slice1, node1, pool1, driverA), + sliceWithOneDevice(slice1, node1, pool1, driverB), + ), + node: node(node1, region1), + + expectResults: []any{ + allocationResult( + localNodeSelector(node1), + deviceAllocationResult(req0, driverB, pool1, device1), + ), + allocationResult( + localNodeSelector(node1), + deviceAllocationResult(req0, driverA, pool1, device1), + ), + }, + }, + "all-devices-many-plus-another": { + claimsToAllocate: objects( + claimWithRequests(claim0, nil, resourceapi.DeviceRequest{ + Name: req0, + AllocationMode: resourceapi.DeviceAllocationModeAll, + DeviceClassName: classA, + }), + claimWithRequests(claim1, nil, resourceapi.DeviceRequest{ + Name: req0, + AllocationMode: resourceapi.DeviceAllocationModeExactCount, + Count: 1, + DeviceClassName: classB, + }), + ), + classes: objects( + class(classA, driverA), + class(classB, driverB), + ), + slices: objects( + slice(slice1, node1, pool1, driverA, + device(device1, nil, nil), + device(device2, nil, nil), + ), + sliceWithOneDevice(slice1, node1, pool1, driverB), + ), + node: node(node1, region1), + + expectResults: []any{ + allocationResult( + localNodeSelector(node1), + deviceAllocationResult(req0, driverA, pool1, device1), + deviceAllocationResult(req0, driverA, pool1, device2), + ), + allocationResult( + localNodeSelector(node1), + deviceAllocationResult(req0, driverB, pool1, device1), + ), + }, + }, + "all-devices-many-plus-another-reversed": { + claimsToAllocate: objects( + claimWithRequests(claim1, nil, resourceapi.DeviceRequest{ + Name: req0, + AllocationMode: resourceapi.DeviceAllocationModeExactCount, + Count: 1, + DeviceClassName: classB, + }), + claimWithRequests(claim0, nil, resourceapi.DeviceRequest{ + Name: req0, + AllocationMode: resourceapi.DeviceAllocationModeAll, + DeviceClassName: classA, + }), + ), + classes: objects( + class(classA, driverA), + class(classB, driverB), + ), + slices: objects( + slice(slice1, node1, pool1, driverA, + device(device1, nil, nil), + device(device2, nil, nil), + ), + sliceWithOneDevice(slice1, node1, pool1, driverB), + ), + node: node(node1, region1), + + expectResults: []any{ + allocationResult( + localNodeSelector(node1), + deviceAllocationResult(req0, driverB, pool1, device1), + ), + allocationResult( + localNodeSelector(node1), + deviceAllocationResult(req0, driverA, pool1, device1), + deviceAllocationResult(req0, driverA, pool1, device2), + ), + }, + }, + "all-devices-no-solution": { + // One device, two claims both trying to allocate it. + claimsToAllocate: objects( + claimWithRequests(claim1, nil, resourceapi.DeviceRequest{ + Name: req0, + AllocationMode: resourceapi.DeviceAllocationModeExactCount, + Count: 1, + DeviceClassName: classA, + }), + claimWithRequests(claim0, nil, resourceapi.DeviceRequest{ + Name: req0, + AllocationMode: resourceapi.DeviceAllocationModeAll, + DeviceClassName: classA, + }), + ), + classes: objects( + class(classA, driverA), + ), + slices: objects( + sliceWithOneDevice(slice1, node1, pool1, driverA), + ), + node: node(node1, region1), + }, + "all-devices-no-solution-reversed": { + // One device, two claims both trying to allocate it. + claimsToAllocate: objects( + claimWithRequests(claim0, nil, resourceapi.DeviceRequest{ + Name: req0, + AllocationMode: resourceapi.DeviceAllocationModeAll, + DeviceClassName: classA, + }), + claimWithRequests(claim1, nil, resourceapi.DeviceRequest{ + Name: req0, + AllocationMode: resourceapi.DeviceAllocationModeExactCount, + Count: 1, + DeviceClassName: classA, + }), + ), + classes: objects( + class(classA, driverA), + ), + slices: objects( + sliceWithOneDevice(slice1, node1, pool1, driverA), + ), + node: node(node1, region1), + }, "network-attached-device": { claimsToAllocate: objects(claim(claim0, req0, classA)), classes: objects(class(classA, driverA)),