DRA scheduler: fix aborting of allocation with mode "all"

The internal "stop allocation" error was sometimes erroneously (pun intended)
returned as result of the allocation and then shown to users. No error and no
results should have been returned instead, which then is shown as "allocation
not possible".

Aborting allocation early is only correct if the device which cannot be
allocated is one of the "all" devices which were requested.
This commit is contained in:
Patrick Ohly 2024-09-23 16:00:47 +02:00
parent 1a34d4840b
commit d2ab00cf55
2 changed files with 192 additions and 9 deletions

View File

@ -294,10 +294,13 @@ func (a *Allocator) Allocate(ctx context.Context, node *v1.Node) (finalResult []
// All errors get created such that they can be returned by Allocate
// without further wrapping.
done, err := alloc.allocateOne(deviceIndices{})
if errors.Is(err, errStop) {
return nil, nil
}
if err != nil {
return nil, err
}
if errors.Is(err, errStop) || !done {
if !done {
return nil, nil
}
@ -538,20 +541,23 @@ func (alloc *allocator) allocateOne(r deviceIndices) (bool, error) {
// For "all" devices we already know which ones we need. We
// just need to check whether we can use them.
deviceWithID := requestData.allDevices[r.deviceIndex]
_, _, err := alloc.allocateDevice(r, deviceWithID.device, deviceWithID.DeviceID, true)
success, _, err := alloc.allocateDevice(r, deviceWithID.device, deviceWithID.DeviceID, true)
if err != nil {
return false, err
}
if !success {
// The order in which we allocate "all" devices doesn't matter,
// so we only try with the one which was up next. If we couldn't
// get all of them, then there is no solution and we have to stop.
return false, errStop
}
done, err := alloc.allocateOne(deviceIndices{claimIndex: r.claimIndex, requestIndex: r.requestIndex, deviceIndex: r.deviceIndex + 1})
if err != nil {
return false, err
}
// The order in which we allocate "all" devices doesn't matter,
// so we only try with the one which was up next. If we couldn't
// get all of them, then there is no solution and we have to stop.
if !done {
return false, errStop
// Backtrack.
return false, nil
}
return done, nil
}

View File

@ -582,11 +582,10 @@ func TestAllocator(t *testing.T) {
expectResults: nil,
},
"all-devices": {
"all-devices-single": {
claimsToAllocate: objects(claimWithRequests(claim0, nil, resourceapi.DeviceRequest{
Name: req0,
AllocationMode: resourceapi.DeviceAllocationModeAll,
Count: 1,
DeviceClassName: classA,
})),
classes: objects(class(classA, driverA)),
@ -598,6 +597,25 @@ func TestAllocator(t *testing.T) {
deviceAllocationResult(req0, driverA, pool1, device1),
)},
},
"all-devices-many": {
claimsToAllocate: objects(claimWithRequests(claim0, nil, resourceapi.DeviceRequest{
Name: req0,
AllocationMode: resourceapi.DeviceAllocationModeAll,
DeviceClassName: classA,
})),
classes: objects(class(classA, driverA)),
slices: objects(
sliceWithOneDevice(slice1, node1, pool1, driverA),
sliceWithOneDevice(slice1, node1, pool2, driverA),
),
node: node(node1, region1),
expectResults: []any{allocationResult(
localNodeSelector(node1),
deviceAllocationResult(req0, driverA, pool1, device1),
deviceAllocationResult(req0, driverA, pool2, device1),
)},
},
"all-devices-of-the-incomplete-pool": {
claimsToAllocate: objects(claimWithRequests(claim0, nil, resourceapi.DeviceRequest{
Name: req0,
@ -654,6 +672,165 @@ func TestAllocator(t *testing.T) {
),
},
},
"all-devices-plus-another-reversed": {
claimsToAllocate: objects(
claimWithRequests(claim1, nil, resourceapi.DeviceRequest{
Name: req0,
AllocationMode: resourceapi.DeviceAllocationModeExactCount,
Count: 1,
DeviceClassName: classB,
}),
claimWithRequests(claim0, nil, resourceapi.DeviceRequest{
Name: req0,
AllocationMode: resourceapi.DeviceAllocationModeAll,
DeviceClassName: classA,
}),
),
classes: objects(
class(classA, driverA),
class(classB, driverB),
),
slices: objects(
sliceWithOneDevice(slice1, node1, pool1, driverA),
sliceWithOneDevice(slice1, node1, pool1, driverB),
),
node: node(node1, region1),
expectResults: []any{
allocationResult(
localNodeSelector(node1),
deviceAllocationResult(req0, driverB, pool1, device1),
),
allocationResult(
localNodeSelector(node1),
deviceAllocationResult(req0, driverA, pool1, device1),
),
},
},
"all-devices-many-plus-another": {
claimsToAllocate: objects(
claimWithRequests(claim0, nil, resourceapi.DeviceRequest{
Name: req0,
AllocationMode: resourceapi.DeviceAllocationModeAll,
DeviceClassName: classA,
}),
claimWithRequests(claim1, nil, resourceapi.DeviceRequest{
Name: req0,
AllocationMode: resourceapi.DeviceAllocationModeExactCount,
Count: 1,
DeviceClassName: classB,
}),
),
classes: objects(
class(classA, driverA),
class(classB, driverB),
),
slices: objects(
slice(slice1, node1, pool1, driverA,
device(device1, nil, nil),
device(device2, nil, nil),
),
sliceWithOneDevice(slice1, node1, pool1, driverB),
),
node: node(node1, region1),
expectResults: []any{
allocationResult(
localNodeSelector(node1),
deviceAllocationResult(req0, driverA, pool1, device1),
deviceAllocationResult(req0, driverA, pool1, device2),
),
allocationResult(
localNodeSelector(node1),
deviceAllocationResult(req0, driverB, pool1, device1),
),
},
},
"all-devices-many-plus-another-reversed": {
claimsToAllocate: objects(
claimWithRequests(claim1, nil, resourceapi.DeviceRequest{
Name: req0,
AllocationMode: resourceapi.DeviceAllocationModeExactCount,
Count: 1,
DeviceClassName: classB,
}),
claimWithRequests(claim0, nil, resourceapi.DeviceRequest{
Name: req0,
AllocationMode: resourceapi.DeviceAllocationModeAll,
DeviceClassName: classA,
}),
),
classes: objects(
class(classA, driverA),
class(classB, driverB),
),
slices: objects(
slice(slice1, node1, pool1, driverA,
device(device1, nil, nil),
device(device2, nil, nil),
),
sliceWithOneDevice(slice1, node1, pool1, driverB),
),
node: node(node1, region1),
expectResults: []any{
allocationResult(
localNodeSelector(node1),
deviceAllocationResult(req0, driverB, pool1, device1),
),
allocationResult(
localNodeSelector(node1),
deviceAllocationResult(req0, driverA, pool1, device1),
deviceAllocationResult(req0, driverA, pool1, device2),
),
},
},
"all-devices-no-solution": {
// One device, two claims both trying to allocate it.
claimsToAllocate: objects(
claimWithRequests(claim1, nil, resourceapi.DeviceRequest{
Name: req0,
AllocationMode: resourceapi.DeviceAllocationModeExactCount,
Count: 1,
DeviceClassName: classA,
}),
claimWithRequests(claim0, nil, resourceapi.DeviceRequest{
Name: req0,
AllocationMode: resourceapi.DeviceAllocationModeAll,
DeviceClassName: classA,
}),
),
classes: objects(
class(classA, driverA),
),
slices: objects(
sliceWithOneDevice(slice1, node1, pool1, driverA),
),
node: node(node1, region1),
},
"all-devices-no-solution-reversed": {
// One device, two claims both trying to allocate it.
claimsToAllocate: objects(
claimWithRequests(claim0, nil, resourceapi.DeviceRequest{
Name: req0,
AllocationMode: resourceapi.DeviceAllocationModeAll,
DeviceClassName: classA,
}),
claimWithRequests(claim1, nil, resourceapi.DeviceRequest{
Name: req0,
AllocationMode: resourceapi.DeviceAllocationModeExactCount,
Count: 1,
DeviceClassName: classA,
}),
),
classes: objects(
class(classA, driverA),
),
slices: objects(
sliceWithOneDevice(slice1, node1, pool1, driverA),
),
node: node(node1, region1),
},
"network-attached-device": {
claimsToAllocate: objects(claim(claim0, req0, classA)),
classes: objects(class(classA, driverA)),