DRA scheduler: fix incorrect allocation of "all" devices

The code which pre-determined the set of "all" devices when using
"allocationMode: all" accidentally ignored the selector of the device class.
As a result, allocation worked correctly only when a node had only devices
matching the intended device class. When there were additional devices, things
went wrong:
 - Unrelated devices were allocated for a request.
 - Claim allocation failed completely.
This commit is contained in:
Patrick Ohly 2024-09-23 15:26:15 +02:00
parent e456fbfaa6
commit 1a34d4840b
2 changed files with 41 additions and 2 deletions

View File

@@ -169,9 +169,13 @@ func (a *Allocator) Allocate(ctx context.Context, node *v1.Node) (finalResult []
return nil, fmt.Errorf("claim %s, request %s: could not retrieve device class %s: %w", klog.KObj(claim), request.Name, request.DeviceClassName, err)
}
// Start collecting information about the request.
// The class must be set and stored before calling isSelectable.
requestData := requestData{
class: class,
}
requestKey := requestIndices{claimIndex: claimIndex, requestIndex: requestIndex}
alloc.requestData[requestKey] = requestData
switch request.AllocationMode {
case resourceapi.DeviceAllocationModeExactCount:
@@ -190,7 +194,7 @@ func (a *Allocator) Allocate(ctx context.Context, node *v1.Node) (finalResult []
for _, slice := range pool.Slices {
for deviceIndex := range slice.Spec.Devices {
selectable, err := alloc.isSelectable(requestIndices{claimIndex: claimIndex, requestIndex: requestIndex}, slice, deviceIndex)
selectable, err := alloc.isSelectable(requestKey, slice, deviceIndex)
if err != nil {
return nil, err
}
@@ -205,7 +209,7 @@ func (a *Allocator) Allocate(ctx context.Context, node *v1.Node) (finalResult []
default:
return nil, fmt.Errorf("claim %s, request %s: unsupported count mode %s", klog.KObj(claim), request.Name, request.AllocationMode)
}
alloc.requestData[requestIndices{claimIndex: claimIndex, requestIndex: requestIndex}] = requestData
alloc.requestData[requestKey] = requestData
numDevices += requestData.numDevices
}
alloc.logger.V(6).Info("Checked claim", "claim", klog.KObj(claim), "numDevices", numDevices)

View File

@@ -619,6 +619,41 @@ func TestAllocator(t *testing.T) {
expectResults: nil,
expectError: gomega.MatchError(gomega.ContainSubstring("claim claim-0, request req-0: asks for all devices, but resource pool driver-a/pool-1 is currently being updated")),
},
"all-devices-plus-another": {
claimsToAllocate: objects(
claimWithRequests(claim0, nil, resourceapi.DeviceRequest{
Name: req0,
AllocationMode: resourceapi.DeviceAllocationModeAll,
DeviceClassName: classA,
}),
claimWithRequests(claim1, nil, resourceapi.DeviceRequest{
Name: req0,
AllocationMode: resourceapi.DeviceAllocationModeExactCount,
Count: 1,
DeviceClassName: classB,
}),
),
classes: objects(
class(classA, driverA),
class(classB, driverB),
),
slices: objects(
sliceWithOneDevice(slice1, node1, pool1, driverA),
sliceWithOneDevice(slice1, node1, pool1, driverB),
),
node: node(node1, region1),
expectResults: []any{
allocationResult(
localNodeSelector(node1),
deviceAllocationResult(req0, driverA, pool1, device1),
),
allocationResult(
localNodeSelector(node1),
deviceAllocationResult(req0, driverB, pool1, device1),
),
},
},
"network-attached-device": {
claimsToAllocate: objects(claim(claim0, req0, classA)),
classes: objects(class(classA, driverA)),