From f0efb8a5fd09bb538bc2018d2a7da65e4d40a2ac Mon Sep 17 00:00:00 2001
From: Patrick Ohly
Date: Wed, 4 Sep 2024 18:55:39 +0200
Subject: [PATCH] DRA scheduler: populate set of allocated devices only once
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The devices which are allocated before starting the allocation always
remain allocated. They can be stored once in a set, then each Filter
call for the different nodes can reuse that set instead of allocating
it anew for each node.

goos: linux
goarch: amd64
pkg: k8s.io/kubernetes/test/integration/scheduler_perf
cpu: Intel(R) Core(TM) i9-7980XE CPU @ 2.60GHz
                                                                                     │            before            │                 after                  │
                                                                                     │ SchedulingThroughput/Average │ SchedulingThroughput/Average  vs base  │
PerfScheduling/SchedulingWithResourceClaimTemplateStructured/5000pods_500nodes-36       36.89 ± 2%    54.70 ± 6%    +48.26% (p=0.002 n=6)
PerfScheduling/SteadyStateClusterResourceClaimTemplateStructured/empty_100nodes-36      105.7 ± 5%    106.4 ± 4%          ~ (p=0.970 n=6)
PerfScheduling/SteadyStateClusterResourceClaimTemplateStructured/empty_500nodes-36      117.8 ± 3%    120.0 ± 4%          ~ (p=0.134 n=6)
PerfScheduling/SteadyStateClusterResourceClaimTemplateStructured/half_100nodes-36       119.5 ± 4%    112.5 ± 4%     -5.86% (p=0.009 n=6)
PerfScheduling/SteadyStateClusterResourceClaimTemplateStructured/half_500nodes-36       63.22 ± 2%    87.13 ± 4%    +37.82% (p=0.002 n=6)
PerfScheduling/SteadyStateClusterResourceClaimTemplateStructured/full_100nodes-36       109.5 ± 2%    113.4 ± 2%     +3.65% (p=0.006 n=6)
PerfScheduling/SteadyStateClusterResourceClaimTemplateStructured/full_500nodes-36       27.56 ± 5%    65.55 ± 3%   +137.84% (p=0.002 n=6)
geomean                                                                                  72.44         90.81        +25.37%
---
 .../structured/allocator.go | 37 ++++++++-----------
 1 file changed, 15 insertions(+), 22 deletions(-)

diff --git a/staging/src/k8s.io/dynamic-resource-allocation/structured/allocator.go b/staging/src/k8s.io/dynamic-resource-allocation/structured/allocator.go
index e12f29aeb3a..51f950fb132 100644
--- a/staging/src/k8s.io/dynamic-resource-allocation/structured/allocator.go
+++ b/staging/src/k8s.io/dynamic-resource-allocation/structured/allocator.go
@@ -43,7 +43,7 @@ import (
 type Allocator struct {
     adminAccessEnabled bool
     claimsToAllocate   []*resourceapi.ResourceClaim
-    allocatedDevices   []DeviceID
+    allocatedDevices   sets.Set[DeviceID]
     classLister        resourcelisters.DeviceClassLister
     slices             []*resourceapi.ResourceSlice
     celCache           *CELCache
@@ -66,11 +66,12 @@ func NewAllocator(ctx context.Context,
     return &Allocator{
         adminAccessEnabled: adminAccessEnabled,
         claimsToAllocate:   claimsToAllocate,
-        allocatedDevices:   allocatedDevices,
-        classLister:        classLister,
-        slices:             slices,
-        celCache:           celCache,
-        celMutex:           keymutex.NewHashed(0),
+        // This won't change, so build this set only once.
+        allocatedDevices: sets.New(allocatedDevices...),
+        classLister:      classLister,
+        slices:           slices,
+        celCache:         celCache,
+        celMutex:         keymutex.NewHashed(0),
     }, nil
 }
@@ -295,18 +296,10 @@ func (a *Allocator) Allocate(ctx context.Context, node *v1.Node) (finalResult []
     // the Allocate call and can be compared in Go.
     alloc.deviceMatchesRequest = make(map[matchKey]bool)
 
-    // Some of the existing devices are probably already allocated by
-    // claims...
-    numAllocated := len(alloc.allocatedDevices)
+    // We can estimate the size based on what we need to allocate.
+    alloc.allocatingDevices = make(map[DeviceID]bool, numDevicesTotal)
 
-    // Pre-allocating the map avoids growing it later. We can estimate
-    // the size based on what is already allocated and what we need to
-    // allocate.
-    alloc.allocated = make(map[DeviceID]bool, numAllocated+numDevicesTotal)
-    for _, deviceID := range alloc.allocatedDevices {
-        alloc.allocated[deviceID] = true
-    }
-    alloc.logger.V(6).Info("Gathered information about allocated devices", "numAllocated", numAllocated)
+    alloc.logger.V(6).Info("Gathered information about devices", "numAllocated", len(alloc.allocatedDevices), "toBeAllocated", numDevicesTotal)
 
     // In practice, there aren't going to be many different CEL
     // expressions. Most likely, there is going to be handful of different
@@ -393,7 +386,7 @@ type allocator struct {
     deviceMatchesRequest map[matchKey]bool
     constraints          [][]constraint                 // one list of constraints per claim
     requestData          map[requestIndices]requestData // one entry per request
-    allocated            map[DeviceID]bool
+    allocatingDevices    map[DeviceID]bool
     result               []internalAllocationResult
 }
@@ -620,7 +613,7 @@ func (alloc *allocator) allocateOne(r deviceIndices) (bool, error) {
             deviceID := DeviceID{Driver: pool.Driver, Pool: pool.Pool, Device: slice.Spec.Devices[deviceIndex].Name}
 
             // Checking for "in use" is cheap and thus gets done first.
-            if !ptr.Deref(request.AdminAccess, false) && alloc.allocated[deviceID] {
+            if !ptr.Deref(request.AdminAccess, false) && (alloc.allocatedDevices.Has(deviceID) || alloc.allocatingDevices[deviceID]) {
                 alloc.logger.V(7).Info("Device in use", "device", deviceID)
                 continue
             }
@@ -775,7 +768,7 @@ func (alloc *allocator) allocateDevice(r deviceIndices, device deviceWithID, mus
     claim := alloc.claimsToAllocate[r.claimIndex]
     request := &claim.Spec.Devices.Requests[r.requestIndex]
     adminAccess := ptr.Deref(request.AdminAccess, false)
-    if !adminAccess && alloc.allocated[device.id] {
+    if !adminAccess && (alloc.allocatedDevices.Has(device.id) || alloc.allocatingDevices[device.id]) {
         alloc.logger.V(7).Info("Device in use", "device", device.id)
         return false, nil, nil
     }
@@ -802,7 +795,7 @@ func (alloc *allocator) allocateDevice(r deviceIndices, device deviceWithID, mus
     // and record the result.
     alloc.logger.V(7).Info("Device allocated", "device", device.id)
     if !adminAccess {
-        alloc.allocated[device.id] = true
+        alloc.allocatingDevices[device.id] = true
     }
     result := internalDeviceResult{
         request: request.Name,
@@ -820,7 +813,7 @@ func (alloc *allocator) allocateDevice(r deviceIndices, device deviceWithID, mus
             constraint.remove(request.Name, device.basic, device.id)
         }
         if !adminAccess {
-            alloc.allocated[device.id] = false
+            alloc.allocatingDevices[device.id] = false
         }
         // Truncate, but keep the underlying slice.
         alloc.result[r.claimIndex].devices = alloc.result[r.claimIndex].devices[:previousNumResults]
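
Illustrative note (not part of the patch): the change above splits "devices that were already allocated before this scheduling attempt" (built once as a set in NewAllocator and shared by every per-node Filter call) from "devices tentatively picked during the current attempt" (a small per-node map). The following standalone Go program is a minimal sketch of that split. The type and function names (deviceID, allocator, newAllocator, filterNode) are simplified stand-ins, not the real identifiers from allocator.go; only sets.New and Set.Has from k8s.io/apimachinery/pkg/util/sets are real API.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/sets"
)

// deviceID is a simplified stand-in for the structured.DeviceID type.
type deviceID struct {
	Driver, Pool, Device string
}

// allocator mirrors the split introduced by the patch:
//   - allocatedDevices: devices already allocated before this attempt,
//     converted into a set exactly once and reused for every node.
//   - allocatingDevices: devices tentatively picked while evaluating the
//     current node, rebuilt cheaply per Filter call.
type allocator struct {
	allocatedDevices  sets.Set[deviceID]
	allocatingDevices map[deviceID]bool
}

// newAllocator builds the set of pre-allocated devices once, instead of
// copying the slice into a fresh map for every node.
func newAllocator(allocated []deviceID) *allocator {
	return &allocator{
		allocatedDevices: sets.New(allocated...),
	}
}

// filterNode is roughly what one per-node Filter pass does: start with an
// empty "allocating" map and consult both sources for the "in use" check.
func (a *allocator) filterNode(candidates []deviceID) []deviceID {
	a.allocatingDevices = make(map[deviceID]bool, len(candidates))
	var picked []deviceID
	for _, id := range candidates {
		if a.allocatedDevices.Has(id) || a.allocatingDevices[id] {
			continue // device already in use, skip it
		}
		a.allocatingDevices[id] = true
		picked = append(picked, id)
	}
	return picked
}

func main() {
	used := []deviceID{{Driver: "driver.example.com", Pool: "pool-a", Device: "gpu-0"}}
	a := newAllocator(used)
	free := a.filterNode([]deviceID{
		{Driver: "driver.example.com", Pool: "pool-a", Device: "gpu-0"},
		{Driver: "driver.example.com", Pool: "pool-a", Device: "gpu-1"},
	})
	fmt.Println(free) // only gpu-1 is still free
}

Because the pre-allocated set never changes during a scheduling attempt, building it once is safe to share across concurrent per-node Filter calls as long as it is only read, which is what motivates the benchmark gains quoted in the commit message.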