dra test: enhance performance of test driver controller

Analyzing the CPU profile of

    go test -timeout=0 -count=5 -cpuprofile profile.out -bench=BenchmarkPerfScheduling/.*Claim.* -benchtime=1ns -run=xxx ./test/integration/scheduler_perf

showed that a significant amount of time was spent iterating over all allocated
claims just to determine how many were allocated per node. That "naive" approach was
taken to avoid maintaining a redundant data structure, but now that performance
measurements show that it comes at a real cost, introducing such a second field
is no longer "premature optimization".
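
As an illustration of the pattern (a minimal, self-contained sketch; the tracker type
and its method names are made up for this example and are not the driver's API), the
second map trades a little bookkeeping on every allocate/deallocate for an O(1)
per-node count:

    package main

    import "fmt"

    // tracker keeps a per-node counter next to the primary claim->node map so
    // that "how many claims are allocated on node X" is a map lookup instead
    // of a scan over all allocated claims.
    type tracker struct {
        allocated     map[string]string // claim UID -> node name ("" = cluster-wide)
        claimsPerNode map[string]int    // must be kept in sync with allocated
    }

    func newTracker() *tracker {
        return &tracker{
            allocated:     make(map[string]string),
            claimsPerNode: make(map[string]int),
        }
    }

    // allocate records a claim for a node and bumps that node's counter.
    func (t *tracker) allocate(uid, node string) {
        if _, ok := t.allocated[uid]; ok {
            return // already allocated, keep counters unchanged
        }
        t.allocated[uid] = node
        t.claimsPerNode[node]++
    }

    // deallocate removes a claim and decrements the counter of its node.
    func (t *tracker) deallocate(uid string) {
        node, ok := t.allocated[uid]
        if !ok {
            return // already deallocated
        }
        delete(t.allocated, uid)
        t.claimsPerNode[node]--
    }

    func main() {
        t := newTracker()
        t.allocate("claim-1", "node-a")
        t.allocate("claim-2", "node-a")
        // O(1) lookup, no iteration over t.allocated:
        fmt.Println(t.claimsPerNode["node-a"]) // prints 2
    }

The diff below applies the same idea, with the bookkeeping done under the
controller's existing mutex alongside numAllocations/numDeallocations.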

The average scheduling throughput in
SchedulingWithResourceClaimTemplate/2000pods_100nodes increases from 16.4
pods/s to 19.2 pods/s.
Patrick Ohly 2023-08-08 13:36:35 +02:00
parent 99190634ab
commit 0e23840929


@@ -63,6 +63,9 @@ type ExampleController struct {
     mutex sync.Mutex
     // allocated maps claim.UID to the node (if network-attached) or empty (if not).
     allocated map[types.UID]string
+    // claimsPerNode counts how many claims are currently allocated for a node (non-empty key)
+    // or allocated for the entire cluster (empty key). Must be kept in sync with allocated.
+    claimsPerNode map[string]int
     numAllocations, numDeallocations int64
 }
@@ -73,7 +76,8 @@ func NewController(clientset kubernetes.Interface, driverName string, resources
         resources:  resources,
         driverName: driverName,
 
-        allocated: make(map[types.UID]string),
+        allocated:     make(map[types.UID]string),
+        claimsPerNode: make(map[string]int),
     }
     return c
 }
@@ -95,16 +99,6 @@ type parameters struct {
 var _ controller.Driver = &ExampleController{}
 
-func (c *ExampleController) countAllocations(node string) int {
-    total := 0
-    for _, n := range c.allocated {
-        if n == node {
-            total++
-        }
-    }
-    return total
-}
-
 // GetNumAllocations returns the number of times that a claim was allocated.
 // Idempotent calls to Allocate that do not need to allocate the claim again do
 // not contribute to that counter.
@@ -204,7 +198,7 @@ func (c *ExampleController) allocateOne(ctx context.Context, claim *resourcev1al
             var viableNodes []string
             for _, n := range c.resources.Nodes {
                 if c.resources.MaxAllocations == 0 ||
-                    c.countAllocations(n) < c.resources.MaxAllocations {
+                    c.claimsPerNode[n] < c.resources.MaxAllocations {
                     viableNodes = append(viableNodes, n)
                 }
             }
@@ -217,7 +211,7 @@ func (c *ExampleController) allocateOne(ctx context.Context, claim *resourcev1al
             logger.V(3).Info("picked a node ourselves", "selectedNode", selectedNode)
         } else if !contains(c.resources.Nodes, node) ||
             c.resources.MaxAllocations > 0 &&
-                c.countAllocations(node) >= c.resources.MaxAllocations {
+                c.claimsPerNode[node] >= c.resources.MaxAllocations {
             return nil, fmt.Errorf("resources exhausted on node %q", node)
         }
     } else {
@@ -271,6 +265,7 @@ func (c *ExampleController) allocateOne(ctx context.Context, claim *resourcev1al
     if !alreadyAllocated {
         c.numAllocations++
         c.allocated[claim.UID] = node
+        c.claimsPerNode[node]++
     }
     return allocation, nil
 }
@@ -280,7 +275,8 @@ func (c *ExampleController) Deallocate(ctx context.Context, claim *resourcev1alp
     c.mutex.Lock()
     defer c.mutex.Unlock()
-    if _, ok := c.allocated[claim.UID]; !ok {
+    node, ok := c.allocated[claim.UID]
+    if !ok {
         logger.V(3).Info("already deallocated")
         return nil
     }
@@ -288,6 +284,7 @@ func (c *ExampleController) Deallocate(ctx context.Context, claim *resourcev1alp
     logger.V(3).Info("done")
     c.numDeallocations++
     delete(c.allocated, claim.UID)
+    c.claimsPerNode[node]--
     return nil
 }
@@ -307,10 +304,6 @@ func (c *ExampleController) UnsuitableNodes(ctx context.Context, pod *v1.Pod, cl
         return nil
     }
     if c.resources.NodeLocal {
-        allocationsPerNode := make(map[string]int)
-        for _, node := range c.resources.Nodes {
-            allocationsPerNode[node] = c.countAllocations(node)
-        }
         for _, claim := range claims {
             claim.UnsuitableNodes = nil
             for _, node := range potentialNodes {
@@ -320,7 +313,7 @@ func (c *ExampleController) UnsuitableNodes(ctx context.Context, pod *v1.Pod, cl
                 // for all of them. Also, nodes that the driver
                 // doesn't run on cannot be used.
                 if !contains(c.resources.Nodes, node) ||
-                    allocationsPerNode[node]+len(claims) > c.resources.MaxAllocations {
+                    c.claimsPerNode[node]+len(claims) > c.resources.MaxAllocations {
                     claim.UnsuitableNodes = append(claim.UnsuitableNodes, node)
                 }
             }
@@ -328,7 +321,7 @@ func (c *ExampleController) UnsuitableNodes(ctx context.Context, pod *v1.Pod, cl
         return nil
     }
 
-    allocations := c.countAllocations("")
+    allocations := c.claimsPerNode[""]
     for _, claim := range claims {
         claim.UnsuitableNodes = nil
         for _, node := range potentialNodes {