dra test: enhance performance of test driver controller

Analyzing the CPU profile of

    go test -timeout=0 -count=5 -cpuprofile profile.out -bench=BenchmarkPerfScheduling/.*Claim.* -benchtime=1ns -run=xxx ./test/integration/scheduler_perf

showed that a significant amount of time was spent iterating over all allocated
claims just to determine how many were allocated per node. That "naive" approach
had been chosen to avoid maintaining a redundant data structure, but now that the
profile shows what it costs, introducing such a second field is no longer
"premature optimization".

The average scheduling throughput in
SchedulingWithResourceClaimTemplate/2000pods_100nodes increases from 16.4
pods/s to 19.2 pods/s.
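
A rough sketch of the idea (illustrative only; the tracker type and its
methods below are made-up names, not the controller's actual fields): keep a
per-node counter in sync with the allocation map, so that "how many claims
are allocated on this node?" becomes an O(1) map lookup instead of an O(n)
scan over all allocations.

    package main

    import "fmt"

    // tracker is a simplified stand-in for the test driver's bookkeeping:
    // perNode is updated on every allocate/deallocate so that counting
    // claims on a node no longer requires iterating over all allocations.
    type tracker struct {
        allocated map[string]string // claim UID -> node name ("" = cluster-scoped)
        perNode   map[string]int    // node name -> number of allocated claims
    }

    func newTracker() *tracker {
        return &tracker{
            allocated: make(map[string]string),
            perNode:   make(map[string]int),
        }
    }

    func (t *tracker) allocate(uid, node string) {
        if _, ok := t.allocated[uid]; ok {
            return // idempotent: already allocated
        }
        t.allocated[uid] = node
        t.perNode[node]++
    }

    func (t *tracker) deallocate(uid string) {
        node, ok := t.allocated[uid]
        if !ok {
            return
        }
        delete(t.allocated, uid)
        t.perNode[node]--
    }

    func main() {
        t := newTracker()
        t.allocate("claim-1", "node-a")
        t.allocate("claim-2", "node-a")
        fmt.Println(t.perNode["node-a"]) // 2, without scanning allocated
    }
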
Patrick Ohly 2023-08-08 13:36:35 +02:00
parent 99190634ab
commit 0e23840929


@@ -63,6 +63,9 @@ type ExampleController struct {
 	mutex sync.Mutex
 	// allocated maps claim.UID to the node (if network-attached) or empty (if not).
 	allocated map[types.UID]string
+	// claimsPerNode counts how many claims are currently allocated for a node (non-empty key)
+	// or allocated for the entire cluster (empty key). Must be kept in sync with allocated.
+	claimsPerNode map[string]int
 
 	numAllocations, numDeallocations int64
 }
@@ -73,7 +76,8 @@ func NewController(clientset kubernetes.Interface, driverName string, resources
 		resources:  resources,
 		driverName: driverName,
 		allocated:  make(map[types.UID]string),
+		claimsPerNode: make(map[string]int),
 	}
 	return c
 }
@@ -95,16 +99,6 @@ type parameters struct {
 var _ controller.Driver = &ExampleController{}
 
-func (c *ExampleController) countAllocations(node string) int {
-	total := 0
-	for _, n := range c.allocated {
-		if n == node {
-			total++
-		}
-	}
-	return total
-}
-
 // GetNumAllocations returns the number of times that a claim was allocated.
 // Idempotent calls to Allocate that do not need to allocate the claim again do
 // not contribute to that counter.
@@ -204,7 +198,7 @@ func (c *ExampleController) allocateOne(ctx context.Context, claim *resourcev1al
 		var viableNodes []string
 		for _, n := range c.resources.Nodes {
 			if c.resources.MaxAllocations == 0 ||
-				c.countAllocations(n) < c.resources.MaxAllocations {
+				c.claimsPerNode[n] < c.resources.MaxAllocations {
 				viableNodes = append(viableNodes, n)
 			}
 		}
@@ -217,7 +211,7 @@ func (c *ExampleController) allocateOne(ctx context.Context, claim *resourcev1al
 			logger.V(3).Info("picked a node ourselves", "selectedNode", selectedNode)
 		} else if !contains(c.resources.Nodes, node) ||
 			c.resources.MaxAllocations > 0 &&
-				c.countAllocations(node) >= c.resources.MaxAllocations {
+				c.claimsPerNode[node] >= c.resources.MaxAllocations {
 			return nil, fmt.Errorf("resources exhausted on node %q", node)
 		}
 	} else {
@@ -271,6 +265,7 @@ func (c *ExampleController) allocateOne(ctx context.Context, claim *resourcev1al
 	if !alreadyAllocated {
 		c.numAllocations++
 		c.allocated[claim.UID] = node
+		c.claimsPerNode[node]++
 	}
 	return allocation, nil
 }
@@ -280,7 +275,8 @@ func (c *ExampleController) Deallocate(ctx context.Context, claim *resourcev1alp
 	c.mutex.Lock()
 	defer c.mutex.Unlock()
 
-	if _, ok := c.allocated[claim.UID]; !ok {
+	node, ok := c.allocated[claim.UID]
+	if !ok {
 		logger.V(3).Info("already deallocated")
 		return nil
 	}
@@ -288,6 +284,7 @@ func (c *ExampleController) Deallocate(ctx context.Context, claim *resourcev1alp
 	logger.V(3).Info("done")
 	c.numDeallocations++
 	delete(c.allocated, claim.UID)
+	c.claimsPerNode[node]--
 
 	return nil
 }
@@ -307,10 +304,6 @@ func (c *ExampleController) UnsuitableNodes(ctx context.Context, pod *v1.Pod, cl
 		return nil
 	}
 	if c.resources.NodeLocal {
-		allocationsPerNode := make(map[string]int)
-		for _, node := range c.resources.Nodes {
-			allocationsPerNode[node] = c.countAllocations(node)
-		}
 		for _, claim := range claims {
 			claim.UnsuitableNodes = nil
 			for _, node := range potentialNodes {
@@ -320,7 +313,7 @@ func (c *ExampleController) UnsuitableNodes(ctx context.Context, pod *v1.Pod, cl
 				// for all of them. Also, nodes that the driver
 				// doesn't run on cannot be used.
 				if !contains(c.resources.Nodes, node) ||
-					allocationsPerNode[node]+len(claims) > c.resources.MaxAllocations {
+					c.claimsPerNode[node]+len(claims) > c.resources.MaxAllocations {
 					claim.UnsuitableNodes = append(claim.UnsuitableNodes, node)
 				}
 			}
@@ -328,7 +321,7 @@ func (c *ExampleController) UnsuitableNodes(ctx context.Context, pod *v1.Pod, cl
 		return nil
 	}
 
-	allocations := c.countAllocations("")
+	allocations := c.claimsPerNode[""]
 	for _, claim := range claims {
 		claim.UnsuitableNodes = nil
 		for _, node := range potentialNodes {