Merge pull request #124548 from pohly/dra-scheduler-perf-structured-parameters

scheduler_perf: add DRA structured parameters test with shared claims
commit 3b90ae4f58 by Kubernetes Prow Robot, 2024-06-18 02:15:58 -07:00, committed by GitHub
9 changed files with 236 additions and 24 deletions


@@ -0,0 +1,16 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: test-dra-{{.Index}}
+spec:
+  containers:
+  - image: registry.k8s.io/pause:3.9
+    name: pause
+    resources:
+      claims:
+      - name: resource
+  resourceClaims:
+  - name: resource
+    source:
+      # Five pods share access to the same claim.
+      resourceClaimName: test-claim-{{div .Index 5}}
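
The template above is rendered once per pod with `.Index` set to the pod's number; `div` (integer division) maps five consecutive pod indices to the same claim name. A minimal, self-contained sketch of that mechanism (the `div` helper mirrors the one registered by getSpecFromTextTemplateFile later in this commit; the demo itself is not part of the PR):

```go
package main

import (
	"os"
	"text/template"
)

func main() {
	// "div" is plain integer division, as in the scheduler_perf helper.
	fm := template.FuncMap{"div": func(a, b int) int { return a / b }}
	tmpl := template.Must(template.New("claim").Funcs(fm).
		Parse("pod test-dra-{{.Index}} -> test-claim-{{div .Index 5}}\n"))
	// Pods 0-4 map to test-claim-0, pods 5-9 to test-claim-1, and so on.
	for i := 0; i < 10; i++ {
		_ = tmpl.Execute(os.Stdout, map[string]any{"Index": i})
	}
}
```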


@@ -0,0 +1,10 @@
+apiVersion: resource.k8s.io/v1alpha2
+kind: ResourceClaim
+metadata:
+  name: test-claim-{{.Index}}
+spec:
+  resourceClassName: test-class
+  parametersRef:
+    apiGroup: resource.k8s.io
+    kind: ResourceClaimParameters
+    name: test-claim-parameters


@@ -0,0 +1,6 @@
+apiVersion: resource.k8s.io/v1alpha2
+kind: ResourceClaim
+metadata:
+  name: test-claim-{{.Index}}
+spec:
+  resourceClassName: test-class


@@ -2,6 +2,7 @@ apiVersion: resource.k8s.io/v1alpha2
 kind: ResourceClaimParameters
 metadata:
   name: test-claim-parameters
+shareable: true
 driverRequests:
 - driverName: test-driver.cdi.k8s.io
   requests:


@@ -746,6 +746,7 @@
 - name: SchedulingWithResourceClaimTemplate
   featureGates:
     DynamicResourceAllocation: true
+    # SchedulerQueueingHints: true
   workloadTemplate:
   - opcode: createNodes
     countParam: $nodesWithoutDRA
@@ -812,6 +813,7 @@
 - name: SchedulingWithMultipleResourceClaims
   featureGates:
     DynamicResourceAllocation: true
+    # SchedulerQueueingHints: true
   workloadTemplate:
   - opcode: createNodes
     countParam: $nodesWithoutDRA
@@ -887,6 +889,7 @@
 - name: SchedulingWithResourceClaimTemplateStructured
   featureGates:
     DynamicResourceAllocation: true
+    # SchedulerQueueingHints: true
   workloadTemplate:
   - opcode: createNodes
     countParam: $nodesWithoutDRA
@@ -935,8 +938,6 @@
   - name: 2000pods_100nodes
     labels: [performance, fast]
     params:
-      # In this testcase, the number of nodes is smaller
-      # than the limit for the PodScheduling slices.
       nodesWithDRA: 100
       nodesWithoutDRA: 0
       initPods: 1000
@@ -944,11 +945,102 @@
       maxClaimsPerNode: 20
   - name: 2000pods_200nodes
     params:
-      # In this testcase, the driver and scheduler must
-      # truncate the PotentialNodes and UnsuitableNodes
-      # slices.
       nodesWithDRA: 200
       nodesWithoutDRA: 0
       initPods: 1000
       measurePods: 1000
       maxClaimsPerNode: 10
+  - name: 5000pods_500nodes
+    params:
+      nodesWithDRA: 500
+      nodesWithoutDRA: 0
+      initPods: 2500
+      measurePods: 2500
+      maxClaimsPerNode: 10
+
+# SchedulingWithResourceClaimStructured uses ResourceClaims
+# with deterministic names that are shared between pods.
+# There is a fixed ratio of 1:5 between claims and pods.
+#
+# The driver uses structured parameters.
+- name: SchedulingWithResourceClaimStructured
+  featureGates:
+    DynamicResourceAllocation: true
+    # SchedulerQueueingHints: true
+  workloadTemplate:
+  - opcode: createNodes
+    countParam: $nodesWithoutDRA
+  - opcode: createNodes
+    nodeTemplatePath: config/dra/node-with-dra-test-driver.yaml
+    countParam: $nodesWithDRA
+  - opcode: createResourceDriver
+    driverName: test-driver.cdi.k8s.io
+    nodes: scheduler-perf-dra-*
+    maxClaimsPerNodeParam: $maxClaimsPerNode
+    structuredParameters: true
+  - opcode: createAny
+    templatePath: config/dra/resourceclass-structured.yaml
+  - opcode: createAny
+    templatePath: config/dra/resourceclaimparameters.yaml
+    namespace: init
+  - opcode: createAny
+    templatePath: config/dra/resourceclaim-structured.yaml
+    namespace: init
+    countParam: $initClaims
+  - opcode: createPods
+    namespace: init
+    countParam: $initPods
+    podTemplatePath: config/dra/pod-with-claim-ref.yaml
+  - opcode: createAny
+    templatePath: config/dra/resourceclaimparameters.yaml
+    namespace: test
+  - opcode: createAny
+    templatePath: config/dra/resourceclaim-structured.yaml
+    namespace: test
+    countParam: $measureClaims
+  - opcode: createPods
+    namespace: test
+    countParam: $measurePods
+    podTemplatePath: config/dra/pod-with-claim-ref.yaml
+    collectMetrics: true
+  workloads:
+  - name: fast
+    labels: [integration-test, fast]
+    params:
+      # This testcase runs through all code paths without
+      # taking too long overall.
+      nodesWithDRA: 1
+      nodesWithoutDRA: 1
+      initPods: 0
+      initClaims: 0
+      measurePods: 10
+      measureClaims: 2 # must be measurePods / 5
+      maxClaimsPerNode: 2
+  - name: 2000pods_100nodes
+    labels: [performance, fast]
+    params:
+      nodesWithDRA: 100
+      nodesWithoutDRA: 0
+      initPods: 1000
+      initClaims: 200 # must be initPods / 5
+      measurePods: 1000
+      measureClaims: 200 # must be measurePods / 5
+      maxClaimsPerNode: 4
+  - name: 2000pods_200nodes
+    params:
+      nodesWithDRA: 200
+      nodesWithoutDRA: 0
+      initPods: 1000
+      initClaims: 200 # must be initPods / 5
+      measurePods: 1000
+      measureClaims: 200 # must be measurePods / 5
+      maxClaimsPerNode: 2
+  - name: 5000pods_500nodes
+    params:
+      nodesWithDRA: 500
+      nodesWithoutDRA: 0
+      initPods: 2500
+      initClaims: 500 # must be initPods / 5
+      measurePods: 2500
+      measureClaims: 500 # must be measurePods / 5
+      maxClaimsPerNode: 2
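
Each workload is expected to keep the documented 1:5 claim-to-pod ratio, since every rendered claim is referenced by exactly five pods. A quick sketch checking that invariant against the parameters above (hypothetical helper, not part of the PR; the data is copied from the workloads):

```go
package main

import "fmt"

type workloadParams struct {
	name                       string
	initPods, initClaims       int
	measurePods, measureClaims int
}

func main() {
	workloads := []workloadParams{
		{"fast", 0, 0, 10, 2},
		{"2000pods_100nodes", 1000, 200, 1000, 200},
		{"2000pods_200nodes", 1000, 200, 1000, 200},
		{"5000pods_500nodes", 2500, 500, 2500, 500},
	}
	for _, w := range workloads {
		// Five pods per shared claim: claims must be pods / 5.
		if w.initClaims*5 != w.initPods || w.measureClaims*5 != w.measurePods {
			fmt.Printf("%s: claim counts break the 1:5 claim:pod ratio\n", w.name)
		}
	}
}
```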


@@ -17,8 +17,11 @@ limitations under the License.
 package benchmark

 import (
+	"bytes"
 	"context"
 	"fmt"
+	"os"
+	"text/template"
 	"time"

 	apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -30,6 +33,8 @@ import (
 	"k8s.io/client-go/restmapper"
 	"k8s.io/klog/v2"
 	"k8s.io/kubernetes/test/utils/ktesting"
+	"k8s.io/utils/ptr"
+	"sigs.k8s.io/yaml"
 )

 // createAny defines an op where some object gets created from a YAML file.
@@ -40,7 +45,13 @@ type createAny struct {
 	// Namespace the object should be created in. Must be empty for cluster-scoped objects.
 	Namespace string
 	// Path to spec file describing the object to create.
+	// This will be processed with text/template.
+	// .Index will be in the range [0, Count-1] when creating
+	// more than one object. .Count is the total number of objects.
 	TemplatePath string
+	// Count determines how many objects get created. Defaults to 1 if unset.
+	Count      *int
+	CountParam string
 }

 var _ runnableOp = &createAny{}
@@ -61,8 +72,15 @@ func (c *createAny) collectsMetrics() bool {
 	return false
 }

-func (c *createAny) patchParams(w *workload) (realOp, error) {
-	return c, c.isValid(false)
+func (c createAny) patchParams(w *workload) (realOp, error) {
+	if c.CountParam != "" {
+		count, err := w.Params.get(c.CountParam[1:])
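+		// CountParam is written with a leading "$" in the config
+		// (e.g. countParam: $initClaims), so strip it before the lookup.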
+		if err != nil {
+			return nil, err
+		}
+		c.Count = ptr.To(count)
+	}
+	return &c, c.isValid(false)
 }

 func (c *createAny) requiredNamespaces() []string {
@@ -73,8 +91,18 @@ func (c *createAny) requiredNamespaces() []string {
 }

 func (c *createAny) run(tCtx ktesting.TContext) {
+	count := 1
+	if c.Count != nil {
+		count = *c.Count
+	}
+	for index := 0; index < count; index++ {
+		c.create(tCtx, map[string]any{"Index": index, "Count": count})
+	}
+}
+
+func (c *createAny) create(tCtx ktesting.TContext, env map[string]any) {
 	var obj *unstructured.Unstructured
-	if err := getSpecFromFile(&c.TemplatePath, &obj); err != nil {
+	if err := getSpecFromTextTemplateFile(c.TemplatePath, env, &obj); err != nil {
 		tCtx.Fatalf("%s: parsing failed: %v", c.TemplatePath, err)
 	}
@@ -143,3 +171,23 @@ func (c *createAny) run(tCtx ktesting.TContext) {
 		}
 	}
 }
+
+func getSpecFromTextTemplateFile(path string, env map[string]any, spec interface{}) error {
+	content, err := os.ReadFile(path)
+	if err != nil {
+		return err
+	}
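+	// "div" does integer division so that templates can derive shared
+	// names, e.g. test-claim-{{div .Index 5}} in pod-with-claim-ref.yaml.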
+	fm := template.FuncMap{"div": func(a, b int) int {
+		return a / b
+	}}
+	tmpl, err := template.New("object").Funcs(fm).Parse(string(content))
+	if err != nil {
+		return err
+	}
+	var buffer bytes.Buffer
+	if err := tmpl.Execute(&buffer, env); err != nil {
+		return err
+	}
+	return yaml.UnmarshalStrict(buffer.Bytes(), spec)
+}
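
For reference, a self-contained sketch of the render-then-strict-unmarshal pattern that getSpecFromTextTemplateFile uses (standalone demo with an assumed inline manifest; not part of the PR):

```go
package main

import (
	"bytes"
	"fmt"
	"text/template"

	"sigs.k8s.io/yaml"
)

func main() {
	// Render the template with the same env keys the ops pass in.
	const manifest = "metadata:\n  name: test-claim-{{.Index}}\n"
	tmpl := template.Must(template.New("object").Parse(manifest))
	var buf bytes.Buffer
	if err := tmpl.Execute(&buf, map[string]any{"Index": 7, "Count": 10}); err != nil {
		panic(err)
	}
	// UnmarshalStrict rejects YAML that does not cleanly fit the target type.
	var obj map[string]any
	if err := yaml.UnmarshalStrict(buf.Bytes(), &obj); err != nil {
		panic(err)
	}
	fmt.Println(obj) // map[metadata:map[name:test-claim-7]]
}
```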


@@ -177,6 +177,7 @@ func (op *createResourceDriverOp) run(tCtx ktesting.TContext) {
 		DriverName:     op.DriverName,
 		NodeLocal:      true,
 		MaxAllocations: op.MaxClaimsPerNode,
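+		// Claims must be shareable: with the shared-claim templates,
+		// up to five pods reference the same claim.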
+		Shareable:      true,
 	}

 	nodes, err := tCtx.Client().CoreV1().Nodes().List(tCtx, metav1.ListOptions{})

@@ -1432,16 +1432,12 @@ func validateTestCases(testCases []*testCase) error {
 }

 func getPodStrategy(cpo *createPodsOp) (testutils.TestPodCreateStrategy, error) {
-	basePod := makeBasePod()
+	podTemplate := testutils.StaticPodTemplate(makeBasePod())
 	if cpo.PodTemplatePath != nil {
-		var err error
-		basePod, err = getPodSpecFromFile(cpo.PodTemplatePath)
-		if err != nil {
-			return nil, err
-		}
+		podTemplate = podTemplateFromFile(*cpo.PodTemplatePath)
 	}
 	if cpo.PersistentVolumeClaimTemplatePath == nil {
-		return testutils.NewCustomCreatePodStrategy(basePod), nil
+		return testutils.NewCustomCreatePodStrategy(podTemplate), nil
 	}
 	pvTemplate, err := getPersistentVolumeSpecFromFile(cpo.PersistentVolumeTemplatePath)
@@ -1452,7 +1448,7 @@ func getPodStrategy(cpo *createPodsOp) (testutils.TestPodCreateStrategy, error)
 	if err != nil {
 		return nil, err
 	}
-	return testutils.NewCreatePodWithPersistentVolumeStrategy(pvcTemplate, getCustomVolumeFactory(pvTemplate), basePod), nil
+	return testutils.NewCreatePodWithPersistentVolumeStrategy(pvcTemplate, getCustomVolumeFactory(pvTemplate), podTemplate), nil
 }
func getNodeSpecFromFile(path *string) (*v1.Node, error) {
@@ -1463,9 +1459,11 @@ func getNodeSpecFromFile(path *string) (*v1.Node, error) {
 	return nodeSpec, nil
 }

-func getPodSpecFromFile(path *string) (*v1.Pod, error) {
+type podTemplateFromFile string
+
+func (f podTemplateFromFile) GetPodTemplate(index, count int) (*v1.Pod, error) {
 	podSpec := &v1.Pod{}
-	if err := getSpecFromFile(path, podSpec); err != nil {
+	if err := getSpecFromTextTemplateFile(string(f), map[string]any{"Index": index, "Count": count}, podSpec); err != nil {
 		return nil, fmt.Errorf("parsing Pod: %w", err)
 	}
 	return podSpec, nil


@@ -1235,14 +1235,22 @@ func makeCreatePod(client clientset.Interface, namespace string, podTemplate *v1
 	return nil
 }

-func CreatePod(ctx context.Context, client clientset.Interface, namespace string, podCount int, podTemplate *v1.Pod) error {
+func CreatePod(ctx context.Context, client clientset.Interface, namespace string, podCount int, podTemplate PodTemplate) error {
 	var createError error
 	lock := sync.Mutex{}
 	createPodFunc := func(i int) {
+		pod, err := podTemplate.GetPodTemplate(i, podCount)
+		if err != nil {
+			lock.Lock()
+			defer lock.Unlock()
+			createError = err
+			return
+		}
+		pod = pod.DeepCopy()
 		// client-go writes into the object that is passed to Create,
 		// causing a data race unless we create a new copy for each
 		// parallel call.
-		if err := makeCreatePod(client, namespace, podTemplate.DeepCopy()); err != nil {
+		if err := makeCreatePod(client, namespace, pod); err != nil {
 			lock.Lock()
 			defer lock.Unlock()
 			createError = err
@@ -1257,7 +1265,7 @@ func CreatePod(ctx context.Context, client clientset.Interface, namespace string
 	return createError
 }

-func CreatePodWithPersistentVolume(ctx context.Context, client clientset.Interface, namespace string, claimTemplate *v1.PersistentVolumeClaim, factory volumeFactory, podTemplate *v1.Pod, count int, bindVolume bool) error {
+func CreatePodWithPersistentVolume(ctx context.Context, client clientset.Interface, namespace string, claimTemplate *v1.PersistentVolumeClaim, factory volumeFactory, podTemplate PodTemplate, count int, bindVolume bool) error {
 	var createError error
 	lock := sync.Mutex{}
 	createPodFunc := func(i int) {
@@ -1318,7 +1326,14 @@ func CreatePodWithPersistentVolume(ctx context.Context, client clientset.Interfa
 		}

 		// pod
-		pod := podTemplate.DeepCopy()
+		pod, err := podTemplate.GetPodTemplate(i, count)
+		if err != nil {
+			lock.Lock()
+			defer lock.Unlock()
+			createError = fmt.Errorf("error getting pod template: %s", err)
+			return
+		}
+		pod = pod.DeepCopy()
 		pod.Spec.Volumes = []v1.Volume{
 			{
 				Name: "vol",
@@ -1345,7 +1360,7 @@ func CreatePodWithPersistentVolume(ctx context.Context, client clientset.Interfa
 	return createError
 }

-func NewCustomCreatePodStrategy(podTemplate *v1.Pod) TestPodCreateStrategy {
+func NewCustomCreatePodStrategy(podTemplate PodTemplate) TestPodCreateStrategy {
 	return func(ctx context.Context, client clientset.Interface, namespace string, podCount int) error {
 		return CreatePod(ctx, client, namespace, podCount, podTemplate)
 	}
@@ -1354,7 +1369,32 @@ func NewCustomCreatePodStrategy(podTemplate *v1.Pod) TestPodCreateStrategy {
 // volumeFactory creates an unique PersistentVolume for given integer.
 type volumeFactory func(uniqueID int) *v1.PersistentVolume

-func NewCreatePodWithPersistentVolumeStrategy(claimTemplate *v1.PersistentVolumeClaim, factory volumeFactory, podTemplate *v1.Pod) TestPodCreateStrategy {
+// PodTemplate is responsible for creating a v1.Pod instance that is ready
+// to be sent to the API server.
+type PodTemplate interface {
+	// GetPodTemplate returns a pod template for one out of many different pods.
+	// Pods with numbers in the range [index, index+count-1] will be created
+	// based on what GetPodTemplate returns. It gets called multiple times
+	// with a fixed index and increasing count parameters. This number can,
+	// but doesn't have to, be used to modify parts of the pod spec,
+	// for example a named reference to some other object.
+	GetPodTemplate(index, count int) (*v1.Pod, error)
+}
+
+// StaticPodTemplate returns an implementation of PodTemplate for a fixed pod that is the same regardless of the index.
+func StaticPodTemplate(pod *v1.Pod) PodTemplate {
+	return (*staticPodTemplate)(pod)
+}
+
+type staticPodTemplate v1.Pod
+
+// GetPodTemplate implements [PodTemplate.GetPodTemplate] by returning the same pod
+// for each call.
+func (s *staticPodTemplate) GetPodTemplate(index, count int) (*v1.Pod, error) {
+	return (*v1.Pod)(s), nil
+}
+
+func NewCreatePodWithPersistentVolumeStrategy(claimTemplate *v1.PersistentVolumeClaim, factory volumeFactory, podTemplate PodTemplate) TestPodCreateStrategy {
 	return func(ctx context.Context, client clientset.Interface, namespace string, podCount int) error {
 		return CreatePodWithPersistentVolume(ctx, client, namespace, claimTemplate, factory, podTemplate, podCount, true /* bindVolume */)
 	}
 }
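
With this interface, any type with a GetPodTemplate(index, count) method can drive pod creation. As an illustration, a hypothetical implementation (indexedPodTemplate is an assumed name, not part of this change) that derives each pod's name from its index, similar to what podTemplateFromFile achieves with {{.Index}}:

```go
package testutils

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// indexedPodTemplate appends the pod's index to the base pod's name,
// mirroring what {{.Index}} templating does for file-based templates.
type indexedPodTemplate struct {
	base *v1.Pod
}

// GetPodTemplate returns a copy of the base pod with an index-derived name.
func (t *indexedPodTemplate) GetPodTemplate(index, count int) (*v1.Pod, error) {
	pod := t.base.DeepCopy()
	pod.Name = fmt.Sprintf("%s-%d", pod.Name, index)
	return pod, nil
}
```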