Merge pull request #93252 from adtac/scheduler-perf
scheduler_perf: refactor to allow arbitrary workloads
Commit ff1d6e8c1d
@@ -58,7 +58,7 @@ func NewIntegrationTestNodePreparerWithNodeSpec(client clientset.Interface, coun
}

// PrepareNodes prepares countToStrategy test nodes.
func (p *IntegrationTestNodePreparer) PrepareNodes() error {
func (p *IntegrationTestNodePreparer) PrepareNodes(nextNodeIndex int) error {
    numNodes := 0
    for _, v := range p.countToStrategy {
        numNodes += v.Count
@@ -103,11 +103,9 @@ func (p *IntegrationTestNodePreparer) PrepareNodes() error {
    if err != nil {
        klog.Fatalf("Error listing nodes: %v", err)
    }
    index := 0
    sum := 0
    index := nextNodeIndex
    for _, v := range p.countToStrategy {
        sum += v.Count
        for ; index < sum; index++ {
        for i := 0; i < v.Count; i, index = i+1, index+1 {
            if err := testutils.DoPrepareNode(p.client, &nodes.Items[index], v.Strategy); err != nil {
                klog.Errorf("Aborting node preparation: %v", err)
                return err
@@ -119,14 +117,18 @@ func (p *IntegrationTestNodePreparer) PrepareNodes() error {

// CleanupNodes deletes existing test nodes.
func (p *IntegrationTestNodePreparer) CleanupNodes() error {
    // TODO(#93794): make CleanupNodes only clean up the nodes created by this
    // IntegrationTestNodePreparer to make this more intuitive.
    nodes, err := GetReadySchedulableNodes(p.client)
    if err != nil {
        klog.Fatalf("Error listing nodes: %v", err)
    }
    var errRet error
    for i := range nodes.Items {
        if err := p.client.CoreV1().Nodes().Delete(context.TODO(), nodes.Items[i].Name, metav1.DeleteOptions{}); err != nil {
            klog.Errorf("Error while deleting Node: %v", err)
            errRet = err
        }
    }
    return nil
    return errRet
}
test/integration/scheduler_perf/.gitignore (vendored, new file, 1 line)
@@ -0,0 +1 @@
BenchmarkPerfScheduling_*.json
@@ -48,6 +48,7 @@ go_test(
        "//staging/src/k8s.io/api/storage/v1beta1:go_default_library",
        "//staging/src/k8s.io/apimachinery/pkg/api/resource:go_default_library",
        "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
        "//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
        "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library",
        "//staging/src/k8s.io/client-go/informers/core/v1:go_default_library",
        "//staging/src/k8s.io/client-go/kubernetes:go_default_library",
@@ -1,264 +1,426 @@
- template:
    desc: SchedulingBasic
    initPods:
      - podTemplatePath: config/pod-default.yaml
    podsToSchedule:
      podTemplatePath: config/pod-default.yaml
  params:
    - numNodes: 500
      numInitPods: [500]
      numPodsToSchedule: 1000
    - numNodes: 5000
      numInitPods: [5000]
      numPodsToSchedule: 1000
- template:
    desc: SchedulingPodAntiAffinity
    nodes:
      uniqueNodeLabelStrategy:
        labelKey: kubernetes.io/hostname
    initPods:
      - podTemplatePath: config/pod-with-pod-anti-affinity.yaml
    podsToSchedule:
      podTemplatePath: config/pod-with-pod-anti-affinity.yaml
  params:
    - numNodes: 500
      numInitPods: [100]
      numPodsToSchedule: 400
    - numNodes: 5000
      numInitPods: [1000]
      numPodsToSchedule: 1000
- template:
    desc: SchedulingSecrets
    initPods:
      - podTemplatePath: config/pod-with-secret-volume.yaml
    podsToSchedule:
      podTemplatePath: config/pod-with-secret-volume.yaml
  params:
    - numNodes: 500
      numInitPods: [500]
      numPodsToSchedule: 1000
    - numNodes: 5000
      numInitPods: [5000]
      numPodsToSchedule: 1000
- template:
    desc: SchedulingInTreePVs
    initPods:
      - persistentVolumeTemplatePath: config/pv-aws.yaml
        persistentVolumeClaimTemplatePath: config/pvc.yaml
    podsToSchedule:
      persistentVolumeTemplatePath: config/pv-aws.yaml
      persistentVolumeClaimTemplatePath: config/pvc.yaml
  params:
    - numNodes: 500
      numInitPods: [500]
      numPodsToSchedule: 1000
    - numNodes: 5000
      numInitPods: [5000]
      numPodsToSchedule: 1000
- template:
    desc: SchedulingMigratedInTreePVs
    nodes:
      nodeTemplatePath: config/node-default.yaml
      nodeAllocatableStrategy:
        nodeAllocatable:
          attachable-volumes-csi-ebs.csi.aws.com: 39
        csiNodeAllocatable:
          ebs.csi.aws.com:
            count: 39
        migratedPlugins:
          - "kubernetes.io/aws-ebs"
    initPods:
      - persistentVolumeTemplatePath: config/pv-aws.yaml
        persistentVolumeClaimTemplatePath: config/pvc.yaml
    podsToSchedule:
      persistentVolumeTemplatePath: config/pv-aws.yaml
      persistentVolumeClaimTemplatePath: config/pvc.yaml
    featureGates:
      CSIMigration: true
      CSIMigrationAWS: true
  params:
    - numNodes: 500
      numInitPods: [500]
      numPodsToSchedule: 1000
    - numNodes: 5000
      numInitPods: [5000]
      numPodsToSchedule: 1000
- template:
    desc: SchedulingCSIPVs
    nodes:
      nodeTemplatePath: config/node-default.yaml
      nodeAllocatableStrategy:
        nodeAllocatable:
          attachable-volumes-csi-ebs.csi.aws.com: 39
        csiNodeAllocatable:
          ebs.csi.aws.com:
            count: 39
    initPods:
      - persistentVolumeTemplatePath: config/pv-csi.yaml
        persistentVolumeClaimTemplatePath: config/pvc.yaml
    podsToSchedule:
      persistentVolumeTemplatePath: config/pv-csi.yaml
      persistentVolumeClaimTemplatePath: config/pvc.yaml
  params:
    - numNodes: 500
      numInitPods: [500]
      numPodsToSchedule: 1000
    - numNodes: 5000
      numInitPods: [5000]
      numPodsToSchedule: 1000
- template:
    desc: SchedulingPodAffinity
    nodes:
      nodeTemplatePath: config/node-default.yaml
      labelNodePrepareStrategy:
        labelKey: "failure-domain.beta.kubernetes.io/zone"
        labelValues: ["zone1"]
    initPods:
      - podTemplatePath: config/pod-with-pod-affinity.yaml
    podsToSchedule:
      podTemplatePath: config/pod-with-pod-affinity.yaml
  params:
    - numNodes: 500
      numInitPods: [500]
      numPodsToSchedule: 1000
    - numNodes: 5000
      numInitPods: [5000]
      numPodsToSchedule: 1000
- template:
    desc: SchedulingPreferredPodAffinity
    nodes:
      uniqueNodeLabelStrategy:
        labelKey: kubernetes.io/hostname
    initPods:
      - podTemplatePath: config/pod-with-preferred-pod-affinity.yaml
    podsToSchedule:
      podTemplatePath: config/pod-with-preferred-pod-affinity.yaml
  params:
    - numNodes: 500
      numInitPods: [500]
      numPodsToSchedule: 1000
    - numNodes: 5000
      numInitPods: [5000]
      numPodsToSchedule: 1000
- template:
    desc: SchedulingPreferredPodAntiAffinity
    nodes:
      uniqueNodeLabelStrategy:
        labelKey: kubernetes.io/hostname
    initPods:
      - podTemplatePath: config/pod-with-preferred-pod-anti-affinity.yaml
    podsToSchedule:
      podTemplatePath: config/pod-with-preferred-pod-anti-affinity.yaml
  params:
    - numNodes: 500
      numInitPods: [500]
      numPodsToSchedule: 1000
    - numNodes: 5000
      numInitPods: [5000]
      numPodsToSchedule: 1000
- template:
    desc: SchedulingNodeAffinity
    nodes:
      nodeTemplatePath: config/node-default.yaml
      labelNodePrepareStrategy:
        labelKey: "failure-domain.beta.kubernetes.io/zone"
        labelValues: ["zone1"]
    initPods:
      - podTemplatePath: config/pod-with-node-affinity.yaml
    podsToSchedule:
      podTemplatePath: config/pod-with-node-affinity.yaml
  params:
    - numNodes: 500
      numInitPods: [500]
      numPodsToSchedule: 1000
    - numNodes: 5000
      numInitPods: [5000]
      numPodsToSchedule: 1000
- template:
    desc: TopologySpreading
    nodes:
      nodeTemplatePath: config/node-default.yaml
      labelNodePrepareStrategy:
        labelKey: "topology.kubernetes.io/zone"
        labelValues: ["moon-1", "moon-2", "moon-3"]
    initPods:
      - podTemplatePath: config/pod-default.yaml
    podsToSchedule:
      podTemplatePath: config/pod-with-topology-spreading.yaml
  params:
    - numNodes: 500
      numInitPods: [1000]
      numPodsToSchedule: 1000
    - numNodes: 5000
      numInitPods: [5000]
      numPodsToSchedule: 2000
- template:
    desc: PreferredTopologySpreading
    nodes:
      nodeTemplatePath: config/node-default.yaml
      labelNodePrepareStrategy:
        labelKey: "topology.kubernetes.io/zone"
        labelValues: ["moon-1", "moon-2", "moon-3"]
    initPods:
      - podTemplatePath: config/pod-default.yaml
    podsToSchedule:
      podTemplatePath: config/pod-with-preferred-topology-spreading.yaml
  params:
    - numNodes: 500
      numInitPods: [1000]
      numPodsToSchedule: 1000
    - numNodes: 5000
      numInitPods: [5000]
      numPodsToSchedule: 2000
- template:
    desc: MixedSchedulingBasePod
    nodes:
      nodeTemplatePath: config/node-default.yaml
      labelNodePrepareStrategy:
        labelKey: "topology.kubernetes.io/zone"
        labelValues: ["zone1"]
    initPods:
      - podTemplatePath: config/pod-default.yaml
      - podTemplatePath: config/pod-with-pod-affinity.yaml
      - podTemplatePath: config/pod-with-pod-anti-affinity.yaml
      - podTemplatePath: config/pod-with-preferred-pod-affinity.yaml
      - podTemplatePath: config/pod-with-preferred-pod-anti-affinity.yaml
    podsToSchedule:
      podTemplatePath: config/pod-default.yaml
  params:
    - numNodes: 500
      numInitPods: [200, 200, 200, 200, 200]
      numPodsToSchedule: 1000
    - numNodes: 5000
      numInitPods: [2000, 2000, 2000, 2000, 2000]
      numPodsToSchedule: 1000
- template:
    desc: Preemption
    initPods:
      - podTemplatePath: config/pod-low-priority.yaml
    podsToSchedule:
      podTemplatePath: config/pod-high-priority.yaml
  params:
    - numNodes: 500
      numInitPods: [2000]
      numPodsToSchedule: 500
    - numNodes: 5000
      numInitPods: [20000]
      numPodsToSchedule: 5000
- template:
    desc: Unschedulable
    skipWaitUntilInitPodsScheduled: true
    initPods:
      - podTemplatePath: config/pod-large-cpu.yaml
    podsToSchedule:
      podTemplatePath: config/pod-default.yaml
  params:
    - numNodes: 500
      numInitPods: [200]
      numPodsToSchedule: 1000
    - numNodes: 5000
      numInitPods: [200]
      numPodsToSchedule: 5000
    - numNodes: 5000
      numInitPods: [2000]
      numPodsToSchedule: 5000
- name: SchedulingBasic
  workloadTemplate:
  - opcode: createNodes
    countParam: $initNodes
  - opcode: createPods
    countParam: $initPods
    podTemplatePath: config/pod-default.yaml
  - opcode: createPods
    countParam: $measurePods
    podTemplatePath: config/pod-default.yaml
    collectMetrics: true
  workloads:
  - name: 500Nodes
    params:
      initNodes: 500
      initPods: 500
      measurePods: 1000
  - name: 5000Nodes
    params:
      initNodes: 5000
      initPods: 1000
      measurePods: 1000

- name: SchedulingPodAntiAffinity
  workloadTemplate:
  - opcode: createNodes
    countParam: $initNodes
    uniqueNodeLabelStrategy:
      labelKey: kubernetes.io/hostname
  - opcode: createPods
    countParam: $initPods
    podTemplatePath: config/pod-with-pod-anti-affinity.yaml
    namespace: sched-setup
  - opcode: createPods
    countParam: $measurePods
    podTemplatePath: config/pod-with-pod-anti-affinity.yaml
    collectMetrics: true
    namespace: sched-test
  workloads:
  - name: 500Nodes
    params:
      initNodes: 500
      initPods: 100
      measurePods: 400
  - name: 5000Nodes
    params:
      initNodes: 500
      initPods: 100
      measurePods: 400

- name: SchedulingSecrets
  workloadTemplate:
  - opcode: createNodes
    countParam: $initNodes
  - opcode: createPods
    countParam: $initPods
    podTemplatePath: config/pod-with-secret-volume.yaml
  - opcode: createPods
    countParam: $measurePods
    podTemplatePath: config/pod-with-secret-volume.yaml
    collectMetrics: true
  workloads:
  - name: 500Nodes
    params:
      initNodes: 500
      initPods: 500
      measurePods: 1000
  - name: 5000Nodes
    params:
      initNodes: 5000
      initPods: 5000
      measurePods: 1000

- name: SchedulingInTreePVs
  workloadTemplate:
  - opcode: createNodes
    countParam: $initNodes
  - opcode: createPods
    countParam: $initPods
    persistentVolumeTemplatePath: config/pv-aws.yaml
    persistentVolumeClaimTemplatePath: config/pvc.yaml
  - opcode: createPods
    countParam: $measurePods
    persistentVolumeTemplatePath: config/pv-aws.yaml
    persistentVolumeClaimTemplatePath: config/pvc.yaml
    collectMetrics: true
  workloads:
  - name: 500Nodes
    params:
      initNodes: 500
      initPods: 500
      measurePods: 1000
  - name: 5000Nodes
    params:
      initNodes: 5000
      initPods: 5000
      measurePods: 1000

- name: SchedulingMigratedInTreePVs
  featureGates:
    CSIMigration: true
    CSIMigrationAWS: true
  workloadTemplate:
  - opcode: createNodes
    countParam: $initNodes
    nodeTemplatePath: config/node-default.yaml
    nodeAllocatableStrategy:
      nodeAllocatable:
        attachable-volumes-csi-ebs.csi.aws.com: "39"
      csiNodeAllocatable:
        ebs.csi.aws.com:
          count: 39
      migratedPlugins:
      - "kubernetes.io/aws-ebs"
  - opcode: createPods
    countParam: $initPods
    persistentVolumeTemplatePath: config/pv-aws.yaml
    persistentVolumeClaimTemplatePath: config/pvc.yaml
  - opcode: createPods
    countParam: $measurePods
    persistentVolumeTemplatePath: config/pv-aws.yaml
    persistentVolumeClaimTemplatePath: config/pvc.yaml
    collectMetrics: true
  workloads:
  - name: 500Nodes
    params:
      initNodes: 500
      initPods: 500
      measurePods: 1000
  - name: 5000Nodes
    params:
      initNodes: 5000
      initPods: 5000
      measurePods: 1000

- name: SchedulingCSIPVs
  workloadTemplate:
  - opcode: createNodes
    countParam: $initNodes
    nodeTemplatePath: config/node-default.yaml
    nodeAllocatableStrategy:
      nodeAllocatable:
        attachable-volumes-csi-ebs.csi.aws.com: "39"
      csiNodeAllocatable:
        ebs.csi.aws.com:
          count: 39
  - opcode: createPods
    countParam: $initPods
    persistentVolumeTemplatePath: config/pv-csi.yaml
    persistentVolumeClaimTemplatePath: config/pvc.yaml
  - opcode: createPods
    countParam: $measurePods
    persistentVolumeTemplatePath: config/pv-csi.yaml
    persistentVolumeClaimTemplatePath: config/pvc.yaml
    collectMetrics: true
  workloads:
  - name: 500Nodes
    params:
      initNodes: 500
      initPods: 500
      measurePods: 1000
  - name: 5000Nodes
    params:
      initNodes: 5000
      initPods: 5000
      measurePods: 1000

- name: SchedulingPodAffinity
  workloadTemplate:
  - opcode: createNodes
    countParam: $initNodes
    nodeTemplatePath: config/node-default.yaml
    labelNodePrepareStrategy:
      labelKey: "failure-domain.beta.kubernetes.io/zone"
      labelValues: ["zone1"]
  - opcode: createPods
    countParam: $initPods
    podTemplatePath: config/pod-with-pod-affinity.yaml
    namespace: sched-setup
  - opcode: createPods
    countParam: $measurePods
    podTemplatePath: config/pod-with-pod-affinity.yaml
    namespace: sched-test
    collectMetrics: true
  workloads:
  - name: 500Nodes
    params:
      initNodes: 500
      initPods: 500
      measurePods: 1000
  - name: 5000Nodes
    params:
      initNodes: 5000
      initPods: 5000
      measurePods: 1000

- name: SchedulingPreferredPodAffinity
  workloadTemplate:
  - opcode: createNodes
    countParam: $initNodes
    uniqueNodeLabelStrategy:
      labelKey: kubernetes.io/hostname
  - opcode: createPods
    countParam: $initPods
    podTemplatePath: config/pod-with-preferred-pod-affinity.yaml
    namespace: sched-setup
  - opcode: createPods
    countParam: $measurePods
    podTemplatePath: config/pod-with-preferred-pod-affinity.yaml
    namespace: sched-test
    collectMetrics: true
  workloads:
  - name: 500Nodes
    params:
      initNodes: 500
      initPods: 500
      measurePods: 1000
  - name: 5000Nodes
    params:
      initNodes: 5000
      initPods: 5000
      measurePods: 1000

- name: SchedulingPreferredPodAntiAffinity
  workloadTemplate:
  - opcode: createNodes
    countParam: $initNodes
    uniqueNodeLabelStrategy:
      labelKey: kubernetes.io/hostname
  - opcode: createPods
    countParam: $initPods
    podTemplatePath: config/pod-with-preferred-pod-anti-affinity.yaml
    namespace: sched-setup
  - opcode: createPods
    countParam: $measurePods
    podTemplatePath: config/pod-with-preferred-pod-anti-affinity.yaml
    namespace: sched-test
    collectMetrics: true
  workloads:
  - name: 500Nodes
    params:
      initNodes: 500
      initPods: 500
      measurePods: 1000
  - name: 5000Nodes
    params:
      initNodes: 5000
      initPods: 5000
      measurePods: 1000

- name: SchedulingNodeAffinity
  workloadTemplate:
  - opcode: createNodes
    countParam: $initNodes
    nodeTemplatePath: config/node-default.yaml
    labelNodePrepareStrategy:
      labelKey: "failure-domain.beta.kubernetes.io/zone"
      labelValues: ["zone1"]
  - opcode: createPods
    countParam: $initPods
    podTemplatePath: config/pod-with-node-affinity.yaml
  - opcode: createPods
    countParam: $measurePods
    podTemplatePath: config/pod-with-node-affinity.yaml
    collectMetrics: true
  workloads:
  - name: 500Nodes
    params:
      initNodes: 500
      initPods: 500
      measurePods: 1000
  - name: 5000Nodes
    params:
      initNodes: 5000
      initPods: 5000
      measurePods: 1000

- name: TopologySpreading
  workloadTemplate:
  - opcode: createNodes
    countParam: $initNodes
    nodeTemplatePath: config/node-default.yaml
    labelNodePrepareStrategy:
      labelKey: "topology.kubernetes.io/zone"
      labelValues: ["moon-1", "moon-2", "moon-3"]
  - opcode: createPods
    countParam: $initPods
    podTemplatePath: config/pod-default.yaml
  - opcode: createPods
    countParam: $measurePods
    podTemplatePath: config/pod-with-topology-spreading.yaml
    collectMetrics: true
  workloads:
  - name: 500Nodes
    params:
      initNodes: 500
      initPods: 1000
      measurePods: 1000
  - name: 5000Nodes
    params:
      initNodes: 5000
      initPods: 5000
      measurePods: 2000

- name: PreferredTopologySpreading
  workloadTemplate:
  - opcode: createNodes
    countParam: $initNodes
    nodeTemplatePath: config/node-default.yaml
    labelNodePrepareStrategy:
      labelKey: "topology.kubernetes.io/zone"
      labelValues: ["moon-1", "moon-2", "moon-3"]
  - opcode: createPods
    countParam: $initPods
    podTemplatePath: config/pod-default.yaml
  - opcode: createPods
    countParam: $measurePods
    podTemplatePath: config/pod-with-preferred-topology-spreading.yaml
    collectMetrics: true
  workloads:
  - name: 500Nodes
    params:
      initNodes: 500
      initPods: 1000
      measurePods: 1000
  - name: 5000Nodes
    params:
      initNodes: 5000
      initPods: 5000
      measurePods: 2000

- name: MixedSchedulingBasePod
  workloadTemplate:
  - opcode: createNodes
    countParam: $initNodes
    nodeTemplatePath: config/node-default.yaml
    labelNodePrepareStrategy:
      labelKey: "topology.kubernetes.io/zone"
      labelValues: ["zone1"]
  - opcode: createPods
    countParam: $initPods
    podTemplatePath: config/pod-default.yaml
    namespace: sched-setup
  - opcode: createPods
    countParam: $initPods
    podTemplatePath: config/pod-with-pod-affinity.yaml
    namespace: sched-setup
  - opcode: createPods
    countParam: $initPods
    podTemplatePath: config/pod-with-pod-anti-affinity.yaml
    namespace: sched-setup
  - opcode: createPods
    countParam: $initPods
    podTemplatePath: config/pod-with-preferred-pod-affinity.yaml
    namespace: sched-setup
  - opcode: createPods
    countParam: $initPods
    podTemplatePath: config/pod-with-preferred-pod-anti-affinity.yaml
    namespace: sched-setup
  - opcode: createPods
    countParam: $measurePods
    podTemplatePath: config/pod-default.yaml
    collectMetrics: true
  workloads:
  - name: 500Nodes
    params:
      initNodes: 500
      initPods: 200
      measurePods: 1000
  - name: 5000Nodes
    params:
      initNodes: 5000
      initPods: 2000
      measurePods: 1000

- name: Preemption
  workloadTemplate:
  - opcode: createNodes
    countParam: $initNodes
  - opcode: createPods
    countParam: $initPods
    podTemplatePath: config/pod-low-priority.yaml
  - opcode: createPods
    countParam: $measurePods
    podTemplatePath: config/pod-high-priority.yaml
    collectMetrics: true
  workloads:
  - name: 500Nodes
    params:
      initNodes: 500
      initPods: 2000
      measurePods: 500
  - name: 5000Nodes
    params:
      initNodes: 5000
      initPods: 20000
      measurePods: 5000

- name: Unschedulable
  workloadTemplate:
  - opcode: createNodes
    countParam: $initNodes
  - opcode: createPods
    countParam: $initPods
    podTemplatePath: config/pod-large-cpu.yaml
    skipWaitToCompletion: true
  - opcode: createPods
    countParam: $measurePods
    podTemplatePath: config/pod-default.yaml
    collectMetrics: true
  workloads:
  - name: 500Nodes/200InitPods
    params:
      initNodes: 500
      initPods: 200
      measurePods: 1000
  - name: 5000Nodes/200InitPods
    params:
      initNodes: 5000
      initPods: 200
      measurePods: 5000
  - name: 5000Nodes/2000InitPods
    params:
      initNodes: 5000
      initPods: 2000
      measurePods: 5000
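Editor's note: the config diff above replaces template/params pairs with named test cases whose workloadTemplate is an ordered list of ops, and each workload then binds concrete values to the $-prefixed parameters. Below is a simplified, self-contained Go sketch of that substitution step; the types and the patchCount helper are illustrative only (the PR's real logic is the patchParams methods later in this diff).

package main

import (
    "fmt"
    "strings"
)

// Simplified stand-ins for the harness's createPodsOp and workload types.
type createPodsOp struct {
    Count      int
    CountParam string // e.g. "$measurePods"
}

type workload struct {
    Name   string
    Params map[string]int
}

// patchCount resolves CountParam against the workload's params, mirroring the
// idea behind patchParams in the real test harness.
func patchCount(op createPodsOp, w workload) (createPodsOp, error) {
    if op.CountParam != "" {
        v, ok := w.Params[strings.TrimPrefix(op.CountParam, "$")]
        if !ok {
            return op, fmt.Errorf("parameter %s is undefined", op.CountParam)
        }
        op.Count = v
    }
    return op, nil
}

func main() {
    template := createPodsOp{CountParam: "$measurePods"}
    for _, w := range []workload{
        {Name: "500Nodes", Params: map[string]int{"measurePods": 1000}},
        {Name: "5000Nodes", Params: map[string]int{"measurePods": 2000}},
    } {
        op, err := patchCount(template, w)
        if err != nil {
            panic(err)
        }
        fmt.Printf("%s: create %d pods\n", w.Name, op.Count)
    }
}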
@@ -448,7 +448,7 @@ func benchmarkScheduling(numExistingPods, minPods int,
        clientset,
        nodeStrategies,
        "scheduler-perf-")
    if err := nodePreparer.PrepareNodes(); err != nil {
    if err := nodePreparer.PrepareNodes(0); err != nil {
        klog.Fatalf("%v", err)
    }
    defer nodePreparer.CleanupNodes()
@@ -17,12 +17,17 @@ limitations under the License.
package benchmark

import (
    "context"
    "encoding/json"
    "fmt"
    "io/ioutil"
    "strings"
    "sync"
    "testing"
    "time"

    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/util/wait"
    utilfeature "k8s.io/apiserver/pkg/util/feature"
    coreinformers "k8s.io/client-go/informers/core/v1"
    clientset "k8s.io/client-go/kubernetes"
@@ -35,7 +40,10 @@ import (
)

const (
    configFile = "config/performance-config.yaml"
    configFile        = "config/performance-config.yaml"
    createNodesOpcode = "createNodes"
    createPodsOpcode  = "createPods"
    barrierOpcode     = "barrier"
)

var (
@@ -51,87 +59,248 @@ var (
    }
)

// testCase configures a test case to run the scheduler performance test. Users should be able to
// provide this via a YAML file.
//
// It specifies nodes and pods in the cluster before running the test. It also specifies the pods to
// schedule during the test. The config can be as simple as just specify number of nodes/pods, where
// default spec will be applied. It also allows the user to specify a pod spec template for more
// complicated test cases.
//
// It also specifies the metrics to be collected after the test. If nothing is specified, default metrics
// such as scheduling throughput and latencies will be collected.
// testCase defines a set of test cases that intend to test the performance of
// similar workloads of varying sizes with shared overall settings such as
// feature gates and metrics collected.
type testCase struct {
    // description of the test case
    Desc string
    // configures nodes in the cluster
    Nodes nodeCase
    // configures pods in the cluster before running the tests
    InitPods []podCase
    // configures the test to now wait for init pods to schedule before creating
    // test pods.
    SkipWaitUntilInitPodsScheduled bool
    // pods to be scheduled during the test.
    PodsToSchedule podCase
    // optional, feature gates to set before running the test
    // Name of the testCase.
    Name string
    // Feature gates to set before running the test. Optional.
    FeatureGates map[featuregate.Feature]bool
    // optional, replaces default defaultMetricsCollectorConfig if supplied.
    // List of metrics to collect. Optional, defaults to
    // defaultMetricsCollectorConfig if unspecified.
    MetricsCollectorConfig *metricsCollectorConfig
    // Template for sequence of ops that each workload must follow. Each op will
    // be executed serially one after another. Each element of the list must be
    // createNodesOp, createPodsOp, or barrierOp.
    WorkloadTemplate []op
    // List of workloads to run under this testCase.
    Workloads []*workload
    // TODO(#93792): reduce config toil by having a default pod and node spec per
    // testCase? CreatePods and CreateNodes ops will inherit these unless
    // manually overridden.
}

type nodeCase struct {
    Num int
func (tc *testCase) collectsMetrics() bool {
    for _, op := range tc.WorkloadTemplate {
        if op.realOp.collectsMetrics() {
            return true
        }
    }
    return false
}

// workload is a subtest under a testCase that tests the scheduler performance
// for a certain ordering of ops. The set of nodes created and pods scheduled
// in a workload may be heterogenous.
type workload struct {
    // Name of the workload.
    Name string
    // Values of parameters used in the workloadTemplate.
    Params map[string]int
}

// op is a dummy struct which stores the real op in itself.
type op struct {
    realOp realOp
}

// UnmarshalJSON is a custom unmarshaler for the op struct since we don't know
// which op we're decoding at runtime.
func (op *op) UnmarshalJSON(b []byte) error {
    possibleOps := []realOp{
        &createNodesOp{},
        &createPodsOp{},
        &barrierOp{},
        // TODO(#93793): add a sleep timer op to simulate waiting?
        // TODO(#94601): add a delete nodes op to simulate scaling behaviour?
    }
    var firstError error
    for _, possibleOp := range possibleOps {
        if err := json.Unmarshal(b, possibleOp); err == nil {
            if err2 := possibleOp.isValid(true); err2 == nil {
                op.realOp = possibleOp
                return nil
            } else if firstError == nil {
                // Don't return an error yet. Even though this op is invalid, it may
                // still match other possible ops.
                firstError = err2
            }
        }
    }
    return fmt.Errorf("cannot unmarshal %s into any known op type: %w", string(b), firstError)
}

// realOp is an interface that is implemented by different structs. To evaluate
// the validity of ops at parse-time, a isValid function must be implemented.
type realOp interface {
    // isValid verifies the validity of the op args such as node/pod count. Note
    // that we don't catch undefined parameters at this stage.
    isValid(allowParameterization bool) error
    // collectsMetrics checks if the op collects metrics.
    collectsMetrics() bool
    // patchParams returns a patched realOp of the same type after substituting
    // parameterizable values with workload-specific values. One should implement
    // this method on the value receiver base type, not a pointer receiver base
    // type, even though calls will be made from with a *realOp. This is because
    // callers don't want the receiver to inadvertently modify the realOp
    // (instead, it's returned as a return value).
    patchParams(w *workload) (realOp, error)
}

func isValidParameterizable(val string) bool {
    return strings.HasPrefix(val, "$")
}

// createNodesOp defines an op where nodes are created as a part of a workload.
type createNodesOp struct {
    // Must be "createNodes".
    Opcode string
    // Number of nodes to create. Parameterizable through CountParam.
    Count int
    // Template parameter for Count.
    CountParam string
    // Path to spec file describing the nodes to create. Optional.
    NodeTemplatePath *string
    // At most one of the following strategies can be defined. If not specified, default to TrivialNodePrepareStrategy.
    // At most one of the following strategies can be defined. Optional, defaults
    // to TrivialNodePrepareStrategy if unspecified.
    NodeAllocatableStrategy *testutils.NodeAllocatableStrategy
    LabelNodePrepareStrategy *testutils.LabelNodePrepareStrategy
    UniqueNodeLabelStrategy *testutils.UniqueNodeLabelStrategy
}

type podCase struct {
    Num int
    PodTemplatePath *string
func (cno *createNodesOp) isValid(allowParameterization bool) error {
    if cno.Opcode != createNodesOpcode {
        return fmt.Errorf("invalid opcode")
    }
    ok := (cno.Count > 0 ||
        (cno.CountParam != "" && allowParameterization && isValidParameterizable(cno.CountParam)))
    if !ok {
        return fmt.Errorf("invalid Count=%d / CountParam=%q", cno.Count, cno.CountParam)
    }
    return nil
}

func (*createNodesOp) collectsMetrics() bool {
    return false
}

func (cno createNodesOp) patchParams(w *workload) (realOp, error) {
    if cno.CountParam != "" {
        var ok bool
        if cno.Count, ok = w.Params[cno.CountParam[1:]]; !ok {
            return nil, fmt.Errorf("parameter %s is undefined", cno.CountParam)
        }
    }
    return &cno, (&cno).isValid(false)
}

// createPodsOp defines an op where pods are scheduled as a part of a workload.
// The test can block on the completion of this op before moving forward or
// continue asynchronously.
type createPodsOp struct {
    // Must be "createPods".
    Opcode string
    // Number of pods to schedule. Parameterizable through CountParam.
    Count int
    // Template parameter for Count.
    CountParam string
    // Whether or not to enable metrics collection for this createPodsOp.
    // Optional. Both CollectMetrics and SkipWaitToCompletion cannot be true at
    // the same time for a particular createPodsOp.
    CollectMetrics bool
    // Namespace the pods should be created in. Optional, defaults to a unique
    // namespace of the format "namespace-<number>".
    Namespace *string
    // Path to spec file describing the pods to schedule. Optional.
    PodTemplatePath *string
    // Whether or not to wait for all pods in this op to get scheduled. Optional,
    // defaults to false.
    SkipWaitToCompletion bool
    // Persistent volume settings for the pods to be scheduled. Optional.
    PersistentVolumeTemplatePath *string
    PersistentVolumeClaimTemplatePath *string
}

// simpleTestCases defines a set of test cases that share the same template (node spec, pod spec, etc)
// with testParams(e.g., NumNodes) being overridden. This provides a convenient way to define multiple tests
// with various sizes.
type simpleTestCases struct {
    Template testCase
    Params []testParams
func (cpo *createPodsOp) isValid(allowParameterization bool) error {
    if cpo.Opcode != createPodsOpcode {
        return fmt.Errorf("invalid opcode")
    }
    ok := (cpo.Count > 0 ||
        (cpo.CountParam != "" && allowParameterization && isValidParameterizable(cpo.CountParam)))
    if !ok {
        return fmt.Errorf("invalid Count=%d / CountParam=%q", cpo.Count, cpo.CountParam)
    }
    if cpo.CollectMetrics && cpo.SkipWaitToCompletion {
        // While it's technically possible to achieve this, the additional
        // complexity is not worth it, especially given that we don't have any
        // use-cases right now.
        return fmt.Errorf("collectMetrics and skipWaitToCompletion cannot be true at the same time")
    }
    return nil
}

type testParams struct {
    NumNodes int
    NumInitPods []int
    NumPodsToSchedule int
func (cpo *createPodsOp) collectsMetrics() bool {
    return cpo.CollectMetrics
}

type testDataCollector interface {
    run(stopCh chan struct{})
    collect() []DataItem
func (cpo createPodsOp) patchParams(w *workload) (realOp, error) {
    if cpo.CountParam != "" {
        var ok bool
        if cpo.Count, ok = w.Params[cpo.CountParam[1:]]; !ok {
            return nil, fmt.Errorf("parameter %s is undefined", cpo.CountParam)
        }
    }
    return &cpo, (&cpo).isValid(false)
}

// barrierOp defines an op that can be used to wait until all scheduled pods of
// one or many namespaces have been bound to nodes. This is useful when pods
// were scheduled with SkipWaitToCompletion set to true. A barrierOp is added
// at the end of each each workload automatically.
type barrierOp struct {
    // Must be "barrier".
    Opcode string
    // Namespaces to block on. Empty array or not specifying this field signifies
    // that the barrier should block on all namespaces.
    Namespaces []string
}

func (bo *barrierOp) isValid(allowParameterization bool) error {
    if bo.Opcode != barrierOpcode {
        return fmt.Errorf("invalid opcode")
    }
    return nil
}

func (*barrierOp) collectsMetrics() bool {
    return false
}

func (bo barrierOp) patchParams(w *workload) (realOp, error) {
    return &bo, nil
}

func BenchmarkPerfScheduling(b *testing.B) {
    dataItems := DataItems{Version: "v1"}
    tests, err := parseTestCases(configFile)
    testCases, err := getTestCases(configFile)
    if err != nil {
        b.Fatal(err)
    }
    if err = validateTestCases(testCases); err != nil {
        b.Fatal(err)
    }

    for _, test := range tests {
        initPods := 0
        for _, p := range test.InitPods {
            initPods += p.Num
        }
        name := fmt.Sprintf("%v/%vNodes/%vInitPods/%vPodsToSchedule", test.Desc, test.Nodes.Num, initPods, test.PodsToSchedule.Num)
        b.Run(name, func(b *testing.B) {
            for feature, flag := range test.FeatureGates {
                defer featuregatetesting.SetFeatureGateDuringTest(b, utilfeature.DefaultFeatureGate, feature, flag)()
    dataItems := DataItems{Version: "v1"}
    for _, tc := range testCases {
        b.Run(tc.Name, func(b *testing.B) {
            for _, w := range tc.Workloads {
                b.Run(w.Name, func(b *testing.B) {
                    for feature, flag := range tc.FeatureGates {
                        defer featuregatetesting.SetFeatureGateDuringTest(b, utilfeature.DefaultFeatureGate, feature, flag)()
                    }
                    dataItems.DataItems = append(dataItems.DataItems, runWorkload(b, tc, w)...)
                })
            }
            dataItems.DataItems = append(dataItems.DataItems, perfScheduling(test, b)...)
        })
    }
    if err := dataItems2JSONFile(dataItems, b.Name()); err != nil {
@@ -139,202 +308,219 @@ func BenchmarkPerfScheduling(b *testing.B) {
    }
}

func perfScheduling(test testCase, b *testing.B) []DataItem {
func runWorkload(b *testing.B, tc *testCase, w *workload) []DataItem {
    // 30 minutes should be plenty enough even for the 5000-node tests.
    ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
    defer cancel()
    finalFunc, podInformer, clientset := mustSetupScheduler()
    defer finalFunc()

    nodePreparer, err := getNodePreparer(test.Nodes, clientset)
    if err != nil {
        b.Fatal(err)
    }
    if err := nodePreparer.PrepareNodes(); err != nil {
        b.Fatal(err)
    }
    defer nodePreparer.CleanupNodes()

    total := 0
    for _, p := range test.InitPods {
        if err := createPods(setupNamespace, p, clientset); err != nil {
            b.Fatal(err)
        }
        total += p.Num
    }
    if !test.SkipWaitUntilInitPodsScheduled {
        if err := waitNumPodsScheduled(b, total, podInformer, setupNamespace); err != nil {
            b.Fatal(err)
        }
    }

    // start benchmark
    b.ResetTimer()

    // Start test data collectors.
    stopCh := make(chan struct{})
    collectors := getTestDataCollectors(test, podInformer, b)
    for _, collector := range collectors {
        go collector.run(stopCh)
    }

    // Schedule the main workload
    if err := createPods(testNamespace, test.PodsToSchedule, clientset); err != nil {
        b.Fatal(err)
    }
    if err := waitNumPodsScheduled(b, test.PodsToSchedule.Num, podInformer, testNamespace); err != nil {
        b.Fatal(err)
    }

    close(stopCh)
    // Note: without this line we're taking the overhead of defer() into account.
    b.StopTimer()
    b.Cleanup(finalFunc)

    var mu sync.Mutex
    var dataItems []DataItem
    for _, collector := range collectors {
        dataItems = append(dataItems, collector.collect()...)
    numPodsScheduledPerNamespace := make(map[string]int)
    nextNodeIndex := 0

    for opIndex, op := range tc.WorkloadTemplate {
        realOp, err := op.realOp.patchParams(w)
        if err != nil {
            b.Fatalf("op %d: %v", opIndex, err)
        }
        select {
        case <-ctx.Done():
            b.Fatalf("op %d: %v", opIndex, ctx.Err())
        default:
        }
        switch concreteOp := realOp.(type) {
        case *createNodesOp:
            nodePreparer, err := getNodePreparer(fmt.Sprintf("node-%d-", opIndex), concreteOp, clientset)
            if err != nil {
                b.Fatalf("op %d: %v", opIndex, err)
            }
            if err := nodePreparer.PrepareNodes(nextNodeIndex); err != nil {
                b.Fatalf("op %d: %v", opIndex, err)
            }
            b.Cleanup(func() {
                nodePreparer.CleanupNodes()
            })
            nextNodeIndex += concreteOp.Count

        case *createPodsOp:
            var namespace string
            if concreteOp.Namespace != nil {
                namespace = *concreteOp.Namespace
            } else {
                namespace = fmt.Sprintf("namespace-%d", opIndex)
            }
            var collectors []testDataCollector
            var collectorCtx context.Context
            var collectorCancel func()
            if concreteOp.CollectMetrics {
                collectorCtx, collectorCancel = context.WithCancel(ctx)
                defer collectorCancel()
                collectors = getTestDataCollectors(podInformer, fmt.Sprintf("%s/%s", b.Name(), namespace), namespace, tc.MetricsCollectorConfig)
                for _, collector := range collectors {
                    go collector.run(collectorCtx)
                }
            }
            if err := createPods(namespace, concreteOp, clientset); err != nil {
                b.Fatalf("op %d: %v", opIndex, err)
            }
            if concreteOp.SkipWaitToCompletion {
                // Only record those namespaces that may potentially require barriers
                // in the future.
                if _, ok := numPodsScheduledPerNamespace[namespace]; ok {
                    numPodsScheduledPerNamespace[namespace] += concreteOp.Count
                } else {
                    numPodsScheduledPerNamespace[namespace] = concreteOp.Count
                }
            } else {
                if err := waitUntilPodsScheduledInNamespace(ctx, podInformer, b.Name(), namespace, concreteOp.Count); err != nil {
                    b.Fatalf("op %d: error in waiting for pods to get scheduled: %v", opIndex, err)
                }
            }
            if concreteOp.CollectMetrics {
                // CollectMetrics and SkipWaitToCompletion can never be true at the
                // same time, so if we're here, it means that all pods have been
                // scheduled.
                collectorCancel()
                mu.Lock()
                for _, collector := range collectors {
                    dataItems = append(dataItems, collector.collect()...)
                }
                mu.Unlock()
            }

        case *barrierOp:
            for _, namespace := range concreteOp.Namespaces {
                if _, ok := numPodsScheduledPerNamespace[namespace]; !ok {
                    b.Fatalf("op %d: unknown namespace %s", opIndex, namespace)
                }
            }
            if err := waitUntilPodsScheduled(ctx, podInformer, b.Name(), concreteOp.Namespaces, numPodsScheduledPerNamespace); err != nil {
                b.Fatalf("op %d: %v", opIndex, err)
            }
            // At the end of the barrier, we can be sure that there are no pods
            // pending scheduling in the namespaces that we just blocked on.
            if len(concreteOp.Namespaces) == 0 {
                numPodsScheduledPerNamespace = make(map[string]int)
            } else {
                for _, namespace := range concreteOp.Namespaces {
                    delete(numPodsScheduledPerNamespace, namespace)
                }
            }

        default:
            b.Fatalf("op %d: invalid op %v", opIndex, concreteOp)
        }
    }
    if err := waitUntilPodsScheduled(ctx, podInformer, b.Name(), nil, numPodsScheduledPerNamespace); err != nil {
        // Any pending pods must be scheduled before this test can be considered to
        // be complete.
        b.Fatal(err)
    }
    return dataItems
}

func waitNumPodsScheduled(b *testing.B, num int, podInformer coreinformers.PodInformer, namespace string) error {
    for {
        scheduled, err := getScheduledPods(podInformer, namespace)
        if err != nil {
            return err
        }
        if len(scheduled) >= num {
            break
        }
        klog.Infof("%s: got %d existing pods, required: %d", b.Name(), len(scheduled), num)
        time.Sleep(1 * time.Second)
    }
    return nil
type testDataCollector interface {
    run(ctx context.Context)
    collect() []DataItem
}

func getTestDataCollectors(tc testCase, podInformer coreinformers.PodInformer, b *testing.B) []testDataCollector {
    collectors := []testDataCollector{newThroughputCollector(podInformer, map[string]string{"Name": b.Name()}, []string{testNamespace})}
    metricsCollectorConfig := defaultMetricsCollectorConfig
    if tc.MetricsCollectorConfig != nil {
        metricsCollectorConfig = *tc.MetricsCollectorConfig
func getTestDataCollectors(podInformer coreinformers.PodInformer, name, namespace string, mcc *metricsCollectorConfig) []testDataCollector {
    if mcc == nil {
        mcc = &defaultMetricsCollectorConfig
    }
    return []testDataCollector{
        newThroughputCollector(podInformer, map[string]string{"Name": name}, []string{namespace}),
        newMetricsCollector(mcc, map[string]string{"Name": name}),
    }
    collectors = append(collectors, newMetricsCollector(metricsCollectorConfig, map[string]string{"Name": b.Name()}))
    return collectors
}

func getNodePreparer(nc nodeCase, clientset clientset.Interface) (testutils.TestNodePreparer, error) {
func getNodePreparer(prefix string, cno *createNodesOp, clientset clientset.Interface) (testutils.TestNodePreparer, error) {
    var nodeStrategy testutils.PrepareNodeStrategy = &testutils.TrivialNodePrepareStrategy{}
    if nc.NodeAllocatableStrategy != nil {
        nodeStrategy = nc.NodeAllocatableStrategy
    } else if nc.LabelNodePrepareStrategy != nil {
        nodeStrategy = nc.LabelNodePrepareStrategy
    } else if nc.UniqueNodeLabelStrategy != nil {
        nodeStrategy = nc.UniqueNodeLabelStrategy
    if cno.NodeAllocatableStrategy != nil {
        nodeStrategy = cno.NodeAllocatableStrategy
    } else if cno.LabelNodePrepareStrategy != nil {
        nodeStrategy = cno.LabelNodePrepareStrategy
    } else if cno.UniqueNodeLabelStrategy != nil {
        nodeStrategy = cno.UniqueNodeLabelStrategy
    }

    if nc.NodeTemplatePath != nil {
        node, err := getNodeSpecFromFile(nc.NodeTemplatePath)
    if cno.NodeTemplatePath != nil {
        node, err := getNodeSpecFromFile(cno.NodeTemplatePath)
        if err != nil {
            return nil, err
        }
        return framework.NewIntegrationTestNodePreparerWithNodeSpec(
            clientset,
            []testutils.CountToStrategy{{Count: nc.Num, Strategy: nodeStrategy}},
            []testutils.CountToStrategy{{Count: cno.Count, Strategy: nodeStrategy}},
            node,
        ), nil
    }
    return framework.NewIntegrationTestNodePreparer(
        clientset,
        []testutils.CountToStrategy{{Count: nc.Num, Strategy: nodeStrategy}},
        "scheduler-perf-",
        []testutils.CountToStrategy{{Count: cno.Count, Strategy: nodeStrategy}},
        prefix,
    ), nil
}

func createPods(ns string, pc podCase, clientset clientset.Interface) error {
    strategy, err := getPodStrategy(pc)
func createPods(namespace string, cpo *createPodsOp, clientset clientset.Interface) error {
    strategy, err := getPodStrategy(cpo)
    if err != nil {
        return err
    }
    config := testutils.NewTestPodCreatorConfig()
    config.AddStrategy(ns, pc.Num, strategy)
    config.AddStrategy(namespace, cpo.Count, strategy)
    podCreator := testutils.NewTestPodCreator(clientset, config)
    return podCreator.CreatePods()
}

func getPodStrategy(pc podCase) (testutils.TestPodCreateStrategy, error) {
    basePod := makeBasePod()
    if pc.PodTemplatePath != nil {
        var err error
        basePod, err = getPodSpecFromFile(pc.PodTemplatePath)
// waitUntilPodsScheduledInNamespace blocks until all pods in the given
// namespace are scheduled. Times out after 10 minutes because even at the
// lowest observed QPS of ~10 pods/sec, a 5000-node test should complete.
func waitUntilPodsScheduledInNamespace(ctx context.Context, podInformer coreinformers.PodInformer, name string, namespace string, wantCount int) error {
    return wait.PollImmediate(1*time.Second, 10*time.Minute, func() (bool, error) {
        select {
        case <-ctx.Done():
            return true, ctx.Err()
        default:
        }
        scheduled, err := getScheduledPods(podInformer, namespace)
        if err != nil {
            return nil, err
            return false, err
        }
        if len(scheduled) >= wantCount {
            return true, nil
        }
        klog.Infof("%s: namespace %s: got %d pods, want %d", name, namespace, len(scheduled), wantCount)
        return false, nil
    })
}

// waitUntilPodsScheduled blocks until the all pods in the given namespaces are
// scheduled.
func waitUntilPodsScheduled(ctx context.Context, podInformer coreinformers.PodInformer, name string, namespaces []string, numPodsScheduledPerNamespace map[string]int) error {
    // If unspecified, default to all known namespaces.
    if len(namespaces) == 0 {
        for namespace := range numPodsScheduledPerNamespace {
            namespaces = append(namespaces, namespace)
        }
    }
    if pc.PersistentVolumeClaimTemplatePath == nil {
        return testutils.NewCustomCreatePodStrategy(basePod), nil
    }

    pvTemplate, err := getPersistentVolumeSpecFromFile(pc.PersistentVolumeTemplatePath)
    if err != nil {
        return nil, err
    }
    pvcTemplate, err := getPersistentVolumeClaimSpecFromFile(pc.PersistentVolumeClaimTemplatePath)
    if err != nil {
        return nil, err
    }
    return testutils.NewCreatePodWithPersistentVolumeStrategy(pvcTemplate, getCustomVolumeFactory(pvTemplate), basePod), nil
}

func parseTestCases(path string) ([]testCase, error) {
    var simpleTests []simpleTestCases
    if err := getSpecFromFile(&path, &simpleTests); err != nil {
        return nil, fmt.Errorf("parsing test cases: %v", err)
    }

    testCases := make([]testCase, 0)
    for _, s := range simpleTests {
        testCase := s.Template
        for _, p := range s.Params {
            testCase.Nodes.Num = p.NumNodes
            testCase.InitPods = append([]podCase(nil), testCase.InitPods...)
            for i, v := range p.NumInitPods {
                testCase.InitPods[i].Num = v
            }
            testCase.PodsToSchedule.Num = p.NumPodsToSchedule
            testCases = append(testCases, testCase)
    for _, namespace := range namespaces {
        select {
        case <-ctx.Done():
            return ctx.Err()
        default:
        }
        wantCount, ok := numPodsScheduledPerNamespace[namespace]
        if !ok {
            return fmt.Errorf("unknown namespace %s", namespace)
        }
        if err := waitUntilPodsScheduledInNamespace(ctx, podInformer, name, namespace, wantCount); err != nil {
            return fmt.Errorf("error waiting for pods in namespace %q: %w", namespace, err)
        }
    }

    return testCases, nil
}

func getNodeSpecFromFile(path *string) (*v1.Node, error) {
    nodeSpec := &v1.Node{}
    if err := getSpecFromFile(path, nodeSpec); err != nil {
        return nil, fmt.Errorf("parsing Node: %v", err)
    }
    return nodeSpec, nil
}

func getPodSpecFromFile(path *string) (*v1.Pod, error) {
    podSpec := &v1.Pod{}
    if err := getSpecFromFile(path, podSpec); err != nil {
        return nil, fmt.Errorf("parsing Pod: %v", err)
    }
    return podSpec, nil
}

func getPersistentVolumeSpecFromFile(path *string) (*v1.PersistentVolume, error) {
    persistentVolumeSpec := &v1.PersistentVolume{}
    if err := getSpecFromFile(path, persistentVolumeSpec); err != nil {
        return nil, fmt.Errorf("parsing PersistentVolume: %v", err)
    }
    return persistentVolumeSpec, nil
}

func getPersistentVolumeClaimSpecFromFile(path *string) (*v1.PersistentVolumeClaim, error) {
    persistentVolumeClaimSpec := &v1.PersistentVolumeClaim{}
    if err := getSpecFromFile(path, persistentVolumeClaimSpec); err != nil {
        return nil, fmt.Errorf("parsing PersistentVolumeClaim: %v", err)
    }
    return persistentVolumeClaimSpec, nil
    return nil
}

func getSpecFromFile(path *string, spec interface{}) error {
@@ -342,7 +528,95 @@ func getSpecFromFile(path *string, spec interface{}) error {
    if err != nil {
        return err
    }
    return yaml.Unmarshal(bytes, spec)
    return yaml.UnmarshalStrict(bytes, spec)
}

func getTestCases(path string) ([]*testCase, error) {
    testCases := make([]*testCase, 0)
    if err := getSpecFromFile(&path, &testCases); err != nil {
        return nil, fmt.Errorf("parsing test cases: %w", err)
    }
    return testCases, nil
}

func validateTestCases(testCases []*testCase) error {
    if len(testCases) == 0 {
        return fmt.Errorf("no test cases defined")
    }
    for _, tc := range testCases {
        if len(tc.Workloads) == 0 {
            return fmt.Errorf("%s: no workloads defined", tc.Name)
        }
        if len(tc.WorkloadTemplate) == 0 {
            return fmt.Errorf("%s: no ops defined", tc.Name)
        }
        // Make sure there's at least one CreatePods op with collectMetrics set to
        // true in each workload. What's the point of running a performance
        // benchmark if no statistics are collected for reporting?
        if !tc.collectsMetrics() {
            return fmt.Errorf("%s: no op in the workload template collects metrics", tc.Name)
        }
        // TODO(#93795): make sure each workload within a test case has a unique
        // name? The name is used to identify the stats in benchmark reports.
        // TODO(#94404): check for unused template parameters? Probably a typo.
    }
    return nil
}

func getPodStrategy(cpo *createPodsOp) (testutils.TestPodCreateStrategy, error) {
    basePod := makeBasePod()
    if cpo.PodTemplatePath != nil {
        var err error
        basePod, err = getPodSpecFromFile(cpo.PodTemplatePath)
        if err != nil {
            return nil, err
        }
    }
    if cpo.PersistentVolumeClaimTemplatePath == nil {
        return testutils.NewCustomCreatePodStrategy(basePod), nil
    }

    pvTemplate, err := getPersistentVolumeSpecFromFile(cpo.PersistentVolumeTemplatePath)
    if err != nil {
        return nil, err
    }
    pvcTemplate, err := getPersistentVolumeClaimSpecFromFile(cpo.PersistentVolumeClaimTemplatePath)
    if err != nil {
        return nil, err
    }
    return testutils.NewCreatePodWithPersistentVolumeStrategy(pvcTemplate, getCustomVolumeFactory(pvTemplate), basePod), nil
}

func getNodeSpecFromFile(path *string) (*v1.Node, error) {
    nodeSpec := &v1.Node{}
    if err := getSpecFromFile(path, nodeSpec); err != nil {
        return nil, fmt.Errorf("parsing Node: %w", err)
    }
    return nodeSpec, nil
}

func getPodSpecFromFile(path *string) (*v1.Pod, error) {
    podSpec := &v1.Pod{}
    if err := getSpecFromFile(path, podSpec); err != nil {
        return nil, fmt.Errorf("parsing Pod: %w", err)
    }
    return podSpec, nil
}

func getPersistentVolumeSpecFromFile(path *string) (*v1.PersistentVolume, error) {
    persistentVolumeSpec := &v1.PersistentVolume{}
    if err := getSpecFromFile(path, persistentVolumeSpec); err != nil {
        return nil, fmt.Errorf("parsing PersistentVolume: %w", err)
    }
    return persistentVolumeSpec, nil
}

func getPersistentVolumeClaimSpecFromFile(path *string) (*v1.PersistentVolumeClaim, error) {
    persistentVolumeClaimSpec := &v1.PersistentVolumeClaim{}
    if err := getSpecFromFile(path, persistentVolumeClaimSpec); err != nil {
        return nil, fmt.Errorf("parsing PersistentVolumeClaim: %w", err)
    }
    return persistentVolumeClaimSpec, nil
}

func getCustomVolumeFactory(pvTemplate *v1.PersistentVolume) func(id int) *v1.PersistentVolume {
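Editor's note: the op wrapper above decodes each YAML entry (converted to JSON) by trying every known op type and keeping the first candidate that both unmarshals and passes isValid. Below is a self-contained sketch of that trial-and-error decoding, using plain encoding/json and simplified two-field ops; the names are illustrative and not the harness's actual types.

package main

import (
    "encoding/json"
    "fmt"
)

// Minimal stand-ins for two op kinds; the real ops carry more fields.
type createNodesOp struct {
    Opcode string
    Count  int
}

func (o *createNodesOp) isValid() error {
    if o.Opcode != "createNodes" {
        return fmt.Errorf("invalid opcode %q", o.Opcode)
    }
    return nil
}

type barrierOp struct {
    Opcode     string
    Namespaces []string
}

func (o *barrierOp) isValid() error {
    if o.Opcode != "barrier" {
        return fmt.Errorf("invalid opcode %q", o.Opcode)
    }
    return nil
}

// decodeOp tries each known op type in turn and keeps the first one that both
// unmarshals and validates, the same trial-and-error idea as op.UnmarshalJSON.
func decodeOp(b []byte) (interface{}, error) {
    type validator interface{ isValid() error }
    for _, candidate := range []validator{&createNodesOp{}, &barrierOp{}} {
        if err := json.Unmarshal(b, candidate); err == nil && candidate.isValid() == nil {
            return candidate, nil
        }
    }
    return nil, fmt.Errorf("cannot unmarshal %s into any known op type", string(b))
}

func main() {
    op, err := decodeOp([]byte(`{"opcode": "createNodes", "count": 500}`))
    fmt.Printf("%#v err=%v\n", op, err)
}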
@@ -17,6 +17,7 @@ limitations under the License.
package benchmark

import (
    "context"
    "encoding/json"
    "flag"
    "fmt"
@@ -147,18 +148,18 @@ type metricsCollectorConfig struct {
// metricsCollector collects metrics from legacyregistry.DefaultGatherer.Gather() endpoint.
// Currently only Histrogram metrics are supported.
type metricsCollector struct {
    metricsCollectorConfig
    *metricsCollectorConfig
    labels map[string]string
}

func newMetricsCollector(config metricsCollectorConfig, labels map[string]string) *metricsCollector {
func newMetricsCollector(config *metricsCollectorConfig, labels map[string]string) *metricsCollector {
    return &metricsCollector{
        metricsCollectorConfig: config,
        labels: labels,
    }
}

func (*metricsCollector) run(stopCh chan struct{}) {
func (*metricsCollector) run(ctx context.Context) {
    // metricCollector doesn't need to start before the tests, so nothing to do here.
}

@@ -231,7 +232,7 @@ func newThroughputCollector(podInformer coreinformers.PodInformer, labels map[st
    }
}

func (tc *throughputCollector) run(stopCh chan struct{}) {
func (tc *throughputCollector) run(ctx context.Context) {
    podsScheduled, err := getScheduledPods(tc.podInformer, tc.namespaces...)
    if err != nil {
        klog.Fatalf("%v", err)
@@ -239,8 +240,9 @@ func (tc *throughputCollector) run(stopCh chan struct{}) {
    lastScheduledCount := len(podsScheduled)
    for {
        select {
        case <-stopCh:
        case <-ctx.Done():
            return
        // TODO(#94665): use time.Ticker instead
        case <-time.After(throughputSampleFrequency):
            podsScheduled, err := getScheduledPods(tc.podInformer, tc.namespaces...)
            if err != nil {
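Editor's note: the collectors above switch from a stop channel to a context. A minimal stand-alone sketch of the resulting polling-loop shape (generic code, not the PR's collector):

package main

import (
    "context"
    "fmt"
    "time"
)

// sample polls on a fixed interval until the context is cancelled, the same
// shape as the throughput collector's loop after the stopCh -> ctx change.
func sample(ctx context.Context, every time.Duration) {
    for {
        select {
        case <-ctx.Done():
            return
        case <-time.After(every):
            fmt.Println("take a sample")
        }
    }
}

func main() {
    ctx, cancel := context.WithCancel(context.Background())
    go sample(ctx, 100*time.Millisecond)
    time.Sleep(350 * time.Millisecond)
    cancel() // replaces close(stopCh) in the old API
    time.Sleep(50 * time.Millisecond)
}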
@@ -931,7 +931,7 @@ type CountToStrategy struct {
}

type TestNodePreparer interface {
    PrepareNodes() error
    PrepareNodes(nextNodeIndex int) error
    CleanupNodes() error
}
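Editor's note: PrepareNodes now takes the index to start from, so each createNodes op in a workload prepares a fresh slice of nodes instead of starting over at zero. A toy preparer illustrating the contract (the fakePreparer here is hypothetical, not the real IntegrationTestNodePreparer):

package main

import "fmt"

// A toy implementation of the updated TestNodePreparer contract: each call
// starts preparing at nextNodeIndex so that successive createNodes ops in one
// workload do not re-prepare (or clash with) nodes from earlier ops.
type fakePreparer struct {
    count int // nodes this op should prepare
}

func (p *fakePreparer) PrepareNodes(nextNodeIndex int) error {
    for i := 0; i < p.count; i++ {
        fmt.Printf("preparing node index %d\n", nextNodeIndex+i)
    }
    return nil
}

func (p *fakePreparer) CleanupNodes() error { return nil }

func main() {
    nextNodeIndex := 0
    for _, op := range []*fakePreparer{{count: 3}, {count: 2}} {
        if err := op.PrepareNodes(nextNodeIndex); err != nil {
            panic(err)
        }
        nextNodeIndex += op.count // mirrors nextNodeIndex += concreteOp.Count in runWorkload
    }
}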