diff --git a/test/integration/framework/perf_utils.go b/test/integration/framework/perf_utils.go index 692da386bce..f6f3eba88c6 100644 --- a/test/integration/framework/perf_utils.go +++ b/test/integration/framework/perf_utils.go @@ -58,7 +58,7 @@ func NewIntegrationTestNodePreparerWithNodeSpec(client clientset.Interface, coun } // PrepareNodes prepares countToStrategy test nodes. -func (p *IntegrationTestNodePreparer) PrepareNodes() error { +func (p *IntegrationTestNodePreparer) PrepareNodes(nextNodeIndex int) error { numNodes := 0 for _, v := range p.countToStrategy { numNodes += v.Count @@ -103,11 +103,9 @@ func (p *IntegrationTestNodePreparer) PrepareNodes() error { if err != nil { klog.Fatalf("Error listing nodes: %v", err) } - index := 0 - sum := 0 + index := nextNodeIndex for _, v := range p.countToStrategy { - sum += v.Count - for ; index < sum; index++ { + for i := 0; i < v.Count; i, index = i+1, index+1 { if err := testutils.DoPrepareNode(p.client, &nodes.Items[index], v.Strategy); err != nil { klog.Errorf("Aborting node preparation: %v", err) return err @@ -119,14 +117,18 @@ func (p *IntegrationTestNodePreparer) PrepareNodes() error { // CleanupNodes deletes existing test nodes. func (p *IntegrationTestNodePreparer) CleanupNodes() error { + // TODO(#93794): make CleanupNodes only clean up the nodes created by this + // IntegrationTestNodePreparer to make this more intuitive. nodes, err := GetReadySchedulableNodes(p.client) if err != nil { klog.Fatalf("Error listing nodes: %v", err) } + var errRet error for i := range nodes.Items { if err := p.client.CoreV1().Nodes().Delete(context.TODO(), nodes.Items[i].Name, metav1.DeleteOptions{}); err != nil { klog.Errorf("Error while deleting Node: %v", err) + errRet = err } } - return nil + return errRet } diff --git a/test/integration/scheduler_perf/.gitignore b/test/integration/scheduler_perf/.gitignore new file mode 100644 index 00000000000..1233e053690 --- /dev/null +++ b/test/integration/scheduler_perf/.gitignore @@ -0,0 +1 @@ +BenchmarkPerfScheduling_*.json diff --git a/test/integration/scheduler_perf/BUILD b/test/integration/scheduler_perf/BUILD index bc6b8fb0ec0..0347135abab 100644 --- a/test/integration/scheduler_perf/BUILD +++ b/test/integration/scheduler_perf/BUILD @@ -48,6 +48,7 @@ go_test( "//staging/src/k8s.io/api/storage/v1beta1:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/api/resource:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library", "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library", "//staging/src/k8s.io/client-go/informers/core/v1:go_default_library", "//staging/src/k8s.io/client-go/kubernetes:go_default_library", diff --git a/test/integration/scheduler_perf/config/performance-config.yaml b/test/integration/scheduler_perf/config/performance-config.yaml index 56ccb103fac..5644f93ca33 100644 --- a/test/integration/scheduler_perf/config/performance-config.yaml +++ b/test/integration/scheduler_perf/config/performance-config.yaml @@ -1,264 +1,426 @@ -- template: - desc: SchedulingBasic - initPods: - - podTemplatePath: config/pod-default.yaml - podsToSchedule: - podTemplatePath: config/pod-default.yaml - params: - - numNodes: 500 - numInitPods: [500] - numPodsToSchedule: 1000 - - numNodes: 5000 - numInitPods: [5000] - numPodsToSchedule: 1000 -- template: - desc: SchedulingPodAntiAffinity - nodes: - uniqueNodeLabelStrategy: - labelKey: kubernetes.io/hostname - initPods: - - 
podTemplatePath: config/pod-with-pod-anti-affinity.yaml - podsToSchedule: - podTemplatePath: config/pod-with-pod-anti-affinity.yaml - params: - - numNodes: 500 - numInitPods: [100] - numPodsToSchedule: 400 - - numNodes: 5000 - numInitPods: [1000] - numPodsToSchedule: 1000 -- template: - desc: SchedulingSecrets - initPods: - - podTemplatePath: config/pod-with-secret-volume.yaml - podsToSchedule: - podTemplatePath: config/pod-with-secret-volume.yaml - params: - - numNodes: 500 - numInitPods: [500] - numPodsToSchedule: 1000 - - numNodes: 5000 - numInitPods: [5000] - numPodsToSchedule: 1000 -- template: - desc: SchedulingInTreePVs - initPods: - - persistentVolumeTemplatePath: config/pv-aws.yaml - persistentVolumeClaimTemplatePath: config/pvc.yaml - podsToSchedule: - persistentVolumeTemplatePath: config/pv-aws.yaml - persistentVolumeClaimTemplatePath: config/pvc.yaml - params: - - numNodes: 500 - numInitPods: [500] - numPodsToSchedule: 1000 - - numNodes: 5000 - numInitPods: [5000] - numPodsToSchedule: 1000 -- template: - desc: SchedulingMigratedInTreePVs - nodes: - nodeTemplatePath: config/node-default.yaml - nodeAllocatableStrategy: - nodeAllocatable: - attachable-volumes-csi-ebs.csi.aws.com: 39 - csiNodeAllocatable: - ebs.csi.aws.com: - count: 39 - migratedPlugins: - - "kubernetes.io/aws-ebs" - initPods: - - persistentVolumeTemplatePath: config/pv-aws.yaml - persistentVolumeClaimTemplatePath: config/pvc.yaml - podsToSchedule: - persistentVolumeTemplatePath: config/pv-aws.yaml - persistentVolumeClaimTemplatePath: config/pvc.yaml - featureGates: - CSIMigration: true - CSIMigrationAWS: true - params: - - numNodes: 500 - numInitPods: [500] - numPodsToSchedule: 1000 - - numNodes: 5000 - numInitPods: [5000] - numPodsToSchedule: 1000 -- template: - desc: SchedulingCSIPVs - nodes: - nodeTemplatePath: config/node-default.yaml - nodeAllocatableStrategy: - nodeAllocatable: - attachable-volumes-csi-ebs.csi.aws.com: 39 - csiNodeAllocatable: - ebs.csi.aws.com: - count: 39 - initPods: - - persistentVolumeTemplatePath: config/pv-csi.yaml - persistentVolumeClaimTemplatePath: config/pvc.yaml - podsToSchedule: - persistentVolumeTemplatePath: config/pv-csi.yaml - persistentVolumeClaimTemplatePath: config/pvc.yaml - params: - - numNodes: 500 - numInitPods: [500] - numPodsToSchedule: 1000 - - numNodes: 5000 - numInitPods: [5000] - numPodsToSchedule: 1000 -- template: - desc: SchedulingPodAffinity - nodes: - nodeTemplatePath: config/node-default.yaml - labelNodePrepareStrategy: - labelKey: "failure-domain.beta.kubernetes.io/zone" - labelValues: ["zone1"] - initPods: - - podTemplatePath: config/pod-with-pod-affinity.yaml - podsToSchedule: - podTemplatePath: config/pod-with-pod-affinity.yaml - params: - - numNodes: 500 - numInitPods: [500] - numPodsToSchedule: 1000 - - numNodes: 5000 - numInitPods: [5000] - numPodsToSchedule: 1000 -- template: - desc: SchedulingPreferredPodAffinity - nodes: - uniqueNodeLabelStrategy: - labelKey: kubernetes.io/hostname - initPods: - - podTemplatePath: config/pod-with-preferred-pod-affinity.yaml - podsToSchedule: - podTemplatePath: config/pod-with-preferred-pod-affinity.yaml - params: - - numNodes: 500 - numInitPods: [500] - numPodsToSchedule: 1000 - - numNodes: 5000 - numInitPods: [5000] - numPodsToSchedule: 1000 -- template: - desc: SchedulingPreferredPodAntiAffinity - nodes: - uniqueNodeLabelStrategy: - labelKey: kubernetes.io/hostname - initPods: - - podTemplatePath: config/pod-with-preferred-pod-anti-affinity.yaml - podsToSchedule: - podTemplatePath: 
config/pod-with-preferred-pod-anti-affinity.yaml - params: - - numNodes: 500 - numInitPods: [500] - numPodsToSchedule: 1000 - - numNodes: 5000 - numInitPods: [5000] - numPodsToSchedule: 1000 -- template: - desc: SchedulingNodeAffinity - nodes: - nodeTemplatePath: config/node-default.yaml - labelNodePrepareStrategy: - labelKey: "failure-domain.beta.kubernetes.io/zone" - labelValues: ["zone1"] - initPods: - - podTemplatePath: config/pod-with-node-affinity.yaml - podsToSchedule: - podTemplatePath: config/pod-with-node-affinity.yaml - params: - - numNodes: 500 - numInitPods: [500] - numPodsToSchedule: 1000 - - numNodes: 5000 - numInitPods: [5000] - numPodsToSchedule: 1000 -- template: - desc: TopologySpreading - nodes: - nodeTemplatePath: config/node-default.yaml - labelNodePrepareStrategy: - labelKey: "topology.kubernetes.io/zone" - labelValues: ["moon-1", "moon-2", "moon-3"] - initPods: - - podTemplatePath: config/pod-default.yaml - podsToSchedule: - podTemplatePath: config/pod-with-topology-spreading.yaml - params: - - numNodes: 500 - numInitPods: [1000] - numPodsToSchedule: 1000 - - numNodes: 5000 - numInitPods: [5000] - numPodsToSchedule: 2000 -- template: - desc: PreferredTopologySpreading - nodes: - nodeTemplatePath: config/node-default.yaml - labelNodePrepareStrategy: - labelKey: "topology.kubernetes.io/zone" - labelValues: ["moon-1", "moon-2", "moon-3"] - initPods: - - podTemplatePath: config/pod-default.yaml - podsToSchedule: - podTemplatePath: config/pod-with-preferred-topology-spreading.yaml - params: - - numNodes: 500 - numInitPods: [1000] - numPodsToSchedule: 1000 - - numNodes: 5000 - numInitPods: [5000] - numPodsToSchedule: 2000 -- template: - desc: MixedSchedulingBasePod - nodes: - nodeTemplatePath: config/node-default.yaml - labelNodePrepareStrategy: - labelKey: "topology.kubernetes.io/zone" - labelValues: ["zone1"] - initPods: - - podTemplatePath: config/pod-default.yaml - - podTemplatePath: config/pod-with-pod-affinity.yaml - - podTemplatePath: config/pod-with-pod-anti-affinity.yaml - - podTemplatePath: config/pod-with-preferred-pod-affinity.yaml - - podTemplatePath: config/pod-with-preferred-pod-anti-affinity.yaml - podsToSchedule: - podTemplatePath: config/pod-default.yaml - params: - - numNodes: 500 - numInitPods: [200, 200, 200, 200, 200] - numPodsToSchedule: 1000 - - numNodes: 5000 - numInitPods: [2000, 2000, 2000, 2000, 2000] - numPodsToSchedule: 1000 -- template: - desc: Preemption - initPods: - - podTemplatePath: config/pod-low-priority.yaml - podsToSchedule: - podTemplatePath: config/pod-high-priority.yaml - params: - - numNodes: 500 - numInitPods: [2000] - numPodsToSchedule: 500 - - numNodes: 5000 - numInitPods: [20000] - numPodsToSchedule: 5000 -- template: - desc: Unschedulable - skipWaitUntilInitPodsScheduled: true - initPods: - - podTemplatePath: config/pod-large-cpu.yaml - podsToSchedule: - podTemplatePath: config/pod-default.yaml - params: - - numNodes: 500 - numInitPods: [200] - numPodsToSchedule: 1000 - - numNodes: 5000 - numInitPods: [200] - numPodsToSchedule: 5000 - - numNodes: 5000 - numInitPods: [2000] - numPodsToSchedule: 5000 +- name: SchedulingBasic + workloadTemplate: + - opcode: createNodes + countParam: $initNodes + - opcode: createPods + countParam: $initPods + podTemplatePath: config/pod-default.yaml + - opcode: createPods + countParam: $measurePods + podTemplatePath: config/pod-default.yaml + collectMetrics: true + workloads: + - name: 500Nodes + params: + initNodes: 500 + initPods: 500 + measurePods: 1000 + - name: 5000Nodes + params: + 
+      initNodes: 5000
+      initPods: 1000
+      measurePods: 1000
+
+- name: SchedulingPodAntiAffinity
+  workloadTemplate:
+  - opcode: createNodes
+    countParam: $initNodes
+    uniqueNodeLabelStrategy:
+      labelKey: kubernetes.io/hostname
+  - opcode: createPods
+    countParam: $initPods
+    podTemplatePath: config/pod-with-pod-anti-affinity.yaml
+    namespace: sched-setup
+  - opcode: createPods
+    countParam: $measurePods
+    podTemplatePath: config/pod-with-pod-anti-affinity.yaml
+    collectMetrics: true
+    namespace: sched-test
+  workloads:
+  - name: 500Nodes
+    params:
+      initNodes: 500
+      initPods: 100
+      measurePods: 400
+  - name: 5000Nodes
+    params:
+      initNodes: 5000
+      initPods: 1000
+      measurePods: 1000
+
+- name: SchedulingSecrets
+  workloadTemplate:
+  - opcode: createNodes
+    countParam: $initNodes
+  - opcode: createPods
+    countParam: $initPods
+    podTemplatePath: config/pod-with-secret-volume.yaml
+  - opcode: createPods
+    countParam: $measurePods
+    podTemplatePath: config/pod-with-secret-volume.yaml
+    collectMetrics: true
+  workloads:
+  - name: 500Nodes
+    params:
+      initNodes: 500
+      initPods: 500
+      measurePods: 1000
+  - name: 5000Nodes
+    params:
+      initNodes: 5000
+      initPods: 5000
+      measurePods: 1000
+
+- name: SchedulingInTreePVs
+  workloadTemplate:
+  - opcode: createNodes
+    countParam: $initNodes
+  - opcode: createPods
+    countParam: $initPods
+    persistentVolumeTemplatePath: config/pv-aws.yaml
+    persistentVolumeClaimTemplatePath: config/pvc.yaml
+  - opcode: createPods
+    countParam: $measurePods
+    persistentVolumeTemplatePath: config/pv-aws.yaml
+    persistentVolumeClaimTemplatePath: config/pvc.yaml
+    collectMetrics: true
+  workloads:
+  - name: 500Nodes
+    params:
+      initNodes: 500
+      initPods: 500
+      measurePods: 1000
+  - name: 5000Nodes
+    params:
+      initNodes: 5000
+      initPods: 5000
+      measurePods: 1000
+
+- name: SchedulingMigratedInTreePVs
+  featureGates:
+    CSIMigration: true
+    CSIMigrationAWS: true
+  workloadTemplate:
+  - opcode: createNodes
+    countParam: $initNodes
+    nodeTemplatePath: config/node-default.yaml
+    nodeAllocatableStrategy:
+      nodeAllocatable:
+        attachable-volumes-csi-ebs.csi.aws.com: "39"
+      csiNodeAllocatable:
+        ebs.csi.aws.com:
+          count: 39
+      migratedPlugins:
+      - "kubernetes.io/aws-ebs"
+  - opcode: createPods
+    countParam: $initPods
+    persistentVolumeTemplatePath: config/pv-aws.yaml
+    persistentVolumeClaimTemplatePath: config/pvc.yaml
+  - opcode: createPods
+    countParam: $measurePods
+    persistentVolumeTemplatePath: config/pv-aws.yaml
+    persistentVolumeClaimTemplatePath: config/pvc.yaml
+    collectMetrics: true
+  workloads:
+  - name: 500Nodes
+    params:
+      initNodes: 500
+      initPods: 500
+      measurePods: 1000
+  - name: 5000Nodes
+    params:
+      initNodes: 5000
+      initPods: 5000
+      measurePods: 1000
+
+- name: SchedulingCSIPVs
+  workloadTemplate:
+  - opcode: createNodes
+    countParam: $initNodes
+    nodeTemplatePath: config/node-default.yaml
+    nodeAllocatableStrategy:
+      nodeAllocatable:
+        attachable-volumes-csi-ebs.csi.aws.com: "39"
+      csiNodeAllocatable:
+        ebs.csi.aws.com:
+          count: 39
+  - opcode: createPods
+    countParam: $initPods
+    persistentVolumeTemplatePath: config/pv-csi.yaml
+    persistentVolumeClaimTemplatePath: config/pvc.yaml
+  - opcode: createPods
+    countParam: $measurePods
+    persistentVolumeTemplatePath: config/pv-csi.yaml
+    persistentVolumeClaimTemplatePath: config/pvc.yaml
+    collectMetrics: true
+  workloads:
+  - name: 500Nodes
+    params:
+      initNodes: 500
+      initPods: 500
+      measurePods: 1000
+  - name: 5000Nodes
+    params:
+      initNodes: 5000
+      initPods: 5000
+      measurePods: 1000
+
+- name: SchedulingPodAffinity
workloadTemplate: + - opcode: createNodes + countParam: $initNodes + nodeTemplatePath: config/node-default.yaml + labelNodePrepareStrategy: + labelKey: "failure-domain.beta.kubernetes.io/zone" + labelValues: ["zone1"] + - opcode: createPods + countParam: $initPods + podTemplatePath: config/pod-with-pod-affinity.yaml + namespace: sched-setup + - opcode: createPods + countParam: $measurePods + podTemplatePath: config/pod-with-pod-affinity.yaml + namespace: sched-test + collectMetrics: true + workloads: + - name: 500Nodes + params: + initNodes: 500 + initPods: 500 + measurePods: 1000 + - name: 5000Nodes + params: + initNodes: 5000 + initPods: 5000 + measurePods: 1000 + +- name: SchedulingPreferredPodAffinity + workloadTemplate: + - opcode: createNodes + countParam: $initNodes + uniqueNodeLabelStrategy: + labelKey: kubernetes.io/hostname + - opcode: createPods + countParam: $initPods + podTemplatePath: config/pod-with-preferred-pod-affinity.yaml + namespace: sched-setup + - opcode: createPods + countParam: $measurePods + podTemplatePath: config/pod-with-preferred-pod-affinity.yaml + namespace: sched-test + collectMetrics: true + workloads: + - name: 500Nodes + params: + initNodes: 500 + initPods: 500 + measurePods: 1000 + - name: 5000Nodes + params: + initNodes: 5000 + initPods: 5000 + measurePods: 1000 + +- name: SchedulingPreferredPodAntiAffinity + workloadTemplate: + - opcode: createNodes + countParam: $initNodes + uniqueNodeLabelStrategy: + labelKey: kubernetes.io/hostname + - opcode: createPods + countParam: $initPods + podTemplatePath: config/pod-with-preferred-pod-anti-affinity.yaml + namespace: sched-setup + - opcode: createPods + countParam: $measurePods + podTemplatePath: config/pod-with-preferred-pod-anti-affinity.yaml + namespace: sched-test + collectMetrics: true + workloads: + - name: 500Nodes + params: + initNodes: 500 + initPods: 500 + measurePods: 1000 + - name: 5000Nodes + params: + initNodes: 5000 + initPods: 5000 + measurePods: 1000 + +- name: SchedulingNodeAffinity + workloadTemplate: + - opcode: createNodes + countParam: $initNodes + nodeTemplatePath: config/node-default.yaml + labelNodePrepareStrategy: + labelKey: "failure-domain.beta.kubernetes.io/zone" + labelValues: ["zone1"] + - opcode: createPods + countParam: $initPods + podTemplatePath: config/pod-with-node-affinity.yaml + - opcode: createPods + countParam: $measurePods + podTemplatePath: config/pod-with-node-affinity.yaml + collectMetrics: true + workloads: + - name: 500Nodes + params: + initNodes: 500 + initPods: 500 + measurePods: 1000 + - name: 5000Nodes + params: + initNodes: 5000 + initPods: 5000 + measurePods: 1000 + +- name: TopologySpreading + workloadTemplate: + - opcode: createNodes + countParam: $initNodes + nodeTemplatePath: config/node-default.yaml + labelNodePrepareStrategy: + labelKey: "topology.kubernetes.io/zone" + labelValues: ["moon-1", "moon-2", "moon-3"] + - opcode: createPods + countParam: $initPods + podTemplatePath: config/pod-default.yaml + - opcode: createPods + countParam: $measurePods + podTemplatePath: config/pod-with-topology-spreading.yaml + collectMetrics: true + workloads: + - name: 500Nodes + params: + initNodes: 500 + initPods: 1000 + measurePods: 1000 + - name: 5000Nodes + params: + initNodes: 5000 + initPods: 5000 + measurePods: 2000 + +- name: PreferredTopologySpreading + workloadTemplate: + - opcode: createNodes + countParam: $initNodes + nodeTemplatePath: config/node-default.yaml + labelNodePrepareStrategy: + labelKey: "topology.kubernetes.io/zone" + labelValues: ["moon-1", 
"moon-2", "moon-3"] + - opcode: createPods + countParam: $initPods + podTemplatePath: config/pod-default.yaml + - opcode: createPods + countParam: $measurePods + podTemplatePath: config/pod-with-preferred-topology-spreading.yaml + collectMetrics: true + workloads: + - name: 500Nodes + params: + initNodes: 500 + initPods: 1000 + measurePods: 1000 + - name: 5000Nodes + params: + initNodes: 5000 + initPods: 5000 + measurePods: 2000 + +- name: MixedSchedulingBasePod + workloadTemplate: + - opcode: createNodes + countParam: $initNodes + nodeTemplatePath: config/node-default.yaml + labelNodePrepareStrategy: + labelKey: "topology.kubernetes.io/zone" + labelValues: ["zone1"] + - opcode: createPods + countParam: $initPods + podTemplatePath: config/pod-default.yaml + namespace: sched-setup + - opcode: createPods + countParam: $initPods + podTemplatePath: config/pod-with-pod-affinity.yaml + namespace: sched-setup + - opcode: createPods + countParam: $initPods + podTemplatePath: config/pod-with-pod-anti-affinity.yaml + namespace: sched-setup + - opcode: createPods + countParam: $initPods + podTemplatePath: config/pod-with-preferred-pod-affinity.yaml + namespace: sched-setup + - opcode: createPods + countParam: $initPods + podTemplatePath: config/pod-with-preferred-pod-anti-affinity.yaml + namespace: sched-setup + - opcode: createPods + countParam: $measurePods + podTemplatePath: config/pod-default.yaml + collectMetrics: true + workloads: + - name: 500Nodes + params: + initNodes: 500 + initPods: 200 + measurePods: 1000 + - name: 5000Nodes + params: + initNodes: 5000 + initPods: 2000 + measurePods: 1000 + +- name: Preemption + workloadTemplate: + - opcode: createNodes + countParam: $initNodes + - opcode: createPods + countParam: $initPods + podTemplatePath: config/pod-low-priority.yaml + - opcode: createPods + countParam: $measurePods + podTemplatePath: config/pod-high-priority.yaml + collectMetrics: true + workloads: + - name: 500Nodes + params: + initNodes: 500 + initPods: 2000 + measurePods: 500 + - name: 5000Nodes + params: + initNodes: 5000 + initPods: 20000 + measurePods: 5000 + +- name: Unschedulable + workloadTemplate: + - opcode: createNodes + countParam: $initNodes + - opcode: createPods + countParam: $initPods + podTemplatePath: config/pod-large-cpu.yaml + skipWaitToCompletion: true + - opcode: createPods + countParam: $measurePods + podTemplatePath: config/pod-default.yaml + collectMetrics: true + workloads: + - name: 500Nodes/200InitPods + params: + initNodes: 500 + initPods: 200 + measurePods: 1000 + - name: 5000Nodes/200InitPods + params: + initNodes: 5000 + initPods: 200 + measurePods: 5000 + - name: 5000Nodes/2000InitPods + params: + initNodes: 5000 + initPods: 2000 + measurePods: 5000 diff --git a/test/integration/scheduler_perf/scheduler_perf_legacy_test.go b/test/integration/scheduler_perf/scheduler_perf_legacy_test.go index 28762935e10..9527376bacd 100644 --- a/test/integration/scheduler_perf/scheduler_perf_legacy_test.go +++ b/test/integration/scheduler_perf/scheduler_perf_legacy_test.go @@ -448,7 +448,7 @@ func benchmarkScheduling(numExistingPods, minPods int, clientset, nodeStrategies, "scheduler-perf-") - if err := nodePreparer.PrepareNodes(); err != nil { + if err := nodePreparer.PrepareNodes(0); err != nil { klog.Fatalf("%v", err) } defer nodePreparer.CleanupNodes() diff --git a/test/integration/scheduler_perf/scheduler_perf_test.go b/test/integration/scheduler_perf/scheduler_perf_test.go index 545ad6c7dbb..0a0511151a7 100644 --- 
a/test/integration/scheduler_perf/scheduler_perf_test.go +++ b/test/integration/scheduler_perf/scheduler_perf_test.go @@ -17,12 +17,17 @@ limitations under the License. package benchmark import ( + "context" + "encoding/json" "fmt" "io/ioutil" + "strings" + "sync" "testing" "time" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/wait" utilfeature "k8s.io/apiserver/pkg/util/feature" coreinformers "k8s.io/client-go/informers/core/v1" clientset "k8s.io/client-go/kubernetes" @@ -35,7 +40,10 @@ import ( ) const ( - configFile = "config/performance-config.yaml" + configFile = "config/performance-config.yaml" + createNodesOpcode = "createNodes" + createPodsOpcode = "createPods" + barrierOpcode = "barrier" ) var ( @@ -51,87 +59,248 @@ var ( } ) -// testCase configures a test case to run the scheduler performance test. Users should be able to -// provide this via a YAML file. -// -// It specifies nodes and pods in the cluster before running the test. It also specifies the pods to -// schedule during the test. The config can be as simple as just specify number of nodes/pods, where -// default spec will be applied. It also allows the user to specify a pod spec template for more -// complicated test cases. -// -// It also specifies the metrics to be collected after the test. If nothing is specified, default metrics -// such as scheduling throughput and latencies will be collected. +// testCase defines a set of test cases that intend to test the performance of +// similar workloads of varying sizes with shared overall settings such as +// feature gates and metrics collected. type testCase struct { - // description of the test case - Desc string - // configures nodes in the cluster - Nodes nodeCase - // configures pods in the cluster before running the tests - InitPods []podCase - // configures the test to now wait for init pods to schedule before creating - // test pods. - SkipWaitUntilInitPodsScheduled bool - // pods to be scheduled during the test. - PodsToSchedule podCase - // optional, feature gates to set before running the test + // Name of the testCase. + Name string + // Feature gates to set before running the test. Optional. FeatureGates map[featuregate.Feature]bool - // optional, replaces default defaultMetricsCollectorConfig if supplied. + // List of metrics to collect. Optional, defaults to + // defaultMetricsCollectorConfig if unspecified. MetricsCollectorConfig *metricsCollectorConfig + // Template for sequence of ops that each workload must follow. Each op will + // be executed serially one after another. Each element of the list must be + // createNodesOp, createPodsOp, or barrierOp. + WorkloadTemplate []op + // List of workloads to run under this testCase. + Workloads []*workload + // TODO(#93792): reduce config toil by having a default pod and node spec per + // testCase? CreatePods and CreateNodes ops will inherit these unless + // manually overridden. } -type nodeCase struct { - Num int +func (tc *testCase) collectsMetrics() bool { + for _, op := range tc.WorkloadTemplate { + if op.realOp.collectsMetrics() { + return true + } + } + return false +} + +// workload is a subtest under a testCase that tests the scheduler performance +// for a certain ordering of ops. The set of nodes created and pods scheduled +// in a workload may be heterogenous. +type workload struct { + // Name of the workload. + Name string + // Values of parameters used in the workloadTemplate. + Params map[string]int +} + +// op is a dummy struct which stores the real op in itself. 
+type op struct { + realOp realOp +} + +// UnmarshalJSON is a custom unmarshaler for the op struct since we don't know +// which op we're decoding at runtime. +func (op *op) UnmarshalJSON(b []byte) error { + possibleOps := []realOp{ + &createNodesOp{}, + &createPodsOp{}, + &barrierOp{}, + // TODO(#93793): add a sleep timer op to simulate waiting? + // TODO(#94601): add a delete nodes op to simulate scaling behaviour? + } + var firstError error + for _, possibleOp := range possibleOps { + if err := json.Unmarshal(b, possibleOp); err == nil { + if err2 := possibleOp.isValid(true); err2 == nil { + op.realOp = possibleOp + return nil + } else if firstError == nil { + // Don't return an error yet. Even though this op is invalid, it may + // still match other possible ops. + firstError = err2 + } + } + } + return fmt.Errorf("cannot unmarshal %s into any known op type: %w", string(b), firstError) +} + +// realOp is an interface that is implemented by different structs. To evaluate +// the validity of ops at parse-time, a isValid function must be implemented. +type realOp interface { + // isValid verifies the validity of the op args such as node/pod count. Note + // that we don't catch undefined parameters at this stage. + isValid(allowParameterization bool) error + // collectsMetrics checks if the op collects metrics. + collectsMetrics() bool + // patchParams returns a patched realOp of the same type after substituting + // parameterizable values with workload-specific values. One should implement + // this method on the value receiver base type, not a pointer receiver base + // type, even though calls will be made from with a *realOp. This is because + // callers don't want the receiver to inadvertently modify the realOp + // (instead, it's returned as a return value). + patchParams(w *workload) (realOp, error) +} + +func isValidParameterizable(val string) bool { + return strings.HasPrefix(val, "$") +} + +// createNodesOp defines an op where nodes are created as a part of a workload. +type createNodesOp struct { + // Must be "createNodes". + Opcode string + // Number of nodes to create. Parameterizable through CountParam. + Count int + // Template parameter for Count. + CountParam string + // Path to spec file describing the nodes to create. Optional. NodeTemplatePath *string - // At most one of the following strategies can be defined. If not specified, default to TrivialNodePrepareStrategy. + // At most one of the following strategies can be defined. Optional, defaults + // to TrivialNodePrepareStrategy if unspecified. 
NodeAllocatableStrategy *testutils.NodeAllocatableStrategy LabelNodePrepareStrategy *testutils.LabelNodePrepareStrategy UniqueNodeLabelStrategy *testutils.UniqueNodeLabelStrategy } -type podCase struct { - Num int - PodTemplatePath *string +func (cno *createNodesOp) isValid(allowParameterization bool) error { + if cno.Opcode != createNodesOpcode { + return fmt.Errorf("invalid opcode") + } + ok := (cno.Count > 0 || + (cno.CountParam != "" && allowParameterization && isValidParameterizable(cno.CountParam))) + if !ok { + return fmt.Errorf("invalid Count=%d / CountParam=%q", cno.Count, cno.CountParam) + } + return nil +} + +func (*createNodesOp) collectsMetrics() bool { + return false +} + +func (cno createNodesOp) patchParams(w *workload) (realOp, error) { + if cno.CountParam != "" { + var ok bool + if cno.Count, ok = w.Params[cno.CountParam[1:]]; !ok { + return nil, fmt.Errorf("parameter %s is undefined", cno.CountParam) + } + } + return &cno, (&cno).isValid(false) +} + +// createPodsOp defines an op where pods are scheduled as a part of a workload. +// The test can block on the completion of this op before moving forward or +// continue asynchronously. +type createPodsOp struct { + // Must be "createPods". + Opcode string + // Number of pods to schedule. Parameterizable through CountParam. + Count int + // Template parameter for Count. + CountParam string + // Whether or not to enable metrics collection for this createPodsOp. + // Optional. Both CollectMetrics and SkipWaitToCompletion cannot be true at + // the same time for a particular createPodsOp. + CollectMetrics bool + // Namespace the pods should be created in. Optional, defaults to a unique + // namespace of the format "namespace-". + Namespace *string + // Path to spec file describing the pods to schedule. Optional. + PodTemplatePath *string + // Whether or not to wait for all pods in this op to get scheduled. Optional, + // defaults to false. + SkipWaitToCompletion bool + // Persistent volume settings for the pods to be scheduled. Optional. PersistentVolumeTemplatePath *string PersistentVolumeClaimTemplatePath *string } -// simpleTestCases defines a set of test cases that share the same template (node spec, pod spec, etc) -// with testParams(e.g., NumNodes) being overridden. This provides a convenient way to define multiple tests -// with various sizes. -type simpleTestCases struct { - Template testCase - Params []testParams +func (cpo *createPodsOp) isValid(allowParameterization bool) error { + if cpo.Opcode != createPodsOpcode { + return fmt.Errorf("invalid opcode") + } + ok := (cpo.Count > 0 || + (cpo.CountParam != "" && allowParameterization && isValidParameterizable(cpo.CountParam))) + if !ok { + return fmt.Errorf("invalid Count=%d / CountParam=%q", cpo.Count, cpo.CountParam) + } + if cpo.CollectMetrics && cpo.SkipWaitToCompletion { + // While it's technically possible to achieve this, the additional + // complexity is not worth it, especially given that we don't have any + // use-cases right now. 
+ return fmt.Errorf("collectMetrics and skipWaitToCompletion cannot be true at the same time") + } + return nil } -type testParams struct { - NumNodes int - NumInitPods []int - NumPodsToSchedule int +func (cpo *createPodsOp) collectsMetrics() bool { + return cpo.CollectMetrics } -type testDataCollector interface { - run(stopCh chan struct{}) - collect() []DataItem +func (cpo createPodsOp) patchParams(w *workload) (realOp, error) { + if cpo.CountParam != "" { + var ok bool + if cpo.Count, ok = w.Params[cpo.CountParam[1:]]; !ok { + return nil, fmt.Errorf("parameter %s is undefined", cpo.CountParam) + } + } + return &cpo, (&cpo).isValid(false) +} + +// barrierOp defines an op that can be used to wait until all scheduled pods of +// one or many namespaces have been bound to nodes. This is useful when pods +// were scheduled with SkipWaitToCompletion set to true. A barrierOp is added +// at the end of each each workload automatically. +type barrierOp struct { + // Must be "barrier". + Opcode string + // Namespaces to block on. Empty array or not specifying this field signifies + // that the barrier should block on all namespaces. + Namespaces []string +} + +func (bo *barrierOp) isValid(allowParameterization bool) error { + if bo.Opcode != barrierOpcode { + return fmt.Errorf("invalid opcode") + } + return nil +} + +func (*barrierOp) collectsMetrics() bool { + return false +} + +func (bo barrierOp) patchParams(w *workload) (realOp, error) { + return &bo, nil } func BenchmarkPerfScheduling(b *testing.B) { - dataItems := DataItems{Version: "v1"} - tests, err := parseTestCases(configFile) + testCases, err := getTestCases(configFile) if err != nil { b.Fatal(err) } + if err = validateTestCases(testCases); err != nil { + b.Fatal(err) + } - for _, test := range tests { - initPods := 0 - for _, p := range test.InitPods { - initPods += p.Num - } - name := fmt.Sprintf("%v/%vNodes/%vInitPods/%vPodsToSchedule", test.Desc, test.Nodes.Num, initPods, test.PodsToSchedule.Num) - b.Run(name, func(b *testing.B) { - for feature, flag := range test.FeatureGates { - defer featuregatetesting.SetFeatureGateDuringTest(b, utilfeature.DefaultFeatureGate, feature, flag)() + dataItems := DataItems{Version: "v1"} + for _, tc := range testCases { + b.Run(tc.Name, func(b *testing.B) { + for _, w := range tc.Workloads { + b.Run(w.Name, func(b *testing.B) { + for feature, flag := range tc.FeatureGates { + defer featuregatetesting.SetFeatureGateDuringTest(b, utilfeature.DefaultFeatureGate, feature, flag)() + } + dataItems.DataItems = append(dataItems.DataItems, runWorkload(b, tc, w)...) + }) } - dataItems.DataItems = append(dataItems.DataItems, perfScheduling(test, b)...) }) } if err := dataItems2JSONFile(dataItems, b.Name()); err != nil { @@ -139,202 +308,219 @@ func BenchmarkPerfScheduling(b *testing.B) { } } -func perfScheduling(test testCase, b *testing.B) []DataItem { +func runWorkload(b *testing.B, tc *testCase, w *workload) []DataItem { + // 30 minutes should be plenty enough even for the 5000-node tests. 
+ ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute) + defer cancel() finalFunc, podInformer, clientset := mustSetupScheduler() - defer finalFunc() - - nodePreparer, err := getNodePreparer(test.Nodes, clientset) - if err != nil { - b.Fatal(err) - } - if err := nodePreparer.PrepareNodes(); err != nil { - b.Fatal(err) - } - defer nodePreparer.CleanupNodes() - - total := 0 - for _, p := range test.InitPods { - if err := createPods(setupNamespace, p, clientset); err != nil { - b.Fatal(err) - } - total += p.Num - } - if !test.SkipWaitUntilInitPodsScheduled { - if err := waitNumPodsScheduled(b, total, podInformer, setupNamespace); err != nil { - b.Fatal(err) - } - } - - // start benchmark - b.ResetTimer() - - // Start test data collectors. - stopCh := make(chan struct{}) - collectors := getTestDataCollectors(test, podInformer, b) - for _, collector := range collectors { - go collector.run(stopCh) - } - - // Schedule the main workload - if err := createPods(testNamespace, test.PodsToSchedule, clientset); err != nil { - b.Fatal(err) - } - if err := waitNumPodsScheduled(b, test.PodsToSchedule.Num, podInformer, testNamespace); err != nil { - b.Fatal(err) - } - - close(stopCh) - // Note: without this line we're taking the overhead of defer() into account. - b.StopTimer() + b.Cleanup(finalFunc) + var mu sync.Mutex var dataItems []DataItem - for _, collector := range collectors { - dataItems = append(dataItems, collector.collect()...) + numPodsScheduledPerNamespace := make(map[string]int) + nextNodeIndex := 0 + + for opIndex, op := range tc.WorkloadTemplate { + realOp, err := op.realOp.patchParams(w) + if err != nil { + b.Fatalf("op %d: %v", opIndex, err) + } + select { + case <-ctx.Done(): + b.Fatalf("op %d: %v", opIndex, ctx.Err()) + default: + } + switch concreteOp := realOp.(type) { + case *createNodesOp: + nodePreparer, err := getNodePreparer(fmt.Sprintf("node-%d-", opIndex), concreteOp, clientset) + if err != nil { + b.Fatalf("op %d: %v", opIndex, err) + } + if err := nodePreparer.PrepareNodes(nextNodeIndex); err != nil { + b.Fatalf("op %d: %v", opIndex, err) + } + b.Cleanup(func() { + nodePreparer.CleanupNodes() + }) + nextNodeIndex += concreteOp.Count + + case *createPodsOp: + var namespace string + if concreteOp.Namespace != nil { + namespace = *concreteOp.Namespace + } else { + namespace = fmt.Sprintf("namespace-%d", opIndex) + } + var collectors []testDataCollector + var collectorCtx context.Context + var collectorCancel func() + if concreteOp.CollectMetrics { + collectorCtx, collectorCancel = context.WithCancel(ctx) + defer collectorCancel() + collectors = getTestDataCollectors(podInformer, fmt.Sprintf("%s/%s", b.Name(), namespace), namespace, tc.MetricsCollectorConfig) + for _, collector := range collectors { + go collector.run(collectorCtx) + } + } + if err := createPods(namespace, concreteOp, clientset); err != nil { + b.Fatalf("op %d: %v", opIndex, err) + } + if concreteOp.SkipWaitToCompletion { + // Only record those namespaces that may potentially require barriers + // in the future. 
+ if _, ok := numPodsScheduledPerNamespace[namespace]; ok { + numPodsScheduledPerNamespace[namespace] += concreteOp.Count + } else { + numPodsScheduledPerNamespace[namespace] = concreteOp.Count + } + } else { + if err := waitUntilPodsScheduledInNamespace(ctx, podInformer, b.Name(), namespace, concreteOp.Count); err != nil { + b.Fatalf("op %d: error in waiting for pods to get scheduled: %v", opIndex, err) + } + } + if concreteOp.CollectMetrics { + // CollectMetrics and SkipWaitToCompletion can never be true at the + // same time, so if we're here, it means that all pods have been + // scheduled. + collectorCancel() + mu.Lock() + for _, collector := range collectors { + dataItems = append(dataItems, collector.collect()...) + } + mu.Unlock() + } + + case *barrierOp: + for _, namespace := range concreteOp.Namespaces { + if _, ok := numPodsScheduledPerNamespace[namespace]; !ok { + b.Fatalf("op %d: unknown namespace %s", opIndex, namespace) + } + } + if err := waitUntilPodsScheduled(ctx, podInformer, b.Name(), concreteOp.Namespaces, numPodsScheduledPerNamespace); err != nil { + b.Fatalf("op %d: %v", opIndex, err) + } + // At the end of the barrier, we can be sure that there are no pods + // pending scheduling in the namespaces that we just blocked on. + if len(concreteOp.Namespaces) == 0 { + numPodsScheduledPerNamespace = make(map[string]int) + } else { + for _, namespace := range concreteOp.Namespaces { + delete(numPodsScheduledPerNamespace, namespace) + } + } + + default: + b.Fatalf("op %d: invalid op %v", opIndex, concreteOp) + } + } + if err := waitUntilPodsScheduled(ctx, podInformer, b.Name(), nil, numPodsScheduledPerNamespace); err != nil { + // Any pending pods must be scheduled before this test can be considered to + // be complete. + b.Fatal(err) } return dataItems } -func waitNumPodsScheduled(b *testing.B, num int, podInformer coreinformers.PodInformer, namespace string) error { - for { - scheduled, err := getScheduledPods(podInformer, namespace) - if err != nil { - return err - } - if len(scheduled) >= num { - break - } - klog.Infof("%s: got %d existing pods, required: %d", b.Name(), len(scheduled), num) - time.Sleep(1 * time.Second) - } - return nil +type testDataCollector interface { + run(ctx context.Context) + collect() []DataItem } -func getTestDataCollectors(tc testCase, podInformer coreinformers.PodInformer, b *testing.B) []testDataCollector { - collectors := []testDataCollector{newThroughputCollector(podInformer, map[string]string{"Name": b.Name()}, []string{testNamespace})} - metricsCollectorConfig := defaultMetricsCollectorConfig - if tc.MetricsCollectorConfig != nil { - metricsCollectorConfig = *tc.MetricsCollectorConfig +func getTestDataCollectors(podInformer coreinformers.PodInformer, name, namespace string, mcc *metricsCollectorConfig) []testDataCollector { + if mcc == nil { + mcc = &defaultMetricsCollectorConfig + } + return []testDataCollector{ + newThroughputCollector(podInformer, map[string]string{"Name": name}, []string{namespace}), + newMetricsCollector(mcc, map[string]string{"Name": name}), } - collectors = append(collectors, newMetricsCollector(metricsCollectorConfig, map[string]string{"Name": b.Name()})) - return collectors } -func getNodePreparer(nc nodeCase, clientset clientset.Interface) (testutils.TestNodePreparer, error) { +func getNodePreparer(prefix string, cno *createNodesOp, clientset clientset.Interface) (testutils.TestNodePreparer, error) { var nodeStrategy testutils.PrepareNodeStrategy = &testutils.TrivialNodePrepareStrategy{} - if 
nc.NodeAllocatableStrategy != nil { - nodeStrategy = nc.NodeAllocatableStrategy - } else if nc.LabelNodePrepareStrategy != nil { - nodeStrategy = nc.LabelNodePrepareStrategy - } else if nc.UniqueNodeLabelStrategy != nil { - nodeStrategy = nc.UniqueNodeLabelStrategy + if cno.NodeAllocatableStrategy != nil { + nodeStrategy = cno.NodeAllocatableStrategy + } else if cno.LabelNodePrepareStrategy != nil { + nodeStrategy = cno.LabelNodePrepareStrategy + } else if cno.UniqueNodeLabelStrategy != nil { + nodeStrategy = cno.UniqueNodeLabelStrategy } - if nc.NodeTemplatePath != nil { - node, err := getNodeSpecFromFile(nc.NodeTemplatePath) + if cno.NodeTemplatePath != nil { + node, err := getNodeSpecFromFile(cno.NodeTemplatePath) if err != nil { return nil, err } return framework.NewIntegrationTestNodePreparerWithNodeSpec( clientset, - []testutils.CountToStrategy{{Count: nc.Num, Strategy: nodeStrategy}}, + []testutils.CountToStrategy{{Count: cno.Count, Strategy: nodeStrategy}}, node, ), nil } return framework.NewIntegrationTestNodePreparer( clientset, - []testutils.CountToStrategy{{Count: nc.Num, Strategy: nodeStrategy}}, - "scheduler-perf-", + []testutils.CountToStrategy{{Count: cno.Count, Strategy: nodeStrategy}}, + prefix, ), nil } -func createPods(ns string, pc podCase, clientset clientset.Interface) error { - strategy, err := getPodStrategy(pc) +func createPods(namespace string, cpo *createPodsOp, clientset clientset.Interface) error { + strategy, err := getPodStrategy(cpo) if err != nil { return err } config := testutils.NewTestPodCreatorConfig() - config.AddStrategy(ns, pc.Num, strategy) + config.AddStrategy(namespace, cpo.Count, strategy) podCreator := testutils.NewTestPodCreator(clientset, config) return podCreator.CreatePods() } -func getPodStrategy(pc podCase) (testutils.TestPodCreateStrategy, error) { - basePod := makeBasePod() - if pc.PodTemplatePath != nil { - var err error - basePod, err = getPodSpecFromFile(pc.PodTemplatePath) +// waitUntilPodsScheduledInNamespace blocks until all pods in the given +// namespace are scheduled. Times out after 10 minutes because even at the +// lowest observed QPS of ~10 pods/sec, a 5000-node test should complete. +func waitUntilPodsScheduledInNamespace(ctx context.Context, podInformer coreinformers.PodInformer, name string, namespace string, wantCount int) error { + return wait.PollImmediate(1*time.Second, 10*time.Minute, func() (bool, error) { + select { + case <-ctx.Done(): + return true, ctx.Err() + default: + } + scheduled, err := getScheduledPods(podInformer, namespace) if err != nil { - return nil, err + return false, err + } + if len(scheduled) >= wantCount { + return true, nil + } + klog.Infof("%s: namespace %s: got %d pods, want %d", name, namespace, len(scheduled), wantCount) + return false, nil + }) +} + +// waitUntilPodsScheduled blocks until the all pods in the given namespaces are +// scheduled. +func waitUntilPodsScheduled(ctx context.Context, podInformer coreinformers.PodInformer, name string, namespaces []string, numPodsScheduledPerNamespace map[string]int) error { + // If unspecified, default to all known namespaces. 
+ if len(namespaces) == 0 { + for namespace := range numPodsScheduledPerNamespace { + namespaces = append(namespaces, namespace) } } - if pc.PersistentVolumeClaimTemplatePath == nil { - return testutils.NewCustomCreatePodStrategy(basePod), nil - } - - pvTemplate, err := getPersistentVolumeSpecFromFile(pc.PersistentVolumeTemplatePath) - if err != nil { - return nil, err - } - pvcTemplate, err := getPersistentVolumeClaimSpecFromFile(pc.PersistentVolumeClaimTemplatePath) - if err != nil { - return nil, err - } - return testutils.NewCreatePodWithPersistentVolumeStrategy(pvcTemplate, getCustomVolumeFactory(pvTemplate), basePod), nil -} - -func parseTestCases(path string) ([]testCase, error) { - var simpleTests []simpleTestCases - if err := getSpecFromFile(&path, &simpleTests); err != nil { - return nil, fmt.Errorf("parsing test cases: %v", err) - } - - testCases := make([]testCase, 0) - for _, s := range simpleTests { - testCase := s.Template - for _, p := range s.Params { - testCase.Nodes.Num = p.NumNodes - testCase.InitPods = append([]podCase(nil), testCase.InitPods...) - for i, v := range p.NumInitPods { - testCase.InitPods[i].Num = v - } - testCase.PodsToSchedule.Num = p.NumPodsToSchedule - testCases = append(testCases, testCase) + for _, namespace := range namespaces { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + wantCount, ok := numPodsScheduledPerNamespace[namespace] + if !ok { + return fmt.Errorf("unknown namespace %s", namespace) + } + if err := waitUntilPodsScheduledInNamespace(ctx, podInformer, name, namespace, wantCount); err != nil { + return fmt.Errorf("error waiting for pods in namespace %q: %w", namespace, err) } } - - return testCases, nil -} - -func getNodeSpecFromFile(path *string) (*v1.Node, error) { - nodeSpec := &v1.Node{} - if err := getSpecFromFile(path, nodeSpec); err != nil { - return nil, fmt.Errorf("parsing Node: %v", err) - } - return nodeSpec, nil -} - -func getPodSpecFromFile(path *string) (*v1.Pod, error) { - podSpec := &v1.Pod{} - if err := getSpecFromFile(path, podSpec); err != nil { - return nil, fmt.Errorf("parsing Pod: %v", err) - } - return podSpec, nil -} - -func getPersistentVolumeSpecFromFile(path *string) (*v1.PersistentVolume, error) { - persistentVolumeSpec := &v1.PersistentVolume{} - if err := getSpecFromFile(path, persistentVolumeSpec); err != nil { - return nil, fmt.Errorf("parsing PersistentVolume: %v", err) - } - return persistentVolumeSpec, nil -} - -func getPersistentVolumeClaimSpecFromFile(path *string) (*v1.PersistentVolumeClaim, error) { - persistentVolumeClaimSpec := &v1.PersistentVolumeClaim{} - if err := getSpecFromFile(path, persistentVolumeClaimSpec); err != nil { - return nil, fmt.Errorf("parsing PersistentVolumeClaim: %v", err) - } - return persistentVolumeClaimSpec, nil + return nil } func getSpecFromFile(path *string, spec interface{}) error { @@ -342,7 +528,95 @@ func getSpecFromFile(path *string, spec interface{}) error { if err != nil { return err } - return yaml.Unmarshal(bytes, spec) + return yaml.UnmarshalStrict(bytes, spec) +} + +func getTestCases(path string) ([]*testCase, error) { + testCases := make([]*testCase, 0) + if err := getSpecFromFile(&path, &testCases); err != nil { + return nil, fmt.Errorf("parsing test cases: %w", err) + } + return testCases, nil +} + +func validateTestCases(testCases []*testCase) error { + if len(testCases) == 0 { + return fmt.Errorf("no test cases defined") + } + for _, tc := range testCases { + if len(tc.Workloads) == 0 { + return fmt.Errorf("%s: no workloads defined", 
tc.Name) + } + if len(tc.WorkloadTemplate) == 0 { + return fmt.Errorf("%s: no ops defined", tc.Name) + } + // Make sure there's at least one CreatePods op with collectMetrics set to + // true in each workload. What's the point of running a performance + // benchmark if no statistics are collected for reporting? + if !tc.collectsMetrics() { + return fmt.Errorf("%s: no op in the workload template collects metrics", tc.Name) + } + // TODO(#93795): make sure each workload within a test case has a unique + // name? The name is used to identify the stats in benchmark reports. + // TODO(#94404): check for unused template parameters? Probably a typo. + } + return nil +} + +func getPodStrategy(cpo *createPodsOp) (testutils.TestPodCreateStrategy, error) { + basePod := makeBasePod() + if cpo.PodTemplatePath != nil { + var err error + basePod, err = getPodSpecFromFile(cpo.PodTemplatePath) + if err != nil { + return nil, err + } + } + if cpo.PersistentVolumeClaimTemplatePath == nil { + return testutils.NewCustomCreatePodStrategy(basePod), nil + } + + pvTemplate, err := getPersistentVolumeSpecFromFile(cpo.PersistentVolumeTemplatePath) + if err != nil { + return nil, err + } + pvcTemplate, err := getPersistentVolumeClaimSpecFromFile(cpo.PersistentVolumeClaimTemplatePath) + if err != nil { + return nil, err + } + return testutils.NewCreatePodWithPersistentVolumeStrategy(pvcTemplate, getCustomVolumeFactory(pvTemplate), basePod), nil +} + +func getNodeSpecFromFile(path *string) (*v1.Node, error) { + nodeSpec := &v1.Node{} + if err := getSpecFromFile(path, nodeSpec); err != nil { + return nil, fmt.Errorf("parsing Node: %w", err) + } + return nodeSpec, nil +} + +func getPodSpecFromFile(path *string) (*v1.Pod, error) { + podSpec := &v1.Pod{} + if err := getSpecFromFile(path, podSpec); err != nil { + return nil, fmt.Errorf("parsing Pod: %w", err) + } + return podSpec, nil +} + +func getPersistentVolumeSpecFromFile(path *string) (*v1.PersistentVolume, error) { + persistentVolumeSpec := &v1.PersistentVolume{} + if err := getSpecFromFile(path, persistentVolumeSpec); err != nil { + return nil, fmt.Errorf("parsing PersistentVolume: %w", err) + } + return persistentVolumeSpec, nil +} + +func getPersistentVolumeClaimSpecFromFile(path *string) (*v1.PersistentVolumeClaim, error) { + persistentVolumeClaimSpec := &v1.PersistentVolumeClaim{} + if err := getSpecFromFile(path, persistentVolumeClaimSpec); err != nil { + return nil, fmt.Errorf("parsing PersistentVolumeClaim: %w", err) + } + return persistentVolumeClaimSpec, nil } func getCustomVolumeFactory(pvTemplate *v1.PersistentVolume) func(id int) *v1.PersistentVolume { diff --git a/test/integration/scheduler_perf/util.go b/test/integration/scheduler_perf/util.go index e440f88c0e7..ca335b25019 100644 --- a/test/integration/scheduler_perf/util.go +++ b/test/integration/scheduler_perf/util.go @@ -17,6 +17,7 @@ limitations under the License. package benchmark import ( + "context" "encoding/json" "flag" "fmt" @@ -147,18 +148,18 @@ type metricsCollectorConfig struct { // metricsCollector collects metrics from legacyregistry.DefaultGatherer.Gather() endpoint. // Currently only Histrogram metrics are supported. 
 type metricsCollector struct {
-	metricsCollectorConfig
+	*metricsCollectorConfig
 	labels map[string]string
 }
 
-func newMetricsCollector(config metricsCollectorConfig, labels map[string]string) *metricsCollector {
+func newMetricsCollector(config *metricsCollectorConfig, labels map[string]string) *metricsCollector {
 	return &metricsCollector{
 		metricsCollectorConfig: config,
 		labels:                 labels,
 	}
 }
 
-func (*metricsCollector) run(stopCh chan struct{}) {
+func (*metricsCollector) run(ctx context.Context) {
 	// metricCollector doesn't need to start before the tests, so nothing to do here.
 }
@@ -231,7 +232,7 @@ func newThroughputCollector(podInformer coreinformers.PodInformer, labels map[st
 	}
 }
 
-func (tc *throughputCollector) run(stopCh chan struct{}) {
+func (tc *throughputCollector) run(ctx context.Context) {
 	podsScheduled, err := getScheduledPods(tc.podInformer, tc.namespaces...)
 	if err != nil {
 		klog.Fatalf("%v", err)
@@ -239,8 +240,9 @@ func (tc *throughputCollector) run(stopCh chan struct{}) {
 	lastScheduledCount := len(podsScheduled)
 	for {
 		select {
-		case <-stopCh:
+		case <-ctx.Done():
 			return
+		// TODO(#94665): use time.Ticker instead
 		case <-time.After(throughputSampleFrequency):
 			podsScheduled, err := getScheduledPods(tc.podInformer, tc.namespaces...)
 			if err != nil {
diff --git a/test/utils/runners.go b/test/utils/runners.go
index 4b0283eca9e..89a3102f6e6 100644
--- a/test/utils/runners.go
+++ b/test/utils/runners.go
@@ -931,7 +931,7 @@ type CountToStrategy struct {
 }
 
 type TestNodePreparer interface {
-	PrepareNodes() error
+	PrepareNodes(nextNodeIndex int) error
 	CleanupNodes() error
 }
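
The new config format relies on two pieces of plumbing introduced above: op.UnmarshalJSON tries each known concrete op type until one validates, and patchParams later substitutes "$"-prefixed count parameters with the per-workload values. The following is a minimal, self-contained sketch of that flow under simplified assumptions; the createPodsOp, workload, and main shown here are trimmed-down stand-ins that keep only the count and parameter fields, not the test's actual types.

```go
package main

import (
	"encoding/json"
	"fmt"
	"strings"
)

// workload mirrors the idea of a named parameter set ("500Nodes", "5000Nodes").
type workload struct {
	Name   string
	Params map[string]int
}

// createPodsOp is a trimmed-down stand-in for the real createPodsOp.
type createPodsOp struct {
	Opcode     string
	Count      int
	CountParam string
}

// isValid accepts either a concrete count or, while parameterization is still
// allowed, a "$param" reference that will be resolved later.
func (op *createPodsOp) isValid(allowParameterization bool) error {
	if op.Opcode != "createPods" {
		return fmt.Errorf("invalid opcode %q", op.Opcode)
	}
	if op.Count > 0 || (allowParameterization && strings.HasPrefix(op.CountParam, "$")) {
		return nil
	}
	return fmt.Errorf("invalid Count=%d / CountParam=%q", op.Count, op.CountParam)
}

// patchParams resolves "$measurePods" and similar references against a workload.
func (op createPodsOp) patchParams(w *workload) (*createPodsOp, error) {
	if op.CountParam != "" {
		count, ok := w.Params[strings.TrimPrefix(op.CountParam, "$")]
		if !ok {
			return nil, fmt.Errorf("parameter %s is undefined", op.CountParam)
		}
		op.Count = count
	}
	return &op, op.isValid(false)
}

func main() {
	raw := []byte(`{"opcode": "createPods", "countParam": "$measurePods"}`)
	var op createPodsOp
	if err := json.Unmarshal(raw, &op); err != nil {
		panic(err)
	}
	// At parse time the count may still be a parameter reference.
	if err := op.isValid(true); err != nil {
		panic(err)
	}
	w := &workload{Name: "500Nodes", Params: map[string]int{"measurePods": 1000}}
	patched, err := op.patchParams(w)
	if err != nil {
		panic(err)
	}
	fmt.Println(patched.Count) // 1000
}
```

Note the value receiver on patchParams, matching the comment on the realOp interface in the diff: the caller gets a patched copy back and the stored template op is never mutated.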
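
The PrepareNodes(nextNodeIndex int) signature change lets a workload with several createNodes ops keep a running index, so a later op prepares the newly added nodes rather than starting again at zero. A rough sketch of that bookkeeping follows; nodePreparer is a hypothetical stand-in (the real testutils.TestNodePreparer indexes into the cluster's listed nodes instead of printing names).

```go
package main

import "fmt"

// nodePreparer is a hypothetical stand-in for a TestNodePreparer after this
// change: PrepareNodes is told where to start indexing.
type nodePreparer struct {
	prefix string
	count  int
}

func (p *nodePreparer) PrepareNodes(nextNodeIndex int) error {
	for i := 0; i < p.count; i++ {
		// The real implementation applies a strategy to node objects via the
		// client; here we only show how the running index avoids overlap.
		fmt.Printf("preparing node %s%d\n", p.prefix, nextNodeIndex+i)
	}
	return nil
}

func main() {
	nextNodeIndex := 0
	// Two consecutive createNodes ops in one workload, as runWorkload would issue them.
	for opIndex, count := range []int{500, 100} {
		p := &nodePreparer{prefix: fmt.Sprintf("node-%d-", opIndex), count: count}
		if err := p.PrepareNodes(nextNodeIndex); err != nil {
			panic(err)
		}
		nextNodeIndex += count // mirrors runWorkload's bookkeeping after each createNodesOp
	}
}
```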
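
waitUntilPodsScheduledInNamespace combines wait.PollImmediate with an early exit when the surrounding context is cancelled. A minimal sketch of just that polling pattern is below; scheduledPods is a hypothetical stand-in for getScheduledPods, and a fake counter replaces the pod informer.

```go
package main

import (
	"context"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

// waitForCount polls once per second until scheduledPods reports at least
// wantCount, gives up after a timeout, and bails out early if ctx is cancelled.
func waitForCount(ctx context.Context, scheduledPods func() int, wantCount int) error {
	return wait.PollImmediate(1*time.Second, 10*time.Minute, func() (bool, error) {
		select {
		case <-ctx.Done():
			return true, ctx.Err()
		default:
		}
		if got := scheduledPods(); got >= wantCount {
			return true, nil
		}
		return false, nil
	})
}

func main() {
	got := 0
	// The fake counter reaches 1000 on the third poll, so this returns nil.
	err := waitForCount(context.Background(), func() int { got += 400; return got }, 1000)
	fmt.Println(err)
}
```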