/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package benchmark

import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"flag"
	"fmt"
	"io"
	"maps"
	"math"
	"os"
	"path"
	"regexp"
	"strings"
	"sync"
	"testing"
	"time"

	v1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	cacheddiscovery "k8s.io/client-go/discovery/cached/memory"
	"k8s.io/client-go/dynamic"
	"k8s.io/client-go/informers"
	coreinformers "k8s.io/client-go/informers/core/v1"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/restmapper"
	"k8s.io/client-go/tools/cache"
	"k8s.io/component-base/featuregate"
	featuregatetesting "k8s.io/component-base/featuregate/testing"
	"k8s.io/component-base/logs"
	logsapi "k8s.io/component-base/logs/api/v1"
	"k8s.io/component-base/metrics/legacyregistry"
	"k8s.io/component-base/metrics/testutil"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/features"
	"k8s.io/kubernetes/pkg/scheduler/apis/config"
	"k8s.io/kubernetes/pkg/scheduler/apis/config/scheme"
	"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
	schedframework "k8s.io/kubernetes/pkg/scheduler/framework"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
	frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime"
	"k8s.io/kubernetes/pkg/scheduler/metrics"
	schedutil "k8s.io/kubernetes/pkg/scheduler/util"
	"k8s.io/kubernetes/test/integration/framework"
	testutils "k8s.io/kubernetes/test/utils"
	"k8s.io/kubernetes/test/utils/ktesting"
	"k8s.io/kubernetes/test/utils/ktesting/initoption"
	"k8s.io/utils/ptr"
	"sigs.k8s.io/yaml"
)

type operationCode string

const (
	allocResourceClaimsOpcode    operationCode = "allocResourceClaims"
	createAnyOpcode              operationCode = "createAny"
	createNodesOpcode            operationCode = "createNodes"
	createNamespacesOpcode       operationCode = "createNamespaces"
	createPodsOpcode             operationCode = "createPods"
	createPodSetsOpcode          operationCode = "createPodSets"
	deletePodsOpcode             operationCode = "deletePods"
	createResourceClaimsOpcode   operationCode = "createResourceClaims"
	createResourceDriverOpcode   operationCode = "createResourceDriver"
	churnOpcode                  operationCode = "churn"
	updateAnyOpcode              operationCode = "updateAny"
	barrierOpcode                operationCode = "barrier"
	sleepOpcode                  operationCode = "sleep"
	startCollectingMetricsOpcode operationCode = "startCollectingMetrics"
	stopCollectingMetricsOpcode  operationCode = "stopCollectingMetrics"
)

const (
	// Two modes supported by the "churn" operator.

	// Create continuously creates API objects without deleting them.
	Create = "create"
	// Recreate creates a number of API objects, then deletes them, and repeats the cycle.
	Recreate = "recreate"
)

const (
	extensionPointsLabelName = "extension_point"
	resultLabelName          = "result"
	pluginLabelName          = "plugin"
	eventLabelName           = "event"
)

// Run with -v=2; this is the default log level in production.
//
// In a PR this can be bumped up temporarily to run pull-kubernetes-scheduler-perf
// with more log output.
const DefaultLoggingVerbosity = 2

var LoggingFeatureGate FeatureGateFlag
var LoggingConfig *logsapi.LoggingConfiguration

type FeatureGateFlag interface {
	featuregate.FeatureGate
	flag.Value
}

func init() {
	f := featuregate.NewFeatureGate()
	runtime.Must(logsapi.AddFeatureGates(f))
	LoggingFeatureGate = f

	LoggingConfig = logsapi.NewLoggingConfiguration()
	LoggingConfig.Verbosity = DefaultLoggingVerbosity
}

var (
	defaultMetricsCollectorConfig = metricsCollectorConfig{
		Metrics: map[string][]*labelValues{
			"scheduler_framework_extension_point_duration_seconds": {
				{
					label:  extensionPointsLabelName,
					values: metrics.ExtentionPoints,
				},
			},
			"scheduler_scheduling_attempt_duration_seconds": {
				{
					label:  resultLabelName,
					values: []string{metrics.ScheduledResult, metrics.UnschedulableResult, metrics.ErrorResult},
				},
			},
			"scheduler_pod_scheduling_duration_seconds": nil,
			"scheduler_plugin_execution_duration_seconds": {
				{
					label:  pluginLabelName,
					values: PluginNames,
				},
				{
					label:  extensionPointsLabelName,
					values: metrics.ExtentionPoints,
				},
			},
			"scheduler_queueing_hint_execution_duration_seconds": {
				{
					label:  pluginLabelName,
					values: PluginNames,
				},
				{
					label:  eventLabelName,
					values: schedframework.AllClusterEventLabels(),
				},
			},
			"scheduler_event_handling_duration_seconds": {
				{
					label:  eventLabelName,
					values: schedframework.AllClusterEventLabels(),
				},
			},
		},
	}

	// PluginNames is the list of plugin names that scheduler_perf collects metrics for.
	// We export this variable because users outside k/k may want to add their custom plugins.
	PluginNames = []string{
		names.PrioritySort,
		names.DefaultBinder,
		names.DefaultPreemption,
		names.DynamicResources,
		names.ImageLocality,
		names.InterPodAffinity,
		names.NodeAffinity,
		names.NodeName,
		names.NodePorts,
		names.NodeResourcesBalancedAllocation,
		names.NodeResourcesFit,
		names.NodeUnschedulable,
		names.NodeVolumeLimits,
		names.PodTopologySpread,
		names.SchedulingGates,
		names.TaintToleration,
		names.VolumeBinding,
		names.VolumeRestrictions,
		names.VolumeZone,
	}
)

var UseTestingLog *bool
var PerfSchedulingLabelFilter *string
var TestSchedulingLabelFilter *string

// InitTests should be called in a TestMain in each config subdirectory.
func InitTests() error {
	// Run with -v=2; this is the default log level in production.
	ktesting.SetDefaultVerbosity(DefaultLoggingVerbosity)

	// test/integration/framework/flags.go unconditionally initializes the
	// logging flags. That's correct for most tests, but the scheduler_perf
	// test wants more control over the flags, therefore we strip them out
	// here.
	var fs flag.FlagSet
	flag.CommandLine.VisitAll(func(f *flag.Flag) {
		switch f.Name {
		case "log-flush-frequency", "v", "vmodule":
			// These will be added below ourselves, don't copy.
		default:
			fs.Var(f.Value, f.Name, f.Usage)
		}
	})
	flag.CommandLine = &fs

	flag.Var(LoggingFeatureGate, "feature-gate",
		"A set of key=value pairs that describe feature gates for alpha/experimental features. "+
			"Options are:\n"+strings.Join(LoggingFeatureGate.KnownFeatures(), "\n"))

	UseTestingLog = flag.Bool("use-testing-log", false, "Write log entries with testing.TB.Log. This is more suitable for unit testing and debugging, but less realistic in real benchmarks.")
	PerfSchedulingLabelFilter = flag.String("perf-scheduling-label-filter", "performance", "comma-separated list of labels which a testcase must have (no prefix or +) or must not have (-), used by BenchmarkPerfScheduling")
	TestSchedulingLabelFilter = flag.String("test-scheduling-label-filter", "integration-test,-performance", "comma-separated list of labels which a testcase must have (no prefix or +) or must not have (-), used by TestScheduling")

	// This would fail if we hadn't removed the logging flags above.
	logsapi.AddGoFlags(LoggingConfig, flag.CommandLine)

	flag.Parse()

	logs.InitLogs()
	return logsapi.ValidateAndApply(LoggingConfig, LoggingFeatureGate)
}
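
// A minimal sketch (not part of the real code) of how a config subdirectory
// might wire this up in its TestMain; the package name is hypothetical:
//
//	package configexample
//
//	import (
//		"os"
//		"testing"
//	)
//
//	func TestMain(m *testing.M) {
//		if err := InitTests(); err != nil {
//			panic(err)
//		}
//		os.Exit(m.Run())
//	}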

// testCase defines a set of test cases that intend to test the performance of
// similar workloads of varying sizes with shared overall settings such as
// feature gates and metrics collected.
type testCase struct {
	// Name of the testCase.
	Name string
	// Feature gates to set before running the test.
	// Optional
	FeatureGates map[featuregate.Feature]bool
	// List of metrics to collect. Defaults to
	// defaultMetricsCollectorConfig if unspecified.
	// Optional
	MetricsCollectorConfig *metricsCollectorConfig
	// Template for the sequence of ops that each workload must follow. Each op will
	// be executed serially one after another. Each element of the list must be
	// createNodesOp, createPodsOp, or barrierOp.
	WorkloadTemplate []op
	// List of workloads to run under this testCase.
	Workloads []*workload
	// SchedulerConfigPath is the path of the scheduler configuration.
	// Optional
	SchedulerConfigPath string
	// Default path to the spec file describing the pods to create.
	// This path can be overridden in createPodsOp by setting PodTemplatePath.
	// Optional
	DefaultPodTemplatePath *string
	// Labels can be used to enable or disable workloads inside this test case.
	Labels []string
	// DefaultThresholdMetricSelector defines the default metric used for threshold comparison.
	// It is only applied to workloads without their own ThresholdMetricSelector set.
	// If nil, the default metric is set to "SchedulingThroughput".
	// Optional
	DefaultThresholdMetricSelector *thresholdMetricSelector
}

func (tc *testCase) collectsMetrics() bool {
	for _, op := range tc.WorkloadTemplate {
		if op.realOp.collectsMetrics() {
			return true
		}
	}
	return false
}

func (tc *testCase) workloadNamesUnique() error {
	workloadUniqueNames := map[string]bool{}
	for _, w := range tc.Workloads {
		if workloadUniqueNames[w.Name] {
			return fmt.Errorf("%s: workload name %s is not unique", tc.Name, w.Name)
		}
		workloadUniqueNames[w.Name] = true
	}
	return nil
}

// workload is a subtest under a testCase that tests the scheduler performance
// for a certain ordering of ops. The set of nodes created and pods scheduled
// in a workload may be heterogeneous.
type workload struct {
	// Name of the workload.
	Name string
	// Values of parameters used in the workloadTemplate.
	Params params
	// Labels can be used to enable or disable a workload.
	Labels []string
	// Threshold is compared to the average value of the metric specified by ThresholdMetricSelector.
	// The comparison is performed for ops with CollectMetrics set to true.
	// If the measured value violates the threshold (below it by default, above it
	// when ExpectLower is set), the workload's test case fails.
	// If set to zero, the threshold check is disabled.
	// Optional
	Threshold float64
	// ThresholdMetricSelector defines to what metric the Threshold should be compared.
	// If nil, the metric is set to the DefaultThresholdMetricSelector of the testCase.
	// If DefaultThresholdMetricSelector is nil, the metric is set to "SchedulingThroughput".
	// Optional
	ThresholdMetricSelector *thresholdMetricSelector
	// Feature gates to set before running the workload.
	// Explicitly setting a feature in this map overrides the test case settings.
	// Optional
	FeatureGates map[featuregate.Feature]bool
}

func (w *workload) isValid(mcc *metricsCollectorConfig) error {
	if w.Threshold < 0 {
		return fmt.Errorf("invalid Threshold=%f; should be non-negative", w.Threshold)
	}

	return w.ThresholdMetricSelector.isValid(mcc)
}

func (w *workload) setDefaults(testCaseThresholdMetricSelector *thresholdMetricSelector) {
	if w.ThresholdMetricSelector != nil {
		return
	}
	if testCaseThresholdMetricSelector != nil {
		w.ThresholdMetricSelector = testCaseThresholdMetricSelector
		return
	}
	// By default, SchedulingThroughput should be compared with the threshold.
	w.ThresholdMetricSelector = &thresholdMetricSelector{
		Name: "SchedulingThroughput",
	}
}

// thresholdMetricSelector defines the name and labels of the metric to compare with the threshold.
type thresholdMetricSelector struct {
	// Name of the metric; it is compared to the "Metric" field in DataItem labels.
	Name string
	// Labels of the metric. All of them need to match the metric's labels to assume equality.
	Labels map[string]string
	// ExpectLower defines whether the threshold denotes the maximum allowable value of the metric.
	// If false, the threshold defines the minimum allowable value.
	// Optional
	ExpectLower bool
}
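
// For instance (values are illustrative, assuming the default metrics collector
// config above), a workload could gate on scheduling attempt latency instead of
// throughput:
//
//	thresholdMetricSelector:
//	  name: scheduler_scheduling_attempt_duration_seconds
//	  labels:
//	    result: scheduled
//	  expectLower: true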

func (ms thresholdMetricSelector) isValid(mcc *metricsCollectorConfig) error {
	if ms.Name == "SchedulingThroughput" {
		return nil
	}

	if mcc == nil {
		mcc = &defaultMetricsCollectorConfig
	}

	labels, ok := mcc.Metrics[ms.Name]
	if !ok {
		return fmt.Errorf("the metric %v is targeted, but it's not collected during the test. Make sure the MetricsCollectorConfig is valid", ms.Name)
	}

	for _, labelsComb := range uniqueLVCombos(labels) {
		if labelsMatch(labelsComb, ms.Labels) {
			return nil
		}
	}
	return fmt.Errorf("no matching labels found for metric %v", ms.Name)
}

type params struct {
	params map[string]any
	// isUsed records whether a parameter was used or not.
	isUsed map[string]bool
}

// UnmarshalJSON is a custom unmarshaler for params.
//
// from (json):
//
//	{
//		"initNodes": 500,
//		"initPods": 50
//	}
//
// to:
//
//	params{
//		params: map[string]any{
//			"initNodes": 500,
//			"initPods": 50,
//		},
//		isUsed: map[string]bool{}, // empty map
//	}
func (p *params) UnmarshalJSON(b []byte) error {
	aux := map[string]any{}

	if err := json.Unmarshal(b, &aux); err != nil {
		return err
	}

	p.params = aux
	p.isUsed = map[string]bool{}
	return nil
}

// get retrieves the parameter as an integer.
func (p params) get(key string) (int, error) {
	// JSON unmarshals integer constants in an "any" field as float64.
	f, err := getParam[float64](p, key)
	if err != nil {
		return 0, err
	}
	return int(f), nil
}

// getParam retrieves the parameter as a specific type. There is no conversion,
// so in practice this means that only types that JSON unmarshaling uses
// (float64, string, bool) work.
func getParam[T float64 | string | bool](p params, key string) (T, error) {
	p.isUsed[key] = true
	param, ok := p.params[key]
	var t T
	if !ok {
		return t, fmt.Errorf("parameter %s is undefined", key)
	}
	t, ok = param.(T)
	if !ok {
		return t, fmt.Errorf("parameter %s has the wrong type %T", key, param)
	}
	return t, nil
}
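
// A usage sketch (parameter names are made up): given workload params parsed from
//
//	{"initNodes": 500, "sleepDuration": "5s", "enableChurn": true}
//
// typed values can be retrieved like this:
//
//	nodes, err := w.Params.get("initNodes")                      // int, via float64
//	duration, err := getParam[string](w.Params, "sleepDuration")
//	churn, err := getParam[bool](w.Params, "enableChurn")
//
// Every key read this way is marked in isUsed, so anything left over shows up
// in w.unusedParams() and can be flagged as a stale config entry.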

// unusedParams returns the names of unused parameters.
func (w workload) unusedParams() []string {
	var ret []string
	for name := range w.Params.params {
		if !w.Params.isUsed[name] {
			ret = append(ret, name)
		}
	}
	return ret
}

// op is a wrapper struct that stores the real op in itself.
type op struct {
	realOp realOp
}

// UnmarshalJSON is a custom unmarshaler for the op struct since we don't know
// which op we're decoding at runtime.
func (op *op) UnmarshalJSON(b []byte) error {
	possibleOps := map[operationCode]realOp{
		allocResourceClaimsOpcode:    &allocResourceClaimsOp{},
		createAnyOpcode:              &createAny{},
		createNodesOpcode:            &createNodesOp{},
		createNamespacesOpcode:       &createNamespacesOp{},
		createPodsOpcode:             &createPodsOp{},
		createPodSetsOpcode:          &createPodSetsOp{},
		deletePodsOpcode:             &deletePodsOp{},
		createResourceClaimsOpcode:   &createResourceClaimsOp{},
		createResourceDriverOpcode:   &createResourceDriverOp{},
		churnOpcode:                  &churnOp{},
		updateAnyOpcode:              &updateAny{},
		barrierOpcode:                &barrierOp{},
		sleepOpcode:                  &sleepOp{},
		startCollectingMetricsOpcode: &startCollectingMetricsOp{},
		stopCollectingMetricsOpcode:  &stopCollectingMetricsOp{},
		// TODO(#94601): add a delete nodes op to simulate scaling behaviour?
	}
	// First determine the opcode using lenient decoding (= ignore extra fields).
	var possibleOp struct {
		Opcode operationCode
	}
	if err := json.Unmarshal(b, &possibleOp); err != nil {
		return fmt.Errorf("decoding opcode from %s: %w", string(b), err)
	}
	realOp, ok := possibleOps[possibleOp.Opcode]
	if !ok {
		return fmt.Errorf("unknown opcode %q in %s", possibleOp.Opcode, string(b))
	}
	decoder := json.NewDecoder(bytes.NewReader(b))
	decoder.DisallowUnknownFields()
	if err := decoder.Decode(realOp); err != nil {
		return fmt.Errorf("decoding %s into %T: %w", string(b), realOp, err)
	}
	if err := realOp.isValid(true); err != nil {
		return fmt.Errorf("%s not valid for %T: %w", string(b), realOp, err)
	}
	op.realOp = realOp
	return nil
}
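
// For illustration (names and counts are made up), a workloadTemplate entry such as
//
//	workloadTemplate:
//	- opcode: createNodes
//	  countParam: $initNodes
//	- opcode: createPods
//	  countParam: $measurePods
//	  collectMetrics: true
//
// is decoded in two passes by UnmarshalJSON above: first leniently, to pick the
// concrete op type from the opcode, then strictly (DisallowUnknownFields) into
// that type.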

// realOp is an interface that is implemented by different structs. To evaluate
// the validity of ops at parse-time, an isValid function must be implemented.
type realOp interface {
	// isValid verifies the validity of the op args such as node/pod count. Note
	// that we don't catch undefined parameters at this stage.
	//
	// This returns errInvalidOp if the configured operation does not match.
	isValid(allowParameterization bool) error
	// collectsMetrics checks if the op collects metrics.
	collectsMetrics() bool
	// patchParams returns a patched realOp of the same type after substituting
	// parameterizable values with workload-specific values. One should implement
	// this method on the value receiver base type, not a pointer receiver base
	// type, even though calls will be made with a *realOp. This is because
	// callers don't want the receiver to inadvertently modify the realOp
	// (instead, it's returned as a return value).
	patchParams(w *workload) (realOp, error)
}

// runnableOp is an interface implemented by some operations. It makes it possible
// to execute the operation without having to add separate code into runWorkload.
type runnableOp interface {
	realOp

	// requiredNamespaces returns all namespaces that runWorkload must create
	// before running the operation.
	requiredNamespaces() []string
	// run executes the steps provided by the operation.
	run(ktesting.TContext)
}

func isValidParameterizable(val string) bool {
	return strings.HasPrefix(val, "$")
}

func isValidCount(allowParameterization bool, count int, countParam string) bool {
	if !allowParameterization || countParam == "" {
		// Ignore the parameter. The value itself must be okay.
		return count >= 0
	}
	return isValidParameterizable(countParam)
}
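
// To sketch the parameterization convention (values are made up): a "$"-prefixed
// CountParam such as "$initNodes" refers to a workload parameter; patchParams
// strips the "$" and looks the rest up in the workload's params, so
//
//	params:
//	  initNodes: 500
//
// turns {CountParam: "$initNodes"} into {Count: 500} before the op runs.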

// createNodesOp defines an op where nodes are created as a part of a workload.
type createNodesOp struct {
	// Must be "createNodes".
	Opcode operationCode
	// Number of nodes to create. Parameterizable through CountParam.
	Count int
	// Template parameter for Count.
	CountParam string
	// Path to the spec file describing the nodes to create.
	// Optional
	NodeTemplatePath *string
	// At most one of the following strategies can be defined. Defaults
	// to TrivialNodePrepareStrategy if unspecified.
	// Optional
	NodeAllocatableStrategy  *testutils.NodeAllocatableStrategy
	LabelNodePrepareStrategy *testutils.LabelNodePrepareStrategy
	UniqueNodeLabelStrategy  *testutils.UniqueNodeLabelStrategy
}

func (cno *createNodesOp) isValid(allowParameterization bool) error {
	if !isValidCount(allowParameterization, cno.Count, cno.CountParam) {
		return fmt.Errorf("invalid Count=%d / CountParam=%q", cno.Count, cno.CountParam)
	}
	return nil
}

func (*createNodesOp) collectsMetrics() bool {
	return false
}

func (cno createNodesOp) patchParams(w *workload) (realOp, error) {
	if cno.CountParam != "" {
		var err error
		cno.Count, err = w.Params.get(cno.CountParam[1:])
		if err != nil {
			return nil, err
		}
	}
	return &cno, (&cno).isValid(false)
}

// createNamespacesOp defines an op for creating namespaces.
type createNamespacesOp struct {
	// Must be "createNamespaces".
	Opcode operationCode
	// Name prefix of the namespaces. The format is "<prefix>-<number>", where number is
	// between 0 and count-1.
	Prefix string
	// Number of namespaces to create. Parameterizable through CountParam.
	Count int
	// Template parameter for Count. Takes precedence over Count if both are set.
	CountParam string
	// Path to the spec file describing the namespaces to create.
	// Optional
	NamespaceTemplatePath *string
}

func (cmo *createNamespacesOp) isValid(allowParameterization bool) error {
	if !isValidCount(allowParameterization, cmo.Count, cmo.CountParam) {
		return fmt.Errorf("invalid Count=%d / CountParam=%q", cmo.Count, cmo.CountParam)
	}
	return nil
}

func (*createNamespacesOp) collectsMetrics() bool {
	return false
}

func (cmo createNamespacesOp) patchParams(w *workload) (realOp, error) {
	if cmo.CountParam != "" {
		var err error
		cmo.Count, err = w.Params.get(cmo.CountParam[1:])
		if err != nil {
			return nil, err
		}
	}
	return &cmo, (&cmo).isValid(false)
}

// createPodsOp defines an op where pods are scheduled as a part of a workload.
// The test can block on the completion of this op before moving forward or
// continue asynchronously.
type createPodsOp struct {
	// Must be "createPods".
	Opcode operationCode
	// Number of pods to schedule. Parameterizable through CountParam.
	Count int
	// Template parameter for Count.
	CountParam string
	// If false, Count pods get created rapidly. This can be used to
	// measure how quickly the scheduler can fill up a cluster.
	//
	// If true, Count pods get created; the operation then waits for
	// a pod to get scheduled, deletes it, and creates another.
	// This continues until the configured Duration is over.
	// Metrics collection, if enabled, runs in parallel.
	//
	// This mode can be used to measure how the scheduler behaves
	// in a steady state where the cluster is always at roughly the
	// same level of utilization. Pods can be created in a separate,
	// earlier operation to simulate non-empty clusters.
	//
	// Note that the operation will delete any scheduled pod in
	// the namespace, so use different namespaces for pods that
	// are supposed to be kept running.
	SteadyState bool
	// How long to keep the cluster in a steady state.
	Duration metav1.Duration
	// Template parameter for Duration.
	DurationParam string
	// Whether or not to enable metrics collection for this createPodsOp.
	// Optional. Both CollectMetrics and SkipWaitToCompletion cannot be true at
	// the same time for a particular createPodsOp.
	CollectMetrics bool
	// Namespace the pods should be created in. Defaults to a unique
	// namespace of the format "namespace-<number>".
	// Optional
	Namespace *string
	// Path to the spec file describing the pods to schedule.
	// If nil, DefaultPodTemplatePath will be used.
	// Optional
	PodTemplatePath *string
	// Whether or not to wait for all pods in this op to get scheduled.
	// Defaults to false if not specified.
	// Optional
	SkipWaitToCompletion bool
	// Persistent volume settings for the pods to be scheduled.
	// Optional
	PersistentVolumeTemplatePath      *string
	PersistentVolumeClaimTemplatePath *string
}
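
// An illustrative createPods entry (counts and paths are made up):
//
//	- opcode: createPods
//	  countParam: $measurePods
//	  podTemplatePath: templates/pod-default.yaml
//	  collectMetrics: true
//
// and a steady-state variant, which keeps replacing one scheduled pod at a time
// for the given duration:
//
//	- opcode: createPods
//	  count: 100
//	  steadyState: true
//	  durationParam: $duration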

func (cpo *createPodsOp) isValid(allowParameterization bool) error {
	if !isValidCount(allowParameterization, cpo.Count, cpo.CountParam) {
		return fmt.Errorf("invalid Count=%d / CountParam=%q", cpo.Count, cpo.CountParam)
	}
	if cpo.CollectMetrics && cpo.SkipWaitToCompletion {
		// While it's technically possible to achieve this, the additional
		// complexity is not worth it, especially given that we don't have any
		// use-cases right now.
		return fmt.Errorf("collectMetrics and skipWaitToCompletion cannot be true at the same time")
	}
	if cpo.SkipWaitToCompletion && cpo.SteadyState {
		return errors.New("skipWaitToCompletion and steadyState cannot be true at the same time")
	}
	return nil
}

func (cpo *createPodsOp) collectsMetrics() bool {
	return cpo.CollectMetrics
}

func (cpo createPodsOp) patchParams(w *workload) (realOp, error) {
	if cpo.CountParam != "" {
		var err error
		cpo.Count, err = w.Params.get(cpo.CountParam[1:])
		if err != nil {
			return nil, err
		}
	}
	if cpo.DurationParam != "" {
		durationStr, err := getParam[string](w.Params, cpo.DurationParam[1:])
		if err != nil {
			return nil, err
		}
		if cpo.Duration.Duration, err = time.ParseDuration(durationStr); err != nil {
			return nil, fmt.Errorf("parsing duration parameter %s: %w", cpo.DurationParam, err)
		}
	}
	return &cpo, (&cpo).isValid(false)
}

// createPodSetsOp defines an op where a set of createPodsOps is created in each unique namespace.
type createPodSetsOp struct {
	// Must be "createPodSets".
	Opcode operationCode
	// Number of sets to create.
	Count int
	// Template parameter for Count.
	CountParam string
	// Each set of pods will be created in a namespace of the form namespacePrefix-<number>,
	// where number is from 0 to count-1.
	NamespacePrefix string
	// The template of a createPodsOp.
	CreatePodsOp createPodsOp
}

func (cpso *createPodSetsOp) isValid(allowParameterization bool) error {
	if !isValidCount(allowParameterization, cpso.Count, cpso.CountParam) {
		return fmt.Errorf("invalid Count=%d / CountParam=%q", cpso.Count, cpso.CountParam)
	}
	return cpso.CreatePodsOp.isValid(allowParameterization)
}

func (cpso *createPodSetsOp) collectsMetrics() bool {
	return cpso.CreatePodsOp.CollectMetrics
}

func (cpso createPodSetsOp) patchParams(w *workload) (realOp, error) {
	if cpso.CountParam != "" {
		var err error
		cpso.Count, err = w.Params.get(cpso.CountParam[1:])
		if err != nil {
			return nil, err
		}
	}
	return &cpso, (&cpso).isValid(true)
}

// deletePodsOp defines an op where previously created pods are deleted.
// The test can block on the completion of this op before moving forward or
// continue asynchronously.
type deletePodsOp struct {
	// Must be "deletePods".
	Opcode operationCode
	// Namespace the pods should be deleted from.
	Namespace string
	// Labels used to filter the pods to delete.
	// If empty, it will delete all pods in the namespace.
	// Optional.
	LabelSelector map[string]string
	// Whether or not to wait for all pods in this op to be deleted.
	// Defaults to false if not specified.
	// Optional
	SkipWaitToCompletion bool
	// Number of pods to be deleted per second.
	// If zero, all pods are deleted at once.
	// Optional
	DeletePodsPerSecond int
}

func (dpo *deletePodsOp) isValid(allowParameterization bool) error {
	if dpo.Opcode != deletePodsOpcode {
		return fmt.Errorf("invalid opcode %q; expected %q", dpo.Opcode, deletePodsOpcode)
	}
	if dpo.DeletePodsPerSecond < 0 {
		return fmt.Errorf("invalid DeletePodsPerSecond=%d; should be non-negative", dpo.DeletePodsPerSecond)
	}
	return nil
}

func (dpo *deletePodsOp) collectsMetrics() bool {
	return false
}

func (dpo deletePodsOp) patchParams(w *workload) (realOp, error) {
	return &dpo, nil
}

// churnOp defines an op where API objects are continuously created and/or
// deleted as a part of a workload, to generate churn.
type churnOp struct {
	// Must be "churn".
	Opcode operationCode
	// Mode must be one of the following:
	// - recreate. In this mode, API objects will be created for N cycles, and then
	//   deleted in the next N cycles. N is specified by the "Number" field.
	// - create. In this mode, API objects will be created (without deletion) until
	//   reaching a threshold, which is specified by the "Number" field.
	Mode string
	// Maximum number of API objects to be created.
	// Defaults to 0, which means unlimited.
	Number int
	// Interval of churning. Defaults to 500 milliseconds.
	IntervalMilliseconds int64
	// Namespace the churning objects should be created in. Defaults to a unique
	// namespace of the format "namespace-<number>".
	// Optional
	Namespace *string
	// Paths of API spec files.
	TemplatePaths []string
}
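
// A sketch of a churn entry (the template path is hypothetical):
//
//	- opcode: churn
//	  mode: recreate
//	  number: 1
//	  templatePaths:
//	  - templates/churn/service-default.yaml
//	  intervalMilliseconds: 1000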

func (co *churnOp) isValid(_ bool) error {
	if co.Mode != Recreate && co.Mode != Create {
		return fmt.Errorf("invalid mode: %v. must be one of %v", co.Mode, []string{Recreate, Create})
	}
	if co.Number < 0 {
		return fmt.Errorf("number (%v) cannot be negative", co.Number)
	}
	if co.Mode == Recreate && co.Number == 0 {
		return fmt.Errorf("number cannot be 0 when mode is %v", Recreate)
	}
	if len(co.TemplatePaths) == 0 {
		return fmt.Errorf("at least one template spec file needs to be specified")
	}
	return nil
}

func (*churnOp) collectsMetrics() bool {
	return false
}

func (co churnOp) patchParams(w *workload) (realOp, error) {
	return &co, nil
}

type SchedulingStage string

const (
	Scheduled SchedulingStage = "Scheduled"
	Attempted SchedulingStage = "Attempted"
)

// barrierOp defines an op that can be used to wait until all scheduled pods of
// one or more namespaces have been bound to nodes. This is useful when pods
// were scheduled with SkipWaitToCompletion set to true.
type barrierOp struct {
	// Must be "barrier".
	Opcode operationCode
	// Namespaces to block on. An empty array or not specifying this field signifies
	// that the barrier should block on all namespaces.
	Namespaces []string
	// Labels used to filter the pods to block on.
	// If empty, no label filtering is applied.
	// Optional.
	LabelSelector map[string]string
	// Determines what stage of pod scheduling the barrier should wait for.
	// If empty, it is interpreted as "Scheduled".
	// Optional
	StageRequirement SchedulingStage
}
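
// Illustrative use (namespace names are made up): create pods asynchronously,
// then hold the workload until every one of them has at least been attempted:
//
//	- opcode: createPods
//	  count: 1000
//	  namespace: init-ns
//	  skipWaitToCompletion: true
//	- opcode: barrier
//	  namespaces: [init-ns]
//	  stageRequirement: Attempted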

func (bo *barrierOp) isValid(allowParameterization bool) error {
	if bo.StageRequirement != "" && bo.StageRequirement != Scheduled && bo.StageRequirement != Attempted {
		return fmt.Errorf("invalid StageRequirement %s", bo.StageRequirement)
	}
	return nil
}

func (*barrierOp) collectsMetrics() bool {
	return false
}

func (bo barrierOp) patchParams(w *workload) (realOp, error) {
	if bo.StageRequirement == "" {
		bo.StageRequirement = Scheduled
	}
	return &bo, nil
}

// sleepOp defines an op that can be used to sleep for a specified amount of time.
// This is useful in simulating workloads that require some sort of time-based synchronisation.
type sleepOp struct {
	// Must be "sleep".
	Opcode operationCode
	// Duration of the sleep.
	Duration metav1.Duration
	// Template parameter for Duration.
	DurationParam string
}
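
// For example (the duration value is made up):
//
//	- opcode: sleep
//	  duration: 5s
//
// or parameterized per workload via "durationParam: $sleepDuration".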

func (so *sleepOp) isValid(_ bool) error {
	return nil
}

func (so *sleepOp) collectsMetrics() bool {
	return false
}

func (so sleepOp) patchParams(w *workload) (realOp, error) {
	if so.DurationParam != "" {
		durationStr, err := getParam[string](w.Params, so.DurationParam[1:])
		if err != nil {
			return nil, err
		}
		if so.Duration.Duration, err = time.ParseDuration(durationStr); err != nil {
			return nil, fmt.Errorf("invalid duration parameter %s: %w", so.DurationParam, err)
		}
	}
	return &so, nil
}

// startCollectingMetricsOp defines an op that starts metrics collectors.
// stopCollectingMetricsOp has to be used after this op to finish collecting.
type startCollectingMetricsOp struct {
	// Must be "startCollectingMetrics".
	Opcode operationCode
	// Name appended to the workload's name in the results.
	Name string
	// Namespaces for which the scheduling throughput metric is calculated.
	Namespaces []string
	// Labels used to filter the pods for which the scheduling throughput metric is collected.
	// If empty, it will collect the metric for all pods in the selected namespaces.
	// Optional.
	LabelSelector map[string]string
}

func (scm *startCollectingMetricsOp) isValid(_ bool) error {
	if len(scm.Namespaces) == 0 {
		return fmt.Errorf("namespaces cannot be empty")
	}
	return nil
}

func (*startCollectingMetricsOp) collectsMetrics() bool {
	return false
}

func (scm startCollectingMetricsOp) patchParams(_ *workload) (realOp, error) {
	return &scm, nil
}

// stopCollectingMetricsOp defines an op that stops collecting the metrics
// and writes them into the result slice.
// startCollectingMetricsOp has to be used before this op to begin collecting.
type stopCollectingMetricsOp struct {
	// Must be "stopCollectingMetrics".
	Opcode operationCode
}

func (scm *stopCollectingMetricsOp) isValid(_ bool) error {
	return nil
}

func (*stopCollectingMetricsOp) collectsMetrics() bool {
	return true
}

func (scm stopCollectingMetricsOp) patchParams(_ *workload) (realOp, error) {
	return &scm, nil
}
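
// An illustrative pairing (names are made up): collect metrics around an
// asynchronous create by bracketing it explicitly:
//
//	- opcode: startCollectingMetrics
//	  name: high-priority
//	  namespaces: [ns-1]
//	- opcode: createPods
//	  count: 500
//	  namespace: ns-1
//	  skipWaitToCompletion: true
//	- opcode: barrier
//	  namespaces: [ns-1]
//	- opcode: stopCollectingMetrics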

func initTestOutput(tb testing.TB) io.Writer {
	var output io.Writer
	if *UseTestingLog {
		output = framework.NewTBWriter(tb)
	} else {
		tmpDir := tb.TempDir()
		logfileName := path.Join(tmpDir, "output.log")
		fileOutput, err := os.Create(logfileName)
		if err != nil {
			tb.Fatalf("create log file: %v", err)
		}
		output = fileOutput

		tb.Cleanup(func() {
			// Dump the log output when the test is done. The user
			// can decide how much of it will be visible in case of
			// success: then "go test" truncates, "go test -v"
			// doesn't. All of it will be shown for a failure.
			if err := fileOutput.Close(); err != nil {
				tb.Fatalf("close log file: %v", err)
			}
			log, err := os.ReadFile(logfileName)
			if err != nil {
				tb.Fatalf("read log file: %v", err)
			}
			tb.Logf("full log output:\n%s", string(log))
		})
	}
	return output
}

var specialFilenameChars = regexp.MustCompile(`[^a-zA-Z0-9-_]`)

func setupTestCase(t testing.TB, tc *testCase, featureGates map[featuregate.Feature]bool, output io.Writer, outOfTreePluginRegistry frameworkruntime.Registry) (informers.SharedInformerFactory, ktesting.TContext) {
	tCtx := ktesting.Init(t, initoption.PerTestOutput(*UseTestingLog))
	artifacts, doArtifacts := os.LookupEnv("ARTIFACTS")
	if !*UseTestingLog && doArtifacts {
		// Reconfigure logging so that it goes to a separate file per
		// test instead of stderr. If the test passes, the file gets
		// deleted. The overall output can be very large (> 200 MB for
		// ci-benchmark-scheduler-perf-master). With this approach, we
		// have log output for failures without having to store large
		// amounts of data that no-one is looking at. The performance
		// is the same as writing to stderr.
		if err := logsapi.ResetForTest(LoggingFeatureGate); err != nil {
			t.Fatalf("Failed to reset the logging configuration: %v", err)
		}
		logfileName := path.Join(artifacts, specialFilenameChars.ReplaceAllString(t.Name(), "_")+".log")
		out, err := os.Create(logfileName)
		if err != nil {
			t.Fatalf("Failed to create per-test log output file: %v", err)
		}
		t.Cleanup(func() {
			// Everything should have stopped by now, checked below
			// by GoleakCheck (which runs first during test
			// shutdown!). Therefore we can clean up. Errors get logged
			// and fail the test, but cleanup tries to continue.
			//
			// Note that the race detector will flag any goroutine
			// as causing a race if there is no explicit wait for
			// that goroutine to stop. We know that they must have
			// stopped (GoleakCheck!) but the race detector
			// doesn't.
			//
			// This is a major issue because many Kubernetes goroutines get
			// started without waiting for them to stop :-(
			if err := logsapi.ResetForTest(LoggingFeatureGate); err != nil {
				t.Errorf("Failed to reset the logging configuration: %v", err)
			}
			if err := out.Close(); err != nil {
				t.Errorf("Failed to close the per-test log output file: %s: %v", logfileName, err)
			}
			if !t.Failed() {
				if err := os.Remove(logfileName); err != nil {
					t.Errorf("Failed to remove the per-test log output file: %v", err)
				}
			}
		})
		opts := &logsapi.LoggingOptions{
			ErrorStream: out,
			InfoStream:  out,
		}
		if err := logsapi.ValidateAndApplyWithOptions(LoggingConfig, opts, LoggingFeatureGate); err != nil {
			t.Fatalf("Failed to apply the per-test logging configuration: %v", err)
		}
	}

	// Ensure that there are no leaked goroutines. They could influence the
	// performance of the next benchmark. This must run *after* RedirectKlog
	// because then, during cleanup, the test will wait for goroutines to
	// quit *before* restoring the klog settings.
	framework.GoleakCheck(t)

	// Now that we are ready to run, start a brand new etcd.
	framework.StartEtcd(t, output, true)

	for feature, flag := range featureGates {
		featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, feature, flag)
	}

	// 30 minutes should be plenty, even for the 5000-node tests.
	timeout := 30 * time.Minute
	tCtx = ktesting.WithTimeout(tCtx, timeout, fmt.Sprintf("timed out after the %s per-test timeout", timeout))

	return setupClusterForWorkload(tCtx, tc.SchedulerConfigPath, featureGates, outOfTreePluginRegistry)
}

func featureGatesMerge(src map[featuregate.Feature]bool, overrides map[featuregate.Feature]bool) map[featuregate.Feature]bool {
	if len(src) == 0 {
		return maps.Clone(overrides)
	}
	result := maps.Clone(src)
	for feature, enabled := range overrides {
		result[feature] = enabled
	}
	return result
}
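
// A quick illustration (feature names are just placeholders): with test case
// gates {A: true, B: false} and workload gates {B: true}, featureGatesMerge
// returns {A: true, B: true}: workload settings win, and neither input map is
// mutated because both paths go through maps.Clone.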

// RunBenchmarkPerfScheduling runs the scheduler performance benchmark tests.
//
// You can pass your own scheduler plugins via outOfTreePluginRegistry.
// Also, you may want to add your plugins to the PluginNames variable in this package
// to collect metrics for them.
func RunBenchmarkPerfScheduling(b *testing.B, configFile string, topicName string, outOfTreePluginRegistry frameworkruntime.Registry) {
	testCases, err := getTestCases(configFile)
	if err != nil {
		b.Fatal(err)
	}
	if err = validateTestCases(testCases); err != nil {
		b.Fatal(err)
	}

	if testing.Short() {
		*PerfSchedulingLabelFilter += ",+short"
	}
	testcaseLabelSelectors := strings.Split(*PerfSchedulingLabelFilter, ",")

	output := initTestOutput(b)

	// Because we run sequentially, it is possible to change the global
	// klog logger and redirect log output. Quite a lot of code still uses
	// it instead of supporting contextual logging.
	//
	// Because we leak one goroutine which calls klog, we cannot restore
	// the previous state.
	_ = framework.RedirectKlog(b, output)

	dataItems := DataItems{Version: "v1"}
	for _, tc := range testCases {
		b.Run(tc.Name, func(b *testing.B) {
			for _, w := range tc.Workloads {
				b.Run(w.Name, func(b *testing.B) {
					if !enabled(testcaseLabelSelectors, append(tc.Labels, w.Labels...)...) {
						b.Skipf("disabled by label filter %q", *PerfSchedulingLabelFilter)
					}

					featureGates := featureGatesMerge(tc.FeatureGates, w.FeatureGates)
					informerFactory, tCtx := setupTestCase(b, tc, featureGates, output, outOfTreePluginRegistry)

					results := runWorkload(tCtx, tc, w, informerFactory)
					dataItems.DataItems = append(dataItems.DataItems, results...)

					if len(results) > 0 {
						// The default ns/op is not useful because it includes
						// the time spent on initialization and shutdown. Here
						// we suppress it.
						b.ReportMetric(0, "ns/op")

						// Instead, report the same results that also get
						// stored in the JSON file.
						for _, result := range results {
							// For some metrics like
							// scheduler_framework_extension_point_duration_seconds
							// the actual value has some other unit. We patch
							// the key to make it look right.
							metric := strings.ReplaceAll(result.Labels["Metric"], "_seconds", "_"+result.Unit)
							for key, value := range result.Data {
								b.ReportMetric(value, metric+"/"+key)
							}
						}
					}

					if featureGates[features.SchedulerQueueingHints] {
						// In any case, we should make sure InFlightEvents is empty after running the scenario.
						if err = checkEmptyInFlightEvents(); err != nil {
							tCtx.Errorf("%s: %s", w.Name, err)
						}
					}

					// Reset metrics to prevent metrics generated in the current
					// workload from being carried over to the next workload.
					legacyregistry.Reset()

					// Exactly one result is expected to contain the progress information.
					for _, item := range results {
						if len(item.progress) == 0 {
							continue
						}

						destFile, err := dataFilename(strings.ReplaceAll(fmt.Sprintf("%s_%s_%s_%s.dat", tc.Name, w.Name, topicName, runID), "/", "_"))
						if err != nil {
							b.Fatalf("prepare data file: %v", err)
						}
						f, err := os.Create(destFile)
						if err != nil {
							b.Fatalf("create data file: %v", err)
						}

						// Print progress over time.
						for _, sample := range item.progress {
							fmt.Fprintf(f, "%.1fs %d %d %d %f\n", sample.ts.Sub(item.start).Seconds(), sample.completed, sample.attempts, sample.observedTotal, sample.observedRate)
						}
						if err := f.Close(); err != nil {
							b.Fatalf("closing data file: %v", err)
						}
					}
				})
			}
		})
	}
	if err := dataItems2JSONFile(dataItems, b.Name()+"_benchmark_"+topicName); err != nil {
		b.Fatalf("unable to write measured data %+v: %v", dataItems, err)
	}
}
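
// An illustrative caller (the config file name is hypothetical), as it might
// appear in a config subdirectory's benchmark file:
//
//	func BenchmarkPerfScheduling(b *testing.B) {
//		RunBenchmarkPerfScheduling(b, "performance-config.yaml", "misc", nil)
//	}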

// RunIntegrationPerfScheduling runs the scheduler performance integration tests.
func RunIntegrationPerfScheduling(t *testing.T, configFile string) {
	testCases, err := getTestCases(configFile)
	if err != nil {
		t.Fatal(err)
	}
	if err = validateTestCases(testCases); err != nil {
		t.Fatal(err)
	}

	if testing.Short() {
		*TestSchedulingLabelFilter += ",+short"
	}
	testcaseLabelSelectors := strings.Split(*TestSchedulingLabelFilter, ",")

	for _, tc := range testCases {
		t.Run(tc.Name, func(t *testing.T) {
			for _, w := range tc.Workloads {
				t.Run(w.Name, func(t *testing.T) {
					if !enabled(testcaseLabelSelectors, append(tc.Labels, w.Labels...)...) {
						t.Skipf("disabled by label filter %q", *TestSchedulingLabelFilter)
					}
					featureGates := featureGatesMerge(tc.FeatureGates, w.FeatureGates)
					informerFactory, tCtx := setupTestCase(t, tc, featureGates, nil, nil)

					runWorkload(tCtx, tc, w, informerFactory)

					if featureGates[features.SchedulerQueueingHints] {
						// In any case, we should make sure InFlightEvents is empty after running the scenario.
						if err = checkEmptyInFlightEvents(); err != nil {
							tCtx.Errorf("%s: %s", w.Name, err)
						}
					}

					// Reset metrics to prevent metrics generated in the current
					// workload from being carried over to the next workload.
					legacyregistry.Reset()
				})
			}
		})
	}
}

func loadSchedulerConfig(file string) (*config.KubeSchedulerConfiguration, error) {
	data, err := os.ReadFile(file)
	if err != nil {
		return nil, err
	}
	// The UniversalDecoder runs defaulting and returns the internal type by default.
	obj, gvk, err := scheme.Codecs.UniversalDecoder().Decode(data, nil, nil)
	if err != nil {
		return nil, err
	}
	if cfgObj, ok := obj.(*config.KubeSchedulerConfiguration); ok {
		return cfgObj, nil
	}
	return nil, fmt.Errorf("couldn't decode as KubeSchedulerConfiguration, got %s", gvk)
}

func unrollWorkloadTemplate(tb ktesting.TB, wt []op, w *workload) []op {
	var unrolled []op
	for opIndex, o := range wt {
		realOp, err := o.realOp.patchParams(w)
		if err != nil {
			tb.Fatalf("op %d: %v", opIndex, err)
		}
		switch concreteOp := realOp.(type) {
		case *createPodSetsOp:
			tb.Logf("Creating %d pod sets %s", concreteOp.Count, concreteOp.CountParam)
			for i := 0; i < concreteOp.Count; i++ {
				copy := concreteOp.CreatePodsOp
				ns := fmt.Sprintf("%s-%d", concreteOp.NamespacePrefix, i)
				copy.Namespace = &ns
				unrolled = append(unrolled, op{realOp: &copy})
			}
		default:
			unrolled = append(unrolled, o)
		}
	}
	return unrolled
}

func setupClusterForWorkload(tCtx ktesting.TContext, configPath string, featureGates map[featuregate.Feature]bool, outOfTreePluginRegistry frameworkruntime.Registry) (informers.SharedInformerFactory, ktesting.TContext) {
	var cfg *config.KubeSchedulerConfiguration
	var err error
	if configPath != "" {
		cfg, err = loadSchedulerConfig(configPath)
		if err != nil {
			tCtx.Fatalf("error loading scheduler config file: %v", err)
		}
		if err = validation.ValidateKubeSchedulerConfiguration(cfg); err != nil {
			tCtx.Fatalf("validate scheduler config file failed: %v", err)
		}
	}
	return mustSetupCluster(tCtx, cfg, featureGates, outOfTreePluginRegistry)
}

func labelsMatch(actualLabels, requiredLabels map[string]string) bool {
	for requiredLabel, requiredValue := range requiredLabels {
		actualValue, ok := actualLabels[requiredLabel]
		if !ok || requiredValue != actualValue {
			return false
		}
	}
	return true
}

func valueWithinThreshold(value, threshold float64, expectLower bool) bool {
	if expectLower {
		return value < threshold
	}
	return value > threshold
}

func compareMetricWithThreshold(items []DataItem, threshold float64, metricSelector thresholdMetricSelector) error {
	if threshold == 0 {
		return nil
	}
	for _, item := range items {
		if item.Labels["Metric"] == metricSelector.Name && labelsMatch(item.Labels, metricSelector.Labels) && !valueWithinThreshold(item.Data["Average"], threshold, metricSelector.ExpectLower) {
			if metricSelector.ExpectLower {
				return fmt.Errorf("%s: expected %s Average to be lower: got %f, want %f", item.Labels["Name"], metricSelector.Name, item.Data["Average"], threshold)
			}
			return fmt.Errorf("%s: expected %s Average to be higher: got %f, want %f", item.Labels["Name"], metricSelector.Name, item.Data["Average"], threshold)
		}
	}
	return nil
}

func checkEmptyInFlightEvents() error {
	labels := append(schedframework.AllClusterEventLabels(), metrics.PodPoppedInFlightEvent)
	for _, label := range labels {
		value, err := testutil.GetGaugeMetricValue(metrics.InFlightEvents.WithLabelValues(label))
		if err != nil {
			return fmt.Errorf("failed to get InFlightEvents metric for label %s", label)
		}
		if value > 0 {
			return fmt.Errorf("InFlightEvents for label %s should be empty, but has %v items", label, value)
		}
	}
	return nil
}

func startCollectingMetrics(tCtx ktesting.TContext, collectorWG *sync.WaitGroup, podInformer coreinformers.PodInformer, mcc *metricsCollectorConfig, throughputErrorMargin float64, opIndex int, name string, namespaces []string, labelSelector map[string]string) (ktesting.TContext, []testDataCollector) {
	collectorCtx := ktesting.WithCancel(tCtx)
	workloadName := tCtx.Name()
	// The first part is the same for each workload, therefore we can strip it.
	workloadName = workloadName[strings.Index(name, "/")+1:]
	collectors := getTestDataCollectors(podInformer, fmt.Sprintf("%s/%s", workloadName, name), namespaces, labelSelector, mcc, throughputErrorMargin)
	for _, collector := range collectors {
		// Need a loop-local variable for the function below.
		collector := collector
		err := collector.init()
		if err != nil {
			tCtx.Fatalf("op %d: Failed to initialize data collector: %v", opIndex, err)
		}
		collectorWG.Add(1)
		go func() {
			defer collectorWG.Done()
			collector.run(collectorCtx)
		}()
	}
	return collectorCtx, collectors
}

func stopCollectingMetrics(tCtx ktesting.TContext, collectorCtx ktesting.TContext, collectorWG *sync.WaitGroup, threshold float64, tms thresholdMetricSelector, opIndex int, collectors []testDataCollector) []DataItem {
	if collectorCtx == nil {
		tCtx.Fatalf("op %d: Missing startCollectingMetrics operation before stopping", opIndex)
	}
	collectorCtx.Cancel("collecting metrics, collector must stop first")
	collectorWG.Wait()
	var dataItems []DataItem
	for _, collector := range collectors {
		items := collector.collect()
		dataItems = append(dataItems, items...)
		err := compareMetricWithThreshold(items, threshold, tms)
		if err != nil {
			tCtx.Errorf("op %d: %s", opIndex, err)
		}
	}
	return dataItems
}
|
|
|
|
func runWorkload(tCtx ktesting.TContext, tc *testCase, w *workload, informerFactory informers.SharedInformerFactory) []DataItem {
|
|
b, benchmarking := tCtx.TB().(*testing.B)
|
|
if benchmarking {
|
|
start := time.Now()
|
|
b.Cleanup(func() {
|
|
duration := time.Since(start)
|
|
// This includes startup and shutdown time and thus does not
|
|
// reflect scheduling performance. It's useful to get a feeling
|
|
// for how long each workload runs overall.
|
|
b.ReportMetric(duration.Seconds(), "runtime_seconds")
|
|
})
|
|
}
|
|
|
|
// Disable error checking of the sampling interval length in the
|
|
// throughput collector by default. When running benchmarks, report
|
|
// it as test failure when samples are not taken regularly.
|
|
var throughputErrorMargin float64
|
|
if benchmarking {
|
|
// TODO: To prevent the perf-test failure, we increased the error margin, if still not enough
|
|
// one day, we should think of another approach to avoid this trick.
|
|
throughputErrorMargin = 30
|
|
}
|
|
|
|
// Additional informers needed for testing. The pod informer was
|
|
// already created before (scheduler.NewInformerFactory) and the
|
|
// factory was started for it (mustSetupCluster), therefore we don't
|
|
// need to start again.
|
|
podInformer := informerFactory.Core().V1().Pods()
|
|
|
|
// Everything else started by this function gets stopped before it returns.
|
|
tCtx = ktesting.WithCancel(tCtx)
|
|
var wg sync.WaitGroup
|
|
defer wg.Wait()
|
|
defer tCtx.Cancel("workload is done")
|
|
|
|
var dataItems []DataItem
|
|
nextNodeIndex := 0
|
|
// numPodsScheduledPerNamespace has all namespaces created in workload and the number of pods they (will) have.
|
|
// All namespaces listed in numPodsScheduledPerNamespace will be cleaned up.
|
|
numPodsScheduledPerNamespace := make(map[string]int)
|
|
|
|
var collectors []testDataCollector
|
|
// This needs a separate context and wait group because
|
|
// the metrics collecting needs to be sure that the goroutines
|
|
// are stopped.
|
|
var collectorCtx ktesting.TContext
|
|
var collectorWG sync.WaitGroup
|
|
defer collectorWG.Wait()
|
|
|
|
for opIndex, op := range unrollWorkloadTemplate(tCtx, tc.WorkloadTemplate, w) {
|
|
realOp, err := op.realOp.patchParams(w)
|
|
if err != nil {
|
|
tCtx.Fatalf("op %d: %v", opIndex, err)
|
|
}
|
|
select {
|
|
case <-tCtx.Done():
|
|
tCtx.Fatalf("op %d: %v", opIndex, context.Cause(tCtx))
|
|
default:
|
|
}
|
|
switch concreteOp := realOp.(type) {
|
|
case *createNodesOp:
|
|
nodePreparer, err := getNodePreparer(fmt.Sprintf("node-%d-", opIndex), concreteOp, tCtx.Client())
|
|
if err != nil {
|
|
tCtx.Fatalf("op %d: %v", opIndex, err)
|
|
}
|
|
if err := nodePreparer.PrepareNodes(tCtx, nextNodeIndex); err != nil {
|
|
tCtx.Fatalf("op %d: %v", opIndex, err)
|
|
}
|
|
nextNodeIndex += concreteOp.Count
|
|
|
|
case *createNamespacesOp:
|
|
nsPreparer, err := newNamespacePreparer(tCtx, concreteOp)
|
|
if err != nil {
|
|
tCtx.Fatalf("op %d: %v", opIndex, err)
|
|
}
|
|
if err := nsPreparer.prepare(tCtx); err != nil {
|
|
err2 := nsPreparer.cleanup(tCtx)
|
|
if err2 != nil {
|
|
err = fmt.Errorf("prepare: %v; cleanup: %v", err, err2)
|
|
}
|
|
tCtx.Fatalf("op %d: %v", opIndex, err)
|
|
}
|
|
for _, n := range nsPreparer.namespaces() {
|
|
if _, ok := numPodsScheduledPerNamespace[n]; ok {
|
|
// this namespace has been already created.
|
|
continue
|
|
}
|
|
numPodsScheduledPerNamespace[n] = 0
|
|
}
|
|
|
|
case *createPodsOp:
|
|
var namespace string
|
|
// define Pod's namespace automatically, and create that namespace.
|
|
namespace = fmt.Sprintf("namespace-%d", opIndex)
|
|
if concreteOp.Namespace != nil {
|
|
namespace = *concreteOp.Namespace
|
|
}
|
|
createNamespaceIfNotPresent(tCtx, namespace, &numPodsScheduledPerNamespace)
|
|
if concreteOp.PodTemplatePath == nil {
|
|
concreteOp.PodTemplatePath = tc.DefaultPodTemplatePath
|
|
}
|
|
|
|
if concreteOp.CollectMetrics {
|
|
if collectorCtx != nil {
|
|
tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably second collector was started before stopping a previous one", opIndex)
|
|
}
|
|
collectorCtx, collectors = startCollectingMetrics(tCtx, &collectorWG, podInformer, tc.MetricsCollectorConfig, throughputErrorMargin, opIndex, namespace, []string{namespace}, nil)
|
|
defer collectorCtx.Cancel("cleaning up")
|
|
}
|
|
if err := createPodsRapidly(tCtx, namespace, concreteOp); err != nil {
|
|
tCtx.Fatalf("op %d: %v", opIndex, err)
|
|
}
|
|
switch {
|
|
case concreteOp.SkipWaitToCompletion:
|
|
// Only record those namespaces that may potentially require barriers
|
|
// in the future.
|
|
numPodsScheduledPerNamespace[namespace] += concreteOp.Count
|
|
case concreteOp.SteadyState:
|
|
if err := createPodsSteadily(tCtx, namespace, podInformer, concreteOp); err != nil {
|
|
tCtx.Fatalf("op %d: %v", opIndex, err)
|
|
}
|
|
default:
|
|
if err := waitUntilPodsScheduledInNamespace(tCtx, podInformer, nil, namespace, concreteOp.Count); err != nil {
|
|
tCtx.Fatalf("op %d: error in waiting for pods to get scheduled: %v", opIndex, err)
|
|
}
|
|
}
|
|
if concreteOp.CollectMetrics {
|
|
// CollectMetrics and SkipWaitToCompletion can never be true at the
|
|
// same time, so if we're here, it means that all pods have been
|
|
// scheduled.
|
|
items := stopCollectingMetrics(tCtx, collectorCtx, &collectorWG, w.Threshold, *w.ThresholdMetricSelector, opIndex, collectors)
|
|
dataItems = append(dataItems, items...)
|
|
collectorCtx = nil
|
|
}

		case *deletePodsOp:
			labelSelector := labels.ValidatedSetSelector(concreteOp.LabelSelector)

			podsToDelete, err := podInformer.Lister().Pods(concreteOp.Namespace).List(labelSelector)
			if err != nil {
				tCtx.Fatalf("op %d: error in listing pods in the namespace %s: %v", opIndex, concreteOp.Namespace, err)
			}

			deletePods := func(opIndex int) {
				if concreteOp.DeletePodsPerSecond > 0 {
					ticker := time.NewTicker(time.Second / time.Duration(concreteOp.DeletePodsPerSecond))
					defer ticker.Stop()

					for i := 0; i < len(podsToDelete); i++ {
						select {
						case <-ticker.C:
							if err := tCtx.Client().CoreV1().Pods(concreteOp.Namespace).Delete(tCtx, podsToDelete[i].Name, metav1.DeleteOptions{}); err != nil {
								if errors.Is(err, context.Canceled) {
									return
								}
								tCtx.Errorf("op %d: unable to delete pod %v: %v", opIndex, podsToDelete[i].Name, err)
							}
						case <-tCtx.Done():
							return
						}
					}
					return
				}
				listOpts := metav1.ListOptions{
					LabelSelector: labelSelector.String(),
				}
				if err := tCtx.Client().CoreV1().Pods(concreteOp.Namespace).DeleteCollection(tCtx, metav1.DeleteOptions{}, listOpts); err != nil {
					if errors.Is(err, context.Canceled) {
						return
					}
					tCtx.Errorf("op %d: unable to delete pods in namespace %v: %v", opIndex, concreteOp.Namespace, err)
				}
			}

			if concreteOp.SkipWaitToCompletion {
				wg.Add(1)
				go func(opIndex int) {
					defer wg.Done()
					deletePods(opIndex)
				}(opIndex)
			} else {
				deletePods(opIndex)
			}

		case *churnOp:
			var namespace string
			if concreteOp.Namespace != nil {
				namespace = *concreteOp.Namespace
			} else {
				namespace = fmt.Sprintf("namespace-%d", opIndex)
			}
			restMapper := restmapper.NewDeferredDiscoveryRESTMapper(cacheddiscovery.NewMemCacheClient(tCtx.Client().Discovery()))
			// Ensure the namespace exists.
			nsObj := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: namespace}}
			if _, err := tCtx.Client().CoreV1().Namespaces().Create(tCtx, nsObj, metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) {
				tCtx.Fatalf("op %d: unable to create namespace %v: %v", opIndex, namespace, err)
			}

			var churnFns []func(name string) string

			for i, path := range concreteOp.TemplatePaths {
				unstructuredObj, gvk, err := getUnstructuredFromFile(path)
				if err != nil {
					tCtx.Fatalf("op %d: unable to parse the %v-th template path: %v", opIndex, i, err)
				}
				// Obtain GVR.
				mapping, err := restMapper.RESTMapping(gvk.GroupKind(), gvk.Version)
				if err != nil {
					tCtx.Fatalf("op %d: unable to find GVR for %v: %v", opIndex, gvk, err)
				}
				gvr := mapping.Resource
				// Distinguish cluster-scoped from namespaced API objects.
				var dynRes dynamic.ResourceInterface
				if mapping.Scope.Name() == meta.RESTScopeNameNamespace {
					dynRes = tCtx.Dynamic().Resource(gvr).Namespace(namespace)
				} else {
					dynRes = tCtx.Dynamic().Resource(gvr)
				}

				churnFns = append(churnFns, func(name string) string {
					if name != "" {
						if err := dynRes.Delete(tCtx, name, metav1.DeleteOptions{}); err != nil && !errors.Is(err, context.Canceled) {
							tCtx.Errorf("op %d: unable to delete %v: %v", opIndex, name, err)
						}
						return ""
					}

					live, err := dynRes.Create(tCtx, unstructuredObj, metav1.CreateOptions{})
					if err != nil {
						return ""
					}
					return live.GetName()
				})
			}

			var interval int64 = 500
			if concreteOp.IntervalMilliseconds != 0 {
				interval = concreteOp.IntervalMilliseconds
			}
			ticker := time.NewTicker(time.Duration(interval) * time.Millisecond)
			defer ticker.Stop()

			switch concreteOp.Mode {
			case Create:
				wg.Add(1)
				go func() {
					defer wg.Done()
					count, threshold := 0, concreteOp.Number
					if threshold == 0 {
						threshold = math.MaxInt32
					}
					for count < threshold {
						select {
						case <-ticker.C:
							for i := range churnFns {
								churnFns[i]("")
							}
							count++
						case <-tCtx.Done():
							return
						}
					}
				}()
			case Recreate:
				wg.Add(1)
				go func() {
					defer wg.Done()
					retVals := make([][]string, len(churnFns))
					// For each churn function, instantiate a slice of strings with length "concreteOp.Number".
					for i := range retVals {
						retVals[i] = make([]string, concreteOp.Number)
					}

					count := 0
					for {
						select {
						case <-ticker.C:
							for i := range churnFns {
								retVals[i][count%concreteOp.Number] = churnFns[i](retVals[i][count%concreteOp.Number])
							}
							count++
						case <-tCtx.Done():
							return
						}
					}
				}()
			}

		case *barrierOp:
			for _, namespace := range concreteOp.Namespaces {
				if _, ok := numPodsScheduledPerNamespace[namespace]; !ok {
					tCtx.Fatalf("op %d: unknown namespace %s", opIndex, namespace)
				}
			}
			switch concreteOp.StageRequirement {
			case Attempted:
				if err := waitUntilPodsAttempted(tCtx, podInformer, concreteOp.LabelSelector, concreteOp.Namespaces, numPodsScheduledPerNamespace); err != nil {
					tCtx.Fatalf("op %d: %v", opIndex, err)
				}
			case Scheduled:
				// The default is treated like "Scheduled", so handle both the same way.
				fallthrough
			default:
				if err := waitUntilPodsScheduled(tCtx, podInformer, concreteOp.LabelSelector, concreteOp.Namespaces, numPodsScheduledPerNamespace); err != nil {
					tCtx.Fatalf("op %d: %v", opIndex, err)
				}
				// At the end of the barrier, we can be sure that there are no pods
				// pending scheduling in the namespaces that we just blocked on.
				if len(concreteOp.Namespaces) == 0 {
					numPodsScheduledPerNamespace = make(map[string]int)
				} else {
					for _, namespace := range concreteOp.Namespaces {
						delete(numPodsScheduledPerNamespace, namespace)
					}
				}
			}

		case *sleepOp:
			select {
			case <-tCtx.Done():
			case <-time.After(concreteOp.Duration.Duration):
			}

		case *startCollectingMetricsOp:
			if collectorCtx != nil {
				tCtx.Fatalf("op %d: Metrics collection is overlapping. Probably a second collector was started before the previous one was stopped", opIndex)
			}
			collectorCtx, collectors = startCollectingMetrics(tCtx, &collectorWG, podInformer, tc.MetricsCollectorConfig, throughputErrorMargin, opIndex, concreteOp.Name, concreteOp.Namespaces, concreteOp.LabelSelector)
			defer collectorCtx.Cancel("cleaning up")

		case *stopCollectingMetricsOp:
			items := stopCollectingMetrics(tCtx, collectorCtx, &collectorWG, w.Threshold, *w.ThresholdMetricSelector, opIndex, collectors)
			dataItems = append(dataItems, items...)
			collectorCtx = nil

		default:
			runnable, ok := concreteOp.(runnableOp)
			if !ok {
				tCtx.Fatalf("op %d: invalid op %v", opIndex, concreteOp)
			}
			for _, namespace := range runnable.requiredNamespaces() {
				createNamespaceIfNotPresent(tCtx, namespace, &numPodsScheduledPerNamespace)
			}
			runnable.run(tCtx)
		}
	}

	// Check for unused params and inform the user.
	unusedParams := w.unusedParams()
	if len(unusedParams) != 0 {
		tCtx.Fatalf("the parameters %v are defined on workload %s, but are unused.\nPlease make sure there are no typos.", unusedParams, w.Name)
	}

	// Some tests have unschedulable pods. Do not add an implicit barrier at the
	// end as we do not want to wait for them.
	return dataItems
}

func createNamespaceIfNotPresent(tCtx ktesting.TContext, namespace string, podsPerNamespace *map[string]int) {
	if _, ok := (*podsPerNamespace)[namespace]; !ok {
		// The namespace has not been created yet, so create and register it.
		_, err := tCtx.Client().CoreV1().Namespaces().Create(tCtx, &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: namespace}}, metav1.CreateOptions{})
		if err != nil {
			tCtx.Fatalf("failed to create namespace %q for pods: %v", namespace, err)
		}
		(*podsPerNamespace)[namespace] = 0
	}
}

type testDataCollector interface {
	init() error
	run(tCtx ktesting.TContext)
	collect() []DataItem
}
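
// A minimal sketch of an additional collector satisfying this interface; the
// countingCollector type is hypothetical and only illustrates the contract
// (the real implementations are constructed in getTestDataCollectors below):
//
//	type countingCollector struct {
//		items []DataItem
//	}
//
//	func (c *countingCollector) init() error { return nil }
//
//	func (c *countingCollector) run(tCtx ktesting.TContext) {
//		// Observe metrics until tCtx is cancelled, appending to c.items.
//	}
//
//	func (c *countingCollector) collect() []DataItem { return c.items }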

func getTestDataCollectors(podInformer coreinformers.PodInformer, name string, namespaces []string, labelSelector map[string]string, mcc *metricsCollectorConfig, throughputErrorMargin float64) []testDataCollector {
	if mcc == nil {
		mcc = &defaultMetricsCollectorConfig
	}
	return []testDataCollector{
		newThroughputCollector(podInformer, map[string]string{"Name": name}, labelSelector, namespaces, throughputErrorMargin),
		newMetricsCollector(mcc, map[string]string{"Name": name}),
	}
}

func getNodePreparer(prefix string, cno *createNodesOp, clientset clientset.Interface) (testutils.TestNodePreparer, error) {
	var nodeStrategy testutils.PrepareNodeStrategy = &testutils.TrivialNodePrepareStrategy{}
	if cno.NodeAllocatableStrategy != nil {
		nodeStrategy = cno.NodeAllocatableStrategy
	} else if cno.LabelNodePrepareStrategy != nil {
		nodeStrategy = cno.LabelNodePrepareStrategy
	} else if cno.UniqueNodeLabelStrategy != nil {
		nodeStrategy = cno.UniqueNodeLabelStrategy
	}

	nodeTemplate := StaticNodeTemplate(makeBaseNode(prefix))
	if cno.NodeTemplatePath != nil {
		nodeTemplate = nodeTemplateFromFile(*cno.NodeTemplatePath)
	}

	return NewIntegrationTestNodePreparer(
		clientset,
		[]testutils.CountToStrategy{{Count: cno.Count, Strategy: nodeStrategy}},
		nodeTemplate,
	), nil
}

// createPodsRapidly implements the "create pods rapidly" mode of [createPodsOp].
// It's a no-op when cpo.SteadyState is true.
func createPodsRapidly(tCtx ktesting.TContext, namespace string, cpo *createPodsOp) error {
	if cpo.SteadyState {
		return nil
	}
	strategy, err := getPodStrategy(cpo)
	if err != nil {
		return err
	}
	tCtx.Logf("creating %d pods in namespace %q", cpo.Count, namespace)
	config := testutils.NewTestPodCreatorConfig()
	config.AddStrategy(namespace, cpo.Count, strategy)
	podCreator := testutils.NewTestPodCreator(tCtx.Client(), config)
	return podCreator.CreatePods(tCtx)
}
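
// For orientation, a createPods op in this "rapid" mode is typically declared
// in the workload YAML roughly as follows. This is a hypothetical snippet: the
// field names mirror the createPodsOp struct fields, assuming the usual
// lowerCamelCase JSON tags, and the template path is illustrative only.
//
//	- opcode: createPods
//	  count: 100
//	  podTemplatePath: config/templates/pod-default.yaml
//	  collectMetrics: true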

// createPodsSteadily implements the "create pods and delete pods" mode of [createPodsOp].
// It's a no-op when cpo.SteadyState is false.
func createPodsSteadily(tCtx ktesting.TContext, namespace string, podInformer coreinformers.PodInformer, cpo *createPodsOp) error {
	if !cpo.SteadyState {
		return nil
	}
	strategy, err := getPodStrategy(cpo)
	if err != nil {
		return err
	}
	tCtx.Logf("creating pods in namespace %q for %s", namespace, cpo.Duration)
	tCtx = ktesting.WithTimeout(tCtx, cpo.Duration.Duration, fmt.Sprintf("the operation ran for the configured %s", cpo.Duration.Duration))

	// Start watching pods in the namespace. Any pod which is seen as being scheduled
	// gets deleted.
	scheduledPods := make(chan *v1.Pod, cpo.Count)
	scheduledPodsClosed := false
	var mutex sync.Mutex
	defer func() {
		mutex.Lock()
		defer mutex.Unlock()
		close(scheduledPods)
		scheduledPodsClosed = true
	}()

	existingPods := 0
	runningPods := 0
	onPodChange := func(oldObj, newObj any) {
		oldPod, newPod, err := schedutil.As[*v1.Pod](oldObj, newObj)
		if err != nil {
			tCtx.Errorf("unexpected pod events: %v", err)
			return
		}

		mutex.Lock()
		defer mutex.Unlock()
		if oldPod == nil {
			existingPods++
		}
		if (oldPod == nil || oldPod.Spec.NodeName == "") && newPod.Spec.NodeName != "" {
			// Got scheduled.
			runningPods++

			// Only ask for deletion in our namespace.
			if newPod.Namespace != namespace {
				return
			}
			if !scheduledPodsClosed {
				select {
				case <-tCtx.Done():
				case scheduledPods <- newPod:
				}
			}
		}
	}
	handle, err := podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj any) {
			onPodChange(nil, obj)
		},
		UpdateFunc: func(oldObj, newObj any) {
			onPodChange(oldObj, newObj)
		},
		DeleteFunc: func(obj any) {
			pod, _, err := schedutil.As[*v1.Pod](obj, nil)
			if err != nil {
				tCtx.Errorf("unexpected pod events: %v", err)
				return
			}

			existingPods--
			if pod.Spec.NodeName != "" {
				runningPods--
			}
		},
	})
	if err != nil {
		return fmt.Errorf("register event handler: %w", err)
	}
	defer func() {
		tCtx.ExpectNoError(podInformer.Informer().RemoveEventHandler(handle), "remove event handler")
	}()

	// Seed the namespace with the initial number of pods.
	if err := strategy(tCtx, tCtx.Client(), namespace, cpo.Count); err != nil {
		return fmt.Errorf("create initial %d pods: %w", cpo.Count, err)
	}

	// Now loop until we are done. Report periodically how many pods were scheduled.
	countScheduledPods := 0
	lastCountScheduledPods := 0
	logPeriod := time.Second
	ticker := time.NewTicker(logPeriod)
	defer ticker.Stop()
	for {
		select {
		case <-tCtx.Done():
			tCtx.Logf("Completed after seeing %d scheduled pods: %v", countScheduledPods, context.Cause(tCtx))
			return nil
		case <-scheduledPods:
			countScheduledPods++
			if countScheduledPods%cpo.Count == 0 {
				// All scheduled. Start over with a new batch.
				err := tCtx.Client().CoreV1().Pods(namespace).DeleteCollection(tCtx, metav1.DeleteOptions{
					GracePeriodSeconds: ptr.To(int64(0)),
					PropagationPolicy:  ptr.To(metav1.DeletePropagationBackground), // Foreground will block.
				}, metav1.ListOptions{})
				// Ignore errors when the time is up. errors.Is(err, context.Canceled) would
				// be more precise, but doesn't work because client-go doesn't reliably
				// propagate it. Instead, this was seen:
				//    client rate limiter Wait returned an error: rate: Wait(n=1) would exceed context deadline
				if tCtx.Err() != nil {
					continue
				}
				if err != nil {
					return fmt.Errorf("delete scheduled pods: %w", err)
				}
				err = strategy(tCtx, tCtx.Client(), namespace, cpo.Count)
				if tCtx.Err() != nil {
					continue
				}
				if err != nil {
					return fmt.Errorf("create next batch of pods: %w", err)
				}
			}
		case <-ticker.C:
			delta := countScheduledPods - lastCountScheduledPods
			lastCountScheduledPods = countScheduledPods
			func() {
				mutex.Lock()
				defer mutex.Unlock()

				tCtx.Logf("%d pods got scheduled in total in namespace %q, overall %d out of %d pods scheduled: %f pods/s in last interval",
					countScheduledPods, namespace,
					runningPods, existingPods,
					float64(delta)/logPeriod.Seconds(),
				)
			}()
		}
	}
}
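
// A steady-state createPods op, by contrast, keeps a fixed number of pods in
// flight and churns them for a fixed duration. A hypothetical workload YAML
// snippet (field names mirror the createPodsOp struct fields, assuming the
// usual lowerCamelCase JSON tags; the values are illustrative only):
//
//	- opcode: createPods
//	  count: 50
//	  steadyState: true
//	  duration: 30s
//
// Here count is the batch size kept in flight and duration bounds how long the
// create/delete cycle above runs before the timeout cancels tCtx.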

// waitUntilPodsScheduledInNamespace blocks until all pods in the given
// namespace are scheduled. Times out after 10 minutes because even at the
// lowest observed QPS of ~10 pods/sec, a 5000-node test should complete.
func waitUntilPodsScheduledInNamespace(tCtx ktesting.TContext, podInformer coreinformers.PodInformer, labelSelector map[string]string, namespace string, wantCount int) error {
	var pendingPod *v1.Pod

	err := wait.PollUntilContextTimeout(tCtx, 1*time.Second, 10*time.Minute, true, func(ctx context.Context) (bool, error) {
		select {
		case <-ctx.Done():
			return true, ctx.Err()
		default:
		}
		scheduled, attempted, unattempted, err := getScheduledPods(podInformer, labelSelector, namespace)
		if err != nil {
			return false, err
		}
		if len(scheduled) >= wantCount {
			tCtx.Logf("scheduling succeeded")
			return true, nil
		}
		tCtx.Logf("namespace: %s, pods: want %d, got %d", namespace, wantCount, len(scheduled))
		if len(attempted) > 0 {
			pendingPod = attempted[0]
		} else if len(unattempted) > 0 {
			pendingPod = unattempted[0]
		} else {
			pendingPod = nil
		}
		return false, nil
	})

	if err != nil && pendingPod != nil {
		err = fmt.Errorf("at least pod %s is not scheduled: %w", klog.KObj(pendingPod), err)
	}
	return err
}

// waitUntilPodsAttemptedInNamespace blocks until all pods in the given
// namespace have gone through a scheduling cycle at least once.
// Times out after 10 minutes, same as waitUntilPodsScheduledInNamespace.
func waitUntilPodsAttemptedInNamespace(tCtx ktesting.TContext, podInformer coreinformers.PodInformer, labelSelector map[string]string, namespace string, wantCount int) error {
	var pendingPod *v1.Pod

	err := wait.PollUntilContextTimeout(tCtx, 1*time.Second, 10*time.Minute, true, func(ctx context.Context) (bool, error) {
		select {
		case <-ctx.Done():
			return true, ctx.Err()
		default:
		}
		scheduled, attempted, unattempted, err := getScheduledPods(podInformer, labelSelector, namespace)
		if err != nil {
			return false, err
		}
		if len(scheduled)+len(attempted) >= wantCount {
			tCtx.Logf("all pods attempted to be scheduled")
			return true, nil
		}
		tCtx.Logf("namespace: %s, attempted pods: want %d, got %d", namespace, wantCount, len(scheduled)+len(attempted))
		if len(unattempted) > 0 {
			pendingPod = unattempted[0]
		} else {
			pendingPod = nil
		}
		return false, nil
	})

	if err != nil && pendingPod != nil {
		err = fmt.Errorf("at least pod %s is not attempted: %w", klog.KObj(pendingPod), err)
	}
	return err
}

// waitUntilPodsScheduled blocks until all pods in the given namespaces are
// scheduled.
func waitUntilPodsScheduled(tCtx ktesting.TContext, podInformer coreinformers.PodInformer, labelSelector map[string]string, namespaces []string, numPodsScheduledPerNamespace map[string]int) error {
	// If unspecified, default to all known namespaces.
	if len(namespaces) == 0 {
		for namespace := range numPodsScheduledPerNamespace {
			namespaces = append(namespaces, namespace)
		}
	}
	for _, namespace := range namespaces {
		select {
		case <-tCtx.Done():
			return context.Cause(tCtx)
		default:
		}
		wantCount, ok := numPodsScheduledPerNamespace[namespace]
		if !ok {
			return fmt.Errorf("unknown namespace %s", namespace)
		}
		if err := waitUntilPodsScheduledInNamespace(tCtx, podInformer, labelSelector, namespace, wantCount); err != nil {
			return fmt.Errorf("error waiting for pods in namespace %q: %w", namespace, err)
		}
	}
	return nil
}

// waitUntilPodsAttempted blocks until all pods in the given namespaces have
// been attempted (went through a scheduling cycle at least once).
func waitUntilPodsAttempted(tCtx ktesting.TContext, podInformer coreinformers.PodInformer, labelSelector map[string]string, namespaces []string, numPodsScheduledPerNamespace map[string]int) error {
	// If unspecified, default to all known namespaces.
	if len(namespaces) == 0 {
		for namespace := range numPodsScheduledPerNamespace {
			namespaces = append(namespaces, namespace)
		}
	}
	for _, namespace := range namespaces {
		select {
		case <-tCtx.Done():
			return context.Cause(tCtx)
		default:
		}
		wantCount, ok := numPodsScheduledPerNamespace[namespace]
		if !ok {
			return fmt.Errorf("unknown namespace %s", namespace)
		}
		if err := waitUntilPodsAttemptedInNamespace(tCtx, podInformer, labelSelector, namespace, wantCount); err != nil {
			return fmt.Errorf("error waiting for pods in namespace %q: %w", namespace, err)
		}
	}
	return nil
}

func getSpecFromFile(path *string, spec interface{}) error {
	bytes, err := os.ReadFile(*path)
	if err != nil {
		return err
	}
	return yaml.UnmarshalStrict(bytes, spec)
}
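
// Example usage, as a minimal sketch (the path and the decoded type are
// hypothetical; any struct with matching YAML fields works):
//
//	path := "config/templates/namespace.yaml"
//	ns := &v1.Namespace{}
//	if err := getSpecFromFile(&path, ns); err != nil {
//		// UnmarshalStrict rejects unknown fields, so typos in template
//		// files surface here instead of being silently dropped.
//	}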

func getUnstructuredFromFile(path string) (*unstructured.Unstructured, *schema.GroupVersionKind, error) {
	bytes, err := os.ReadFile(path)
	if err != nil {
		return nil, nil, err
	}

	bytes, err = yaml.YAMLToJSONStrict(bytes)
	if err != nil {
		return nil, nil, fmt.Errorf("cannot convert YAML to JSON: %v", err)
	}

	obj, gvk, err := unstructured.UnstructuredJSONScheme.Decode(bytes, nil, nil)
	if err != nil {
		return nil, nil, err
	}
	unstructuredObj, ok := obj.(*unstructured.Unstructured)
	if !ok {
		return nil, nil, fmt.Errorf("cannot convert spec file in %v to an unstructured obj", path)
	}
	return unstructuredObj, gvk, nil
}

func getTestCases(path string) ([]*testCase, error) {
	testCases := make([]*testCase, 0)
	if err := getSpecFromFile(&path, &testCases); err != nil {
		return nil, fmt.Errorf("parsing test cases error: %w", err)
	}
	for _, tc := range testCases {
		for _, w := range tc.Workloads {
			w.setDefaults(tc.DefaultThresholdMetricSelector)
		}
	}
	return testCases, nil
}

func validateTestCases(testCases []*testCase) error {
	if len(testCases) == 0 {
		return fmt.Errorf("no test cases defined")
	}
	testCaseUniqueNames := map[string]bool{}
	for _, tc := range testCases {
		if testCaseUniqueNames[tc.Name] {
			return fmt.Errorf("%s: name is not unique", tc.Name)
		}
		testCaseUniqueNames[tc.Name] = true
		if len(tc.Workloads) == 0 {
			return fmt.Errorf("%s: no workloads defined", tc.Name)
		}
		if err := tc.workloadNamesUnique(); err != nil {
			return err
		}
		if len(tc.WorkloadTemplate) == 0 {
			return fmt.Errorf("%s: no ops defined", tc.Name)
		}
		// Make sure there's at least one CreatePods op with collectMetrics set to
		// true in each workload. What's the point of running a performance
		// benchmark if no statistics are collected for reporting?
		if !tc.collectsMetrics() {
			return fmt.Errorf("%s: no op in the workload template collects metrics", tc.Name)
		}
		// TODO(#93795): make sure each workload within a test case has a unique
		// name? The name is used to identify the stats in benchmark reports.
		// TODO(#94404): check for unused template parameters? Probably a typo.
		for _, w := range tc.Workloads {
			err := w.isValid(tc.MetricsCollectorConfig)
			if err != nil {
				return err
			}
		}
	}
	return nil
}
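
// Putting these rules together, a minimal test case that passes validation
// looks roughly like the following. This is a hypothetical snippet; the names,
// counts, and exact YAML field spellings (derived from the testCase struct
// tags) are illustrative only.
//
//	- name: SchedulingBasic
//	  workloadTemplate:
//	  - opcode: createNodes
//	    count: 100
//	  - opcode: createPods
//	    count: 500
//	    collectMetrics: true
//	  workloads:
//	  - name: 100Nodes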

func getPodStrategy(cpo *createPodsOp) (testutils.TestPodCreateStrategy, error) {
	podTemplate := testutils.StaticPodTemplate(makeBasePod())
	if cpo.PodTemplatePath != nil {
		podTemplate = podTemplateFromFile(*cpo.PodTemplatePath)
	}
	if cpo.PersistentVolumeClaimTemplatePath == nil {
		return testutils.NewCustomCreatePodStrategy(podTemplate), nil
	}

	pvTemplate, err := getPersistentVolumeSpecFromFile(cpo.PersistentVolumeTemplatePath)
	if err != nil {
		return nil, err
	}
	pvcTemplate, err := getPersistentVolumeClaimSpecFromFile(cpo.PersistentVolumeClaimTemplatePath)
	if err != nil {
		return nil, err
	}
	return testutils.NewCreatePodWithPersistentVolumeStrategy(pvcTemplate, getCustomVolumeFactory(pvTemplate), podTemplate), nil
}

type nodeTemplateFromFile string

func (f nodeTemplateFromFile) GetNodeTemplate(index, count int) (*v1.Node, error) {
	nodeSpec := &v1.Node{}
	if err := getSpecFromTextTemplateFile(string(f), map[string]any{"Index": index, "Count": count}, nodeSpec); err != nil {
		return nil, fmt.Errorf("parsing Node: %w", err)
	}
	return nodeSpec, nil
}
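
// The template file is evaluated as a Go text/template with .Index and .Count
// in scope, so per-node values can be derived from the node's position in the
// batch. A hypothetical template (the name pattern and label are illustrative
// only):
//
//	apiVersion: v1
//	kind: Node
//	metadata:
//	  name: sample-node-{{.Index}}
//	  labels:
//	    example.com/index: "{{.Index}}"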

type podTemplateFromFile string

func (f podTemplateFromFile) GetPodTemplate(index, count int) (*v1.Pod, error) {
	podSpec := &v1.Pod{}
	if err := getSpecFromTextTemplateFile(string(f), map[string]any{"Index": index, "Count": count}, podSpec); err != nil {
		return nil, fmt.Errorf("parsing Pod: %w", err)
	}
	return podSpec, nil
}

func getPersistentVolumeSpecFromFile(path *string) (*v1.PersistentVolume, error) {
	persistentVolumeSpec := &v1.PersistentVolume{}
	if err := getSpecFromFile(path, persistentVolumeSpec); err != nil {
		return nil, fmt.Errorf("parsing PersistentVolume: %w", err)
	}
	return persistentVolumeSpec, nil
}

func getPersistentVolumeClaimSpecFromFile(path *string) (*v1.PersistentVolumeClaim, error) {
	persistentVolumeClaimSpec := &v1.PersistentVolumeClaim{}
	if err := getSpecFromFile(path, persistentVolumeClaimSpec); err != nil {
		return nil, fmt.Errorf("parsing PersistentVolumeClaim: %w", err)
	}
	return persistentVolumeClaimSpec, nil
}

func getCustomVolumeFactory(pvTemplate *v1.PersistentVolume) func(id int) *v1.PersistentVolume {
	return func(id int) *v1.PersistentVolume {
		pv := pvTemplate.DeepCopy()
		volumeID := fmt.Sprintf("vol-%d", id)
		pv.ObjectMeta.Name = volumeID
		pvs := pv.Spec.PersistentVolumeSource
		if pvs.CSI != nil {
			pvs.CSI.VolumeHandle = volumeID
		} else if pvs.AWSElasticBlockStore != nil {
			pvs.AWSElasticBlockStore.VolumeID = volumeID
		}
		return pv
	}
}
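
// For example, with a CSI-backed template the factory stamps out copies named
// vol-0, vol-1, ... whose volume handle matches the name, so every generated
// volume is unique (a minimal sketch):
//
//	factory := getCustomVolumeFactory(pvTemplate)
//	pv := factory(7) // pv.Name == "vol-7", pv.Spec.CSI.VolumeHandle == "vol-7"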

// namespacePreparer holds configuration information for the test namespace preparer.
type namespacePreparer struct {
	count  int
	prefix string
	spec   *v1.Namespace
}

func newNamespacePreparer(tCtx ktesting.TContext, cno *createNamespacesOp) (*namespacePreparer, error) {
	ns := &v1.Namespace{}
	if cno.NamespaceTemplatePath != nil {
		if err := getSpecFromFile(cno.NamespaceTemplatePath, ns); err != nil {
			return nil, fmt.Errorf("parsing NamespaceTemplate: %w", err)
		}
	}

	return &namespacePreparer{
		count:  cno.Count,
		prefix: cno.Prefix,
		spec:   ns,
	}, nil
}

// namespaces returns the names of the namespaces that have been (or will be)
// created by this namespacePreparer.
func (p *namespacePreparer) namespaces() []string {
	namespaces := make([]string, p.count)
	for i := 0; i < p.count; i++ {
		namespaces[i] = fmt.Sprintf("%s-%d", p.prefix, i)
	}
	return namespaces
}

// prepare creates the namespaces.
func (p *namespacePreparer) prepare(tCtx ktesting.TContext) error {
	base := &v1.Namespace{}
	if p.spec != nil {
		base = p.spec
	}
	tCtx.Logf("Making %d namespaces with prefix %q and template %v", p.count, p.prefix, *base)
	for i := 0; i < p.count; i++ {
		n := base.DeepCopy()
		n.Name = fmt.Sprintf("%s-%d", p.prefix, i)
		if err := testutils.RetryWithExponentialBackOff(func() (bool, error) {
			_, err := tCtx.Client().CoreV1().Namespaces().Create(tCtx, n, metav1.CreateOptions{})
			return err == nil || apierrors.IsAlreadyExists(err), nil
		}); err != nil {
			return err
		}
	}
	return nil
}

// cleanup deletes existing test namespaces.
func (p *namespacePreparer) cleanup(tCtx ktesting.TContext) error {
	var errRet error
	for i := 0; i < p.count; i++ {
		n := fmt.Sprintf("%s-%d", p.prefix, i)
		if err := tCtx.Client().CoreV1().Namespaces().Delete(tCtx, n, metav1.DeleteOptions{}); err != nil {
			tCtx.Errorf("Deleting Namespace: %v", err)
			errRet = err
		}
	}
	return errRet
}