From d100768d940ced902307b29907b11d5c0f63c1bc Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Fri, 27 Jan 2023 19:31:37 +0100 Subject: [PATCH] scheduler_perf: track and visualize progress over time This is useful to see whether pod scheduling happens in bursts and how it behaves over time, which is relevant in particular for dynamic resource allocation where it may become harder at the end to find the node which still has resources available. Besides "pods scheduled" it's also useful to know how many attempts were needed, so schedule_attempts_total also gets sampled and stored. To visualize the result of one or more test runs, use: gnuplot.sh *.dat --- .../metrics/testutil/metrics.go | 52 +++++++-- .../metrics/testutil/metrics_test.go | 102 ++++++++++++++++++ test/integration/scheduler_perf/README.md | 19 ++++ test/integration/scheduler_perf/gnuplot.sh | 54 ++++++++++ .../scheduler_perf/scheduler_perf.go | 24 +++++ test/integration/scheduler_perf/util.go | 49 ++++++++- 6 files changed, 293 insertions(+), 7 deletions(-) create mode 100755 test/integration/scheduler_perf/gnuplot.sh diff --git a/staging/src/k8s.io/component-base/metrics/testutil/metrics.go b/staging/src/k8s.io/component-base/metrics/testutil/metrics.go index c595f55d64a..05d15b08d75 100644 --- a/staging/src/k8s.io/component-base/metrics/testutil/metrics.go +++ b/staging/src/k8s.io/component-base/metrics/testutil/metrics.go @@ -258,12 +258,8 @@ func GetHistogramVecFromGatherer(gatherer metrics.Gatherer, metricName string, l if err != nil { return nil, err } - for _, mFamily := range m { - if mFamily.GetName() == metricName { - metricFamily = mFamily - break - } - } + + metricFamily = findMetricFamily(m, metricName) if metricFamily == nil { return nil, fmt.Errorf("metric %q not found", metricName) @@ -433,3 +429,47 @@ func LabelsMatch(metric *dto.Metric, labelFilter map[string]string) bool { return true } + +// GetCounterValuesFromGatherer collects a counter that matches the given name +// from a
gatherer implementing k8s.io/component-base/metrics.Gatherer interface. +// It returns all counter values that had a label with a certain name in a map +// that uses the label value as keys. +// +// Used only for testing purposes where we need to gather metrics directly from a running binary (without metrics endpoint). +func GetCounterValuesFromGatherer(gatherer metrics.Gatherer, metricName string, lvMap map[string]string, labelName string) (map[string]float64, error) { + m, err := gatherer.Gather() + if err != nil { + return nil, err + } + + metricFamily := findMetricFamily(m, metricName) + if metricFamily == nil { + return nil, fmt.Errorf("metric %q not found", metricName) + } + if len(metricFamily.GetMetric()) == 0 { + return nil, fmt.Errorf("metric %q is empty", metricName) + } + + values := make(map[string]float64) + for _, metric := range metricFamily.GetMetric() { + if LabelsMatch(metric, lvMap) { + if counter := metric.GetCounter(); counter != nil { + for _, labelPair := range metric.Label { + if labelPair.GetName() == labelName { + values[labelPair.GetValue()] = counter.GetValue() + } + } + } + } + } + return values, nil +} + +func findMetricFamily(metricFamilies []*dto.MetricFamily, metricName string) *dto.MetricFamily { + for _, mFamily := range metricFamilies { + if mFamily.GetName() == metricName { + return mFamily + } + } + return nil +} diff --git a/staging/src/k8s.io/component-base/metrics/testutil/metrics_test.go b/staging/src/k8s.io/component-base/metrics/testutil/metrics_test.go index 702fb7fcfde..adfc1999989 100644 --- a/staging/src/k8s.io/component-base/metrics/testutil/metrics_test.go +++ b/staging/src/k8s.io/component-base/metrics/testutil/metrics_test.go @@ -20,6 +20,7 @@ import ( "fmt" "math" "reflect" + "strings" "testing" "github.com/google/go-cmp/cmp" @@ -591,3 +592,104 @@ func TestGetHistogramVecFromGatherer(t *testing.T) { }) } } + +func TestGetCounterValuesFromGatherer(t *testing.T) { + namespace := "namespace" + subsystem := 
"subsystem" + name := "metric_test_name" + metricName := fmt.Sprintf("%s_%s_%s", namespace, subsystem, name) + + tests := map[string]struct { + metricName string // Empty is replaced with valid name. + lvMap map[string]string + labelName string + + wantCounterValues map[string]float64 + wantErr string + }{ + "wrong-metric": { + metricName: "no-such-metric", + wantErr: `metric "no-such-metric" not found`, + }, + + "none": { + metricName: metricName, + lvMap: map[string]string{"no-such-label": "a"}, + + wantCounterValues: map[string]float64{}, + }, + + "value1-0": { + metricName: metricName, + lvMap: map[string]string{"label1": "value1-0"}, + labelName: "label2", + + wantCounterValues: map[string]float64{"value2-0": 1.5, "value2-1": 2.5}, + }, + + "value1-1": { + metricName: metricName, + lvMap: map[string]string{"label1": "value1-1"}, + labelName: "label2", + + wantCounterValues: map[string]float64{"value2-0": 3.5, "value2-1": 4.5}, + }, + + "value1-1-value2-0-none": { + metricName: metricName, + lvMap: map[string]string{"label1": "value1-1", "label2": "value2-0"}, + labelName: "none", + + wantCounterValues: map[string]float64{}, + }, + + "value1-0-value2-0-one": { + metricName: metricName, + lvMap: map[string]string{"label1": "value1-0", "label2": "value2-0"}, + labelName: "label2", + + wantCounterValues: map[string]float64{"value2-0": 1.5}, + }, + } + for name, tt := range tests { + t.Run(name, func(t *testing.T) { + // CounterVec has two labels defined. + labels := []string{"label1", "label2"} + counterOpts := &metrics.CounterOpts{ + Namespace: "namespace", + Name: "metric_test_name", + Subsystem: "subsystem", + Help: "counter help message", + } + vec := metrics.NewCounterVec(counterOpts, labels) + // Use local registry + var registry = metrics.NewKubeRegistry() + var gather metrics.Gatherer = registry + registry.MustRegister(vec) + // Observe two metrics with same value for label1 but different value of label2. 
+ vec.WithLabelValues("value1-0", "value2-0").Add(1.5) + vec.WithLabelValues("value1-0", "value2-1").Add(2.5) + vec.WithLabelValues("value1-1", "value2-0").Add(3.5) + vec.WithLabelValues("value1-1", "value2-1").Add(4.5) + + // The check for empty metric apparently cannot be tested: registering + // a NewCounterVec with no values has the effect that it doesn't get + // returned, leading to "not found". + + counterValues, err := GetCounterValuesFromGatherer(gather, tt.metricName, tt.lvMap, tt.labelName) + if err != nil { + if tt.wantErr != "" && !strings.Contains(err.Error(), tt.wantErr) { + t.Errorf("expected error %q, got instead: %v", tt.wantErr, err) + } + return + } + if tt.wantErr != "" { + t.Fatalf("expected error %q, got none", tt.wantErr) + } + + if diff := cmp.Diff(tt.wantCounterValues, counterValues); diff != "" { + t.Errorf("Got unexpected HistogramVec (-want +got):\n%s", diff) + } + }) + } +} diff --git a/test/integration/scheduler_perf/README.md b/test/integration/scheduler_perf/README.md index 261dd5e776f..8a4bccba6b9 100644 --- a/test/integration/scheduler_perf/README.md +++ b/test/integration/scheduler_perf/README.md @@ -175,3 +175,22 @@ the ci-benchmark-scheduler-perf periodic job will fail with an error log such as This allows to analyze which workload failed. Make sure that the failure is not an outlier by checking multiple runs of the job. If the failures are not related to any regression, but to an incorrect threshold setting, it is reasonable to decrease it. + +### Visualization + +Some support for visualizing progress over time is built into the +benchmarks. The measurement operation which creates pods writes .dat files like +this: + + test/integration/scheduler_perf/SchedulingBasic_5000Nodes_2023-03-17T14:52:09Z.dat + +This file is in a text format that [gnuplot](http://www.gnuplot.info/) can +read.
A wrapper script selects some suitable parameters: + + test/integration/scheduler_perf/gnuplot.sh test/integration/scheduler_perf/*.dat + +It plots in an interactive window by default. To write into a file, use + + test/integration/scheduler_perf/gnuplot.sh \ + -e 'set term png; set output "<output>.png"' \ + test/integration/scheduler_perf/*.dat diff --git a/test/integration/scheduler_perf/gnuplot.sh b/test/integration/scheduler_perf/gnuplot.sh new file mode 100755 index 00000000000..885276559ae --- /dev/null +++ b/test/integration/scheduler_perf/gnuplot.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash + +# Copyright 2024 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Invoke this script with a list of *.dat and it'll plot them with gnuplot. +# Any non-file parameter is passed through to gnuplot. By default, +# an X11 window is used to display the result. To write into a file, +# use +# -e "set term png; set output <output>.png" + +files=() +args=( -e "set term x11 persist" ) + +for i in "$@"; do + if [ -f "$i" ]; then + files+=("$i") + else + args+=("$i") + fi +done + +( + cat < 0 { sort.Float64s(tc.schedulingThroughputs) sum := 0.0