Merge pull request #87923 from ingvagabund/move-direct-prometheus-metrics-under-component-base-metrics
Collect some scheduling metrics and scheduling throughput (vol. 2)
@@ -20,7 +20,10 @@ go_library(
        "//staging/src/k8s.io/client-go/informers/core/v1:go_default_library",
        "//staging/src/k8s.io/client-go/kubernetes:go_default_library",
        "//staging/src/k8s.io/client-go/rest:go_default_library",
        "//staging/src/k8s.io/component-base/metrics/legacyregistry:go_default_library",
        "//staging/src/k8s.io/component-base/metrics/testutil:go_default_library",
        "//test/integration/util:go_default_library",
        "//vendor/k8s.io/klog:go_default_library",
    ],
)
@@ -38,6 +38,15 @@ const (
    configFile = "config/performance-config.yaml"
)

var (
    defaultMetrics = []string{
        "scheduler_scheduling_algorithm_predicate_evaluation_seconds",
        "scheduler_scheduling_algorithm_priority_evaluation_seconds",
        "scheduler_binding_duration_seconds",
        "scheduler_e2e_scheduling_duration_seconds",
    }
)

// testCase configures a test case to run the scheduler performance test. Users should be able to
// provide this via a YAML file.
//
@@ -92,6 +101,7 @@ type testParams struct {
}

func BenchmarkPerfScheduling(b *testing.B) {
    dataItems := DataItems{Version: "v1"}
    tests := getSimpleTestCases(configFile)

    for _, test := range tests {
@@ -100,12 +110,15 @@ func BenchmarkPerfScheduling(b *testing.B) {
            for feature, flag := range test.FeatureGates {
                defer featuregatetesting.SetFeatureGateDuringTest(b, utilfeature.DefaultFeatureGate, feature, flag)()
            }
-           perfScheduling(test, b)
+           dataItems.DataItems = append(dataItems.DataItems, perfScheduling(test, b)...)
        })
    }
    if err := dataItems2JSONFile(dataItems, b.Name()); err != nil {
        klog.Fatalf("%v: unable to write measured data: %v", b.Name(), err)
    }
}
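
The defer pattern above flips each configured feature gate for the duration of the benchmark and restores it afterwards. A minimal standalone sketch of the same pattern, assuming a hypothetical registered gate named "MyFeature":

package benchmark_test

import (
    "testing"

    utilfeature "k8s.io/apiserver/pkg/util/feature"
    "k8s.io/component-base/featuregate"
    featuregatetesting "k8s.io/component-base/featuregate/testing"
)

func TestWithGate(t *testing.T) {
    // "MyFeature" is hypothetical; SetFeatureGateDuringTest fails the test if
    // the gate is not registered. The returned closure restores the old value.
    defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, featuregate.Feature("MyFeature"), true)()
    // ... exercise the feature-gated code path here ...
}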
-func perfScheduling(test testCase, b *testing.B) {
+func perfScheduling(test testCase, b *testing.B) []DataItem {
    var nodeStrategy testutils.PrepareNodeStrategy = &testutils.TrivialNodePrepareStrategy{}
    if test.Nodes.NodeAllocatableStrategy != nil {
        nodeStrategy = test.Nodes.NodeAllocatableStrategy
@@ -180,15 +193,45 @@ func perfScheduling(test testCase, b *testing.B) {

    // start benchmark
    b.ResetTimer()

    // Start measuring throughput
    stopCh := make(chan struct{})
    throughputCollector := newThroughputCollector(podInformer)
    go throughputCollector.run(stopCh)

    // Scheduling the main workload
    config = testutils.NewTestPodCreatorConfig()
    config.AddStrategy(testNamespace, test.PodsToSchedule.Num, testPodStrategy)
    podCreator = testutils.NewTestPodCreator(clientset, config)
    podCreator.CreatePods()

    <-completedCh
    close(stopCh)

    // Note: without this line we're taking the overhead of defer() into account.
    b.StopTimer()

    setNameLabel := func(dataItem *DataItem) DataItem {
        if dataItem.Labels == nil {
            dataItem.Labels = map[string]string{}
        }
        dataItem.Labels["Name"] = b.Name()
        return *dataItem
    }

    dataItems := []DataItem{
        setNameLabel(throughputCollector.collect()),
    }

    for _, metric := range defaultMetrics {
        dataItem := newMetricsCollector(metric).collect()
        if dataItem == nil {
            continue
        }
        dataItems = append(dataItems, setNameLabel(dataItem))
    }

    return dataItems
}

func getPodStrategy(pc podCase) testutils.TestPodCreateStrategy {
@@ -17,15 +17,34 @@ limitations under the License.

package benchmark

import (
    "encoding/json"
    "flag"
    "fmt"
    "io/ioutil"
    "math"
    "path"
    "sort"
    "time"

    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/labels"
    "k8s.io/apimachinery/pkg/runtime/schema"
    coreinformers "k8s.io/client-go/informers/core/v1"
    clientset "k8s.io/client-go/kubernetes"
    restclient "k8s.io/client-go/rest"
    "k8s.io/component-base/metrics/legacyregistry"
    "k8s.io/component-base/metrics/testutil"
    "k8s.io/klog"
    "k8s.io/kubernetes/test/integration/util"
)

const (
    dateFormat                = "2006-01-02T15:04:05Z"
    throughputSampleFrequency = time.Second
)

var dataItemsDir = flag.String("data-items-dir", "", "destination directory for storing generated data items for perf dashboard")

// mustSetupScheduler starts the following components:
// - k8s api server (a.k.a. master)
// - scheduler
@@ -66,3 +85,145 @@ func getScheduledPods(podInformer coreinformers.PodInformer) ([]*v1.Pod, error)
    }
    return scheduled, nil
}

// DataItem is the data point.
type DataItem struct {
    // Data is a map from bucket to real data point (e.g. "Perc90" -> 23.5). Notice
    // that all data items with the same label combination should have the same buckets.
    Data map[string]float64 `json:"data"`
    // Unit is the data unit. Notice that all data items with the same label combination
    // should have the same unit.
    Unit string `json:"unit"`
    // Labels is the set of labels attached to the data item.
    Labels map[string]string `json:"labels,omitempty"`
}

// DataItems is the data point set. It is the struct that the perf dashboard expects.
type DataItems struct {
    Version   string     `json:"version"`
    DataItems []DataItem `json:"dataItems"`
}
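
For orientation, here is a self-contained sketch (with purely illustrative numbers) of the JSON these two structs marshal to, i.e. the format handed to the perf dashboard:

package main

import (
    "encoding/json"
    "fmt"
)

type DataItem struct {
    Data   map[string]float64 `json:"data"`
    Unit   string             `json:"unit"`
    Labels map[string]string  `json:"labels,omitempty"`
}

type DataItems struct {
    Version   string     `json:"version"`
    DataItems []DataItem `json:"dataItems"`
}

func main() {
    out, _ := json.MarshalIndent(DataItems{
        Version: "v1",
        DataItems: []DataItem{{
            // Example values only; real runs fill these in from the collectors.
            Data:   map[string]float64{"Average": 110.0, "Perc50": 100.0, "Perc90": 200.0, "Perc99": 300.0},
            Unit:   "ms",
            Labels: map[string]string{"Metric": "scheduler_binding_duration_seconds", "Name": "BenchmarkPerfScheduling"},
        }},
    }, "", "  ")
    fmt.Println(string(out))
}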

func dataItems2JSONFile(dataItems DataItems, namePrefix string) error {
    b, err := json.Marshal(dataItems)
    if err != nil {
        return err
    }

    destFile := fmt.Sprintf("%v_%v.json", namePrefix, time.Now().Format(dateFormat))
    if *dataItemsDir != "" {
        destFile = path.Join(*dataItemsDir, destFile)
    }

    return ioutil.WriteFile(destFile, b, 0644)
}
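
As a side note, dateFormat is a Go reference-time layout, so this function produces file names such as <prefix>_2006-01-02T15:04:05Z.json. A small sketch of the naming logic (the directory is a made-up example standing in for -data-items-dir):

package main

import (
    "fmt"
    "path"
    "time"
)

const dateFormat = "2006-01-02T15:04:05Z"

func main() {
    // Mirrors dataItems2JSONFile: <namePrefix>_<timestamp>.json, optionally
    // placed under the -data-items-dir directory.
    destFile := fmt.Sprintf("%v_%v.json", "BenchmarkPerfScheduling", time.Now().Format(dateFormat))
    fmt.Println(path.Join("/tmp/perf-results", destFile))
}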

// metricsCollector collects metrics from the legacyregistry.DefaultGatherer.Gather() endpoint.
// Currently only Histogram metrics are supported.
type metricsCollector struct {
    metric string
}

func newMetricsCollector(metric string) *metricsCollector {
    return &metricsCollector{
        metric: metric,
    }
}

func (pc *metricsCollector) collect() *DataItem {
    hist, err := testutil.GetHistogramFromGatherer(legacyregistry.DefaultGatherer, pc.metric)
    if err != nil {
        klog.Error(err)
        return nil
    }

    if err := hist.Validate(); err != nil {
        klog.Error(err)
        return nil
    }

    q50 := hist.Quantile(0.50)
    q90 := hist.Quantile(0.90)
    q99 := hist.Quantile(0.99)
    avg := hist.Average()

    // Clear the metrics so that the next test always starts with empty Prometheus
    // metrics (the metrics are shared among all tests run inside the same binary).
    hist.Clear()

    // The scheduler histograms record seconds; the dashboard expects milliseconds.
    msFactor := float64(time.Second) / float64(time.Millisecond)

    return &DataItem{
        Labels: map[string]string{
            "Metric": pc.metric,
        },
        Data: map[string]float64{
            "Perc50":  q50 * msFactor,
            "Perc90":  q90 * msFactor,
            "Perc99":  q99 * msFactor,
            "Average": avg * msFactor,
        },
        Unit: "ms",
    }
}
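
To make the collect() path concrete, here is a rough standalone sketch, assuming the k8s.io/component-base/metrics constructor API; the metric name and buckets are hypothetical:

package main

import (
    "fmt"

    "k8s.io/component-base/metrics"
    "k8s.io/component-base/metrics/legacyregistry"
    "k8s.io/component-base/metrics/testutil"
)

func main() {
    // Hypothetical histogram; the benchmark instead reads the scheduler's own
    // histograms listed in defaultMetrics.
    h := metrics.NewHistogram(&metrics.HistogramOpts{
        Name:    "demo_duration_seconds",
        Help:    "demo histogram",
        Buckets: []float64{0.01, 0.1, 1, 10},
    })
    legacyregistry.MustRegister(h)
    for _, v := range []float64{0.02, 0.2, 0.3, 2.0} {
        h.Observe(v)
    }

    hist, err := testutil.GetHistogramFromGatherer(legacyregistry.DefaultGatherer, "demo_duration_seconds")
    if err != nil {
        panic(err)
    }
    // Values are in seconds here; collect() multiplies by msFactor to report ms.
    fmt.Println(hist.Average(), hist.Quantile(0.99))
}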

type throughputCollector struct {
    podInformer           coreinformers.PodInformer
    schedulingThroughputs []float64
}

func newThroughputCollector(podInformer coreinformers.PodInformer) *throughputCollector {
    return &throughputCollector{
        podInformer: podInformer,
    }
}

func (tc *throughputCollector) run(stopCh chan struct{}) {
    podsScheduled, err := getScheduledPods(tc.podInformer)
    if err != nil {
        klog.Fatalf("%v", err)
    }
    lastScheduledCount := len(podsScheduled)
    for {
        select {
        case <-stopCh:
            return
        case <-time.After(throughputSampleFrequency):
            podsScheduled, err := getScheduledPods(tc.podInformer)
            if err != nil {
                klog.Fatalf("%v", err)
            }

            scheduled := len(podsScheduled)
            samplingRatioSeconds := float64(throughputSampleFrequency) / float64(time.Second)
            throughput := float64(scheduled-lastScheduledCount) / samplingRatioSeconds
            tc.schedulingThroughputs = append(tc.schedulingThroughputs, throughput)
            lastScheduledCount = scheduled

            klog.Infof("%d pods scheduled", lastScheduledCount)
        }
    }
}
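
The sampling arithmetic above reduces to "newly scheduled pods per sampling interval, normalized to seconds". A toy illustration with made-up counts:

package main

import (
    "fmt"
    "time"
)

func main() {
    const throughputSampleFrequency = time.Second
    samplingRatioSeconds := float64(throughputSampleFrequency) / float64(time.Second) // 1.0
    lastScheduledCount, scheduled := 120, 150 // hypothetical pod counts, one sample apart
    throughput := float64(scheduled-lastScheduledCount) / samplingRatioSeconds
    fmt.Printf("%.0f pods/s\n", throughput) // 30 pods/s
}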

func (tc *throughputCollector) collect() *DataItem {
    throughputSummary := &DataItem{}
    if length := len(tc.schedulingThroughputs); length > 0 {
        sort.Float64s(tc.schedulingThroughputs)
        sum := 0.0
        for i := range tc.schedulingThroughputs {
            sum += tc.schedulingThroughputs[i]
        }

        throughputSummary.Labels = map[string]string{
            "Metric": "SchedulingThroughput",
        }
        throughputSummary.Data = map[string]float64{
            "Average": sum / float64(length),
            "Perc50":  tc.schedulingThroughputs[int(math.Ceil(float64(length*50)/100))-1],
            "Perc90":  tc.schedulingThroughputs[int(math.Ceil(float64(length*90)/100))-1],
            "Perc99":  tc.schedulingThroughputs[int(math.Ceil(float64(length*99)/100))-1],
        }
        throughputSummary.Unit = "pods/s"
    }
    return throughputSummary
}
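
The percentile computation here is the nearest-rank method: for p percent over n sorted samples, take index ceil(n*p/100)-1. A quick self-contained check with ten hypothetical samples:

package main

import (
    "fmt"
    "math"
    "sort"
)

func nearestRank(sorted []float64, p int) float64 {
    return sorted[int(math.Ceil(float64(len(sorted)*p)/100))-1]
}

func main() {
    samples := []float64{12, 7, 9, 15, 11, 8, 14, 10, 13, 6} // made-up pods/s samples
    sort.Float64s(samples)
    // With n=10: Perc50 -> index 4, Perc90 -> index 8, Perc99 -> index 9.
    fmt.Println(nearestRank(samples, 50), nearestRank(samples, 90), nearestRank(samples, 99)) // 10 14 15
}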