Merge pull request #87897 from kubernetes/revert-85861-scheduler-perf-collect-data-items-from-metrics

Revert "Collect some of scheduling metrics and scheduling throughput"
Kubernetes Prow Robot, 2020-02-07 05:58:06 -08:00 (committed by GitHub)
commit 888bfd62c3
3 changed files with 2 additions and 314 deletions


@@ -20,10 +20,7 @@ go_library(
         "//staging/src/k8s.io/client-go/informers/core/v1:go_default_library",
         "//staging/src/k8s.io/client-go/kubernetes:go_default_library",
         "//staging/src/k8s.io/client-go/rest:go_default_library",
-        "//staging/src/k8s.io/component-base/metrics/legacyregistry:go_default_library",
         "//test/integration/util:go_default_library",
-        "//vendor/github.com/prometheus/client_model/go:go_default_library",
-        "//vendor/k8s.io/klog:go_default_library",
     ],
 )


@@ -38,15 +38,6 @@ const (
 	configFile = "config/performance-config.yaml"
 )
 
-var (
-	defaultMetrics = []string{
-		"scheduler_scheduling_algorithm_predicate_evaluation_seconds",
-		"scheduler_scheduling_algorithm_priority_evaluation_seconds",
-		"scheduler_binding_duration_seconds",
-		"scheduler_e2e_scheduling_duration_seconds",
-	}
-)
-
 // testCase configures a test case to run the scheduler performance test. Users should be able to
 // provide this via a YAML file.
 //
@@ -101,7 +92,6 @@ type testParams struct {
 }
 
 func BenchmarkPerfScheduling(b *testing.B) {
-	dataItems := DataItems{Version: "v1"}
 	tests := getSimpleTestCases(configFile)
 
 	for _, test := range tests {
@@ -110,15 +100,12 @@ func BenchmarkPerfScheduling(b *testing.B) {
 			for feature, flag := range test.FeatureGates {
 				defer featuregatetesting.SetFeatureGateDuringTest(b, utilfeature.DefaultFeatureGate, feature, flag)()
 			}
-			dataItems.DataItems = append(dataItems.DataItems, perfScheduling(test, b)...)
+			perfScheduling(test, b)
 		})
 	}
-	if err := dataItems2JSONFile(dataItems, b.Name()); err != nil {
-		klog.Fatalf("%v: unable to write measured data: %v", b.Name(), err)
-	}
 }
 
-func perfScheduling(test testCase, b *testing.B) []DataItem {
+func perfScheduling(test testCase, b *testing.B) {
 	var nodeStrategy testutils.PrepareNodeStrategy = &testutils.TrivialNodePrepareStrategy{}
 	if test.Nodes.NodeAllocatableStrategy != nil {
 		nodeStrategy = test.Nodes.NodeAllocatableStrategy
@@ -193,45 +180,15 @@ func perfScheduling(test testCase, b *testing.B) []DataItem {
 	// start benchmark
 	b.ResetTimer()
 
-	// Start measuring throughput
-	stopCh := make(chan struct{})
-	throughputCollector := newThroughputCollector(podInformer)
-	go throughputCollector.run(stopCh)
-
-	// Scheduling the main workload
 	config = testutils.NewTestPodCreatorConfig()
 	config.AddStrategy(testNamespace, test.PodsToSchedule.Num, testPodStrategy)
 	podCreator = testutils.NewTestPodCreator(clientset, config)
 	podCreator.CreatePods()
 
 	<-completedCh
-	close(stopCh)
 
 	// Note: without this line we're taking the overhead of defer() into account.
 	b.StopTimer()
-
-	setNameLabel := func(dataItem *DataItem) DataItem {
-		if dataItem.Labels == nil {
-			dataItem.Labels = map[string]string{}
-		}
-		dataItem.Labels["Name"] = b.Name()
-		return *dataItem
-	}
-
-	dataItems := []DataItem{
-		setNameLabel(throughputCollector.collect()),
-	}
-
-	for _, metric := range defaultMetrics {
-		dataItem := newPrometheusCollector(metric).collect()
-		if dataItem == nil {
-			continue
-		}
-		dataItems = append(dataItems, setNameLabel(dataItem))
-	}
-
-	return dataItems
 }
 
 func getPodStrategy(pc podCase) testutils.TestPodCreateStrategy {


@@ -17,34 +17,15 @@ limitations under the License.
 package benchmark
 
 import (
-	"encoding/json"
-	"flag"
-	"fmt"
-	"io/ioutil"
-	"math"
-	"path"
-	"sort"
-	"time"
-
-	dto "github.com/prometheus/client_model/go"
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/runtime/schema"
 	coreinformers "k8s.io/client-go/informers/core/v1"
 	clientset "k8s.io/client-go/kubernetes"
 	restclient "k8s.io/client-go/rest"
-	"k8s.io/component-base/metrics/legacyregistry"
-	"k8s.io/klog"
 	"k8s.io/kubernetes/test/integration/util"
 )
 
-const (
-	dateFormat                = "2006-01-02T15:04:05Z"
-	throughputSampleFrequency = time.Second
-)
-
-var dataItemsDir = flag.String("data-items-dir", "", "destination directory for storing generated data items for perf dashboard")
-
 // mustSetupScheduler starts the following components:
 // - k8s api server (a.k.a. master)
 // - scheduler
@@ -85,250 +66,3 @@ func getScheduledPods(podInformer coreinformers.PodInformer) ([]*v1.Pod, error)
 	}
 	return scheduled, nil
 }
-
-// DataItem is the data point.
-type DataItem struct {
-	// Data is a map from bucket to real data point (e.g. "Perc90" -> 23.5). Notice
-	// that all data items with the same label combination should have the same buckets.
-	Data map[string]float64 `json:"data"`
-	// Unit is the data unit. Notice that all data items with the same label combination
-	// should have the same unit.
-	Unit string `json:"unit"`
-	// Labels is the labels of the data item.
-	Labels map[string]string `json:"labels,omitempty"`
-}
-
-// DataItems is the data point set. It is the struct that perf dashboard expects.
-type DataItems struct {
-	Version   string     `json:"version"`
-	DataItems []DataItem `json:"dataItems"`
-}
-
-func dataItems2JSONFile(dataItems DataItems, namePrefix string) error {
-	b, err := json.Marshal(dataItems)
-	if err != nil {
-		return err
-	}
-	destFile := fmt.Sprintf("%v_%v.json", namePrefix, time.Now().Format(dateFormat))
-	if *dataItemsDir != "" {
-		destFile = path.Join(*dataItemsDir, destFile)
-	}
-	return ioutil.WriteFile(destFile, b, 0644)
-}
-
-// prometheusCollector collects metrics via legacyregistry.DefaultGatherer.Gather().
-// Currently only Histogram metrics are supported.
-type prometheusCollector struct {
-	metric string
-	cache  *dto.MetricFamily
-}
-
-func newPrometheusCollector(metric string) *prometheusCollector {
-	return &prometheusCollector{
-		metric: metric,
-	}
-}
-
-func (pc *prometheusCollector) collect() *DataItem {
-	var metricFamily *dto.MetricFamily
-	m, err := legacyregistry.DefaultGatherer.Gather()
-	if err != nil {
-		klog.Error(err)
-		return nil
-	}
-	for _, mFamily := range m {
-		if mFamily.Name != nil && *mFamily.Name == pc.metric {
-			metricFamily = mFamily
-			break
-		}
-	}
-
-	if metricFamily == nil {
-		klog.Infof("Metric %q not found", pc.metric)
-		return nil
-	}
-
-	if metricFamily.GetMetric() == nil {
-		klog.Infof("Metric %q is empty", pc.metric)
-		return nil
-	}
-
-	if len(metricFamily.GetMetric()) == 0 {
-		klog.Infof("Metric %q is empty", pc.metric)
-		return nil
-	}
-
-	// Histograms are stored under the first index (based on observation).
-	// Given there's only one histogram registered for each metric name, accessing
-	// the first index is sufficient.
-	dataItem := pc.promHist2Summary(metricFamily.GetMetric()[0].GetHistogram())
-	if dataItem.Data == nil {
-		return nil
-	}
-
-	// Clear the metrics so that the next test always starts with empty prometheus
-	// metrics (since the metrics are shared among all tests run inside the same binary).
-	clearPromHistogram(metricFamily.GetMetric()[0].GetHistogram())
-	return dataItem
-}
-
-// Bucket of a histogram
-type bucket struct {
-	upperBound float64
-	count      float64
-}
-
-func bucketQuantile(q float64, buckets []bucket) float64 {
-	if q < 0 {
-		return math.Inf(-1)
-	}
-	if q > 1 {
-		return math.Inf(+1)
-	}
-	if len(buckets) < 2 {
-		return math.NaN()
-	}
-
-	rank := q * buckets[len(buckets)-1].count
-	b := sort.Search(len(buckets)-1, func(i int) bool { return buckets[i].count >= rank })
-
-	if b == 0 {
-		return buckets[0].upperBound * (rank / buckets[0].count)
-	}
-
-	// linear approximation of b-th bucket
-	brank := rank - buckets[b-1].count
-	bSize := buckets[b].upperBound - buckets[b-1].upperBound
-	bCount := buckets[b].count - buckets[b-1].count
-
-	return buckets[b-1].upperBound + bSize*(brank/bCount)
-}
-
-func (pc *prometheusCollector) promHist2Summary(hist *dto.Histogram) *DataItem {
-	buckets := []bucket{}
-	if hist.SampleCount == nil || *hist.SampleCount == 0 {
-		return &DataItem{}
-	}
-	if hist.SampleSum == nil || *hist.SampleSum == 0 {
-		return &DataItem{}
-	}
-	for _, bckt := range hist.Bucket {
-		if bckt == nil {
-			return &DataItem{}
-		}
-		if bckt.UpperBound == nil || *bckt.UpperBound < 0 {
-			return &DataItem{}
-		}
-		buckets = append(buckets, bucket{
-			count:      float64(*bckt.CumulativeCount),
-			upperBound: *bckt.UpperBound,
-		})
-	}
-
-	// bucketQuantile expects the upper bound of the last bucket to be +inf
-	buckets[len(buckets)-1].upperBound = math.Inf(+1)
-
-	q50 := bucketQuantile(0.50, buckets)
-	q90 := bucketQuantile(0.90, buckets)
-	q99 := bucketQuantile(0.99, buckets)
-	msFactor := float64(time.Second) / float64(time.Millisecond)
-
-	return &DataItem{
-		Labels: map[string]string{
-			"Metric": pc.metric,
-		},
-		Data: map[string]float64{
-			"Perc50":  q50 * msFactor,
-			"Perc90":  q90 * msFactor,
-			"Perc99":  q99 * msFactor,
-			"Average": (*hist.SampleSum / float64(*hist.SampleCount)) * msFactor,
-		},
-		Unit: "ms",
-	}
-}
-
-func clearPromHistogram(hist *dto.Histogram) {
-	if hist.SampleCount != nil {
-		*hist.SampleCount = 0
-	}
-	if hist.SampleSum != nil {
-		*hist.SampleSum = 0
-	}
-	for _, b := range hist.Bucket {
-		if b.CumulativeCount != nil {
-			*b.CumulativeCount = 0
-		}
-		if b.UpperBound != nil {
-			*b.UpperBound = 0
-		}
-	}
-}
-
-type throughputCollector struct {
-	podInformer           coreinformers.PodInformer
-	schedulingThroughputs []float64
-}
-
-func newThroughputCollector(podInformer coreinformers.PodInformer) *throughputCollector {
-	return &throughputCollector{
-		podInformer: podInformer,
-	}
-}
-
-func (tc *throughputCollector) run(stopCh chan struct{}) {
-	podsScheduled, err := getScheduledPods(tc.podInformer)
-	if err != nil {
-		klog.Fatalf("%v", err)
-	}
-	lastScheduledCount := len(podsScheduled)
-	for {
-		select {
-		case <-stopCh:
-			return
-		case <-time.After(throughputSampleFrequency):
-			podsScheduled, err := getScheduledPods(tc.podInformer)
-			if err != nil {
-				klog.Fatalf("%v", err)
-			}
-			scheduled := len(podsScheduled)
-			samplingRatioSeconds := float64(throughputSampleFrequency) / float64(time.Second)
-			throughput := float64(scheduled-lastScheduledCount) / samplingRatioSeconds
-			tc.schedulingThroughputs = append(tc.schedulingThroughputs, throughput)
-			lastScheduledCount = scheduled
-
-			klog.Infof("%d pods scheduled", lastScheduledCount)
-		}
-	}
-}
-
-func (tc *throughputCollector) collect() *DataItem {
-	throughputSummary := &DataItem{}
-	if length := len(tc.schedulingThroughputs); length > 0 {
-		sort.Float64s(tc.schedulingThroughputs)
-		sum := 0.0
-		for i := range tc.schedulingThroughputs {
-			sum += tc.schedulingThroughputs[i]
-		}
-
-		throughputSummary.Labels = map[string]string{
-			"Metric": "SchedulingThroughput",
-		}
-		throughputSummary.Data = map[string]float64{
-			"Average": sum / float64(length),
-			"Perc50":  tc.schedulingThroughputs[int(math.Ceil(float64(length*50)/100))-1],
-			"Perc90":  tc.schedulingThroughputs[int(math.Ceil(float64(length*90)/100))-1],
-			"Perc99":  tc.schedulingThroughputs[int(math.Ceil(float64(length*99)/100))-1],
-		}
-		throughputSummary.Unit = "pods/s"
-	}
-
-	return throughputSummary
-}
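
For reference, the quantile estimation removed above interpolates within cumulative histogram buckets. Below is a minimal, self-contained sketch of that interpolation; it is illustrative only and not part of this diff, and the bucket values in main are hypothetical, chosen only to show the computation.

package main

import (
	"fmt"
	"math"
	"sort"
)

// bucket mirrors the reverted collector's histogram bucket: a cumulative
// observation count up to an upper bound.
type bucket struct {
	upperBound float64
	count      float64
}

// bucketQuantile estimates the q-th quantile from cumulative buckets by
// linear interpolation inside the bucket that contains the target rank,
// assuming the last bucket's upper bound is +Inf.
func bucketQuantile(q float64, buckets []bucket) float64 {
	if q < 0 {
		return math.Inf(-1)
	}
	if q > 1 {
		return math.Inf(+1)
	}
	if len(buckets) < 2 {
		return math.NaN()
	}
	rank := q * buckets[len(buckets)-1].count
	b := sort.Search(len(buckets)-1, func(i int) bool { return buckets[i].count >= rank })
	if b == 0 {
		return buckets[0].upperBound * (rank / buckets[0].count)
	}
	// Interpolate linearly between the previous bucket's bound and this one's.
	brank := rank - buckets[b-1].count
	bSize := buckets[b].upperBound - buckets[b-1].upperBound
	bCount := buckets[b].count - buckets[b-1].count
	return buckets[b-1].upperBound + bSize*(brank/bCount)
}

func main() {
	// Hypothetical cumulative buckets: 40 observations <= 0.1s, 90 <= 0.5s, 100 total.
	buckets := []bucket{
		{upperBound: 0.1, count: 40},
		{upperBound: 0.5, count: 90},
		{upperBound: math.Inf(+1), count: 100},
	}
	fmt.Printf("Perc50=%.2fs Perc90=%.2fs\n", bucketQuantile(0.50, buckets), bucketQuantile(0.90, buckets))
}

With these hypothetical buckets, the 50th-percentile rank (50) falls inside the 0.1s-0.5s bucket, so the estimate is 0.1 + 0.4*(10/50) = 0.18s, and the 90th percentile lands exactly on that bucket's upper bound, 0.5s.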