Merge pull request #50913 from shyamjvs/list-call-slo

Automatic merge from submit-queue (batch tested with PRs 50893, 50913, 50963, 50629, 50640)

Increase latency threshold for list api calls

This is only a short-term solution to make our density test green. In the long term, we should measure against our new SLIs.
From @wojtek-t's [doc](https://docs.google.com/document/d/1Q5qxdeBPgTTIXZxdsFILg7kgqWhvOwY8uROEf0j5YBw) on the new SLIs/SLOs, we have the following SLO for list calls:

```
SLO1: In default Kubernetes installation, 99th percentile of SLI2 per cluster-day:
<= 1s if total number of objects of the same type as resource in the system <= X
<= 5s if total number of objects of the same type as resource in the system <= Y
<= 30s if total number of objects of the same type as resource in the system <= Z
```

I would guess that 170,000 pods would fall into the 2nd bracket (at least) and hence the new value of 5s. WDYT?
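
To make the bracket reasoning concrete, here is a minimal Go sketch (not code from this PR) of how the quoted SLO maps an object count to a target latency. The boundaries X and Y are left as parameters because the doc above does not pin them down:

```go
package slo

import "time"

// listLatencySLO is a hypothetical helper mirroring the quoted SLO: the
// latency target for LIST calls grows with the number of objects of the
// listed resource type. x and y stand in for the unspecified boundaries X and Y.
func listLatencySLO(objectCount, x, y int) time.Duration {
	switch {
	case objectCount <= x:
		return 1 * time.Second
	case objectCount <= y:
		return 5 * time.Second
	default:
		// Third bracket (object counts up to Z); counts beyond Z are not
		// covered by the quoted SLO.
		return 30 * time.Second
	}
}
```

Under that reading, 170,000 pods landing in the second bracket gives the 5s figure used in the diff below.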

cc @kubernetes/sig-scalability-misc @wojtek-t @gmarek
Commit fdf14b8218, authored by Kubernetes Submit Queue on 2017-08-22 05:31:07 -07:00 and committed by GitHub.
3 changed files with 17 additions and 12 deletions

```diff
@@ -51,9 +51,10 @@ const (
 	// Increasing threshold to 1s is within our SLO and should solve this problem.
 	apiCallLatencyThreshold time.Duration = 1 * time.Second
-	// We set a higher threshold for list apicalls as they can take more time when
-	// the list is really big. For eg. list nodes in a 5000-node cluster.
-	apiListCallLatencyThreshold time.Duration = 2 * time.Second
+	// We use a higher threshold for list apicalls if the cluster is big (i.e having > 500 nodes)
+	// as list response sizes are bigger in general for big clusters.
+	apiListCallLatencyThreshold time.Duration = 5 * time.Second
+	bigClusterNodeCountThreshold = 500
 
 	// Cluster Autoscaler metrics names
 	caFunctionMetric = "cluster_autoscaler_function_duration_seconds_bucket"
@@ -354,8 +355,10 @@ func readLatencyMetrics(c clientset.Interface) (*APIResponsiveness, error) {
 }
 
 // Prints top five summary metrics for request types with latency and returns
-// number of such request types above threshold.
-func HighLatencyRequests(c clientset.Interface) (int, *APIResponsiveness, error) {
+// number of such request types above threshold. We use a higher threshold for
+// list calls if nodeCount is above a given threshold (i.e. cluster is big).
+func HighLatencyRequests(c clientset.Interface, nodeCount int) (int, *APIResponsiveness, error) {
+	isBigCluster := (nodeCount > bigClusterNodeCountThreshold)
 	metrics, err := readLatencyMetrics(c)
 	if err != nil {
 		return 0, metrics, err
@@ -364,12 +367,14 @@ func HighLatencyRequests(c clientset.Interface) (int, *APIResponsiveness, error)
 	badMetrics := 0
 	top := 5
 	for i := range metrics.APICalls {
+		latency := metrics.APICalls[i].Latency.Perc99
+		isListCall := (metrics.APICalls[i].Verb == "LIST")
 		isBad := false
-		verb := metrics.APICalls[i].Verb
-		if verb != "LIST" && metrics.APICalls[i].Latency.Perc99 > apiCallLatencyThreshold ||
-			verb == "LIST" && metrics.APICalls[i].Latency.Perc99 > apiListCallLatencyThreshold {
-			badMetrics++
-			isBad = true
+		if latency > apiCallLatencyThreshold {
+			if !isListCall || !isBigCluster || (latency > apiListCallLatencyThreshold) {
+				isBad = true
+				badMetrics++
+			}
 		}
 		if top > 0 || isBad {
 			top--
```
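
For readers skimming the diff, the updated check reduces to a per-call effective threshold. The standalone sketch below is an illustration, not code in the PR; it restates the logic using the constant values introduced in the hunk above:

```go
package metricscheck

import "time"

const (
	apiCallLatencyThreshold      = 1 * time.Second
	apiListCallLatencyThreshold  = 5 * time.Second
	bigClusterNodeCountThreshold = 500
)

// effectiveThreshold restates the new HighLatencyRequests logic: only LIST
// calls in big clusters (> 500 nodes) get the relaxed 5s threshold; every
// other call is still held to 1s. A call is flagged as bad when its
// 99th-percentile latency exceeds this value.
func effectiveThreshold(verb string, nodeCount int) time.Duration {
	if verb == "LIST" && nodeCount > bigClusterNodeCountThreshold {
		return apiListCallLatencyThreshold
	}
	return apiCallLatencyThreshold
}
```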

```diff
@@ -328,7 +328,7 @@ var _ = SIGDescribe("Density", func() {
 		summaries := make([]framework.TestDataSummary, 0, 2)
 		// Verify latency metrics.
-		highLatencyRequests, metrics, err := framework.HighLatencyRequests(c)
+		highLatencyRequests, metrics, err := framework.HighLatencyRequests(c, nodeCount)
 		framework.ExpectNoError(err)
 		if err == nil {
 			summaries = append(summaries, metrics)
```

```diff
@@ -92,7 +92,7 @@ var _ = SIGDescribe("Load capacity", func() {
 	// TODO add flag that allows to skip cleanup on failure
 	AfterEach(func() {
 		// Verify latency metrics
-		highLatencyRequests, metrics, err := framework.HighLatencyRequests(clientset)
+		highLatencyRequests, metrics, err := framework.HighLatencyRequests(clientset, nodeCount)
 		framework.ExpectNoError(err)
 		if err == nil {
 			summaries := make([]framework.TestDataSummary, 0, 1)
```