diff --git a/test/e2e/framework/metrics/BUILD b/test/e2e/framework/metrics/BUILD
index 940075d7d4d..4e36fcfcd20 100644
--- a/test/e2e/framework/metrics/BUILD
+++ b/test/e2e/framework/metrics/BUILD
@@ -27,17 +27,14 @@ go_library(
         "//pkg/kubelet/dockershim/metrics:go_default_library",
         "//pkg/kubelet/metrics:go_default_library",
         "//pkg/master/ports:go_default_library",
-        "//pkg/scheduler/metrics:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/fields:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/util/sets:go_default_library",
         "//staging/src/k8s.io/client-go/kubernetes:go_default_library",
         "//staging/src/k8s.io/component-base/metrics/testutil:go_default_library",
         "//test/e2e/framework/log:go_default_library",
-        "//test/e2e/framework/ssh:go_default_library",
         "//test/e2e/perftype:go_default_library",
         "//test/e2e/system:go_default_library",
-        "//vendor/github.com/onsi/gomega:go_default_library",
         "//vendor/k8s.io/klog:go_default_library",
     ],
 )
diff --git a/test/e2e/framework/metrics/api.go b/test/e2e/framework/metrics/api.go
index c0ea733c246..cb0d2766290 100644
--- a/test/e2e/framework/metrics/api.go
+++ b/test/e2e/framework/metrics/api.go
@@ -18,7 +18,6 @@ package metrics
 
 import (
     "fmt"
-    "time"
 
     e2eperftype "k8s.io/kubernetes/test/e2e/perftype"
 )
@@ -61,51 +60,6 @@ func (a *APIResponsiveness) Less(i, j int) bool {
     return a.APICalls[i].Latency.Perc99 < a.APICalls[j].Latency.Perc99
 }
 
-// Set request latency for a particular quantile in the APICall metric entry (creating one if necessary).
-// 0 <= quantile <=1 (e.g. 0.95 is 95%tile, 0.5 is median)
-// Only 0.5, 0.9 and 0.99 quantiles are supported.
-func (a *APIResponsiveness) addMetricRequestLatency(resource, subresource, verb, scope string, quantile float64, latency time.Duration) {
-    for i, apicall := range a.APICalls {
-        if apicall.Resource == resource && apicall.Subresource == subresource && apicall.Verb == verb && apicall.Scope == scope {
-            a.APICalls[i] = setQuantileAPICall(apicall, quantile, latency)
-            return
-        }
-    }
-    apicall := setQuantileAPICall(APICall{Resource: resource, Subresource: subresource, Verb: verb, Scope: scope}, quantile, latency)
-    a.APICalls = append(a.APICalls, apicall)
-}
-
-// 0 <= quantile <=1 (e.g. 0.95 is 95%tile, 0.5 is median)
-// Only 0.5, 0.9 and 0.99 quantiles are supported.
-func setQuantileAPICall(apicall APICall, quantile float64, latency time.Duration) APICall {
-    setQuantile(&apicall.Latency, quantile, latency)
-    return apicall
-}
-
-// Only 0.5, 0.9 and 0.99 quantiles are supported.
-func setQuantile(metric *LatencyMetric, quantile float64, latency time.Duration) {
-    switch quantile {
-    case 0.5:
-        metric.Perc50 = latency
-    case 0.9:
-        metric.Perc90 = latency
-    case 0.99:
-        metric.Perc99 = latency
-    }
-}
-
-// Add request count to the APICall metric entry (creating one if necessary).
-func (a *APIResponsiveness) addMetricRequestCount(resource, subresource, verb, scope string, count int) {
-    for i, apicall := range a.APICalls {
-        if apicall.Resource == resource && apicall.Subresource == subresource && apicall.Verb == verb && apicall.Scope == scope {
-            a.APICalls[i].Count += count
-            return
-        }
-    }
-    apicall := APICall{Resource: resource, Subresource: subresource, Verb: verb, Count: count, Scope: scope}
-    a.APICalls = append(a.APICalls, apicall)
-}
-
 // currentAPICallMetricsVersion is the current apicall performance metrics version. We should
 // bump up the version each time we make incompatible change to the metrics.
 const currentAPICallMetricsVersion = "v1"
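For reference, the helpers removed from api.go above implement a find-or-append lookup keyed by (resource, subresource, verb, scope) and store only the 0.5, 0.9 and 0.99 quantiles. A minimal, self-contained sketch of that pattern follows; the types are simplified stand-ins (the key is shortened to resource and verb for brevity), not the framework's real definitions.

package main

import (
    "fmt"
    "time"
)

// Simplified stand-ins for the framework's LatencyMetric and APICall types.
type LatencyMetric struct {
    Perc50, Perc90, Perc99 time.Duration
}

type APICall struct {
    Resource, Verb string
    Latency        LatencyMetric
    Count          int
}

type APIResponsiveness struct {
    APICalls []APICall
}

// setQuantile mirrors the removed helper: only the 0.5, 0.9 and 0.99
// quantiles are stored; any other quantile is silently ignored.
func setQuantile(m *LatencyMetric, quantile float64, latency time.Duration) {
    switch quantile {
    case 0.5:
        m.Perc50 = latency
    case 0.9:
        m.Perc90 = latency
    case 0.99:
        m.Perc99 = latency
    }
}

// addMetricRequestLatency finds the entry for (resource, verb), creating
// one if necessary, and records the latency at the given quantile.
func (a *APIResponsiveness) addMetricRequestLatency(resource, verb string, quantile float64, latency time.Duration) {
    for i := range a.APICalls {
        if a.APICalls[i].Resource == resource && a.APICalls[i].Verb == verb {
            setQuantile(&a.APICalls[i].Latency, quantile, latency)
            return
        }
    }
    call := APICall{Resource: resource, Verb: verb}
    setQuantile(&call.Latency, quantile, latency)
    a.APICalls = append(a.APICalls, call)
}

func main() {
    var a APIResponsiveness
    a.addMetricRequestLatency("pods", "LIST", 0.99, 120*time.Millisecond)
    a.addMetricRequestLatency("pods", "LIST", 0.5, 8*time.Millisecond)
    fmt.Printf("%+v\n", a.APICalls)
}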
diff --git a/test/e2e/framework/metrics/latencies.go b/test/e2e/framework/metrics/latencies.go
index 3e7b4042bbc..ffb39d4e611 100644
--- a/test/e2e/framework/metrics/latencies.go
+++ b/test/e2e/framework/metrics/latencies.go
@@ -17,25 +17,11 @@ limitations under the License.
 package metrics
 
 import (
-    "context"
     "fmt"
     "math"
-    "sort"
-    "strconv"
-    "strings"
     "time"
 
-    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-    "k8s.io/apimachinery/pkg/util/sets"
-    clientset "k8s.io/client-go/kubernetes"
-    "k8s.io/component-base/metrics/testutil"
-    "k8s.io/kubernetes/pkg/master/ports"
-    schedulermetric "k8s.io/kubernetes/pkg/scheduler/metrics"
     e2elog "k8s.io/kubernetes/test/e2e/framework/log"
-    e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
-    "k8s.io/kubernetes/test/e2e/system"
-
-    "github.com/onsi/gomega"
 )
@@ -43,119 +29,8 @@ const (
     // transient failures from failing tests.
     // TODO: client should not apply this timeout to Watch calls. Increased from 30s until that is fixed.
     SingleCallTimeout = 5 * time.Minute
-
-    // nodeStartupThreshold is a rough estimate of the time allocated for a pod to start on a node.
-    nodeStartupThreshold = 4 * time.Second
-
-    // We are setting 1s threshold for apicalls even in small clusters to avoid flakes.
-    // The problem is that if long GC is happening in small clusters (where we have e.g.
-    // 1-core master machines) and tests are pretty short, it may consume significant
-    // portion of CPU and basically stop all the real work.
-    // Increasing threshold to 1s is within our SLO and should solve this problem.
-    apiCallLatencyThreshold time.Duration = 1 * time.Second
-
-    // We use a higher threshold for list apicalls if the cluster is big (i.e having > 500 nodes)
-    // as list response sizes are bigger in general for big clusters. We also use a higher threshold
-    // for list calls at cluster scope (this includes non-namespaced and all-namespaced calls).
-    apiListCallLatencyThreshold      time.Duration = 5 * time.Second
-    apiClusterScopeListCallThreshold time.Duration = 10 * time.Second
-    bigClusterNodeCountThreshold                   = 500
 )
 
-var schedulingLatencyMetricName = schedulermetric.SchedulerSubsystem + "_" + schedulermetric.SchedulingLatencyName
-
-func readLatencyMetrics(c clientset.Interface) (*APIResponsiveness, error) {
-    var a APIResponsiveness
-
-    body, err := getMetrics(c)
-    if err != nil {
-        return nil, err
-    }
-
-    samples, err := testutil.ExtractMetricSamples(body)
-    if err != nil {
-        return nil, err
-    }
-
-    ignoredResources := sets.NewString("events")
-    // TODO: figure out why we're getting non-capitalized proxy and fix this.
-    ignoredVerbs := sets.NewString("WATCH", "WATCHLIST", "PROXY", "proxy", "CONNECT")
-
-    for _, sample := range samples {
-        // Example line:
-        // apiserver_request_latencies_summary{resource="namespaces",verb="LIST",quantile="0.99"} 908
-        // apiserver_request_total{resource="pods",verb="LIST",client="kubectl",code="200",contentType="json"} 233
-        if sample.Metric[testutil.MetricNameLabel] != "apiserver_request_latencies_summary" &&
-            sample.Metric[testutil.MetricNameLabel] != "apiserver_request_total" {
-            continue
-        }
-
-        resource := string(sample.Metric["resource"])
-        subresource := string(sample.Metric["subresource"])
-        verb := string(sample.Metric["verb"])
-        scope := string(sample.Metric["scope"])
-        if ignoredResources.Has(resource) || ignoredVerbs.Has(verb) {
-            continue
-        }
-
-        switch sample.Metric[testutil.MetricNameLabel] {
-        case "apiserver_request_latencies_summary":
-            latency := sample.Value
-            quantile, err := strconv.ParseFloat(string(sample.Metric[testutil.QuantileLabel]), 64)
-            if err != nil {
-                return nil, err
-            }
-            a.addMetricRequestLatency(resource, subresource, verb, scope, quantile, time.Duration(int64(latency))*time.Microsecond)
-        case "apiserver_request_total":
-            count := sample.Value
-            a.addMetricRequestCount(resource, subresource, verb, scope, int(count))
-
-        }
-    }
-
-    return &a, err
-}
-
-// HighLatencyRequests prints top five summary metrics for request types with latency and returns
-// number of such request types above threshold. We use a higher threshold for
-// list calls if nodeCount is above a given threshold (i.e. cluster is big).
-func HighLatencyRequests(c clientset.Interface, nodeCount int) (int, *APIResponsiveness, error) {
-    isBigCluster := (nodeCount > bigClusterNodeCountThreshold)
-    metrics, err := readLatencyMetrics(c)
-    if err != nil {
-        return 0, metrics, err
-    }
-    sort.Sort(sort.Reverse(metrics))
-    badMetrics := 0
-    top := 5
-    for i := range metrics.APICalls {
-        latency := metrics.APICalls[i].Latency.Perc99
-        isListCall := (metrics.APICalls[i].Verb == "LIST")
-        isClusterScopedCall := (metrics.APICalls[i].Scope == "cluster")
-        isBad := false
-        latencyThreshold := apiCallLatencyThreshold
-        if isListCall && isBigCluster {
-            latencyThreshold = apiListCallLatencyThreshold
-            if isClusterScopedCall {
-                latencyThreshold = apiClusterScopeListCallThreshold
-            }
-        }
-        if latency > latencyThreshold {
-            isBad = true
-            badMetrics++
-        }
-        if top > 0 || isBad {
-            top--
-            prefix := ""
-            if isBad {
-                prefix = "WARNING "
-            }
-            e2elog.Logf("%vTop latency metric: %+v", prefix, metrics.APICalls[i])
-        }
-    }
-    return badMetrics, metrics, nil
-}
-
 // VerifyLatencyWithinThreshold verifies whether 50, 90 and 99th percentiles of a latency metric are
 // within the expected threshold.
 func VerifyLatencyWithinThreshold(threshold, actual LatencyMetric, metricName string) error {
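The hunk above also removes HighLatencyRequests along with its threshold constants. Its tiered threshold selection (1s for ordinary calls, 5s for LIST calls in clusters above 500 nodes, 10s for cluster-scoped LISTs there) distills to the following sketch of the removed logic; it is illustrative, not framework API.

package main

import (
    "fmt"
    "time"
)

// Thresholds and node-count cutoff as defined in the removed const block.
const (
    apiCallLatencyThreshold          = 1 * time.Second
    apiListCallLatencyThreshold      = 5 * time.Second
    apiClusterScopeListCallThreshold = 10 * time.Second
    bigClusterNodeCountThreshold     = 500
)

// latencyThresholdFor reproduces the selection logic of the removed
// HighLatencyRequests: LIST calls in big clusters are allowed 5s, and
// cluster-scoped LISTs 10s; everything else must stay under 1s.
func latencyThresholdFor(verb, scope string, nodeCount int) time.Duration {
    if verb == "LIST" && nodeCount > bigClusterNodeCountThreshold {
        if scope == "cluster" {
            return apiClusterScopeListCallThreshold
        }
        return apiListCallLatencyThreshold
    }
    return apiCallLatencyThreshold
}

func main() {
    fmt.Println(latencyThresholdFor("GET", "namespace", 1000))  // 1s
    fmt.Println(latencyThresholdFor("LIST", "namespace", 1000)) // 5s
    fmt.Println(latencyThresholdFor("LIST", "cluster", 1000))   // 10s
}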
@@ -171,147 +46,6 @@ func VerifyLatencyWithinThreshold(threshold, actual LatencyMetric, metricName st
     return nil
 }
 
-// ResetMetrics resets latency metrics in apiserver.
-func ResetMetrics(c clientset.Interface) error {
-    e2elog.Logf("Resetting latency metrics in apiserver...")
-    body, err := c.CoreV1().RESTClient().Delete().AbsPath("/metrics").DoRaw()
-    if err != nil {
-        return err
-    }
-    if string(body) != "metrics reset\n" {
-        return fmt.Errorf("Unexpected response: %q", string(body))
-    }
-    return nil
-}
-
-// Retrieves metrics information.
-func getMetrics(c clientset.Interface) (string, error) {
-    body, err := c.CoreV1().RESTClient().Get().AbsPath("/metrics").DoRaw()
-    if err != nil {
-        return "", err
-    }
-    return string(body), nil
-}
-
-// Sends REST request to kube scheduler metrics
-func sendRestRequestToScheduler(c clientset.Interface, op, provider, cloudMasterName, masterHostname string) (string, error) {
-    opUpper := strings.ToUpper(op)
-    if opUpper != "GET" && opUpper != "DELETE" {
-        return "", fmt.Errorf("Unknown REST request")
-    }
-
-    nodes, err := c.CoreV1().Nodes().List(metav1.ListOptions{})
-    // The following 4 lines are intended to replace framework.ExpectNoError(err)
-    if err != nil {
-        e2elog.Logf("Unexpected error occurred: %v", err)
-    }
-    gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred())
-
-    var masterRegistered = false
-    for _, node := range nodes.Items {
-        if system.DeprecatedMightBeMasterNode(node.Name) {
-            masterRegistered = true
-        }
-    }
-
-    var responseText string
-    if masterRegistered {
-        ctx, cancel := context.WithTimeout(context.Background(), SingleCallTimeout)
-        defer cancel()
-
-        body, err := c.CoreV1().RESTClient().Verb(opUpper).
-            Context(ctx).
-            Namespace(metav1.NamespaceSystem).
-            Resource("pods").
-            Name(fmt.Sprintf("kube-scheduler-%v:%v", cloudMasterName, ports.InsecureSchedulerPort)).
-            SubResource("proxy").
-            Suffix("metrics").
-            Do().Raw()
-
-        // The following 4 lines are intended to replace
-        // framework.ExpectNoError(err).
-        if err != nil {
-            e2elog.Logf("Unexpected error occurred: %v", err)
-        }
-        gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred())
-        responseText = string(body)
-    } else {
-        // If master is not registered fall back to old method of using SSH.
-        if provider == "gke" || provider == "eks" {
-            e2elog.Logf("Not grabbing scheduler metrics through master SSH: unsupported for %s", provider)
-            return "", nil
-        }
-
-        cmd := "curl -X " + opUpper + " http://localhost:10251/metrics"
-        sshResult, err := e2essh.SSH(cmd, masterHostname+":22", provider)
-        if err != nil || sshResult.Code != 0 {
-            return "", fmt.Errorf("unexpected error (code: %d) in ssh connection to master: %#v", sshResult.Code, err)
-        }
-        responseText = sshResult.Stdout
-    }
-    return responseText, nil
-}
-
-// Retrieves scheduler latency metrics.
-func getSchedulingLatency(c clientset.Interface, provider, cloudMasterName, masterHostname string) (*SchedulingMetrics, error) {
-    result := SchedulingMetrics{}
-    data, err := sendRestRequestToScheduler(c, "GET", provider, cloudMasterName, masterHostname)
-    if err != nil {
-        return nil, err
-    }
-
-    samples, err := testutil.ExtractMetricSamples(data)
-    if err != nil {
-        return nil, err
-    }
-
-    for _, sample := range samples {
-        if string(sample.Metric[testutil.MetricNameLabel]) != schedulingLatencyMetricName {
-            continue
-        }
-
-        var metric *LatencyMetric
-        switch sample.Metric[schedulermetric.OperationLabel] {
-        case schedulermetric.PredicateEvaluation:
-            metric = &result.PredicateEvaluationLatency
-        case schedulermetric.PriorityEvaluation:
-            metric = &result.PriorityEvaluationLatency
-        case schedulermetric.PreemptionEvaluation:
-            metric = &result.PreemptionEvaluationLatency
-        case schedulermetric.Binding:
-            metric = &result.BindingLatency
-        }
-        if metric == nil {
-            continue
-        }
-
-        quantile, err := strconv.ParseFloat(string(sample.Metric[testutil.QuantileLabel]), 64)
-        if err != nil {
-            return nil, err
-        }
-        setQuantile(metric, quantile, time.Duration(int64(float64(sample.Value)*float64(time.Second))))
-    }
-    return &result, nil
-}
-
-// VerifySchedulerLatency verifies (currently just by logging them) the scheduling latencies.
-func VerifySchedulerLatency(c clientset.Interface, provider, cloudMasterName, masterHostname string) (*SchedulingMetrics, error) {
-    latency, err := getSchedulingLatency(c, provider, cloudMasterName, masterHostname)
-    if err != nil {
-        return nil, err
-    }
-    return latency, nil
-}
-
-// ResetSchedulerMetrics sends a DELETE request to kube-scheduler for resetting metrics.
-func ResetSchedulerMetrics(c clientset.Interface, provider, cloudMasterName, masterHostname string) error {
-    responseText, err := sendRestRequestToScheduler(c, "DELETE", provider, cloudMasterName, masterHostname)
-    if err != nil {
-        return fmt.Errorf("Unexpected response: %q, %v", responseText, err)
-    }
-    return nil
-}
-
 // PodLatencyData encapsulates pod startup latency information.
 type PodLatencyData struct {
     // Name of the pod
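sendRestRequestToScheduler, removed above, reached the scheduler's /metrics endpoint through the apiserver's pod-proxy subresource whenever the master node was registered. The core call distills to the sketch below; the clientset c and masterName are assumed inputs, the pod-name format and insecure port 10251 are carried over from the removed code, and the Context(ctx)...Do().Raw() chain matches the client-go API of this era (newer client-go passes the context to Do(ctx) instead).

package metricsutil

import (
    "context"
    "fmt"
    "time"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    clientset "k8s.io/client-go/kubernetes"
)

// getSchedulerMetrics distills the proxy path of the removed
// sendRestRequestToScheduler: GET /metrics on the kube-scheduler pod via
// the apiserver's pod-proxy subresource. The pod name format
// ("kube-scheduler-<master>") and the insecure scheduler port 10251 are
// taken from the removed code.
func getSchedulerMetrics(c clientset.Interface, masterName string) (string, error) {
    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
    defer cancel()

    body, err := c.CoreV1().RESTClient().Get().
        Context(ctx).
        Namespace(metav1.NamespaceSystem).
        Resource("pods").
        Name(fmt.Sprintf("kube-scheduler-%v:%v", masterName, 10251)).
        SubResource("proxy").
        Suffix("metrics").
        Do().Raw()
    if err != nil {
        return "", err
    }
    return string(body), nil
}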
@@ -339,21 +73,6 @@ func ExtractLatencyMetrics(latencies []PodLatencyData) LatencyMetric {
     return LatencyMetric{Perc50: perc50, Perc90: perc90, Perc99: perc99, Perc100: perc100}
 }
 
-// LogSuspiciousLatency logs metrics/docker errors from all nodes that had slow startup times
-// If latencyDataLag is nil then it will be populated from latencyData
-func LogSuspiciousLatency(latencyData []PodLatencyData, latencyDataLag []PodLatencyData, nodeCount int, c clientset.Interface) {
-    if latencyDataLag == nil {
-        latencyDataLag = latencyData
-    }
-    for _, l := range latencyData {
-        if l.Latency > nodeStartupThreshold {
-            HighLatencyKubeletOperations(c, 1*time.Second, l.Node, e2elog.Logf)
-        }
-    }
-    e2elog.Logf("Approx throughput: %v pods/min",
-        float64(nodeCount)/(latencyDataLag[len(latencyDataLag)-1].Latency.Minutes()))
-}
-
 // PrintLatencies outputs latencies to log with readable format.
 func PrintLatencies(latencies []PodLatencyData, header string) {
     metrics := ExtractLatencyMetrics(latencies)
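ExtractLatencyMetrics survives the diff, but its body falls outside the hunk context shown above. The sketch below illustrates how percentiles are typically pulled from sorted PodLatencyData; the indexing scheme here is a plausible reconstruction for illustration, not the verbatim framework implementation.

package main

import (
    "fmt"
    "sort"
    "time"
)

// Local mirrors of the framework types kept by this diff.
type PodLatencyData struct {
    Name    string
    Node    string
    Latency time.Duration
}

type LatencyMetric struct {
    Perc50, Perc90, Perc99, Perc100 time.Duration
}

// extractLatencyMetrics sketches percentile extraction over pod startup
// latencies: sort ascending, then index into the slice at the 50th, 90th,
// 99th and 100th percentile positions.
func extractLatencyMetrics(latencies []PodLatencyData) LatencyMetric {
    sort.Slice(latencies, func(i, j int) bool {
        return latencies[i].Latency < latencies[j].Latency
    })
    n := len(latencies)
    return LatencyMetric{
        Perc50:  latencies[n/2].Latency,
        Perc90:  latencies[(n*9)/10].Latency,
        Perc99:  latencies[(n*99)/100].Latency,
        Perc100: latencies[n-1].Latency,
    }
}

func main() {
    data := []PodLatencyData{
        {Name: "pod-a", Latency: 2 * time.Second},
        {Name: "pod-b", Latency: 900 * time.Millisecond},
        {Name: "pod-c", Latency: 5 * time.Second},
    }
    fmt.Printf("%+v\n", extractLatencyMetrics(data))
}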