From 551cec2a21c5480f2f55cde6cc69857b85e4031b Mon Sep 17 00:00:00 2001
From: Filip Grzadkowski
Date: Wed, 29 Apr 2015 12:24:01 +0200
Subject: [PATCH] Verify request latency in e2e performance tests.

---
 test/e2e/density.go |  7 +++++
 test/e2e/util.go    | 67 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)

diff --git a/test/e2e/density.go b/test/e2e/density.go
index 1c721416e6a..6d6b11b0bc7 100644
--- a/test/e2e/density.go
+++ b/test/e2e/density.go
@@ -154,6 +154,13 @@ var _ = Describe("Density", func() {
 			// Tune the threshold for allowed failures.
 			badEvents := BadEvents(events)
 			Expect(badEvents).NotTo(BeNumerically(">", int(math.Floor(0.01*float64(totalPods)))))
+
+			// Verify latency metrics
+			// TODO: Update threshold to 1s once we reach this goal.
+			// TODO: We should reset metrics before the test. Currently previous tests influence latency metrics.
+			highLatencyRequests, err := HighLatencyRequests(c, 10*time.Second)
+			expectNoError(err)
+			Expect(highLatencyRequests).NotTo(BeNumerically(">", 0))
 		})
 	}
 })
diff --git a/test/e2e/util.go b/test/e2e/util.go
index e40175acc3f..d5a91c9c2bb 100644
--- a/test/e2e/util.go
+++ b/test/e2e/util.go
@@ -713,3 +713,70 @@ func getSigner(provider string) (ssh.Signer, error) {
 	}
 	return signer, nil
 }
+
+// LatencyMetric stores data about request latency at a given quantile,
+// broken down by verb (e.g. GET, PUT, LIST) and resource (e.g. pods, services).
+type LatencyMetric struct {
+	verb     string
+	resource string
+	// 0 <= quantile <= 1, e.g. 0.95 is the 95th percentile, 0.5 is the median.
+	quantile float64
+	latency  time.Duration
+}
+
+func ReadLatencyMetrics(c *client.Client) ([]LatencyMetric, error) {
+	body, err := c.Get().AbsPath("/metrics").DoRaw()
+	if err != nil {
+		return nil, err
+	}
+	metrics := make([]LatencyMetric, 0)
+	for _, line := range strings.Split(string(body), "\n") {
+		if strings.HasPrefix(line, "apiserver_request_latencies_summary{") {
+			// Example line:
+			// apiserver_request_latencies_summary{resource="namespaces",verb="LIST",quantile="0.99"} 908
+			// TODO: This parsing code is long and not readable. We should improve it.
+			keyVal := strings.Split(line, " ")
+			if len(keyVal) != 2 {
+				return nil, fmt.Errorf("Error parsing metric %q", line)
+			}
+			keyElems := strings.Split(line, "\"")
+			if len(keyElems) != 7 {
+				return nil, fmt.Errorf("Error parsing metric %q", line)
+			}
+			resource := keyElems[1]
+			verb := keyElems[3]
+			quantile, err := strconv.ParseFloat(keyElems[5], 64)
+			if err != nil {
+				return nil, fmt.Errorf("Error parsing metric %q", line)
+			}
+			latency, err := strconv.ParseFloat(keyVal[1], 64)
+			if err != nil {
+				return nil, fmt.Errorf("Error parsing metric %q", line)
+			}
+			metrics = append(metrics, LatencyMetric{verb, resource, quantile, time.Duration(int64(latency)) * time.Microsecond})
+		}
+	}
+	return metrics, nil
+}
+
+// HighLatencyRequests prints summary metrics for request types with latency above threshold
+// and returns the number of such request types.
+func HighLatencyRequests(c *client.Client, threshold time.Duration) (int, error) {
+	metrics, err := ReadLatencyMetrics(c)
+	if err != nil {
+		return 0, err
+	}
+	var badMetrics []LatencyMetric
+	for _, metric := range metrics {
+		if metric.verb != "WATCHLIST" &&
+			// We are only interested in the 99th percentile, but for logging purposes
+			// it's useful to have all the offending percentiles.
+			metric.quantile <= 0.99 &&
+			metric.latency > threshold {
+			Logf("WARNING - requests with too high latency: %+v", metric)
+			badMetrics = append(badMetrics, metric)
+		}
+	}
+
+	return len(badMetrics), nil
+}
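
Note (outside the patch): below is a minimal, self-contained sketch of what the string splitting in ReadLatencyMetrics does to the example metric line quoted in the code. The package layout, main function, and printed output are illustrative assumptions, not part of this change, and the parsing only works for lines in exactly the quoted format.

package main

import (
	"fmt"
	"strconv"
	"strings"
	"time"
)

func main() {
	// The example line quoted in ReadLatencyMetrics.
	line := `apiserver_request_latencies_summary{resource="namespaces",verb="LIST",quantile="0.99"} 908`

	// Splitting on a single space separates the metric key from its value
	// (the latency, reported in microseconds).
	keyVal := strings.Split(line, " ")

	// Splitting on '"' yields exactly 7 elements for a well-formed line:
	// [0] prefix, [1] resource, [2] ",verb=", [3] verb, [4] ",quantile=", [5] quantile, [6] "} <value>".
	keyElems := strings.Split(line, "\"")

	// Errors are ignored for brevity in this sketch.
	quantile, _ := strconv.ParseFloat(keyElems[5], 64)
	latency, _ := strconv.ParseFloat(keyVal[1], 64)

	// Prints: namespaces LIST 0.99 908µs
	fmt.Println(keyElems[1], keyElems[3], quantile, time.Duration(int64(latency))*time.Microsecond)
}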