From 551cec2a21c5480f2f55cde6cc69857b85e4031b Mon Sep 17 00:00:00 2001
From: Filip Grzadkowski
Date: Wed, 29 Apr 2015 12:24:01 +0200
Subject: [PATCH] Verify request latency in e2e performance tests.

---
 test/e2e/density.go |  7 +++++
 test/e2e/util.go    | 67 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)

diff --git a/test/e2e/density.go b/test/e2e/density.go
index 1c721416e6a..6d6b11b0bc7 100644
--- a/test/e2e/density.go
+++ b/test/e2e/density.go
@@ -154,6 +154,13 @@ var _ = Describe("Density", func() {
 			// Tune the threshold for allowed failures.
 			badEvents := BadEvents(events)
 			Expect(badEvents).NotTo(BeNumerically(">", int(math.Floor(0.01*float64(totalPods)))))
+
+			// Verify latency metrics
+			// TODO: Update threshold to 1s once we reach this goal.
+			// TODO: We should reset metrics before the test. Currently previous tests influence latency metrics.
+			highLatencyRequests, err := HighLatencyRequests(c, 10*time.Second)
+			expectNoError(err)
+			Expect(highLatencyRequests).NotTo(BeNumerically(">", 0))
 		})
 	}
 })
diff --git a/test/e2e/util.go b/test/e2e/util.go
index e40175acc3f..d5a91c9c2bb 100644
--- a/test/e2e/util.go
+++ b/test/e2e/util.go
@@ -713,3 +713,70 @@ func getSigner(provider string) (ssh.Signer, error) {
 	}
 	return signer, nil
 }
+
+// LatencyMetric stores data about request latency at a given quantile,
+// broken down by verb (e.g. GET, PUT, LIST) and resource (e.g. pods, services).
+type LatencyMetric struct {
+	verb     string
+	resource string
+	// 0 <= quantile <= 1, e.g. 0.95 is the 95th percentile, 0.5 is the median.
+	quantile float64
+	latency  time.Duration
+}
+
+func ReadLatencyMetrics(c *client.Client) ([]LatencyMetric, error) {
+	body, err := c.Get().AbsPath("/metrics").DoRaw()
+	if err != nil {
+		return nil, err
+	}
+	metrics := make([]LatencyMetric, 0)
+	for _, line := range strings.Split(string(body), "\n") {
+		if strings.HasPrefix(line, "apiserver_request_latencies_summary{") {
+			// Example line:
+			// apiserver_request_latencies_summary{resource="namespaces",verb="LIST",quantile="0.99"} 908
+			// TODO: This parsing code is long and not readable. We should improve it.
+			keyVal := strings.Split(line, " ")
+			if len(keyVal) != 2 {
+				return nil, fmt.Errorf("Error parsing metric %q", line)
+			}
+			keyElems := strings.Split(line, "\"")
+			if len(keyElems) != 7 {
+				return nil, fmt.Errorf("Error parsing metric %q", line)
+			}
+			resource := keyElems[1]
+			verb := keyElems[3]
+			quantile, err := strconv.ParseFloat(keyElems[5], 64)
+			if err != nil {
+				return nil, fmt.Errorf("Error parsing metric %q", line)
+			}
+			latency, err := strconv.ParseFloat(keyVal[1], 64)
+			if err != nil {
+				return nil, fmt.Errorf("Error parsing metric %q", line)
+			}
+			metrics = append(metrics, LatencyMetric{verb, resource, quantile, time.Duration(int64(latency)) * time.Microsecond})
+		}
+	}
+	return metrics, nil
+}
+
+// HighLatencyRequests prints summary metrics for request types with latency above threshold
+// and returns the number of such request types.
+func HighLatencyRequests(c *client.Client, threshold time.Duration) (int, error) {
+	metrics, err := ReadLatencyMetrics(c)
+	if err != nil {
+		return 0, err
+	}
+	var badMetrics []LatencyMetric
+	for _, metric := range metrics {
+		if metric.verb != "WATCHLIST" &&
+			// We are only interested in the 99th percentile, but for logging purposes
+			// it's useful to have all the offending percentiles.
+			metric.quantile <= 0.99 &&
+			metric.latency > threshold {
+			Logf("WARNING - requests with too high latency: %+v", metric)
+			badMetrics = append(badMetrics, metric)
+		}
+	}
+
+	return len(badMetrics), nil
+}
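
Note (outside the patch): below is a minimal, self-contained sketch of what the string splitting in ReadLatencyMetrics does to the example metric line quoted in the code. The package layout, main function, and printed output are illustrative assumptions, not part of this change, and the parsing only works for lines in exactly the quoted format.

package main

import (
	"fmt"
	"strconv"
	"strings"
	"time"
)

func main() {
	// The example line quoted in ReadLatencyMetrics.
	line := `apiserver_request_latencies_summary{resource="namespaces",verb="LIST",quantile="0.99"} 908`

	// Splitting on a single space separates the metric key from its value
	// (the latency, reported in microseconds).
	keyVal := strings.Split(line, " ")

	// Splitting on '"' yields exactly 7 elements for a well-formed line:
	// [0] prefix, [1] resource, [2] ",verb=", [3] verb, [4] ",quantile=", [5] quantile, [6] "} <value>".
	keyElems := strings.Split(line, "\"")

	// Errors are ignored for brevity in this sketch.
	quantile, _ := strconv.ParseFloat(keyElems[5], 64)
	latency, _ := strconv.ParseFloat(keyVal[1], 64)

	// Prints: namespaces LIST 0.99 908µs
	fmt.Println(keyElems[1], keyElems[3], quantile, time.Duration(int64(latency))*time.Microsecond)
}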