diff --git a/hack/e2e-suite/monitoring.sh b/hack/e2e-suite/monitoring.sh
deleted file mode 100755
index 1b1c3cc3ee3..00000000000
--- a/hack/e2e-suite/monitoring.sh
+++ /dev/null
@@ -1,134 +0,0 @@
-#!/bin/bash
-
-# Copyright 2014 Google Inc. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Assumes a running Kubernetes test cluster; verifies that the monitoring setup
-# works. Assumes that we're being called by hack/e2e-test.sh (we use some env
-# vars it sets up).
-
-set -o errexit
-set -o nounset
-set -o pipefail
-
-KUBE_ROOT=$(dirname "${BASH_SOURCE}")/../..
-
-: ${KUBE_VERSION_ROOT:=${KUBE_ROOT}}
-: ${KUBECTL:="${KUBE_VERSION_ROOT}/cluster/kubectl.sh"}
-: ${KUBE_CONFIG_FILE:="config-test.sh"}
-
-export KUBECTL KUBE_CONFIG_FILE
-
-source "${KUBE_ROOT}/cluster/kube-env.sh"
-source "${KUBE_VERSION_ROOT}/cluster/${KUBERNETES_PROVIDER}/util.sh"
-
-prepare-e2e
-
-MONITORING="${KUBE_ROOT}/cluster/addons/cluster-monitoring"
-KUBECTL="${KUBE_ROOT}/cluster/kubectl.sh"
-BIGRAND=$(printf "%x\n" $(( $RANDOM << 16 | $RANDOM )))  # random 2^32 in hex
-MONITORING_FIREWALL_RULE="monitoring-test-${BIGRAND}"
-
-function setup {
-  # This only has work to do on gce and gke
-  if [[ "${KUBERNETES_PROVIDER}" == "gce" ]] || [[ "${KUBERNETES_PROVIDER}" == "gke" ]]; then
-    detect-project
-    if ! "${GCLOUD}" compute firewall-rules create "${MONITORING_FIREWALL_RULE}" \
-      --project "${PROJECT}" \
-      --network "${NETWORK}" \
-      --quiet \
-      --allow tcp:80 tcp:8083 tcp:8086 tcp:9200; then
-      echo "Failed to set up firewall for monitoring" && false
-    fi
-  fi
-
-  "${KUBECTL}" create -f "${MONITORING}/"
-}
-
-function cleanup {
-  "${KUBECTL}" stop rc monitoring-influx-grafana-controller &> /dev/null || true
-  "${KUBECTL}" stop rc monitoring-heapster-controller &> /dev/null || true
-  "${KUBECTL}" delete -f "${MONITORING}/" &> /dev/null || true
-
-  # This only has work to do on gce and gke
-  if [[ "${KUBERNETES_PROVIDER}" == "gce" ]] || [[ "${KUBERNETES_PROVIDER}" == "gke" ]]; then
-    detect-project
-    if "${GCLOUD}" compute firewall-rules describe "${MONITORING_FIREWALL_RULE}" &> /dev/null; then
-      "${GCLOUD}" compute firewall-rules delete \
-        --project "${PROJECT}" \
-        --quiet \
-        "${MONITORING_FIREWALL_RULE}" || true
-    fi
-  fi
-}
-
-function influx-data-exists {
-  local max_retries=10
-  local retry_delay=30 #seconds
-  local influx_ip=$("${KUBECTL}" get pods -l name=influxGrafana -o template -t {{range.items}}{{.currentState.hostIP}}:{{end}} | sed s/://g)
-  local influx_url="http://$influx_ip:8086/db/k8s/series?u=root&p=root"
-  local ok="false"
-  for i in `seq 1 10`; do
-    if curl --retry $max_retries --retry-delay $retry_delay -G $influx_url --data-urlencode "q=select * from stats limit 1" \
-      && curl --retry $max_retries --retry-delay $retry_delay -G $influx_url --data-urlencode "q=select * from machine limit 1"; then
-      echo "retrieved data from InfluxDB."
-      ok="true"
-      break
-    fi
-    sleep 5
-  done
-  if [[ "${ok}" != "true" ]]; then
-    echo "failed to retrieve stats from InfluxDB. monitoring test failed"
-    exit 1
-  fi
-}
-
-function wait-for-pods {
-  local running=false
-  for i in `seq 1 20`; do
-    sleep 20
-    if "${KUBECTL}" get pods -l name=influxGrafana -o template -t {{range.items}}{{.currentState.status}}:{{end}} | grep Running &> /dev/null \
-      && "${KUBECTL}" get pods -l name=heapster -o template -t {{range.items}}{{.currentState.status}}:{{end}} | grep Running &> /dev/null; then
-      running=true
-      break
-    fi
-  done
-  if [ running == false ]; then
-    echo "giving up waiting on monitoring pods to be active. monitoring test failed"
-    exit 1
-  fi
-}
-
-trap cleanup EXIT
-
-# Remove any pre-existing monitoring services.
-cleanup
-
-# Start monitoring pods and services.
-setup
-
-# Wait for a maximum of 5 minutes for the influx grafana pod to be running.
-echo "waiting for monitoring pods to be running"
-wait-for-pods
-
-# Wait for some time to let heapster push some stats to InfluxDB.
-echo "monitoring pods are running. waiting for stats to be pushed to InfluxDB"
-sleep 60
-
-# Check if stats data exists in InfluxDB
-echo "checking if stats exist in InfluxDB"
-influx-data-exists
-
-echo "monitoring setup works"
-exit 0
diff --git a/test/e2e/monitoring.go b/test/e2e/monitoring.go
index 63bc29434dd..8a6507a27cc 100644
--- a/test/e2e/monitoring.go
+++ b/test/e2e/monitoring.go
@@ -39,7 +39,12 @@ var _ = Describe("Monitoring", func() {
 		expectNoError(err)
 	})
 
-	It("pod and node resource usage metrics are available on influxdb using heapster.", func() {
+	It("verify monitoring pods and all cluster nodes are available on influxdb using heapster.", func() {
+		if testContext.provider != "gce" {
+			By(fmt.Sprintf("Skipping Monitoring test, which is only supported for provider gce (not %s)",
+				testContext.provider))
+			return
+		}
 		testMonitoringUsingHeapsterInfluxdb(c)
 	})
 })
@@ -51,8 +56,8 @@ const (
 	influxdbPW           = "root"
 	podlistQuery         = "select distinct(pod) from stats"
 	nodelistQuery        = "select distinct(hostname) from machine"
-	sleepBetweenAttempts = 30 * time.Second
-	maxAttempts          = 10 // Total sleep time of 5 minutes for this test.
+	sleepBetweenAttempts = 5 * time.Second
+	testTimeout          = 5 * time.Minute
 )
 
 var (
@@ -67,27 +72,40 @@ var (
 	}
 )
 
-func expectedRcsExist(c *client.Client) {
+func verifyExpectedRcsExistAndGetExpectedPods(c *client.Client) ([]string, error) {
 	rcList, err := c.ReplicationControllers(api.NamespaceDefault).List(labels.Everything())
-	expectNoError(err)
+	if err != nil {
+		return nil, err
+	}
+	expectedPods := []string{}
 	for _, rc := range rcList.Items {
 		if _, ok := expectedRcs[rc.Name]; ok {
 			if rc.Status.Replicas != 1 {
-				Failf("expected to find only one replica for rc %q, found %d", rc.Name, rc.Status.Replicas)
+				return nil, fmt.Errorf("expected to find only one replica for rc %q, found %d", rc.Name, rc.Status.Replicas)
 			}
 			expectedRcs[rc.Name] = true
+			podList, err := c.Pods(api.NamespaceDefault).List(labels.Set(rc.Spec.Selector).AsSelector())
+			if err != nil {
+				return nil, err
+			}
+			for _, pod := range podList.Items {
+				expectedPods = append(expectedPods, pod.Name)
+			}
 		}
 	}
 	for rc, found := range expectedRcs {
 		if !found {
-			Failf("Replication Controller %q not found.", rc)
+			return nil, fmt.Errorf("Replication Controller %q not found.", rc)
 		}
 	}
+	return expectedPods, nil
 }
 
-func expectedServicesExist(c *client.Client) {
+func expectedServicesExist(c *client.Client) error {
 	serviceList, err := c.Services(api.NamespaceDefault).List(labels.Everything())
-	expectNoError(err)
+	if err != nil {
+		return err
+	}
 	for _, service := range serviceList.Items {
 		if _, ok := expectedServices[service.Name]; ok {
 			expectedServices[service.Name] = true
@@ -95,29 +113,22 @@ func expectedServicesExist(c *client.Client) {
 	}
 	for service, found := range expectedServices {
 		if !found {
-			Failf("Service %q not found", service)
+			return fmt.Errorf("Service %q not found", service)
 		}
 	}
+	return nil
 }
 
-func getAllPodsInCluster(c *client.Client) []string {
-	podList, err := c.Pods(api.NamespaceAll).List(labels.Everything())
-	expectNoError(err)
-	result := []string{}
-	for _, pod := range podList.Items {
-		result = append(result, pod.Name)
-	}
-	return result
-}
-
-func getAllNodesInCluster(c *client.Client) []string {
+func getAllNodesInCluster(c *client.Client) ([]string, error) {
 	nodeList, err := c.Nodes().List()
-	expectNoError(err)
+	if err != nil {
+		return nil, err
+	}
 	result := []string{}
 	for _, node := range nodeList.Items {
 		result = append(result, node.Name)
 	}
-	return result
+	return result, nil
 }
 
 func getInfluxdbData(c *influxdb.Client, query string) (map[string]bool, error) {
@@ -133,6 +144,9 @@ func getInfluxdbData(c *influxdb.Client, query string) (map[string]bool, error)
 	}
 	result := map[string]bool{}
 	for _, point := range series[0].GetPoints() {
+		if len(point) != 2 {
+			Failf("Expected only two entries in a point for query %q. Got %v", query, point)
+		}
 		name, ok := point[1].(string)
 		if !ok {
 			Failf("expected %v to be a string, but it is %T", point[1], point[1])
@@ -143,6 +157,9 @@ func getInfluxdbData(c *influxdb.Client, query string) (map[string]bool, error)
 }
 
 func expectedItemsExist(expectedItems []string, actualItems map[string]bool) bool {
+	if len(actualItems) < len(expectedItems) {
+		return false
+	}
 	for _, item := range expectedItems {
 		if _, found := actualItems[item]; !found {
 			return false
@@ -182,8 +199,9 @@ func getMasterHost() string {
 
 func testMonitoringUsingHeapsterInfluxdb(c *client.Client) {
 	// Check if heapster pods and services are up.
-	expectedRcsExist(c)
-	expectedServicesExist(c)
+	expectedPods, err := verifyExpectedRcsExistAndGetExpectedPods(c)
+	expectNoError(err)
+	expectNoError(expectedServicesExist(c))
 	// TODO: Wait for all pods and services to be running.
 	kubeMasterHttpClient, ok := c.Client.(*http.Client)
 	if !ok {
@@ -202,14 +220,14 @@ func testMonitoringUsingHeapsterInfluxdb(c *client.Client) {
 	influxdbClient, err := influxdb.NewClient(config)
 	expectNoError(err, "failed to create influxdb client")
 
-	expectedPods := getAllPodsInCluster(c)
-	expectedNodes := getAllNodesInCluster(c)
-	attempt := maxAttempts
+	expectedNodes, err := getAllNodesInCluster(c)
+	expectNoError(err)
+	startTime := time.Now()
 	for {
 		if validatePodsAndNodes(influxdbClient, expectedPods, expectedNodes) {
 			return
 		}
-		if attempt--; attempt <= 0 {
+		if time.Since(startTime) >= testTimeout {
 			break
 		}
 		time.Sleep(sleepBetweenAttempts)