From 7e6b8ac26bfe4eeae94ee04ba1eee738c4db03c2 Mon Sep 17 00:00:00 2001 From: Shyam Jeedigunta Date: Tue, 7 Feb 2017 16:18:47 +0100 Subject: [PATCH] Added a basic monitor for watching etcd version and size related info --- .../images/etcd-version-monitor/Dockerfile | 20 ++ cluster/images/etcd-version-monitor/Makefile | 43 ++++ cluster/images/etcd-version-monitor/README.md | 25 ++ .../etcd-version-monitor.go | 233 ++++++++++++++++++ .../etcd-version-monitor.yaml | 13 + hack/verify-flags/exceptions.txt | 6 +- hack/verify-flags/known-flags.txt | 5 + 7 files changed, 342 insertions(+), 3 deletions(-) create mode 100644 cluster/images/etcd-version-monitor/Dockerfile create mode 100644 cluster/images/etcd-version-monitor/Makefile create mode 100644 cluster/images/etcd-version-monitor/README.md create mode 100644 cluster/images/etcd-version-monitor/etcd-version-monitor.go create mode 100644 cluster/images/etcd-version-monitor/etcd-version-monitor.yaml diff --git a/cluster/images/etcd-version-monitor/Dockerfile b/cluster/images/etcd-version-monitor/Dockerfile new file mode 100644 index 00000000000..ae78f60cd86 --- /dev/null +++ b/cluster/images/etcd-version-monitor/Dockerfile @@ -0,0 +1,20 @@ +# Copyright 2017 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+FROM scratch
+LABEL maintainer "Shyam JVS "
+
+COPY etcd-version-monitor /etcd-version-monitor
+
+EXPOSE 9101
diff --git a/cluster/images/etcd-version-monitor/Makefile b/cluster/images/etcd-version-monitor/Makefile
new file mode 100644
index 00000000000..abb1e32f0b4
--- /dev/null
+++ b/cluster/images/etcd-version-monitor/Makefile
@@ -0,0 +1,43 @@
+# Copyright 2017 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Build the etcd-version-monitor image
+#
+# Usage:
+#   [GOLANG_VERSION=1.7.4] [REGISTRY=gcr.io/google-containers] [TAG=test] make (build|push)
+# TODO(shyamjvs): Support architectures other than amd64 if needed.
+ARCH:=amd64
+GOLANG_VERSION?=1.7.4
+REGISTRY?=gcr.io/google-containers
+TAG?=0.1.0
+IMAGE:=$(REGISTRY)/etcd-version-monitor:$(TAG)
+CURRENT_DIR:=$(shell pwd)
+TEMP_DIR:=$(shell mktemp -d)
+
+build:
+	# Copy the necessary files for building the image to TEMP_DIR.
+	cp etcd-version-monitor.go Dockerfile $(TEMP_DIR)
+
+	# Compile etcd-version-monitor.
+	docker run -it -v $(CURRENT_DIR)/../../../:/go/src/k8s.io/kubernetes -v $(TEMP_DIR):/build -e GOARCH=$(ARCH) golang:$(GOLANG_VERSION) \
+		/bin/bash -c "CGO_ENABLED=0 go build -o /build/etcd-version-monitor k8s.io/kubernetes/cluster/images/etcd-version-monitor"
+
+	docker build -t $(IMAGE) $(TEMP_DIR)
+
+push: build
+	gcloud docker -- push $(IMAGE)
+
+all: build
+
+.PHONY: all build push
diff --git a/cluster/images/etcd-version-monitor/README.md b/cluster/images/etcd-version-monitor/README.md
new file mode 100644
index 00000000000..bd000219fa7
--- /dev/null
+++ b/cluster/images/etcd-version-monitor/README.md
@@ -0,0 +1,25 @@
+# etcd-version-monitor
+
+This is a tool for exporting metrics related to etcd version, like etcd
+server's binary version, cluster version, and counts of different kinds of
+gRPC calls (which is a characteristic of v3), etc. These metrics are in
+prometheus format and can be scraped by a prometheus server.
+The metrics are exposed at the http://localhost:9101/metrics endpoint.
+
+**RUNNING THE TOOL**
+
+To run this tool as a docker container:
+- TAG=test make build
+- docker run --net=host -i -t gcr.io/google-containers/etcd-version-monitor:test /etcd-version-monitor --logtostderr
+
+To run this as a pod on the kubernetes cluster:
+- Place the 'etcd-version-monitor.yaml' in the manifests directory of
+  kubelet on the master machine.
+
+*Note*: This tool has to run on the same machine as etcd, as communication
+with etcd is over localhost.
+
+**VERIFYING THE TOOL**
+
+- Go to [http://localhost:9101/metrics](http://localhost:9101/metrics) in order to view the exported metrics.
+- The metrics prefixed with "etcd_" are the ones of interest to us.
diff --git a/cluster/images/etcd-version-monitor/etcd-version-monitor.go b/cluster/images/etcd-version-monitor/etcd-version-monitor.go
new file mode 100644
index 00000000000..5def5d1b7a6
--- /dev/null
+++ b/cluster/images/etcd-version-monitor/etcd-version-monitor.go
@@ -0,0 +1,257 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package main
+
+import (
+	"encoding/json"
+	goflag "flag"
+	"fmt"
+	"net/http"
+	"time"
+
+	"github.com/golang/glog"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/common/expfmt"
+	"github.com/spf13/pflag"
+)
+
+// Command-line configurable settings; populated by registerFlags.
+var (
+	listenAddress        string
+	metricsPath          string
+	etcdVersionScrapeURI string
+	etcdMetricsScrapeURI string
+	scrapeTimeout        time.Duration
+)
+
+// registerFlags binds the tool's command-line flags, with their defaults, to
+// the package-level settings above.
+func registerFlags(fs *pflag.FlagSet) {
+	fs.StringVar(&listenAddress, "listen-address", "localhost:9101", "Address to listen on for serving prometheus metrics")
+	fs.StringVar(&metricsPath, "metrics-path", "/metrics", "Path under which prometheus metrics are to be served")
+	fs.StringVar(&etcdVersionScrapeURI, "etcd-version-scrape-uri", "http://localhost:2379/version", "URI to scrape etcd version info")
+	fs.StringVar(&etcdMetricsScrapeURI, "etcd-metrics-scrape-uri", "http://localhost:2379/metrics", "URI to scrape etcd metrics")
+	fs.DurationVar(&scrapeTimeout, "scrape-timeout", 15*time.Second, "Timeout for trying to get stats from etcd")
+}
+
+const (
+	namespace = "etcd" // For prefixing prometheus metrics
+)
+
+// Prometheus metrics exported by this tool.
+var (
+	etcdVersion = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: namespace,
+			Name:      "version_info",
+			Help:      "Etcd server's binary version",
+		},
+		[]string{"binary_version"})
+	etcdGRPCRequestsTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: namespace,
+			Name:      "grpc_requests_total",
+			Help:      "Counter of received grpc requests, labeled by the grpc method and service names",
+		},
+		[]string{"method", "service"})
+)
+
+// EtcdVersion mirrors the JSON document returned by etcd's /version endpoint.
+type EtcdVersion struct {
+	BinaryVersion  string `json:"etcdserver"`
+	ClusterVersion string `json:"etcdcluster"`
+}
+
+// getVersion fetches etcd's version info and publishes the binary version as a
+// label on the etcdVersion gauge. lastSeenBinaryVersion holds the version
+// recorded by the previous call ("" if none); when the version changed, the
+// stale series is deleted and the pointer is updated to the new version.
+func getVersion(lastSeenBinaryVersion *string) error {
+	// Create the get request for the etcd version endpoint.
+	req, err := http.NewRequest("GET", etcdVersionScrapeURI, nil)
+	if err != nil {
+		return fmt.Errorf("Failed to create GET request for etcd version: %v", err)
+	}
+
+	// Send the get request; bound it by scrapeTimeout so that an unresponsive
+	// etcd cannot block this scraper forever.
+	client := &http.Client{Timeout: scrapeTimeout}
+	resp, err := client.Do(req)
+	if err != nil {
+		return fmt.Errorf("Failed to receive GET response for etcd version: %v", err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		return fmt.Errorf("Unexpected status %q while fetching etcd version", resp.Status)
+	}
+
+	// Obtain EtcdVersion from the JSON response.
+	var version EtcdVersion
+	if err := json.NewDecoder(resp.Body).Decode(&version); err != nil {
+		return fmt.Errorf("Failed to decode etcd version JSON: %v", err)
+	}
+
+	// Return without updating the version if it stayed the same since last time.
+	if *lastSeenBinaryVersion == version.BinaryVersion {
+		return nil
+	}
+
+	// Delete the metric for the previous version.
+	if *lastSeenBinaryVersion != "" {
+		deleted := etcdVersion.Delete(prometheus.Labels{"binary_version": *lastSeenBinaryVersion})
+		if !deleted {
+			return fmt.Errorf("Failed to delete previous version's metric")
+		}
+	}
+
+	// Record the new version in a metric.
+	etcdVersion.With(prometheus.Labels{
+		"binary_version": version.BinaryVersion,
+	}).Set(0)
+	*lastSeenBinaryVersion = version.BinaryVersion
+	return nil
+}
+
+// getVersionPeriodically refreshes the etcd version metric in a loop, waiting
+// scrapeTimeout between attempts, until stopCh is closed or receives a value.
+func getVersionPeriodically(stopCh <-chan struct{}) {
+	lastSeenBinaryVersion := ""
+	for {
+		if err := getVersion(&lastSeenBinaryVersion); err != nil {
+			glog.Errorf("Failed to fetch etcd version: %v", err)
+		}
+		select {
+		case <-stopCh:
+			// A bare break would only leave the select and keep looping.
+			return
+		case <-time.After(scrapeTimeout):
+		}
+	}
+}
+
+// GRPCRequestLabels identifies one gRPC request type by its method and service.
+type GRPCRequestLabels struct {
+	Method  string
+	Service string
+}
+
+// getGRPCRequestCount scrapes etcd's metrics endpoint and adds the growth of
+// each etcd_grpc_requests_total series since the previous call to the
+// etcdGRPCRequestsTotal counter. lastRecordedCount maps each request type to
+// the value seen on the previous call and is updated in place.
+func getGRPCRequestCount(lastRecordedCount *map[GRPCRequestLabels]float64) error {
+	// Create the get request for the etcd metrics endpoint.
+	req, err := http.NewRequest("GET", etcdMetricsScrapeURI, nil)
+	if err != nil {
+		return fmt.Errorf("Failed to create GET request for etcd metrics: %v", err)
+	}
+
+	// Send the get request; bound it by scrapeTimeout so that an unresponsive
+	// etcd cannot block this scraper forever.
+	client := &http.Client{Timeout: scrapeTimeout}
+	resp, err := client.Do(req)
+	if err != nil {
+		return fmt.Errorf("Failed to receive GET response for etcd metrics: %v", err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		return fmt.Errorf("Unexpected status %q while fetching etcd metrics", resp.Status)
+	}
+
+	// Parse the metrics in text format to a MetricFamily struct.
+	var textParser expfmt.TextParser
+	metricFamilies, err := textParser.TextToMetricFamilies(resp.Body)
+	if err != nil {
+		return fmt.Errorf("Failed to parse etcd metrics: %v", err)
+	}
+
+	// Look through the grpc requests metric family and update our prometheus metric.
+	for _, metric := range metricFamilies["etcd_grpc_requests_total"].GetMetric() {
+		var grpcRequestLabels GRPCRequestLabels
+		for _, label := range metric.GetLabel() {
+			if label.GetName() == "grpc_method" {
+				grpcRequestLabels.Method = label.GetValue()
+			}
+			if label.GetName() == "grpc_service" {
+				grpcRequestLabels.Service = label.GetValue()
+			}
+		}
+		if grpcRequestLabels.Method == "" || grpcRequestLabels.Service == "" {
+			return fmt.Errorf("Could not get value for grpc_method and/or grpc_service label")
+		}
+
+		// Get last recorded value and new value of the metric and update it suitably.
+		// A new value below the previous one is not added to our counter; only the
+		// stored baseline is refreshed.
+		previousMetricValue := 0.0
+		if value, ok := (*lastRecordedCount)[grpcRequestLabels]; ok {
+			previousMetricValue = value
+		}
+		newMetricValue := metric.GetCounter().GetValue()
+		(*lastRecordedCount)[grpcRequestLabels] = newMetricValue
+		if newMetricValue >= previousMetricValue {
+			etcdGRPCRequestsTotal.With(prometheus.Labels{
+				"method":  grpcRequestLabels.Method,
+				"service": grpcRequestLabels.Service,
+			}).Add(newMetricValue - previousMetricValue)
+		}
+	}
+	return nil
+}
+
+// getGRPCRequestCountPeriodically refreshes the gRPC request counters in a
+// loop, waiting scrapeTimeout between attempts, until stopCh is closed or
+// receives a value.
+func getGRPCRequestCountPeriodically(stopCh <-chan struct{}) {
+	// This map stores last recorded count for a given grpc request type.
+	lastRecordedCount := make(map[GRPCRequestLabels]float64)
+	for {
+		if err := getGRPCRequestCount(&lastRecordedCount); err != nil {
+			glog.Errorf("Failed to fetch etcd grpc request counts: %v", err)
+		}
+		select {
+		case <-stopCh:
+			// A bare break would only leave the select and keep looping.
+			return
+		case <-time.After(scrapeTimeout):
+		}
+	}
+}
+
+// main parses flags, registers the metrics, starts the two scrapers and
+// serves the metrics over HTTP until the server stops.
+func main() {
+	// Register the commandline flags passed to the tool.
+	registerFlags(pflag.CommandLine)
+	pflag.CommandLine.AddGoFlagSet(goflag.CommandLine)
+	pflag.Parse()
+
+	// Register the metrics we defined above with prometheus.
+	prometheus.MustRegister(etcdVersion)
+	prometheus.MustRegister(etcdGRPCRequestsTotal)
+	prometheus.Unregister(prometheus.NewGoCollector()) // Unregister go metrics.
+
+	// Spawn goroutines for periodically scraping etcd version metrics.
+	stopCh := make(chan struct{})
+	defer close(stopCh)
+	go getVersionPeriodically(stopCh)
+	go getGRPCRequestCountPeriodically(stopCh)
+
+	// Serve our metrics on listenAddress/metricsPath.
+	glog.Infof("Listening on: %v", listenAddress)
+	http.Handle(metricsPath, prometheus.Handler())
+	glog.Errorf("Stopped listening/serving metrics: %v", http.ListenAndServe(listenAddress, nil))
+}
diff --git a/cluster/images/etcd-version-monitor/etcd-version-monitor.yaml b/cluster/images/etcd-version-monitor/etcd-version-monitor.yaml
new file mode 100644
index 00000000000..49f1db39819
--- /dev/null
+++ b/cluster/images/etcd-version-monitor/etcd-version-monitor.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: etcd-version-monitor
+  namespace: kube-system
+spec:
+  hostNetwork: true
+  containers:
+  - name: etcd-version-monitor
+    image: gcr.io/google-containers/etcd-version-monitor:0.1.0
+    command:
+    - /etcd-version-monitor
+    - --logtostderr
diff --git a/hack/verify-flags/exceptions.txt b/hack/verify-flags/exceptions.txt
index 600b2ed24d2..759168a89e2 100644
--- a/hack/verify-flags/exceptions.txt
+++ b/hack/verify-flags/exceptions.txt
@@ -79,7 +79,10 @@ cluster/vagrant/provision-utils.sh: api_servers: '$(echo 
"$MASTER_IP" | sed -e cluster/vagrant/provision-utils.sh: node_ip: '$(echo "$MASTER_IP" | sed -e "s/'/''/g")' cluster/vagrant/provision-utils.sh: runtime_config: '$(echo "$RUNTIME_CONFIG" | sed -e "s/'/''/g")' examples/cluster-dns/images/frontend/client.py: service_address = socket.gethostbyname(hostname) +examples/storage/cassandra/image/files/jvm.options:# information in cassandra.yaml (such as listen_address). +examples/storage/cassandra/image/files/jvm.options:#-Dcassandra.replace_address=listen_address or broadcast_address of dead node examples/storage/cassandra/image/files/run.sh: cluster_name \ +examples/storage/cassandra/image/files/run.sh: listen_address \ examples/storage/vitess/env.sh: node_ip=$(get_node_ip) federation/cluster/common.sh: local cert_dir="${kube_temp}/easy-rsa-master/easyrsa3" federation/deploy/config.json.sample: "cloud_provider": "gce", @@ -94,9 +97,6 @@ federation/deploy/config.json.sample: "cluster_name": "cluster3-kubernetes" federation/deploy/config.json.sample: "num_nodes": 3, federation/deploy/config.json.sample: "num_nodes": 3, federation/deploy/config.json.sample: "num_nodes": 3, -hack/e2e.go:.phase1.cloud_provider="gce" -hack/e2e.go:.phase1.cluster_name="{{.Cluster}}" -hack/e2e.go:.phase1.num_nodes=4 hack/lib/util.sh: local api_port=$5 hack/local-up-cluster.sh: advertise_address="--advertise_address=${API_HOST_IP}" hack/local-up-cluster.sh: runtime_config="--runtime-config=${RUNTIME_CONFIG}" diff --git a/hack/verify-flags/known-flags.txt b/hack/verify-flags/known-flags.txt index f43cc1c6d53..3d19ab382d3 100644 --- a/hack/verify-flags/known-flags.txt +++ b/hack/verify-flags/known-flags.txt @@ -161,6 +161,8 @@ dump-logs-on-failure duration-sec e2e-output-dir e2e-verify-service-account +etcd-metrics-scrape-uri +etcd-version-scrape-uri enable-controller-attach-detach enable-custom-metrics enable-debugging-handlers @@ -366,6 +368,7 @@ leader-elect-retry-period lease-duration leave-stdin-open limit-bytes +listen-address 
listers-package load-balancer-ip lock-file @@ -405,6 +408,7 @@ mesos-launch-grace-period mesos-master mesos-sandbox-overlay mesos-user +metrics-path min-available min-pr-number min-request-timeout @@ -541,6 +545,7 @@ scheduler-config scheduler-name schema-cache-dir scopes +scrape-timeout seccomp-profile-root secondary-node-eviction-rate secret-name