mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-28 22:17:14 +00:00
Improve etcd-version-monitor metrics proxying, add etcd 3.1 gprc metric support
This commit is contained in:
parent
305d644363
commit
a0874620f1
@ -19,6 +19,8 @@ go_library(
|
|||||||
deps = [
|
deps = [
|
||||||
"//vendor/github.com/golang/glog:go_default_library",
|
"//vendor/github.com/golang/glog:go_default_library",
|
||||||
"//vendor/github.com/prometheus/client_golang/prometheus:go_default_library",
|
"//vendor/github.com/prometheus/client_golang/prometheus:go_default_library",
|
||||||
|
"//vendor/github.com/prometheus/client_golang/prometheus/promhttp:go_default_library",
|
||||||
|
"//vendor/github.com/prometheus/client_model/go:go_default_library",
|
||||||
"//vendor/github.com/prometheus/common/expfmt:go_default_library",
|
"//vendor/github.com/prometheus/common/expfmt:go_default_library",
|
||||||
"//vendor/github.com/spf13/pflag:go_default_library",
|
"//vendor/github.com/spf13/pflag:go_default_library",
|
||||||
],
|
],
|
||||||
|
@ -20,7 +20,7 @@
|
|||||||
ARCH:=amd64
|
ARCH:=amd64
|
||||||
GOLANG_VERSION?=1.8.3
|
GOLANG_VERSION?=1.8.3
|
||||||
REGISTRY?=gcr.io/google-containers
|
REGISTRY?=gcr.io/google-containers
|
||||||
TAG?=0.1.0
|
TAG?=0.1.1
|
||||||
IMAGE:=$(REGISTRY)/etcd-version-monitor:$(TAG)
|
IMAGE:=$(REGISTRY)/etcd-version-monitor:$(TAG)
|
||||||
CURRENT_DIR:=$(pwd)
|
CURRENT_DIR:=$(pwd)
|
||||||
TEMP_DIR:=$(shell mktemp -d)
|
TEMP_DIR:=$(shell mktemp -d)
|
||||||
|
@ -1,11 +1,19 @@
|
|||||||
# etcd-version-monitor
|
# etcd-version-monitor
|
||||||
|
|
||||||
This is a tool for exporting metrics related to etcd version, like etcd
|
This is a tool for exporting etcd metrics and supplementing them with etcd
|
||||||
server's binary version, cluster version, and counts of different kinds of
|
server binary version and cluster version. These metrics are in
|
||||||
gRPC calls (which is a characteristic of v3), etc. These metrics are in
|
|
||||||
prometheus format and can be scraped by a prometheus server.
|
prometheus format and can be scraped by a prometheus server.
|
||||||
The metrics are exposed at the http://localhost:9101/metrics endpoint.
|
The metrics are exposed at the http://localhost:9101/metrics endpoint.
|
||||||
|
|
||||||
|
For etcd 3.1+, the
|
||||||
|
[go-grpc-prometheus](https://github.com/grpc-ecosystem/go-grpc-prometheus)
|
||||||
|
metrics format, which backward incompatibly replaces the 3.0 legacy grpc metric
|
||||||
|
format, is exposed in both the 3.1 format and in the 3.0. This preserves
|
||||||
|
backward compatiblity.
|
||||||
|
|
||||||
|
For etcd 3.1+, the `--metrics=extensive` must be set on etcd for grpc request
|
||||||
|
latency metrics (`etcd_grpc_unary_requests_duration_seconds`) to be exposed.
|
||||||
|
|
||||||
**RUNNING THE TOOL**
|
**RUNNING THE TOOL**
|
||||||
|
|
||||||
To run this tool as a docker container:
|
To run this tool as a docker container:
|
||||||
|
@ -25,6 +25,8 @@ import (
|
|||||||
|
|
||||||
"github.com/golang/glog"
|
"github.com/golang/glog"
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||||
|
dto "github.com/prometheus/client_model/go"
|
||||||
"github.com/prometheus/common/expfmt"
|
"github.com/prometheus/common/expfmt"
|
||||||
"github.com/spf13/pflag"
|
"github.com/spf13/pflag"
|
||||||
)
|
)
|
||||||
@ -52,6 +54,11 @@ const (
|
|||||||
|
|
||||||
// Initialize prometheus metrics to be exported.
|
// Initialize prometheus metrics to be exported.
|
||||||
var (
|
var (
|
||||||
|
// Register all custom metrics with a dedicated registry to keep them separate.
|
||||||
|
customMetricRegistry = prometheus.NewRegistry()
|
||||||
|
|
||||||
|
// Custom etcd version metric since etcd 3.2- does not export one.
|
||||||
|
// This will be replaced by https://github.com/coreos/etcd/pull/8960 in etcd 3.3.
|
||||||
etcdVersion = prometheus.NewGaugeVec(
|
etcdVersion = prometheus.NewGaugeVec(
|
||||||
prometheus.GaugeOpts{
|
prometheus.GaugeOpts{
|
||||||
Namespace: namespace,
|
Namespace: namespace,
|
||||||
@ -59,15 +66,122 @@ var (
|
|||||||
Help: "Etcd server's binary version",
|
Help: "Etcd server's binary version",
|
||||||
},
|
},
|
||||||
[]string{"binary_version"})
|
[]string{"binary_version"})
|
||||||
etcdGRPCRequestsTotal = prometheus.NewCounterVec(
|
|
||||||
prometheus.CounterOpts{
|
gatherer = &monitorGatherer{
|
||||||
Namespace: namespace,
|
// Rewrite rules for etcd metrics that are exported by default.
|
||||||
Name: "grpc_requests_total",
|
exported: map[string]*exportedMetric{
|
||||||
Help: "Counter of received grpc requests, labeled by the grpc method and service names",
|
// etcd 3.0 metric format for total grpc requests with renamed method and service labels.
|
||||||
|
"etcd_grpc_requests_total": {
|
||||||
|
rewriters: []rewriteFunc{
|
||||||
|
func(mf *dto.MetricFamily) (*dto.MetricFamily, error) {
|
||||||
|
mf = deepCopyMetricFamily(mf)
|
||||||
|
renameLabels(mf, map[string]string{
|
||||||
|
"grpc_method": "method",
|
||||||
|
"grpc_service": "service",
|
||||||
|
})
|
||||||
|
return mf, nil
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// etcd 3.1+ metric format for total grpc requests.
|
||||||
|
"grpc_server_handled_total": {
|
||||||
|
rewriters: []rewriteFunc{
|
||||||
|
// Export the metric exactly as-is. For 3.1+ metrics, we will
|
||||||
|
// pass all metrics directly through.
|
||||||
|
identity,
|
||||||
|
// Write to the etcd 3.0 metric format for backward compatibility.
|
||||||
|
func(mf *dto.MetricFamily) (*dto.MetricFamily, error) {
|
||||||
|
mf = deepCopyMetricFamily(mf)
|
||||||
|
renameMetric(mf, "etcd_grpc_requests_total")
|
||||||
|
renameLabels(mf, map[string]string{
|
||||||
|
"grpc_method": "method",
|
||||||
|
"grpc_service": "service",
|
||||||
|
})
|
||||||
|
return mf, nil
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
|
||||||
|
// etcd 3.0 metric format for grpc request latencies,
|
||||||
|
// rewritten to the etcd 3.1+ format.
|
||||||
|
"etcd_grpc_unary_requests_duration_seconds": {
|
||||||
|
rewriters: []rewriteFunc{
|
||||||
|
func(mf *dto.MetricFamily) (*dto.MetricFamily, error) {
|
||||||
|
mf = deepCopyMetricFamily(mf)
|
||||||
|
renameMetric(mf, "grpc_server_handling_seconds")
|
||||||
|
tpeName := "grpc_type"
|
||||||
|
tpeVal := "unary"
|
||||||
|
for _, m := range mf.Metric {
|
||||||
|
m.Label = append(m.Label, &dto.LabelPair{Name: &tpeName, Value: &tpeVal})
|
||||||
|
}
|
||||||
|
return mf, nil
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// etcd 3.1+ metric format for total grpc requests.
|
||||||
|
"grpc_server_handling_seconds": {},
|
||||||
},
|
},
|
||||||
[]string{"method", "service"})
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// monitorGatherer is a custom metric gatherer for prometheus that exports custom metrics
|
||||||
|
// defined by this monitor as well as rewritten etcd metrics.
|
||||||
|
type monitorGatherer struct {
|
||||||
|
exported map[string]*exportedMetric
|
||||||
|
}
|
||||||
|
|
||||||
|
// exportedMetric identifies a metric that is exported and defines how it is rewritten before
|
||||||
|
// it is exported.
|
||||||
|
type exportedMetric struct {
|
||||||
|
rewriters []rewriteFunc
|
||||||
|
}
|
||||||
|
|
||||||
|
// rewriteFunc rewrites metrics before they are exported.
|
||||||
|
type rewriteFunc func(mf *dto.MetricFamily) (*dto.MetricFamily, error)
|
||||||
|
|
||||||
|
func (m *monitorGatherer) Gather() ([]*dto.MetricFamily, error) {
|
||||||
|
etcdMetrics, err := scrapeMetrics()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
exported, err := m.rewriteExportedMetrics(etcdMetrics)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
custom, err := customMetricRegistry.Gather()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
result := make([]*dto.MetricFamily, 0, len(exported)+len(custom))
|
||||||
|
result = append(result, exported...)
|
||||||
|
result = append(result, custom...)
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *monitorGatherer) rewriteExportedMetrics(metrics map[string]*dto.MetricFamily) ([]*dto.MetricFamily, error) {
|
||||||
|
results := make([]*dto.MetricFamily, 0, len(metrics))
|
||||||
|
for n, mf := range metrics {
|
||||||
|
if e, ok := m.exported[n]; ok {
|
||||||
|
// Apply rewrite rules for metrics that have them.
|
||||||
|
if e.rewriters == nil {
|
||||||
|
results = append(results, mf)
|
||||||
|
} else {
|
||||||
|
for _, rewriter := range e.rewriters {
|
||||||
|
new, err := rewriter(mf)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
results = append(results, new)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Proxy all metrics without any rewrite rules directly.
|
||||||
|
results = append(results, mf)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return results, nil
|
||||||
|
}
|
||||||
|
|
||||||
// Struct for unmarshalling the json response from etcd's /version endpoint.
|
// Struct for unmarshalling the json response from etcd's /version endpoint.
|
||||||
type EtcdVersion struct {
|
type EtcdVersion struct {
|
||||||
BinaryVersion string `json:"etcdserver"`
|
BinaryVersion string `json:"etcdserver"`
|
||||||
@ -132,83 +246,78 @@ func getVersionPeriodically(stopCh <-chan struct{}) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Struct for storing labels for gRPC request types.
|
// scrapeMetrics scrapes the prometheus metrics from the etcd metrics URI.
|
||||||
type GRPCRequestLabels struct {
|
func scrapeMetrics() (map[string]*dto.MetricFamily, error) {
|
||||||
Method string
|
|
||||||
Service string
|
|
||||||
}
|
|
||||||
|
|
||||||
// Function for fetching etcd grpc request counts and feeding it to the prometheus metric.
|
|
||||||
func getGRPCRequestCount(lastRecordedCount *map[GRPCRequestLabels]float64) error {
|
|
||||||
// Create the get request for the etcd metrics endpoint.
|
|
||||||
req, err := http.NewRequest("GET", etcdMetricsScrapeURI, nil)
|
req, err := http.NewRequest("GET", etcdMetricsScrapeURI, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("Failed to create GET request for etcd metrics: %v", err)
|
return nil, fmt.Errorf("Failed to create GET request for etcd metrics: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Send the get request and receive a response.
|
// Send the get request and receive a response.
|
||||||
client := &http.Client{}
|
client := &http.Client{}
|
||||||
resp, err := client.Do(req)
|
resp, err := client.Do(req)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("Failed to receive GET response for etcd metrics: %v", err)
|
return nil, fmt.Errorf("Failed to receive GET response for etcd metrics: %v", err)
|
||||||
}
|
}
|
||||||
defer resp.Body.Close()
|
defer resp.Body.Close()
|
||||||
|
|
||||||
// Parse the metrics in text format to a MetricFamily struct.
|
// Parse the metrics in text format to a MetricFamily struct.
|
||||||
var textParser expfmt.TextParser
|
var textParser expfmt.TextParser
|
||||||
metricFamilies, err := textParser.TextToMetricFamilies(resp.Body)
|
return textParser.TextToMetricFamilies(resp.Body)
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("Failed to parse etcd metrics: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Look through the grpc requests metric family and update our promotheus metric.
|
|
||||||
for _, metric := range metricFamilies["etcd_grpc_requests_total"].GetMetric() {
|
|
||||||
var grpcRequestLabels GRPCRequestLabels
|
|
||||||
for _, label := range metric.GetLabel() {
|
|
||||||
if label.GetName() == "grpc_method" {
|
|
||||||
grpcRequestLabels.Method = label.GetValue()
|
|
||||||
}
|
|
||||||
if label.GetName() == "grpc_service" {
|
|
||||||
grpcRequestLabels.Service = label.GetValue()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if grpcRequestLabels.Method == "" || grpcRequestLabels.Service == "" {
|
|
||||||
return fmt.Errorf("Could not get value for grpc_method and/or grpc_service label")
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get last recorded value and new value of the metric and update it suitably.
|
|
||||||
previousMetricValue := 0.0
|
|
||||||
if value, ok := (*lastRecordedCount)[grpcRequestLabels]; ok {
|
|
||||||
previousMetricValue = value
|
|
||||||
}
|
|
||||||
newMetricValue := metric.GetCounter().GetValue()
|
|
||||||
(*lastRecordedCount)[grpcRequestLabels] = newMetricValue
|
|
||||||
if newMetricValue >= previousMetricValue {
|
|
||||||
etcdGRPCRequestsTotal.With(prometheus.Labels{
|
|
||||||
"method": grpcRequestLabels.Method,
|
|
||||||
"service": grpcRequestLabels.Service,
|
|
||||||
}).Add(newMetricValue - previousMetricValue)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Function for periodically fetching etcd GRPC request counts.
|
func renameMetric(mf *dto.MetricFamily, name string) {
|
||||||
func getGRPCRequestCountPeriodically(stopCh <-chan struct{}) {
|
mf.Name = &name
|
||||||
// This map stores last recorded count for a given grpc request type.
|
}
|
||||||
lastRecordedCount := make(map[GRPCRequestLabels]float64)
|
|
||||||
for {
|
func renameLabels(mf *dto.MetricFamily, nameMapping map[string]string) {
|
||||||
if err := getGRPCRequestCount(&lastRecordedCount); err != nil {
|
for _, m := range mf.Metric {
|
||||||
glog.Errorf("Failed to fetch etcd grpc request counts: %v", err)
|
for _, lbl := range m.Label {
|
||||||
}
|
if alias, ok := nameMapping[*lbl.Name]; ok {
|
||||||
select {
|
lbl.Name = &alias
|
||||||
case <-stopCh:
|
}
|
||||||
break
|
|
||||||
case <-time.After(scrapeTimeout):
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func identity(mf *dto.MetricFamily) (*dto.MetricFamily, error) {
|
||||||
|
return mf, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func deepCopyMetricFamily(mf *dto.MetricFamily) *dto.MetricFamily {
|
||||||
|
r := &dto.MetricFamily{}
|
||||||
|
r.Name = mf.Name
|
||||||
|
r.Help = mf.Help
|
||||||
|
r.Type = mf.Type
|
||||||
|
r.Metric = make([]*dto.Metric, len(mf.Metric))
|
||||||
|
for i, m := range mf.Metric {
|
||||||
|
r.Metric[i] = deepCopyMetric(m)
|
||||||
|
}
|
||||||
|
return r
|
||||||
|
}
|
||||||
|
|
||||||
|
func deepCopyMetric(m *dto.Metric) *dto.Metric {
|
||||||
|
r := &dto.Metric{}
|
||||||
|
r.Label = make([]*dto.LabelPair, len(m.Label))
|
||||||
|
for i, lp := range m.Label {
|
||||||
|
r.Label[i] = deepCopyLabelPair(lp)
|
||||||
|
}
|
||||||
|
r.Gauge = m.Gauge
|
||||||
|
r.Counter = m.Counter
|
||||||
|
r.Summary = m.Summary
|
||||||
|
r.Untyped = m.Untyped
|
||||||
|
r.Histogram = m.Histogram
|
||||||
|
r.TimestampMs = m.TimestampMs
|
||||||
|
return r
|
||||||
|
}
|
||||||
|
|
||||||
|
func deepCopyLabelPair(lp *dto.LabelPair) *dto.LabelPair {
|
||||||
|
r := &dto.LabelPair{}
|
||||||
|
r.Name = lp.Name
|
||||||
|
r.Value = lp.Value
|
||||||
|
return r
|
||||||
|
}
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
// Register the commandline flags passed to the tool.
|
// Register the commandline flags passed to the tool.
|
||||||
registerFlags(pflag.CommandLine)
|
registerFlags(pflag.CommandLine)
|
||||||
@ -216,18 +325,16 @@ func main() {
|
|||||||
pflag.Parse()
|
pflag.Parse()
|
||||||
|
|
||||||
// Register the metrics we defined above with prometheus.
|
// Register the metrics we defined above with prometheus.
|
||||||
prometheus.MustRegister(etcdVersion)
|
customMetricRegistry.MustRegister(etcdVersion)
|
||||||
prometheus.MustRegister(etcdGRPCRequestsTotal)
|
customMetricRegistry.Unregister(prometheus.NewGoCollector())
|
||||||
prometheus.Unregister(prometheus.NewGoCollector())
|
|
||||||
|
|
||||||
// Spawn threads for periodically scraping etcd version metrics.
|
// Spawn threads for periodically scraping etcd version metrics.
|
||||||
stopCh := make(chan struct{})
|
stopCh := make(chan struct{})
|
||||||
defer close(stopCh)
|
defer close(stopCh)
|
||||||
go getVersionPeriodically(stopCh)
|
go getVersionPeriodically(stopCh)
|
||||||
go getGRPCRequestCountPeriodically(stopCh)
|
|
||||||
|
|
||||||
// Serve our metrics on listenAddress/metricsPath.
|
// Serve our metrics on listenAddress/metricsPath.
|
||||||
glog.Infof("Listening on: %v", listenAddress)
|
glog.Infof("Listening on: %v", listenAddress)
|
||||||
http.Handle(metricsPath, prometheus.UninstrumentedHandler())
|
http.Handle(metricsPath, promhttp.HandlerFor(gatherer, promhttp.HandlerOpts{}))
|
||||||
glog.Errorf("Stopped listening/serving metrics: %v", http.ListenAndServe(listenAddress, nil))
|
glog.Errorf("Stopped listening/serving metrics: %v", http.ListenAndServe(listenAddress, nil))
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user