Merge pull request #112741 from logicalhan/health-check-metrics

enable health check SLI metrics for apiserver
This commit is contained in:
Kubernetes Prow Robot 2022-09-27 15:37:50 -07:00 committed by GitHub
commit e11f23eb97
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 19 additions and 44 deletions

View File

@ -210,7 +210,7 @@ func ClusterRoles() []rbacv1.ClusterRole {
ObjectMeta: metav1.ObjectMeta{Name: "system:monitoring"}, ObjectMeta: metav1.ObjectMeta{Name: "system:monitoring"},
Rules: []rbacv1.PolicyRule{ Rules: []rbacv1.PolicyRule{
rbacv1helpers.NewRule("get").URLs( rbacv1helpers.NewRule("get").URLs(
"/metrics", "/metrics", "/metrics/slis",
"/livez", "/readyz", "/healthz", "/livez", "/readyz", "/healthz",
"/livez/*", "/readyz/*", "/healthz/*", "/livez/*", "/readyz/*", "/healthz/*",
).RuleOrDie(), ).RuleOrDie(),

View File

@ -934,6 +934,7 @@ items:
- /livez - /livez
- /livez/* - /livez/*
- /metrics - /metrics
- /metrics/slis
- /readyz - /readyz
- /readyz/* - /readyz/*
verbs: verbs:

View File

@ -67,6 +67,7 @@ import (
"k8s.io/client-go/informers" "k8s.io/client-go/informers"
restclient "k8s.io/client-go/rest" restclient "k8s.io/client-go/rest"
"k8s.io/component-base/logs" "k8s.io/component-base/logs"
"k8s.io/component-base/metrics/prometheus/slis"
"k8s.io/klog/v2" "k8s.io/klog/v2"
openapicommon "k8s.io/kube-openapi/pkg/common" openapicommon "k8s.io/kube-openapi/pkg/common"
"k8s.io/kube-openapi/pkg/validation/spec" "k8s.io/kube-openapi/pkg/validation/spec"
@ -884,8 +885,10 @@ func installAPI(s *GenericAPIServer, c *Config) {
if c.EnableMetrics { if c.EnableMetrics {
if c.EnableProfiling { if c.EnableProfiling {
routes.MetricsWithReset{}.Install(s.Handler.NonGoRestfulMux) routes.MetricsWithReset{}.Install(s.Handler.NonGoRestfulMux)
slis.SLIMetricsWithReset{}.Install(s.Handler.NonGoRestfulMux)
} else { } else {
routes.DefaultMetrics{}.Install(s.Handler.NonGoRestfulMux) routes.DefaultMetrics{}.Install(s.Handler.NonGoRestfulMux)
slis.SLIMetrics{}.Install(s.Handler.NonGoRestfulMux)
} }
} }

View File

@ -169,6 +169,7 @@ func TestNewWithDelegate(t *testing.T) {
"/livez/poststarthook/storage-object-count-tracker-hook", "/livez/poststarthook/storage-object-count-tracker-hook",
"/livez/poststarthook/wrapping-post-start-hook", "/livez/poststarthook/wrapping-post-start-hook",
"/metrics", "/metrics",
"/metrics/slis",
"/readyz", "/readyz",
"/readyz/delegate-health", "/readyz/delegate-health",
"/readyz/informer-sync", "/readyz/informer-sync",

View File

@ -18,6 +18,7 @@ package healthz
import ( import (
"bytes" "bytes"
"context"
"fmt" "fmt"
"net/http" "net/http"
"reflect" "reflect"
@ -30,6 +31,7 @@ import (
"k8s.io/apimachinery/pkg/util/wait" "k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apiserver/pkg/endpoints/metrics" "k8s.io/apiserver/pkg/endpoints/metrics"
"k8s.io/apiserver/pkg/server/httplog" "k8s.io/apiserver/pkg/server/httplog"
"k8s.io/component-base/metrics/prometheus/slis"
"k8s.io/klog/v2" "k8s.io/klog/v2"
) )
@ -237,6 +239,7 @@ func handleRootHealth(name string, firstTimeHealthy func(), checks ...HealthChec
continue continue
} }
if err := check.Check(r); err != nil { if err := check.Check(r); err != nil {
slis.ObserveHealthcheck(context.Background(), check.Name(), name, slis.Error)
// don't include the error since this endpoint is public. If someone wants more detail // don't include the error since this endpoint is public. If someone wants more detail
// they should have explicit permission to the detailed checks. // they should have explicit permission to the detailed checks.
fmt.Fprintf(&individualCheckOutput, "[-]%s failed: reason withheld\n", check.Name()) fmt.Fprintf(&individualCheckOutput, "[-]%s failed: reason withheld\n", check.Name())
@ -244,6 +247,7 @@ func handleRootHealth(name string, firstTimeHealthy func(), checks ...HealthChec
fmt.Fprintf(&failedVerboseLogOutput, "[-]%s failed: %v\n", check.Name(), err) fmt.Fprintf(&failedVerboseLogOutput, "[-]%s failed: %v\n", check.Name(), err)
failedChecks = append(failedChecks, check.Name()) failedChecks = append(failedChecks, check.Name())
} else { } else {
slis.ObserveHealthcheck(context.Background(), check.Name(), name, slis.Success)
fmt.Fprintf(&individualCheckOutput, "[+]%s ok\n", check.Name()) fmt.Fprintf(&individualCheckOutput, "[+]%s ok\n", check.Name())
} }
} }

View File

@ -18,8 +18,6 @@ package slis
import ( import (
"context" "context"
"errors"
k8smetrics "k8s.io/component-base/metrics" k8smetrics "k8s.io/component-base/metrics"
) )
@ -28,17 +26,10 @@ type HealthcheckStatus string
const ( const (
Success HealthcheckStatus = "success" Success HealthcheckStatus = "success"
Error HealthcheckStatus = "error" Error HealthcheckStatus = "error"
Pending HealthcheckStatus = "pending"
) )
type HealthcheckType string type HealthcheckType string
const (
Livez HealthcheckType = "livez"
Readyz HealthcheckType = "readyz"
Healthz HealthcheckType = "healthz"
)
var ( var (
// healthcheck is a Prometheus Gauge metric used for recording the results of a k8s healthcheck. // healthcheck is a Prometheus Gauge metric used for recording the results of a k8s healthcheck.
healthcheck = k8smetrics.NewGaugeVec( healthcheck = k8smetrics.NewGaugeVec(
@ -48,7 +39,7 @@ var (
Help: "This metric records the result of a single healthcheck.", Help: "This metric records the result of a single healthcheck.",
StabilityLevel: k8smetrics.ALPHA, StabilityLevel: k8smetrics.ALPHA,
}, },
[]string{"name", "type", "status"}, []string{"name", "type"},
) )
// healthchecksTotal is a Prometheus Counter metric used for counting the results of a k8s healthcheck. // healthchecksTotal is a Prometheus Counter metric used for counting the results of a k8s healthcheck.
@ -61,8 +52,6 @@ var (
}, },
[]string{"name", "type", "status"}, []string{"name", "type", "status"},
) )
statuses = []HealthcheckStatus{Success, Error, Pending}
statusSet = map[HealthcheckStatus]struct{}{Success: {}, Error: {}, Pending: {}}
) )
func Register(registry k8smetrics.KubeRegistry) { func Register(registry k8smetrics.KubeRegistry) {
@ -76,15 +65,12 @@ func ResetHealthMetrics() {
} }
func ObserveHealthcheck(ctx context.Context, name string, healthcheckType string, status HealthcheckStatus) error { func ObserveHealthcheck(ctx context.Context, name string, healthcheckType string, status HealthcheckStatus) error {
if _, ok := statusSet[status]; !ok { if status == Success {
return errors.New("not a valid healthcheck status") healthcheck.WithContext(ctx).WithLabelValues(name, healthcheckType).Set(1)
} } else {
for _, s := range statuses { healthcheck.WithContext(ctx).WithLabelValues(name, healthcheckType).Set(0)
if status != s {
healthcheck.WithContext(ctx).WithLabelValues(name, healthcheckType, string(s)).Set(0)
}
} }
healthchecksTotal.WithContext(ctx).WithLabelValues(name, healthcheckType, string(status)).Inc() healthchecksTotal.WithContext(ctx).WithLabelValues(name, healthcheckType, string(status)).Inc()
healthcheck.WithContext(ctx).WithLabelValues(name, healthcheckType, string(status)).Set(1)
return nil return nil
} }

View File

@ -39,9 +39,7 @@ func TestObserveHealthcheck(t *testing.T) {
initialOutput := ` initialOutput := `
# HELP kubernetes_healthcheck [ALPHA] This metric records the result of a single healthcheck. # HELP kubernetes_healthcheck [ALPHA] This metric records the result of a single healthcheck.
# TYPE kubernetes_healthcheck gauge # TYPE kubernetes_healthcheck gauge
kubernetes_healthcheck{name="healthcheck-a",status="error",type="healthz"} 1 kubernetes_healthcheck{name="healthcheck-a",type="healthz"} 0
kubernetes_healthcheck{name="healthcheck-a",status="pending",type="healthz"} 0
kubernetes_healthcheck{name="healthcheck-a",status="success",type="healthz"} 0
# HELP kubernetes_healthchecks_total [ALPHA] This metric records the results of all healthcheck. # HELP kubernetes_healthchecks_total [ALPHA] This metric records the results of all healthcheck.
# TYPE kubernetes_healthchecks_total counter # TYPE kubernetes_healthchecks_total counter
kubernetes_healthchecks_total{name="healthcheck-a",status="error",type="healthz"} 1 kubernetes_healthchecks_total{name="healthcheck-a",status="error",type="healthz"} 1
@ -53,23 +51,6 @@ func TestObserveHealthcheck(t *testing.T) {
hcStatus HealthcheckStatus hcStatus HealthcheckStatus
want string want string
}{ }{
{
desc: "test pending",
name: healthcheckName,
hcType: "healthz",
hcStatus: Pending,
want: `
# HELP kubernetes_healthcheck [ALPHA] This metric records the result of a single healthcheck.
# TYPE kubernetes_healthcheck gauge
kubernetes_healthcheck{name="healthcheck-a",status="error",type="healthz"} 0
kubernetes_healthcheck{name="healthcheck-a",status="pending",type="healthz"} 1
kubernetes_healthcheck{name="healthcheck-a",status="success",type="healthz"} 0
# HELP kubernetes_healthchecks_total [ALPHA] This metric records the results of all healthcheck.
# TYPE kubernetes_healthchecks_total counter
kubernetes_healthchecks_total{name="healthcheck-a",status="error",type="healthz"} 1
kubernetes_healthchecks_total{name="healthcheck-a",status="pending",type="healthz"} 1
`,
},
{ {
desc: "test success", desc: "test success",
name: healthcheckName, name: healthcheckName,
@ -78,9 +59,7 @@ func TestObserveHealthcheck(t *testing.T) {
want: ` want: `
# HELP kubernetes_healthcheck [ALPHA] This metric records the result of a single healthcheck. # HELP kubernetes_healthcheck [ALPHA] This metric records the result of a single healthcheck.
# TYPE kubernetes_healthcheck gauge # TYPE kubernetes_healthcheck gauge
kubernetes_healthcheck{name="healthcheck-a",status="error",type="healthz"} 0 kubernetes_healthcheck{name="healthcheck-a",type="healthz"} 1
kubernetes_healthcheck{name="healthcheck-a",status="pending",type="healthz"} 0
kubernetes_healthcheck{name="healthcheck-a",status="success",type="healthz"} 1
# HELP kubernetes_healthchecks_total [ALPHA] This metric records the results of all healthcheck. # HELP kubernetes_healthchecks_total [ALPHA] This metric records the results of all healthcheck.
# TYPE kubernetes_healthchecks_total counter # TYPE kubernetes_healthchecks_total counter
kubernetes_healthchecks_total{name="healthcheck-a",status="error",type="healthz"} 1 kubernetes_healthchecks_total{name="healthcheck-a",status="error",type="healthz"} 1

1
vendor/modules.txt vendored
View File

@ -1978,6 +1978,7 @@ k8s.io/component-base/metrics/prometheus/controllers
k8s.io/component-base/metrics/prometheus/feature k8s.io/component-base/metrics/prometheus/feature
k8s.io/component-base/metrics/prometheus/ratelimiter k8s.io/component-base/metrics/prometheus/ratelimiter
k8s.io/component-base/metrics/prometheus/restclient k8s.io/component-base/metrics/prometheus/restclient
k8s.io/component-base/metrics/prometheus/slis
k8s.io/component-base/metrics/prometheus/version k8s.io/component-base/metrics/prometheus/version
k8s.io/component-base/metrics/prometheus/workqueue k8s.io/component-base/metrics/prometheus/workqueue
k8s.io/component-base/metrics/prometheusextension k8s.io/component-base/metrics/prometheusextension