From 7655702313809cea908d5109264f2f108cf1f9dc Mon Sep 17 00:00:00 2001
From: Jack
Date: Fri, 20 Aug 2021 12:38:01 -0700
Subject: [PATCH] add container probe duration metrics

---
 pkg/kubelet/prober/prober_manager.go | 14 ++++++++
 pkg/kubelet/prober/worker.go         | 38 +++++++++++++++++++++-
 pkg/kubelet/prober/worker_test.go    | 48 ++++++++++++++++++++++++++++
 pkg/kubelet/server/server.go         |  1 +
 4 files changed, 100 insertions(+), 1 deletion(-)

diff --git a/pkg/kubelet/prober/prober_manager.go b/pkg/kubelet/prober/prober_manager.go
index 570b3232499..1e18abd7593 100644
--- a/pkg/kubelet/prober/prober_manager.go
+++ b/pkg/kubelet/prober/prober_manager.go
@@ -48,6 +48,20 @@ var ProberResults = metrics.NewCounterVec(
 		"pod_uid"},
 )
 
+// ProberDuration stores the duration of a successful or unknown probe lifecycle as prometheus metrics.
+var ProberDuration = metrics.NewHistogramVec(
+	&metrics.HistogramOpts{
+		Subsystem:      "prober",
+		Name:           "probe_duration_seconds",
+		Help:           "Duration in seconds for a probe response.",
+		StabilityLevel: metrics.ALPHA,
+	},
+	[]string{"probe_type",
+		"container",
+		"pod",
+		"namespace"},
+)
+
 // Manager manages pod probing. It creates a probe "worker" for every container that specifies a
 // probe (AddPod). The worker periodically probes its assigned container and caches the results. The
 // manager use the cached probe results to set the appropriate Ready state in the PodStatus when
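Not part of the patch: below is a minimal, self-contained sketch of the HistogramVec pattern ProberDuration relies on, written against github.com/prometheus/client_golang (the library that k8s.io/component-base/metrics wraps). The name demoDuration and the label values are illustrative only; buckets are left at the client defaults, as the patch does.

package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// demoDuration mirrors the shape of ProberDuration: a histogram vector
// keyed by the same four labels, observed in seconds.
var demoDuration = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Subsystem: "prober",
		Name:      "probe_duration_seconds",
		Help:      "Duration in seconds for a probe response.",
	},
	[]string{"probe_type", "container", "pod", "namespace"},
)

func main() {
	reg := prometheus.NewRegistry()
	reg.MustRegister(demoDuration)

	// Time a fake probe and record the elapsed seconds -- the same
	// With(labels).Observe(...) pattern worker.go uses below.
	start := time.Now()
	time.Sleep(5 * time.Millisecond) // stand-in for a real probe
	demoDuration.With(prometheus.Labels{
		"probe_type": "Readiness",
		"container":  "app",
		"pod":        "coredns",
		"namespace":  "kube-system",
	}).Observe(time.Since(start).Seconds())

	// Gather and print the sample count to confirm the observation landed.
	mfs, _ := reg.Gather()
	for _, mf := range mfs {
		for _, m := range mf.GetMetric() {
			fmt.Println(mf.GetName(), "sample count:", m.GetHistogram().GetSampleCount())
		}
	}
}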
diff --git a/pkg/kubelet/prober/worker.go b/pkg/kubelet/prober/worker.go
index f627a79d036..582f6029c4a 100644
--- a/pkg/kubelet/prober/worker.go
+++ b/pkg/kubelet/prober/worker.go
@@ -17,7 +17,9 @@ limitations under the License.
 package prober
 
 import (
+	"fmt"
 	"math/rand"
+	"strings"
 	"time"
 
 	v1 "k8s.io/api/core/v1"
@@ -25,6 +27,7 @@ import (
 	"k8s.io/component-base/metrics"
 	"k8s.io/klog/v2"
 	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
+	"k8s.io/kubernetes/pkg/apis/apps"
 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
 	"k8s.io/kubernetes/pkg/kubelet/prober/results"
 )
@@ -74,6 +77,10 @@ type worker struct {
 	proberResultsSuccessfulMetricLabels metrics.Labels
 	proberResultsFailedMetricLabels     metrics.Labels
 	proberResultsUnknownMetricLabels    metrics.Labels
+	// proberDurationSuccessfulMetricLabels and proberDurationUnknownMetricLabels hold
+	// the labels attached to this worker for the ProberDuration metric, by probe result.
+	proberDurationSuccessfulMetricLabels metrics.Labels
+	proberDurationUnknownMetricLabels    metrics.Labels
 }
 
 // Creates and starts a new probe worker.
@@ -107,14 +114,23 @@ func newWorker(
 		w.initialValue = results.Unknown
 	}
 
+	podName := getPodLabelName(w.pod)
+
 	basicMetricLabels := metrics.Labels{
 		"probe_type": w.probeType.String(),
 		"container":  w.container.Name,
-		"pod":        w.pod.Name,
+		"pod":        podName,
 		"namespace":  w.pod.Namespace,
 		"pod_uid":    string(w.pod.UID),
 	}
 
+	proberDurationLabels := metrics.Labels{
+		"probe_type": w.probeType.String(),
+		"container":  w.container.Name,
+		"pod":        podName,
+		"namespace":  w.pod.Namespace,
+	}
+
 	w.proberResultsSuccessfulMetricLabels = deepCopyPrometheusLabels(basicMetricLabels)
 	w.proberResultsSuccessfulMetricLabels["result"] = probeResultSuccessful
 
@@ -124,6 +140,9 @@ func newWorker(
 	w.proberResultsUnknownMetricLabels = deepCopyPrometheusLabels(basicMetricLabels)
 	w.proberResultsUnknownMetricLabels["result"] = probeResultUnknown
 
+	w.proberDurationSuccessfulMetricLabels = deepCopyPrometheusLabels(proberDurationLabels)
+	w.proberDurationUnknownMetricLabels = deepCopyPrometheusLabels(proberDurationLabels)
+
 	return w
 }
 
@@ -151,6 +170,8 @@ func (w *worker) run() {
 		ProberResults.Delete(w.proberResultsSuccessfulMetricLabels)
 		ProberResults.Delete(w.proberResultsFailedMetricLabels)
 		ProberResults.Delete(w.proberResultsUnknownMetricLabels)
+		ProberDuration.Delete(w.proberDurationSuccessfulMetricLabels)
+		ProberDuration.Delete(w.proberDurationUnknownMetricLabels)
 	}()
 
 probeLoop:
@@ -181,6 +202,7 @@ func (w *worker) doProbe() (keepGoing bool) {
 	defer func() { recover() }() // Actually eat panics (HandleCrash takes care of logging)
 	defer runtime.HandleCrash(func(_ interface{}) { keepGoing = true })
 
+	startTime := time.Now()
 	status, ok := w.probeManager.statusManager.GetPodStatus(w.pod.UID)
 	if !ok {
 		// Either the pod has not been created yet, or it was already deleted.
@@ -273,10 +295,12 @@ func (w *worker) doProbe() (keepGoing bool) {
 	switch result {
 	case results.Success:
 		ProberResults.With(w.proberResultsSuccessfulMetricLabels).Inc()
+		ProberDuration.With(w.proberDurationSuccessfulMetricLabels).Observe(time.Since(startTime).Seconds())
 	case results.Failure:
 		ProberResults.With(w.proberResultsFailedMetricLabels).Inc()
 	default:
 		ProberResults.With(w.proberResultsUnknownMetricLabels).Inc()
+		ProberDuration.With(w.proberDurationUnknownMetricLabels).Observe(time.Since(startTime).Seconds())
 	}
 
 	if w.lastResult == result {
@@ -313,3 +337,15 @@ func deepCopyPrometheusLabels(m metrics.Labels) metrics.Labels {
 	}
 	return ret
 }
+
+func getPodLabelName(pod *v1.Pod) string {
+	podName := pod.Name
+	if pod.GenerateName != "" {
+		podNameSlice := strings.Split(pod.Name, "-")
+		podName = strings.Join(podNameSlice[:len(podNameSlice)-1], "-")
+		if label, ok := pod.GetLabels()[apps.DefaultDeploymentUniqueLabelKey]; ok {
+			podName = strings.ReplaceAll(podName, fmt.Sprintf("-%s", label), "")
+		}
+	}
+	return podName
+}
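Not part of the patch: getPodLabelName above normalizes the value used for the "pod" label, which keeps the histogram's label cardinality bounded across pod restarts and Deployment rollouts. The sketch below restates that logic on plain strings so it runs without kubelet dependencies; podLabelName, its parameters, and the sample names are illustrative (the real helper takes a *v1.Pod and reads the pod-template-hash label via apps.DefaultDeploymentUniqueLabelKey).

package main

import (
	"fmt"
	"strings"
)

// podLabelName restates getPodLabelName on plain strings. name, generateName,
// and podTemplateHash correspond to pod.Name, pod.GenerateName, and the
// pod-template-hash label in the real helper.
func podLabelName(name, generateName, podTemplateHash string) string {
	podName := name
	if generateName != "" {
		// Drop the random suffix appended by GenerateName
		// ("coredns-845757d86-ccqpf" -> "coredns-845757d86").
		parts := strings.Split(name, "-")
		podName = strings.Join(parts[:len(parts)-1], "-")
		// For Deployment-owned pods, also drop the pod-template-hash
		// ("coredns-845757d86" -> "coredns"), so the "pod" label stays
		// stable across rollouts.
		if podTemplateHash != "" {
			podName = strings.ReplaceAll(podName, fmt.Sprintf("-%s", podTemplateHash), "")
		}
	}
	return podName
}

func main() {
	// Static pod: no GenerateName, the name passes through unchanged.
	fmt.Println(podLabelName("kube-controller-manager-k8s-master-21385161-0", "", ""))
	// Deployment pod: random suffix and pod-template-hash are both stripped.
	fmt.Println(podLabelName("coredns-845757d86-ccqpf", "coredns-845757d86-", "845757d86"))
	// DaemonSet/ReplicaSet-style pod: only the random suffix is stripped.
	fmt.Println(podLabelName("kube-proxy-2gmqn", "kube-proxy-", ""))
}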
diff --git a/pkg/kubelet/prober/worker_test.go b/pkg/kubelet/prober/worker_test.go
index eee0bd1fb06..e86819fca88 100644
--- a/pkg/kubelet/prober/worker_test.go
+++ b/pkg/kubelet/prober/worker_test.go
@@ -25,6 +25,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/client-go/kubernetes/fake"
+	"k8s.io/kubernetes/pkg/apis/apps"
 	kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
 	"k8s.io/kubernetes/pkg/kubelet/prober/results"
 	"k8s.io/kubernetes/pkg/kubelet/status"
@@ -474,3 +475,50 @@ func TestStartupProbeDisabledByStarted(t *testing.T) {
 	expectContinue(t, w, w.doProbe(), msg)
 	expectResult(t, w, results.Success, msg)
 }
+
+func TestGetPodLabelName(t *testing.T) {
+	testCases := []struct {
+		name   string
+		pod    *v1.Pod
+		result string
+	}{
+		{
+			name: "Static pod",
+			pod: &v1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "kube-controller-manager-k8s-master-21385161-0",
+				},
+			},
+			result: "kube-controller-manager-k8s-master-21385161-0",
+		},
+		{
+			name: "Deployment pod",
+			pod: &v1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:         "coredns-845757d86-ccqpf",
+					GenerateName: "coredns-845757d86-",
+					Labels: map[string]string{
+						apps.DefaultDeploymentUniqueLabelKey: "845757d86",
+					},
+				},
+			},
+			result: "coredns",
+		},
+		{
+			name: "ReplicaSet pod",
+			pod: &v1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:         "kube-proxy-2gmqn",
+					GenerateName: "kube-proxy-",
+				},
+			},
+			result: "kube-proxy",
+		},
+	}
+	for _, test := range testCases {
+		ret := getPodLabelName(test.pod)
+		if ret != test.result {
+			t.Errorf("Expected %s, got %s", test.result, ret)
+		}
+	}
+}
diff --git a/pkg/kubelet/server/server.go b/pkg/kubelet/server/server.go
index 28dfc44977c..919c26fce40 100644
--- a/pkg/kubelet/server/server.go
+++ b/pkg/kubelet/server/server.go
@@ -400,6 +400,7 @@ func (s *Server) InstallDefaultHandlers() {
 	p := compbasemetrics.NewKubeRegistry()
 	_ = compbasemetrics.RegisterProcessStartTime(p.Register)
 	p.MustRegister(prober.ProberResults)
+	p.MustRegister(prober.ProberDuration)
 	s.restfulCont.Handle(proberMetricsPath,
 		compbasemetrics.HandlerFor(p, compbasemetrics.HandlerOpts{ErrorHandling: compbasemetrics.ContinueOnError}),
 	)
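Not part of the patch: for completeness, a hedged sketch of what the server.go registration amounts to, using the upstream client instead of compbasemetrics -- a dedicated registry serving one histogram over HTTP. The path and port are illustrative stand-ins for the kubelet's proberMetricsPath endpoint.

package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

var probeDuration = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Subsystem: "prober",
		Name:      "probe_duration_seconds",
		Help:      "Duration in seconds for a probe response.",
	},
	[]string{"probe_type", "container", "pod", "namespace"},
)

func main() {
	// A dedicated registry, analogous to the kubelet's probe-metrics
	// registry built with compbasemetrics.NewKubeRegistry().
	reg := prometheus.NewRegistry()
	reg.MustRegister(probeDuration)

	// Seed one observation so the endpoint has something to show.
	probeDuration.With(prometheus.Labels{
		"probe_type": "Liveness",
		"container":  "app",
		"pod":        "coredns",
		"namespace":  "kube-system",
	}).Observe(0.042)

	// ContinueOnError mirrors the HandlerOpts used in server.go.
	http.Handle("/metrics/probes", promhttp.HandlerFor(reg, promhttp.HandlerOpts{
		ErrorHandling: promhttp.ContinueOnError,
	}))
	log.Fatal(http.ListenAndServe(":9102", nil))
}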