From 29178fff1c9a17a71d8fcc3d475f142bb683dc2c Mon Sep 17 00:00:00 2001 From: Lee Verberne Date: Thu, 11 Feb 2021 14:38:36 +0100 Subject: [PATCH 1/2] Add kubelet managed pod metrics --- .../kuberuntime/kuberuntime_manager.go | 19 +++-- pkg/kubelet/metrics/metrics.go | 77 +++++++++++++++++++ pkg/kubelet/pod/pod_manager.go | 46 ++++++++++- 3 files changed, 136 insertions(+), 6 deletions(-) diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager.go b/pkg/kubelet/kuberuntime/kuberuntime_manager.go index d83e75e300a..0022d2d720a 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_manager.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_manager.go @@ -48,6 +48,7 @@ import ( "k8s.io/kubernetes/pkg/kubelet/images" "k8s.io/kubernetes/pkg/kubelet/lifecycle" "k8s.io/kubernetes/pkg/kubelet/logs" + "k8s.io/kubernetes/pkg/kubelet/metrics" proberesults "k8s.io/kubernetes/pkg/kubelet/prober/results" "k8s.io/kubernetes/pkg/kubelet/runtimeclass" "k8s.io/kubernetes/pkg/kubelet/types" @@ -774,6 +775,7 @@ func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, podStatus *kubecontaine var err error klog.V(4).InfoS("Creating PodSandbox for pod", "pod", klog.KObj(pod)) + metrics.StartedPodsTotal.Inc() createSandboxResult := kubecontainer.NewSyncResult(kubecontainer.CreatePodSandbox, format.Pod(pod)) result.AddSyncResult(createSandboxResult) podSandboxID, msg, err = m.createPodSandbox(pod, podContainerChanges.Attempt) @@ -786,6 +788,7 @@ func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, podStatus *kubecontaine klog.V(4).InfoS("Pod was deleted and sandbox failed to be created", "pod", klog.KObj(pod), "podUID", pod.UID) return } + metrics.StartedPodsErrorsTotal.WithLabelValues(err.Error()).Inc() createSandboxResult.Fail(kubecontainer.ErrCreatePodSandbox, msg) klog.ErrorS(err, "CreatePodSandbox for pod failed", "pod", klog.KObj(pod)) ref, referr := ref.GetReference(legacyscheme.Scheme, pod) @@ -838,9 +841,11 @@ func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, podStatus *kubecontaine 
} // Helper containing boilerplate common to starting all types of containers. - // typeName is a label used to describe this type of container in log messages, + // typeName is a description used to describe this type of container in log messages, // currently: "container", "init container" or "ephemeral container" - start := func(typeName string, spec *startSpec) error { + // metricLabel is the label used to describe this type of container in monitoring metrics. + // currently: "container", "init_container" or "ephemeral_container" + start := func(typeName, metricLabel string, spec *startSpec) error { startContainerResult := kubecontainer.NewSyncResult(kubecontainer.StartContainer, spec.container.Name) result.AddSyncResult(startContainerResult) @@ -851,9 +856,13 @@ func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, podStatus *kubecontaine return err } + metrics.StartedContainersTotal.WithLabelValues(metricLabel).Inc() klog.V(4).InfoS("Creating container in pod", "containerType", typeName, "container", spec.container, "pod", klog.KObj(pod)) // NOTE (aramase) podIPs are populated for single stack and dual stack clusters. Send only podIPs. if msg, err := m.startContainer(podSandboxID, podSandboxConfig, spec, pod, podStatus, pullSecrets, podIP, podIPs); err != nil { + // startContainer() returns well-defined error codes that have reasonable cardinality for metrics and are + // useful to cluster administrators to distinguish "server errors" from "user errors". + metrics.StartedContainersErrorsTotal.WithLabelValues(metricLabel, err.Error()).Inc() startContainerResult.Fail(err, msg) // known errors that are logged in other places are logged at higher levels here to avoid // repetitive log spam @@ -875,14 +884,14 @@ func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, podStatus *kubecontaine // containers cannot be specified on pod creation. 
if utilfeature.DefaultFeatureGate.Enabled(features.EphemeralContainers) { for _, idx := range podContainerChanges.EphemeralContainersToStart { - start("ephemeral container", ephemeralContainerStartSpec(&pod.Spec.EphemeralContainers[idx])) + start("ephemeral container", metrics.EphemeralContainer, ephemeralContainerStartSpec(&pod.Spec.EphemeralContainers[idx])) } } // Step 6: start the init container. if container := podContainerChanges.NextInitContainerToStart; container != nil { // Start the next init container. - if err := start("init container", containerStartSpec(container)); err != nil { + if err := start("init container", metrics.InitContainer, containerStartSpec(container)); err != nil { return } @@ -892,7 +901,7 @@ func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, podStatus *kubecontaine // Step 7: start containers in podContainerChanges.ContainersToStart. for _, idx := range podContainerChanges.ContainersToStart { - start("container", containerStartSpec(&pod.Spec.Containers[idx])) + start("container", metrics.Container, containerStartSpec(&pod.Spec.Containers[idx])) } return diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go index 6fb40e1875b..6c54f6af634 100644 --- a/pkg/kubelet/metrics/metrics.go +++ b/pkg/kubelet/metrics/metrics.go @@ -83,6 +83,19 @@ const ( // Metrics keys for RuntimeClass RunPodSandboxDurationKey = "run_podsandbox_duration_seconds" RunPodSandboxErrorsKey = "run_podsandbox_errors_total" + + // Metrics to keep track of objects under management + ManagedPodsKey = "managed_pods" + ManagedContainersKey = "managed_containers" + StartedPodsTotalKey = "started_pods_total" + StartedPodsErrorsTotalKey = "started_pods_errors_total" + StartedContainersTotalKey = "started_containers_total" + StartedContainersErrorsTotalKey = "started_containers_errors_total" + + // Values used in metric labels + Container = "container" + InitContainer = "init_container" + EphemeralContainer = "ephemeral_container" ) var ( @@ -431,6 
+444,64 @@ var ( }, []string{"container_state"}, ) + // StartedPodsTotal is a counter that tracks pod sandbox creation operations + StartedPodsTotal = metrics.NewCounter( + &metrics.CounterOpts{ + Subsystem: KubeletSubsystem, + Name: StartedPodsTotalKey, + Help: "Cumulative number of pods started", + StabilityLevel: metrics.ALPHA, + }, + ) + // StartedPodsErrorsTotal is a counter that tracks the number of errors creating pod sandboxes + StartedPodsErrorsTotal = metrics.NewCounterVec( + &metrics.CounterOpts{ + Subsystem: KubeletSubsystem, + Name: StartedPodsErrorsTotalKey, + Help: "Cumulative number of errors when starting pods", + StabilityLevel: metrics.ALPHA, + }, + []string{"message"}, + ) + // StartedContainersTotal is a counter that tracks the number of container creation operations + StartedContainersTotal = metrics.NewCounterVec( + &metrics.CounterOpts{ + Subsystem: KubeletSubsystem, + Name: StartedContainersTotalKey, + Help: "Cumulative number of containers started", + StabilityLevel: metrics.ALPHA, + }, + []string{"container_type"}, + ) + // StartedContainersErrorsTotal is a counter that tracks the number of errors creating containers + StartedContainersErrorsTotal = metrics.NewCounterVec( + &metrics.CounterOpts{ + Subsystem: KubeletSubsystem, + Name: StartedContainersErrorsTotalKey, + Help: "Cumulative number of errors when starting containers", + StabilityLevel: metrics.ALPHA, + }, + []string{"container_type", "code"}, + ) + // ManagedPods is a gauge that tracks how many pods are managed by this kubelet + ManagedPods = metrics.NewGauge( + &metrics.GaugeOpts{ + Subsystem: KubeletSubsystem, + Name: ManagedPodsKey, + Help: "Number of pods managed by this kubelet", + StabilityLevel: metrics.ALPHA, + }, + ) + // ManagedContainers is a gauge that tracks how many containers are managed by this kubelet + ManagedContainers = metrics.NewGaugeVec( + &metrics.GaugeOpts{ + Subsystem: KubeletSubsystem, + Name: ManagedContainersKey, + Help: "Number of containers managed by 
this kubelet", + StabilityLevel: metrics.ALPHA, + }, + []string{"container_type"}, + ) ) var registerMetrics sync.Once @@ -459,6 +530,12 @@ func Register(collectors ...metrics.StableCollector) { legacyregistry.MustRegister(DevicePluginAllocationDuration) legacyregistry.MustRegister(RunningContainerCount) legacyregistry.MustRegister(RunningPodCount) + legacyregistry.MustRegister(ManagedPods) + legacyregistry.MustRegister(ManagedContainers) + legacyregistry.MustRegister(StartedPodsTotal) + legacyregistry.MustRegister(StartedPodsErrorsTotal) + legacyregistry.MustRegister(StartedContainersTotal) + legacyregistry.MustRegister(StartedContainersErrorsTotal) legacyregistry.MustRegister(RunPodSandboxDuration) legacyregistry.MustRegister(RunPodSandboxErrors) if utilfeature.DefaultFeatureGate.Enabled(features.DynamicKubeletConfig) { diff --git a/pkg/kubelet/pod/pod_manager.go b/pkg/kubelet/pod/pod_manager.go index 215a7a155d6..e0becae6126 100644 --- a/pkg/kubelet/pod/pod_manager.go +++ b/pkg/kubelet/pod/pod_manager.go @@ -19,10 +19,13 @@ package pod import ( "sync" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" + utilfeature "k8s.io/apiserver/pkg/util/feature" + "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/kubelet/configmap" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" + "k8s.io/kubernetes/pkg/kubelet/metrics" "k8s.io/kubernetes/pkg/kubelet/secret" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" ) @@ -159,6 +162,45 @@ func isPodInTerminatedState(pod *v1.Pod) bool { return pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded } +// updateMetrics updates the gauge metrics that track how many pods and containers this kubelet manages. +// oldPod or newPod may be nil to signify creation or deletion, respectively. 
+func updateMetrics(oldPod, newPod *v1.Pod) { + var numC, numIC, numEC int + countEC := utilfeature.DefaultFeatureGate.Enabled(features.EphemeralContainers) + + if oldPod != nil { + if newPod == nil { + metrics.ManagedPods.Dec() + } + numC -= len(oldPod.Spec.Containers) + numIC -= len(oldPod.Spec.InitContainers) + if countEC { + numEC -= len(oldPod.Spec.EphemeralContainers) + } + } + + if newPod != nil { + if oldPod == nil { + metrics.ManagedPods.Inc() + } + numC += len(newPod.Spec.Containers) + numIC += len(newPod.Spec.InitContainers) + if countEC { + numEC += len(newPod.Spec.EphemeralContainers) + } + } + + if numC != 0 { + metrics.ManagedContainers.WithLabelValues(metrics.Container).Add(float64(numC)) + } + if numIC != 0 { + metrics.ManagedContainers.WithLabelValues(metrics.InitContainer).Add(float64(numIC)) + } + if countEC && numEC != 0 { + metrics.ManagedContainers.WithLabelValues(metrics.EphemeralContainer).Add(float64(numEC)) + } +} + // updatePodsInternal replaces the given pods in the current state of the // manager, updating the various indices. The caller is assumed to hold the // lock. @@ -202,6 +244,7 @@ func (pm *basicManager) updatePodsInternal(pods ...*v1.Pod) { } } else { resolvedPodUID := kubetypes.ResolvedPodUID(pod.UID) + updateMetrics(pm.podByUID[resolvedPodUID], pod) pm.podByUID[resolvedPodUID] = pod pm.podByFullName[podFullName] = pod if mirror, ok := pm.mirrorPodByFullName[podFullName]; ok { @@ -212,6 +255,7 @@ func (pm *basicManager) updatePodsInternal(pods ...*v1.Pod) { } func (pm *basicManager) DeletePod(pod *v1.Pod) { + updateMetrics(pod, nil) pm.lock.Lock() defer pm.lock.Unlock() if pm.secretManager != nil { From 30d2ad576ac9d29e8f18bc0adf6bfa1a23a968e2 Mon Sep 17 00:00:00 2001 From: Lee Verberne Date: Tue, 15 Jun 2021 18:59:27 +0200 Subject: [PATCH 2/2] Remove ManagedPod,ManagedContainer metrics This replaces the generic ManagedPod and ManagedContainer kubelet metrics with a gauge to track only ephemeral container usage. 
--- pkg/kubelet/metrics/metrics.go | 28 ++++++++---------------- pkg/kubelet/pod/pod_manager.go | 40 +++++++++------------------------- 2 files changed, 19 insertions(+), 49 deletions(-) diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go index 6c54f6af634..99a98e4a427 100644 --- a/pkg/kubelet/metrics/metrics.go +++ b/pkg/kubelet/metrics/metrics.go @@ -84,14 +84,15 @@ const ( RunPodSandboxDurationKey = "run_podsandbox_duration_seconds" RunPodSandboxErrorsKey = "run_podsandbox_errors_total" - // Metrics to keep track of objects under management - ManagedPodsKey = "managed_pods" - ManagedContainersKey = "managed_containers" + // Metrics to keep track of total number of Pods and Containers started StartedPodsTotalKey = "started_pods_total" StartedPodsErrorsTotalKey = "started_pods_errors_total" StartedContainersTotalKey = "started_containers_total" StartedContainersErrorsTotalKey = "started_containers_errors_total" + // Metrics to track ephemeral container usage by this kubelet + ManagedEphemeralContainersKey = "managed_ephemeral_containers" + // Values used in metric labels Container = "container" InitContainer = "init_container" @@ -483,25 +484,15 @@ var ( }, []string{"container_type", "code"}, ) - // ManagedPods is a gauge that tracks how many pods are managed by this kubelet - ManagedPods = metrics.NewGauge( + // ManagedEphemeralContainers is a gauge that indicates how many ephemeral containers are managed by this kubelet. + ManagedEphemeralContainers = metrics.NewGauge( &metrics.GaugeOpts{ Subsystem: KubeletSubsystem, - Name: ManagedPodsKey, - Help: "Number of pods managed by this kubelet", + Name: ManagedEphemeralContainersKey, + Help: "Current number of ephemeral containers in pods managed by this kubelet. 
Ephemeral containers will be ignored if disabled by the EphemeralContainers feature gate, and this number will be 0.", StabilityLevel: metrics.ALPHA, }, ) - // ManagedContainers is a gauge that tracks how many containers are managed by this kubelet - ManagedContainers = metrics.NewGaugeVec( - &metrics.GaugeOpts{ - Subsystem: KubeletSubsystem, - Name: ManagedContainersKey, - Help: "Number of containers managed by this kubelet", - StabilityLevel: metrics.ALPHA, - }, - []string{"container_type"}, - ) ) var registerMetrics sync.Once @@ -530,8 +521,7 @@ func Register(collectors ...metrics.StableCollector) { legacyregistry.MustRegister(DevicePluginAllocationDuration) legacyregistry.MustRegister(RunningContainerCount) legacyregistry.MustRegister(RunningPodCount) - legacyregistry.MustRegister(ManagedPods) - legacyregistry.MustRegister(ManagedContainers) + legacyregistry.MustRegister(ManagedEphemeralContainers) legacyregistry.MustRegister(StartedPodsTotal) legacyregistry.MustRegister(StartedPodsErrorsTotal) legacyregistry.MustRegister(StartedContainersTotal) diff --git a/pkg/kubelet/pod/pod_manager.go b/pkg/kubelet/pod/pod_manager.go index e0becae6126..044be68b220 100644 --- a/pkg/kubelet/pod/pod_manager.go +++ b/pkg/kubelet/pod/pod_manager.go @@ -162,42 +162,22 @@ func isPodInTerminatedState(pod *v1.Pod) bool { return pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded } -// updateMetrics updates the gauge metrics that track how many pods and containers this kubelet manages. -// oldPod or newPod may be nil to signify creation or deletion, respectively. +// updateMetrics updates the metrics surfaced by the pod manager. +// oldPod or newPod may be nil to signify creation or deletion. 
func updateMetrics(oldPod, newPod *v1.Pod) { - var numC, numIC, numEC int - countEC := utilfeature.DefaultFeatureGate.Enabled(features.EphemeralContainers) + if !utilfeature.DefaultFeatureGate.Enabled(features.EphemeralContainers) { + return + } + var numEC int if oldPod != nil { - if newPod == nil { - metrics.ManagedPods.Dec() - } - numC -= len(oldPod.Spec.Containers) - numIC -= len(oldPod.Spec.InitContainers) - if countEC { - numEC -= len(oldPod.Spec.EphemeralContainers) - } + numEC -= len(oldPod.Spec.EphemeralContainers) } - if newPod != nil { - if oldPod == nil { - metrics.ManagedPods.Inc() - } - numC += len(newPod.Spec.Containers) - numIC += len(newPod.Spec.InitContainers) - if countEC { - numEC += len(newPod.Spec.EphemeralContainers) - } + numEC += len(newPod.Spec.EphemeralContainers) } - - if numC != 0 { - metrics.ManagedContainers.WithLabelValues(metrics.Container).Add(float64(numC)) - } - if numIC != 0 { - metrics.ManagedContainers.WithLabelValues(metrics.InitContainer).Add(float64(numIC)) - } - if countEC && numEC != 0 { - metrics.ManagedContainers.WithLabelValues(metrics.EphemeralContainer).Add(float64(numEC)) + if numEC != 0 { + metrics.ManagedEphemeralContainers.Add(float64(numEC)) } }