diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager.go b/pkg/kubelet/kuberuntime/kuberuntime_manager.go
index 87bf70b031e..6584d0f17c9 100644
--- a/pkg/kubelet/kuberuntime/kuberuntime_manager.go
+++ b/pkg/kubelet/kuberuntime/kuberuntime_manager.go
@@ -48,6 +48,7 @@ import (
 	"k8s.io/kubernetes/pkg/kubelet/images"
 	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
 	"k8s.io/kubernetes/pkg/kubelet/logs"
+	"k8s.io/kubernetes/pkg/kubelet/metrics"
 	proberesults "k8s.io/kubernetes/pkg/kubelet/prober/results"
 	"k8s.io/kubernetes/pkg/kubelet/runtimeclass"
 	"k8s.io/kubernetes/pkg/kubelet/types"
@@ -784,6 +785,7 @@ func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, podStatus *kubecontaine
 		var err error
 
 		klog.V(4).InfoS("Creating PodSandbox for pod", "pod", klog.KObj(pod))
+		metrics.StartedPodsTotal.Inc()
 		createSandboxResult := kubecontainer.NewSyncResult(kubecontainer.CreatePodSandbox, format.Pod(pod))
 		result.AddSyncResult(createSandboxResult)
 		podSandboxID, msg, err = m.createPodSandbox(pod, podContainerChanges.Attempt)
@@ -796,6 +798,9 @@ func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, podStatus *kubecontaine
 				klog.V(4).InfoS("Pod was deleted and sandbox failed to be created", "pod", klog.KObj(pod), "podUID", pod.UID)
 				return
 			}
+			// Label with the fixed error code rather than err.Error(): err may carry arbitrary runtime or
+			// pod-specific text, and every distinct label value creates a new time series.
+			metrics.StartedPodsErrorsTotal.WithLabelValues(kubecontainer.ErrCreatePodSandbox.Error()).Inc()
 			createSandboxResult.Fail(kubecontainer.ErrCreatePodSandbox, msg)
 			klog.ErrorS(err, "CreatePodSandbox for pod failed", "pod", klog.KObj(pod))
 			ref, referr := ref.GetReference(legacyscheme.Scheme, pod)
@@ -848,9 +853,11 @@ func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, podStatus *kubecontaine
 	}
 
 	// Helper containing boilerplate common to starting all types of containers.
-	// typeName is a label used to describe this type of container in log messages,
+	// typeName is a human-readable name for this type of container, used in log messages,
 	// currently: "container", "init container" or "ephemeral container"
-	start := func(typeName string, spec *startSpec) error {
+	// metricLabel is the value of the container_type label for this type of container in monitoring metrics,
+	// currently: "container", "init_container" or "ephemeral_container"
+	start := func(typeName, metricLabel string, spec *startSpec) error {
 		startContainerResult := kubecontainer.NewSyncResult(kubecontainer.StartContainer, spec.container.Name)
 		result.AddSyncResult(startContainerResult)
 
@@ -861,9 +868,13 @@ func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, podStatus *kubecontaine
 			return err
 		}
 
+		metrics.StartedContainersTotal.WithLabelValues(metricLabel).Inc()
 		klog.V(4).InfoS("Creating container in pod", "containerType", typeName, "container", spec.container, "pod", klog.KObj(pod))
 		// NOTE (aramase) podIPs are populated for single stack and dual stack clusters. Send only podIPs.
 		if msg, err := m.startContainer(podSandboxID, podSandboxConfig, spec, pod, podStatus, pullSecrets, podIP, podIPs); err != nil {
+			// startContainer() returns well-defined error codes that have reasonable cardinality for metrics and are
+			// useful to cluster administrators to distinguish "server errors" from "user errors".
+			metrics.StartedContainersErrorsTotal.WithLabelValues(metricLabel, err.Error()).Inc()
 			startContainerResult.Fail(err, msg)
 			// known errors that are logged in other places are logged at higher levels here to avoid
 			// repetitive log spam
@@ -885,14 +896,14 @@ func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, podStatus *kubecontaine
 	// containers cannot be specified on pod creation.
 	if utilfeature.DefaultFeatureGate.Enabled(features.EphemeralContainers) {
 		for _, idx := range podContainerChanges.EphemeralContainersToStart {
-			start("ephemeral container", ephemeralContainerStartSpec(&pod.Spec.EphemeralContainers[idx]))
+			start("ephemeral container", metrics.EphemeralContainer, ephemeralContainerStartSpec(&pod.Spec.EphemeralContainers[idx]))
 		}
 	}
 
 	// Step 6: start the init container.
 	if container := podContainerChanges.NextInitContainerToStart; container != nil {
 		// Start the next init container.
-		if err := start("init container", containerStartSpec(container)); err != nil {
+		if err := start("init container", metrics.InitContainer, containerStartSpec(container)); err != nil {
 			return
 		}
 
@@ -902,7 +913,7 @@ func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, podStatus *kubecontaine
 
 	// Step 7: start containers in podContainerChanges.ContainersToStart.
 	for _, idx := range podContainerChanges.ContainersToStart {
-		start("container", containerStartSpec(&pod.Spec.Containers[idx]))
+		start("container", metrics.Container, containerStartSpec(&pod.Spec.Containers[idx]))
 	}
 
 	return
diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go
index c4e022b1995..5daa512cb2b 100644
--- a/pkg/kubelet/metrics/metrics.go
+++ b/pkg/kubelet/metrics/metrics.go
@@ -83,6 +83,20 @@ const (
 	// Metrics keys for RuntimeClass
 	RunPodSandboxDurationKey = "run_podsandbox_duration_seconds"
 	RunPodSandboxErrorsKey   = "run_podsandbox_errors_total"
+
+	// Metrics to keep track of total number of Pods and Containers started
+	StartedPodsTotalKey             = "started_pods_total"
+	StartedPodsErrorsTotalKey       = "started_pods_errors_total"
+	StartedContainersTotalKey       = "started_containers_total"
+	StartedContainersErrorsTotalKey = "started_containers_errors_total"
+
+	// Metrics to track ephemeral container usage by this kubelet
+	ManagedEphemeralContainersKey = "managed_ephemeral_containers"
+
+	// Values used in metric labels
+	Container          = "container"
+	InitContainer      = "init_container"
+	EphemeralContainer = "ephemeral_container"
 )
 
 var (
@@ -436,6 +450,54 @@ var (
 		},
 		[]string{"container_state"},
 	)
+	// StartedPodsTotal is a counter that tracks pod sandbox creation operations
+	StartedPodsTotal = metrics.NewCounter(
+		&metrics.CounterOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           StartedPodsTotalKey,
+			Help:           "Cumulative number of pods started",
+			StabilityLevel: metrics.ALPHA,
+		},
+	)
+	// StartedPodsErrorsTotal is a counter that tracks the number of errors creating pod sandboxes
+	StartedPodsErrorsTotal = metrics.NewCounterVec(
+		&metrics.CounterOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           StartedPodsErrorsTotalKey,
+			Help:           "Cumulative number of errors when starting pods",
+			StabilityLevel: metrics.ALPHA,
+		},
+		[]string{"message"},
+	)
+	// StartedContainersTotal is a counter that tracks the number of container creation operations
+	StartedContainersTotal = metrics.NewCounterVec(
+		&metrics.CounterOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           StartedContainersTotalKey,
+			Help:           "Cumulative number of containers started",
+			StabilityLevel: metrics.ALPHA,
+		},
+		[]string{"container_type"},
+	)
+	// StartedContainersErrorsTotal is a counter that tracks the number of errors creating containers
+	StartedContainersErrorsTotal = metrics.NewCounterVec(
+		&metrics.CounterOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           StartedContainersErrorsTotalKey,
+			Help:           "Cumulative number of errors when starting containers",
+			StabilityLevel: metrics.ALPHA,
+		},
+		[]string{"container_type", "code"},
+	)
+	// ManagedEphemeralContainers is a gauge that indicates how many ephemeral containers are managed by this kubelet.
+	ManagedEphemeralContainers = metrics.NewGauge(
+		&metrics.GaugeOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           ManagedEphemeralContainersKey,
+			Help:           "Current number of ephemeral containers in pods managed by this kubelet. Ephemeral containers will be ignored if disabled by the EphemeralContainers feature gate, and this number will be 0.",
+			StabilityLevel: metrics.ALPHA,
+		},
+	)
 )
 
 var registerMetrics sync.Once
@@ -464,6 +526,11 @@ func Register(collectors ...metrics.StableCollector) {
 		legacyregistry.MustRegister(DevicePluginAllocationDuration)
 		legacyregistry.MustRegister(RunningContainerCount)
 		legacyregistry.MustRegister(RunningPodCount)
+		legacyregistry.MustRegister(ManagedEphemeralContainers)
+		legacyregistry.MustRegister(StartedPodsTotal)
+		legacyregistry.MustRegister(StartedPodsErrorsTotal)
+		legacyregistry.MustRegister(StartedContainersTotal)
+		legacyregistry.MustRegister(StartedContainersErrorsTotal)
 		legacyregistry.MustRegister(RunPodSandboxDuration)
 		legacyregistry.MustRegister(RunPodSandboxErrors)
 		if utilfeature.DefaultFeatureGate.Enabled(features.DynamicKubeletConfig) {
diff --git a/pkg/kubelet/pod/pod_manager.go b/pkg/kubelet/pod/pod_manager.go
index 215a7a155d6..044be68b220 100644
--- a/pkg/kubelet/pod/pod_manager.go
+++ b/pkg/kubelet/pod/pod_manager.go
@@ -19,10 +19,13 @@ package pod
 import (
 	"sync"
 
-	"k8s.io/api/core/v1"
+	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/types"
+	utilfeature "k8s.io/apiserver/pkg/util/feature"
+	"k8s.io/kubernetes/pkg/features"
 	"k8s.io/kubernetes/pkg/kubelet/configmap"
 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
+	"k8s.io/kubernetes/pkg/kubelet/metrics"
 	"k8s.io/kubernetes/pkg/kubelet/secret"
 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
 )
@@ -159,6 +162,25 @@ func isPodInTerminatedState(pod *v1.Pod) bool {
 	return pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded
 }
 
+// updateMetrics adjusts the ManagedEphemeralContainers gauge by the change in ephemeral container count from oldPod to newPod.
+// oldPod or newPod may be nil to signify creation or deletion; it is a no-op when the EphemeralContainers feature gate is disabled.
+func updateMetrics(oldPod, newPod *v1.Pod) {
+	if !utilfeature.DefaultFeatureGate.Enabled(features.EphemeralContainers) {
+		return
+	}
+
+	var numEC int
+	if oldPod != nil {
+		numEC -= len(oldPod.Spec.EphemeralContainers)
+	}
+	if newPod != nil {
+		numEC += len(newPod.Spec.EphemeralContainers)
+	}
+	if numEC != 0 {
+		metrics.ManagedEphemeralContainers.Add(float64(numEC))
+	}
+}
+
 // updatePodsInternal replaces the given pods in the current state of the
 // manager, updating the various indices. The caller is assumed to hold the
 // lock.
@@ -202,6 +224,7 @@ func (pm *basicManager) updatePodsInternal(pods ...*v1.Pod) {
 			}
 		} else {
 			resolvedPodUID := kubetypes.ResolvedPodUID(pod.UID)
+			updateMetrics(pm.podByUID[resolvedPodUID], pod)
 			pm.podByUID[resolvedPodUID] = pod
 			pm.podByFullName[podFullName] = pod
 			if mirror, ok := pm.mirrorPodByFullName[podFullName]; ok {
@@ -212,6 +235,7 @@ func (pm *basicManager) updatePodsInternal(pods ...*v1.Pod) {
 }
 
 func (pm *basicManager) DeletePod(pod *v1.Pod) {
+	updateMetrics(pod, nil)
 	pm.lock.Lock()
 	defer pm.lock.Unlock()
 	if pm.secretManager != nil {