From e9a5fb7372ed56a1dcf422ee03a3c5ce64299de9 Mon Sep 17 00:00:00 2001 From: Clayton Coleman Date: Mon, 15 Aug 2022 13:20:45 -0400 Subject: [PATCH] kubelet: Record a metric for latency of pod status update Track how long it takes for pod updates to propagate from detection to successful change on API server. Will guide future improvements in pod start and shutdown latency. Metric is `kubelet_pod_status_sync_duration_seconds` and is ALPHA stability. Histogram buckets are chosen based on distribution of observed status delays in practice. --- pkg/kubelet/metrics/metrics.go | 15 +++++++++++++++ pkg/kubelet/status/status_manager.go | 28 +++++++++++++++++++++++++--- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go index 53b04c05897..17267b888d7 100644 --- a/pkg/kubelet/metrics/metrics.go +++ b/pkg/kubelet/metrics/metrics.go @@ -37,6 +37,7 @@ const ( PodStartDurationKey = "pod_start_duration_seconds" CgroupManagerOperationsKey = "cgroup_manager_duration_seconds" PodWorkerStartDurationKey = "pod_worker_start_duration_seconds" + PodStatusSyncDurationKey = "pod_status_sync_duration_seconds" PLEGRelistDurationKey = "pleg_relist_duration_seconds" PLEGDiscardEventsKey = "pleg_discard_events" PLEGRelistIntervalKey = "pleg_relist_interval_seconds" @@ -53,6 +54,7 @@ const ( VolumeStatsHealthStatusAbnormalKey = "volume_stats_health_status_abnormal" RunningPodsKey = "running_pods" RunningContainersKey = "running_containers" + // Metrics keys of remote runtime operations RuntimeOperationsKey = "runtime_operations_total" RuntimeOperationsDurationKey = "runtime_operations_duration_seconds" @@ -156,6 +158,18 @@ var ( StabilityLevel: metrics.ALPHA, }, ) + // PodStatusSyncDuration is a Histogram that tracks the duration (in seconds) it takes from the time a pod + // status is generated to the time it is synced with the apiserver. 
If multiple status changes are generated + // on a pod before it is written to the API, the latency is from the first update to the last event. + PodStatusSyncDuration = metrics.NewHistogram( + &metrics.HistogramOpts{ + Subsystem: KubeletSubsystem, + Name: PodStatusSyncDurationKey, + Help: "Duration in seconds to sync a pod status update. Measures time from detection of a change to pod status until the API is successfully updated for that pod, even if multiple intervening changes to pod status occur.", + Buckets: []float64{0.010, 0.050, 0.100, 0.500, 1, 5, 10, 20, 30, 45, 60}, + StabilityLevel: metrics.ALPHA, + }, + ) + // PLEGRelistDuration is a Histogram that tracks the duration (in seconds) it takes for relisting pods in the Kubelet's // Pod Lifecycle Event Generator (PLEG). PLEGRelistDuration = metrics.NewHistogram( @@ -496,6 +510,7 @@ func Register(collectors ...metrics.StableCollector) { legacyregistry.MustRegister(PodStartDuration) legacyregistry.MustRegister(CgroupManagerDuration) legacyregistry.MustRegister(PodWorkerStartDuration) + legacyregistry.MustRegister(PodStatusSyncDuration) legacyregistry.MustRegister(ContainersPerPodCount) legacyregistry.MustRegister(PLEGRelistDuration) legacyregistry.MustRegister(PLEGDiscardEvents) diff --git a/pkg/kubelet/status/status_manager.go b/pkg/kubelet/status/status_manager.go index c767e7dc41b..4b22433d68d 100644 --- a/pkg/kubelet/status/status_manager.go +++ b/pkg/kubelet/status/status_manager.go @@ -37,6 +37,7 @@ import ( "k8s.io/klog/v2" podutil "k8s.io/kubernetes/pkg/api/v1/pod" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" + "k8s.io/kubernetes/pkg/kubelet/metrics" kubepod "k8s.io/kubernetes/pkg/kubelet/pod" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" statusutil "k8s.io/kubernetes/pkg/util/pod" @@ -45,12 +46,15 @@ import ( // A wrapper around v1.PodStatus that includes a version to enforce that stale pod statuses are // not sent to the API server. 
type versionedPodStatus struct { - status v1.PodStatus - // Monotonically increasing version number (per pod). + // version is a monotonically increasing version number (per pod). version uint64 // Pod name & namespace, for sending updates to API server. podName string podNamespace string + // at is the time at which the most recent status update was detected + at time.Time + + status v1.PodStatus } type podStatusSyncRequest struct { @@ -545,6 +549,16 @@ func (m *manager) updateStatusInternal(pod *v1.Pod, status v1.PodStatus, forceUp podName: pod.Name, podNamespace: pod.Namespace, } + + // Multiple status updates can be generated before we update the API server, + // so we track the time from the first status update until we retire it to + // the API. + if cachedStatus.at.IsZero() { + newStatus.at = time.Now() + } else { + newStatus.at = cachedStatus.at + } + m.podStatuses[pod.UID] = newStatus select { @@ -685,7 +699,7 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) { mergedStatus := mergePodStatus(pod.Status, status.status, m.podDeletionSafety.PodCouldHaveRunningContainers(pod)) newPod, patchBytes, unchanged, err := statusutil.PatchPodStatus(context.TODO(), m.kubeClient, pod.Namespace, pod.Name, pod.UID, pod.Status, mergedStatus) - klog.V(3).InfoS("Patch status for pod", "pod", klog.KObj(pod), "patch", string(patchBytes)) + klog.V(3).InfoS("Patch status for pod", "pod", klog.KObj(pod), "podUID", uid, "patch", string(patchBytes)) if err != nil { klog.InfoS("Failed to update status for pod", "pod", klog.KObj(pod), "err", err) @@ -698,6 +712,14 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) { pod = newPod } + // measure how long the status update took to propagate from generation to update on the server + if status.at.IsZero() { + klog.V(3).InfoS("Pod had no status time set", "pod", klog.KObj(pod), "podUID", uid, "version", status.version) + } else { + duration := time.Now().Sub(status.at).Truncate(time.Millisecond) + 
metrics.PodStatusSyncDuration.Observe(duration.Seconds()) + } + m.apiStatusVersions[kubetypes.MirrorPodUID(pod.UID)] = status.version // We don't handle graceful deletion of mirror pods.