Merge pull request #107896 from smarterclayton/track_pod_sync_latency

kubelet: Record a metric for latency of pod status update
Kubernetes Prow Robot 2022-09-27 14:25:50 -07:00 committed by GitHub
commit 00532e305a
2 changed files with 40 additions and 3 deletions

pkg/kubelet/metrics/metrics.go

@@ -37,6 +37,7 @@ const (
 	PodStartDurationKey           = "pod_start_duration_seconds"
 	CgroupManagerOperationsKey    = "cgroup_manager_duration_seconds"
 	PodWorkerStartDurationKey     = "pod_worker_start_duration_seconds"
+	PodStatusSyncDurationKey      = "pod_status_sync_duration_seconds"
 	PLEGRelistDurationKey         = "pleg_relist_duration_seconds"
 	PLEGDiscardEventsKey          = "pleg_discard_events"
 	PLEGRelistIntervalKey         = "pleg_relist_interval_seconds"
@@ -53,6 +54,7 @@ const (
 	VolumeStatsHealthStatusAbnormalKey = "volume_stats_health_status_abnormal"
 	RunningPodsKey                     = "running_pods"
 	RunningContainersKey               = "running_containers"
+
 	// Metrics keys of remote runtime operations
 	RuntimeOperationsKey         = "runtime_operations_total"
 	RuntimeOperationsDurationKey = "runtime_operations_duration_seconds"
@ -156,6 +158,18 @@ var (
StabilityLevel: metrics.ALPHA,
},
)
// PodStatusSyncDuration is a Histogram that tracks the duration (in seconds) in takes from the time a pod
// status is generated to the time it is synced with the apiserver. If multiple status changes are generated
// on a pod before it is written to the API, the latency is from the first update to the last event.
PodStatusSyncDuration = metrics.NewHistogram(
&metrics.HistogramOpts{
Subsystem: KubeletSubsystem,
Name: PodStatusSyncDurationKey,
Help: "Duration in seconds to sync a pod status update. Measures time from detection of a change to pod status until the API is successfully updated for that pod, even if multiple intevening changes to pod status occur.",
Buckets: []float64{0.010, 0.050, 0.100, 0.500, 1, 5, 10, 20, 30, 45, 60},
StabilityLevel: metrics.ALPHA,
},
)
// PLEGRelistDuration is a Histogram that tracks the duration (in seconds) it takes for relisting pods in the Kubelet's
// Pod Lifecycle Event Generator (PLEG).
PLEGRelistDuration = metrics.NewHistogram(
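
Editor's note: the histogram's buckets run from 10ms to 60s, so a single sync that takes, say, 350ms increments every cumulative bucket with le >= 0.5. A standalone sketch (not part of this commit; it uses upstream client_golang, which k8s.io/component-base/metrics wraps) demonstrating that bucket behavior:

package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
)

func main() {
	// Same bucket boundaries as PodStatusSyncDuration above.
	h := prometheus.NewHistogram(prometheus.HistogramOpts{
		Name:    "pod_status_sync_duration_seconds",
		Buckets: []float64{0.010, 0.050, 0.100, 0.500, 1, 5, 10, 20, 30, 45, 60},
	})
	h.Observe((350 * time.Millisecond).Seconds())

	// Dump the cumulative bucket counts: every bucket with le >= 0.5 reports 1.
	var m dto.Metric
	_ = h.Write(&m)
	for _, b := range m.Histogram.Bucket {
		fmt.Printf("le=%g count=%d\n", b.GetUpperBound(), b.GetCumulativeCount())
	}
}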
@@ -496,6 +510,7 @@ func Register(collectors ...metrics.StableCollector) {
 		legacyregistry.MustRegister(PodStartDuration)
 		legacyregistry.MustRegister(CgroupManagerDuration)
 		legacyregistry.MustRegister(PodWorkerStartDuration)
+		legacyregistry.MustRegister(PodStatusSyncDuration)
 		legacyregistry.MustRegister(ContainersPerPodCount)
 		legacyregistry.MustRegister(PLEGRelistDuration)
 		legacyregistry.MustRegister(PLEGDiscardEvents)
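
Editor's note (illustrative, not from the patch): with Subsystem "kubelet", the exported series name is kubelet_pod_status_sync_duration_seconds. A minimal, runnable sketch wiring an equivalent component-base histogram into the legacy registry and serving it over HTTP; the variable name and port are placeholders:

package main

import (
	"net/http"

	"k8s.io/component-base/metrics"
	"k8s.io/component-base/metrics/legacyregistry"
)

// podStatusSyncDuration is a stand-in for the real PodStatusSyncDuration;
// name and buckets mirror the definition in metrics.go above.
var podStatusSyncDuration = metrics.NewHistogram(
	&metrics.HistogramOpts{
		Subsystem:      "kubelet",
		Name:           "pod_status_sync_duration_seconds",
		Help:           "Duration in seconds to sync a pod status update.",
		Buckets:        []float64{0.010, 0.050, 0.100, 0.500, 1, 5, 10, 20, 30, 45, 60},
		StabilityLevel: metrics.ALPHA,
	},
)

func main() {
	legacyregistry.MustRegister(podStatusSyncDuration)
	podStatusSyncDuration.Observe(0.042) // one synthetic observation

	// Serve the legacy registry the way a component's /metrics endpoint does.
	http.Handle("/metrics", legacyregistry.Handler())
	_ = http.ListenAndServe("127.0.0.1:9999", nil)
}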

pkg/kubelet/status/status_manager.go

@@ -37,6 +37,7 @@ import (
 	"k8s.io/klog/v2"
 	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
+	"k8s.io/kubernetes/pkg/kubelet/metrics"
 	kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
 	statusutil "k8s.io/kubernetes/pkg/util/pod"
@@ -45,12 +46,15 @@ import (
 // A wrapper around v1.PodStatus that includes a version to enforce that stale pod statuses are
 // not sent to the API server.
 type versionedPodStatus struct {
-	status v1.PodStatus
-	// Monotonically increasing version number (per pod).
+	// version is a monotonically increasing version number (per pod).
 	version uint64
 	// Pod name & namespace, for sending updates to API server.
 	podName      string
 	podNamespace string
+	// at is the time at which the most recent status update was detected
+	at time.Time
+
+	status v1.PodStatus
 }

 type podStatusSyncRequest struct {
@@ -545,6 +549,16 @@ func (m *manager) updateStatusInternal(pod *v1.Pod, status v1.PodStatus, forceUp
 		podName:      pod.Name,
 		podNamespace: pod.Namespace,
 	}
+
+	// Multiple status updates can be generated before we update the API server,
+	// so we track the time from the first status update until we retire it to
+	// the API.
+	if cachedStatus.at.IsZero() {
+		newStatus.at = time.Now()
+	} else {
+		newStatus.at = cachedStatus.at
+	}
+
 	m.podStatuses[pod.UID] = newStatus

 	select {
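
Editor's note: the branch above implements a "first change wins" clock; if a pending (not yet synced) status already carries a detection time, later coalesced updates inherit it. A simplified, self-contained sketch of that rule (the type is reduced to the fields the timing logic touches; nextStatus is a hypothetical helper name):

package main

import (
	"fmt"
	"time"
)

// versionedPodStatus here keeps only the fields relevant to the timing rule.
type versionedPodStatus struct {
	version uint64
	at      time.Time
}

// nextStatus mimics the cachedStatus.at.IsZero() branch in updateStatusInternal.
func nextStatus(cached versionedPodStatus) versionedPodStatus {
	next := versionedPodStatus{version: cached.version + 1}
	if cached.at.IsZero() {
		next.at = time.Now() // no clock running yet: start it now
	} else {
		next.at = cached.at // coalesced change: keep the original detection time
	}
	return next
}

func main() {
	var s versionedPodStatus // zero value: nothing pending, at is the zero time
	s = nextStatus(s)
	first := s.at
	time.Sleep(10 * time.Millisecond)
	s = nextStatus(s)              // a second change lands before any sync
	fmt.Println(s.at.Equal(first)) // true: latency is measured from the first change
}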
@@ -685,7 +699,7 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) {
 	mergedStatus := mergePodStatus(pod.Status, status.status, m.podDeletionSafety.PodCouldHaveRunningContainers(pod))

 	newPod, patchBytes, unchanged, err := statusutil.PatchPodStatus(context.TODO(), m.kubeClient, pod.Namespace, pod.Name, pod.UID, pod.Status, mergedStatus)
-	klog.V(3).InfoS("Patch status for pod", "pod", klog.KObj(pod), "patch", string(patchBytes))
+	klog.V(3).InfoS("Patch status for pod", "pod", klog.KObj(pod), "podUID", uid, "patch", string(patchBytes))

 	if err != nil {
 		klog.InfoS("Failed to update status for pod", "pod", klog.KObj(pod), "err", err)
@@ -698,6 +712,14 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) {
 		pod = newPod
 	}

+	// measure how long the status update took to propagate from generation to update on the server
+	if status.at.IsZero() {
+		klog.V(3).InfoS("Pod had no status time set", "pod", klog.KObj(pod), "podUID", uid, "version", status.version)
+	} else {
+		duration := time.Now().Sub(status.at).Truncate(time.Millisecond)
+		metrics.PodStatusSyncDuration.Observe(duration.Seconds())
+	}
+
 	m.apiStatusVersions[kubetypes.MirrorPodUID(pod.UID)] = status.version

 	// We don't handle graceful deletion of mirror pods.
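
Editor's note: in the else branch, the elapsed time is truncated to whole milliseconds before being converted to seconds for Observe (time.Since(status.at) would be the idiomatic equivalent of time.Now().Sub(status.at)). A tiny sketch of that arithmetic, with a made-up elapsed time:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Pretend the first status change was detected ~1.2345s ago.
	at := time.Now().Add(-1234500 * time.Microsecond)

	// Same math as syncPod: truncate to whole milliseconds, then express in seconds.
	duration := time.Now().Sub(at).Truncate(time.Millisecond)
	fmt.Println(duration.Seconds()) // ≈ 1.234 — the value handed to Observe()
}

On the scrape side, percentiles can then be derived the usual Prometheus way, via histogram_quantile over the resulting _bucket series.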