mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-23 11:50:44 +00:00
Merge pull request #107896 from smarterclayton/track_pod_sync_latency
kubelet: Record a metric for latency of pod status update
This commit is contained in:
commit
00532e305a
@ -37,6 +37,7 @@ const (
|
||||
PodStartDurationKey = "pod_start_duration_seconds"
|
||||
CgroupManagerOperationsKey = "cgroup_manager_duration_seconds"
|
||||
PodWorkerStartDurationKey = "pod_worker_start_duration_seconds"
|
||||
PodStatusSyncDurationKey = "pod_status_sync_duration_seconds"
|
||||
PLEGRelistDurationKey = "pleg_relist_duration_seconds"
|
||||
PLEGDiscardEventsKey = "pleg_discard_events"
|
||||
PLEGRelistIntervalKey = "pleg_relist_interval_seconds"
|
||||
@ -53,6 +54,7 @@ const (
|
||||
VolumeStatsHealthStatusAbnormalKey = "volume_stats_health_status_abnormal"
|
||||
RunningPodsKey = "running_pods"
|
||||
RunningContainersKey = "running_containers"
|
||||
|
||||
// Metrics keys of remote runtime operations
|
||||
RuntimeOperationsKey = "runtime_operations_total"
|
||||
RuntimeOperationsDurationKey = "runtime_operations_duration_seconds"
|
||||
@ -156,6 +158,18 @@ var (
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
)
|
||||
// PodStatusSyncDuration is a Histogram that tracks the duration (in seconds) in takes from the time a pod
|
||||
// status is generated to the time it is synced with the apiserver. If multiple status changes are generated
|
||||
// on a pod before it is written to the API, the latency is from the first update to the last event.
|
||||
PodStatusSyncDuration = metrics.NewHistogram(
|
||||
&metrics.HistogramOpts{
|
||||
Subsystem: KubeletSubsystem,
|
||||
Name: PodStatusSyncDurationKey,
|
||||
Help: "Duration in seconds to sync a pod status update. Measures time from detection of a change to pod status until the API is successfully updated for that pod, even if multiple intevening changes to pod status occur.",
|
||||
Buckets: []float64{0.010, 0.050, 0.100, 0.500, 1, 5, 10, 20, 30, 45, 60},
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
)
|
||||
// PLEGRelistDuration is a Histogram that tracks the duration (in seconds) it takes for relisting pods in the Kubelet's
|
||||
// Pod Lifecycle Event Generator (PLEG).
|
||||
PLEGRelistDuration = metrics.NewHistogram(
|
||||
@ -496,6 +510,7 @@ func Register(collectors ...metrics.StableCollector) {
|
||||
legacyregistry.MustRegister(PodStartDuration)
|
||||
legacyregistry.MustRegister(CgroupManagerDuration)
|
||||
legacyregistry.MustRegister(PodWorkerStartDuration)
|
||||
legacyregistry.MustRegister(PodStatusSyncDuration)
|
||||
legacyregistry.MustRegister(ContainersPerPodCount)
|
||||
legacyregistry.MustRegister(PLEGRelistDuration)
|
||||
legacyregistry.MustRegister(PLEGDiscardEvents)
|
||||
|
@ -37,6 +37,7 @@ import (
|
||||
"k8s.io/klog/v2"
|
||||
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
|
||||
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
|
||||
"k8s.io/kubernetes/pkg/kubelet/metrics"
|
||||
kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
|
||||
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
|
||||
statusutil "k8s.io/kubernetes/pkg/util/pod"
|
||||
@ -45,12 +46,15 @@ import (
|
||||
// A wrapper around v1.PodStatus that includes a version to enforce that stale pod statuses are
|
||||
// not sent to the API server.
|
||||
type versionedPodStatus struct {
|
||||
status v1.PodStatus
|
||||
// Monotonically increasing version number (per pod).
|
||||
// version is a monotonically increasing version number (per pod).
|
||||
version uint64
|
||||
// Pod name & namespace, for sending updates to API server.
|
||||
podName string
|
||||
podNamespace string
|
||||
// at is the time at which the most recent status update was detected
|
||||
at time.Time
|
||||
|
||||
status v1.PodStatus
|
||||
}
|
||||
|
||||
type podStatusSyncRequest struct {
|
||||
@ -545,6 +549,16 @@ func (m *manager) updateStatusInternal(pod *v1.Pod, status v1.PodStatus, forceUp
|
||||
podName: pod.Name,
|
||||
podNamespace: pod.Namespace,
|
||||
}
|
||||
|
||||
// Multiple status updates can be generated before we update the API server,
|
||||
// so we track the time from the first status update until we retire it to
|
||||
// the API.
|
||||
if cachedStatus.at.IsZero() {
|
||||
newStatus.at = time.Now()
|
||||
} else {
|
||||
newStatus.at = cachedStatus.at
|
||||
}
|
||||
|
||||
m.podStatuses[pod.UID] = newStatus
|
||||
|
||||
select {
|
||||
@ -685,7 +699,7 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) {
|
||||
mergedStatus := mergePodStatus(pod.Status, status.status, m.podDeletionSafety.PodCouldHaveRunningContainers(pod))
|
||||
|
||||
newPod, patchBytes, unchanged, err := statusutil.PatchPodStatus(context.TODO(), m.kubeClient, pod.Namespace, pod.Name, pod.UID, pod.Status, mergedStatus)
|
||||
klog.V(3).InfoS("Patch status for pod", "pod", klog.KObj(pod), "patch", string(patchBytes))
|
||||
klog.V(3).InfoS("Patch status for pod", "pod", klog.KObj(pod), "podUID", uid, "patch", string(patchBytes))
|
||||
|
||||
if err != nil {
|
||||
klog.InfoS("Failed to update status for pod", "pod", klog.KObj(pod), "err", err)
|
||||
@ -698,6 +712,14 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) {
|
||||
pod = newPod
|
||||
}
|
||||
|
||||
// measure how long the status update took to propagate from generation to update on the server
|
||||
if status.at.IsZero() {
|
||||
klog.V(3).InfoS("Pod had no status time set", "pod", klog.KObj(pod), "podUID", uid, "version", status.version)
|
||||
} else {
|
||||
duration := time.Now().Sub(status.at).Truncate(time.Millisecond)
|
||||
metrics.PodStatusSyncDuration.Observe(duration.Seconds())
|
||||
}
|
||||
|
||||
m.apiStatusVersions[kubetypes.MirrorPodUID(pod.UID)] = status.version
|
||||
|
||||
// We don't handle graceful deletion of mirror pods.
|
||||
|
Loading…
Reference in New Issue
Block a user