Add metric for e2e pod startup latency including image pull

Signed-off-by: ruiwen-zhao <ruiwen@google.com>
This commit is contained in:
ruiwen-zhao 2023-10-24 22:22:51 +00:00
parent af52a7052b
commit 1165609036
2 changed files with 29 additions and 2 deletions

View File

@ -41,6 +41,7 @@ const (
PodWorkerDurationKey = "pod_worker_duration_seconds" PodWorkerDurationKey = "pod_worker_duration_seconds"
PodStartDurationKey = "pod_start_duration_seconds" PodStartDurationKey = "pod_start_duration_seconds"
PodStartSLIDurationKey = "pod_start_sli_duration_seconds" PodStartSLIDurationKey = "pod_start_sli_duration_seconds"
PodStartTotalDurationKey = "pod_start_total_duration_seconds"
CgroupManagerOperationsKey = "cgroup_manager_duration_seconds" CgroupManagerOperationsKey = "cgroup_manager_duration_seconds"
PodWorkerStartDurationKey = "pod_worker_start_duration_seconds" PodWorkerStartDurationKey = "pod_worker_start_duration_seconds"
PodStatusSyncDurationKey = "pod_status_sync_duration_seconds" PodStatusSyncDurationKey = "pod_status_sync_duration_seconds"
@ -125,6 +126,10 @@ const (
EphemeralContainer = "ephemeral_container" EphemeralContainer = "ephemeral_container"
) )
// podStartupDurationBuckets holds the histogram bucket boundaries, in seconds,
// shared by the pod startup latency histograms declared in this file.
var podStartupDurationBuckets = []float64{
	// Half a second up to ten seconds.
	0.5, 1, 2, 3, 4, 5, 6, 8, 10,
	// Twenty seconds up to one minute.
	20, 30, 45, 60,
	// Two minutes up to ten minutes.
	120, 180, 240, 300, 360, 480, 600,
	// Fifteen minutes up to one hour.
	900, 1200, 1800, 2700, 3600,
}
var ( var (
// NodeName is a Gauge that tracks the node's name. The count is always 1. // NodeName is a Gauge that tracks the node's name. The count is always 1.
NodeName = metrics.NewGaugeVec( NodeName = metrics.NewGaugeVec(
@ -165,7 +170,7 @@ var (
Subsystem: KubeletSubsystem, Subsystem: KubeletSubsystem,
Name: PodStartDurationKey, Name: PodStartDurationKey,
Help: "Duration in seconds from kubelet seeing a pod for the first time to the pod starting to run", Help: "Duration in seconds from kubelet seeing a pod for the first time to the pod starting to run",
Buckets: []float64{0.5, 1, 2, 3, 4, 5, 6, 8, 10, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600}, Buckets: podStartupDurationBuckets,
StabilityLevel: metrics.ALPHA, StabilityLevel: metrics.ALPHA,
}, },
) )
@ -182,11 +187,30 @@ var (
Subsystem: KubeletSubsystem, Subsystem: KubeletSubsystem,
Name: PodStartSLIDurationKey, Name: PodStartSLIDurationKey,
Help: "Duration in seconds to start a pod, excluding time to pull images and run init containers, measured from pod creation timestamp to when all its containers are reported as started and observed via watch", Help: "Duration in seconds to start a pod, excluding time to pull images and run init containers, measured from pod creation timestamp to when all its containers are reported as started and observed via watch",
Buckets: []float64{0.5, 1, 2, 3, 4, 5, 6, 8, 10, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600}, Buckets: podStartupDurationBuckets,
StabilityLevel: metrics.ALPHA, StabilityLevel: metrics.ALPHA,
}, },
[]string{}, []string{},
) )
// PodStartTotalDuration is a Histogram that tracks the duration (in seconds) it takes for a single pod to start
// running since its creation, including the time spent pulling images.
//
// The histogram bucket boundaries for pod startup latency metrics, measured in seconds. These are hand-picked
// so as to be roughly exponential but still round numbers in everyday units. This is to minimise the number
// of buckets while allowing accurate measurement of thresholds which might be used in SLOs
// e.g. x% of pods start up within 30 seconds, or 15 minutes, etc.
//
// The metric is declared with an empty label set, so observations are recorded via WithLabelValues() with no arguments.
PodStartTotalDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: KubeletSubsystem,
Name: PodStartTotalDurationKey,
Help: "Duration in seconds to start a pod since creation, including time to pull images and run init containers, measured from pod creation timestamp to when all its containers are reported as started and observed via watch",
// Shares its bucket boundaries with the other pod startup histograms.
Buckets: podStartupDurationBuckets,
StabilityLevel: metrics.ALPHA,
},
[]string{},
)
// CgroupManagerDuration is a Histogram that tracks the duration (in seconds) it takes for cgroup manager operations to complete. // CgroupManagerDuration is a Histogram that tracks the duration (in seconds) it takes for cgroup manager operations to complete.
// Broken down by method. // Broken down by method.
CgroupManagerDuration = metrics.NewHistogramVec( CgroupManagerDuration = metrics.NewHistogramVec(
@ -810,6 +834,7 @@ func Register(collectors ...metrics.StableCollector) {
legacyregistry.MustRegister(PodWorkerDuration) legacyregistry.MustRegister(PodWorkerDuration)
legacyregistry.MustRegister(PodStartDuration) legacyregistry.MustRegister(PodStartDuration)
legacyregistry.MustRegister(PodStartSLIDuration) legacyregistry.MustRegister(PodStartSLIDuration)
legacyregistry.MustRegister(PodStartTotalDuration)
legacyregistry.MustRegister(NodeStartupPreKubeletDuration) legacyregistry.MustRegister(NodeStartupPreKubeletDuration)
legacyregistry.MustRegister(NodeStartupPreRegistrationDuration) legacyregistry.MustRegister(NodeStartupPreRegistrationDuration)
legacyregistry.MustRegister(NodeStartupRegistrationDuration) legacyregistry.MustRegister(NodeStartupRegistrationDuration)

View File

@ -102,6 +102,7 @@ func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when tim
klog.InfoS("Observed pod startup duration", klog.InfoS("Observed pod startup duration",
"pod", klog.KObj(pod), "pod", klog.KObj(pod),
"podStartSLOduration", podStartSLOduration, "podStartSLOduration", podStartSLOduration,
"podStartE2EDuration", podStartingDuration,
"podCreationTimestamp", pod.CreationTimestamp.Time, "podCreationTimestamp", pod.CreationTimestamp.Time,
"firstStartedPulling", state.firstStartedPulling, "firstStartedPulling", state.firstStartedPulling,
"lastFinishedPulling", state.lastFinishedPulling, "lastFinishedPulling", state.lastFinishedPulling,
@ -109,6 +110,7 @@ func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when tim
"watchObservedRunningTime", when) "watchObservedRunningTime", when)
metrics.PodStartSLIDuration.WithLabelValues().Observe(podStartSLOduration) metrics.PodStartSLIDuration.WithLabelValues().Observe(podStartSLOduration)
metrics.PodStartTotalDuration.WithLabelValues().Observe(podStartingDuration.Seconds())
state.metricRecorded = true state.metricRecorded = true
} }
} }