Merge pull request #121041 from ruiwen-zhao/sli-add-pull

Add metric for e2e pod startup latency including image pull
This commit is contained in:
Kubernetes Prow Robot 2023-10-26 01:07:43 +02:00 committed by GitHub
commit de708905d8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 29 additions and 2 deletions

View File

@ -41,6 +41,7 @@ const (
PodWorkerDurationKey = "pod_worker_duration_seconds"
PodStartDurationKey = "pod_start_duration_seconds"
PodStartSLIDurationKey = "pod_start_sli_duration_seconds"
PodStartTotalDurationKey = "pod_start_total_duration_seconds"
CgroupManagerOperationsKey = "cgroup_manager_duration_seconds"
PodWorkerStartDurationKey = "pod_worker_start_duration_seconds"
PodStatusSyncDurationKey = "pod_status_sync_duration_seconds"
@ -125,6 +126,10 @@ const (
EphemeralContainer = "ephemeral_container"
)
// podStartupDurationBuckets holds the histogram bucket boundaries, in seconds,
// shared by the pod startup latency histograms declared in this file. The
// boundaries run from half a second up to one hour.
var podStartupDurationBuckets = []float64{
	0.5, 1, 2, 3, 4, 5, 6, 8, 10,
	20, 30, 45, 60,
	120, 180, 240, 300, 360, 480, 600,
	900, 1200, 1800, 2700, 3600,
}
var (
// NodeName is a Gauge that tracks the node's name. The count is always 1.
NodeName = metrics.NewGaugeVec(
@ -165,7 +170,7 @@ var (
Subsystem: KubeletSubsystem,
Name: PodStartDurationKey,
Help: "Duration in seconds from kubelet seeing a pod for the first time to the pod starting to run",
Buckets: []float64{0.5, 1, 2, 3, 4, 5, 6, 8, 10, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600},
Buckets: podStartupDurationBuckets,
StabilityLevel: metrics.ALPHA,
},
)
@ -182,11 +187,30 @@ var (
Subsystem: KubeletSubsystem,
Name: PodStartSLIDurationKey,
Help: "Duration in seconds to start a pod, excluding time to pull images and run init containers, measured from pod creation timestamp to when all its containers are reported as started and observed via watch",
Buckets: []float64{0.5, 1, 2, 3, 4, 5, 6, 8, 10, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600},
Buckets: podStartupDurationBuckets,
StabilityLevel: metrics.ALPHA,
},
[]string{},
)
// PodStartTotalDuration is a Histogram that tracks the duration (in seconds) it takes for a single pod to run
// since creation, including the time spent pulling images and running init containers. It declares no labels.
//
// It uses the shared podStartupDurationBuckets as its histogram bucket boundaries, measured in seconds. These
// are hand-picked so as to be roughly exponential but still round numbers in everyday units. This is to
// minimise the number of buckets while allowing accurate measurement of thresholds which might be used in
// SLOs, e.g. x% of pods start up within 30 seconds, or 15 minutes, etc.
PodStartTotalDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: KubeletSubsystem,
Name: PodStartTotalDurationKey,
Help: "Duration in seconds to start a pod since creation, including time to pull images and run init containers, measured from pod creation timestamp to when all its containers are reported as started and observed via watch",
Buckets: podStartupDurationBuckets,
StabilityLevel: metrics.ALPHA,
},
[]string{},
)
// CgroupManagerDuration is a Histogram that tracks the duration (in seconds) it takes for cgroup manager operations to complete.
// Broken down by method.
CgroupManagerDuration = metrics.NewHistogramVec(
@ -810,6 +834,7 @@ func Register(collectors ...metrics.StableCollector) {
legacyregistry.MustRegister(PodWorkerDuration)
legacyregistry.MustRegister(PodStartDuration)
legacyregistry.MustRegister(PodStartSLIDuration)
legacyregistry.MustRegister(PodStartTotalDuration)
legacyregistry.MustRegister(NodeStartupPreKubeletDuration)
legacyregistry.MustRegister(NodeStartupPreRegistrationDuration)
legacyregistry.MustRegister(NodeStartupRegistrationDuration)

View File

@ -102,6 +102,7 @@ func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when tim
klog.InfoS("Observed pod startup duration",
"pod", klog.KObj(pod),
"podStartSLOduration", podStartSLOduration,
"podStartE2EDuration", podStartingDuration,
"podCreationTimestamp", pod.CreationTimestamp.Time,
"firstStartedPulling", state.firstStartedPulling,
"lastFinishedPulling", state.lastFinishedPulling,
@ -109,6 +110,7 @@ func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when tim
"watchObservedRunningTime", when)
metrics.PodStartSLIDuration.WithLabelValues().Observe(podStartSLOduration)
metrics.PodStartTotalDuration.WithLabelValues().Observe(podStartingDuration.Seconds())
state.metricRecorded = true
}
}