mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-08-08 11:38:15 +00:00
kubelet: add metric for the first pod with network latency
The latency of the first pod with network impacts user workloads; however, it is difficult to understand the source of this latency, since it depends on the CNI plugin being ready at the moment of pod creation. Add a new internal metric in the kubelet that allows developers and cluster administrators to understand the source of latency problems on node startup. kubelet_first_network_pod_start_sli_duration_seconds Change-Id: I4cdb55b0df72c96a3a65b78ce2aae404c5195006
This commit is contained in:
parent
3716bd7681
commit
b8533f7976
@ -30,6 +30,7 @@ import (
|
||||
|
||||
// This const block defines the metric names for the kubelet metrics.
|
||||
const (
|
||||
FirstNetworkPodStartSLIDurationKey = "first_network_pod_start_sli_duration_seconds"
|
||||
KubeletSubsystem = "kubelet"
|
||||
NodeNameKey = "node_name"
|
||||
NodeLabelKey = "node"
|
||||
@ -211,6 +212,20 @@ var (
|
||||
[]string{},
|
||||
)
|
||||
|
||||
// FirstNetworkPodStartSLIDuration is a gauge that tracks the duration (in seconds) it takes for the first network pod to run,
|
||||
// excluding the time for image pulling. This is an internal and temporary metric required because of the existing limitations of the
|
||||
// existing networking subsystem and CRI/CNI implementations that will be solved by https://github.com/containernetworking/cni/issues/859
|
||||
// The metric represents the latency observed by an user to run workloads in a new node.
|
||||
// ref: https://github.com/kubernetes/community/blob/master/sig-scalability/slos/pod_startup_latency.md
|
||||
FirstNetworkPodStartSLIDuration = metrics.NewGauge(
|
||||
&metrics.GaugeOpts{
|
||||
Subsystem: KubeletSubsystem,
|
||||
Name: FirstNetworkPodStartSLIDurationKey,
|
||||
Help: "Duration in seconds to start the first network pod, excluding time to pull images and run init containers, measured from pod creation timestamp to when all its containers are reported as started and observed via watch",
|
||||
StabilityLevel: metrics.INTERNAL,
|
||||
},
|
||||
)
|
||||
|
||||
// CgroupManagerDuration is a Histogram that tracks the duration (in seconds) it takes for cgroup manager operations to complete.
|
||||
// Broken down by method.
|
||||
CgroupManagerDuration = metrics.NewHistogramVec(
|
||||
@ -830,6 +845,7 @@ var registerMetrics sync.Once
|
||||
func Register(collectors ...metrics.StableCollector) {
|
||||
// Register the metrics.
|
||||
registerMetrics.Do(func() {
|
||||
legacyregistry.MustRegister(FirstNetworkPodStartSLIDuration)
|
||||
legacyregistry.MustRegister(NodeName)
|
||||
legacyregistry.MustRegister(PodWorkerDuration)
|
||||
legacyregistry.MustRegister(PodStartDuration)
|
||||
|
@ -41,6 +41,8 @@ type basicPodStartupLatencyTracker struct {
|
||||
// protect against concurrent read and write on pods map
|
||||
lock sync.Mutex
|
||||
pods map[types.UID]*perPodState
|
||||
// metrics for the first network pod only
|
||||
firstNetworkPodSeen bool
|
||||
// For testability
|
||||
clock clock.Clock
|
||||
}
|
||||
@ -112,6 +114,12 @@ func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when tim
|
||||
metrics.PodStartSLIDuration.WithLabelValues().Observe(podStartSLOduration)
|
||||
metrics.PodStartTotalDuration.WithLabelValues().Observe(podStartingDuration.Seconds())
|
||||
state.metricRecorded = true
|
||||
// if is the first Pod with network track the start values
|
||||
// these metrics will help to identify problems with the CNI plugin
|
||||
if !pod.Spec.HostNetwork && !p.firstNetworkPodSeen {
|
||||
metrics.FirstNetworkPodStartSLIDuration.Set(podStartSLOduration)
|
||||
p.firstNetworkPodSeen = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -275,6 +275,69 @@ kubelet_pod_start_sli_duration_seconds_count 1
|
||||
})
|
||||
}
|
||||
|
||||
func TestFirstNetworkPodMetrics(t *testing.T) {
|
||||
|
||||
t.Run("first network pod; started in 30s, image pulling between 10th and 20th seconds", func(t *testing.T) {
|
||||
|
||||
wants := `
|
||||
# HELP kubelet_first_network_pod_start_sli_duration_seconds [INTERNAL] Duration in seconds to start the first network pod, excluding time to pull images and run init containers, measured from pod creation timestamp to when all its containers are reported as started and observed via watch
|
||||
# TYPE kubelet_first_network_pod_start_sli_duration_seconds gauge
|
||||
kubelet_first_network_pod_start_sli_duration_seconds 30
|
||||
`
|
||||
|
||||
fakeClock := testingclock.NewFakeClock(frozenTime)
|
||||
|
||||
metrics.Register()
|
||||
|
||||
tracker := &basicPodStartupLatencyTracker{
|
||||
pods: map[types.UID]*perPodState{},
|
||||
clock: fakeClock,
|
||||
}
|
||||
|
||||
// hostNetwork pods should not be tracked
|
||||
hostNetworkPodInitializing := buildInitializingPod()
|
||||
hostNetworkPodInitializing.UID = "11111-22222"
|
||||
hostNetworkPodInitializing.Spec.HostNetwork = true
|
||||
tracker.ObservedPodOnWatch(hostNetworkPodInitializing, frozenTime)
|
||||
|
||||
hostNetworkPodStarted := buildRunningPod()
|
||||
hostNetworkPodStarted.UID = "11111-22222"
|
||||
hostNetworkPodStarted.Spec.HostNetwork = true
|
||||
tracker.RecordStatusUpdated(hostNetworkPodStarted)
|
||||
|
||||
// track only the first pod with network
|
||||
podInitializing := buildInitializingPod()
|
||||
tracker.ObservedPodOnWatch(podInitializing, frozenTime)
|
||||
|
||||
// pod started
|
||||
podStarted := buildRunningPod()
|
||||
tracker.RecordStatusUpdated(podStarted)
|
||||
|
||||
// at 30s observe the same pod on watch
|
||||
tracker.ObservedPodOnWatch(podStarted, frozenTime.Add(time.Second*30))
|
||||
|
||||
if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), "kubelet_first_network_pod_start_sli_duration_seconds", "kubelet_first_network_pod_start_total_duration_seconds"); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// any new pod observations should not impact the metrics, as the pod should be recorder only once
|
||||
tracker.ObservedPodOnWatch(podStarted, frozenTime.Add(time.Second*150))
|
||||
tracker.ObservedPodOnWatch(podStarted, frozenTime.Add(time.Second*200))
|
||||
tracker.ObservedPodOnWatch(podStarted, frozenTime.Add(time.Second*250))
|
||||
|
||||
if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), "kubelet_first_network_pod_start_sli_duration_seconds", "kubelet_first_network_pod_start_total_duration_seconds"); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// cleanup
|
||||
tracker.DeletePodStartupState(hostNetworkPodStarted.UID)
|
||||
tracker.DeletePodStartupState(podStarted.UID)
|
||||
|
||||
assert.Empty(t, tracker.pods)
|
||||
metrics.PodStartSLIDuration.Reset()
|
||||
})
|
||||
}
|
||||
|
||||
func buildInitializingPod() *corev1.Pod {
|
||||
return buildPodWithStatus([]corev1.ContainerStatus{
|
||||
{State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "PodInitializing"}}},
|
||||
|
Loading…
Reference in New Issue
Block a user