mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-08-02 00:07:50 +00:00
Merge pull request #121720 from aojea/first_pod_network_startup
kubelet: add internal metric for the first pod with network latency
This commit is contained in:
commit
cbfebf02e8
@ -30,6 +30,7 @@ import (
|
||||
|
||||
// This const block defines the metric names for the kubelet metrics.
|
||||
const (
|
||||
FirstNetworkPodStartSLIDurationKey = "first_network_pod_start_sli_duration_seconds"
|
||||
KubeletSubsystem = "kubelet"
|
||||
NodeNameKey = "node_name"
|
||||
NodeLabelKey = "node"
|
||||
@ -238,6 +239,20 @@ var (
|
||||
[]string{},
|
||||
)
|
||||
|
||||
// FirstNetworkPodStartSLIDuration is a gauge that tracks the duration (in seconds) it takes for the first network pod to run,
|
||||
// excluding the time for image pulling. This is an internal and temporary metric required because of the existing limitations of the
|
||||
// existing networking subsystem and CRI/CNI implementations that will be solved by https://github.com/containernetworking/cni/issues/859
|
||||
// The metric represents the latency observed by an user to run workloads in a new node.
|
||||
// ref: https://github.com/kubernetes/community/blob/master/sig-scalability/slos/pod_startup_latency.md
|
||||
FirstNetworkPodStartSLIDuration = metrics.NewGauge(
|
||||
&metrics.GaugeOpts{
|
||||
Subsystem: KubeletSubsystem,
|
||||
Name: FirstNetworkPodStartSLIDurationKey,
|
||||
Help: "Duration in seconds to start the first network pod, excluding time to pull images and run init containers, measured from pod creation timestamp to when all its containers are reported as started and observed via watch",
|
||||
StabilityLevel: metrics.INTERNAL,
|
||||
},
|
||||
)
|
||||
|
||||
// CgroupManagerDuration is a Histogram that tracks the duration (in seconds) it takes for cgroup manager operations to complete.
|
||||
// Broken down by method.
|
||||
CgroupManagerDuration = metrics.NewHistogramVec(
|
||||
@ -900,6 +915,7 @@ var registerMetrics sync.Once
|
||||
func Register(collectors ...metrics.StableCollector) {
|
||||
// Register the metrics.
|
||||
registerMetrics.Do(func() {
|
||||
legacyregistry.MustRegister(FirstNetworkPodStartSLIDuration)
|
||||
legacyregistry.MustRegister(NodeName)
|
||||
legacyregistry.MustRegister(PodWorkerDuration)
|
||||
legacyregistry.MustRegister(PodStartDuration)
|
||||
|
@ -41,6 +41,8 @@ type basicPodStartupLatencyTracker struct {
|
||||
// protect against concurrent read and write on pods map
|
||||
lock sync.Mutex
|
||||
pods map[types.UID]*perPodState
|
||||
// metrics for the first network pod only
|
||||
firstNetworkPodSeen bool
|
||||
// For testability
|
||||
clock clock.Clock
|
||||
}
|
||||
@ -112,6 +114,12 @@ func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when tim
|
||||
metrics.PodStartSLIDuration.WithLabelValues().Observe(podStartSLOduration)
|
||||
metrics.PodStartTotalDuration.WithLabelValues().Observe(podStartingDuration.Seconds())
|
||||
state.metricRecorded = true
|
||||
// if is the first Pod with network track the start values
|
||||
// these metrics will help to identify problems with the CNI plugin
|
||||
if !pod.Spec.HostNetwork && !p.firstNetworkPodSeen {
|
||||
metrics.FirstNetworkPodStartSLIDuration.Set(podStartSLOduration)
|
||||
p.firstNetworkPodSeen = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -275,6 +275,69 @@ kubelet_pod_start_sli_duration_seconds_count 1
|
||||
})
|
||||
}
|
||||
|
||||
func TestFirstNetworkPodMetrics(t *testing.T) {
|
||||
|
||||
t.Run("first network pod; started in 30s, image pulling between 10th and 20th seconds", func(t *testing.T) {
|
||||
|
||||
wants := `
|
||||
# HELP kubelet_first_network_pod_start_sli_duration_seconds [INTERNAL] Duration in seconds to start the first network pod, excluding time to pull images and run init containers, measured from pod creation timestamp to when all its containers are reported as started and observed via watch
|
||||
# TYPE kubelet_first_network_pod_start_sli_duration_seconds gauge
|
||||
kubelet_first_network_pod_start_sli_duration_seconds 30
|
||||
`
|
||||
|
||||
fakeClock := testingclock.NewFakeClock(frozenTime)
|
||||
|
||||
metrics.Register()
|
||||
|
||||
tracker := &basicPodStartupLatencyTracker{
|
||||
pods: map[types.UID]*perPodState{},
|
||||
clock: fakeClock,
|
||||
}
|
||||
|
||||
// hostNetwork pods should not be tracked
|
||||
hostNetworkPodInitializing := buildInitializingPod()
|
||||
hostNetworkPodInitializing.UID = "11111-22222"
|
||||
hostNetworkPodInitializing.Spec.HostNetwork = true
|
||||
tracker.ObservedPodOnWatch(hostNetworkPodInitializing, frozenTime)
|
||||
|
||||
hostNetworkPodStarted := buildRunningPod()
|
||||
hostNetworkPodStarted.UID = "11111-22222"
|
||||
hostNetworkPodStarted.Spec.HostNetwork = true
|
||||
tracker.RecordStatusUpdated(hostNetworkPodStarted)
|
||||
|
||||
// track only the first pod with network
|
||||
podInitializing := buildInitializingPod()
|
||||
tracker.ObservedPodOnWatch(podInitializing, frozenTime)
|
||||
|
||||
// pod started
|
||||
podStarted := buildRunningPod()
|
||||
tracker.RecordStatusUpdated(podStarted)
|
||||
|
||||
// at 30s observe the same pod on watch
|
||||
tracker.ObservedPodOnWatch(podStarted, frozenTime.Add(time.Second*30))
|
||||
|
||||
if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), "kubelet_first_network_pod_start_sli_duration_seconds", "kubelet_first_network_pod_start_total_duration_seconds"); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// any new pod observations should not impact the metrics, as the pod should be recorder only once
|
||||
tracker.ObservedPodOnWatch(podStarted, frozenTime.Add(time.Second*150))
|
||||
tracker.ObservedPodOnWatch(podStarted, frozenTime.Add(time.Second*200))
|
||||
tracker.ObservedPodOnWatch(podStarted, frozenTime.Add(time.Second*250))
|
||||
|
||||
if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), "kubelet_first_network_pod_start_sli_duration_seconds", "kubelet_first_network_pod_start_total_duration_seconds"); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// cleanup
|
||||
tracker.DeletePodStartupState(hostNetworkPodStarted.UID)
|
||||
tracker.DeletePodStartupState(podStarted.UID)
|
||||
|
||||
assert.Empty(t, tracker.pods)
|
||||
metrics.PodStartSLIDuration.Reset()
|
||||
})
|
||||
}
|
||||
|
||||
func buildInitializingPod() *corev1.Pod {
|
||||
return buildPodWithStatus([]corev1.ContainerStatus{
|
||||
{State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "PodInitializing"}}},
|
||||
|
Loading…
Reference in New Issue
Block a user