diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go index 1f6ee294846..0e60794b02c 100644 --- a/pkg/kubelet/metrics/metrics.go +++ b/pkg/kubelet/metrics/metrics.go @@ -44,6 +44,7 @@ const ( PLEGRelistDurationKey = "pleg_relist_duration_seconds" PLEGDiscardEventsKey = "pleg_discard_events" PLEGRelistIntervalKey = "pleg_relist_interval_seconds" + PLEGLastSeenKey = "pleg_last_seen_seconds" EvictionsKey = "evictions" EvictionStatsAgeKey = "eviction_stats_age_seconds" PreemptionsKey = "preemptions" @@ -186,6 +187,16 @@ var ( StabilityLevel: metrics.ALPHA, }, ) + // PLEGLastSeen is a Gauge giving the Unix timestamp when the Kubelet's + // Pod Lifecycle Event Generator (PLEG) was last seen active. + PLEGLastSeen = metrics.NewGauge( + &metrics.GaugeOpts{ + Subsystem: KubeletSubsystem, + Name: PLEGLastSeenKey, + Help: "Timestamp in seconds when PLEG was last seen active.", + StabilityLevel: metrics.ALPHA, + }, + ) // RuntimeOperations is a Counter that tracks the cumulative number of remote runtime operations. // Broken down by operation type. RuntimeOperations = metrics.NewCounterVec( @@ -522,6 +533,7 @@ func Register(containerCache kubecontainer.RuntimeCache, collectors ...metrics.S legacyregistry.MustRegister(PLEGRelistDuration) legacyregistry.MustRegister(PLEGDiscardEvents) legacyregistry.MustRegister(PLEGRelistInterval) + legacyregistry.MustRegister(PLEGLastSeen) legacyregistry.MustRegister(RuntimeOperations) legacyregistry.MustRegister(RuntimeOperationsDuration) legacyregistry.MustRegister(RuntimeOperationsErrors) diff --git a/pkg/kubelet/pleg/generic.go b/pkg/kubelet/pleg/generic.go index 1bd39c64ff7..4b3743e8917 100644 --- a/pkg/kubelet/pleg/generic.go +++ b/pkg/kubelet/pleg/generic.go @@ -138,6 +138,8 @@ func (g *GenericPLEG) Healthy() (bool, error) { if relistTime.IsZero() { return false, fmt.Errorf("pleg has yet to be successful") } + // Expose as metric so you can alert on `time()-pleg_last_seen_seconds > nn` + metrics.PLEGLastSeen.Set(float64(relistTime.Unix())) elapsed := g.clock.Since(relistTime) if elapsed > relistThreshold { return false, fmt.Errorf("pleg was last seen active %v ago; threshold is %v", elapsed, relistThreshold)