From cc0b3e82eb81d5ab0a311a309112ad579d92f0ce Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Fri, 13 Dec 2019 13:48:26 +0000 Subject: [PATCH] Kubelet: add a metric to observe time since PLEG last seen Expose the measurement that kubelet uses to judge that "PLEG is unhealthy". The gauge holds the Unix timestamp at which PLEG was last seen active, so the elapsed time is derived at query time as `time() - pleg_last_seen_seconds`. If we can observe the measurement growing then we can alert before the node goes unhealthy. Note that the existing metrics PLEGRelistInterval and PLEGRelistDuration are poor for this, because when relist() gets stuck they are never updated. Signed-off-by: Bryan Boreham --- pkg/kubelet/metrics/metrics.go | 12 ++++++++++++ pkg/kubelet/pleg/generic.go | 2 ++ 2 files changed, 14 insertions(+) diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go index ff12aaa31c5..5eb0b741c3d 100644 --- a/pkg/kubelet/metrics/metrics.go +++ b/pkg/kubelet/metrics/metrics.go @@ -44,6 +44,7 @@ const ( PLEGRelistDurationKey = "pleg_relist_duration_seconds" PLEGDiscardEventsKey = "pleg_discard_events" PLEGRelistIntervalKey = "pleg_relist_interval_seconds" + PLEGLastSeenKey = "pleg_last_seen_seconds" EvictionsKey = "evictions" EvictionStatsAgeKey = "eviction_stats_age_seconds" PreemptionsKey = "preemptions" @@ -187,6 +188,16 @@ var ( StabilityLevel: metrics.ALPHA, }, ) + // PLEGLastSeen is a Gauge giving the Unix timestamp when the Kubelet's + // Pod Lifecycle Event Generator (PLEG) was last seen active. + PLEGLastSeen = metrics.NewGauge( + &metrics.GaugeOpts{ + Subsystem: KubeletSubsystem, + Name: PLEGLastSeenKey, + Help: "Timestamp in seconds when PLEG was last seen active.", + StabilityLevel: metrics.ALPHA, + }, + ) // RuntimeOperations is a Counter that tracks the cumulative number of remote runtime operations. // Broken down by operation type. 
RuntimeOperations = metrics.NewCounterVec( @@ -523,6 +534,7 @@ func Register(containerCache kubecontainer.RuntimeCache, collectors ...metrics.S legacyregistry.MustRegister(PLEGRelistDuration) legacyregistry.MustRegister(PLEGDiscardEvents) legacyregistry.MustRegister(PLEGRelistInterval) + legacyregistry.MustRegister(PLEGLastSeen) legacyregistry.MustRegister(RuntimeOperations) legacyregistry.MustRegister(RuntimeOperationsDuration) legacyregistry.MustRegister(RuntimeOperationsErrors) diff --git a/pkg/kubelet/pleg/generic.go b/pkg/kubelet/pleg/generic.go index 9fda1be6b48..611c754d544 100644 --- a/pkg/kubelet/pleg/generic.go +++ b/pkg/kubelet/pleg/generic.go @@ -138,6 +138,8 @@ func (g *GenericPLEG) Healthy() (bool, error) { if relistTime.IsZero() { return false, fmt.Errorf("pleg has yet to be successful") } + // Expose as metric so you can alert on `time()-pleg_last_seen_seconds > nn` + metrics.PLEGLastSeen.Set(float64(relistTime.Unix())) elapsed := g.clock.Since(relistTime) if elapsed > relistThreshold { return false, fmt.Errorf("pleg was last seen active %v ago; threshold is %v", elapsed, relistThreshold)