Kubelet: add a metric to observe time since PLEG last seen

Expose the measurement that kubelet uses to judge that "PLEG is
unhealthy". If we can observe the measurement growing then we can
alert before the node goes unhealthy.

Note that the existing metrics PLEGRelistInterval and
PLEGRelistDuration are poor for this, because when relist() gets
stuck they are never updated.

Signed-off-by: Bryan Boreham <bryan@weave.works>
This commit is contained in:
Bryan Boreham 2019-12-13 13:48:26 +00:00
parent e622579b14
commit cc0b3e82eb
2 changed files with 14 additions and 0 deletions

View File

@ -44,6 +44,7 @@ const (
PLEGRelistDurationKey = "pleg_relist_duration_seconds"
PLEGDiscardEventsKey = "pleg_discard_events"
PLEGRelistIntervalKey = "pleg_relist_interval_seconds"
PLEGLastSeenKey = "pleg_last_seen_seconds"
EvictionsKey = "evictions"
EvictionStatsAgeKey = "eviction_stats_age_seconds"
PreemptionsKey = "preemptions"
@ -187,6 +188,16 @@ var (
StabilityLevel: metrics.ALPHA,
},
)
// PLEGLastSeen is a Gauge giving the Unix timestamp when the Kubelet's
// Pod Lifecycle Event Generator (PLEG) was last seen active.
PLEGLastSeen = metrics.NewGauge(
&metrics.GaugeOpts{
Subsystem: KubeletSubsystem,
Name: PLEGLastSeenKey,
Help: "Timestamp in seconds when PLEG was last seen active.",
StabilityLevel: metrics.ALPHA,
},
)
// RuntimeOperations is a Counter that tracks the cumulative number of remote runtime operations.
// Broken down by operation type.
RuntimeOperations = metrics.NewCounterVec(
@ -523,6 +534,7 @@ func Register(containerCache kubecontainer.RuntimeCache, collectors ...metrics.S
legacyregistry.MustRegister(PLEGRelistDuration)
legacyregistry.MustRegister(PLEGDiscardEvents)
legacyregistry.MustRegister(PLEGRelistInterval)
legacyregistry.MustRegister(PLEGLastSeen)
legacyregistry.MustRegister(RuntimeOperations)
legacyregistry.MustRegister(RuntimeOperationsDuration)
legacyregistry.MustRegister(RuntimeOperationsErrors)

View File

@ -138,6 +138,8 @@ func (g *GenericPLEG) Healthy() (bool, error) {
if relistTime.IsZero() {
return false, fmt.Errorf("pleg has yet to be successful")
}
// Expose as metric so you can alert on `time()-pleg_last_seen_seconds > nn`
metrics.PLEGLastSeen.Set(float64(relistTime.Unix()))
elapsed := g.clock.Since(relistTime)
if elapsed > relistThreshold {
return false, fmt.Errorf("pleg was last seen active %v ago; threshold is %v", elapsed, relistThreshold)