Merge pull request #86251 from bboreham/pleg-last-seen-metric

Kubelet: add a metric to observe time since PLEG last seen
This commit is contained in:
Kubernetes Prow Robot 2020-01-06 18:06:18 -08:00 committed by GitHub
commit 49bc696614
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 14 additions and 0 deletions

View File

@ -44,6 +44,7 @@ const (
PLEGRelistDurationKey = "pleg_relist_duration_seconds"
PLEGDiscardEventsKey = "pleg_discard_events"
PLEGRelistIntervalKey = "pleg_relist_interval_seconds"
PLEGLastSeenKey = "pleg_last_seen_seconds"
EvictionsKey = "evictions"
EvictionStatsAgeKey = "eviction_stats_age_seconds"
PreemptionsKey = "preemptions"
@ -186,6 +187,16 @@ var (
StabilityLevel: metrics.ALPHA,
},
)
// PLEGLastSeen is a Gauge giving the Unix timestamp when the Kubelet's
// Pod Lifecycle Event Generator (PLEG) was last seen active.
PLEGLastSeen = metrics.NewGauge(
&metrics.GaugeOpts{
Subsystem: KubeletSubsystem,
Name: PLEGLastSeenKey,
Help: "Timestamp in seconds when PLEG was last seen active.",
StabilityLevel: metrics.ALPHA,
},
)
// RuntimeOperations is a Counter that tracks the cumulative number of remote runtime operations.
// Broken down by operation type.
RuntimeOperations = metrics.NewCounterVec(
@ -522,6 +533,7 @@ func Register(containerCache kubecontainer.RuntimeCache, collectors ...metrics.S
legacyregistry.MustRegister(PLEGRelistDuration)
legacyregistry.MustRegister(PLEGDiscardEvents)
legacyregistry.MustRegister(PLEGRelistInterval)
legacyregistry.MustRegister(PLEGLastSeen)
legacyregistry.MustRegister(RuntimeOperations)
legacyregistry.MustRegister(RuntimeOperationsDuration)
legacyregistry.MustRegister(RuntimeOperationsErrors)

View File

@ -138,6 +138,8 @@ func (g *GenericPLEG) Healthy() (bool, error) {
if relistTime.IsZero() {
return false, fmt.Errorf("pleg has yet to be successful")
}
// Expose as metric so you can alert on `time()-pleg_last_seen_seconds > nn`
metrics.PLEGLastSeen.Set(float64(relistTime.Unix()))
elapsed := g.clock.Since(relistTime)
if elapsed > relistThreshold {
return false, fmt.Errorf("pleg was last seen active %v ago; threshold is %v", elapsed, relistThreshold)