mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-29 14:37:00 +00:00
Kubelet: add a metric to observe time since PLEG last seen
Expose the measurement that kubelet uses to judge that "PLEG is unhealthy". If we can observe the measurement growing then we can alert before the node goes unhealthy. Note that the existing metrics PLEGRelistInterval and PLEGRelistDuration are poor for this, because when relist() gets stuck they are never updated. Signed-off-by: Bryan Boreham <bryan@weave.works>
This commit is contained in:
parent
e622579b14
commit
cc0b3e82eb
@ -44,6 +44,7 @@ const (
|
||||
PLEGRelistDurationKey = "pleg_relist_duration_seconds"
|
||||
PLEGDiscardEventsKey = "pleg_discard_events"
|
||||
PLEGRelistIntervalKey = "pleg_relist_interval_seconds"
|
||||
PLEGLastSeenKey = "pleg_last_seen_seconds"
|
||||
EvictionsKey = "evictions"
|
||||
EvictionStatsAgeKey = "eviction_stats_age_seconds"
|
||||
PreemptionsKey = "preemptions"
|
||||
@ -187,6 +188,16 @@ var (
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
)
|
||||
// PLEGLastSeen is a Gauge giving the Unix timestamp when the Kubelet's
|
||||
// Pod Lifecycle Event Generator (PLEG) was last seen active.
|
||||
PLEGLastSeen = metrics.NewGauge(
|
||||
&metrics.GaugeOpts{
|
||||
Subsystem: KubeletSubsystem,
|
||||
Name: PLEGLastSeenKey,
|
||||
Help: "Timestamp in seconds when PLEG was last seen active.",
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
)
|
||||
// RuntimeOperations is a Counter that tracks the cumulative number of remote runtime operations.
|
||||
// Broken down by operation type.
|
||||
RuntimeOperations = metrics.NewCounterVec(
|
||||
@ -523,6 +534,7 @@ func Register(containerCache kubecontainer.RuntimeCache, collectors ...metrics.S
|
||||
legacyregistry.MustRegister(PLEGRelistDuration)
|
||||
legacyregistry.MustRegister(PLEGDiscardEvents)
|
||||
legacyregistry.MustRegister(PLEGRelistInterval)
|
||||
legacyregistry.MustRegister(PLEGLastSeen)
|
||||
legacyregistry.MustRegister(RuntimeOperations)
|
||||
legacyregistry.MustRegister(RuntimeOperationsDuration)
|
||||
legacyregistry.MustRegister(RuntimeOperationsErrors)
|
||||
|
@ -138,6 +138,8 @@ func (g *GenericPLEG) Healthy() (bool, error) {
|
||||
if relistTime.IsZero() {
|
||||
return false, fmt.Errorf("pleg has yet to be successful")
|
||||
}
|
||||
// Expose as metric so you can alert on `time()-pleg_last_seen_seconds > nn`
|
||||
metrics.PLEGLastSeen.Set(float64(relistTime.Unix()))
|
||||
elapsed := g.clock.Since(relistTime)
|
||||
if elapsed > relistThreshold {
|
||||
return false, fmt.Errorf("pleg was last seen active %v ago; threshold is %v", elapsed, relistThreshold)
|
||||
|
Loading…
Reference in New Issue
Block a user