Merge pull request #115967 from harche/evented_pleg_metrics

Graduate Evented PLEG to Beta
This commit is contained in:
Kubernetes Prow Robot 2023-03-10 17:34:40 -08:00 committed by GitHub
commit c6f3007071
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 55 additions and 1 deletions

View File

@ -259,6 +259,7 @@ const (
// owner: @harche
// kep: http://kep.k8s.io/3386
// alpha: v1.25
// beta: v1.27
//
// Allows the kubelet to use the event-driven PLEG (pod lifecycle event generator),
// which avoids frequent relisting of containers and thereby helps optimize performance.
@ -937,7 +938,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS
DynamicResourceAllocation: {Default: false, PreRelease: featuregate.Alpha},
EventedPLEG: {Default: false, PreRelease: featuregate.Alpha},
EventedPLEG: {Default: false, PreRelease: featuregate.Beta}, // off by default, requires CRI Runtime support
ExecProbeTimeout: {Default: true, PreRelease: featuregate.GA}, // lock to default and remove after v1.22 based on KEP #1972 update

View File

@ -37,8 +37,10 @@ import (
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/metrics"
"k8s.io/kubernetes/pkg/kubelet/util"
"k8s.io/kubernetes/pkg/probe/exec"
utilexec "k8s.io/utils/exec"
)
@ -797,6 +799,9 @@ func (r *remoteRuntimeService) GetContainerEvents(containerEventsCh chan *runtim
return err
}
// The connection is successfully established and we have a streaming client ready for use.
metrics.EventedPLEGConn.Inc()
for {
resp, err := containerEventsStreamingClient.Recv()
if err == io.EOF {

View File

@ -43,6 +43,9 @@ const (
PLEGDiscardEventsKey = "pleg_discard_events"
PLEGRelistIntervalKey = "pleg_relist_interval_seconds"
PLEGLastSeenKey = "pleg_last_seen_seconds"
EventedPLEGConnErrKey = "evented_pleg_connection_error_count"
EventedPLEGConnKey = "evented_pleg_connection_success_count"
EventedPLEGConnLatencyKey = "evented_pleg_connection_latency_seconds"
EvictionsKey = "evictions"
EvictionStatsAgeKey = "eviction_stats_age_seconds"
PreemptionsKey = "preemptions"
@ -250,6 +253,41 @@ var (
StabilityLevel: metrics.ALPHA,
},
)
// EventedPLEGConnErr is a Counter that tracks the number of errors encountered during
// the establishment of streaming connection with the CRI runtime.
//
// It is registered under KubeletSubsystem with the name EventedPLEGConnErrKey.
// EventedPLEG.watchEventsChannel increments it each time GetContainerEvents
// returns a non-nil error, right before it bumps its retry counter and forces a relist.
//
// NOTE(review): because the increment is keyed on any error returned by
// GetContainerEvents, errors raised after the stream was already established may be
// counted here as well — confirm against the full body of GetContainerEvents.
EventedPLEGConnErr = metrics.NewCounter(
&metrics.CounterOpts{
Subsystem: KubeletSubsystem,
Name: EventedPLEGConnErrKey,
Help: "The number of errors encountered during the establishment of streaming connection with the CRI runtime.",
StabilityLevel: metrics.ALPHA,
},
)
// EventedPLEGConn is a Counter that tracks the number of times a streaming client
// was obtained to receive CRI Events.
//
// It is registered under KubeletSubsystem with the name EventedPLEGConnKey.
// remoteRuntimeService.GetContainerEvents increments it once per call, after the
// streaming client has been created without error and before entering the receive
// loop, so it counts successful stream setups rather than individual events.
EventedPLEGConn = metrics.NewCounter(
&metrics.CounterOpts{
Subsystem: KubeletSubsystem,
Name: EventedPLEGConnKey,
Help: "The number of times a streaming client was obtained to receive CRI Events.",
StabilityLevel: metrics.ALPHA,
},
)
// EventedPLEGConnLatency is a Histogram that tracks the latency of streaming connection
// with the CRI runtime, measured in seconds.
//
// It is registered under KubeletSubsystem with the name EventedPLEGConnLatencyKey and
// uses the default bucket layout (metrics.DefBuckets).
//
// NOTE(review): the observation site visible in this change is
// EventedPLEG.updateLatencyMetric, which records one sample per processed CRI event,
// computed as the current time minus the event's CreatedAt timestamp. The samples are
// therefore per-event delivery delays, not connection-setup times as the name and Help
// text suggest — confirm the intended meaning before relying on this metric.
EventedPLEGConnLatency = metrics.NewHistogram(
&metrics.HistogramOpts{
Subsystem: KubeletSubsystem,
Name: EventedPLEGConnLatencyKey,
Help: "The latency of streaming connection with the CRI runtime, measured in seconds.",
Buckets: metrics.DefBuckets,
StabilityLevel: metrics.ALPHA,
},
)
// RuntimeOperations is a Counter that tracks the cumulative number of remote runtime operations.
// Broken down by operation type.
RuntimeOperations = metrics.NewCounterVec(
@ -692,6 +730,9 @@ func Register(collectors ...metrics.StableCollector) {
legacyregistry.MustRegister(PLEGDiscardEvents)
legacyregistry.MustRegister(PLEGRelistInterval)
legacyregistry.MustRegister(PLEGLastSeen)
legacyregistry.MustRegister(EventedPLEGConnErr)
legacyregistry.MustRegister(EventedPLEGConn)
legacyregistry.MustRegister(EventedPLEGConnLatency)
legacyregistry.MustRegister(RuntimeOperations)
legacyregistry.MustRegister(RuntimeOperationsDuration)
legacyregistry.MustRegister(RuntimeOperationsErrors)

View File

@ -190,6 +190,7 @@ func (e *EventedPLEG) watchEventsChannel() {
err := e.runtimeService.GetContainerEvents(containerEventsResponseCh)
if err != nil {
metrics.EventedPLEGConnErr.Inc()
numAttempts++
e.Relist() // Force a relist to get the latest container and pods running metric.
klog.V(4).InfoS("Evented PLEG: Failed to get container events, retrying: ", "err", err)
@ -245,6 +246,7 @@ func (e *EventedPLEG) processCRIEvents(containerEventsResponseCh chan *runtimeap
e.updateRunningPodMetric(status)
e.updateRunningContainerMetric(status)
e.updateLatencyMetric(event)
if event.ContainerEventType == runtimeapi.ContainerEventType_CONTAINER_DELETED_EVENT {
for _, sandbox := range status.SandboxStatuses {
@ -410,6 +412,11 @@ func (e *EventedPLEG) updateRunningContainerMetric(podStatus *kubecontainer.PodS
}
}
// updateLatencyMetric records, in seconds, how much time has passed between the
// creation of the given CRI container event and the moment the kubelet handles it.
//
// event.CreatedAt is interpreted as nanoseconds since the Unix epoch. The resulting
// sample is fed into metrics.EventedPLEGConnLatency.
//
// NOTE(review): if CreatedAt lies ahead of the local wall clock the sample is
// negative; it is observed as-is, without clamping.
func (e *EventedPLEG) updateLatencyMetric(event *runtimeapi.ContainerEventResponse) {
	// time.Unix(0, ns) carries no monotonic clock reading, so time.Since performs a
	// plain wall-clock subtraction against the event's creation timestamp.
	elapsed := time.Since(time.Unix(0, event.CreatedAt))
	metrics.EventedPLEGConnLatency.Observe(elapsed.Seconds())
}
// UpdateCache is not supported by EventedPLEG: it ignores both arguments and always
// returns a "not implemented" error together with false.
//
// NOTE(review): the (error, bool) result order is unusual for Go; presumably it is
// dictated by an interface declared elsewhere — confirm before changing it.
func (e *EventedPLEG) UpdateCache(pod *kubecontainer.Pod, pid types.UID) (error, bool) {
return fmt.Errorf("not implemented"), false
}