kubelet: Record preemptions similarly to evictions

A preemption is a disruption event that should have a metric so that
the rate of preemption can be assessed. Nodes that are under heavy
preemption may have conflicting workloads or otherwise need attention.
A sudden burst of preemption on a cluster in steady state could
indicate pathological conditions within the scheduler or workload
controllers.
This commit is contained in:
Clayton Coleman 2019-10-19 19:07:37 -04:00
parent 9c25b16d88
commit 3c44e11cfa
No known key found for this signature in database
GPG Key ID: 3D16906B4F1C5CB3
3 changed files with 24 additions and 2 deletions

View File

@ -18,11 +18,12 @@ package metrics
import (
"fmt"
"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
"sync"
"time"
"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
utilfeature "k8s.io/apiserver/pkg/util/feature"
@ -45,6 +46,7 @@ const (
PLEGRelistIntervalKey = "pleg_relist_interval_seconds"
EvictionsKey = "evictions"
EvictionStatsAgeKey = "eviction_stats_age_seconds"
PreemptionsKey = "preemptions"
DeprecatedPodWorkerLatencyKey = "pod_worker_latency_microseconds"
DeprecatedPodStartLatencyKey = "pod_start_latency_microseconds"
DeprecatedCgroupManagerOperationsKey = "cgroup_manager_latency_microseconds"
@ -242,6 +244,18 @@ var (
},
[]string{"eviction_signal"},
)
// Preemptions is a Counter that tracks the cumulative number of pod preemptions initiated by the kubelet.
// Broken down by preemption signal. A preemption is only recorded for one resource, the sum of all signals
// is the number of preemptions on the given node.
Preemptions = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: KubeletSubsystem,
Name: PreemptionsKey,
Help: "Cumulative number of pod preemptions by preemption resource",
StabilityLevel: metrics.ALPHA,
},
[]string{"preemption_signal"},
)
// DevicePluginRegistrationCount is a Counter that tracks the cumulative number of device plugin registrations.
// Broken down by resource name.
DevicePluginRegistrationCount = metrics.NewCounterVec(
@ -502,6 +516,7 @@ func Register(containerCache kubecontainer.RuntimeCache, collectors ...metrics.C
legacyregistry.MustRegister(RuntimeOperationsErrors)
legacyregistry.MustRegister(Evictions)
legacyregistry.MustRegister(EvictionStatsAge)
legacyregistry.MustRegister(Preemptions)
legacyregistry.MustRegister(DevicePluginRegistrationCount)
legacyregistry.MustRegister(DevicePluginAllocationDuration)
legacyregistry.MustRegister(DeprecatedPodWorkerLatency)

View File

@ -16,6 +16,7 @@ go_library(
"//pkg/kubelet/events:go_default_library",
"//pkg/kubelet/eviction:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library",
"//pkg/kubelet/metrics:go_default_library",
"//pkg/kubelet/types:go_default_library",
"//pkg/kubelet/util/format:go_default_library",
"//pkg/scheduler/algorithm/predicates:go_default_library",

View File

@ -28,6 +28,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/events"
"k8s.io/kubernetes/pkg/kubelet/eviction"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/metrics"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/kubelet/util/format"
"k8s.io/kubernetes/pkg/scheduler/algorithm/predicates"
@ -111,6 +112,11 @@ func (c *CriticalPodAdmissionHandler) evictPodsToFreeRequests(admitPod *v1.Pod,
// In future syncPod loops, the kubelet will retry the pod deletion steps that it was stuck on.
continue
}
if len(insufficientResources) > 0 {
metrics.Preemptions.WithLabelValues(insufficientResources[0].resourceName.String()).Inc()
} else {
metrics.Preemptions.WithLabelValues("").Inc()
}
klog.Infof("preemption: pod %s evicted successfully", format.Pod(pod))
}
return nil