node: metrics: cpumanager: add pinning metrics

In order to improve the observability of the cpumanager,
add and populate metrics that track whether the combination of
the kubelet configuration and the pod spec would trigger
exclusive core allocation and pinning.

We should avoid leaking any node/machine-specific information
(e.g. core ids, even though this is admittedly an extreme example);
tracking these metrics is a good first step, because it gives us
feedback without exposing such details.
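
As a rough illustration of what such counters expose, here is a small
standalone sketch (not part of this change; it uses plain
prometheus/client_golang and a private registry instead of the kubelet
metrics package, and the metric name is copied from below purely for
illustration). A scrape sees nothing but aggregate counts:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Illustrative counter registered in a private registry so the
	// example is self-contained.
	reg := prometheus.NewRegistry()
	pinningRequests := prometheus.NewCounter(prometheus.CounterOpts{
		Name: "kubelet_cpu_manager_pinning_requests_total",
		Help: "The number of cpu core allocations which required pinning.",
	})
	reg.MustRegister(pinningRequests)
	pinningRequests.Inc()

	// Gather what a scrape would see: an aggregate count per metric
	// family, with no core ids or other machine-specific detail.
	families, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range families {
		fmt.Println(mf.GetName(), "=", mf.GetMetric()[0].GetCounter().GetValue())
	}
}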

Signed-off-by: Francesco Romani <fromani@redhat.com>
Francesco Romani 2022-10-04 14:36:42 +02:00
parent 5539a5b80f
commit 47d3299781
2 changed files with 79 additions and 40 deletions

pkg/kubelet/cm/cpumanager/policy_static.go

@@ -27,6 +27,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
"k8s.io/kubernetes/pkg/kubelet/metrics"
)
const (
@@ -252,10 +253,21 @@ func (p *staticPolicy) updateCPUsToReuse(pod *v1.Pod, container *v1.Container, c
p.cpusToReuse[string(pod.UID)] = p.cpusToReuse[string(pod.UID)].Difference(cset)
}
func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) error {
if numCPUs := p.guaranteedCPUs(pod, container); numCPUs != 0 {
func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) (rerr error) {
numCPUs := p.guaranteedCPUs(pod, container)
if numCPUs == 0 {
// container belongs in the shared pool (nothing to do; use default cpuset)
return nil
}
klog.InfoS("Static policy: Allocate", "pod", klog.KObj(pod), "containerName", container.Name)
// container belongs in an exclusively allocated pool
metrics.CPUManagerPinningRequestsTotal.Inc()
defer func() {
if rerr != nil {
metrics.CPUManagerPinningErrorsTotal.Inc()
}
}()
if p.options.FullPhysicalCPUsOnly && ((numCPUs % p.topology.CPUsPerCore()) != 0) {
// Since CPU Manager has been enabled requesting strict SMT alignment, it means a guaranteed pod can only be admitted
@@ -291,8 +303,6 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
s.SetCPUSet(string(pod.UID), container.Name, cpuset)
p.updateCPUsToReuse(pod, container, cpuset)
}
// container belongs in the shared pool (nothing to do; use default cpuset)
return nil
}
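
Besides the new counters, the change above switches Allocate to a named
return value (rerr) so that a single deferred check covers every error
path. A minimal standalone sketch of that idiom, with a plain integer
standing in for the real error counter:

package main

import "fmt"

// pinningErrors stands in for the real error counter.
var pinningErrors int

// allocate shows the idiom used above: with a named return value,
// one deferred check observes every error path in the function.
func allocate(numCPUs int) (rerr error) {
	defer func() {
		if rerr != nil {
			pinningErrors++
		}
	}()
	if numCPUs%2 != 0 {
		return fmt.Errorf("refusing odd request of %d cpus", numCPUs)
	}
	return nil
}

func main() {
	_ = allocate(2)
	_ = allocate(3)
	fmt.Println("pinning errors:", pinningErrors) // prints 1
}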

pkg/kubelet/metrics/metrics.go

@@ -86,6 +86,10 @@ const (
// Metrics to track ephemeral container usage by this kubelet
ManagedEphemeralContainersKey = "managed_ephemeral_containers"
// Metrics to track the CPU manager behavior
CPUManagerPinningRequestsTotalKey = "cpu_manager_pinning_requests_total"
CPUManagerPinningErrorsTotalKey = "cpu_manager_pinning_errors_total"
// Values used in metric labels
Container = "container"
InitContainer = "init_container"
@@ -506,6 +510,26 @@ var (
StabilityLevel: metrics.ALPHA,
},
)
// CPUManagerPinningRequestsTotal tracks the number of times the pod spec will cause the cpu manager to pin cores
CPUManagerPinningRequestsTotal = metrics.NewCounter(
&metrics.CounterOpts{
Subsystem: KubeletSubsystem,
Name: CPUManagerPinningRequestsTotalKey,
Help: "The number of cpu core allocations which required pinning.",
StabilityLevel: metrics.ALPHA,
},
)
// CPUManagerPinningErrorsTotal tracks the number of times the pod spec required the cpu manager to pin cores, but the allocation failed
CPUManagerPinningErrorsTotal = metrics.NewCounter(
&metrics.CounterOpts{
Subsystem: KubeletSubsystem,
Name: CPUManagerPinningErrorsTotalKey,
Help: "The number of cpu core allocations which required pinning failed.",
StabilityLevel: metrics.ALPHA,
},
)
)
var registerMetrics sync.Once
@@ -570,6 +594,11 @@ func Register(collectors ...metrics.StableCollector) {
if utilfeature.DefaultFeatureGate.Enabled(features.ConsistentHTTPGetHandlers) {
legacyregistry.MustRegister(LifecycleHandlerHTTPFallbacks)
}
if utilfeature.DefaultFeatureGate.Enabled(features.CPUManager) {
legacyregistry.MustRegister(CPUManagerPinningRequestsTotal)
legacyregistry.MustRegister(CPUManagerPinningErrorsTotal)
}
})
}
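
The registration above follows the kubelet's usual pattern: register
exactly once, and only when the relevant feature gate is enabled. A
rough standalone sketch of the same pattern, assuming a placeholder
featureEnabled gate and plain prometheus/client_golang instead of
component-base/legacyregistry:

package main

import (
	"sync"

	"github.com/prometheus/client_golang/prometheus"
)

var (
	registerOnce sync.Once

	pinningRequests = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "cpu_manager_pinning_requests_total",
		Help: "The number of cpu core allocations which required pinning.",
	})
)

// featureEnabled is a placeholder for the real feature-gate lookup.
func featureEnabled(name string) bool { return true }

// register registers the counter exactly once, and only when the
// corresponding feature is enabled, mirroring the pattern above.
func register(reg prometheus.Registerer) {
	registerOnce.Do(func() {
		if featureEnabled("CPUManager") {
			reg.MustRegister(pinningRequests)
		}
	})
}

func main() {
	register(prometheus.NewRegistry())
}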