Merge pull request #127155 from ffromani/alignment-metrics
node: metrics: add resource alignment metrics
This commit is contained in: c6669ea7d6
@@ -312,7 +312,13 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container
 	defer func() {
 		if rerr != nil {
 			metrics.CPUManagerPinningErrorsTotal.Inc()
+			return
 		}
+		if !p.options.FullPhysicalCPUsOnly {
+			// increment only if we know we allocate aligned resources
+			return
+		}
+		metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
 	}()
 
 	if p.options.FullPhysicalCPUsOnly {
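The hunk above leans on Go's deferred closures seeing the named return value rerr: a single exit hook decides whether the pinning-error counter or the new alignment counter moves. Below is a minimal, self-contained sketch of the same pattern; it uses the upstream Prometheus client and illustrative names (allocate, pinningErrorsTotal, alignedComputeResources) rather than the kubelet's k8s.io/component-base/metrics wrappers.

package main

import (
	"errors"
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// Illustrative stand-ins for the kubelet counters touched in this hunk.
var (
	pinningErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "cpu_manager_pinning_errors_total",
	})
	alignedComputeResources = prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "container_aligned_compute_resources_count",
	}, []string{"scope", "boundary"})
)

// allocate mimics the shape of staticPolicy.Allocate: rerr is a named return
// value, so the deferred closure observes whatever the function finally returns.
func allocate(fullPhysicalCPUsOnly bool, fail bool) (rerr error) {
	defer func() {
		if rerr != nil {
			pinningErrorsTotal.Inc() // count the failed pinning request
			return
		}
		if !fullPhysicalCPUsOnly {
			// increment only if we know we allocated aligned resources
			return
		}
		alignedComputeResources.WithLabelValues("container", "physical_cpu").Inc()
	}()

	if fail {
		return errors.New("not enough free physical CPUs")
	}
	// ... the real allocation logic would live here ...
	return nil
}

func main() {
	fmt.Println(allocate(true, false)) // nil error: increments the aligned counter
	fmt.Println(allocate(true, true))  // non-nil error: increments the error counter
}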
@@ -30,6 +30,14 @@ type Policy interface {
 	Merge(providersHints []map[string][]TopologyHint) (TopologyHint, bool)
 }
 
+// IsAlignmentGuaranteed returns true if the given policy guarantees that either
+// the compute resources will be allocated within a NUMA boundary, or the allocation will fail outright.
+func IsAlignmentGuaranteed(p Policy) bool {
+	// We are abusing the name, but at the moment this matches almost 1:1 the policy name,
+	// so we are not adding new fields for now.
+	return p.Name() == PolicySingleNumaNode
+}
+
 // Merge a TopologyHints permutation to a single hint by performing a bitwise-AND
 // of their affinity masks. The hint shall be preferred if all hits in the permutation
 // are preferred.
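For context: topology manager policy names are plain strings, and single-numa-node is the only policy that refuses admission when resources cannot be placed on one NUMA node, which is why the check can be a simple name comparison. A self-contained sketch under those assumptions follows; the Policy stub and fakePolicy type are illustrative, and the real interface (in k8s.io/kubernetes/pkg/kubelet/cm/topologymanager) also carries the Merge method shown above.

package main

import "fmt"

// Policy name constants as used by the kubelet topology manager.
const (
	PolicySingleNumaNode = "single-numa-node"
	PolicyBestEffort     = "best-effort"
)

// Policy is a trimmed-down stand-in for the real interface.
type Policy interface {
	Name() string
}

type fakePolicy string

func (p fakePolicy) Name() string { return string(p) }

// IsAlignmentGuaranteed mirrors the function added in this hunk: only the
// single-numa-node policy rejects pods whose resources cannot be NUMA-aligned,
// so it is the only one for which the alignment counter may be bumped unconditionally.
func IsAlignmentGuaranteed(p Policy) bool {
	return p.Name() == PolicySingleNumaNode
}

func main() {
	fmt.Println(IsAlignmentGuaranteed(fakePolicy(PolicySingleNumaNode))) // true
	fmt.Println(IsAlignmentGuaranteed(fakePolicy(PolicyBestEffort)))     // false
}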
@@ -61,6 +61,10 @@ func (s *containerScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
 			metrics.TopologyManagerAdmissionErrorsTotal.Inc()
 			return admission.GetPodAdmitResult(err)
 		}
+
+		if IsAlignmentGuaranteed(s.policy) {
+			metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Inc()
+		}
 	}
 	return admission.GetPodAdmitResult(nil)
 }
@@ -62,6 +62,10 @@ func (s *podScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
 			return admission.GetPodAdmitResult(err)
 		}
 	}
+	if IsAlignmentGuaranteed(s.policy) {
+		// increment only if we know we allocate aligned resources.
+		metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Inc()
+	}
 	return admission.GetPodAdmitResult(nil)
 }
 
@@ -127,10 +127,21 @@ const (
 	// Metric for tracking garbage collected images
 	ImageGarbageCollectedTotalKey = "image_garbage_collected_total"
 
+	// Metric for tracking alignment of compute resources
+	ContainerAlignedComputeResourcesNameKey          = "container_aligned_compute_resources_count"
+	ContainerAlignedComputeResourcesScopeLabelKey    = "scope"
+	ContainerAlignedComputeResourcesBoundaryLabelKey = "boundary"
+
 	// Values used in metric labels
 	Container          = "container"
 	InitContainer      = "init_container"
 	EphemeralContainer = "ephemeral_container"
+
+	AlignScopePod       = "pod"
+	AlignScopeContainer = "container"
+
+	AlignedPhysicalCPU = "physical_cpu"
+	AlignedNUMANode    = "numa_node"
 )
 
 type imageSizeBucket struct {
@@ -762,6 +773,16 @@ var (
 		},
 	)
 
+	ContainerAlignedComputeResources = metrics.NewCounterVec(
+		&metrics.CounterOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           ContainerAlignedComputeResourcesNameKey,
+			Help:           "Cumulative number of aligned compute resources allocated to containers by alignment type.",
+			StabilityLevel: metrics.ALPHA,
+		},
+		[]string{ContainerAlignedComputeResourcesScopeLabelKey, ContainerAlignedComputeResourcesBoundaryLabelKey},
+	)
+
 	// MemoryManagerPinningRequestTotal tracks the number of times the pod spec required the memory manager to pin memory pages
 	MemoryManagerPinningRequestTotal = metrics.NewCounter(
 		&metrics.CounterOpts{
@@ -985,6 +1006,7 @@ func Register(collectors ...metrics.StableCollector) {
 		legacyregistry.MustRegister(RunPodSandboxErrors)
 		legacyregistry.MustRegister(CPUManagerPinningRequestsTotal)
 		legacyregistry.MustRegister(CPUManagerPinningErrorsTotal)
+		legacyregistry.MustRegister(ContainerAlignedComputeResources)
 		if utilfeature.DefaultFeatureGate.Enabled(features.MemoryManager) {
 			legacyregistry.MustRegister(MemoryManagerPinningRequestTotal)
 			legacyregistry.MustRegister(MemoryManagerPinningErrorsTotal)
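Once registered, the counter is exposed on the kubelet's /metrics endpoint with the kubelet_ subsystem prefix, which is the name the e2e tests further down scrape and match on. A rough sketch of what the exposition looks like, using the plain prometheus/client_golang registry instead of component-base/metrics and legacyregistry; the port and handler wiring are illustrative:

package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// Stand-in for ContainerAlignedComputeResources: a counter vector keyed by
	// the "scope" and "boundary" labels added in this PR.
	aligned := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "kubelet_container_aligned_compute_resources_count",
		Help: "Cumulative number of aligned compute resources allocated to containers by alignment type.",
	}, []string{"scope", "boundary"})

	reg := prometheus.NewRegistry()
	reg.MustRegister(aligned)

	// Simulate one container-scope physical-CPU alignment and one pod-scope
	// NUMA-node alignment, then serve the registry.
	aligned.WithLabelValues("container", "physical_cpu").Inc()
	aligned.WithLabelValues("pod", "numa_node").Inc()

	// A GET on /metrics then contains lines such as:
	//   kubelet_container_aligned_compute_resources_count{boundary="physical_cpu",scope="container"} 1
	//   kubelet_container_aligned_compute_resources_count{boundary="numa_node",scope="pod"} 1
	http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
	log.Fatal(http.ListenAndServe(":9999", nil))
}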
@@ -91,6 +91,7 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 	ginkgo.AfterEach(func(ctx context.Context) {
 		if testPod != nil {
 			deletePodSyncByName(ctx, f, testPod.Name)
+			waitForContainerRemoval(ctx, testPod.Spec.Containers[0].Name, testPod.Name, testPod.Namespace)
 		}
 		updateKubeletConfig(ctx, f, oldCfg, true)
 	})
@@ -160,11 +161,32 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 			ginkgo.By("Ensuring the metrics match the expectations a few more times")
 			gomega.Consistently(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
 		})
+
+		ginkgo.It("should return updated alignment counters when pod successfully run", func(ctx context.Context) {
+			ginkgo.By("Creating the test pod")
+			testPod = e2epod.NewPodClient(f).Create(ctx, makeGuaranteedCPUExclusiveSleeperPod("count-align-smt-ok", smtLevel))
+
+			// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
+			// being [Serial], we can also assume no one else but us is running pods.
+			ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted")
+
+			idFn := makeCustomPairID("scope", "boundary")
+			matchAlignmentMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
+				"kubelet_container_aligned_compute_resources_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::physical_cpu": timelessSample(1),
+				}),
+			})
+
+			ginkgo.By("Giving the Kubelet time to update the alignment metrics")
+			gomega.Eventually(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchAlignmentMetrics)
+			ginkgo.By("Ensuring the metrics match the expectations about alignment metrics a few more times")
+			gomega.Consistently(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchAlignmentMetrics)
+		})
 	})
 })
 
 func getKubeletMetrics(ctx context.Context) (e2emetrics.KubeletMetrics, error) {
-	ginkgo.By("getting Kubelet metrics from the metrics API")
+	ginkgo.By("Getting Kubelet metrics from the metrics API")
 	return e2emetrics.GrabKubeletMetricsWithoutProxy(ctx, nodeNameOrIP()+":10255", "/metrics")
 }
 
@@ -189,7 +211,7 @@ func makeGuaranteedCPUExclusiveSleeperPod(name string, cpus int) *v1.Pod {
 							v1.ResourceMemory: resource.MustParse("64Mi"),
 						},
 					},
-					Command: []string{"sh", "-c", "sleep", "1d"},
+					Command: []string{"sh", "-c", "sleep 1d"},
 				},
 			},
 		},
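The one-line Command change above fixes a subtle bug: with sh -c, only the argument immediately after -c is the command string, and anything after it becomes a positional parameter ($0, $1, ...). {"sh", "-c", "sleep", "1d"} therefore runs sleep with no operand (the "1d" only populates $0), so the sleeper container exits almost immediately, whereas {"sh", "-c", "sleep 1d"} sleeps for a day as intended. A small sketch of the difference using os/exec locally rather than a pod spec; the durations are shortened so the example terminates:

package main

import (
	"fmt"
	"os/exec"
	"time"
)

// timeRun reports how long the given argv takes to finish and whether it failed.
func timeRun(argv []string) {
	start := time.Now()
	err := exec.Command(argv[0], argv[1:]...).Run()
	fmt.Printf("%v -> err=%v after %v\n", argv, err, time.Since(start).Round(time.Millisecond))
}

func main() {
	// Broken form: "1d" only becomes $0, sleep has no operand and fails fast.
	timeRun([]string{"sh", "-c", "sleep", "1d"})

	// Fixed form: the whole command string is handed to the shell and the sleep really happens.
	timeRun([]string{"sh", "-c", "sleep 2"})
}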
|
@ -157,6 +157,13 @@ func containerID(element interface{}) string {
|
|||||||
return fmt.Sprintf("%s::%s::%s", el.Metric["namespace"], el.Metric["pod"], el.Metric["container"])
|
return fmt.Sprintf("%s::%s::%s", el.Metric["namespace"], el.Metric["pod"], el.Metric["container"])
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func makeCustomPairID(pri, sec string) func(interface{}) string {
|
||||||
|
return func(element interface{}) string {
|
||||||
|
el := element.(*model.Sample)
|
||||||
|
return fmt.Sprintf("%s::%s", el.Metric[model.LabelName(pri)], el.Metric[model.LabelName(sec)])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func boundedSample(lower, upper interface{}) types.GomegaMatcher {
|
func boundedSample(lower, upper interface{}) types.GomegaMatcher {
|
||||||
return gstruct.PointTo(gstruct.MatchAllFields(gstruct.Fields{
|
return gstruct.PointTo(gstruct.MatchAllFields(gstruct.Fields{
|
||||||
// We already check Metric when matching the Id
|
// We already check Metric when matching the Id
|
||||||
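makeCustomPairID sits alongside containerID above: it returns a gstruct element-ID function that keys each Prometheus sample by two label values joined with "::", which is how the CPU manager test addresses the scope="container", boundary="physical_cpu" sample as "container::physical_cpu". A standalone sketch of that behaviour on a prometheus/common model.Sample; the sample contents here are illustrative:

package main

import (
	"fmt"

	"github.com/prometheus/common/model"
)

// makeCustomPairID, as in the diff above: builds a gstruct ID function that
// joins two label values with "::".
func makeCustomPairID(pri, sec string) func(interface{}) string {
	return func(element interface{}) string {
		el := element.(*model.Sample)
		return fmt.Sprintf("%s::%s", el.Metric[model.LabelName(pri)], el.Metric[model.LabelName(sec)])
	}
}

func main() {
	sample := &model.Sample{
		Metric: model.Metric{
			"__name__": "kubelet_container_aligned_compute_resources_count",
			"scope":    "container",
			"boundary": "physical_cpu",
		},
		Value: 1,
	}

	idFn := makeCustomPairID("scope", "boundary")
	fmt.Println(idFn(sample)) // prints: container::physical_cpu
}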
|
@ -28,6 +28,7 @@ import (
|
|||||||
v1 "k8s.io/api/core/v1"
|
v1 "k8s.io/api/core/v1"
|
||||||
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
|
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
|
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
|
||||||
|
"k8s.io/kubernetes/pkg/kubelet/metrics"
|
||||||
"k8s.io/kubernetes/test/e2e/feature"
|
"k8s.io/kubernetes/test/e2e/feature"
|
||||||
"k8s.io/kubernetes/test/e2e/framework"
|
"k8s.io/kubernetes/test/e2e/framework"
|
||||||
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
|
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
|
||||||
@@ -152,6 +153,26 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 			ginkgo.By("Ensuring the metrics match the expectations a few more times")
 			gomega.Consistently(ctx, getKubeletMetrics, 2*time.Minute, 10*time.Second).Should(matchResourceMetrics)
 		})
+
+		ginkgo.It("[alignment] should return updated alignment counters when pod successfully run", func(ctx context.Context) {
+			ginkgo.By("Creating the test pod")
+			testPod = e2epod.NewPodClient(f).Create(ctx, makeGuaranteedCPUExclusiveSleeperPod("count-align-numa-ok", cpusNumPerNUMA))
+
+			// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
+			// being [Serial], we can also assume no one else but us is running pods.
+			ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted")
+
+			matchAlignmentMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
+				"kubelet_container_aligned_compute_resources_count": gstruct.MatchAllElements(nodeID, gstruct.Elements{
+					metrics.AlignedNUMANode: timelessSample(1),
+				}),
+			})
+
+			ginkgo.By("Giving the Kubelet time to update the alignment metrics")
+			gomega.Eventually(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchAlignmentMetrics)
+			ginkgo.By("Ensuring the metrics match the expectations about alignment metrics a few more times")
+			gomega.Consistently(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchAlignmentMetrics)
+		})
 	})
 })
 