Merge pull request #115590 from swatisehgal/topology-mgr-duration-metrics

node: topology-mgr: Add metric to measure topology manager admission latency
2025-09-06 03:33:26 +00:00 · 2023-02-15 07:12:25 -08:00
parent b3d8ac8496 8442b450e5
commit e18fa74551
3 changed files with 39 additions and 4 deletions
--- a/pkg/kubelet/cm/topologymanager/topology_manager.go
+++ b/pkg/kubelet/cm/topologymanager/topology_manager.go
@@ -18,6 +18,7 @@ package topologymanager

 import (
 	"fmt"
+	"time"

 	cadvisorapi "github.com/google/cadvisor/info/v1"
 	"k8s.io/api/core/v1"
@@ -209,9 +210,11 @@ func (m *manager) RemoveContainer(containerID string) error {

 func (m *manager) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
 	klog.InfoS("Topology Admit Handler")
-
 	metrics.TopologyManagerAdmissionRequestsTotal.Inc()
-	pod := attrs.Pod

-	return m.scope.Admit(pod)
+	startTime := time.Now()
+	podAdmitResult := m.scope.Admit(attrs.Pod)
+	metrics.TopologyManagerAdmissionDuration.Observe(float64(time.Since(startTime).Milliseconds()))
+
+	return podAdmitResult
 }
--- a/pkg/kubelet/metrics/metrics.go
+++ b/pkg/kubelet/metrics/metrics.go
@@ -94,6 +94,7 @@ const (
 	// Metrics to track the Topology manager behavior
 	TopologyManagerAdmissionRequestsTotalKey = "topology_manager_admission_requests_total"
 	TopologyManagerAdmissionErrorsTotalKey   = "topology_manager_admission_errors_total"
+	TopologyManagerAdmissionDurationKey      = "topology_manager_admission_duration_ms"

 	// Values used in metric labels
 	Container          = "container"
@@ -573,6 +574,17 @@ var (
 			StabilityLevel: metrics.ALPHA,
 		},
 	)
+
+	// TopologyManagerAdmissionDuration is a Histogram that tracks the duration (in milliseconds) to serve a pod admission request.
+	TopologyManagerAdmissionDuration = metrics.NewHistogram(
+		&metrics.HistogramOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           TopologyManagerAdmissionDurationKey,
+			Help:           "Duration in milliseconds to serve a pod admission request.",
+			Buckets:        metrics.ExponentialBuckets(.05, 2, 15),
+			StabilityLevel: metrics.ALPHA,
+		},
+	)
 )

 var registerMetrics sync.Once
@@ -626,6 +638,7 @@ func Register(collectors ...metrics.StableCollector) {
 		legacyregistry.MustRegister(CPUManagerPinningErrorsTotal)
 		legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal)
 		legacyregistry.MustRegister(TopologyManagerAdmissionErrorsTotal)
+		legacyregistry.MustRegister(TopologyManagerAdmissionDuration)

 		for _, collector := range collectors {
 			legacyregistry.CustomMustRegister(collector)
--- a/test/e2e_node/topology_manager_metrics_test.go
+++ b/test/e2e_node/topology_manager_metrics_test.go
@@ -23,6 +23,7 @@ import (
 	"github.com/onsi/ginkgo/v2"
 	"github.com/onsi/gomega"
 	"github.com/onsi/gomega/gstruct"
+	"github.com/onsi/gomega/types"

 	v1 "k8s.io/api/core/v1"
 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
@@ -85,6 +86,9 @@ var _ = SIGDescribe("Topology Manager Metrics [Serial][Feature:TopologyManager]"
 				"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
 				}),
+				"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
+					"": timelessSample(0),
+				}),
 			})

 			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
@@ -108,6 +112,9 @@ var _ = SIGDescribe("Topology Manager Metrics [Serial][Feature:TopologyManager]"
 				"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(1),
 				}),
+				"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
+					"": checkMetricValueGreaterThan(0),
+				}),
 			})

 			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
@@ -122,7 +129,7 @@ var _ = SIGDescribe("Topology Manager Metrics [Serial][Feature:TopologyManager]"

 			// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
 			// being [Serial], we can also assume noone else but us is running pods.
-			ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted")
+			ginkgo.By("Checking the topologymanager metrics right after the kubelet restart, with pod should be admitted")

 			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 				"kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
@@ -131,6 +138,9 @@ var _ = SIGDescribe("Topology Manager Metrics [Serial][Feature:TopologyManager]"
 				"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
 				}),
+				"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
+					"": checkMetricValueGreaterThan(0),
+				}),
 			})

 			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
@@ -157,3 +167,12 @@ func hostCheck() (int, int) {

 	return numaNodes, coreCount
 }
+
+func checkMetricValueGreaterThan(value interface{}) types.GomegaMatcher {
+	return gstruct.PointTo(gstruct.MatchAllFields(gstruct.Fields{
+		// We already check Metric when matching the Id
+		"Metric":    gstruct.Ignore(),
+		"Value":     gomega.BeNumerically(">", value),
+		"Timestamp": gstruct.Ignore(),
+	}))
+}