diff --git a/pkg/kubelet/cm/topologymanager/topology_manager.go b/pkg/kubelet/cm/topologymanager/topology_manager.go
index 8b288aa6dec..567736e82d3 100644
--- a/pkg/kubelet/cm/topologymanager/topology_manager.go
+++ b/pkg/kubelet/cm/topologymanager/topology_manager.go
@@ -18,6 +18,7 @@ package topologymanager
 
 import (
 	"fmt"
+	"time"
 
 	cadvisorapi "github.com/google/cadvisor/info/v1"
 	"k8s.io/api/core/v1"
@@ -209,9 +210,11 @@ func (m *manager) RemoveContainer(containerID string) error {
 
 func (m *manager) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
 	klog.InfoS("Topology Admit Handler")
-	metrics.TopologyManagerAdmissionRequestsTotal.Inc()
 
-	pod := attrs.Pod
-	return m.scope.Admit(pod)
+	startTime := time.Now()
+	podAdmitResult := m.scope.Admit(attrs.Pod)
+	metrics.TopologyManagerAdmissionDuration.Observe(float64(time.Since(startTime).Milliseconds()))
+
+	return podAdmitResult
 }
 
diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go
index bb3e16ced99..04e37c6d7d1 100644
--- a/pkg/kubelet/metrics/metrics.go
+++ b/pkg/kubelet/metrics/metrics.go
@@ -94,6 +94,7 @@ const (
 	// Metrics to track the Topology manager behavior
 	TopologyManagerAdmissionRequestsTotalKey = "topology_manager_admission_requests_total"
 	TopologyManagerAdmissionErrorsTotalKey   = "topology_manager_admission_errors_total"
+	TopologyManagerAdmissionDurationKey      = "topology_manager_admission_duration_ms"
 
 	// Values used in metric labels
 	Container = "container"
@@ -573,6 +574,17 @@ var (
 			StabilityLevel: metrics.ALPHA,
 		},
 	)
+
+	// TopologyManagerAdmissionDuration is a Histogram that tracks the duration (in milliseconds) to serve a pod admission request.
+	TopologyManagerAdmissionDuration = metrics.NewHistogram(
+		&metrics.HistogramOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           TopologyManagerAdmissionDurationKey,
+			Help:           "Duration in milliseconds to serve a pod admission request.",
+			Buckets:        metrics.ExponentialBuckets(.05, 2, 15),
+			StabilityLevel: metrics.ALPHA,
+		},
+	)
 )
 
 var registerMetrics sync.Once
@@ -626,6 +638,7 @@ func Register(collectors ...metrics.StableCollector) {
 		legacyregistry.MustRegister(CPUManagerPinningErrorsTotal)
 		legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal)
 		legacyregistry.MustRegister(TopologyManagerAdmissionErrorsTotal)
+		legacyregistry.MustRegister(TopologyManagerAdmissionDuration)
 
 		for _, collector := range collectors {
 			legacyregistry.CustomMustRegister(collector)
diff --git a/test/e2e_node/topology_manager_metrics_test.go b/test/e2e_node/topology_manager_metrics_test.go
index 84e1eadb32c..2af49d709e1 100644
--- a/test/e2e_node/topology_manager_metrics_test.go
+++ b/test/e2e_node/topology_manager_metrics_test.go
@@ -23,6 +23,7 @@ import (
 	"github.com/onsi/ginkgo/v2"
 	"github.com/onsi/gomega"
 	"github.com/onsi/gomega/gstruct"
+	"github.com/onsi/gomega/types"
 
 	v1 "k8s.io/api/core/v1"
 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
@@ -85,6 +86,9 @@ var _ = SIGDescribe("Topology Manager Metrics [Serial][Feature:TopologyManager]"
 				"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
 				}),
+				"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
+					"": timelessSample(0),
+				}),
 			})
 
 			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
@@ -108,6 +112,9 @@ var _ = SIGDescribe("Topology Manager Metrics [Serial][Feature:TopologyManager]"
 				"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(1),
 				}),
+				"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
+					"": checkMetricValueGreaterThan(0),
+				}),
 			})
 
 			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
@@ -122,7 +129,7 @@ var _ = SIGDescribe("Topology Manager Metrics [Serial][Feature:TopologyManager]"
 
 			// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
 			// being [Serial], we can also assume noone else but us is running pods.
-			ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted")
+			ginkgo.By("Checking the topologymanager metrics right after the kubelet restart, with pod should be admitted")
 
 			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 				"kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
@@ -131,6 +138,9 @@ var _ = SIGDescribe("Topology Manager Metrics [Serial][Feature:TopologyManager]"
 				"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
 				}),
+				"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
+					"": checkMetricValueGreaterThan(0),
+				}),
 			})
 
 			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
@@ -157,3 +167,12 @@ func hostCheck() (int, int) {
 
 	return numaNodes, coreCount
 }
+
+func checkMetricValueGreaterThan(value interface{}) types.GomegaMatcher {
+	return gstruct.PointTo(gstruct.MatchAllFields(gstruct.Fields{
+		// We already check Metric when matching the Id
+		"Metric":    gstruct.Ignore(),
+		"Value":     gomega.BeNumerically(">", value),
+		"Timestamp": gstruct.Ignore(),
+	}))
+}