From bc941633c1eb4203cd88463af191662c4038f7be Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Thu, 2 Feb 2023 12:21:41 +0000 Subject: [PATCH 1/3] node: topology-mgr: add metric to measure topology mgr admission latency Signed-off-by: Swati Sehgal --- pkg/kubelet/cm/topologymanager/topology_manager.go | 8 ++++++-- pkg/kubelet/metrics/metrics.go | 13 +++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/pkg/kubelet/cm/topologymanager/topology_manager.go b/pkg/kubelet/cm/topologymanager/topology_manager.go index 8b288aa6dec..6fb2f9e2442 100644 --- a/pkg/kubelet/cm/topologymanager/topology_manager.go +++ b/pkg/kubelet/cm/topologymanager/topology_manager.go @@ -18,6 +18,7 @@ package topologymanager import ( "fmt" + "time" cadvisorapi "github.com/google/cadvisor/info/v1" "k8s.io/api/core/v1" @@ -209,9 +210,12 @@ func (m *manager) RemoveContainer(containerID string) error { func (m *manager) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult { klog.InfoS("Topology Admit Handler") - metrics.TopologyManagerAdmissionRequestsTotal.Inc() pod := attrs.Pod - return m.scope.Admit(pod) + startTime := time.Now() + podAdmitResult := m.scope.Admit(pod) + metrics.TopologyManagerAdmissionDuration.Observe(float64(time.Since(startTime).Milliseconds())) + + return podAdmitResult } diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go index bb3e16ced99..04e37c6d7d1 100644 --- a/pkg/kubelet/metrics/metrics.go +++ b/pkg/kubelet/metrics/metrics.go @@ -94,6 +94,7 @@ const ( // Metrics to track the Topology manager behavior TopologyManagerAdmissionRequestsTotalKey = "topology_manager_admission_requests_total" TopologyManagerAdmissionErrorsTotalKey = "topology_manager_admission_errors_total" + TopologyManagerAdmissionDurationKey = "topology_manager_admission_duration_ms" // Values used in metric labels Container = "container" @@ -573,6 +574,17 @@ var ( StabilityLevel: metrics.ALPHA, }, ) + + // TopologyManagerAdmissionDuration is a 
Histogram that tracks the duration (in milliseconds) to serve a pod admission request. + TopologyManagerAdmissionDuration = metrics.NewHistogram( + &metrics.HistogramOpts{ + Subsystem: KubeletSubsystem, + Name: TopologyManagerAdmissionDurationKey, + Help: "Duration in milliseconds to serve a pod admission request.", + Buckets: metrics.ExponentialBuckets(.05, 2, 15), + StabilityLevel: metrics.ALPHA, + }, + ) ) var registerMetrics sync.Once @@ -626,6 +638,7 @@ func Register(collectors ...metrics.StableCollector) { legacyregistry.MustRegister(CPUManagerPinningErrorsTotal) legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal) legacyregistry.MustRegister(TopologyManagerAdmissionErrorsTotal) + legacyregistry.MustRegister(TopologyManagerAdmissionDuration) for _, collector := range collectors { legacyregistry.CustomMustRegister(collector) From cf21dcef5137cc9b9b1e3bf4e0b32f800ab680eb Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Thu, 2 Feb 2023 12:19:09 +0000 Subject: [PATCH 2/3] node: topology-mgr: e2e: changes to validate admission latency metrics The component was previously incorrect. This patch updates to the correct component name. 
Signed-off-by: Swati Sehgal --- .../e2e_node/topology_manager_metrics_test.go | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/test/e2e_node/topology_manager_metrics_test.go b/test/e2e_node/topology_manager_metrics_test.go index 84e1eadb32c..2af49d709e1 100644 --- a/test/e2e_node/topology_manager_metrics_test.go +++ b/test/e2e_node/topology_manager_metrics_test.go @@ -23,6 +23,7 @@ import ( "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" "github.com/onsi/gomega/gstruct" + "github.com/onsi/gomega/types" v1 "k8s.io/api/core/v1" kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" @@ -85,6 +86,9 @@ var _ = SIGDescribe("Topology Manager Metrics [Serial][Feature:TopologyManager]" "kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{ "": timelessSample(0), }), + "kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{ + "": timelessSample(0), + }), }) ginkgo.By("Giving the Kubelet time to start up and produce metrics") @@ -108,6 +112,9 @@ var _ = SIGDescribe("Topology Manager Metrics [Serial][Feature:TopologyManager]" "kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{ "": timelessSample(1), }), + "kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{ + "": checkMetricValueGreaterThan(0), + }), }) ginkgo.By("Giving the Kubelet time to start up and produce metrics") @@ -122,7 +129,7 @@ var _ = SIGDescribe("Topology Manager Metrics [Serial][Feature:TopologyManager]" // we updated the kubelet config in BeforeEach, so we can assume we start fresh. // being [Serial], we can also assume noone else but us is running pods. 
- ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted") + ginkgo.By("Checking the topologymanager metrics right after the kubelet restart, with pod should be admitted") matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{ "kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{ @@ -131,6 +138,9 @@ var _ = SIGDescribe("Topology Manager Metrics [Serial][Feature:TopologyManager]" "kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{ "": timelessSample(0), }), + "kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{ + "": checkMetricValueGreaterThan(0), + }), }) ginkgo.By("Giving the Kubelet time to start up and produce metrics") @@ -157,3 +167,12 @@ func hostCheck() (int, int) { return numaNodes, coreCount } + +func checkMetricValueGreaterThan(value interface{}) types.GomegaMatcher { + return gstruct.PointTo(gstruct.MatchAllFields(gstruct.Fields{ + // We already check Metric when matching the Id + "Metric": gstruct.Ignore(), + "Value": gomega.BeNumerically(">", value), + "Timestamp": gstruct.Ignore(), + })) +} From 8442b450e5064363c1923c31e6e29e25accbcdaf Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Wed, 15 Feb 2023 11:45:40 +0000 Subject: [PATCH 3/3] node: topology-mgr: code optimization Signed-off-by: Swati Sehgal --- pkg/kubelet/cm/topologymanager/topology_manager.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/kubelet/cm/topologymanager/topology_manager.go b/pkg/kubelet/cm/topologymanager/topology_manager.go index 6fb2f9e2442..567736e82d3 100644 --- a/pkg/kubelet/cm/topologymanager/topology_manager.go +++ b/pkg/kubelet/cm/topologymanager/topology_manager.go @@ -211,10 +211,9 @@ func (m *manager) RemoveContainer(containerID string) error { func (m *manager) Admit(attrs 
*lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult { klog.InfoS("Topology Admit Handler") metrics.TopologyManagerAdmissionRequestsTotal.Inc() - pod := attrs.Pod startTime := time.Now() - podAdmitResult := m.scope.Admit(pod) + podAdmitResult := m.scope.Admit(attrs.Pod) metrics.TopologyManagerAdmissionDuration.Observe(float64(time.Since(startTime).Milliseconds())) return podAdmitResult