Merge pull request #115590 from swatisehgal/topology-mgr-duration-metrics

node: topology-mgr: Add metric to measure topology manager admission latency
This commit is contained in:
Kubernetes Prow Robot 2023-02-15 07:12:25 -08:00 committed by GitHub
commit e18fa74551
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 39 additions and 4 deletions

View File

@ -18,6 +18,7 @@ package topologymanager
import (
"fmt"
"time"
cadvisorapi "github.com/google/cadvisor/info/v1"
"k8s.io/api/core/v1"
@ -209,9 +210,11 @@ func (m *manager) RemoveContainer(containerID string) error {
// Admit is the topology manager's pod admission handler. It increments the
// admission-request counter, delegates the admission decision for attrs.Pod to
// the configured scope, and records how long the scope took to decide in the
// TopologyManagerAdmissionDuration histogram (in milliseconds).
func (m *manager) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
	klog.InfoS("Topology Admit Handler")
	metrics.TopologyManagerAdmissionRequestsTotal.Inc()

	// The scope must be invoked exactly once, between the two clock reads;
	// returning its result directly would skip the duration observation.
	startTime := time.Now()
	podAdmitResult := m.scope.Admit(attrs.Pod)

	// Convert with float division instead of Duration.Milliseconds(): the latter
	// truncates to whole milliseconds, which would report every sub-millisecond
	// admission as 0 and waste the histogram's fractional-millisecond buckets.
	elapsedMs := float64(time.Since(startTime)) / float64(time.Millisecond)
	metrics.TopologyManagerAdmissionDuration.Observe(elapsedMs)

	return podAdmitResult
}

View File

@ -94,6 +94,7 @@ const (
// Metrics to track the Topology manager behavior
TopologyManagerAdmissionRequestsTotalKey = "topology_manager_admission_requests_total"
TopologyManagerAdmissionErrorsTotalKey = "topology_manager_admission_errors_total"
TopologyManagerAdmissionDurationKey = "topology_manager_admission_duration_ms"
// Values used in metric labels
Container = "container"
@ -573,6 +574,17 @@ var (
StabilityLevel: metrics.ALPHA,
},
)
// TopologyManagerAdmissionDuration is a Histogram that tracks the duration (in milliseconds) to serve a pod admission request.
// The unit is milliseconds, matching the "_ms" suffix of the metric name and the Help text below.
TopologyManagerAdmissionDuration = metrics.NewHistogram(
&metrics.HistogramOpts{
Subsystem: KubeletSubsystem,
Name: TopologyManagerAdmissionDurationKey,
Help: "Duration in milliseconds to serve a pod admission request.",
// 15 buckets whose upper bounds start at 0.05 and double each step (0.05 ... 819.2), in milliseconds.
Buckets: metrics.ExponentialBuckets(.05, 2, 15),
StabilityLevel: metrics.ALPHA,
},
)
)
var registerMetrics sync.Once
@ -626,6 +638,7 @@ func Register(collectors ...metrics.StableCollector) {
legacyregistry.MustRegister(CPUManagerPinningErrorsTotal)
legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal)
legacyregistry.MustRegister(TopologyManagerAdmissionErrorsTotal)
legacyregistry.MustRegister(TopologyManagerAdmissionDuration)
for _, collector := range collectors {
legacyregistry.CustomMustRegister(collector)

View File

@ -23,6 +23,7 @@ import (
"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
"github.com/onsi/gomega/gstruct"
"github.com/onsi/gomega/types"
v1 "k8s.io/api/core/v1"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
@ -85,6 +86,9 @@ var _ = SIGDescribe("Topology Manager Metrics [Serial][Feature:TopologyManager]"
"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
"": timelessSample(0),
}),
"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
"": timelessSample(0),
}),
})
ginkgo.By("Giving the Kubelet time to start up and produce metrics")
@ -108,6 +112,9 @@ var _ = SIGDescribe("Topology Manager Metrics [Serial][Feature:TopologyManager]"
"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
"": timelessSample(1),
}),
"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
"": checkMetricValueGreaterThan(0),
}),
})
ginkgo.By("Giving the Kubelet time to start up and produce metrics")
@ -122,7 +129,7 @@ var _ = SIGDescribe("Topology Manager Metrics [Serial][Feature:TopologyManager]"
// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
// being [Serial], we can also assume no one else but us is running pods.
ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted")
ginkgo.By("Checking the topologymanager metrics right after the kubelet restart, with pod should be admitted")
matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
"kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
@ -131,6 +138,9 @@ var _ = SIGDescribe("Topology Manager Metrics [Serial][Feature:TopologyManager]"
"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
"": timelessSample(0),
}),
"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
"": checkMetricValueGreaterThan(0),
}),
})
ginkgo.By("Giving the Kubelet time to start up and produce metrics")
@ -157,3 +167,12 @@ func hostCheck() (int, int) {
return numaNodes, coreCount
}
// checkMetricValueGreaterThan builds a matcher for a pointer to a metric sample
// that succeeds when the sample's Value field is numerically greater than value.
// The Metric and Timestamp fields are accepted unconditionally.
func checkMetricValueGreaterThan(value interface{}) types.GomegaMatcher {
	sampleFields := gstruct.Fields{
		// Metric is already verified when the sample is matched by its ID.
		"Metric":    gstruct.Ignore(),
		"Value":     gomega.BeNumerically(">", value),
		"Timestamp": gstruct.Ignore(),
	}
	return gstruct.PointTo(gstruct.MatchAllFields(sampleFields))
}