mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-20 10:20:51 +00:00
Merge pull request #115137 from swatisehgal/topologymgr-metrics
node: topologymgr: add metrics about admission requests and errors
This commit is contained in:
commit
4df945853e
@ -22,6 +22,7 @@ import (
|
||||
"k8s.io/kubernetes/pkg/kubelet/cm/admission"
|
||||
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
|
||||
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
|
||||
"k8s.io/kubernetes/pkg/kubelet/metrics"
|
||||
)
|
||||
|
||||
type containerScope struct {
|
||||
@ -54,6 +55,7 @@ func (s *containerScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
|
||||
klog.InfoS("Best TopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod), "containerName", container.Name)
|
||||
|
||||
if !admit {
|
||||
metrics.TopologyManagerAdmissionErrorsTotal.Inc()
|
||||
return admission.GetPodAdmitResult(&TopologyAffinityError{})
|
||||
}
|
||||
klog.InfoS("Topology Affinity", "bestHint", bestHint, "pod", klog.KObj(pod), "containerName", container.Name)
|
||||
@ -61,6 +63,7 @@ func (s *containerScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
|
||||
|
||||
err := s.allocateAlignedResources(pod, &container)
|
||||
if err != nil {
|
||||
metrics.TopologyManagerAdmissionErrorsTotal.Inc()
|
||||
return admission.GetPodAdmitResult(err)
|
||||
}
|
||||
}
|
||||
|
@ -22,6 +22,7 @@ import (
|
||||
"k8s.io/kubernetes/pkg/kubelet/cm/admission"
|
||||
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
|
||||
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
|
||||
"k8s.io/kubernetes/pkg/kubelet/metrics"
|
||||
)
|
||||
|
||||
type podScope struct {
|
||||
@ -52,6 +53,7 @@ func (s *podScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
|
||||
bestHint, admit := s.calculateAffinity(pod)
|
||||
klog.InfoS("Best TopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod))
|
||||
if !admit {
|
||||
metrics.TopologyManagerAdmissionErrorsTotal.Inc()
|
||||
return admission.GetPodAdmitResult(&TopologyAffinityError{})
|
||||
}
|
||||
|
||||
@ -61,6 +63,7 @@ func (s *podScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
|
||||
|
||||
err := s.allocateAlignedResources(pod, &container)
|
||||
if err != nil {
|
||||
metrics.TopologyManagerAdmissionErrorsTotal.Inc()
|
||||
return admission.GetPodAdmitResult(err)
|
||||
}
|
||||
}
|
||||
|
@ -24,6 +24,7 @@ import (
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
|
||||
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
|
||||
"k8s.io/kubernetes/pkg/kubelet/metrics"
|
||||
)
|
||||
|
||||
const (
|
||||
@ -208,6 +209,8 @@ func (m *manager) RemoveContainer(containerID string) error {
|
||||
|
||||
func (m *manager) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
|
||||
klog.InfoS("Topology Admit Handler")
|
||||
|
||||
metrics.TopologyManagerAdmissionRequestsTotal.Inc()
|
||||
pod := attrs.Pod
|
||||
|
||||
return m.scope.Admit(pod)
|
||||
|
@ -91,6 +91,10 @@ const (
|
||||
CPUManagerPinningRequestsTotalKey = "cpu_manager_pinning_requests_total"
|
||||
CPUManagerPinningErrorsTotalKey = "cpu_manager_pinning_errors_total"
|
||||
|
||||
// Metrics to track the Topology manager behavior
|
||||
TopologyManagerAdmissionRequestsTotalKey = "topology_manager_admission_requests_total"
|
||||
TopologyManagerAdmissionErrorsTotalKey = "topology_manager_admission_errors_total"
|
||||
|
||||
// Values used in metric labels
|
||||
Container = "container"
|
||||
InitContainer = "init_container"
|
||||
@ -549,6 +553,26 @@ var (
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
)
|
||||
|
||||
// TopologyManagerAdmissionRequestsTotal tracks the number of times the pod spec will cause the topology manager to admit a pod
|
||||
TopologyManagerAdmissionRequestsTotal = metrics.NewCounter(
|
||||
&metrics.CounterOpts{
|
||||
Subsystem: KubeletSubsystem,
|
||||
Name: TopologyManagerAdmissionRequestsTotalKey,
|
||||
Help: "The number of admission requests where resources have to be aligned.",
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
)
|
||||
|
||||
// TopologyManagerAdmissionErrorsTotal tracks the number of times the pod spec required the topology manager to admit a pod, but the admission failed
|
||||
TopologyManagerAdmissionErrorsTotal = metrics.NewCounter(
|
||||
&metrics.CounterOpts{
|
||||
Subsystem: KubeletSubsystem,
|
||||
Name: TopologyManagerAdmissionErrorsTotalKey,
|
||||
Help: "The number of admission request failures where resources could not be aligned.",
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
var registerMetrics sync.Once
|
||||
@ -600,6 +624,8 @@ func Register(collectors ...metrics.StableCollector) {
|
||||
legacyregistry.MustRegister(RunPodSandboxErrors)
|
||||
legacyregistry.MustRegister(CPUManagerPinningRequestsTotal)
|
||||
legacyregistry.MustRegister(CPUManagerPinningErrorsTotal)
|
||||
legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal)
|
||||
legacyregistry.MustRegister(TopologyManagerAdmissionErrorsTotal)
|
||||
|
||||
for _, collector := range collectors {
|
||||
legacyregistry.CustomMustRegister(collector)
|
||||
|
@ -109,9 +109,9 @@ var _ = SIGDescribe("CPU Manager Metrics [Serial][Feature:CPUManager]", func() {
|
||||
})
|
||||
|
||||
ginkgo.By("Giving the Kubelet time to start up and produce metrics")
|
||||
gomega.Eventually(ctx, getCPUManagerMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
|
||||
gomega.Eventually(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
|
||||
ginkgo.By("Ensuring the metrics match the expectations a few more times")
|
||||
gomega.Consistently(ctx, getCPUManagerMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
|
||||
gomega.Consistently(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
|
||||
})
|
||||
|
||||
ginkgo.It("should report pinning failures when the cpumanager allocation is known to fail", func(ctx context.Context) {
|
||||
@ -132,9 +132,9 @@ var _ = SIGDescribe("CPU Manager Metrics [Serial][Feature:CPUManager]", func() {
|
||||
})
|
||||
|
||||
ginkgo.By("Giving the Kubelet time to start up and produce metrics")
|
||||
gomega.Eventually(ctx, getCPUManagerMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
|
||||
gomega.Eventually(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
|
||||
ginkgo.By("Ensuring the metrics match the expectations a few more times")
|
||||
gomega.Consistently(ctx, getCPUManagerMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
|
||||
gomega.Consistently(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
|
||||
})
|
||||
|
||||
ginkgo.It("should not report any pinning failures when the cpumanager allocation is expected to succeed", func(ctx context.Context) {
|
||||
@ -155,16 +155,15 @@ var _ = SIGDescribe("CPU Manager Metrics [Serial][Feature:CPUManager]", func() {
|
||||
})
|
||||
|
||||
ginkgo.By("Giving the Kubelet time to start up and produce metrics")
|
||||
gomega.Eventually(ctx, getCPUManagerMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
|
||||
gomega.Eventually(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
|
||||
ginkgo.By("Ensuring the metrics match the expectations a few more times")
|
||||
gomega.Consistently(ctx, getCPUManagerMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
|
||||
gomega.Consistently(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
func getCPUManagerMetrics(ctx context.Context) (e2emetrics.KubeletMetrics, error) {
|
||||
// we are running out of good names, so we need to be unnecessarily specific to avoid clashes
|
||||
ginkgo.By("getting CPU Manager metrics from the metrics API")
|
||||
func getKubeletMetrics(ctx context.Context) (e2emetrics.KubeletMetrics, error) {
|
||||
ginkgo.By("getting Kubelet metrics from the metrics API")
|
||||
return e2emetrics.GrabKubeletMetricsWithoutProxy(ctx, framework.TestContext.NodeName+":10255", "/metrics")
|
||||
}
|
||||
|
||||
|
159
test/e2e_node/topology_manager_metrics_test.go
Normal file
159
test/e2e_node/topology_manager_metrics_test.go
Normal file
@ -0,0 +1,159 @@
|
||||
/*
|
||||
Copyright 2023 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package e2enode
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"github.com/onsi/ginkgo/v2"
|
||||
"github.com/onsi/gomega"
|
||||
"github.com/onsi/gomega/gstruct"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
|
||||
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
|
||||
"k8s.io/kubernetes/test/e2e/framework"
|
||||
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
|
||||
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
|
||||
admissionapi "k8s.io/pod-security-admission/api"
|
||||
)
|
||||
|
||||
var _ = SIGDescribe("Topology Manager Metrics [Serial][Feature:TopologyManager]", func() {
|
||||
f := framework.NewDefaultFramework("topologymanager-metrics")
|
||||
f.NamespacePodSecurityEnforceLevel = admissionapi.LevelPrivileged
|
||||
|
||||
ginkgo.Context("when querying /metrics", func() {
|
||||
var oldCfg *kubeletconfig.KubeletConfiguration
|
||||
var testPod *v1.Pod
|
||||
var cpusNumPerNUMA, numaNodes int
|
||||
|
||||
ginkgo.BeforeEach(func(ctx context.Context) {
|
||||
var err error
|
||||
if oldCfg == nil {
|
||||
oldCfg, err = getCurrentKubeletConfig(ctx)
|
||||
framework.ExpectNoError(err)
|
||||
}
|
||||
|
||||
numaNodes, cpusNumPerNUMA = hostCheck()
|
||||
|
||||
// It is safe to assume that the CPUs are distributed equally across
|
||||
// NUMA nodes and therefore number of CPUs on all NUMA nodes are same
|
||||
// so we just check the CPUs on the first NUMA node
|
||||
|
||||
framework.Logf("numaNodes on the system %d", numaNodes)
|
||||
framework.Logf("CPUs per NUMA on the system %d", cpusNumPerNUMA)
|
||||
|
||||
policy := topologymanager.PolicySingleNumaNode
|
||||
scope := podScopeTopology
|
||||
|
||||
newCfg, _ := configureTopologyManagerInKubelet(oldCfg, policy, scope, nil, 0)
|
||||
updateKubeletConfig(ctx, f, newCfg, true)
|
||||
|
||||
})
|
||||
|
||||
ginkgo.AfterEach(func(ctx context.Context) {
|
||||
if testPod != nil {
|
||||
deletePodSyncByName(ctx, f, testPod.Name)
|
||||
}
|
||||
updateKubeletConfig(ctx, f, oldCfg, true)
|
||||
})
|
||||
|
||||
ginkgo.It("should report zero admission counters after a fresh restart", func(ctx context.Context) {
|
||||
// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
|
||||
// being [Serial], we can also assume noone else but us is running pods.
|
||||
ginkgo.By("Checking the topologymanager metrics right after the kubelet restart, with no pods running")
|
||||
|
||||
matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
|
||||
"kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
|
||||
"": timelessSample(0),
|
||||
}),
|
||||
"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
|
||||
"": timelessSample(0),
|
||||
}),
|
||||
})
|
||||
|
||||
ginkgo.By("Giving the Kubelet time to start up and produce metrics")
|
||||
gomega.Eventually(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
|
||||
ginkgo.By("Ensuring the metrics match the expectations a few more times")
|
||||
gomega.Consistently(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
|
||||
})
|
||||
|
||||
ginkgo.It("should report admission failures when the topology manager alignment is known to fail", func(ctx context.Context) {
|
||||
ginkgo.By("Creating the test pod which will be rejected for TopologyAffinity")
|
||||
testPod = e2epod.NewPodClient(f).Create(ctx, makeGuaranteedCPUExclusiveSleeperPod("topology-affinity-err", cpusNumPerNUMA+1))
|
||||
|
||||
// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
|
||||
// being [Serial], we can also assume noone else but us is running pods.
|
||||
ginkgo.By("Checking the topologymanager metrics right after the kubelet restart, with pod failed to admit")
|
||||
|
||||
matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
|
||||
"kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
|
||||
"": timelessSample(1),
|
||||
}),
|
||||
"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
|
||||
"": timelessSample(1),
|
||||
}),
|
||||
})
|
||||
|
||||
ginkgo.By("Giving the Kubelet time to start up and produce metrics")
|
||||
gomega.Eventually(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
|
||||
ginkgo.By("Ensuring the metrics match the expectations a few more times")
|
||||
gomega.Consistently(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
|
||||
})
|
||||
|
||||
ginkgo.It("should not report any admission failures when the topology manager alignment is expected to succeed", func(ctx context.Context) {
|
||||
ginkgo.By("Creating the test pod")
|
||||
testPod = e2epod.NewPodClient(f).Create(ctx, makeGuaranteedCPUExclusiveSleeperPod("topology-alignment-ok", cpusNumPerNUMA))
|
||||
|
||||
// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
|
||||
// being [Serial], we can also assume noone else but us is running pods.
|
||||
ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted")
|
||||
|
||||
matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
|
||||
"kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
|
||||
"": timelessSample(1),
|
||||
}),
|
||||
"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
|
||||
"": timelessSample(0),
|
||||
}),
|
||||
})
|
||||
|
||||
ginkgo.By("Giving the Kubelet time to start up and produce metrics")
|
||||
gomega.Eventually(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
|
||||
ginkgo.By("Ensuring the metrics match the expectations a few more times")
|
||||
gomega.Consistently(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
func hostCheck() (int, int) {
|
||||
// this is a very rough check. We just want to rule out system that does NOT have
|
||||
// multi-NUMA nodes or at least 4 cores
|
||||
|
||||
numaNodes := detectNUMANodes()
|
||||
if numaNodes < minNumaNodes {
|
||||
e2eskipper.Skipf("this test is intended to be run on a multi-node NUMA system")
|
||||
}
|
||||
|
||||
coreCount := detectCoresPerSocket()
|
||||
if coreCount < minCoreCount {
|
||||
e2eskipper.Skipf("this test is intended to be run on a system with at least %d cores per socket", minCoreCount)
|
||||
}
|
||||
|
||||
return numaNodes, coreCount
|
||||
}
|
Loading…
Reference in New Issue
Block a user