Merge pull request #127155 from ffromani/alignment-metrics

node: metrics: add resource alignment metrics
Kubernetes Prow Robot 2024-10-23 09:54:58 +01:00 committed by GitHub
commit c6669ea7d6
8 changed files with 96 additions and 2 deletions
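
For reference, once an aligned allocation is recorded, the new counter surfaces on the kubelet /metrics endpoint roughly as below. This is an illustrative exposition-format sample, not output captured from the PR; the exact HELP text and values depend on the kubelet build and the workload:

# HELP kubelet_container_aligned_compute_resources_count [ALPHA] Cumulative number of aligned compute resources allocated to containers by alignment type.
# TYPE kubelet_container_aligned_compute_resources_count counter
kubelet_container_aligned_compute_resources_count{boundary="physical_cpu",scope="container"} 1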

View File

@@ -312,7 +312,13 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
	defer func() {
		if rerr != nil {
			metrics.CPUManagerPinningErrorsTotal.Inc()
			return
		}
		if !p.options.FullPhysicalCPUsOnly {
			// increment only if we know we allocate aligned resources
			return
		}
		metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
	}()
	if p.options.FullPhysicalCPUsOnly {
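
The hunk above leans on Go's named-result-plus-defer idiom: Allocate's error result is named (rerr), so the deferred closure observes the function's final outcome and bumps exactly one counter per call. Below is a minimal standalone sketch of that pattern; the function name, flags, and plain-int "counters" are hypothetical stand-ins, not the kubelet's actual code:

package main

import "fmt"

var pinningErrors, alignedAllocations int // stand-ins for the real counter metrics

func allocate(fullPhysicalCPUsOnly, fail bool) (rerr error) {
	// The deferred closure runs after rerr holds its final value.
	defer func() {
		if rerr != nil {
			pinningErrors++ // the allocation failed; count the error
			return
		}
		if !fullPhysicalCPUsOnly {
			return // increment only if we know we allocated aligned resources
		}
		alignedAllocations++
	}()

	if fail {
		return fmt.Errorf("not enough free physical CPUs")
	}
	return nil
}

func main() {
	_ = allocate(true, false)
	_ = allocate(true, true)
	fmt.Println(pinningErrors, alignedAllocations) // prints: 1 1
}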

View File

@@ -30,6 +30,14 @@ type Policy interface {
	Merge(providersHints []map[string][]TopologyHint) (TopologyHint, bool)
}

// IsAlignmentGuaranteed returns true if the given policy guarantees that
// the compute resources will be allocated within a NUMA boundary, or the allocation will fail entirely.
func IsAlignmentGuaranteed(p Policy) bool {
	// We are abusing the name, but at the moment this matches almost 1:1 the policy name,
	// so we are not adding new fields for now.
	return p.Name() == PolicySingleNumaNode
}

// Merge a TopologyHints permutation to a single hint by performing a bitwise-AND
// of their affinity masks. The hint shall be preferred if all hints in the permutation
// are preferred.

View File

@@ -61,6 +61,10 @@ func (s *containerScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
			metrics.TopologyManagerAdmissionErrorsTotal.Inc()
			return admission.GetPodAdmitResult(err)
		}

		if IsAlignmentGuaranteed(s.policy) {
			metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Inc()
		}
	}
	return admission.GetPodAdmitResult(nil)
}

View File

@@ -62,6 +62,10 @@ func (s *podScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
			return admission.GetPodAdmitResult(err)
		}
	}

	if IsAlignmentGuaranteed(s.policy) {
		// increment only if we know we allocate aligned resources.
		metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Inc()
	}
	return admission.GetPodAdmitResult(nil)
}
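
Taken together, the three increment sites in this change emit the following label pairs: scope="container" with boundary="physical_cpu" from the CPU manager static policy, scope="container" with boundary="numa_node" from the topology manager container scope, and scope="pod" with boundary="numa_node" from the topology manager pod scope.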

View File

@@ -127,10 +127,21 @@ const (
	// Metric for tracking garbage collected images
	ImageGarbageCollectedTotalKey = "image_garbage_collected_total"

	// Metric for tracking alignment of compute resources
	ContainerAlignedComputeResourcesNameKey          = "container_aligned_compute_resources_count"
	ContainerAlignedComputeResourcesScopeLabelKey    = "scope"
	ContainerAlignedComputeResourcesBoundaryLabelKey = "boundary"

	// Values used in metric labels
	Container          = "container"
	InitContainer      = "init_container"
	EphemeralContainer = "ephemeral_container"

	AlignScopePod       = "pod"
	AlignScopeContainer = "container"

	AlignedPhysicalCPU = "physical_cpu"
	AlignedNUMANode    = "numa_node"
)
type imageSizeBucket struct {
@@ -762,6 +773,16 @@ var (
		},
	)

	ContainerAlignedComputeResources = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           ContainerAlignedComputeResourcesNameKey,
			Help:           "Cumulative number of aligned compute resources allocated to containers by alignment type.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{ContainerAlignedComputeResourcesScopeLabelKey, ContainerAlignedComputeResourcesBoundaryLabelKey},
	)

	// MemoryManagerPinningRequestTotal tracks the number of times the pod spec required the memory manager to pin memory pages
	MemoryManagerPinningRequestTotal = metrics.NewCounter(
		&metrics.CounterOpts{
@@ -985,6 +1006,7 @@ func Register(collectors ...metrics.StableCollector) {
	legacyregistry.MustRegister(RunPodSandboxErrors)
	legacyregistry.MustRegister(CPUManagerPinningRequestsTotal)
	legacyregistry.MustRegister(CPUManagerPinningErrorsTotal)
	legacyregistry.MustRegister(ContainerAlignedComputeResources)
	if utilfeature.DefaultFeatureGate.Enabled(features.MemoryManager) {
		legacyregistry.MustRegister(MemoryManagerPinningRequestTotal)
		legacyregistry.MustRegister(MemoryManagerPinningErrorsTotal)
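
Once registered, the counter behaves like any other kubelet counter and can be aggregated at query time; for example, an illustrative PromQL query (assuming Prometheus scrapes the kubelet) that breaks alignment events down by kind:

sum(rate(kubelet_container_aligned_compute_resources_count[5m])) by (scope, boundary)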

View File

@@ -91,6 +91,7 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
	ginkgo.AfterEach(func(ctx context.Context) {
		if testPod != nil {
			deletePodSyncByName(ctx, f, testPod.Name)
			waitForContainerRemoval(ctx, testPod.Spec.Containers[0].Name, testPod.Name, testPod.Namespace)
		}
		updateKubeletConfig(ctx, f, oldCfg, true)
	})
@@ -160,11 +161,32 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
		ginkgo.By("Ensuring the metrics match the expectations a few more times")
		gomega.Consistently(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
	})

	ginkgo.It("should return updated alignment counters when a pod successfully runs", func(ctx context.Context) {
		ginkgo.By("Creating the test pod")
		testPod = e2epod.NewPodClient(f).Create(ctx, makeGuaranteedCPUExclusiveSleeperPod("count-align-smt-ok", smtLevel))

		// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
		// being [Serial], we can also assume no one else but us is running pods.
		ginkgo.By("Checking the cpumanager metrics right after the kubelet restart; the pod should be admitted")

		idFn := makeCustomPairID("scope", "boundary")
		matchAlignmentMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
			"kubelet_container_aligned_compute_resources_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
				"container::physical_cpu": timelessSample(1),
			}),
		})

		ginkgo.By("Giving the Kubelet time to update the alignment metrics")
		gomega.Eventually(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchAlignmentMetrics)
		ginkgo.By("Ensuring the metrics match the expectations about alignment metrics a few more times")
		gomega.Consistently(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchAlignmentMetrics)
	})
})
})
func getKubeletMetrics(ctx context.Context) (e2emetrics.KubeletMetrics, error) {
ginkgo.By("getting Kubelet metrics from the metrics API")
ginkgo.By("Getting Kubelet metrics from the metrics API")
return e2emetrics.GrabKubeletMetricsWithoutProxy(ctx, nodeNameOrIP()+":10255", "/metrics")
}
@@ -189,7 +211,7 @@ func makeGuaranteedCPUExclusiveSleeperPod(name string, cpus int) *v1.Pod {
							v1.ResourceMemory: resource.MustParse("64Mi"),
						},
					},
Command: []string{"sh", "-c", "sleep", "1d"},
Command: []string{"sh", "-c", "sleep 1d"},
},
},
},

View File

@@ -157,6 +157,13 @@ func containerID(element interface{}) string {
	return fmt.Sprintf("%s::%s::%s", el.Metric["namespace"], el.Metric["pod"], el.Metric["container"])
}

func makeCustomPairID(pri, sec string) func(interface{}) string {
	return func(element interface{}) string {
		el := element.(*model.Sample)
		return fmt.Sprintf("%s::%s", el.Metric[model.LabelName(pri)], el.Metric[model.LabelName(sec)])
	}
}

func boundedSample(lower, upper interface{}) types.GomegaMatcher {
	return gstruct.PointTo(gstruct.MatchAllFields(gstruct.Fields{
		// We already check Metric when matching the Id
View File

@@ -28,6 +28,7 @@ import (
	v1 "k8s.io/api/core/v1"
	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
	"k8s.io/kubernetes/pkg/kubelet/metrics"
	"k8s.io/kubernetes/test/e2e/feature"
	"k8s.io/kubernetes/test/e2e/framework"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
@@ -152,6 +153,26 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
		ginkgo.By("Ensuring the metrics match the expectations a few more times")
		gomega.Consistently(ctx, getKubeletMetrics, 2*time.Minute, 10*time.Second).Should(matchResourceMetrics)
	})

	ginkgo.It("[alignment] should return updated alignment counters when a pod successfully runs", func(ctx context.Context) {
		ginkgo.By("Creating the test pod")
		testPod = e2epod.NewPodClient(f).Create(ctx, makeGuaranteedCPUExclusiveSleeperPod("count-align-numa-ok", cpusNumPerNUMA))

		// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
		// being [Serial], we can also assume no one else but us is running pods.
		ginkgo.By("Checking the topologymanager metrics right after the kubelet restart; the pod should be admitted")

		matchAlignmentMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
			"kubelet_container_aligned_compute_resources_count": gstruct.MatchAllElements(nodeID, gstruct.Elements{
				metrics.AlignedNUMANode: timelessSample(1),
			}),
		})

		ginkgo.By("Giving the Kubelet time to update the alignment metrics")
		gomega.Eventually(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchAlignmentMetrics)
		ginkgo.By("Ensuring the metrics match the expectations about alignment metrics a few more times")
		gomega.Consistently(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchAlignmentMetrics)
	})
})
})