Merge pull request #127155 from ffromani/alignment-metrics
node: metrics: add resource alignment metrics
This commit is contained in: c6669ea7d6
@@ -312,7 +312,13 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container
 	defer func() {
 		if rerr != nil {
 			metrics.CPUManagerPinningErrorsTotal.Inc()
+			return
 		}
+		if !p.options.FullPhysicalCPUsOnly {
+			// increment only if we know we allocate aligned resources
+			return
+		}
+		metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
 	}()
 
 	if p.options.FullPhysicalCPUsOnly {
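The hunk above leans on Go's deferred closures seeing the named return value rerr: a single exit hook decides whether the pinning-error counter or the new alignment counter moves. Below is a minimal, self-contained sketch of the same pattern; it uses the upstream Prometheus client and illustrative names (allocate, pinningErrorsTotal, alignedComputeResources) rather than the kubelet's k8s.io/component-base/metrics wrappers.

package main

import (
	"errors"
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// Illustrative stand-ins for the kubelet counters touched in this hunk.
var (
	pinningErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "cpu_manager_pinning_errors_total",
	})
	alignedComputeResources = prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "container_aligned_compute_resources_count",
	}, []string{"scope", "boundary"})
)

// allocate mimics the shape of staticPolicy.Allocate: rerr is a named return
// value, so the deferred closure observes whatever the function finally returns.
func allocate(fullPhysicalCPUsOnly bool, fail bool) (rerr error) {
	defer func() {
		if rerr != nil {
			pinningErrorsTotal.Inc() // count the failed pinning request
			return
		}
		if !fullPhysicalCPUsOnly {
			// increment only if we know we allocated aligned resources
			return
		}
		alignedComputeResources.WithLabelValues("container", "physical_cpu").Inc()
	}()

	if fail {
		return errors.New("not enough free physical CPUs")
	}
	// ... the real allocation logic would live here ...
	return nil
}

func main() {
	fmt.Println(allocate(true, false)) // nil error: increments the aligned counter
	fmt.Println(allocate(true, true))  // non-nil error: increments the error counter
}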
@@ -30,6 +30,14 @@ type Policy interface {
 	Merge(providersHints []map[string][]TopologyHint) (TopologyHint, bool)
 }
 
+// IsAlignmentGuaranteed returns true if the given policy guarantees that either
+// the compute resources will be allocated within a NUMA boundary, or the allocation will fail outright.
+func IsAlignmentGuaranteed(p Policy) bool {
+	// We are abusing the name, but at the moment this matches almost 1:1 the policy name,
+	// so we are not adding new fields for now.
+	return p.Name() == PolicySingleNumaNode
+}
+
 // Merge a TopologyHints permutation to a single hint by performing a bitwise-AND
 // of their affinity masks. The hint shall be preferred if all hits in the permutation
 // are preferred.
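For context: topology manager policy names are plain strings, and single-numa-node is the only policy that refuses admission when resources cannot be placed on one NUMA node, which is why the check can be a simple name comparison. A self-contained sketch under those assumptions follows; the Policy stub and fakePolicy type are illustrative, and the real interface (in k8s.io/kubernetes/pkg/kubelet/cm/topologymanager) also carries the Merge method shown above.

package main

import "fmt"

// Policy name constants as used by the kubelet topology manager.
const (
	PolicySingleNumaNode = "single-numa-node"
	PolicyBestEffort     = "best-effort"
)

// Policy is a trimmed-down stand-in for the real interface.
type Policy interface {
	Name() string
}

type fakePolicy string

func (p fakePolicy) Name() string { return string(p) }

// IsAlignmentGuaranteed mirrors the function added in this hunk: only the
// single-numa-node policy rejects pods whose resources cannot be NUMA-aligned,
// so it is the only one for which the alignment counter may be bumped unconditionally.
func IsAlignmentGuaranteed(p Policy) bool {
	return p.Name() == PolicySingleNumaNode
}

func main() {
	fmt.Println(IsAlignmentGuaranteed(fakePolicy(PolicySingleNumaNode))) // true
	fmt.Println(IsAlignmentGuaranteed(fakePolicy(PolicyBestEffort)))     // false
}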
@@ -61,6 +61,10 @@ func (s *containerScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
 			metrics.TopologyManagerAdmissionErrorsTotal.Inc()
 			return admission.GetPodAdmitResult(err)
 		}
+
+		if IsAlignmentGuaranteed(s.policy) {
+			metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Inc()
+		}
 	}
 	return admission.GetPodAdmitResult(nil)
 }
@@ -62,6 +62,10 @@ func (s *podScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
 			return admission.GetPodAdmitResult(err)
 		}
 	}
+	if IsAlignmentGuaranteed(s.policy) {
+		// increment only if we know we allocate aligned resources.
+		metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Inc()
+	}
 	return admission.GetPodAdmitResult(nil)
 }
 
@@ -127,10 +127,21 @@ const (
 	// Metric for tracking garbage collected images
 	ImageGarbageCollectedTotalKey = "image_garbage_collected_total"
 
+	// Metric for tracking alignment of compute resources
+	ContainerAlignedComputeResourcesNameKey          = "container_aligned_compute_resources_count"
+	ContainerAlignedComputeResourcesScopeLabelKey    = "scope"
+	ContainerAlignedComputeResourcesBoundaryLabelKey = "boundary"
+
 	// Values used in metric labels
 	Container          = "container"
 	InitContainer      = "init_container"
 	EphemeralContainer = "ephemeral_container"
+
+	AlignScopePod       = "pod"
+	AlignScopeContainer = "container"
+
+	AlignedPhysicalCPU = "physical_cpu"
+	AlignedNUMANode    = "numa_node"
 )
 
 type imageSizeBucket struct {
@@ -762,6 +773,16 @@ var (
 		},
 	)
 
+	ContainerAlignedComputeResources = metrics.NewCounterVec(
+		&metrics.CounterOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           ContainerAlignedComputeResourcesNameKey,
+			Help:           "Cumulative number of aligned compute resources allocated to containers by alignment type.",
+			StabilityLevel: metrics.ALPHA,
+		},
+		[]string{ContainerAlignedComputeResourcesScopeLabelKey, ContainerAlignedComputeResourcesBoundaryLabelKey},
+	)
+
 	// MemoryManagerPinningRequestTotal tracks the number of times the pod spec required the memory manager to pin memory pages
 	MemoryManagerPinningRequestTotal = metrics.NewCounter(
 		&metrics.CounterOpts{
@@ -985,6 +1006,7 @@ func Register(collectors ...metrics.StableCollector) {
 		legacyregistry.MustRegister(RunPodSandboxErrors)
 		legacyregistry.MustRegister(CPUManagerPinningRequestsTotal)
 		legacyregistry.MustRegister(CPUManagerPinningErrorsTotal)
+		legacyregistry.MustRegister(ContainerAlignedComputeResources)
 		if utilfeature.DefaultFeatureGate.Enabled(features.MemoryManager) {
 			legacyregistry.MustRegister(MemoryManagerPinningRequestTotal)
 			legacyregistry.MustRegister(MemoryManagerPinningErrorsTotal)
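Once registered, the counter is exposed on the kubelet's /metrics endpoint with the kubelet_ subsystem prefix, which is the name the e2e tests further down scrape and match on. A rough sketch of what the exposition looks like, using the plain prometheus/client_golang registry instead of component-base/metrics and legacyregistry; the port and handler wiring are illustrative:

package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// Stand-in for ContainerAlignedComputeResources: a counter vector keyed by
	// the "scope" and "boundary" labels added in this PR.
	aligned := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "kubelet_container_aligned_compute_resources_count",
		Help: "Cumulative number of aligned compute resources allocated to containers by alignment type.",
	}, []string{"scope", "boundary"})

	reg := prometheus.NewRegistry()
	reg.MustRegister(aligned)

	// Simulate one container-scope physical-CPU alignment and one pod-scope
	// NUMA-node alignment, then serve the registry.
	aligned.WithLabelValues("container", "physical_cpu").Inc()
	aligned.WithLabelValues("pod", "numa_node").Inc()

	// A GET on /metrics then contains lines such as:
	//   kubelet_container_aligned_compute_resources_count{boundary="physical_cpu",scope="container"} 1
	//   kubelet_container_aligned_compute_resources_count{boundary="numa_node",scope="pod"} 1
	http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
	log.Fatal(http.ListenAndServe(":9999", nil))
}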
@@ -91,6 +91,7 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 	ginkgo.AfterEach(func(ctx context.Context) {
 		if testPod != nil {
 			deletePodSyncByName(ctx, f, testPod.Name)
+			waitForContainerRemoval(ctx, testPod.Spec.Containers[0].Name, testPod.Name, testPod.Namespace)
 		}
 		updateKubeletConfig(ctx, f, oldCfg, true)
 	})
@@ -160,11 +161,32 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 			ginkgo.By("Ensuring the metrics match the expectations a few more times")
 			gomega.Consistently(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
 		})
+
+		ginkgo.It("should return updated alignment counters when pod successfully run", func(ctx context.Context) {
+			ginkgo.By("Creating the test pod")
+			testPod = e2epod.NewPodClient(f).Create(ctx, makeGuaranteedCPUExclusiveSleeperPod("count-align-smt-ok", smtLevel))
+
+			// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
+			// being [Serial], we can also assume no one else but us is running pods.
+			ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted")
+
+			idFn := makeCustomPairID("scope", "boundary")
+			matchAlignmentMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
+				"kubelet_container_aligned_compute_resources_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::physical_cpu": timelessSample(1),
+				}),
+			})
+
+			ginkgo.By("Giving the Kubelet time to update the alignment metrics")
+			gomega.Eventually(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchAlignmentMetrics)
+			ginkgo.By("Ensuring the metrics match the expectations about alignment metrics a few more times")
+			gomega.Consistently(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchAlignmentMetrics)
+		})
 	})
 })
 
 func getKubeletMetrics(ctx context.Context) (e2emetrics.KubeletMetrics, error) {
-	ginkgo.By("getting Kubelet metrics from the metrics API")
+	ginkgo.By("Getting Kubelet metrics from the metrics API")
 	return e2emetrics.GrabKubeletMetricsWithoutProxy(ctx, nodeNameOrIP()+":10255", "/metrics")
 }
 
@@ -189,7 +211,7 @@ func makeGuaranteedCPUExclusiveSleeperPod(name string, cpus int) *v1.Pod {
 							v1.ResourceMemory: resource.MustParse("64Mi"),
 						},
 					},
-					Command: []string{"sh", "-c", "sleep", "1d"},
+					Command: []string{"sh", "-c", "sleep 1d"},
 				},
 			},
 		},
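The one-line Command change above fixes a subtle bug: with sh -c, only the argument immediately after -c is the command string, and anything after it becomes a positional parameter ($0, $1, ...). {"sh", "-c", "sleep", "1d"} therefore runs sleep with no operand (the "1d" only populates $0), so the sleeper container exits almost immediately, whereas {"sh", "-c", "sleep 1d"} sleeps for a day as intended. A small sketch of the difference using os/exec locally rather than a pod spec; the durations are shortened so the example terminates:

package main

import (
	"fmt"
	"os/exec"
	"time"
)

// timeRun reports how long the given argv takes to finish and whether it failed.
func timeRun(argv []string) {
	start := time.Now()
	err := exec.Command(argv[0], argv[1:]...).Run()
	fmt.Printf("%v -> err=%v after %v\n", argv, err, time.Since(start).Round(time.Millisecond))
}

func main() {
	// Broken form: "1d" only becomes $0, sleep has no operand and fails fast.
	timeRun([]string{"sh", "-c", "sleep", "1d"})

	// Fixed form: the whole command string is handed to the shell and the sleep really happens.
	timeRun([]string{"sh", "-c", "sleep 2"})
}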
|
@ -157,6 +157,13 @@ func containerID(element interface{}) string {
|
|||||||
return fmt.Sprintf("%s::%s::%s", el.Metric["namespace"], el.Metric["pod"], el.Metric["container"])
|
return fmt.Sprintf("%s::%s::%s", el.Metric["namespace"], el.Metric["pod"], el.Metric["container"])
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func makeCustomPairID(pri, sec string) func(interface{}) string {
|
||||||
|
return func(element interface{}) string {
|
||||||
|
el := element.(*model.Sample)
|
||||||
|
return fmt.Sprintf("%s::%s", el.Metric[model.LabelName(pri)], el.Metric[model.LabelName(sec)])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func boundedSample(lower, upper interface{}) types.GomegaMatcher {
|
func boundedSample(lower, upper interface{}) types.GomegaMatcher {
|
||||||
return gstruct.PointTo(gstruct.MatchAllFields(gstruct.Fields{
|
return gstruct.PointTo(gstruct.MatchAllFields(gstruct.Fields{
|
||||||
// We already check Metric when matching the Id
|
// We already check Metric when matching the Id
|
||||||
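makeCustomPairID sits alongside containerID above: it returns a gstruct element-ID function that keys each Prometheus sample by two label values joined with "::", which is how the CPU manager test addresses the scope="container", boundary="physical_cpu" sample as "container::physical_cpu". A standalone sketch of that behaviour on a prometheus/common model.Sample; the sample contents here are illustrative:

package main

import (
	"fmt"

	"github.com/prometheus/common/model"
)

// makeCustomPairID, as in the diff above: builds a gstruct ID function that
// joins two label values with "::".
func makeCustomPairID(pri, sec string) func(interface{}) string {
	return func(element interface{}) string {
		el := element.(*model.Sample)
		return fmt.Sprintf("%s::%s", el.Metric[model.LabelName(pri)], el.Metric[model.LabelName(sec)])
	}
}

func main() {
	sample := &model.Sample{
		Metric: model.Metric{
			"__name__": "kubelet_container_aligned_compute_resources_count",
			"scope":    "container",
			"boundary": "physical_cpu",
		},
		Value: 1,
	}

	idFn := makeCustomPairID("scope", "boundary")
	fmt.Println(idFn(sample)) // prints: container::physical_cpu
}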
|
@ -28,6 +28,7 @@ import (
|
|||||||
v1 "k8s.io/api/core/v1"
|
v1 "k8s.io/api/core/v1"
|
||||||
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
|
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
|
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
|
||||||
|
"k8s.io/kubernetes/pkg/kubelet/metrics"
|
||||||
"k8s.io/kubernetes/test/e2e/feature"
|
"k8s.io/kubernetes/test/e2e/feature"
|
||||||
"k8s.io/kubernetes/test/e2e/framework"
|
"k8s.io/kubernetes/test/e2e/framework"
|
||||||
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
|
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
|
||||||
@@ -152,6 +153,26 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 			ginkgo.By("Ensuring the metrics match the expectations a few more times")
 			gomega.Consistently(ctx, getKubeletMetrics, 2*time.Minute, 10*time.Second).Should(matchResourceMetrics)
 		})
+
+		ginkgo.It("[alignment] should return updated alignment counters when pod successfully run", func(ctx context.Context) {
+			ginkgo.By("Creating the test pod")
+			testPod = e2epod.NewPodClient(f).Create(ctx, makeGuaranteedCPUExclusiveSleeperPod("count-align-numa-ok", cpusNumPerNUMA))
+
+			// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
+			// being [Serial], we can also assume no one else but us is running pods.
+			ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted")
+
+			matchAlignmentMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
+				"kubelet_container_aligned_compute_resources_count": gstruct.MatchAllElements(nodeID, gstruct.Elements{
+					metrics.AlignedNUMANode: timelessSample(1),
+				}),
+			})
+
+			ginkgo.By("Giving the Kubelet time to update the alignment metrics")
+			gomega.Eventually(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchAlignmentMetrics)
+			ginkgo.By("Ensuring the metrics match the expectations about alignment metrics a few more times")
+			gomega.Consistently(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchAlignmentMetrics)
+		})
 	})
 })
 