mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-08-07 11:13:48 +00:00
Merge pull request #121778 from Tal-or/mm_metrics
kubelet: memorymanager: metrics: add metrics about static allocation
This commit is contained in:
commit
0f7cc6fcaa
@ -34,6 +34,7 @@ import (
|
|||||||
"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager/state"
|
"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager/state"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
|
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
|
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
|
||||||
|
"k8s.io/kubernetes/pkg/kubelet/metrics"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/types"
|
"k8s.io/kubernetes/pkg/kubelet/types"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -95,7 +96,7 @@ func (p *staticPolicy) Start(s state.State) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Allocate call is idempotent
|
// Allocate call is idempotent
|
||||||
func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) error {
|
func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) (rerr error) {
|
||||||
// allocate the memory only for guaranteed pods
|
// allocate the memory only for guaranteed pods
|
||||||
if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
|
if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
|
||||||
return nil
|
return nil
|
||||||
@ -103,6 +104,13 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
|
|||||||
|
|
||||||
podUID := string(pod.UID)
|
podUID := string(pod.UID)
|
||||||
klog.InfoS("Allocate", "pod", klog.KObj(pod), "containerName", container.Name)
|
klog.InfoS("Allocate", "pod", klog.KObj(pod), "containerName", container.Name)
|
||||||
|
// container belongs in an exclusively allocated pool
|
||||||
|
metrics.MemoryManagerPinningRequestTotal.Inc()
|
||||||
|
defer func() {
|
||||||
|
if rerr != nil {
|
||||||
|
metrics.MemoryManagerPinningErrorsTotal.Inc()
|
||||||
|
}
|
||||||
|
}()
|
||||||
if blocks := s.GetMemoryBlocks(podUID, container.Name); blocks != nil {
|
if blocks := s.GetMemoryBlocks(podUID, container.Name); blocks != nil {
|
||||||
p.updatePodReusableMemory(pod, container, blocks)
|
p.updatePodReusableMemory(pod, container, blocks)
|
||||||
|
|
||||||
|
@ -109,6 +109,10 @@ const (
|
|||||||
CPUManagerPinningRequestsTotalKey = "cpu_manager_pinning_requests_total"
|
CPUManagerPinningRequestsTotalKey = "cpu_manager_pinning_requests_total"
|
||||||
CPUManagerPinningErrorsTotalKey = "cpu_manager_pinning_errors_total"
|
CPUManagerPinningErrorsTotalKey = "cpu_manager_pinning_errors_total"
|
||||||
|
|
||||||
|
// Metrics to track the Memory manager behavior
|
||||||
|
MemoryManagerPinningRequestsTotalKey = "memory_manager_pinning_requests_total"
|
||||||
|
MemoryManagerPinningErrorsTotalKey = "memory_manager_pinning_errors_total"
|
||||||
|
|
||||||
// Metrics to track the Topology manager behavior
|
// Metrics to track the Topology manager behavior
|
||||||
TopologyManagerAdmissionRequestsTotalKey = "topology_manager_admission_requests_total"
|
TopologyManagerAdmissionRequestsTotalKey = "topology_manager_admission_requests_total"
|
||||||
TopologyManagerAdmissionErrorsTotalKey = "topology_manager_admission_errors_total"
|
TopologyManagerAdmissionErrorsTotalKey = "topology_manager_admission_errors_total"
|
||||||
@ -742,6 +746,25 @@ var (
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// MemoryManagerPinningRequestTotal tracks the number of times the pod spec required the memory manager to pin memory pages
|
||||||
|
MemoryManagerPinningRequestTotal = metrics.NewCounter(
|
||||||
|
&metrics.CounterOpts{
|
||||||
|
Subsystem: KubeletSubsystem,
|
||||||
|
Name: MemoryManagerPinningRequestsTotalKey,
|
||||||
|
Help: "The number of memory pages allocations which required pinning.",
|
||||||
|
StabilityLevel: metrics.ALPHA,
|
||||||
|
})
|
||||||
|
|
||||||
|
// MemoryManagerPinningErrorsTotal tracks the number of times the pod spec required the memory manager to pin memory pages, but the allocation failed
|
||||||
|
MemoryManagerPinningErrorsTotal = metrics.NewCounter(
|
||||||
|
&metrics.CounterOpts{
|
||||||
|
Subsystem: KubeletSubsystem,
|
||||||
|
Name: MemoryManagerPinningErrorsTotalKey,
|
||||||
|
Help: "The number of memory pages allocations which required pinning that failed.",
|
||||||
|
StabilityLevel: metrics.ALPHA,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
// TopologyManagerAdmissionRequestsTotal tracks the number of times the pod spec will cause the topology manager to admit a pod
|
// TopologyManagerAdmissionRequestsTotal tracks the number of times the pod spec will cause the topology manager to admit a pod
|
||||||
TopologyManagerAdmissionRequestsTotal = metrics.NewCounter(
|
TopologyManagerAdmissionRequestsTotal = metrics.NewCounter(
|
||||||
&metrics.CounterOpts{
|
&metrics.CounterOpts{
|
||||||
@ -935,6 +958,10 @@ func Register(collectors ...metrics.StableCollector) {
|
|||||||
legacyregistry.MustRegister(RunPodSandboxErrors)
|
legacyregistry.MustRegister(RunPodSandboxErrors)
|
||||||
legacyregistry.MustRegister(CPUManagerPinningRequestsTotal)
|
legacyregistry.MustRegister(CPUManagerPinningRequestsTotal)
|
||||||
legacyregistry.MustRegister(CPUManagerPinningErrorsTotal)
|
legacyregistry.MustRegister(CPUManagerPinningErrorsTotal)
|
||||||
|
if utilfeature.DefaultFeatureGate.Enabled(features.MemoryManager) {
|
||||||
|
legacyregistry.MustRegister(MemoryManagerPinningRequestTotal)
|
||||||
|
legacyregistry.MustRegister(MemoryManagerPinningErrorsTotal)
|
||||||
|
}
|
||||||
legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal)
|
legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal)
|
||||||
legacyregistry.MustRegister(TopologyManagerAdmissionErrorsTotal)
|
legacyregistry.MustRegister(TopologyManagerAdmissionErrorsTotal)
|
||||||
legacyregistry.MustRegister(TopologyManagerAdmissionDuration)
|
legacyregistry.MustRegister(TopologyManagerAdmissionDuration)
|
||||||
|
155
test/e2e_node/memory_manager_metrics_test.go
Normal file
155
test/e2e_node/memory_manager_metrics_test.go
Normal file
@ -0,0 +1,155 @@
|
|||||||
|
//go:build linux
|
||||||
|
|
||||||
|
/*
|
||||||
|
Copyright 2023 The Kubernetes Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package e2enode
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/onsi/ginkgo/v2"
|
||||||
|
"github.com/onsi/gomega"
|
||||||
|
"github.com/onsi/gomega/gstruct"
|
||||||
|
v1 "k8s.io/api/core/v1"
|
||||||
|
"k8s.io/apimachinery/pkg/api/resource"
|
||||||
|
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
|
||||||
|
"k8s.io/kubernetes/test/e2e/feature"
|
||||||
|
"k8s.io/kubernetes/test/e2e/framework"
|
||||||
|
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
|
||||||
|
admissionapi "k8s.io/pod-security-admission/api"
|
||||||
|
)
|
||||||
|
|
||||||
|
var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.MemoryManager, func() {
|
||||||
|
f := framework.NewDefaultFramework("memorymanager-metrics")
|
||||||
|
f.NamespacePodSecurityEnforceLevel = admissionapi.LevelPrivileged
|
||||||
|
|
||||||
|
ginkgo.Context("when querying /metrics", func() {
|
||||||
|
var testPod *v1.Pod
|
||||||
|
|
||||||
|
ginkgo.BeforeEach(func(ctx context.Context) {
|
||||||
|
var oldCfg *kubeletconfig.KubeletConfiguration
|
||||||
|
var err error
|
||||||
|
if oldCfg == nil {
|
||||||
|
oldCfg, err = getCurrentKubeletConfig(ctx)
|
||||||
|
framework.ExpectNoError(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
newCfg := oldCfg.DeepCopy()
|
||||||
|
updateKubeletConfigWithMemoryManagerParams(newCfg,
|
||||||
|
&memoryManagerKubeletParams{
|
||||||
|
policy: staticPolicy,
|
||||||
|
systemReservedMemory: []kubeletconfig.MemoryReservation{
|
||||||
|
{
|
||||||
|
NumaNode: 0,
|
||||||
|
Limits: v1.ResourceList{
|
||||||
|
resourceMemory: resource.MustParse("1100Mi"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
systemReserved: map[string]string{resourceMemory: "500Mi"},
|
||||||
|
kubeReserved: map[string]string{resourceMemory: "500Mi"},
|
||||||
|
evictionHard: map[string]string{evictionHardMemory: "100Mi"},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
updateKubeletConfig(ctx, f, newCfg, true)
|
||||||
|
ginkgo.DeferCleanup(func(ctx context.Context) {
|
||||||
|
if testPod != nil {
|
||||||
|
deletePodSyncByName(ctx, f, testPod.Name)
|
||||||
|
}
|
||||||
|
updateKubeletConfig(ctx, f, oldCfg, true)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
ginkgo.It("should report zero pinning counters after a fresh restart", func(ctx context.Context) {
|
||||||
|
// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
|
||||||
|
// being [Serial], we can also assume no one else but us is running pods.
|
||||||
|
ginkgo.By("Checking the memorymanager metrics right after the kubelet restart, with no pods running")
|
||||||
|
|
||||||
|
matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
|
||||||
|
"kubelet_memory_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
|
||||||
|
"": timelessSample(0),
|
||||||
|
}),
|
||||||
|
"kubelet_memory_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
|
||||||
|
"": timelessSample(0),
|
||||||
|
}),
|
||||||
|
})
|
||||||
|
|
||||||
|
ginkgo.By("Giving the Kubelet time to start up and produce metrics")
|
||||||
|
gomega.Eventually(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
|
||||||
|
ginkgo.By("Ensuring the metrics match the expectations a few more times")
|
||||||
|
gomega.Consistently(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
|
||||||
|
})
|
||||||
|
|
||||||
|
ginkgo.It("should report pinning failures when the memorymanager allocation is known to fail", func(ctx context.Context) {
|
||||||
|
ginkgo.By("Creating the test pod which will be rejected for memory request which is too big")
|
||||||
|
testPod = e2epod.NewPodClient(f).Create(ctx, makeMemoryManagerPod("memmngrpod", nil,
|
||||||
|
[]memoryManagerCtnAttributes{
|
||||||
|
{
|
||||||
|
ctnName: "memmngrcnt",
|
||||||
|
cpus: "100m",
|
||||||
|
memory: "1000Gi"},
|
||||||
|
}))
|
||||||
|
|
||||||
|
// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
|
||||||
|
// being [Serial], we can also assume no one else but us is running pods.
|
||||||
|
ginkgo.By("Checking the memorymanager metrics right after the kubelet restart, with pod failed to admit")
|
||||||
|
|
||||||
|
matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
|
||||||
|
"kubelet_memory_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
|
||||||
|
"": timelessSample(1),
|
||||||
|
}),
|
||||||
|
"kubelet_memory_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
|
||||||
|
"": timelessSample(1),
|
||||||
|
}),
|
||||||
|
})
|
||||||
|
|
||||||
|
ginkgo.By("Giving the Kubelet time to start up and produce metrics")
|
||||||
|
gomega.Eventually(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
|
||||||
|
ginkgo.By("Ensuring the metrics match the expectations a few more times")
|
||||||
|
gomega.Consistently(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
|
||||||
|
})
|
||||||
|
|
||||||
|
ginkgo.It("should not report any pinning failures when the memorymanager allocation is expected to succeed", func(ctx context.Context) {
	ginkgo.By("Creating the test pod")
	testPod = e2epod.NewPodClient(f).Create(ctx, makeMemoryManagerPod("memmngrpod", nil,
		[]memoryManagerCtnAttributes{
			{
				ctnName: "memmngrcnt",
				cpus:    "100m",
				memory:  "64Mi"},
		}))

	// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
	// being [Serial], we can also assume no one else but us is running pods.
	ginkgo.By("Checking the memorymanager metrics right after the kubelet restart, with pod should be admitted")

	// Expect exactly one pinning request and no pinning errors. The error
	// counter checked here must be the memory manager one: the previous key
	// ("kubelet_cpu_manager_pinning_errors_total") asserted on the CPU manager
	// counter, so a memory manager pinning failure would have gone unnoticed.
	matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
		"kubelet_memory_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
			"": timelessSample(1),
		}),
		"kubelet_memory_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
			"": timelessSample(0),
		}),
	})

	ginkgo.By("Giving the Kubelet time to start up and produce metrics")
	gomega.Eventually(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
	ginkgo.By("Ensuring the metrics match the expectations a few more times")
	gomega.Consistently(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
})
|
||||||
|
})
|
||||||
|
})
|
@ -174,20 +174,20 @@ func getAllocatableMemoryFromStateFile(s *state.MemoryManagerCheckpoint) []state
|
|||||||
return allocatableMemory
|
return allocatableMemory
|
||||||
}
|
}
|
||||||
|
|
||||||
type kubeletParams struct {
|
type memoryManagerKubeletParams struct {
|
||||||
memoryManagerPolicy string
|
policy string
|
||||||
systemReservedMemory []kubeletconfig.MemoryReservation
|
systemReservedMemory []kubeletconfig.MemoryReservation
|
||||||
systemReserved map[string]string
|
systemReserved map[string]string
|
||||||
kubeReserved map[string]string
|
kubeReserved map[string]string
|
||||||
evictionHard map[string]string
|
evictionHard map[string]string
|
||||||
}
|
}
|
||||||
|
|
||||||
func updateKubeletConfigWithMemoryManagerParams(initialCfg *kubeletconfig.KubeletConfiguration, params *kubeletParams) {
|
func updateKubeletConfigWithMemoryManagerParams(initialCfg *kubeletconfig.KubeletConfiguration, params *memoryManagerKubeletParams) {
|
||||||
if initialCfg.FeatureGates == nil {
|
if initialCfg.FeatureGates == nil {
|
||||||
initialCfg.FeatureGates = map[string]bool{}
|
initialCfg.FeatureGates = map[string]bool{}
|
||||||
}
|
}
|
||||||
|
|
||||||
initialCfg.MemoryManagerPolicy = params.memoryManagerPolicy
|
initialCfg.MemoryManagerPolicy = params.policy
|
||||||
|
|
||||||
// update system-reserved
|
// update system-reserved
|
||||||
if initialCfg.SystemReserved == nil {
|
if initialCfg.SystemReserved == nil {
|
||||||
@ -256,7 +256,7 @@ var _ = SIGDescribe("Memory Manager", framework.WithDisruptive(), framework.With
|
|||||||
f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
|
f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
|
||||||
|
|
||||||
memoryQuantity := resource.MustParse("1100Mi")
|
memoryQuantity := resource.MustParse("1100Mi")
|
||||||
defaultKubeParams := &kubeletParams{
|
defaultKubeParams := &memoryManagerKubeletParams{
|
||||||
systemReservedMemory: []kubeletconfig.MemoryReservation{
|
systemReservedMemory: []kubeletconfig.MemoryReservation{
|
||||||
{
|
{
|
||||||
NumaNode: 0,
|
NumaNode: 0,
|
||||||
@ -366,7 +366,7 @@ var _ = SIGDescribe("Memory Manager", framework.WithDisruptive(), framework.With
|
|||||||
ginkgo.Context("with static policy", func() {
|
ginkgo.Context("with static policy", func() {
|
||||||
tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
|
tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
|
||||||
kubeParams := *defaultKubeParams
|
kubeParams := *defaultKubeParams
|
||||||
kubeParams.memoryManagerPolicy = staticPolicy
|
kubeParams.policy = staticPolicy
|
||||||
updateKubeletConfigWithMemoryManagerParams(initialConfig, &kubeParams)
|
updateKubeletConfigWithMemoryManagerParams(initialConfig, &kubeParams)
|
||||||
})
|
})
|
||||||
|
|
||||||
@ -644,7 +644,7 @@ var _ = SIGDescribe("Memory Manager", framework.WithDisruptive(), framework.With
|
|||||||
ginkgo.Context("with none policy", func() {
|
ginkgo.Context("with none policy", func() {
|
||||||
tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
|
tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
|
||||||
kubeParams := *defaultKubeParams
|
kubeParams := *defaultKubeParams
|
||||||
kubeParams.memoryManagerPolicy = nonePolicy
|
kubeParams.policy = nonePolicy
|
||||||
updateKubeletConfigWithMemoryManagerParams(initialConfig, &kubeParams)
|
updateKubeletConfigWithMemoryManagerParams(initialConfig, &kubeParams)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user