Mirror of https://github.com/k3s-io/kubernetes.git
Merge pull request #121778 from Tal-or/mm_metrics
kubelet: memorymanager: metrics: add metrics about static allocation
Commit 0f7cc6fcaa
Memory manager static policy (pkg/kubelet/cm/memorymanager):

@@ -34,6 +34,7 @@ import (
 	"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager/state"
 	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
 	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
+	"k8s.io/kubernetes/pkg/kubelet/metrics"
 	"k8s.io/kubernetes/pkg/kubelet/types"
 )

@@ -95,7 +96,7 @@ func (p *staticPolicy) Start(s state.State) error {
 }
 
 // Allocate call is idempotent
-func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) error {
+func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) (rerr error) {
 	// allocate the memory only for guaranteed pods
 	if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
 		return nil

@@ -103,6 +104,13 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
 
 	podUID := string(pod.UID)
 	klog.InfoS("Allocate", "pod", klog.KObj(pod), "containerName", container.Name)
+	// container belongs in an exclusively allocated pool
+	metrics.MemoryManagerPinningRequestTotal.Inc()
+	defer func() {
+		if rerr != nil {
+			metrics.MemoryManagerPinningErrorsTotal.Inc()
+		}
+	}()
 	if blocks := s.GetMemoryBlocks(podUID, container.Name); blocks != nil {
 		p.updatePodReusableMemory(pod, container, blocks)
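Why the signature changes from error to (rerr error): the deferred closure can only observe the function's result through a named return, so any return path that hands back a non-nil error bumps the errors counter, while the requests counter is bumped once per attempt up front. Below is a minimal, self-contained sketch of that counting pattern in isolation; the toy counter type and the names pinningRequests, pinningErrors, and allocate are hypothetical stand-ins, not kubelet or component-base APIs.

package main

import (
	"errors"
	"fmt"
	"sync/atomic"
)

// counter is a stand-in for a Prometheus-style counter; the kubelet uses
// counters from k8s.io/component-base/metrics instead.
type counter struct{ n atomic.Int64 }

func (c *counter) Inc()         { c.n.Add(1) }
func (c *counter) Value() int64 { return c.n.Load() }

var (
	pinningRequests counter // hypothetical stand-in for MemoryManagerPinningRequestTotal
	pinningErrors   counter // hypothetical stand-in for MemoryManagerPinningErrorsTotal
)

// allocate mirrors the shape of staticPolicy.Allocate: the named return value
// rerr is what the deferred closure inspects, so every early "return err" in
// the real function is counted as a pinning error without extra bookkeeping.
func allocate(fail bool) (rerr error) {
	pinningRequests.Inc()
	defer func() {
		if rerr != nil {
			pinningErrors.Inc()
		}
	}()
	if fail {
		return errors.New("simulated allocation failure")
	}
	return nil
}

func main() {
	_ = allocate(false)
	_ = allocate(true)
	fmt.Printf("requests=%d errors=%d\n", pinningRequests.Value(), pinningErrors.Value()) // requests=2 errors=1
}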
Kubelet metrics: new name keys and counter definitions (pkg/kubelet/metrics):

@@ -109,6 +109,10 @@ const (
 	CPUManagerPinningRequestsTotalKey = "cpu_manager_pinning_requests_total"
 	CPUManagerPinningErrorsTotalKey   = "cpu_manager_pinning_errors_total"
 
+	// Metrics to track the Memory manager behavior
+	MemoryManagerPinningRequestsTotalKey = "memory_manager_pinning_requests_total"
+	MemoryManagerPinningErrorsTotalKey   = "memory_manager_pinning_errors_total"
+
 	// Metrics to track the Topology manager behavior
 	TopologyManagerAdmissionRequestsTotalKey = "topology_manager_admission_requests_total"
 	TopologyManagerAdmissionErrorsTotalKey   = "topology_manager_admission_errors_total"

@@ -742,6 +746,25 @@ var (
 		},
 	)
 
+	// MemoryManagerPinningRequestTotal tracks the number of times the pod spec required the memory manager to pin memory pages
+	MemoryManagerPinningRequestTotal = metrics.NewCounter(
+		&metrics.CounterOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           MemoryManagerPinningRequestsTotalKey,
+			Help:           "The number of memory pages allocations which required pinning.",
+			StabilityLevel: metrics.ALPHA,
+		})
+
+	// MemoryManagerPinningErrorsTotal tracks the number of times the pod spec required the memory manager to pin memory pages, but the allocation failed
+	MemoryManagerPinningErrorsTotal = metrics.NewCounter(
+		&metrics.CounterOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           MemoryManagerPinningErrorsTotalKey,
+			Help:           "The number of memory pages allocations which required pinning that failed.",
+			StabilityLevel: metrics.ALPHA,
+		},
+	)
+
 	// TopologyManagerAdmissionRequestsTotal tracks the number of times the pod spec will cause the topology manager to admit a pod
 	TopologyManagerAdmissionRequestsTotal = metrics.NewCounter(
 		&metrics.CounterOpts{
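Since both counters set Subsystem to KubeletSubsystem, the series exposed on the kubelet's /metrics endpoint are kubelet_memory_manager_pinning_requests_total and kubelet_memory_manager_pinning_errors_total, which are exactly the names the new e2e test matches on below. An illustrative scrape excerpt (sample values made up, HELP lines omitted):

# TYPE kubelet_memory_manager_pinning_requests_total counter
kubelet_memory_manager_pinning_requests_total 3
# TYPE kubelet_memory_manager_pinning_errors_total counter
kubelet_memory_manager_pinning_errors_total 1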
@@ -935,6 +958,10 @@ func Register(collectors ...metrics.StableCollector) {
 		legacyregistry.MustRegister(RunPodSandboxErrors)
 		legacyregistry.MustRegister(CPUManagerPinningRequestsTotal)
 		legacyregistry.MustRegister(CPUManagerPinningErrorsTotal)
+		if utilfeature.DefaultFeatureGate.Enabled(features.MemoryManager) {
+			legacyregistry.MustRegister(MemoryManagerPinningRequestTotal)
+			legacyregistry.MustRegister(MemoryManagerPinningErrorsTotal)
+		}
 		legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal)
 		legacyregistry.MustRegister(TopologyManagerAdmissionErrorsTotal)
 		legacyregistry.MustRegister(TopologyManagerAdmissionDuration)
New file: test/e2e_node/memory_manager_metrics_test.go (155 lines)

@@ -0,0 +1,155 @@
//go:build linux

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2enode

import (
	"context"
	"time"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	"github.com/onsi/gomega/gstruct"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
	"k8s.io/kubernetes/test/e2e/feature"
	"k8s.io/kubernetes/test/e2e/framework"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	admissionapi "k8s.io/pod-security-admission/api"
)

var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.MemoryManager, func() {
	f := framework.NewDefaultFramework("memorymanager-metrics")
	f.NamespacePodSecurityEnforceLevel = admissionapi.LevelPrivileged

	ginkgo.Context("when querying /metrics", func() {
		var testPod *v1.Pod

		ginkgo.BeforeEach(func(ctx context.Context) {
			var oldCfg *kubeletconfig.KubeletConfiguration
			var err error
			if oldCfg == nil {
				oldCfg, err = getCurrentKubeletConfig(ctx)
				framework.ExpectNoError(err)
			}

			newCfg := oldCfg.DeepCopy()
			updateKubeletConfigWithMemoryManagerParams(newCfg,
				&memoryManagerKubeletParams{
					policy: staticPolicy,
					systemReservedMemory: []kubeletconfig.MemoryReservation{
						{
							NumaNode: 0,
							Limits: v1.ResourceList{
								resourceMemory: resource.MustParse("1100Mi"),
							},
						},
					},
					systemReserved: map[string]string{resourceMemory: "500Mi"},
					kubeReserved:   map[string]string{resourceMemory: "500Mi"},
					evictionHard:   map[string]string{evictionHardMemory: "100Mi"},
				},
			)
			updateKubeletConfig(ctx, f, newCfg, true)
			ginkgo.DeferCleanup(func(ctx context.Context) {
				if testPod != nil {
					deletePodSyncByName(ctx, f, testPod.Name)
				}
				updateKubeletConfig(ctx, f, oldCfg, true)
			})
		})

		ginkgo.It("should report zero pinning counters after a fresh restart", func(ctx context.Context) {
			// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
			// being [Serial], we can also assume no one else but us is running pods.
			ginkgo.By("Checking the memorymanager metrics right after the kubelet restart, with no pods running")

			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
				"kubelet_memory_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
					"": timelessSample(0),
				}),
				"kubelet_memory_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
					"": timelessSample(0),
				}),
			})

			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
			gomega.Eventually(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
			ginkgo.By("Ensuring the metrics match the expectations a few more times")
			gomega.Consistently(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
		})

		ginkgo.It("should report pinning failures when the memorymanager allocation is known to fail", func(ctx context.Context) {
			ginkgo.By("Creating the test pod which will be rejected for a memory request which is too big")
			testPod = e2epod.NewPodClient(f).Create(ctx, makeMemoryManagerPod("memmngrpod", nil,
				[]memoryManagerCtnAttributes{
					{
						ctnName: "memmngrcnt",
						cpus:    "100m",
						memory:  "1000Gi",
					},
				}))

			// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
			// being [Serial], we can also assume no one else but us is running pods.
			ginkgo.By("Checking the memorymanager metrics right after the kubelet restart, with the pod failed to admit")

			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
				"kubelet_memory_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
					"": timelessSample(1),
				}),
				"kubelet_memory_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
					"": timelessSample(1),
				}),
			})

			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
			gomega.Eventually(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
			ginkgo.By("Ensuring the metrics match the expectations a few more times")
			gomega.Consistently(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
		})

		ginkgo.It("should not report any pinning failures when the memorymanager allocation is expected to succeed", func(ctx context.Context) {
			ginkgo.By("Creating the test pod")
			testPod = e2epod.NewPodClient(f).Create(ctx, makeMemoryManagerPod("memmngrpod", nil,
				[]memoryManagerCtnAttributes{
					{
						ctnName: "memmngrcnt",
						cpus:    "100m",
						memory:  "64Mi",
					},
				}))

			// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
			// being [Serial], we can also assume no one else but us is running pods.
			ginkgo.By("Checking the memorymanager metrics right after the kubelet restart, with the pod admitted")
			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
				"kubelet_memory_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
					"": timelessSample(1),
				}),
				"kubelet_memory_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
					"": timelessSample(0),
				}),
			})

			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
			gomega.Eventually(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
			ginkgo.By("Ensuring the metrics match the expectations a few more times")
			gomega.Consistently(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
		})
	})
})
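These specs reconfigure and restart the kubelet, so they run in the serial node e2e suite against a real node. One common way to run just this suite from the repository root is the e2e-node make target focused on this describe block, for example make test-e2e-node FOCUS="Memory Manager Metrics"; the exact invocation depends on the environment, so treat this as a sketch and check the repository's e2e-node documentation.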
Memory manager e2e helpers: the shared kubelet parameters struct and its policy field are renamed, and the metrics test above reuses them.

@@ -174,20 +174,20 @@ func getAllocatableMemoryFromStateFile(s *state.MemoryManagerCheckpoint) []state
 	return allocatableMemory
 }
 
-type kubeletParams struct {
-	memoryManagerPolicy  string
+type memoryManagerKubeletParams struct {
+	policy               string
 	systemReservedMemory []kubeletconfig.MemoryReservation
 	systemReserved       map[string]string
 	kubeReserved         map[string]string
 	evictionHard         map[string]string
 }
 
-func updateKubeletConfigWithMemoryManagerParams(initialCfg *kubeletconfig.KubeletConfiguration, params *kubeletParams) {
+func updateKubeletConfigWithMemoryManagerParams(initialCfg *kubeletconfig.KubeletConfiguration, params *memoryManagerKubeletParams) {
 	if initialCfg.FeatureGates == nil {
 		initialCfg.FeatureGates = map[string]bool{}
 	}
 
-	initialCfg.MemoryManagerPolicy = params.memoryManagerPolicy
+	initialCfg.MemoryManagerPolicy = params.policy
 
 	// update system-reserved
 	if initialCfg.SystemReserved == nil {

@@ -256,7 +256,7 @@ var _ = SIGDescribe("Memory Manager", framework.WithDisruptive(), framework.With
 	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
 
 	memoryQuantity := resource.MustParse("1100Mi")
-	defaultKubeParams := &kubeletParams{
+	defaultKubeParams := &memoryManagerKubeletParams{
 		systemReservedMemory: []kubeletconfig.MemoryReservation{
 			{
 				NumaNode: 0,

@@ -366,7 +366,7 @@ var _ = SIGDescribe("Memory Manager", framework.WithDisruptive(), framework.With
 	ginkgo.Context("with static policy", func() {
 		tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
 			kubeParams := *defaultKubeParams
-			kubeParams.memoryManagerPolicy = staticPolicy
+			kubeParams.policy = staticPolicy
 			updateKubeletConfigWithMemoryManagerParams(initialConfig, &kubeParams)
 		})
 

@@ -644,7 +644,7 @@ var _ = SIGDescribe("Memory Manager", framework.WithDisruptive(), framework.With
 	ginkgo.Context("with none policy", func() {
 		tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
 			kubeParams := *defaultKubeParams
-			kubeParams.memoryManagerPolicy = nonePolicy
+			kubeParams.policy = nonePolicy
 			updateKubeletConfigWithMemoryManagerParams(initialConfig, &kubeParams)
 		})
 