Mirror of https://github.com/k3s-io/kubernetes.git (synced 2025-07-30 15:05:27 +00:00)
Merge pull request #123468 from ffromani/fix-mm-metrics-test

node: memory manager: fix the metrics tests

Commit 25a43070ee
@@ -78,6 +78,26 @@ func validateOOMScoreAdjSettingIsInRange(pid int, expectedMinOOMScoreAdj, expect
    return nil
}

func dumpRunningContainer(ctx context.Context) error {
    runtime, _, err := getCRIClient()
    if err != nil {
        return err
    }
    containers, err := runtime.ListContainers(ctx, &runtimeapi.ContainerFilter{
        State: &runtimeapi.ContainerStateValue{
            State: runtimeapi.ContainerState_CONTAINER_RUNNING,
        },
    })
    if err != nil {
        return err
    }
    framework.Logf("Running containers:")
    for _, c := range containers {
        framework.Logf("%+v", c)
    }
    return nil
}

var _ = SIGDescribe("Container Manager Misc", framework.WithSerial(), func() {
    f := framework.NewDefaultFramework("kubelet-container-manager")
    f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
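For readers unfamiliar with the pattern this new helper supports, here is a minimal standalone Ginkgo sketch (not part of this diff) of collecting debug output only when a spec has failed; the suite, the spec, and the dumpState helper are hypothetical placeholders standing in for dumpRunningContainer.

// Sketch only: package, suite, and dumpState are illustrative, not from the PR.
package e2esketch

import (
    "context"

    "github.com/onsi/ginkgo/v2"
)

// dumpState stands in for a debug helper such as dumpRunningContainer.
func dumpState(ctx context.Context) error {
    return nil
}

var _ = ginkgo.Describe("dump-on-failure pattern", func() {
    ginkgo.AfterEach(func(ctx context.Context) {
        // Only spend time collecting debug output when the spec failed.
        if ginkgo.CurrentSpecReport().Failed() {
            ginkgo.By("Dumping debug state")
            _ = dumpState(ctx) // best effort: the spec has already failed
        }
    })

    ginkgo.It("runs a spec", func(ctx context.Context) {
        // test body
    })
})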
@@ -100,7 +120,24 @@ var _ = SIGDescribe("Container Manager Misc", framework.WithSerial(), func() {
            return validateOOMScoreAdjSetting(kubeletPids[0], -999)
        }, 5*time.Minute, 30*time.Second).Should(gomega.BeNil())
    })
    ginkgo.Context("", func() {

        ginkgo.Context("with test pods", func() {
            var testPod *v1.Pod

            // Log the running containers here to help debugging.
            ginkgo.AfterEach(func(ctx context.Context) {
                if ginkgo.CurrentSpecReport().Failed() {
                    ginkgo.By("Dump all running containers")
                    _ = dumpRunningContainer(ctx)
                }

                if testPod == nil {
                    return // nothing to do
                }
                deletePodSyncByName(ctx, f, testPod.Name)
                waitForAllContainerRemoval(ctx, testPod.Name, testPod.Namespace)
            })

            ginkgo.It("pod infra containers oom-score-adj should be -998 and best effort container's should be 1000", func(ctx context.Context) {
                // Take a snapshot of existing pause processes. These were
                // created before this test, and may not be infra
@@ -111,7 +148,7 @@ var _ = SIGDescribe("Container Manager Misc", framework.WithSerial(), func() {

                podClient := e2epod.NewPodClient(f)
                podName := "besteffort" + string(uuid.NewUUID())
                podClient.Create(ctx, &v1.Pod{
                testPod = podClient.Create(ctx, &v1.Pod{
                    ObjectMeta: metav1.ObjectMeta{
                        Name: podName,
                    },
@@ -156,107 +193,90 @@ var _ = SIGDescribe("Container Manager Misc", framework.WithSerial(), func() {
                return validateOOMScoreAdjSetting(shPids[0], 1000)
            }, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
        })
        // Log the running containers here to help debugging.
        ginkgo.AfterEach(func() {
            if ginkgo.CurrentSpecReport().Failed() {
                ginkgo.By("Dump all running containers")
                runtime, _, err := getCRIClient()
                framework.ExpectNoError(err)
                containers, err := runtime.ListContainers(context.Background(), &runtimeapi.ContainerFilter{
                    State: &runtimeapi.ContainerStateValue{
                        State: runtimeapi.ContainerState_CONTAINER_RUNNING,
                    },
                })
                framework.ExpectNoError(err)
                framework.Logf("Running containers:")
                for _, c := range containers {
                    framework.Logf("%+v", c)
                }
            }
        })
    })
        ginkgo.It("guaranteed container's oom-score-adj should be -998", func(ctx context.Context) {
            podClient := e2epod.NewPodClient(f)
            podName := "guaranteed" + string(uuid.NewUUID())
            podClient.Create(ctx, &v1.Pod{
                ObjectMeta: metav1.ObjectMeta{
                    Name: podName,
                },
                Spec: v1.PodSpec{
                    Containers: []v1.Container{
                        {
                            Image: imageutils.GetE2EImage(imageutils.Nginx),
                            Name:  podName,
                            Resources: v1.ResourceRequirements{
                                Limits: v1.ResourceList{
                                    v1.ResourceCPU:    resource.MustParse("100m"),
                                    v1.ResourceMemory: resource.MustParse("50Mi"),

            ginkgo.It("guaranteed container's oom-score-adj should be -998", func(ctx context.Context) {
                podClient := e2epod.NewPodClient(f)
                podName := "guaranteed" + string(uuid.NewUUID())
                testPod = podClient.Create(ctx, &v1.Pod{
                    ObjectMeta: metav1.ObjectMeta{
                        Name: podName,
                    },
                    Spec: v1.PodSpec{
                        Containers: []v1.Container{
                            {
                                Image: imageutils.GetE2EImage(imageutils.Nginx),
                                Name:  podName,
                                Resources: v1.ResourceRequirements{
                                    Limits: v1.ResourceList{
                                        v1.ResourceCPU:    resource.MustParse("100m"),
                                        v1.ResourceMemory: resource.MustParse("50Mi"),
                                    },
                                },
                            },
                        },
                    },
                },
                })
                var (
                    ngPids []int
                    err    error
                )
                gomega.Eventually(ctx, func() error {
                    ngPids, err = getPidsForProcess("nginx", "")
                    if err != nil {
                        return fmt.Errorf("failed to get list of nginx process pids: %w", err)
                    }
                    for _, pid := range ngPids {
                        if err := validateOOMScoreAdjSetting(pid, -998); err != nil {
                            return err
            })
            var (
                ngPids []int
                err    error
            )
            gomega.Eventually(ctx, func() error {
                ngPids, err = getPidsForProcess("nginx", "")
                if err != nil {
                    return fmt.Errorf("failed to get list of nginx process pids: %w", err)
                }
                for _, pid := range ngPids {
                    if err := validateOOMScoreAdjSetting(pid, -998); err != nil {
                        return err
                    }
                }
            }

            return nil
        }, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
                return nil
            }, 2*time.Minute, time.Second*4).Should(gomega.BeNil())

        })
        ginkgo.It("burstable container's oom-score-adj should be between [2, 1000)", func(ctx context.Context) {
            podClient := e2epod.NewPodClient(f)
            podName := "burstable" + string(uuid.NewUUID())
            podClient.Create(ctx, &v1.Pod{
                ObjectMeta: metav1.ObjectMeta{
                    Name: podName,
                },
                Spec: v1.PodSpec{
                    Containers: []v1.Container{
                        {
                            Image: imageutils.GetE2EImage(imageutils.Agnhost),
                            Args:  []string{"test-webserver"},
                            Name:  podName,
                            Resources: v1.ResourceRequirements{
                                Requests: v1.ResourceList{
                                    v1.ResourceCPU:    resource.MustParse("100m"),
                                    v1.ResourceMemory: resource.MustParse("50Mi"),
            })
            ginkgo.It("burstable container's oom-score-adj should be between [2, 1000)", func(ctx context.Context) {
                podClient := e2epod.NewPodClient(f)
                podName := "burstable" + string(uuid.NewUUID())
                testPod = podClient.Create(ctx, &v1.Pod{
                    ObjectMeta: metav1.ObjectMeta{
                        Name: podName,
                    },
                    Spec: v1.PodSpec{
                        Containers: []v1.Container{
                            {
                                Image: imageutils.GetE2EImage(imageutils.Agnhost),
                                Args:  []string{"test-webserver"},
                                Name:  podName,
                                Resources: v1.ResourceRequirements{
                                    Requests: v1.ResourceList{
                                        v1.ResourceCPU:    resource.MustParse("100m"),
                                        v1.ResourceMemory: resource.MustParse("50Mi"),
                                        },
                                    },
                                },
                            },
                        },
                    },
                })
                var (
                    wsPids []int
                    err    error
                )
                gomega.Eventually(ctx, func() error {
                    wsPids, err = getPidsForProcess("agnhost", "")
                    if err != nil {
                        return fmt.Errorf("failed to get list of test-webserver process pids: %w", err)
                    }
                    for _, pid := range wsPids {
                        if err := validateOOMScoreAdjSettingIsInRange(pid, 2, 1000); err != nil {
                            return err
            })
            var (
                wsPids []int
                err    error
            )
            gomega.Eventually(ctx, func() error {
                wsPids, err = getPidsForProcess("agnhost", "")
                if err != nil {
                    return fmt.Errorf("failed to get list of test-webserver process pids: %w", err)
                }
            }
            return nil
        }, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
                    for _, pid := range wsPids {
                        if err := validateOOMScoreAdjSettingIsInRange(pid, 2, 1000); err != nil {
                            return err
                        }
                    }
                    return nil
                }, 2*time.Minute, time.Second*4).Should(gomega.BeNil())

            // TODO: Test the oom-score-adj logic for burstable more accurately.
                // TODO: Test the oom-score-adj logic for burstable more accurately.
            })
        })
    })
})
@@ -20,6 +20,7 @@ package e2enode

import (
    "context"
    "fmt"
    "time"

    "github.com/onsi/ginkgo/v2"
@@ -27,9 +28,13 @@ import (
    "github.com/onsi/gomega/gstruct"
    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/resource"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/fields"
    clientset "k8s.io/client-go/kubernetes"
    kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
    "k8s.io/kubernetes/test/e2e/feature"
    "k8s.io/kubernetes/test/e2e/framework"
    e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
    e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
    admissionapi "k8s.io/pod-security-admission/api"
)
@@ -39,10 +44,10 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
    f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged

    ginkgo.Context("when querying /metrics", func() {
        var oldCfg *kubeletconfig.KubeletConfiguration
        var testPod *v1.Pod

        ginkgo.BeforeEach(func(ctx context.Context) {
            var oldCfg *kubeletconfig.KubeletConfiguration
            var err error
            if oldCfg == nil {
                oldCfg, err = getCurrentKubeletConfig(ctx)
@@ -73,6 +78,9 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
            }
            updateKubeletConfig(ctx, f, oldCfg, true)
        })

            count := printAllPodsOnNode(ctx, f.ClientSet, framework.TestContext.NodeName)
            gomega.Expect(count).To(gomega.BeZero(), "unexpected pods on %q, please check output above", framework.TestContext.NodeName)
        })

        ginkgo.It("should report zero pinning counters after a fresh restart", func(ctx context.Context) {
@@ -82,10 +90,10 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me

            matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
                "kubelet_memory_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
                    "": timelessSample(0),
                    "": timelessSample(0), // intentionally use stricter value
                }),
                "kubelet_memory_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
                    "": timelessSample(0),
                    "": timelessSample(0), // intentionally use stricter value
                }),
            })

@@ -103,7 +111,8 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
                    ctnName: "memmngrcnt",
                    cpus:    "100m",
                    memory:  "1000Gi"},
                }))
                }),
            )

            // we updated the kubelet config in BeforeEach, so we can assume we start fresh.
            // being [Serial], we can also assume noone else but us is running pods.
@@ -122,6 +131,21 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
            gomega.Eventually(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
            ginkgo.By("Ensuring the metrics match the expectations a few more times")
            gomega.Consistently(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)

            values, err := getKubeletMetrics(ctx)
            framework.ExpectNoError(err, "error getting the kubelet metrics for sanity check")
            err = validateMetrics(
                values,
                "kubelet_memory_manager_pinning_requests_total",
                "kubelet_memory_manager_pinning_errors_total",
                func(totVal, errVal float64) error {
                    if int64(totVal) != int64(errVal) {
                        return fmt.Errorf("expected total requests equal to total errors")
                    }
                    return nil
                },
            )
            framework.ExpectNoError(err, "error validating the kubelet metrics between each other")
        })

        ginkgo.It("should not report any pinning failures when the memorymanager allocation is expected to succeed", func(ctx context.Context) {
@@ -131,8 +155,12 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
                {
                    ctnName: "memmngrcnt",
                    cpus:    "100m",
                    memory:  "64Mi"},
                }))
                    memory:  "64Mi",
                },
                }),
            )

            printAllPodsOnNode(ctx, f.ClientSet, framework.TestContext.NodeName)

            // we updated the kubelet config in BeforeEach, so we can assume we start fresh.
            // being [Serial], we can also assume noone else but us is running pods.
@@ -141,7 +169,7 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
                "kubelet_memory_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
                    "": timelessSample(1),
                }),
                "kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
                "kubelet_memory_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
                    "": timelessSample(0),
                }),
            })
@@ -150,6 +178,64 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
            gomega.Eventually(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
            ginkgo.By("Ensuring the metrics match the expectations a few more times")
            gomega.Consistently(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)

            values, err := getKubeletMetrics(ctx)
            framework.ExpectNoError(err, "error getting the kubelet metrics for sanity check")
            err = validateMetrics(
                values,
                "kubelet_memory_manager_pinning_requests_total",
                "kubelet_memory_manager_pinning_errors_total",
                func(totVal, errVal float64) error {
                    if int64(totVal-errVal) < 1 {
                        return fmt.Errorf("expected total requests equal to total errors + 1")
                    }
                    return nil
                },
            )
            framework.ExpectNoError(err, "error validating the kubelet metrics between each other")
        })
    })
})

func validateMetrics(values e2emetrics.KubeletMetrics, totalKey, errorKey string, checkFn func(totVal, errVal float64) error) error {
    totalSamples := values[totalKey]
    errorSamples := values[errorKey]
    if len(totalSamples) != len(errorSamples) {
        return fmt.Errorf("inconsistent samples, total=%d error=%d", len(totalSamples), len(errorSamples))
    }
    for idx := range totalSamples {
        if err := checkFn(float64(totalSamples[idx].Value), float64(errorSamples[idx].Value)); err != nil {
            return err
        }
    }
    return nil
}

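As a side note, the cross-check that validateMetrics performs can be reduced to a self-contained sketch that swaps the e2e framework's KubeletMetrics samples for plain float64 slices; the validate function, total, and errs below are hypothetical stand-ins, not part of the change.

// Illustrative sketch only: simplified stand-in for the validateMetrics cross-check.
package main

import "fmt"

// validate mirrors the shape of validateMetrics above, but over plain float64
// slices instead of metric samples.
func validate(total, errs []float64, check func(totVal, errVal float64) error) error {
    if len(total) != len(errs) {
        return fmt.Errorf("inconsistent samples, total=%d error=%d", len(total), len(errs))
    }
    for i := range total {
        if err := check(total[i], errs[i]); err != nil {
            return err
        }
    }
    return nil
}

func main() {
    // Mirrors the "allocation expected to fail" check from the diff: every
    // pinning request also produced an error, so the two counters stay equal.
    total := []float64{3}
    errs := []float64{3}
    err := validate(total, errs, func(t, e float64) error {
        if int64(t) != int64(e) {
            return fmt.Errorf("expected total requests equal to total errors")
        }
        return nil
    })
    fmt.Println(err) // <nil>
}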
// printAllPodsOnNode outputs status of all kubelet pods into log.
// Note considering the e2e_node environment we will always have exactly 1 node, but still.
func printAllPodsOnNode(ctx context.Context, c clientset.Interface, nodeName string) int {
    nodeSelector := fields.Set{
        "spec.nodeName": nodeName,
    }.AsSelector().String()

    podList, err := c.CoreV1().Pods(metav1.NamespaceAll).List(ctx, metav1.ListOptions{
        FieldSelector: nodeSelector,
    })
    if err != nil {
        framework.Logf("Unable to retrieve pods for node %v: %v", nodeName, err)
        return 0
    }
    count := 0
    framework.Logf("begin listing pods: %d found", len(podList.Items))
    for _, p := range podList.Items {
        framework.Logf("%s/%s node %s (expected: %s) status %v QoS %s message %s reason %s (%d container statuses recorded)",
            p.Namespace, p.Name, p.Spec.NodeName, nodeName, p.Status.Phase, p.Status.QOSClass, p.Status.Message, p.Status.Reason, len(p.Status.ContainerStatuses))
        for _, c := range p.Status.ContainerStatuses {
            framework.Logf("\tContainer %v ready: %v, restart count %v",
                c.Name, c.Ready, c.RestartCount)
        }
        count++
    }
    framework.Logf("end listing pods: %d found", len(podList.Items))
    return count
}
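For completeness, here is a rough standalone sketch (not from this change) of the same list-pods-by-node idea that printAllPodsOnNode uses, but with an explicitly constructed client-go clientset; the kubeconfig loading path and the example-node name are assumptions.

// Illustrative sketch: list pods scheduled on a given node via a field selector.
package main

import (
    "context"
    "fmt"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/fields"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/tools/clientcmd"
)

func main() {
    // Assumes a reachable cluster via the default kubeconfig; adjust as needed.
    cfg, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
    if err != nil {
        panic(err)
    }
    client, err := kubernetes.NewForConfig(cfg)
    if err != nil {
        panic(err)
    }

    nodeName := "example-node" // hypothetical node name
    sel := fields.OneTermEqualSelector("spec.nodeName", nodeName).String()
    pods, err := client.CoreV1().Pods(metav1.NamespaceAll).List(context.Background(),
        metav1.ListOptions{FieldSelector: sel})
    if err != nil {
        panic(err)
    }
    for _, p := range pods.Items {
        fmt.Printf("%s/%s phase=%s qos=%s\n", p.Namespace, p.Name, p.Status.Phase, p.Status.QOSClass)
    }
}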