Merge pull request #123468 from ffromani/fix-mm-metrics-test

node: memory manager: fix the metrics tests
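
In short: the memory-manager metrics test asserted on kubelet_cpu_manager_pinning_errors_total where it meant kubelet_memory_manager_pinning_errors_total, and the container-manager specs never deleted the pods they created, so leftover pods could perturb the pinning counters. The change fixes the metric name, makes each container-manager spec track its pod and remove it in an AfterEach, logs running containers and the pods present on the node to help debugging, and adds a validateMetrics helper that cross-checks the pinning request and error counters against each other.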
Kubernetes Prow Robot, 2024-06-26 12:00:45 -07:00 (committed by GitHub)
commit 25a43070ee
2 changed files with 203 additions and 97 deletions


@@ -78,6 +78,26 @@ func validateOOMScoreAdjSettingIsInRange(pid int, expectedMinOOMScoreAdj, expect
return nil
}
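// dumpRunningContainer logs every container the CRI runtime reports as running.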
func dumpRunningContainer(ctx context.Context) error {
runtime, _, err := getCRIClient()
if err != nil {
return err
}
containers, err := runtime.ListContainers(ctx, &runtimeapi.ContainerFilter{
State: &runtimeapi.ContainerStateValue{
State: runtimeapi.ContainerState_CONTAINER_RUNNING,
},
})
if err != nil {
return err
}
framework.Logf("Running containers:")
for _, c := range containers {
framework.Logf("%+v", c)
}
return nil
}
var _ = SIGDescribe("Container Manager Misc", framework.WithSerial(), func() {
f := framework.NewDefaultFramework("kubelet-container-manager")
f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
@@ -100,7 +120,24 @@ var _ = SIGDescribe("Container Manager Misc", framework.WithSerial(), func() {
return validateOOMScoreAdjSetting(kubeletPids[0], -999)
}, 5*time.Minute, 30*time.Second).Should(gomega.BeNil())
})
ginkgo.Context("", func() {
ginkgo.Context("with test pods", func() {
var testPod *v1.Pod
// Log the running containers here to help debugging.
ginkgo.AfterEach(func(ctx context.Context) {
if ginkgo.CurrentSpecReport().Failed() {
ginkgo.By("Dump all running containers")
_ = dumpRunningContainer(ctx)
}
if testPod == nil {
return // nothing to do
}
deletePodSyncByName(ctx, f, testPod.Name)
waitForAllContainerRemoval(ctx, testPod.Name, testPod.Namespace)
})
ginkgo.It("pod infra containers oom-score-adj should be -998 and best effort container's should be 1000", func(ctx context.Context) {
// Take a snapshot of existing pause processes. These were
// created before this test, and may not be infra
@@ -111,7 +148,7 @@ var _ = SIGDescribe("Container Manager Misc", framework.WithSerial(), func() {
podClient := e2epod.NewPodClient(f)
podName := "besteffort" + string(uuid.NewUUID())
podClient.Create(ctx, &v1.Pod{
testPod = podClient.Create(ctx, &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: podName,
},
@@ -156,107 +193,90 @@ var _ = SIGDescribe("Container Manager Misc", framework.WithSerial(), func() {
return validateOOMScoreAdjSetting(shPids[0], 1000)
}, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
})
// Log the running containers here to help debugging.
ginkgo.AfterEach(func() {
if ginkgo.CurrentSpecReport().Failed() {
ginkgo.By("Dump all running containers")
runtime, _, err := getCRIClient()
framework.ExpectNoError(err)
containers, err := runtime.ListContainers(context.Background(), &runtimeapi.ContainerFilter{
State: &runtimeapi.ContainerStateValue{
State: runtimeapi.ContainerState_CONTAINER_RUNNING,
},
})
framework.ExpectNoError(err)
framework.Logf("Running containers:")
for _, c := range containers {
framework.Logf("%+v", c)
}
}
})
})
ginkgo.It("guaranteed container's oom-score-adj should be -998", func(ctx context.Context) {
podClient := e2epod.NewPodClient(f)
podName := "guaranteed" + string(uuid.NewUUID())
podClient.Create(ctx, &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: podName,
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Image: imageutils.GetE2EImage(imageutils.Nginx),
Name: podName,
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceCPU: resource.MustParse("100m"),
v1.ResourceMemory: resource.MustParse("50Mi"),
ginkgo.It("guaranteed container's oom-score-adj should be -998", func(ctx context.Context) {
podClient := e2epod.NewPodClient(f)
podName := "guaranteed" + string(uuid.NewUUID())
testPod = podClient.Create(ctx, &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: podName,
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Image: imageutils.GetE2EImage(imageutils.Nginx),
Name: podName,
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceCPU: resource.MustParse("100m"),
v1.ResourceMemory: resource.MustParse("50Mi"),
},
},
},
},
},
},
})
var (
ngPids []int
err error
)
gomega.Eventually(ctx, func() error {
ngPids, err = getPidsForProcess("nginx", "")
if err != nil {
return fmt.Errorf("failed to get list of nginx process pids: %w", err)
}
for _, pid := range ngPids {
if err := validateOOMScoreAdjSetting(pid, -998); err != nil {
return err
})
var (
ngPids []int
err error
)
gomega.Eventually(ctx, func() error {
ngPids, err = getPidsForProcess("nginx", "")
if err != nil {
return fmt.Errorf("failed to get list of nginx process pids: %w", err)
}
for _, pid := range ngPids {
if err := validateOOMScoreAdjSetting(pid, -998); err != nil {
return err
}
}
}
return nil
}, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
return nil
}, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
})
ginkgo.It("burstable container's oom-score-adj should be between [2, 1000)", func(ctx context.Context) {
podClient := e2epod.NewPodClient(f)
podName := "burstable" + string(uuid.NewUUID())
podClient.Create(ctx, &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: podName,
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Image: imageutils.GetE2EImage(imageutils.Agnhost),
Args: []string{"test-webserver"},
Name: podName,
Resources: v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceCPU: resource.MustParse("100m"),
v1.ResourceMemory: resource.MustParse("50Mi"),
})
ginkgo.It("burstable container's oom-score-adj should be between [2, 1000)", func(ctx context.Context) {
podClient := e2epod.NewPodClient(f)
podName := "burstable" + string(uuid.NewUUID())
testPod = podClient.Create(ctx, &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: podName,
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Image: imageutils.GetE2EImage(imageutils.Agnhost),
Args: []string{"test-webserver"},
Name: podName,
Resources: v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceCPU: resource.MustParse("100m"),
v1.ResourceMemory: resource.MustParse("50Mi"),
},
},
},
},
},
},
})
var (
wsPids []int
err error
)
gomega.Eventually(ctx, func() error {
wsPids, err = getPidsForProcess("agnhost", "")
if err != nil {
return fmt.Errorf("failed to get list of test-webserver process pids: %w", err)
}
for _, pid := range wsPids {
if err := validateOOMScoreAdjSettingIsInRange(pid, 2, 1000); err != nil {
return err
})
var (
wsPids []int
err error
)
gomega.Eventually(ctx, func() error {
wsPids, err = getPidsForProcess("agnhost", "")
if err != nil {
return fmt.Errorf("failed to get list of test-webserver process pids: %w", err)
}
}
return nil
}, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
for _, pid := range wsPids {
if err := validateOOMScoreAdjSettingIsInRange(pid, 2, 1000); err != nil {
return err
}
}
return nil
}, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
// TODO: Test the oom-score-adj logic for burstable more accurately.
// TODO: Test the oom-score-adj logic for burstable more accurately.
})
})
})
})
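
Distilled, the per-spec cleanup added to the container-manager specs above amounts to the Ginkgo pattern below (a minimal sketch: deletePodSyncByName, waitForAllContainerRemoval, and the framework instance f are pre-existing helpers and variables in the e2e_node package; dumpRunningContainer is the function added above).

var testPod *v1.Pod

ginkgo.AfterEach(func(ctx context.Context) {
	// On failure, capture the runtime state before the pod below is torn down.
	if ginkgo.CurrentSpecReport().Failed() {
		_ = dumpRunningContainer(ctx)
	}
	if testPod == nil {
		return // the spec never created a pod
	}
	// Remove the spec's pod and wait for its containers to disappear, so the next
	// spec (and the memory-manager metrics suite below, which now asserts the node
	// has no pods) starts from a clean node.
	deletePodSyncByName(ctx, f, testPod.Name)
	waitForAllContainerRemoval(ctx, testPod.Name, testPod.Namespace)
})

Each spec then assigns testPod = podClient.Create(ctx, ...) so the AfterEach knows which pod to remove.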


@@ -20,6 +20,7 @@ package e2enode
import (
"context"
"fmt"
"time"
"github.com/onsi/ginkgo/v2"
@@ -27,9 +28,13 @@ import (
"github.com/onsi/gomega/gstruct"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
clientset "k8s.io/client-go/kubernetes"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
"k8s.io/kubernetes/test/e2e/feature"
"k8s.io/kubernetes/test/e2e/framework"
e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
admissionapi "k8s.io/pod-security-admission/api"
)
@@ -39,10 +44,10 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
ginkgo.Context("when querying /metrics", func() {
var oldCfg *kubeletconfig.KubeletConfiguration
var testPod *v1.Pod
ginkgo.BeforeEach(func(ctx context.Context) {
var oldCfg *kubeletconfig.KubeletConfiguration
var err error
if oldCfg == nil {
oldCfg, err = getCurrentKubeletConfig(ctx)
@@ -73,6 +78,9 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
}
updateKubeletConfig(ctx, f, oldCfg, true)
})
count := printAllPodsOnNode(ctx, f.ClientSet, framework.TestContext.NodeName)
gomega.Expect(count).To(gomega.BeZero(), "unexpected pods on %q, please check output above", framework.TestContext.NodeName)
})
ginkgo.It("should report zero pinning counters after a fresh restart", func(ctx context.Context) {
@@ -82,10 +90,10 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
"kubelet_memory_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
"": timelessSample(0),
"": timelessSample(0), // intentionally use stricter value
}),
"kubelet_memory_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
"": timelessSample(0),
"": timelessSample(0), // intentionally use stricter value
}),
})
@@ -103,7 +111,8 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
ctnName: "memmngrcnt",
cpus: "100m",
memory: "1000Gi"},
}))
}),
)
// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
// being [Serial], we can also assume no one else but us is running pods.
@@ -122,6 +131,21 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
gomega.Eventually(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
ginkgo.By("Ensuring the metrics match the expectations a few more times")
gomega.Consistently(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
values, err := getKubeletMetrics(ctx)
framework.ExpectNoError(err, "error getting the kubelet metrics for sanity check")
err = validateMetrics(
values,
"kubelet_memory_manager_pinning_requests_total",
"kubelet_memory_manager_pinning_errors_total",
func(totVal, errVal float64) error {
if int64(totVal) != int64(errVal) {
return fmt.Errorf("expected total requests equal to total errors")
}
return nil
},
)
framework.ExpectNoError(err, "error validating the kubelet metrics between each other")
})
ginkgo.It("should not report any pinning failures when the memorymanager allocation is expected to succeed", func(ctx context.Context) {
@@ -131,8 +155,12 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
{
ctnName: "memmngrcnt",
cpus: "100m",
memory: "64Mi"},
}))
memory: "64Mi",
},
}),
)
printAllPodsOnNode(ctx, f.ClientSet, framework.TestContext.NodeName)
// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
// being [Serial], we can also assume no one else but us is running pods.
@@ -141,7 +169,7 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
"kubelet_memory_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
"": timelessSample(1),
}),
"kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
"kubelet_memory_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
"": timelessSample(0),
}),
})
@@ -150,6 +178,64 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
gomega.Eventually(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
ginkgo.By("Ensuring the metrics match the expectations a few more times")
gomega.Consistently(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
values, err := getKubeletMetrics(ctx)
framework.ExpectNoError(err, "error getting the kubelet metrics for sanity check")
err = validateMetrics(
values,
"kubelet_memory_manager_pinning_requests_total",
"kubelet_memory_manager_pinning_errors_total",
func(totVal, errVal float64) error {
if int64(totVal-errVal) < 1 {
return fmt.Errorf("expected total requests equal to total errors + 1")
}
return nil
},
)
framework.ExpectNoError(err, "error validating the kubelet metrics between each other")
})
})
})
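// validateMetrics pairs up the samples of two counter metrics from the same scrape and applies checkFn to each pair.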
func validateMetrics(values e2emetrics.KubeletMetrics, totalKey, errorKey string, checkFn func(totVal, errVal float64) error) error {
totalSamples := values[totalKey]
errorSamples := values[errorKey]
if len(totalSamples) != len(errorSamples) {
return fmt.Errorf("inconsistent samples, total=%d error=%d", len(totalSamples), len(errorSamples))
}
for idx := range totalSamples {
if err := checkFn(float64(totalSamples[idx].Value), float64(errorSamples[idx].Value)); err != nil {
return err
}
}
return nil
}
// printAllPodsOnNode logs the status of every pod scheduled on the given node and returns the number of pods found.
// Note: in the e2e_node environment there is always exactly one node, but we filter by node name anyway.
func printAllPodsOnNode(ctx context.Context, c clientset.Interface, nodeName string) int {
nodeSelector := fields.Set{
"spec.nodeName": nodeName,
}.AsSelector().String()
podList, err := c.CoreV1().Pods(metav1.NamespaceAll).List(ctx, metav1.ListOptions{
FieldSelector: nodeSelector,
})
if err != nil {
framework.Logf("Unable to retrieve pods for node %v: %v", nodeName, err)
return 0
}
count := 0
framework.Logf("begin listing pods: %d found", len(podList.Items))
for _, p := range podList.Items {
framework.Logf("%s/%s node %s (expected: %s) status %v QoS %s message %s reason %s (%d container statuses recorded)",
p.Namespace, p.Name, p.Spec.NodeName, nodeName, p.Status.Phase, p.Status.QOSClass, p.Status.Message, p.Status.Reason, len(p.Status.ContainerStatuses))
for _, c := range p.Status.ContainerStatuses {
framework.Logf("\tContainer %v ready: %v, restart count %v",
c.Name, c.Ready, c.RestartCount)
}
count++
}
framework.Logf("end listing pods: %d found", len(podList.Items))
return count
}
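
For reference, the invariant that the two new cross-checks encode can be written as a small standalone sketch. checkPinningCounters below is a hypothetical name, not part of this change; its inputs are the kubelet_memory_manager_pinning_requests_total and kubelet_memory_manager_pinning_errors_total samples taken from the same scrape.

package main

import "fmt"

// checkPinningCounters mirrors the checkFn closures passed to validateMetrics above:
// when the memory allocation is expected to fail, every pinning request must have
// produced an error; when it is expected to succeed, at least one request must have
// completed without an error.
func checkPinningCounters(requests, errors float64, expectFailure bool) error {
	if expectFailure {
		if int64(requests) != int64(errors) {
			return fmt.Errorf("expected requests == errors, got requests=%v errors=%v", requests, errors)
		}
		return nil
	}
	if int64(requests-errors) < 1 {
		return fmt.Errorf("expected at least one successful request, got requests=%v errors=%v", requests, errors)
	}
	return nil
}

func main() {
	fmt.Println(checkPinningCounters(1, 1, true))  // <nil>: the only request failed, as the rejection test expects
	fmt.Println(checkPinningCounters(1, 0, false)) // <nil>: one request succeeded with no errors
}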