Merge pull request #123468 from ffromani/fix-mm-metrics-test

node: memory manager: fix the metrics tests
Commit 25a43070ee, authored by Kubernetes Prow Robot on 2024-06-26 12:00:45 -07:00, committed by GitHub.
2 changed files with 203 additions and 97 deletions

Changed file 1 of 2:

@@ -78,6 +78,26 @@ func validateOOMScoreAdjSettingIsInRange(pid int, expectedMinOOMScoreAdj, expect
     return nil
 }
 
+func dumpRunningContainer(ctx context.Context) error {
+    runtime, _, err := getCRIClient()
+    if err != nil {
+        return err
+    }
+    containers, err := runtime.ListContainers(ctx, &runtimeapi.ContainerFilter{
+        State: &runtimeapi.ContainerStateValue{
+            State: runtimeapi.ContainerState_CONTAINER_RUNNING,
+        },
+    })
+    if err != nil {
+        return err
+    }
+    framework.Logf("Running containers:")
+    for _, c := range containers {
+        framework.Logf("%+v", c)
+    }
+    return nil
+}
+
 var _ = SIGDescribe("Container Manager Misc", framework.WithSerial(), func() {
     f := framework.NewDefaultFramework("kubelet-container-manager")
     f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
@@ -100,7 +120,24 @@ var _ = SIGDescribe("Container Manager Misc", framework.WithSerial(), func() {
             return validateOOMScoreAdjSetting(kubeletPids[0], -999)
         }, 5*time.Minute, 30*time.Second).Should(gomega.BeNil())
     })
 
+    ginkgo.Context("", func() {
+        ginkgo.Context("with test pods", func() {
+            var testPod *v1.Pod
+
+            // Log the running containers here to help debugging.
+            ginkgo.AfterEach(func(ctx context.Context) {
+                if ginkgo.CurrentSpecReport().Failed() {
+                    ginkgo.By("Dump all running containers")
+                    _ = dumpRunningContainer(ctx)
+                }
+
+                if testPod == nil {
+                    return // nothing to do
+                }
+                deletePodSyncByName(ctx, f, testPod.Name)
+                waitForAllContainerRemoval(ctx, testPod.Name, testPod.Namespace)
+            })
     ginkgo.It("pod infra containers oom-score-adj should be -998 and best effort container's should be 1000", func(ctx context.Context) {
         // Take a snapshot of existing pause processes. These were
         // created before this test, and may not be infra
@@ -111,7 +148,7 @@ var _ = SIGDescribe("Container Manager Misc", framework.WithSerial(), func() {
         podClient := e2epod.NewPodClient(f)
         podName := "besteffort" + string(uuid.NewUUID())
-        podClient.Create(ctx, &v1.Pod{
+        testPod = podClient.Create(ctx, &v1.Pod{
             ObjectMeta: metav1.ObjectMeta{
                 Name: podName,
             },
@@ -156,107 +193,90 @@ var _ = SIGDescribe("Container Manager Misc", framework.WithSerial(), func() {
             return validateOOMScoreAdjSetting(shPids[0], 1000)
         }, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
     })
-
-    // Log the running containers here to help debugging.
-    ginkgo.AfterEach(func() {
-        if ginkgo.CurrentSpecReport().Failed() {
-            ginkgo.By("Dump all running containers")
-            runtime, _, err := getCRIClient()
-            framework.ExpectNoError(err)
-            containers, err := runtime.ListContainers(context.Background(), &runtimeapi.ContainerFilter{
-                State: &runtimeapi.ContainerStateValue{
-                    State: runtimeapi.ContainerState_CONTAINER_RUNNING,
-                },
-            })
-            framework.ExpectNoError(err)
-            framework.Logf("Running containers:")
-            for _, c := range containers {
-                framework.Logf("%+v", c)
-            }
-        }
-    })
 
     ginkgo.It("guaranteed container's oom-score-adj should be -998", func(ctx context.Context) {
         podClient := e2epod.NewPodClient(f)
         podName := "guaranteed" + string(uuid.NewUUID())
-        podClient.Create(ctx, &v1.Pod{
+        testPod = podClient.Create(ctx, &v1.Pod{
             ObjectMeta: metav1.ObjectMeta{
                 Name: podName,
             },
             Spec: v1.PodSpec{
                 Containers: []v1.Container{
                     {
                         Image: imageutils.GetE2EImage(imageutils.Nginx),
                         Name:  podName,
                         Resources: v1.ResourceRequirements{
                             Limits: v1.ResourceList{
                                 v1.ResourceCPU:    resource.MustParse("100m"),
                                 v1.ResourceMemory: resource.MustParse("50Mi"),
                             },
                         },
                     },
                 },
             },
         })
         var (
             ngPids []int
             err    error
         )
         gomega.Eventually(ctx, func() error {
             ngPids, err = getPidsForProcess("nginx", "")
             if err != nil {
                 return fmt.Errorf("failed to get list of nginx process pids: %w", err)
             }
             for _, pid := range ngPids {
                 if err := validateOOMScoreAdjSetting(pid, -998); err != nil {
                     return err
                 }
             }
             return nil
         }, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
     })
 
     ginkgo.It("burstable container's oom-score-adj should be between [2, 1000)", func(ctx context.Context) {
         podClient := e2epod.NewPodClient(f)
         podName := "burstable" + string(uuid.NewUUID())
-        podClient.Create(ctx, &v1.Pod{
+        testPod = podClient.Create(ctx, &v1.Pod{
             ObjectMeta: metav1.ObjectMeta{
                 Name: podName,
             },
             Spec: v1.PodSpec{
                 Containers: []v1.Container{
                     {
                         Image: imageutils.GetE2EImage(imageutils.Agnhost),
                         Args:  []string{"test-webserver"},
                         Name:  podName,
                         Resources: v1.ResourceRequirements{
                             Requests: v1.ResourceList{
                                 v1.ResourceCPU:    resource.MustParse("100m"),
                                 v1.ResourceMemory: resource.MustParse("50Mi"),
                             },
                         },
                     },
                 },
             },
         })
         var (
             wsPids []int
             err    error
         )
         gomega.Eventually(ctx, func() error {
             wsPids, err = getPidsForProcess("agnhost", "")
             if err != nil {
                 return fmt.Errorf("failed to get list of test-webserver process pids: %w", err)
             }
             for _, pid := range wsPids {
                 if err := validateOOMScoreAdjSettingIsInRange(pid, 2, 1000); err != nil {
                     return err
                 }
             }
             return nil
         }, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
 
         // TODO: Test the oom-score-adj logic for burstable more accurately.
     })
+        })
+    })
     })
 })
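Aside from the patch itself, the restructuring in this file boils down to a standard Ginkgo pattern: each spec records what it created in a variable shared with a single AfterEach, which dumps diagnostics only when the spec failed and then cleans up. Below is a minimal, self-contained sketch of that pattern, assuming nothing beyond Ginkgo v2 and Gomega; the names createFakePod and deleteFakePod are invented stand-ins, not kubelet e2e helpers.

package cleanup_test

import (
    "testing"

    "github.com/onsi/ginkgo/v2"
    "github.com/onsi/gomega"
)

func TestCleanupPattern(t *testing.T) {
    gomega.RegisterFailHandler(ginkgo.Fail)
    ginkgo.RunSpecs(t, "cleanup pattern sketch")
}

var _ = ginkgo.Describe("specs that create things", func() {
    ginkgo.Context("with test pods", func() {
        // Shared between each spec and the AfterEach below.
        var trackedPod string

        ginkgo.AfterEach(func() {
            // Dump diagnostics only when the spec that just finished failed.
            if ginkgo.CurrentSpecReport().Failed() {
                ginkgo.GinkgoWriter.Printf("spec failed, dumping state for %q\n", trackedPod)
            }
            if trackedPod == "" {
                return // nothing to clean up
            }
            deleteFakePod(trackedPod)
            trackedPod = ""
        })

        ginkgo.It("creates and checks a pod", func() {
            trackedPod = createFakePod("example")
            gomega.Expect(trackedPod).NotTo(gomega.BeEmpty())
        })
    })
})

// createFakePod and deleteFakePod stand in for the real pod client calls.
func createFakePod(name string) string { return name + "-pod" }

func deleteFakePod(string) {}

Keeping the deletion in one AfterEach rather than at the end of each It means it still runs when an expectation fails partway through a spec, which is the kind of leftover-pod leak the new AfterEach appears to guard against.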

Changed file 2 of 2:

@@ -20,6 +20,7 @@ package e2enode
 
 import (
     "context"
+    "fmt"
     "time"
 
     "github.com/onsi/ginkgo/v2"
@@ -27,9 +28,13 @@ import (
     "github.com/onsi/gomega/gstruct"
 
     v1 "k8s.io/api/core/v1"
     "k8s.io/apimachinery/pkg/api/resource"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/fields"
+    clientset "k8s.io/client-go/kubernetes"
     kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
     "k8s.io/kubernetes/test/e2e/feature"
     "k8s.io/kubernetes/test/e2e/framework"
+    e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
     e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
     admissionapi "k8s.io/pod-security-admission/api"
 )
@@ -39,10 +44,10 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
     f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
 
     ginkgo.Context("when querying /metrics", func() {
+        var oldCfg *kubeletconfig.KubeletConfiguration
         var testPod *v1.Pod
 
         ginkgo.BeforeEach(func(ctx context.Context) {
-            var oldCfg *kubeletconfig.KubeletConfiguration
             var err error
             if oldCfg == nil {
                 oldCfg, err = getCurrentKubeletConfig(ctx)
@@ -73,6 +78,9 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
             }
             updateKubeletConfig(ctx, f, oldCfg, true)
+
+            count := printAllPodsOnNode(ctx, f.ClientSet, framework.TestContext.NodeName)
+            gomega.Expect(count).To(gomega.BeZero(), "unexpected pods on %q, please check output above", framework.TestContext.NodeName)
         })
 
         ginkgo.It("should report zero pinning counters after a fresh restart", func(ctx context.Context) {
@@ -82,10 +90,10 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
         matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
             "kubelet_memory_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
-                "": timelessSample(0),
+                "": timelessSample(0), // intentionally use stricter value
             }),
             "kubelet_memory_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
-                "": timelessSample(0),
+                "": timelessSample(0), // intentionally use stricter value
             }),
         })
@@ -103,7 +111,8 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
                 ctnName: "memmngrcnt",
                 cpus:    "100m",
                 memory:  "1000Gi"},
-        }))
+        }),
+        )
 
         // we updated the kubelet config in BeforeEach, so we can assume we start fresh.
         // being [Serial], we can also assume noone else but us is running pods.
@@ -122,6 +131,21 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
         gomega.Eventually(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
         ginkgo.By("Ensuring the metrics match the expectations a few more times")
         gomega.Consistently(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
+
+        values, err := getKubeletMetrics(ctx)
+        framework.ExpectNoError(err, "error getting the kubelet metrics for sanity check")
+        err = validateMetrics(
+            values,
+            "kubelet_memory_manager_pinning_requests_total",
+            "kubelet_memory_manager_pinning_errors_total",
+            func(totVal, errVal float64) error {
+                if int64(totVal) != int64(errVal) {
+                    return fmt.Errorf("expected total requests equal to total errors")
+                }
+                return nil
+            },
+        )
+        framework.ExpectNoError(err, "error validating the kubelet metrics between each other")
     })
 
     ginkgo.It("should not report any pinning failures when the memorymanager allocation is expected to succeed", func(ctx context.Context) {
@@ -131,8 +155,12 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
             {
                 ctnName: "memmngrcnt",
                 cpus:    "100m",
-                memory:  "64Mi"},
-        }))
+                memory:  "64Mi",
+            },
+        }),
+        )
+
+        printAllPodsOnNode(ctx, f.ClientSet, framework.TestContext.NodeName)
 
         // we updated the kubelet config in BeforeEach, so we can assume we start fresh.
         // being [Serial], we can also assume noone else but us is running pods.
@@ -141,7 +169,7 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
             "kubelet_memory_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
                 "": timelessSample(1),
             }),
-            "kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
+            "kubelet_memory_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
                 "": timelessSample(0),
             }),
         })
@@ -150,6 +178,64 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
         gomega.Eventually(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
         ginkgo.By("Ensuring the metrics match the expectations a few more times")
         gomega.Consistently(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
+
+        values, err := getKubeletMetrics(ctx)
+        framework.ExpectNoError(err, "error getting the kubelet metrics for sanity check")
+        err = validateMetrics(
+            values,
+            "kubelet_memory_manager_pinning_requests_total",
+            "kubelet_memory_manager_pinning_errors_total",
+            func(totVal, errVal float64) error {
+                if int64(totVal-errVal) < 1 {
+                    return fmt.Errorf("expected total requests equal to total errors + 1")
+                }
+                return nil
+            },
+        )
+        framework.ExpectNoError(err, "error validating the kubelet metrics between each other")
     })
     })
 })
+
+func validateMetrics(values e2emetrics.KubeletMetrics, totalKey, errorKey string, checkFn func(totVal, errVal float64) error) error {
+    totalSamples := values[totalKey]
+    errorSamples := values[errorKey]
+    if len(totalSamples) != len(errorSamples) {
+        return fmt.Errorf("inconsistent samples, total=%d error=%d", len(totalSamples), len(errorSamples))
+    }
+    for idx := range totalSamples {
+        if err := checkFn(float64(totalSamples[idx].Value), float64(errorSamples[idx].Value)); err != nil {
+            return err
+        }
+    }
+    return nil
+}
+
+// printAllPodsOnNode outputs status of all kubelet pods into log.
+// Note considering the e2e_node environment we will always have exactly 1 node, but still.
+func printAllPodsOnNode(ctx context.Context, c clientset.Interface, nodeName string) int {
+    nodeSelector := fields.Set{
+        "spec.nodeName": nodeName,
+    }.AsSelector().String()
+    podList, err := c.CoreV1().Pods(metav1.NamespaceAll).List(ctx, metav1.ListOptions{
+        FieldSelector: nodeSelector,
+    })
+    if err != nil {
+        framework.Logf("Unable to retrieve pods for node %v: %v", nodeName, err)
+        return 0
+    }
+    count := 0
+    framework.Logf("begin listing pods: %d found", len(podList.Items))
+    for _, p := range podList.Items {
+        framework.Logf("%s/%s node %s (expected: %s) status %v QoS %s message %s reason %s (%d container statuses recorded)",
+            p.Namespace, p.Name, p.Spec.NodeName, nodeName, p.Status.Phase, p.Status.QOSClass, p.Status.Message, p.Status.Reason, len(p.Status.ContainerStatuses))
+        for _, c := range p.Status.ContainerStatuses {
+            framework.Logf("\tContainer %v ready: %v, restart count %v",
+                c.Name, c.Ready, c.RestartCount)
+        }
+        count++
+    }
+    framework.Logf("end listing pods: %d found", len(podList.Items))
+    return count
+}
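The validateMetrics helper added above is, at its core, a lockstep comparison of two counter families. The following framework-free sketch shows the same idea on plain maps; metricSamples and crossCheck are invented names for illustration and are not part of the e2e framework.

package main

import (
    "fmt"
)

// metricSamples is a stand-in for a scraped counter family:
// one value per label set (here keyed by node name).
type metricSamples map[string]float64

// crossCheck mirrors the idea of the validateMetrics helper above:
// walk two counter families in lockstep and let checkFn decide
// whether each (total, error) pair is acceptable.
func crossCheck(totals, errors metricSamples, checkFn func(totVal, errVal float64) error) error {
    if len(totals) != len(errors) {
        return fmt.Errorf("inconsistent samples, total=%d error=%d", len(totals), len(errors))
    }
    for key, totVal := range totals {
        errVal, ok := errors[key]
        if !ok {
            return fmt.Errorf("missing error sample for %q", key)
        }
        if err := checkFn(totVal, errVal); err != nil {
            return fmt.Errorf("sample %q: %w", key, err)
        }
    }
    return nil
}

func main() {
    totals := metricSamples{"node-0": 1}
    errors := metricSamples{"node-0": 0}

    // Success path expected here: one pinning request, zero errors.
    err := crossCheck(totals, errors, func(totVal, errVal float64) error {
        if int64(totVal-errVal) < 1 {
            return fmt.Errorf("expected at least one more request than errors")
        }
        return nil
    })
    fmt.Println("check result:", err)
}

Passing the comparison in as a function keeps the helper reusable for both the "requests == errors" and the "requests >= errors + 1" checks used by the two specs.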
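Similarly, the field selector that printAllPodsOnNode builds also works with a bare client-go clientset outside the e2e framework. A rough standalone sketch, assuming a reachable cluster and a kubeconfig at $HOME/.kube/config:

package main

import (
    "context"
    "fmt"
    "os"
    "path/filepath"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/fields"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/tools/clientcmd"
)

func main() {
    if len(os.Args) < 2 {
        fmt.Println("usage: listpods <node-name>")
        os.Exit(1)
    }
    nodeName := os.Args[1]

    // Build a client from the local kubeconfig (assumed to exist).
    kubeconfig := filepath.Join(os.Getenv("HOME"), ".kube", "config")
    cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfig)
    if err != nil {
        panic(err)
    }
    client, err := kubernetes.NewForConfig(cfg)
    if err != nil {
        panic(err)
    }

    // Same kind of field selector the helper above builds: pods scheduled to one node.
    selector := fields.Set{"spec.nodeName": nodeName}.AsSelector().String()
    pods, err := client.CoreV1().Pods(metav1.NamespaceAll).List(context.Background(),
        metav1.ListOptions{FieldSelector: selector})
    if err != nil {
        panic(err)
    }
    for _, p := range pods.Items {
        fmt.Printf("%s/%s phase=%s qos=%s\n", p.Namespace, p.Name, p.Status.Phase, p.Status.QOSClass)
    }
}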