diff --git a/test/e2e_node/container_manager_test.go b/test/e2e_node/container_manager_test.go
index 8aad239a6c8..4aa89eff3d8 100644
--- a/test/e2e_node/container_manager_test.go
+++ b/test/e2e_node/container_manager_test.go
@@ -78,6 +78,26 @@ func validateOOMScoreAdjSettingIsInRange(pid int, expectedMinOOMScoreAdj, expect
 	return nil
 }
 
+func dumpRunningContainer(ctx context.Context) error {
+	runtime, _, err := getCRIClient()
+	if err != nil {
+		return err
+	}
+	containers, err := runtime.ListContainers(ctx, &runtimeapi.ContainerFilter{
+		State: &runtimeapi.ContainerStateValue{
+			State: runtimeapi.ContainerState_CONTAINER_RUNNING,
+		},
+	})
+	if err != nil {
+		return err
+	}
+	framework.Logf("Running containers:")
+	for _, c := range containers {
+		framework.Logf("%+v", c)
+	}
+	return nil
+}
+
 var _ = SIGDescribe("Container Manager Misc", framework.WithSerial(), func() {
 	f := framework.NewDefaultFramework("kubelet-container-manager")
 	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
@@ -100,7 +120,24 @@ var _ = SIGDescribe("Container Manager Misc", framework.WithSerial(), func() {
 					return validateOOMScoreAdjSetting(kubeletPids[0], -999)
 				}, 5*time.Minute, 30*time.Second).Should(gomega.BeNil())
 			})
-			ginkgo.Context("", func() {
+
+			ginkgo.Context("with test pods", func() {
+				var testPod *v1.Pod
+
+				// Log the running containers here to help debugging.
+				ginkgo.AfterEach(func(ctx context.Context) {
+					if ginkgo.CurrentSpecReport().Failed() {
+						ginkgo.By("Dump all running containers")
+						_ = dumpRunningContainer(ctx)
+					}
+
+					if testPod == nil {
+						return // nothing to do
+					}
+					deletePodSyncByName(ctx, f, testPod.Name)
+					waitForAllContainerRemoval(ctx, testPod.Name, testPod.Namespace)
+				})
+
 				ginkgo.It("pod infra containers oom-score-adj should be -998 and best effort container's should be 1000", func(ctx context.Context) {
 					// Take a snapshot of existing pause processes. These were
 					// created before this test, and may not be infra
@@ -111,7 +148,7 @@ var _ = SIGDescribe("Container Manager Misc", framework.WithSerial(), func() {
 
 					podClient := e2epod.NewPodClient(f)
 					podName := "besteffort" + string(uuid.NewUUID())
-					podClient.Create(ctx, &v1.Pod{
+					testPod = podClient.Create(ctx, &v1.Pod{
 						ObjectMeta: metav1.ObjectMeta{
 							Name: podName,
 						},
@@ -156,107 +193,90 @@ var _ = SIGDescribe("Container Manager Misc", framework.WithSerial(), func() {
 						return validateOOMScoreAdjSetting(shPids[0], 1000)
 					}, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
 				})
-				// Log the running containers here to help debugging.
-				ginkgo.AfterEach(func() {
-					if ginkgo.CurrentSpecReport().Failed() {
-						ginkgo.By("Dump all running containers")
-						runtime, _, err := getCRIClient()
-						framework.ExpectNoError(err)
-						containers, err := runtime.ListContainers(context.Background(), &runtimeapi.ContainerFilter{
-							State: &runtimeapi.ContainerStateValue{
-								State: runtimeapi.ContainerState_CONTAINER_RUNNING,
-							},
-						})
-						framework.ExpectNoError(err)
-						framework.Logf("Running containers:")
-						for _, c := range containers {
-							framework.Logf("%+v", c)
-						}
-					}
-				})
-			})
-			ginkgo.It("guaranteed container's oom-score-adj should be -998", func(ctx context.Context) {
-				podClient := e2epod.NewPodClient(f)
-				podName := "guaranteed" + string(uuid.NewUUID())
-				podClient.Create(ctx, &v1.Pod{
-					ObjectMeta: metav1.ObjectMeta{
-						Name: podName,
-					},
-					Spec: v1.PodSpec{
-						Containers: []v1.Container{
-							{
-								Image: imageutils.GetE2EImage(imageutils.Nginx),
-								Name:  podName,
-								Resources: v1.ResourceRequirements{
-									Limits: v1.ResourceList{
-										v1.ResourceCPU:    resource.MustParse("100m"),
-										v1.ResourceMemory: resource.MustParse("50Mi"),
+
+			ginkgo.It("guaranteed container's oom-score-adj should be -998", func(ctx context.Context) {
+				podClient := e2epod.NewPodClient(f)
+				podName := "guaranteed" + string(uuid.NewUUID())
+				testPod = podClient.Create(ctx, &v1.Pod{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: podName,
+					},
+					Spec: v1.PodSpec{
+						Containers: []v1.Container{
+							{
+								Image: imageutils.GetE2EImage(imageutils.Nginx),
+								Name:  podName,
+								Resources: v1.ResourceRequirements{
+									Limits: v1.ResourceList{
+										v1.ResourceCPU:    resource.MustParse("100m"),
+										v1.ResourceMemory: resource.MustParse("50Mi"),
+									},
 								},
 							},
 						},
 					},
-				},
-			})
-			var (
-				ngPids []int
-				err    error
-			)
-			gomega.Eventually(ctx, func() error {
-				ngPids, err = getPidsForProcess("nginx", "")
-				if err != nil {
-					return fmt.Errorf("failed to get list of nginx process pids: %w", err)
-				}
-				for _, pid := range ngPids {
-					if err := validateOOMScoreAdjSetting(pid, -998); err != nil {
-						return err
+				})
+				var (
+					ngPids []int
+					err    error
+				)
+				gomega.Eventually(ctx, func() error {
+					ngPids, err = getPidsForProcess("nginx", "")
+					if err != nil {
+						return fmt.Errorf("failed to get list of nginx process pids: %w", err)
+					}
+					for _, pid := range ngPids {
+						if err := validateOOMScoreAdjSetting(pid, -998); err != nil {
+							return err
+						}
 					}
-				}
-				return nil
-			}, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
+					return nil
+				}, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
 
-			})
-			ginkgo.It("burstable container's oom-score-adj should be between [2, 1000)", func(ctx context.Context) {
-				podClient := e2epod.NewPodClient(f)
-				podName := "burstable" + string(uuid.NewUUID())
-				podClient.Create(ctx, &v1.Pod{
-					ObjectMeta: metav1.ObjectMeta{
-						Name: podName,
-					},
-					Spec: v1.PodSpec{
-						Containers: []v1.Container{
-							{
-								Image: imageutils.GetE2EImage(imageutils.Agnhost),
-								Args:  []string{"test-webserver"},
-								Name:  podName,
-								Resources: v1.ResourceRequirements{
-									Requests: v1.ResourceList{
-										v1.ResourceCPU:    resource.MustParse("100m"),
-										v1.ResourceMemory: resource.MustParse("50Mi"),
+			})
+			ginkgo.It("burstable container's oom-score-adj should be between [2, 1000)", func(ctx context.Context) {
+				podClient := e2epod.NewPodClient(f)
+				podName := "burstable" + string(uuid.NewUUID())
+				testPod = podClient.Create(ctx, &v1.Pod{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: podName,
+					},
+					Spec: v1.PodSpec{
+						Containers: []v1.Container{
+							{
+								Image: imageutils.GetE2EImage(imageutils.Agnhost),
+								Args:  []string{"test-webserver"},
+								Name:  podName,
+								Resources: v1.ResourceRequirements{
+									Requests: v1.ResourceList{
+										v1.ResourceCPU:    resource.MustParse("100m"),
+										v1.ResourceMemory: resource.MustParse("50Mi"),
+									},
 								},
 							},
 						},
 					},
-				},
-			})
-			var (
-				wsPids []int
-				err    error
-			)
-			gomega.Eventually(ctx, func() error {
-				wsPids, err = getPidsForProcess("agnhost", "")
-				if err != nil {
-					return fmt.Errorf("failed to get list of test-webserver process pids: %w", err)
-				}
-				for _, pid := range wsPids {
-					if err := validateOOMScoreAdjSettingIsInRange(pid, 2, 1000); err != nil {
-						return err
+				})
+				var (
+					wsPids []int
+					err    error
+				)
+				gomega.Eventually(ctx, func() error {
+					wsPids, err = getPidsForProcess("agnhost", "")
+					if err != nil {
+						return fmt.Errorf("failed to get list of test-webserver process pids: %w", err)
					}
-				}
-				return nil
-			}, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
+					for _, pid := range wsPids {
+						if err := validateOOMScoreAdjSettingIsInRange(pid, 2, 1000); err != nil {
+							return err
+						}
+					}
+					return nil
+				}, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
 
-				// TODO: Test the oom-score-adj logic for burstable more accurately.
+					// TODO: Test the oom-score-adj logic for burstable more accurately.
+				})
 			})
 		})
 	})
diff --git a/test/e2e_node/memory_manager_metrics_test.go b/test/e2e_node/memory_manager_metrics_test.go
index b78e3167f7c..f085ac4f04c 100644
--- a/test/e2e_node/memory_manager_metrics_test.go
+++ b/test/e2e_node/memory_manager_metrics_test.go
@@ -20,6 +20,7 @@ package e2enode
 
 import (
 	"context"
+	"fmt"
 	"time"
 
 	"github.com/onsi/ginkgo/v2"
@@ -27,9 +28,13 @@ import (
 	"github.com/onsi/gomega/gstruct"
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/fields"
+	clientset "k8s.io/client-go/kubernetes"
 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
 	"k8s.io/kubernetes/test/e2e/feature"
 	"k8s.io/kubernetes/test/e2e/framework"
+	e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
 	admissionapi "k8s.io/pod-security-admission/api"
 )
@@ -39,10 +44,10 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
 	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
 
 	ginkgo.Context("when querying /metrics", func() {
+		var oldCfg *kubeletconfig.KubeletConfiguration
 		var testPod *v1.Pod
 
 		ginkgo.BeforeEach(func(ctx context.Context) {
-			var oldCfg *kubeletconfig.KubeletConfiguration
 			var err error
 			if oldCfg == nil {
 				oldCfg, err = getCurrentKubeletConfig(ctx)
@@ -73,6 +78,9 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
 				}
 				updateKubeletConfig(ctx, f, oldCfg, true)
 			})
+
+			count := printAllPodsOnNode(ctx, f.ClientSet, framework.TestContext.NodeName)
+			gomega.Expect(count).To(gomega.BeZero(), "unexpected pods on %q, please check output above", framework.TestContext.NodeName)
 		})
 
 		ginkgo.It("should report zero pinning counters after a fresh restart", func(ctx context.Context) {
@@ -82,10 +90,10 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
 
 			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 				"kubelet_memory_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
-					"": timelessSample(0),
+					"": timelessSample(0), // intentionally use stricter value
 				}),
 				"kubelet_memory_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
-					"": timelessSample(0),
+					"": timelessSample(0), // intentionally use stricter value
 				}),
 			})
 
@@ -103,7 +111,8 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
 					ctnName: "memmngrcnt",
 					cpus:    "100m",
 					memory:  "1000Gi"},
-			}))
+			}),
+			)
 
 			// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
 			// being [Serial], we can also assume noone else but us is running pods.
@@ -122,6 +131,21 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
 			gomega.Eventually(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
 			ginkgo.By("Ensuring the metrics match the expectations a few more times")
 			gomega.Consistently(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
+
+			values, err := getKubeletMetrics(ctx)
+			framework.ExpectNoError(err, "error getting the kubelet metrics for sanity check")
+			err = validateMetrics(
+				values,
+				"kubelet_memory_manager_pinning_requests_total",
+				"kubelet_memory_manager_pinning_errors_total",
+				func(totVal, errVal float64) error {
+					if int64(totVal) != int64(errVal) {
+						return fmt.Errorf("expected total requests equal to total errors")
+					}
+					return nil
+				},
+			)
+			framework.ExpectNoError(err, "error validating the kubelet metrics between each other")
 		})
 
 		ginkgo.It("should not report any pinning failures when the memorymanager allocation is expected to succeed", func(ctx context.Context) {
@@ -131,8 +155,12 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
 				{
 					ctnName: "memmngrcnt",
 					cpus:    "100m",
-					memory:  "64Mi"},
-			}))
+					memory:  "64Mi",
+				},
+			}),
+			)
+
+			printAllPodsOnNode(ctx, f.ClientSet, framework.TestContext.NodeName)
 
 			// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
 			// being [Serial], we can also assume noone else but us is running pods.
@@ -141,7 +169,7 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
 				"kubelet_memory_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(1),
 				}),
-				"kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
+				"kubelet_memory_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
 				}),
 			})
@@ -150,6 +178,64 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
 			gomega.Eventually(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
 			ginkgo.By("Ensuring the metrics match the expectations a few more times")
 			gomega.Consistently(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
+
+			values, err := getKubeletMetrics(ctx)
+			framework.ExpectNoError(err, "error getting the kubelet metrics for sanity check")
+			err = validateMetrics(
+				values,
+				"kubelet_memory_manager_pinning_requests_total",
+				"kubelet_memory_manager_pinning_errors_total",
+				func(totVal, errVal float64) error {
+					if int64(totVal-errVal) < 1 {
+						return fmt.Errorf("expected total requests equal to total errors + 1")
+					}
+					return nil
+				},
+			)
+			framework.ExpectNoError(err, "error validating the kubelet metrics between each other")
 		})
 	})
 })
+
+func validateMetrics(values e2emetrics.KubeletMetrics, totalKey, errorKey string, checkFn func(totVal, errVal float64) error) error {
+	totalSamples := values[totalKey]
+	errorSamples := values[errorKey]
+	if len(totalSamples) != len(errorSamples) {
+		return fmt.Errorf("inconsistent samples, total=%d error=%d", len(totalSamples), len(errorSamples))
+	}
+	for idx := range totalSamples {
+		if err := checkFn(float64(totalSamples[idx].Value), float64(errorSamples[idx].Value)); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// printAllPodsOnNode outputs status of all kubelet pods into log.
+// Note considering the e2e_node environment we will always have exactly 1 node, but still.
+func printAllPodsOnNode(ctx context.Context, c clientset.Interface, nodeName string) int {
+	nodeSelector := fields.Set{
+		"spec.nodeName": nodeName,
+	}.AsSelector().String()
+
+	podList, err := c.CoreV1().Pods(metav1.NamespaceAll).List(ctx, metav1.ListOptions{
+		FieldSelector: nodeSelector,
+	})
+	if err != nil {
+		framework.Logf("Unable to retrieve pods for node %v: %v", nodeName, err)
+		return 0
+	}
+	count := 0
+	framework.Logf("begin listing pods: %d found", len(podList.Items))
+	for _, p := range podList.Items {
+		framework.Logf("%s/%s node %s (expected: %s) status %v QoS %s message %s reason %s (%d container statuses recorded)",
+			p.Namespace, p.Name, p.Spec.NodeName, nodeName, p.Status.Phase, p.Status.QOSClass, p.Status.Message, p.Status.Reason, len(p.Status.ContainerStatuses))
+		for _, c := range p.Status.ContainerStatuses {
+			framework.Logf("\tContainer %v ready: %v, restart count %v",
+				c.Name, c.Ready, c.RestartCount)
+		}
+		count++
+	}
+	framework.Logf("end listing pods: %d found", len(podList.Items))
+	return count
+}
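
The validateMetrics helper added at the end of memory_manager_metrics_test.go cross-checks two kubelet counter series sample-by-sample with a caller-supplied predicate, and the two tests pass different predicates (counters must be equal in the "known to fail" case, requests must exceed errors by at least one in the "expected to succeed" case). A minimal standalone sketch of that pattern, not part of the patch: crossCheck is a hypothetical stand-in, plain float64 slices replace e2emetrics.KubeletMetrics, and the sample values are made up for illustration.

package main

import "fmt"

// crossCheck pairs the i-th "total" sample with the i-th "error" sample and
// applies a caller-supplied check to each pair, mirroring the shape of the
// validateMetrics helper in the patch.
func crossCheck(totals, errs []float64, checkFn func(totVal, errVal float64) error) error {
	if len(totals) != len(errs) {
		return fmt.Errorf("inconsistent samples, total=%d error=%d", len(totals), len(errs))
	}
	for i := range totals {
		if err := checkFn(totals[i], errs[i]); err != nil {
			return err
		}
	}
	return nil
}

func main() {
	// "Known to fail" expectation: every pinning request observed so far failed,
	// so the two counters must match (here 1 request, 1 error).
	failureCase := func(totVal, errVal float64) error {
		if int64(totVal) != int64(errVal) {
			return fmt.Errorf("expected total requests equal to total errors")
		}
		return nil
	}
	fmt.Println(crossCheck([]float64{1}, []float64{1}, failureCase)) // <nil>

	// "Expected to succeed" expectation: at least one request more than errors
	// (here 1 request, 0 errors).
	successCase := func(totVal, errVal float64) error {
		if int64(totVal-errVal) < 1 {
			return fmt.Errorf("expected total requests to exceed total errors by at least 1")
		}
		return nil
	}
	fmt.Println(crossCheck([]float64{1}, []float64{0}, successCase)) // <nil>
}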