node: memory manager: fix the mm metrics test

Fix the memory manager metrics tests by correctly restoring
the kubelet config after each test. We need to capture the
original config before any of the related tests run, to make
sure we restore the correct values.

Add more debug facilities to troubleshoot further failures.

Signed-off-by: Francesco Romani <fromani@redhat.com>
Francesco Romani 2024-03-04 13:46:09 +01:00
parent ef9965ebc6
commit a5d771c911
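
For context, here is a minimal sketch (not the committed code) of the
capture-once / restore-after-each pattern this change applies. It assumes
the existing e2e_node helpers getCurrentKubeletConfig and updateKubeletConfig
that appear in the diff below; the Describe text and framework name are
invented for illustration only.

package e2enode

import (
	"context"

	"github.com/onsi/ginkgo/v2"

	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
	"k8s.io/kubernetes/test/e2e/framework"
	admissionapi "k8s.io/pod-security-admission/api"
)

var _ = ginkgo.Describe("kubelet config restore pattern (sketch)", func() {
	f := framework.NewDefaultFramework("config-restore-sketch")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged

	ginkgo.Context("when a test mutates the kubelet config", func() {
		// Declared at Context scope so the captured value survives across
		// BeforeEach invocations; re-declaring it inside BeforeEach would
		// shadow this variable and break the restore in AfterEach.
		var oldCfg *kubeletconfig.KubeletConfiguration

		ginkgo.BeforeEach(func(ctx context.Context) {
			var err error
			if oldCfg == nil {
				// Capture the pristine config only once, before any of the
				// related tests has had a chance to change it.
				oldCfg, err = getCurrentKubeletConfig(ctx)
				framework.ExpectNoError(err)
			}
			// ... apply the test-specific kubelet configuration here ...
		})

		ginkgo.AfterEach(func(ctx context.Context) {
			if oldCfg != nil {
				// Restore the configuration captured above so the next test
				// (and the rest of the suite) starts from the original values.
				updateKubeletConfig(ctx, f, oldCfg, true)
			}
		})
	})
})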

@@ -20,6 +20,7 @@ package e2enode
import (
"context"
"fmt"
"time"
"github.com/onsi/ginkgo/v2"
@@ -27,9 +28,13 @@ import (
"github.com/onsi/gomega/gstruct"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
clientset "k8s.io/client-go/kubernetes"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
"k8s.io/kubernetes/test/e2e/feature"
"k8s.io/kubernetes/test/e2e/framework"
e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
admissionapi "k8s.io/pod-security-admission/api"
)
@@ -39,10 +44,10 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
ginkgo.Context("when querying /metrics", func() {
var oldCfg *kubeletconfig.KubeletConfiguration
var testPod *v1.Pod
ginkgo.BeforeEach(func(ctx context.Context) {
var oldCfg *kubeletconfig.KubeletConfiguration
var err error
if oldCfg == nil {
oldCfg, err = getCurrentKubeletConfig(ctx)
@@ -73,6 +78,9 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
}
updateKubeletConfig(ctx, f, oldCfg, true)
})
count := printAllPodsOnNode(ctx, f.ClientSet, framework.TestContext.NodeName)
gomega.Expect(count).To(gomega.BeZero(), "unexpected pods on %q, please check output above", framework.TestContext.NodeName)
})
ginkgo.It("should report zero pinning counters after a fresh restart", func(ctx context.Context) {
@@ -82,10 +90,10 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
"kubelet_memory_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
"": timelessSample(0),
"": timelessSample(0), // intentionally use stricter value
}),
"kubelet_memory_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
"": timelessSample(0),
"": timelessSample(0), // intentionally use stricter value
}),
})
@@ -103,7 +111,8 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
ctnName: "memmngrcnt",
cpus: "100m",
memory: "1000Gi"},
}))
}),
)
// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
// being [Serial], we can also assume no one else but us is running pods.
@@ -122,6 +131,21 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
gomega.Eventually(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
ginkgo.By("Ensuring the metrics match the expectations a few more times")
gomega.Consistently(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
values, err := getKubeletMetrics(ctx)
framework.ExpectNoError(err, "error getting the kubelet metrics for sanity check")
err = validateMetrics(
values,
"kubelet_memory_manager_pinning_requests_total",
"kubelet_memory_manager_pinning_errors_total",
func(totVal, errVal float64) error {
if int64(totVal) != int64(errVal) {
return fmt.Errorf("expected total requests equal to total errors")
}
return nil
},
)
framework.ExpectNoError(err, "error validating the kubelet metrics between each other")
})
ginkgo.It("should not report any pinning failures when the memorymanager allocation is expected to succeed", func(ctx context.Context) {
@@ -131,8 +155,12 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
{
ctnName: "memmngrcnt",
cpus: "100m",
memory: "64Mi"},
}))
memory: "64Mi",
},
}),
)
printAllPodsOnNode(ctx, f.ClientSet, framework.TestContext.NodeName)
// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
// being [Serial], we can also assume no one else but us is running pods.
@@ -141,7 +169,7 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
"kubelet_memory_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
"": timelessSample(1),
}),
"kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
"kubelet_memory_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
"": timelessSample(0),
}),
})
@@ -150,6 +178,64 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
gomega.Eventually(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
ginkgo.By("Ensuring the metrics match the expectations a few more times")
gomega.Consistently(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
values, err := getKubeletMetrics(ctx)
framework.ExpectNoError(err, "error getting the kubelet metrics for sanity check")
err = validateMetrics(
values,
"kubelet_memory_manager_pinning_requests_total",
"kubelet_memory_manager_pinning_errors_total",
func(totVal, errVal float64) error {
if int64(totVal-errVal) < 1 {
return fmt.Errorf("expected total requests equal to total errors + 1")
}
return nil
},
)
framework.ExpectNoError(err, "error validating the kubelet metrics between each other")
})
})
})
func validateMetrics(values e2emetrics.KubeletMetrics, totalKey, errorKey string, checkFn func(totVal, errVal float64) error) error {
totalSamples := values[totalKey]
errorSamples := values[errorKey]
if len(totalSamples) != len(errorSamples) {
return fmt.Errorf("inconsistent samples, total=%d error=%d", len(totalSamples), len(errorSamples))
}
for idx := range totalSamples {
if err := checkFn(float64(totalSamples[idx].Value), float64(errorSamples[idx].Value)); err != nil {
return err
}
}
return nil
}
// printAllPodsOnNode outputs status of all kubelet pods into log.
// Note considering the e2e_node environment we will always have exactly 1 node, but still.
func printAllPodsOnNode(ctx context.Context, c clientset.Interface, nodeName string) int {
nodeSelector := fields.Set{
"spec.nodeName": nodeName,
}.AsSelector().String()
podList, err := c.CoreV1().Pods(metav1.NamespaceAll).List(ctx, metav1.ListOptions{
FieldSelector: nodeSelector,
})
if err != nil {
framework.Logf("Unable to retrieve pods for node %v: %v", nodeName, err)
return 0
}
count := 0
framework.Logf("begin listing pods: %d found", len(podList.Items))
for _, p := range podList.Items {
framework.Logf("%s/%s node %s (expected: %s) status %v QoS %s message %s reason %s (%d container statuses recorded)",
p.Namespace, p.Name, p.Spec.NodeName, nodeName, p.Status.Phase, p.Status.QOSClass, p.Status.Message, p.Status.Reason, len(p.Status.ContainerStatuses))
for _, c := range p.Status.ContainerStatuses {
framework.Logf("\tContainer %v ready: %v, restart count %v",
c.Name, c.Ready, c.RestartCount)
}
count++
}
framework.Logf("end listing pods: %d found", len(podList.Items))
return count
}