From 844c2ef39d2fa6199aaebdf516a70d74c14072f7 Mon Sep 17 00:00:00 2001
From: Francesco Romani
Date: Fri, 14 Feb 2025 11:48:29 +0100
Subject: [PATCH 1/2] e2e: node: cpumgr: cleanup after each test case

Our CI machines happen to have 1 fully allocatable CPU for test
workloads. This is really, really the bare minimum, but it should still
be sufficient to run the tests; the CFS quota test, however, creates a
series of pods (6 at the time of writing) and cleans them up only at
the very end. This means pods requiring resources accumulate on the CI
machine node.

The fix implemented here is to just clean up after each subcase. This
way the CPU footprint of the test is equal to the highest requirement
of any single subcase (say, 1000 millicores) rather than the sum of all
the subcases' requirements. This doesn't change the test behavior, and
makes it possible to run the test on very barebones machines.
---
 test/e2e_node/cpu_manager_test.go      | 47 +++++++++++++++++---------
 test/e2e_node/topology_manager_test.go | 11 ++++--
 2 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/test/e2e_node/cpu_manager_test.go b/test/e2e_node/cpu_manager_test.go
index 00fa990269f..313729aa26d 100644
--- a/test/e2e_node/cpu_manager_test.go
+++ b/test/e2e_node/cpu_manager_test.go
@@ -596,18 +596,27 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     var err error
     var ctnAttrs []ctnAttribute
     var pod1, pod2, pod3 *v1.Pod
-    var cleanupPods []*v1.Pod
-    ginkgo.DeferCleanup(func() {
+    podsToClean := make(map[string]*v1.Pod) // pod.UID -> pod
+
+    deleteTestPod := func(pod *v1.Pod) {
         // waitForContainerRemoval takes "long" to complete; if we use the parent ctx we get a
         // 'deadline expired' message and the cleanup aborts, which we don't want.
-        ctx2 := context.TODO()
+        // So let's use a separate and more generous timeout (determined by trial and error)
+        ctx2, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
+        defer cancel()
+        deletePodSyncAndWait(ctx2, f, pod.Namespace, pod.Name)
+        delete(podsToClean, string(pod.UID))
+    }
+
+    // cleanup leftovers on test failure. The happy path is covered by `deleteTestPod` calls
+    ginkgo.DeferCleanup(func() {
         ginkgo.By("by deleting the pods and waiting for container removal")
-        for _, cleanupPod := range cleanupPods {
-            framework.Logf("deleting pod: %s/%s", cleanupPod.Namespace, cleanupPod.Name)
-            deletePodSyncByName(ctx2, f, cleanupPod.Name)
-            waitForContainerRemoval(ctx2, cleanupPod.Spec.Containers[0].Name, cleanupPod.Name, cleanupPod.Namespace)
-            framework.Logf("deleted pod: %s/%s", cleanupPod.Namespace, cleanupPod.Name)
-        }
+        // waitForContainerRemoval takes "long" to complete; if we use the parent ctx we get a
+        // 'deadline expired' message and the cleanup aborts, which we don't want.
+        // So let's use a separate and more generous timeout (determined by trial and error)
+        ctx2, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
+        defer cancel()
+        deletePodsAsync(ctx2, f, podsToClean)
     })

     cfsCheckCommand := []string{"sh", "-c", "cat /sys/fs/cgroup/cpu.max && sleep 1d"}
@@ -623,7 +632,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     pod1 = makeCPUManagerPod("gu-pod1", ctnAttrs)
     pod1.Spec.Containers[0].Command = cfsCheckCommand
     pod1 = e2epod.NewPodClient(f).CreateSync(ctx, pod1)
-    cleanupPods = append(cleanupPods, pod1)
+    podsToClean[string(pod1.UID)] = pod1

     ginkgo.By("checking if the expected cfs quota was assigned (GU pod, exclusive CPUs, unlimited)")

@@ -635,6 +644,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod1.Name, pod1.Spec.Containers[0].Name, expCFSQuotaRegex)
     framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
         pod1.Spec.Containers[0].Name, pod1.Name)
+    deleteTestPod(pod1)

     ctnAttrs = []ctnAttribute{
         {
@@ -646,7 +656,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     pod2 = makeCPUManagerPod("gu-pod2", ctnAttrs)
     pod2.Spec.Containers[0].Command = cfsCheckCommand
     pod2 = e2epod.NewPodClient(f).CreateSync(ctx, pod2)
-    cleanupPods = append(cleanupPods, pod2)
+    podsToClean[string(pod2.UID)] = pod2

     ginkgo.By("checking if the expected cfs quota was assigned (GU pod, limited)")

@@ -655,6 +665,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod2.Name, pod2.Spec.Containers[0].Name, expCFSQuotaRegex)
     framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
         pod2.Spec.Containers[0].Name, pod2.Name)
+    deleteTestPod(pod2)

     ctnAttrs = []ctnAttribute{
         {
@@ -666,7 +677,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     pod3 = makeCPUManagerPod("non-gu-pod3", ctnAttrs)
     pod3.Spec.Containers[0].Command = cfsCheckCommand
     pod3 = e2epod.NewPodClient(f).CreateSync(ctx, pod3)
-    cleanupPods = append(cleanupPods, pod3)
+    podsToClean[string(pod3.UID)] = pod3

     ginkgo.By("checking if the expected cfs quota was assigned (BU pod, limited)")

@@ -675,6 +686,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod3.Name, pod3.Spec.Containers[0].Name, expCFSQuotaRegex)
     framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
         pod3.Spec.Containers[0].Name, pod3.Name)
+    deleteTestPod(pod3)

     ctnAttrs = []ctnAttribute{
         {
@@ -692,7 +704,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     pod4.Spec.Containers[0].Command = cfsCheckCommand
     pod4.Spec.Containers[1].Command = cfsCheckCommand
     pod4 = e2epod.NewPodClient(f).CreateSync(ctx, pod4)
-    cleanupPods = append(cleanupPods, pod4)
+    podsToClean[string(pod4.UID)] = pod4

     ginkgo.By("checking if the expected cfs quota was assigned (GU pod, container 0 exclusive CPUs unlimited, container 1 limited)")

@@ -709,6 +721,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod4.Name, pod4.Spec.Containers[1].Name, expCFSQuotaRegex)
     framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
         pod4.Spec.Containers[1].Name, pod4.Name)
+    deleteTestPod(pod4)

     ctnAttrs = []ctnAttribute{
         {
@@ -728,7 +741,8 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     pod5 := makeCPUManagerPod("gu-pod5", ctnAttrs)
     pod5.Spec.Containers[0].Command = podCFSCheckCommand
     pod5 = e2epod.NewPodClient(f).CreateSync(ctx, pod5)
-    cleanupPods = append(cleanupPods, pod5)
+    podsToClean[string(pod5.UID)] = pod5
+
     ginkgo.By("checking if the expected cfs quota was assigned to pod (GU pod, unlimited)")

     expectedQuota = "150000"
@@ -741,6 +755,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ

     err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod5.Name, pod5.Spec.Containers[0].Name, expCFSQuotaRegex)
     framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", pod5.Spec.Containers[0].Name, pod5.Name)
+    deleteTestPod(pod5)

     ctnAttrs = []ctnAttribute{
         {
@@ -753,7 +768,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     pod6 := makeCPUManagerPod("gu-pod6", ctnAttrs)
     pod6.Spec.Containers[0].Command = podCFSCheckCommand
     pod6 = e2epod.NewPodClient(f).CreateSync(ctx, pod6)
-    cleanupPods = append(cleanupPods, pod6)
+    podsToClean[string(pod6.UID)] = pod6

     ginkgo.By("checking if the expected cfs quota was assigned to pod (GU pod, limited)")

@@ -761,7 +776,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
     expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
     err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod6.Name, pod6.Spec.Containers[0].Name, expCFSQuotaRegex)
     framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", pod6.Spec.Containers[0].Name, pod6.Name)
-
+    deleteTestPod(pod6)
 }

 func runMultipleGuPods(ctx context.Context, f *framework.Framework) {
diff --git a/test/e2e_node/topology_manager_test.go b/test/e2e_node/topology_manager_test.go
index bac00c5ee7b..7713772c6f2 100644
--- a/test/e2e_node/topology_manager_test.go
+++ b/test/e2e_node/topology_manager_test.go
@@ -463,14 +463,19 @@ func deletePodsAsync(ctx context.Context, f *framework.Framework, podMap map[str
         go func(podNS, podName string) {
             defer ginkgo.GinkgoRecover()
             defer wg.Done()
-
-            deletePodSyncByName(ctx, f, podName)
-            waitForAllContainerRemoval(ctx, podName, podNS)
+            deletePodSyncAndWait(ctx, f, podNS, podName)
         }(pod.Namespace, pod.Name)
     }
     wg.Wait()
 }

+func deletePodSyncAndWait(ctx context.Context, f *framework.Framework, podNS, podName string) {
+    framework.Logf("deleting pod: %s/%s", podNS, podName)
+    deletePodSyncByName(ctx, f, podName)
+    waitForAllContainerRemoval(ctx, podName, podNS)
+    framework.Logf("deleted pod: %s/%s", podNS, podName)
+}
+
 func runTopologyManagerNegativeTest(ctx context.Context, f *framework.Framework, ctnAttrs, initCtnAttrs []tmCtnAttribute, envInfo *testEnvInfo) {
     podName := "gu-pod"
     framework.Logf("creating pod %s attrs %v", podName, ctnAttrs)

From 323410664c4b64848ab0663797bc8728628ccf69 Mon Sep 17 00:00:00 2001
From: Francesco Romani
Date: Fri, 14 Feb 2025 19:37:44 +0100
Subject: [PATCH 2/2] e2e: node: cpumgr: check CPU allocatable for CFS quota test

Add an (admittedly pretty crude) CPU allocatable check. A more incisive
refactoring is needed, but we need to unbreak CI first, so this seems
like the minimal, decently clean change.
Signed-off-by: Francesco Romani
---
 test/e2e_node/cpu_manager_test.go | 157 +++++++++++++++++------------
 1 file changed, 87 insertions(+), 70 deletions(-)

diff --git a/test/e2e_node/cpu_manager_test.go b/test/e2e_node/cpu_manager_test.go
index 313729aa26d..c46e96166d3 100644
--- a/test/e2e_node/cpu_manager_test.go
+++ b/test/e2e_node/cpu_manager_test.go
@@ -592,12 +592,14 @@ func runMultipleCPUContainersGuPod(ctx context.Context, f *framework.Framework)
     waitForContainerRemoval(ctx, pod.Spec.Containers[1].Name, pod.Name, pod.Namespace)
 }

-func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQuotaWithExclusiveCPUs bool) {
+func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQuotaWithExclusiveCPUs bool, cpuAlloc int64) {
     var err error
     var ctnAttrs []ctnAttribute
     var pod1, pod2, pod3 *v1.Pod
     podsToClean := make(map[string]*v1.Pod) // pod.UID -> pod

+    framework.Logf("runCfsQuotaGuPods: disableQuota=%v, CPU Allocatable=%v", disabledCPUQuotaWithExclusiveCPUs, cpuAlloc)
+
     deleteTestPod := func(pod *v1.Pod) {
         // waitForContainerRemoval takes "long" to complete; if we use the parent ctx we get a
         // 'deadline expired' message and the cleanup aborts, which we don't want.
@@ -619,6 +621,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
         deletePodsAsync(ctx2, f, podsToClean)
     })

+    podCFSCheckCommand := []string{"sh", "-c", `cat $(find /sysfscgroup | grep "$(cat /podinfo/uid | sed 's/-/_/g').slice/cpu.max$") && sleep 1d`}
     cfsCheckCommand := []string{"sh", "-c", "cat /sys/fs/cgroup/cpu.max && sleep 1d"}
     defaultPeriod := "100000"

@@ -688,74 +691,76 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
         pod3.Spec.Containers[0].Name, pod3.Name)
     deleteTestPod(pod3)

-    ctnAttrs = []ctnAttribute{
-        {
-            ctnName:    "gu-container-non-int-values",
-            cpuRequest: "500m",
-            cpuLimit:   "500m",
-        },
-        {
-            ctnName:    "gu-container-int-values",
-            cpuRequest: "1",
-            cpuLimit:   "1",
-        },
+    if cpuAlloc >= 2 {
+        ctnAttrs = []ctnAttribute{
+            {
+                ctnName:    "gu-container-non-int-values",
+                cpuRequest: "500m",
+                cpuLimit:   "500m",
+            },
+            {
+                ctnName:    "gu-container-int-values",
+                cpuRequest: "1",
+                cpuLimit:   "1",
+            },
+        }
+        pod4 := makeCPUManagerPod("gu-pod4", ctnAttrs)
+        pod4.Spec.Containers[0].Command = cfsCheckCommand
+        pod4.Spec.Containers[1].Command = cfsCheckCommand
+        pod4 = e2epod.NewPodClient(f).CreateSync(ctx, pod4)
+        podsToClean[string(pod4.UID)] = pod4
+
+        ginkgo.By("checking if the expected cfs quota was assigned (GU pod, container 0 exclusive CPUs unlimited, container 1 limited)")
+
+        expectedQuota = "50000"
+        expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
+        err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod4.Name, pod4.Spec.Containers[0].Name, expCFSQuotaRegex)
+        framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
+            pod4.Spec.Containers[0].Name, pod4.Name)
+        expectedQuota = "100000"
+        if disabledCPUQuotaWithExclusiveCPUs {
+            expectedQuota = "max"
+        }
+        expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
+        err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod4.Name, pod4.Spec.Containers[1].Name, expCFSQuotaRegex)
+        framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
+            pod4.Spec.Containers[1].Name, pod4.Name)
+        deleteTestPod(pod4)
+
+        ctnAttrs = []ctnAttribute{
+            {
+                ctnName:    "gu-container-non-int-values",
+                cpuRequest: "500m",
+                cpuLimit:   "500m",
+            },
+            {
+                ctnName:    "gu-container-int-values",
"gu-container-int-values", + cpuRequest: "1", + cpuLimit: "1", + }, + } + + pod5 := makeCPUManagerPod("gu-pod5", ctnAttrs) + pod5.Spec.Containers[0].Command = podCFSCheckCommand + pod5 = e2epod.NewPodClient(f).CreateSync(ctx, pod5) + podsToClean[string(pod5.UID)] = pod5 + + ginkgo.By("checking if the expected cfs quota was assigned to pod (GU pod, unlimited)") + + expectedQuota = "150000" + + if disabledCPUQuotaWithExclusiveCPUs { + expectedQuota = "max" + } + + expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod) + + err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod5.Name, pod5.Spec.Containers[0].Name, expCFSQuotaRegex) + framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", pod5.Spec.Containers[0].Name, pod5.Name) + deleteTestPod(pod5) + } else { + ginkgo.By(fmt.Sprintf("some cases SKIPPED - requests at least %d allocatable cores, got %d", 2, cpuAlloc)) } - pod4 := makeCPUManagerPod("gu-pod4", ctnAttrs) - pod4.Spec.Containers[0].Command = cfsCheckCommand - pod4.Spec.Containers[1].Command = cfsCheckCommand - pod4 = e2epod.NewPodClient(f).CreateSync(ctx, pod4) - podsToClean[string(pod4.UID)] = pod4 - - ginkgo.By("checking if the expected cfs quota was assigned (GU pod, container 0 exclusive CPUs unlimited, container 1 limited)") - - expectedQuota = "50000" - expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod) - err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod4.Name, pod4.Spec.Containers[0].Name, expCFSQuotaRegex) - framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", - pod4.Spec.Containers[0].Name, pod4.Name) - expectedQuota = "100000" - if disabledCPUQuotaWithExclusiveCPUs { - expectedQuota = "max" - } - expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod) - err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod4.Name, pod4.Spec.Containers[1].Name, expCFSQuotaRegex) - framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", - pod4.Spec.Containers[1].Name, pod4.Name) - deleteTestPod(pod4) - - ctnAttrs = []ctnAttribute{ - { - ctnName: "gu-container-non-int-values", - cpuRequest: "500m", - cpuLimit: "500m", - }, - { - ctnName: "gu-container-int-values", - cpuRequest: "1", - cpuLimit: "1", - }, - } - - podCFSCheckCommand := []string{"sh", "-c", `cat $(find /sysfscgroup | grep "$(cat /podinfo/uid | sed 's/-/_/g').slice/cpu.max$") && sleep 1d`} - - pod5 := makeCPUManagerPod("gu-pod5", ctnAttrs) - pod5.Spec.Containers[0].Command = podCFSCheckCommand - pod5 = e2epod.NewPodClient(f).CreateSync(ctx, pod5) - podsToClean[string(pod5.UID)] = pod5 - - ginkgo.By("checking if the expected cfs quota was assigned to pod (GU pod, unlimited)") - - expectedQuota = "150000" - - if disabledCPUQuotaWithExclusiveCPUs { - expectedQuota = "max" - } - - expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod) - - err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod5.Name, pod5.Spec.Containers[0].Name, expCFSQuotaRegex) - framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", pod5.Spec.Containers[0].Name, pod5.Name) - deleteTestPod(pod5) ctnAttrs = []ctnAttribute{ { @@ -936,6 +941,10 @@ func runCPUManagerTests(f *framework.Framework) { if !IsCgroup2UnifiedMode() { e2eskipper.Skipf("Skipping since CgroupV2 not used") } + _, cpuAlloc, _ = getLocalNodeCPUDetails(ctx, f) + if cpuAlloc < 1 { // save expensive kubelet restart + e2eskipper.Skipf("Skipping since not enough allocatable CPU got %d 
required 1", cpuAlloc) + } newCfg := configureCPUManagerInKubelet(oldCfg, &cpuManagerKubeletArguments{ policyName: string(cpumanager.PolicyStatic), @@ -944,13 +953,19 @@ func runCPUManagerTests(f *framework.Framework) { }, ) updateKubeletConfig(ctx, f, newCfg, true) - runCfsQuotaGuPods(ctx, f, true) + + _, cpuAlloc, _ = getLocalNodeCPUDetails(ctx, f) // check again after we reserved 1 full CPU. Some tests require > 1 exclusive CPU + runCfsQuotaGuPods(ctx, f, true, cpuAlloc) }) ginkgo.It("should keep enforcing the CFS quota for containers with static CPUs assigned and feature gate disabled", func(ctx context.Context) { if !IsCgroup2UnifiedMode() { e2eskipper.Skipf("Skipping since CgroupV2 not used") } + _, cpuAlloc, _ = getLocalNodeCPUDetails(ctx, f) + if cpuAlloc < 1 { // save expensive kubelet restart + e2eskipper.Skipf("Skipping since not enough allocatable CPU got %d required 1", cpuAlloc) + } newCfg := configureCPUManagerInKubelet(oldCfg, &cpuManagerKubeletArguments{ policyName: string(cpumanager.PolicyStatic), @@ -960,7 +975,9 @@ func runCPUManagerTests(f *framework.Framework) { ) updateKubeletConfig(ctx, f, newCfg, true) - runCfsQuotaGuPods(ctx, f, false) + + _, cpuAlloc, _ = getLocalNodeCPUDetails(ctx, f) // check again after we reserved 1 full CPU. Some tests require > 1 exclusive CPU + runCfsQuotaGuPods(ctx, f, false, cpuAlloc) }) f.It("should not reuse CPUs of restartable init containers", feature.SidecarContainers, func(ctx context.Context) {