diff --git a/test/e2e_node/cpu_manager_test.go b/test/e2e_node/cpu_manager_test.go
index 00fa990269f..c46e96166d3 100644
--- a/test/e2e_node/cpu_manager_test.go
+++ b/test/e2e_node/cpu_manager_test.go
@@ -592,24 +592,36 @@ func runMultipleCPUContainersGuPod(ctx context.Context, f *framework.Framework)
 	waitForContainerRemoval(ctx, pod.Spec.Containers[1].Name, pod.Name, pod.Namespace)
 }
 
-func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQuotaWithExclusiveCPUs bool) {
+func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQuotaWithExclusiveCPUs bool, cpuAlloc int64) {
 	var err error
 	var ctnAttrs []ctnAttribute
 	var pod1, pod2, pod3 *v1.Pod
-	var cleanupPods []*v1.Pod
-	ginkgo.DeferCleanup(func() {
+	podsToClean := make(map[string]*v1.Pod) // pod.UID -> pod
+
+	framework.Logf("runCfsQuotaGuPods: disableQuota=%v, CPU Allocatable=%v", disabledCPUQuotaWithExclusiveCPUs, cpuAlloc)
+
+	deleteTestPod := func(pod *v1.Pod) {
 		// waitForContainerRemoval takes "long" to complete; if we use the parent ctx we get a
 		// 'deadline expired' message and the cleanup aborts, which we don't want.
-		ctx2 := context.TODO()
+		// So let's use a separate and more generous timeout (determined by trial and error)
+		ctx2, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
+		defer cancel()
+		deletePodSyncAndWait(ctx2, f, pod.Namespace, pod.Name)
+		delete(podsToClean, string(pod.UID))
+	}
+
+	// clean up leftovers on test failure. The happy path is covered by the `deleteTestPod` calls.
+	ginkgo.DeferCleanup(func() {
 		ginkgo.By("by deleting the pods and waiting for container removal")
-		for _, cleanupPod := range cleanupPods {
-			framework.Logf("deleting pod: %s/%s", cleanupPod.Namespace, cleanupPod.Name)
-			deletePodSyncByName(ctx2, f, cleanupPod.Name)
-			waitForContainerRemoval(ctx2, cleanupPod.Spec.Containers[0].Name, cleanupPod.Name, cleanupPod.Namespace)
-			framework.Logf("deleted pod: %s/%s", cleanupPod.Namespace, cleanupPod.Name)
-		}
+		// waitForContainerRemoval takes "long" to complete; if we use the parent ctx we get a
+		// 'deadline expired' message and the cleanup aborts, which we don't want.
+		// So let's use a separate and more generous timeout (determined by trial and error)
+		ctx2, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
+		defer cancel()
+		deletePodsAsync(ctx2, f, podsToClean)
 	})
 
+	podCFSCheckCommand := []string{"sh", "-c", `cat $(find /sysfscgroup | grep "$(cat /podinfo/uid | sed 's/-/_/g').slice/cpu.max$") && sleep 1d`}
 	cfsCheckCommand := []string{"sh", "-c", "cat /sys/fs/cgroup/cpu.max && sleep 1d"}
 	defaultPeriod := "100000"
@@ -623,7 +635,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
 	pod1 = makeCPUManagerPod("gu-pod1", ctnAttrs)
 	pod1.Spec.Containers[0].Command = cfsCheckCommand
 	pod1 = e2epod.NewPodClient(f).CreateSync(ctx, pod1)
-	cleanupPods = append(cleanupPods, pod1)
+	podsToClean[string(pod1.UID)] = pod1
 
 	ginkgo.By("checking if the expected cfs quota was assigned (GU pod, exclusive CPUs, unlimited)")
 
@@ -635,6 +647,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
 	err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod1.Name, pod1.Spec.Containers[0].Name, expCFSQuotaRegex)
 	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", pod1.Spec.Containers[0].Name, pod1.Name)
+	deleteTestPod(pod1)
 
 	ctnAttrs = []ctnAttribute{
 		{
@@ -646,7 +659,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
 	pod2 = makeCPUManagerPod("gu-pod2", ctnAttrs)
 	pod2.Spec.Containers[0].Command = cfsCheckCommand
 	pod2 = e2epod.NewPodClient(f).CreateSync(ctx, pod2)
-	cleanupPods = append(cleanupPods, pod2)
+	podsToClean[string(pod2.UID)] = pod2
 
 	ginkgo.By("checking if the expected cfs quota was assigned (GU pod, limited)")
 
@@ -655,6 +668,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
 	err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod2.Name, pod2.Spec.Containers[0].Name, expCFSQuotaRegex)
 	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", pod2.Spec.Containers[0].Name, pod2.Name)
+	deleteTestPod(pod2)
 
 	ctnAttrs = []ctnAttribute{
 		{
@@ -666,7 +680,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
 	pod3 = makeCPUManagerPod("non-gu-pod3", ctnAttrs)
 	pod3.Spec.Containers[0].Command = cfsCheckCommand
 	pod3 = e2epod.NewPodClient(f).CreateSync(ctx, pod3)
-	cleanupPods = append(cleanupPods, pod3)
+	podsToClean[string(pod3.UID)] = pod3
 
 	ginkgo.By("checking if the expected cfs quota was assigned (BU pod, limited)")
 
@@ -675,72 +689,78 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
 	err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod3.Name, pod3.Spec.Containers[0].Name, expCFSQuotaRegex)
 	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", pod3.Spec.Containers[0].Name, pod3.Name)
+	deleteTestPod(pod3)
 
-	ctnAttrs = []ctnAttribute{
-		{
-			ctnName:    "gu-container-non-int-values",
-			cpuRequest: "500m",
-			cpuLimit:   "500m",
-		},
-		{
-			ctnName:    "gu-container-int-values",
-			cpuRequest: "1",
-			cpuLimit:   "1",
-		},
+	if cpuAlloc >= 2 {
+		ctnAttrs = []ctnAttribute{
+			{
+				ctnName:    "gu-container-non-int-values",
+				cpuRequest: "500m",
+				cpuLimit:   "500m",
+			},
+			{
+				ctnName:    "gu-container-int-values",
+				cpuRequest: "1",
+				cpuLimit:   "1",
+			},
+		}
+		pod4 := makeCPUManagerPod("gu-pod4", ctnAttrs)
+		pod4.Spec.Containers[0].Command = cfsCheckCommand
+		pod4.Spec.Containers[1].Command = cfsCheckCommand
+		pod4 = e2epod.NewPodClient(f).CreateSync(ctx, pod4)
+		podsToClean[string(pod4.UID)] = pod4
+
+		ginkgo.By("checking if the expected cfs quota was assigned (GU pod, container 0 exclusive CPUs unlimited, container 1 limited)")
+
+		expectedQuota = "50000"
+		expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
+		err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod4.Name, pod4.Spec.Containers[0].Name, expCFSQuotaRegex)
+		framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
+			pod4.Spec.Containers[0].Name, pod4.Name)
+		expectedQuota = "100000"
+		if disabledCPUQuotaWithExclusiveCPUs {
+			expectedQuota = "max"
+		}
+		expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
+		err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod4.Name, pod4.Spec.Containers[1].Name, expCFSQuotaRegex)
+		framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
+			pod4.Spec.Containers[1].Name, pod4.Name)
+		deleteTestPod(pod4)
+
+		ctnAttrs = []ctnAttribute{
+			{
+				ctnName:    "gu-container-non-int-values",
+				cpuRequest: "500m",
+				cpuLimit:   "500m",
+			},
+			{
+				ctnName:    "gu-container-int-values",
+				cpuRequest: "1",
+				cpuLimit:   "1",
+			},
+		}
+
+		pod5 := makeCPUManagerPod("gu-pod5", ctnAttrs)
+		pod5.Spec.Containers[0].Command = podCFSCheckCommand
+		pod5 = e2epod.NewPodClient(f).CreateSync(ctx, pod5)
+		podsToClean[string(pod5.UID)] = pod5
+
+		ginkgo.By("checking if the expected cfs quota was assigned to pod (GU pod, unlimited)")
+
+		expectedQuota = "150000"
+
+		if disabledCPUQuotaWithExclusiveCPUs {
+			expectedQuota = "max"
+		}
+
+		expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
+
+		err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod5.Name, pod5.Spec.Containers[0].Name, expCFSQuotaRegex)
+		framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", pod5.Spec.Containers[0].Name, pod5.Name)
+		deleteTestPod(pod5)
+	} else {
+		ginkgo.By(fmt.Sprintf("some cases SKIPPED - requires at least %d allocatable cores, got %d", 2, cpuAlloc))
 	}
-	pod4 := makeCPUManagerPod("gu-pod4", ctnAttrs)
-	pod4.Spec.Containers[0].Command = cfsCheckCommand
-	pod4.Spec.Containers[1].Command = cfsCheckCommand
-	pod4 = e2epod.NewPodClient(f).CreateSync(ctx, pod4)
-	cleanupPods = append(cleanupPods, pod4)
-
-	ginkgo.By("checking if the expected cfs quota was assigned (GU pod, container 0 exclusive CPUs unlimited, container 1 limited)")
-
-	expectedQuota = "50000"
-	expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
-	err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod4.Name, pod4.Spec.Containers[0].Name, expCFSQuotaRegex)
-	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
-		pod4.Spec.Containers[0].Name, pod4.Name)
-	expectedQuota = "100000"
-	if disabledCPUQuotaWithExclusiveCPUs {
-		expectedQuota = "max"
-	}
-	expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
-	err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod4.Name, pod4.Spec.Containers[1].Name, expCFSQuotaRegex)
-	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
-		pod4.Spec.Containers[1].Name, pod4.Name)
-
-	ctnAttrs = []ctnAttribute{
-		{
-			ctnName:    "gu-container-non-int-values",
-			cpuRequest: "500m",
-			cpuLimit:   "500m",
-		},
-		{
-			ctnName:    "gu-container-int-values",
-			cpuRequest: "1",
-			cpuLimit:   "1",
-		},
-	}
-
-	podCFSCheckCommand := []string{"sh", "-c", `cat $(find /sysfscgroup | grep "$(cat /podinfo/uid | sed 's/-/_/g').slice/cpu.max$") && sleep 1d`}
-
-	pod5 := makeCPUManagerPod("gu-pod5", ctnAttrs)
-	pod5.Spec.Containers[0].Command = podCFSCheckCommand
-	pod5 = e2epod.NewPodClient(f).CreateSync(ctx, pod5)
-	cleanupPods = append(cleanupPods, pod5)
-	ginkgo.By("checking if the expected cfs quota was assigned to pod (GU pod, unlimited)")
-
-	expectedQuota = "150000"
-
-	if disabledCPUQuotaWithExclusiveCPUs {
-		expectedQuota = "max"
-	}
-
-	expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
-
-	err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod5.Name, pod5.Spec.Containers[0].Name, expCFSQuotaRegex)
-	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", pod5.Spec.Containers[0].Name, pod5.Name)
 
 	ctnAttrs = []ctnAttribute{
 		{
@@ -753,7 +773,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
 	pod6 := makeCPUManagerPod("gu-pod6", ctnAttrs)
 	pod6.Spec.Containers[0].Command = podCFSCheckCommand
 	pod6 = e2epod.NewPodClient(f).CreateSync(ctx, pod6)
-	cleanupPods = append(cleanupPods, pod6)
+	podsToClean[string(pod6.UID)] = pod6
 
 	ginkgo.By("checking if the expected cfs quota was assigned to pod (GU pod, limited)")
 
@@ -761,7 +781,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
 	expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
 	err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod6.Name, pod6.Spec.Containers[0].Name, expCFSQuotaRegex)
 	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", pod6.Spec.Containers[0].Name, pod6.Name)
-
+	deleteTestPod(pod6)
 }
 
 func runMultipleGuPods(ctx context.Context, f *framework.Framework) {
@@ -921,6 +941,10 @@ func runCPUManagerTests(f *framework.Framework) {
 		if !IsCgroup2UnifiedMode() {
 			e2eskipper.Skipf("Skipping since CgroupV2 not used")
 		}
+		_, cpuAlloc, _ = getLocalNodeCPUDetails(ctx, f)
+		if cpuAlloc < 1 { // save expensive kubelet restart
+			e2eskipper.Skipf("Skipping since not enough allocatable CPU: got %d, required 1", cpuAlloc)
+		}
 
 		newCfg := configureCPUManagerInKubelet(oldCfg, &cpuManagerKubeletArguments{
 			policyName: string(cpumanager.PolicyStatic),
@@ -929,13 +953,19 @@ func runCPUManagerTests(f *framework.Framework) {
 			},
 		)
 		updateKubeletConfig(ctx, f, newCfg, true)
-		runCfsQuotaGuPods(ctx, f, true)
+
+		_, cpuAlloc, _ = getLocalNodeCPUDetails(ctx, f) // check again after we reserved 1 full CPU. Some tests require > 1 exclusive CPU
+		runCfsQuotaGuPods(ctx, f, true, cpuAlloc)
 	})
 
 	ginkgo.It("should keep enforcing the CFS quota for containers with static CPUs assigned and feature gate disabled", func(ctx context.Context) {
 		if !IsCgroup2UnifiedMode() {
 			e2eskipper.Skipf("Skipping since CgroupV2 not used")
 		}
+		_, cpuAlloc, _ = getLocalNodeCPUDetails(ctx, f)
+		if cpuAlloc < 1 { // save expensive kubelet restart
+			e2eskipper.Skipf("Skipping since not enough allocatable CPU: got %d, required 1", cpuAlloc)
+		}
 
 		newCfg := configureCPUManagerInKubelet(oldCfg, &cpuManagerKubeletArguments{
 			policyName: string(cpumanager.PolicyStatic),
@@ -945,7 +975,9 @@ func runCPUManagerTests(f *framework.Framework) {
 			},
 		)
 		updateKubeletConfig(ctx, f, newCfg, true)
-		runCfsQuotaGuPods(ctx, f, false)
+
+		_, cpuAlloc, _ = getLocalNodeCPUDetails(ctx, f) // check again after we reserved 1 full CPU. Some tests require > 1 exclusive CPU
+		runCfsQuotaGuPods(ctx, f, false, cpuAlloc)
 	})
 
 	f.It("should not reuse CPUs of restartable init containers", feature.SidecarContainers, func(ctx context.Context) {
diff --git a/test/e2e_node/topology_manager_test.go b/test/e2e_node/topology_manager_test.go
index bac00c5ee7b..7713772c6f2 100644
--- a/test/e2e_node/topology_manager_test.go
+++ b/test/e2e_node/topology_manager_test.go
@@ -463,14 +463,19 @@ func deletePodsAsync(ctx context.Context, f *framework.Framework, podMap map[str
 		go func(podNS, podName string) {
 			defer ginkgo.GinkgoRecover()
 			defer wg.Done()
-
-			deletePodSyncByName(ctx, f, podName)
-			waitForAllContainerRemoval(ctx, podName, podNS)
+			deletePodSyncAndWait(ctx, f, podNS, podName)
 		}(pod.Namespace, pod.Name)
 	}
 	wg.Wait()
 }
 
+func deletePodSyncAndWait(ctx context.Context, f *framework.Framework, podNS, podName string) {
+	framework.Logf("deleting pod: %s/%s", podNS, podName)
+	deletePodSyncByName(ctx, f, podName)
+	waitForAllContainerRemoval(ctx, podName, podNS)
+	framework.Logf("deleted pod: %s/%s", podNS, podName)
+}
+
 func runTopologyManagerNegativeTest(ctx context.Context, f *framework.Framework, ctnAttrs, initCtnAttrs []tmCtnAttribute, envInfo *testEnvInfo) {
 	podName := "gu-pod"
 	framework.Logf("creating pod %s attrs %v", podName, ctnAttrs)
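
Note on the cleanup pattern in this patch: both `deleteTestPod` and the `DeferCleanup` body derive their context from `context.Background()` rather than from the test's own `ctx`, so pod removal is not cut short when the test deadline expires. A minimal, self-contained sketch of why this matters (not part of the patch; `slowCleanup` is a hypothetical stand-in for `deletePodSyncAndWait`, and the durations are purely illustrative):

package main

import (
	"context"
	"fmt"
	"time"
)

// slowCleanup models cleanup work that can outlive the parent (test)
// context's deadline, like waiting for container removal.
func slowCleanup(ctx context.Context) error {
	select {
	case <-time.After(2 * time.Second): // pretend removal takes a while
		return nil
	case <-ctx.Done():
		return ctx.Err() // aborted by the context's deadline
	}
}

func main() {
	// Parent context with a short deadline, standing in for the test ctx.
	parent, cancel := context.WithTimeout(context.Background(), 1*time.Second)
	defer cancel()

	// Tied to the parent: cleanup aborts once the parent deadline expires.
	fmt.Println("parent ctx:", slowCleanup(parent))

	// Detached context with its own generous timeout, as the patch does with
	// context.WithTimeout(context.Background(), 10*time.Minute): cleanup no
	// longer races the parent's deadline.
	ctx2, cancel2 := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel2()
	fmt.Println("detached ctx:", slowCleanup(ctx2))
}

Run as-is, the first call prints "context deadline exceeded" while the second prints "<nil>", which is exactly the aborted-cleanup failure mode the comments in the patch guard against.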