Merge pull request #130163 from ffromani/e2e-node-fix-cpu-quota-test

e2e: node: cpumgr: cleanup after each test case
This commit is contained in:
Kubernetes Prow Robot 2025-02-25 05:08:29 -08:00 committed by GitHub
commit 27cbe54b09
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 121 additions and 84 deletions

View File

@ -592,24 +592,36 @@ func runMultipleCPUContainersGuPod(ctx context.Context, f *framework.Framework)
waitForContainerRemoval(ctx, pod.Spec.Containers[1].Name, pod.Name, pod.Namespace)
}
func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQuotaWithExclusiveCPUs bool) {
func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQuotaWithExclusiveCPUs bool, cpuAlloc int64) {
var err error
var ctnAttrs []ctnAttribute
var pod1, pod2, pod3 *v1.Pod
var cleanupPods []*v1.Pod
ginkgo.DeferCleanup(func() {
podsToClean := make(map[string]*v1.Pod) // pod.UID -> pod
framework.Logf("runCfsQuotaGuPods: disableQuota=%v, CPU Allocatable=%v", disabledCPUQuotaWithExclusiveCPUs, cpuAlloc)
deleteTestPod := func(pod *v1.Pod) {
// waitForContainerRemoval takes "long" to complete; if we use the parent ctx we get a
// 'deadline expired' message and the cleanup aborts, which we don't want.
ctx2 := context.TODO()
// So let's use a separate and more generous timeout (determined by trial and error)
ctx2, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
defer cancel()
deletePodSyncAndWait(ctx2, f, pod.Namespace, pod.Name)
delete(podsToClean, string(pod.UID))
}
// cleanup leftovers on test failure. The happy path is covered by `deleteTestPod` calls
ginkgo.DeferCleanup(func() {
ginkgo.By("by deleting the pods and waiting for container removal")
for _, cleanupPod := range cleanupPods {
framework.Logf("deleting pod: %s/%s", cleanupPod.Namespace, cleanupPod.Name)
deletePodSyncByName(ctx2, f, cleanupPod.Name)
waitForContainerRemoval(ctx2, cleanupPod.Spec.Containers[0].Name, cleanupPod.Name, cleanupPod.Namespace)
framework.Logf("deleted pod: %s/%s", cleanupPod.Namespace, cleanupPod.Name)
}
// waitForContainerRemoval takes "long" to complete; if we use the parent ctx we get a
// 'deadline expired' message and the cleanup aborts, which we don't want.
// So let's use a separate and more generous timeout (determined by trial and error)
ctx2, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
defer cancel()
deletePodsAsync(ctx2, f, podsToClean)
})
podCFSCheckCommand := []string{"sh", "-c", `cat $(find /sysfscgroup | grep "$(cat /podinfo/uid | sed 's/-/_/g').slice/cpu.max$") && sleep 1d`}
cfsCheckCommand := []string{"sh", "-c", "cat /sys/fs/cgroup/cpu.max && sleep 1d"}
defaultPeriod := "100000"
@ -623,7 +635,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
pod1 = makeCPUManagerPod("gu-pod1", ctnAttrs)
pod1.Spec.Containers[0].Command = cfsCheckCommand
pod1 = e2epod.NewPodClient(f).CreateSync(ctx, pod1)
cleanupPods = append(cleanupPods, pod1)
podsToClean[string(pod1.UID)] = pod1
ginkgo.By("checking if the expected cfs quota was assigned (GU pod, exclusive CPUs, unlimited)")
@ -635,6 +647,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod1.Name, pod1.Spec.Containers[0].Name, expCFSQuotaRegex)
framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
pod1.Spec.Containers[0].Name, pod1.Name)
deleteTestPod(pod1)
ctnAttrs = []ctnAttribute{
{
@ -646,7 +659,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
pod2 = makeCPUManagerPod("gu-pod2", ctnAttrs)
pod2.Spec.Containers[0].Command = cfsCheckCommand
pod2 = e2epod.NewPodClient(f).CreateSync(ctx, pod2)
cleanupPods = append(cleanupPods, pod2)
podsToClean[string(pod2.UID)] = pod2
ginkgo.By("checking if the expected cfs quota was assigned (GU pod, limited)")
@ -655,6 +668,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod2.Name, pod2.Spec.Containers[0].Name, expCFSQuotaRegex)
framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
pod2.Spec.Containers[0].Name, pod2.Name)
deleteTestPod(pod2)
ctnAttrs = []ctnAttribute{
{
@ -666,7 +680,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
pod3 = makeCPUManagerPod("non-gu-pod3", ctnAttrs)
pod3.Spec.Containers[0].Command = cfsCheckCommand
pod3 = e2epod.NewPodClient(f).CreateSync(ctx, pod3)
cleanupPods = append(cleanupPods, pod3)
podsToClean[string(pod3.UID)] = pod3
ginkgo.By("checking if the expected cfs quota was assigned (BU pod, limited)")
@ -675,72 +689,78 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod3.Name, pod3.Spec.Containers[0].Name, expCFSQuotaRegex)
framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
pod3.Spec.Containers[0].Name, pod3.Name)
deleteTestPod(pod3)
ctnAttrs = []ctnAttribute{
{
ctnName: "gu-container-non-int-values",
cpuRequest: "500m",
cpuLimit: "500m",
},
{
ctnName: "gu-container-int-values",
cpuRequest: "1",
cpuLimit: "1",
},
if cpuAlloc >= 2 {
ctnAttrs = []ctnAttribute{
{
ctnName: "gu-container-non-int-values",
cpuRequest: "500m",
cpuLimit: "500m",
},
{
ctnName: "gu-container-int-values",
cpuRequest: "1",
cpuLimit: "1",
},
}
pod4 := makeCPUManagerPod("gu-pod4", ctnAttrs)
pod4.Spec.Containers[0].Command = cfsCheckCommand
pod4.Spec.Containers[1].Command = cfsCheckCommand
pod4 = e2epod.NewPodClient(f).CreateSync(ctx, pod4)
podsToClean[string(pod4.UID)] = pod4
ginkgo.By("checking if the expected cfs quota was assigned (GU pod, container 0 exclusive CPUs unlimited, container 1 limited)")
expectedQuota = "50000"
expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod4.Name, pod4.Spec.Containers[0].Name, expCFSQuotaRegex)
framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
pod4.Spec.Containers[0].Name, pod4.Name)
expectedQuota = "100000"
if disabledCPUQuotaWithExclusiveCPUs {
expectedQuota = "max"
}
expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod4.Name, pod4.Spec.Containers[1].Name, expCFSQuotaRegex)
framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
pod4.Spec.Containers[1].Name, pod4.Name)
deleteTestPod(pod4)
ctnAttrs = []ctnAttribute{
{
ctnName: "gu-container-non-int-values",
cpuRequest: "500m",
cpuLimit: "500m",
},
{
ctnName: "gu-container-int-values",
cpuRequest: "1",
cpuLimit: "1",
},
}
pod5 := makeCPUManagerPod("gu-pod5", ctnAttrs)
pod5.Spec.Containers[0].Command = podCFSCheckCommand
pod5 = e2epod.NewPodClient(f).CreateSync(ctx, pod5)
podsToClean[string(pod5.UID)] = pod5
ginkgo.By("checking if the expected cfs quota was assigned to pod (GU pod, unlimited)")
expectedQuota = "150000"
if disabledCPUQuotaWithExclusiveCPUs {
expectedQuota = "max"
}
expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod5.Name, pod5.Spec.Containers[0].Name, expCFSQuotaRegex)
framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", pod5.Spec.Containers[0].Name, pod5.Name)
deleteTestPod(pod5)
} else {
ginkgo.By(fmt.Sprintf("some cases SKIPPED - requests at least %d allocatable cores, got %d", 2, cpuAlloc))
}
pod4 := makeCPUManagerPod("gu-pod4", ctnAttrs)
pod4.Spec.Containers[0].Command = cfsCheckCommand
pod4.Spec.Containers[1].Command = cfsCheckCommand
pod4 = e2epod.NewPodClient(f).CreateSync(ctx, pod4)
cleanupPods = append(cleanupPods, pod4)
ginkgo.By("checking if the expected cfs quota was assigned (GU pod, container 0 exclusive CPUs unlimited, container 1 limited)")
expectedQuota = "50000"
expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod4.Name, pod4.Spec.Containers[0].Name, expCFSQuotaRegex)
framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
pod4.Spec.Containers[0].Name, pod4.Name)
expectedQuota = "100000"
if disabledCPUQuotaWithExclusiveCPUs {
expectedQuota = "max"
}
expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod4.Name, pod4.Spec.Containers[1].Name, expCFSQuotaRegex)
framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
pod4.Spec.Containers[1].Name, pod4.Name)
ctnAttrs = []ctnAttribute{
{
ctnName: "gu-container-non-int-values",
cpuRequest: "500m",
cpuLimit: "500m",
},
{
ctnName: "gu-container-int-values",
cpuRequest: "1",
cpuLimit: "1",
},
}
podCFSCheckCommand := []string{"sh", "-c", `cat $(find /sysfscgroup | grep "$(cat /podinfo/uid | sed 's/-/_/g').slice/cpu.max$") && sleep 1d`}
pod5 := makeCPUManagerPod("gu-pod5", ctnAttrs)
pod5.Spec.Containers[0].Command = podCFSCheckCommand
pod5 = e2epod.NewPodClient(f).CreateSync(ctx, pod5)
cleanupPods = append(cleanupPods, pod5)
ginkgo.By("checking if the expected cfs quota was assigned to pod (GU pod, unlimited)")
expectedQuota = "150000"
if disabledCPUQuotaWithExclusiveCPUs {
expectedQuota = "max"
}
expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod5.Name, pod5.Spec.Containers[0].Name, expCFSQuotaRegex)
framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", pod5.Spec.Containers[0].Name, pod5.Name)
ctnAttrs = []ctnAttribute{
{
@ -753,7 +773,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
pod6 := makeCPUManagerPod("gu-pod6", ctnAttrs)
pod6.Spec.Containers[0].Command = podCFSCheckCommand
pod6 = e2epod.NewPodClient(f).CreateSync(ctx, pod6)
cleanupPods = append(cleanupPods, pod6)
podsToClean[string(pod6.UID)] = pod6
ginkgo.By("checking if the expected cfs quota was assigned to pod (GU pod, limited)")
@ -761,7 +781,7 @@ func runCfsQuotaGuPods(ctx context.Context, f *framework.Framework, disabledCPUQ
expCFSQuotaRegex = fmt.Sprintf("^%s %s\n$", expectedQuota, defaultPeriod)
err = e2epod.NewPodClient(f).MatchContainerOutput(ctx, pod6.Name, pod6.Spec.Containers[0].Name, expCFSQuotaRegex)
framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", pod6.Spec.Containers[0].Name, pod6.Name)
deleteTestPod(pod6)
}
func runMultipleGuPods(ctx context.Context, f *framework.Framework) {
@ -921,6 +941,10 @@ func runCPUManagerTests(f *framework.Framework) {
if !IsCgroup2UnifiedMode() {
e2eskipper.Skipf("Skipping since CgroupV2 not used")
}
_, cpuAlloc, _ = getLocalNodeCPUDetails(ctx, f)
if cpuAlloc < 1 { // save expensive kubelet restart
e2eskipper.Skipf("Skipping since not enough allocatable CPU got %d required 1", cpuAlloc)
}
newCfg := configureCPUManagerInKubelet(oldCfg,
&cpuManagerKubeletArguments{
policyName: string(cpumanager.PolicyStatic),
@ -929,13 +953,19 @@ func runCPUManagerTests(f *framework.Framework) {
},
)
updateKubeletConfig(ctx, f, newCfg, true)
runCfsQuotaGuPods(ctx, f, true)
_, cpuAlloc, _ = getLocalNodeCPUDetails(ctx, f) // check again after we reserved 1 full CPU. Some tests require > 1 exclusive CPU
runCfsQuotaGuPods(ctx, f, true, cpuAlloc)
})
ginkgo.It("should keep enforcing the CFS quota for containers with static CPUs assigned and feature gate disabled", func(ctx context.Context) {
if !IsCgroup2UnifiedMode() {
e2eskipper.Skipf("Skipping since CgroupV2 not used")
}
_, cpuAlloc, _ = getLocalNodeCPUDetails(ctx, f)
if cpuAlloc < 1 { // save expensive kubelet restart
e2eskipper.Skipf("Skipping since not enough allocatable CPU got %d required 1", cpuAlloc)
}
newCfg := configureCPUManagerInKubelet(oldCfg,
&cpuManagerKubeletArguments{
policyName: string(cpumanager.PolicyStatic),
@ -945,7 +975,9 @@ func runCPUManagerTests(f *framework.Framework) {
)
updateKubeletConfig(ctx, f, newCfg, true)
runCfsQuotaGuPods(ctx, f, false)
_, cpuAlloc, _ = getLocalNodeCPUDetails(ctx, f) // check again after we reserved 1 full CPU. Some tests require > 1 exclusive CPU
runCfsQuotaGuPods(ctx, f, false, cpuAlloc)
})
f.It("should not reuse CPUs of restartable init containers", feature.SidecarContainers, func(ctx context.Context) {

View File

@ -463,14 +463,19 @@ func deletePodsAsync(ctx context.Context, f *framework.Framework, podMap map[str
go func(podNS, podName string) {
defer ginkgo.GinkgoRecover()
defer wg.Done()
deletePodSyncByName(ctx, f, podName)
waitForAllContainerRemoval(ctx, podName, podNS)
deletePodSyncAndWait(ctx, f, podNS, podName)
}(pod.Namespace, pod.Name)
}
wg.Wait()
}
func deletePodSyncAndWait(ctx context.Context, f *framework.Framework, podNS, podName string) {
framework.Logf("deleting pod: %s/%s", podNS, podName)
deletePodSyncByName(ctx, f, podName)
waitForAllContainerRemoval(ctx, podName, podNS)
framework.Logf("deleted pod: %s/%s", podNS, podName)
}
func runTopologyManagerNegativeTest(ctx context.Context, f *framework.Framework, ctnAttrs, initCtnAttrs []tmCtnAttribute, envInfo *testEnvInfo) {
podName := "gu-pod"
framework.Logf("creating pod %s attrs %v", podName, ctnAttrs)