From cc87438f2fa7fbd16f7e845bd357b49618412570 Mon Sep 17 00:00:00 2001
From: Francesco Romani
Date: Tue, 10 Sep 2024 14:29:38 +0200
Subject: [PATCH] e2e_node: add a test to verify the kubelet starts with
 systemd cgroup driver and cpumanager none policy.

This was originally planned to be a correctness check for
https://issues.k8s.io/125923, but it was difficult to reproduce the bug,
so it's now a regression test against it.

Signed-off-by: Francesco Romani
Signed-off-by: Peter Hunt
---
 test/e2e_node/node_container_manager_test.go | 76 ++++++++++++++++++++
 1 file changed, 76 insertions(+)

diff --git a/test/e2e_node/node_container_manager_test.go b/test/e2e_node/node_container_manager_test.go
index 93c048d2e53..ce66ac65c48 100644
--- a/test/e2e_node/node_container_manager_test.go
+++ b/test/e2e_node/node_container_manager_test.go
@@ -76,6 +76,82 @@ var _ = SIGDescribe("Node Container Manager", framework.WithSerial(), func() {
 			framework.ExpectNoError(runTest(ctx, f))
 		})
 	})
+	f.Describe("Validate CGroup management", func() {
+		// Regression test for https://issues.k8s.io/125923
+		// In this issue there's a race involving systemd which seems to manifest most likely, or perhaps only
+		// (the data gathered so far is inconclusive), on the very first boot of the machine, so restarting the kubelet
+		// does not seem sufficient. OTOH, the exact reproducer seems to require a dedicated lane with only this test,
+		// or rebooting the machine before running this test. Both are impractical in CI.
+		// The closest approximation is this test in its current form, using a kubelet restart. It at least
+		// acts as a regression test, so it still brings value.
+		ginkgo.It("should correctly start with cpumanager none policy in use with systemd", func(ctx context.Context) {
+			if !IsCgroup2UnifiedMode() {
+				ginkgo.Skip("this test requires cgroups v2")
+			}
+
+			var err error
+			var oldCfg *kubeletconfig.KubeletConfiguration
+			// Get current kubelet configuration
+			oldCfg, err = getCurrentKubeletConfig(ctx)
+			framework.ExpectNoError(err)
+
+			ginkgo.DeferCleanup(func(ctx context.Context) {
+				if oldCfg != nil {
+					// Restore the original Kubelet configuration.
+					ginkgo.By("Stopping the kubelet")
+					startKubelet := stopKubelet()
+
+					// wait until the kubelet health check fails
+					gomega.Eventually(ctx, func() bool {
+						return kubeletHealthCheck(kubeletHealthCheckURL)
+					}).WithTimeout(time.Minute).WithPolling(time.Second).Should(gomega.BeFalseBecause("expected kubelet health check to be failed"))
+					ginkgo.By("Stopped the kubelet")
+
+					framework.ExpectNoError(e2enodekubelet.WriteKubeletConfigFile(oldCfg))
+
+					ginkgo.By("Starting the kubelet")
+					startKubelet()
+
+					// wait until the kubelet health check succeeds
+					gomega.Eventually(ctx, func(ctx context.Context) bool {
+						return kubeletHealthCheck(kubeletHealthCheckURL)
+					}).WithTimeout(2 * time.Minute).WithPolling(5 * time.Second).Should(gomega.BeTrueBecause("expected kubelet to be in healthy state"))
+					ginkgo.By("Started the kubelet")
+				}
+			})
+
+			newCfg := oldCfg.DeepCopy()
+			// Change existing kubelet configuration
+			newCfg.CPUManagerPolicy = "none"
+			newCfg.CgroupDriver = "systemd"
+			newCfg.FailCgroupV1 = true // extra safety; we want to avoid false negatives, hence the cgroup v2 skip check above
+
+			// Update the Kubelet configuration.
+			ginkgo.By("Stopping the kubelet")
+			startKubelet := stopKubelet()
+
+			// wait until the kubelet health check fails
+			gomega.Eventually(ctx, func() bool {
+				return kubeletHealthCheck(kubeletHealthCheckURL)
+			}).WithTimeout(time.Minute).WithPolling(time.Second).Should(gomega.BeFalseBecause("expected kubelet health check to be failed"))
+			ginkgo.By("Stopped the kubelet")
+
+			framework.ExpectNoError(e2enodekubelet.WriteKubeletConfigFile(newCfg))
+
+			ginkgo.By("Starting the kubelet")
+			startKubelet()
+
+			// wait until the kubelet health check succeeds
+			gomega.Eventually(ctx, func() bool {
+				return getNodeReadyStatus(ctx, f) && kubeletHealthCheck(kubeletHealthCheckURL)
+			}).WithTimeout(2 * time.Minute).WithPolling(5 * time.Second).Should(gomega.BeTrueBecause("expected kubelet to be in healthy state"))
+			ginkgo.By("Started the kubelet")
+
+			gomega.Consistently(ctx, func(ctx context.Context) bool {
+				return getNodeReadyStatus(ctx, f) && kubeletHealthCheck(kubeletHealthCheckURL)
+			}).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(gomega.BeTrueBecause("node keeps reporting ready status"))
+		})
+	})
 })
 
 func expectFileValToEqual(filePath string, expectedValue, delta int64) error {
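Editor's note, not part of the patch: the test boils down to restarting the kubelet with three configuration changes and asserting the node stays Ready. Below is a minimal sketch of that configuration delta in isolation, assuming the same internal kubeletconfig import the e2e_node package uses; the standalone program and its output are illustrative only and would compile from within the kubernetes repo tree, not as an independent module.

// Sketch only: the KubeletConfiguration delta exercised by the test, shown in
// isolation. The import path presumably matches what test/e2e_node already
// uses; this snippet is an illustration, not part of the patch.
package main

import (
	"fmt"

	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
)

func main() {
	cfg := &kubeletconfig.KubeletConfiguration{}
	cfg.CPUManagerPolicy = "none"  // the default CPU manager policy, as in the bug report: no exclusive CPU allocation is involved
	cfg.CgroupDriver = "systemd"   // the cgroup driver involved in the race described in the commit message
	cfg.FailCgroupV1 = true        // refuse to start on cgroup v1 hosts, mirroring the test's extra-safety setting
	fmt.Printf("cpuManagerPolicy=%q cgroupDriver=%q failCgroupV1=%v\n",
		cfg.CPUManagerPolicy, cfg.CgroupDriver, cfg.FailCgroupV1)
}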