e2e_node: add a test to verify the kubelet starts

with systemd cgroup driver and cpumanager none policy.

This was originally planned to be a correctness check for
https://issues.k8s.io/125923, but it was difficult to reproduce the bug,
so it's now a regression test against it.

Signed-off-by: Francesco Romani <fromani@redhat.com>
Signed-off-by: Peter Hunt <pehunt@redhat.com>

@@ -76,6 +76,82 @@ var _ = SIGDescribe("Node Container Manager", framework.WithSerial(), func() {
			framework.ExpectNoError(runTest(ctx, f))
		})
	})
f.Describe("Validate CGroup management", func() {
// Regression test for https://issues.k8s.io/125923
// In this issue there's a race involved with systemd which seems to manifest most likely, or perhaps only
// (data gathered so far seems inconclusive) on the very first boot of the machine, so restarting the kubelet
// seems not sufficient. OTOH, the exact reproducer seems to require a dedicate lane with only this test, or
// to reboot the machine before to run this test. Both are practically unrealistic in CI.
// The closest approximation is this test in this current form, using a kubelet restart. This at least
// acts as non regression testing, so it still brings value.
ginkgo.It("should correctly start with cpumanager none policy in use with systemd", func(ctx context.Context) {
if !IsCgroup2UnifiedMode() {
ginkgo.Skip("this test requires cgroups v2")
}
var err error
var oldCfg *kubeletconfig.KubeletConfiguration
// Get current kubelet configuration
oldCfg, err = getCurrentKubeletConfig(ctx)
framework.ExpectNoError(err)
			ginkgo.DeferCleanup(func(ctx context.Context) {
				if oldCfg != nil {
					// Restore the original kubelet configuration.
					ginkgo.By("Stopping the kubelet")
					startKubelet := stopKubelet()
					// wait until the kubelet health check fails
					gomega.Eventually(ctx, func() bool {
						return kubeletHealthCheck(kubeletHealthCheckURL)
					}).WithTimeout(time.Minute).WithPolling(time.Second).Should(gomega.BeFalseBecause("expected kubelet health check to fail"))
					ginkgo.By("Stopped the kubelet")
					framework.ExpectNoError(e2enodekubelet.WriteKubeletConfigFile(oldCfg))
					ginkgo.By("Starting the kubelet")
					startKubelet()
					// wait until the kubelet health check succeeds
					gomega.Eventually(ctx, func(ctx context.Context) bool {
						return kubeletHealthCheck(kubeletHealthCheckURL)
					}).WithTimeout(2 * time.Minute).WithPolling(5 * time.Second).Should(gomega.BeTrueBecause("expected the kubelet to be healthy"))
					ginkgo.By("Started the kubelet")
				}
			})
			newCfg := oldCfg.DeepCopy()
			// Change the existing kubelet configuration: cpumanager none policy with the systemd cgroup driver.
			newCfg.CPUManagerPolicy = "none"
			newCfg.CgroupDriver = "systemd"
			// Extra safety: fail hard on cgroup v1 hosts. The cgroups v2 skip above avoids false negatives from this setting.
			newCfg.FailCgroupV1 = true
			// Update the kubelet configuration.
			ginkgo.By("Stopping the kubelet")
			startKubelet := stopKubelet()
			// wait until the kubelet health check fails
			gomega.Eventually(ctx, func() bool {
				return kubeletHealthCheck(kubeletHealthCheckURL)
			}).WithTimeout(time.Minute).WithPolling(time.Second).Should(gomega.BeFalseBecause("expected kubelet health check to fail"))
			ginkgo.By("Stopped the kubelet")
			framework.ExpectNoError(e2enodekubelet.WriteKubeletConfigFile(newCfg))
			ginkgo.By("Starting the kubelet")
			startKubelet()
			// wait until the node reports ready and the kubelet health check succeeds
			gomega.Eventually(ctx, func() bool {
				return getNodeReadyStatus(ctx, f) && kubeletHealthCheck(kubeletHealthCheckURL)
			}).WithTimeout(2 * time.Minute).WithPolling(5 * time.Second).Should(gomega.BeTrueBecause("expected the kubelet to be healthy"))
			ginkgo.By("Started the kubelet")
			gomega.Consistently(ctx, func(ctx context.Context) bool {
				return getNodeReadyStatus(ctx, f) && kubeletHealthCheck(kubeletHealthCheckURL)
			}).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(gomega.BeTrueBecause("node keeps reporting ready status"))
		})
	})
})
func expectFileValToEqual(filePath string, expectedValue, delta int64) error {
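The stop, write config, restart, wait-for-health sequence appears twice above: once in the cleanup and once in the test body. As a sketch only, not part of the commit, the duplication could be factored into a helper built on the stopKubelet, kubeletHealthCheck, and e2enodekubelet.WriteKubeletConfigFile helpers the test already uses (the restartKubeletWithConfig name is hypothetical):

// restartKubeletWithConfig is a hypothetical helper sketching how the repeated
// stop -> write config -> start -> wait-healthy sequence could be shared.
func restartKubeletWithConfig(ctx context.Context, cfg *kubeletconfig.KubeletConfiguration) {
	ginkgo.By("Stopping the kubelet")
	startKubelet := stopKubelet()
	// Wait until the kubelet health check fails, confirming the kubelet is down.
	gomega.Eventually(ctx, func() bool {
		return kubeletHealthCheck(kubeletHealthCheckURL)
	}).WithTimeout(time.Minute).WithPolling(time.Second).Should(gomega.BeFalseBecause("expected kubelet health check to fail"))
	// Write the desired configuration and bring the kubelet back up.
	framework.ExpectNoError(e2enodekubelet.WriteKubeletConfigFile(cfg))
	ginkgo.By("Starting the kubelet")
	startKubelet()
	// Wait until the kubelet reports healthy again.
	gomega.Eventually(ctx, func() bool {
		return kubeletHealthCheck(kubeletHealthCheckURL)
	}).WithTimeout(2 * time.Minute).WithPolling(5 * time.Second).Should(gomega.BeTrueBecause("expected the kubelet to be healthy"))
}

The node-readiness check (getNodeReadyStatus) is deliberately left out of the sketch, since the test body requires it but the cleanup path does not.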