From 61c4732c71a90883ad0b2a8c98e3b5b3b891da3d Mon Sep 17 00:00:00 2001 From: Imran Pochi Date: Thu, 12 Aug 2021 21:06:57 +0530 Subject: [PATCH 1/2] Revert "Merge pull request #104308 from ehashman/revert-103608-imran/e2e-lock-contention" This reverts commit 9d09c9d24630d76f6c491181de00829e8a0c2994 This E2E test was reverted becuase the test was failing continously. More on the issue here #104307 This commit re-reverts and brings back the LockContention test, with the addition of [Serial] tag to the test. --- test/e2e_node/lock_contention_test.go | 82 +++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 test/e2e_node/lock_contention_test.go diff --git a/test/e2e_node/lock_contention_test.go b/test/e2e_node/lock_contention_test.go new file mode 100644 index 00000000000..e7bacd0822a --- /dev/null +++ b/test/e2e_node/lock_contention_test.go @@ -0,0 +1,82 @@ +//go:build linux +// +build linux + +/* +Copyright 2021 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2enode + +import ( + "time" + + "golang.org/x/sys/unix" + + "github.com/onsi/ginkgo" + "github.com/onsi/gomega" + "k8s.io/kubernetes/test/e2e/framework" +) + +const contentionLockFile = "/var/run/kubelet.lock" + +var _ = SIGDescribe("Lock contention [Slow] [Disruptive] [Serial] [NodeFeature:LockContention]", func() { + + ginkgo.It("Kubelet should stop when the test acquires the lock on lock file and restart once the lock is released", func() { + + ginkgo.By("perform kubelet health check to check if kubelet is healthy and running.") + // Precautionary check that kubelet is healthy before running the test. + gomega.Expect(kubeletHealthCheck(kubeletHealthCheckURL)).To(gomega.BeTrue()) + + ginkgo.By("acquiring the lock on lock file i.e /var/run/kubelet.lock") + // Open the file with the intention to acquire the lock, this would imitate the behaviour + // of the another kubelet(self-hosted) trying to start. When this lock contention happens + // it is expected that the running kubelet must terminate and wait until the lock on the + // lock file is released. + // Kubelet uses the same approach to acquire the lock on lock file as shown here: + // https://github.com/kubernetes/kubernetes/blob/master/cmd/kubelet/app/server.go#L530-#L546 + // and the function definition of Acquire is here: + // https://github.com/kubernetes/kubernetes/blob/master/pkg/util/flock/flock_unix.go#L25 + fd, err := unix.Open(contentionLockFile, unix.O_CREAT|unix.O_RDWR|unix.O_CLOEXEC, 0600) + framework.ExpectNoError(err) + // Defer the lock release in case test fails and we don't reach the step of the release + // lock. This ensures that we release the lock for sure. + defer func() { + err = unix.Flock(fd, unix.LOCK_UN) + framework.ExpectNoError(err) + }() + // Acquire lock. + err = unix.Flock(fd, unix.LOCK_EX) + framework.ExpectNoError(err) + + ginkgo.By("verifying the kubelet is not healthy as there was a lock contention.") + // Once the lock is acquired, check if the kubelet is in healthy state or not. + // It should not be. + gomega.Eventually(func() bool { + return kubeletHealthCheck(kubeletHealthCheckURL) + }, 10*time.Second, time.Second).Should(gomega.BeFalse()) + + ginkgo.By("releasing the lock on lock file i.e /var/run/kubelet.lock") + // Release the lock. + err = unix.Flock(fd, unix.LOCK_UN) + framework.ExpectNoError(err) + + ginkgo.By("verifying the kubelet is healthy again after the lock was released.") + // Releasing the lock triggers kubelet to re-acquire the lock and restart. + // Hence the kubelet should report healthy state. + gomega.Eventually(func() bool { + return kubeletHealthCheck(kubeletHealthCheckURL) + }, 10*time.Second, time.Second).Should(gomega.BeTrue()) + }) +}) From 6071f6e8ab79eb61cbf7ca15820c1c0437a891d9 Mon Sep 17 00:00:00 2001 From: Imran Pochi Date: Thu, 20 Jan 2022 11:45:22 +0530 Subject: [PATCH 2/2] Addressing review comments This commit is to be squashed and merged with the first commit. Signed-off-by: Imran Pochi --- ...on_test.go => lock_contention_linux_test.go} | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) rename test/e2e_node/{lock_contention_test.go => lock_contention_linux_test.go} (76%) diff --git a/test/e2e_node/lock_contention_test.go b/test/e2e_node/lock_contention_linux_test.go similarity index 76% rename from test/e2e_node/lock_contention_test.go rename to test/e2e_node/lock_contention_linux_test.go index e7bacd0822a..ae01b23ebc3 100644 --- a/test/e2e_node/lock_contention_test.go +++ b/test/e2e_node/lock_contention_linux_test.go @@ -31,8 +31,13 @@ import ( const contentionLockFile = "/var/run/kubelet.lock" -var _ = SIGDescribe("Lock contention [Slow] [Disruptive] [Serial] [NodeFeature:LockContention]", func() { +// Kubelet Lock contention tests the lock contention feature. +// Disruptive because the kubelet is restarted in the test. +// NodeSpecialFeature:LockContention because we don't want the test to be picked up by any other +// test suite, hence the unique name "LockContention". +var _ = SIGDescribe("Lock contention [Slow] [Disruptive] [NodeSpecialFeature:LockContention]", func() { + // Requires `--lock-file` & `--exit-on-lock-contention` flags to be set on the Kubelet. ginkgo.It("Kubelet should stop when the test acquires the lock on lock file and restart once the lock is released", func() { ginkgo.By("perform kubelet health check to check if kubelet is healthy and running.") @@ -45,9 +50,9 @@ var _ = SIGDescribe("Lock contention [Slow] [Disruptive] [Serial] [NodeFeature:L // it is expected that the running kubelet must terminate and wait until the lock on the // lock file is released. // Kubelet uses the same approach to acquire the lock on lock file as shown here: - // https://github.com/kubernetes/kubernetes/blob/master/cmd/kubelet/app/server.go#L530-#L546 + // https://github.com/kubernetes/kubernetes/blob/9d2b361ebc7ef28f7cb75596ef40b7c239732d37/cmd/kubelet/app/server.go#L512-#L523 // and the function definition of Acquire is here: - // https://github.com/kubernetes/kubernetes/blob/master/pkg/util/flock/flock_unix.go#L25 + // https://github.com/kubernetes/kubernetes/blob/9d2b361ebc7ef28f7cb75596ef40b7c239732d37/pkg/util/flock/flock_unix.go#L26 fd, err := unix.Open(contentionLockFile, unix.O_CREAT|unix.O_RDWR|unix.O_CLOEXEC, 0600) framework.ExpectNoError(err) // Defer the lock release in case test fails and we don't reach the step of the release @@ -67,14 +72,14 @@ var _ = SIGDescribe("Lock contention [Slow] [Disruptive] [Serial] [NodeFeature:L return kubeletHealthCheck(kubeletHealthCheckURL) }, 10*time.Second, time.Second).Should(gomega.BeFalse()) - ginkgo.By("releasing the lock on lock file i.e /var/run/kubelet.lock") + ginkgo.By("releasing the lock on lock file i.e /var/run/kubelet.lock, triggering kubelet restart.") // Release the lock. err = unix.Flock(fd, unix.LOCK_UN) framework.ExpectNoError(err) - ginkgo.By("verifying the kubelet is healthy again after the lock was released.") // Releasing the lock triggers kubelet to re-acquire the lock and restart. - // Hence the kubelet should report healthy state. + ginkgo.By("verifying the kubelet is healthy after restart.") + // Kubelet should report healthy state. gomega.Eventually(func() bool { return kubeletHealthCheck(kubeletHealthCheckURL) }, 10*time.Second, time.Second).Should(gomega.BeTrue())