Mirror of https://github.com/k3s-io/kubernetes.git (synced 2025-07-23 11:50:44 +00:00)
Merge pull request #41659 from jeffvance/kubelet-wedge2
Automatic merge from submit-queue

add kubelet tests to verify host clean up

**What this PR does / why we need it**: Increasingly we're seeing more failures in persistent volume e2e tests where PV tests are run in parallel with disruptive tests. The quick solution is to tag the PV tests as Flaky. This PR addresses one cause of the flakiness and adds a disruptive kubelet test. Once this PR is shown not to produce flakes, the [Flaky] tag for the "HostCleanup" tests will be removed in a separate PR.

+ Adds volume tests to _kubelet.go_, motivated by issues [31272](https://github.com/kubernetes/kubernetes/issues/31272) and [37657](https://github.com/kubernetes/kubernetes/issues/37657)
+ Addresses reverted PR [41178](https://github.com/kubernetes/kubernetes/pull/41178) and negates the need for PR [41229](https://github.com/kubernetes/kubernetes/pull/41229)

**Which issue this PR fixes**: Adds regression tests to cover issues #31272 and #37657.

**Special notes for your reviewer**: It's possible that one of the new tests, which relies on the existence of _/usr/sbin/rpc.nfsd_ in the nfs-server pod, will not work in the GCI container env. If this turns out to be true, then I will add a `SkipIfProviderIs("gke")` to the `It` block.

**Release note**:

```release-note
NONE
```
This commit is contained in: commit b0a724173a
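The reviewer note above leaves open adding a `SkipIfProviderIs("gke")` guard if _/usr/sbin/rpc.nfsd_ turns out to be unavailable in the GCI container environment. The following is only a minimal sketch of where such a guard could sit; the `Describe`/`It` wording and the elided test body are illustrative assumptions, not part of the merged diff:

```go
// Sketch only: shows the placement of the provider skip mentioned in the
// reviewer note. Only framework.SkipIfProviderIs and framework.KubeDescribe
// are taken from the e2e framework; the descriptions are illustrative.
package e2e

import (
	. "github.com/onsi/ginkgo"

	"k8s.io/kubernetes/test/e2e/framework"
)

var _ = framework.KubeDescribe("kubelet host cleanup (sketch)", func() {
	It("cleans up the NFS mount after the nfs-server is disrupted", func() {
		// If rpc.nfsd is missing from the nfs-server pod in the GCI container
		// env, skip on GKE as the PR description suggests.
		framework.SkipIfProviderIs("gke")

		// ... create the client pod, stop the nfs-server, delete the client
		// pod, then verify host cleanup as in the diff below ...
	})
})
```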
@@ -412,14 +412,13 @@ func DeletePodWithWait(f *Framework, c clientset.Interface, pod *v1.Pod) {
Expect(err).NotTo(HaveOccurred())
}

// wait for pod to terminate. Expect apierr NotFound
// wait for pod to terminate
err = f.WaitForPodTerminated(pod.Name, "")
Expect(err).To(HaveOccurred())
if !apierrs.IsNotFound(err) {
Logf("Error! Expected IsNotFound error deleting pod %q, instead got: %v", pod.Name, err)
Expect(apierrs.IsNotFound(err)).To(BeTrue())
if err != nil {
Expect(apierrs.IsNotFound(err)).To(BeTrue(), fmt.Sprintf("Expected 'IsNotFound' error deleting pod \"%v/%v\", instead got: %v", pod.Namespace, pod.Name, err))
Logf("Ignore \"not found\" error above")
}
Logf("Ignore \"not found\" error above. Pod %v successfully deleted", pod.Name)
Logf("Pod %q successfully deleted", pod.Name)
}

// Sanity check for GCE testing. Verify the persistent disk attached to the node.

@@ -616,17 +615,13 @@ func deletePD(pdName string) error {
}
}

// Create the test pod, wait for (hopefully) success, and then delete the pod.
// Create the test pod, wait for success, and then delete the pod.
func CreateWaitAndDeletePod(f *Framework, c clientset.Interface, ns string, pvc *v1.PersistentVolumeClaim) {
Logf("Creating nfs test pod")
// Make pod spec
pod := MakeWritePod(ns, pvc)

// Instantiate pod (Create)
runPod, err := c.CoreV1().Pods(ns).Create(pod)
Expect(err).NotTo(HaveOccurred())
Expect(runPod).NotTo(BeNil())

defer DeletePodWithWait(f, c, runPod)

// Wait for the test pod to complete its lifecycle
@@ -104,7 +104,7 @@ func updateNodeLabels(c clientset.Interface, nodeNames sets.String, toAdd, toRem
var node *v1.Node
var err error
for i := 0; i < maxRetries; i++ {
node, err = c.Core().Nodes().Get(nodeName, metav1.GetOptions{})
node, err = c.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
if err != nil {
framework.Logf("Error getting node %s: %v", nodeName, err)
continue

@@ -119,7 +119,7 @@ func updateNodeLabels(c clientset.Interface, nodeNames sets.String, toAdd, toRem
delete(node.ObjectMeta.Labels, k)
}
}
_, err = c.Core().Nodes().Update(node)
_, err = c.CoreV1().Nodes().Update(node)
if err != nil {
framework.Logf("Error updating node %s: %v", nodeName, err)
} else {

@@ -145,6 +145,26 @@ func createNfsServerPod(c clientset.Interface, config framework.VolumeTestConfig
return pod, ip
}

// Restart the passed-in nfs-server by issuing a `/usr/sbin/rpc.nfsd 1` command in the
// pod's (only) container. This command changes the number of nfs server threads from
// (presumably) zero back to 1, and therefore allows nfs to open connections again.
func restartNfsServer(serverPod *v1.Pod) {

const startcmd = "/usr/sbin/rpc.nfsd 1"
ns := fmt.Sprintf("--namespace=%v", serverPod.Namespace)
framework.RunKubectlOrDie("exec", ns, serverPod.Name, "--", "/bin/sh", "-c", startcmd)
}

// Stop the passed-in nfs-server by issuing a `/usr/sbin/rpc.nfsd 0` command in the
// pod's (only) container. This command changes the number of nfs server threads to 0,
// thus closing all open nfs connections.
func stopNfsServer(serverPod *v1.Pod) {

const stopcmd = "/usr/sbin/rpc.nfsd 0"
ns := fmt.Sprintf("--namespace=%v", serverPod.Namespace)
framework.RunKubectlOrDie("exec", ns, serverPod.Name, "--", "/bin/sh", "-c", stopcmd)
}

// Creates a pod that mounts an nfs volume that is served by the nfs-server pod. The container
// will execute the passed in shell cmd. Waits for the pod to start.
// Note: the nfs plugin is defined inline, no PV or PVC.
@@ -196,18 +216,35 @@ func createPodUsingNfs(f *framework.Framework, c clientset.Interface, ns, nfsIP,
},
},
}
rtnPod, err := c.Core().Pods(ns).Create(pod)
rtnPod, err := c.CoreV1().Pods(ns).Create(pod)
Expect(err).NotTo(HaveOccurred())

err = f.WaitForPodReady(rtnPod.Name) // running & ready
Expect(err).NotTo(HaveOccurred())

rtnPod, err = c.Core().Pods(ns).Get(rtnPod.Name, metav1.GetOptions{}) // return fresh pod
rtnPod, err = c.CoreV1().Pods(ns).Get(rtnPod.Name, metav1.GetOptions{}) // return fresh pod
Expect(err).NotTo(HaveOccurred())

return rtnPod
}

// move the passed-in pod's UID directory to /tmp.
func movePodUidDir(c clientset.Interface, pod *v1.Pod) {

dest := "/tmp"
podDir := filepath.Join("/var/lib/kubelet/pods", string(pod.UID))
cmd := fmt.Sprintf("mv %v %v", podDir, dest)
// use ip rather than hostname in GCE
nodeIP, err := framework.GetHostExternalAddress(c, pod)
Expect(err).NotTo(HaveOccurred())

// excute cmd over ssh
result, _ := nodeExec(nodeIP, cmd)
framework.LogSSHResult(result)
Expect(result.Code).To(BeZero())
Expect(len(result.Stderr)).To(BeZero())
}

// Checks for a lingering nfs mount and/or uid directory on the pod's host. The host IP is used
// so that this test runs in GCE, where it appears that SSH cannot resolve the hostname.
// If expectClean is true then we expect the node to be cleaned up and thus commands like
@@ -218,42 +255,48 @@ func checkPodCleanup(c clientset.Interface, pod *v1.Pod, expectClean bool) {

timeout := 5 * time.Minute
poll := 20 * time.Second
podUID := string(pod.UID)
podDir := filepath.Join("/var/lib/kubelet/pods", podUID)
podDir := filepath.Join("/var/lib/kubelet/pods", string(pod.UID))
mountDir := filepath.Join(podDir, "volumes", "kubernetes.io~nfs")
// use ip rather than hostname in GCE
nodeIP, err := framework.GetHostExternalAddress(c, pod)
Expect(err).NotTo(HaveOccurred())

condMsg := map[bool]string{
true: "deleted",
false: "present",
condMsg := "deleted"
if !expectClean {
condMsg = "present"
}

// table of host tests to perform
tests := map[string]string{ //["what-to-test"] "remote-command"
"pod UID directory": fmt.Sprintf("sudo ls %v", podDir),
"pod nfs mount": fmt.Sprintf("sudo mount | grep %v", mountDir),
// table of host tests to perform (order may matter so not using a map)
type testT struct {
feature string // feature to test
cmd string // remote command to execute on node
}
tests := []testT{
{
feature: "pod UID directory",
cmd: fmt.Sprintf("sudo ls %v", podDir),
},
{
feature: "pod nfs mount",
cmd: fmt.Sprintf("sudo mount | grep %v", mountDir),
},
}

for test, cmd := range tests {
framework.Logf("Wait up to %v for host's (%v) %q to be %v", timeout, nodeIP, test, condMsg[expectClean])
for _, test := range tests {
framework.Logf("Wait up to %v for host's (%v) %q to be %v", timeout, nodeIP, test.feature, condMsg)
err = wait.Poll(poll, timeout, func() (bool, error) {
result, _ := nodeExec(nodeIP, cmd)
result, _ := nodeExec(nodeIP, test.cmd)
framework.LogSSHResult(result)
sawFiles := result.Code == 0
if expectClean && sawFiles { // keep trying
ok := (result.Code == 0 && len(result.Stdout) > 0 && len(result.Stderr) == 0)
if expectClean && ok { // keep trying
return false, nil
}
if !expectClean && !sawFiles { // stop wait loop
return true, fmt.Errorf("%v is gone but expected to exist", test)
if !expectClean && !ok { // stop wait loop
return true, fmt.Errorf("%v is gone but expected to exist", test.feature)
}
return true, nil // done, host is as expected
})
if err != nil {
framework.Logf("Host (%v) cleanup error: %v. Expected %q to be %v", nodeIP, err, test, condMsg[expectClean])
Expect(err).NotTo(HaveOccurred())
}
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Host (%v) cleanup error: %v. Expected %q to be %v", nodeIP, err, test.feature, condMsg))
}

if expectClean {
@@ -375,14 +418,20 @@ var _ = framework.KubeDescribe("kubelet", func() {
}
})

// Delete nfs server pod after another pods accesses the mounted nfs volume.
framework.KubeDescribe("host cleanup with volume mounts [HostCleanup][Flaky]", func() {
// Test host cleanup when disrupting the volume environment.
framework.KubeDescribe("host cleanup with volume mounts [Volume][HostCleanup][Flaky]", func() {

type hostCleanupTest struct {
itDescr string
podCmd string
}

Context("Host cleanup after pod using NFS mount is deleted [Volume][NFS]", func() {
// Disrupt the nfs-server pod after a client pod accesses the nfs volume.
// Note: the nfs-server is stopped NOT deleted. This is done to preserve its ip addr.
// If the nfs-server pod is deleted the client pod's mount can not be unmounted.
// If the nfs-server pod is deleted and re-created, due to having a different ip
// addr, the client pod's mount still cannot be unmounted.
Context("Host cleanup after disrupting NFS volume [NFS]", func() {
// issue #31272
var (
nfsServerPod *v1.Pod
@@ -394,16 +443,17 @@ var _ = framework.KubeDescribe("kubelet", func() {
// fill in test slice for this context
testTbl := []hostCleanupTest{
{
itDescr: "after deleting the nfs-server, the host should be cleaned-up when deleting sleeping pod which mounts an NFS vol",
podCmd: "sleep 6000",
itDescr: "after stopping the nfs-server and deleting the (sleeping) client pod, the NFS mount and the pod's UID directory should be removed.",
podCmd: "sleep 6000", // keep pod running
},
{
itDescr: "after deleting the nfs-server, the host should be cleaned-up when deleting a pod accessing the NFS vol",
podCmd: "while true; do echo FeFieFoFum >>/mnt/SUCCESS; cat /mnt/SUCCESS; done",
itDescr: "after stopping the nfs-server and deleting the (active) client pod, the NFS mount and the pod's UID directory should be removed.",
podCmd: "while true; do echo FeFieFoFum >>/mnt/SUCCESS; sleep 1; cat /mnt/SUCCESS; done",
},
}

BeforeEach(func() {
framework.SkipUnlessProviderIs(framework.ProvidersWithSSH...)
NFSconfig = framework.VolumeTestConfig{
Namespace: ns,
Prefix: "nfs",
@@ -420,31 +470,46 @@ var _ = framework.KubeDescribe("kubelet", func() {
})

// execute It blocks from above table of tests
for _, test := range testTbl {
t := test // local copy for closure
It(fmt.Sprintf("%v [Serial]", t.itDescr), func() {
// create a pod which uses the nfs server's volume
for _, t := range testTbl {
It(t.itDescr, func() {
pod = createPodUsingNfs(f, c, ns, nfsIP, t.podCmd)

By("Delete the NFS server pod")
framework.DeletePodWithWait(f, c, nfsServerPod)
nfsServerPod = nil
By("Stop the NFS server")
stopNfsServer(nfsServerPod)

By("Delete the pod mounted to the NFS volume")
framework.DeletePodWithWait(f, c, pod)
// pod object is now stale, but is intentionally not nil

By("Check if host running deleted pod has been cleaned up -- expect not")
// expect the pod's host *not* to be cleaned up
By("Check if pod's host has been cleaned up -- expect not")
checkPodCleanup(c, pod, false)

By("Recreate the nfs server pod")
nfsServerPod, nfsIP = createNfsServerPod(c, NFSconfig)
By("Restart the nfs server")
restartNfsServer(nfsServerPod)

By("Verify host running the deleted pod is now cleaned up")
// expect the pod's host to be cleaned up
checkPodCleanup(c, pod, true)
})
}

// Move a pod's uid dir to /tmp and delete the pod.
// Addresses issue #37657.
// Note: the pod's vol mount (as a side effect) ends up being moved to /tmp
// and can be unmounted via `umount -f`.
It("move NFS client pod's UID directory then delete pod", func() {
pod = createPodUsingNfs(f, c, ns, nfsIP, "sleep 6000")

By("Move pod's uid dir to /tmp")
movePodUidDir(c, pod)

By("Delete the pod mounted to the NFS volume")
framework.DeletePodWithWait(f, c, pod)
// pod object is now stale, but is intentionally not nil
// Note: the pod's nfs mount, now in /tmp, will not be unmounted

By("Verify host running the deleted pod is cleaned up")
checkPodCleanup(c, pod, true)
})
})
})
})