diff --git a/test/e2e/framework/pv_util.go b/test/e2e/framework/pv_util.go
index e58c9b4c4e5..4b7515c2784 100644
--- a/test/e2e/framework/pv_util.go
+++ b/test/e2e/framework/pv_util.go
@@ -412,14 +412,13 @@ func DeletePodWithWait(f *Framework, c clientset.Interface, pod *v1.Pod) {
 		Expect(err).NotTo(HaveOccurred())
 	}
 
-	// wait for pod to terminate. Expect apierr NotFound
+	// wait for pod to terminate
 	err = f.WaitForPodTerminated(pod.Name, "")
-	Expect(err).To(HaveOccurred())
-	if !apierrs.IsNotFound(err) {
-		Logf("Error! Expected IsNotFound error deleting pod %q, instead got: %v", pod.Name, err)
-		Expect(apierrs.IsNotFound(err)).To(BeTrue())
+	if err != nil {
+		Expect(apierrs.IsNotFound(err)).To(BeTrue(), fmt.Sprintf("Expected 'IsNotFound' error deleting pod \"%v/%v\", instead got: %v", pod.Namespace, pod.Name, err))
+		Logf("Ignore \"not found\" error above")
 	}
-	Logf("Ignore \"not found\" error above. Pod %v successfully deleted", pod.Name)
+	Logf("Pod %q successfully deleted", pod.Name)
 }
 
 // Sanity check for GCE testing. Verify the persistent disk attached to the node.
@@ -616,17 +615,13 @@ func deletePD(pdName string) error {
 	}
 }
 
-// Create the test pod, wait for (hopefully) success, and then delete the pod.
+// Create the test pod, wait for success, and then delete the pod.
 func CreateWaitAndDeletePod(f *Framework, c clientset.Interface, ns string, pvc *v1.PersistentVolumeClaim) {
 	Logf("Creating nfs test pod")
-	// Make pod spec
 	pod := MakeWritePod(ns, pvc)
-
-	// Instantiate pod (Create)
 	runPod, err := c.CoreV1().Pods(ns).Create(pod)
 	Expect(err).NotTo(HaveOccurred())
 	Expect(runPod).NotTo(BeNil())
-
 	defer DeletePodWithWait(f, c, runPod)
 
 	// Wait for the test pod to complete its lifecycle
diff --git a/test/e2e/kubelet.go b/test/e2e/kubelet.go
index 8fcbc24eb7d..07de91f4197 100644
--- a/test/e2e/kubelet.go
+++ b/test/e2e/kubelet.go
@@ -104,7 +104,7 @@ func updateNodeLabels(c clientset.Interface, nodeNames sets.String, toAdd, toRem
 		var node *v1.Node
 		var err error
 		for i := 0; i < maxRetries; i++ {
-			node, err = c.Core().Nodes().Get(nodeName, metav1.GetOptions{})
+			node, err = c.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
 			if err != nil {
 				framework.Logf("Error getting node %s: %v", nodeName, err)
 				continue
@@ -119,7 +119,7 @@ func updateNodeLabels(c clientset.Interface, nodeNames sets.String, toAdd, toRem
 					delete(node.ObjectMeta.Labels, k)
 				}
 			}
-			_, err = c.Core().Nodes().Update(node)
+			_, err = c.CoreV1().Nodes().Update(node)
 			if err != nil {
 				framework.Logf("Error updating node %s: %v", nodeName, err)
 			} else {
@@ -145,6 +145,26 @@ func createNfsServerPod(c clientset.Interface, config framework.VolumeTestConfig
 	return pod, ip
 }
 
+// Restart the passed-in nfs-server by issuing a `/usr/sbin/rpc.nfsd 1` command in the
+// pod's (only) container. This command changes the number of nfs server threads from
+// (presumably) zero back to 1, and therefore allows nfs to open connections again.
+func restartNfsServer(serverPod *v1.Pod) {
+
+	const startcmd = "/usr/sbin/rpc.nfsd 1"
+	ns := fmt.Sprintf("--namespace=%v", serverPod.Namespace)
+	framework.RunKubectlOrDie("exec", ns, serverPod.Name, "--", "/bin/sh", "-c", startcmd)
+}
+
+// Stop the passed-in nfs-server by issuing a `/usr/sbin/rpc.nfsd 0` command in the
+// pod's (only) container. This command changes the number of nfs server threads to 0,
+// thus closing all open nfs connections.
+func stopNfsServer(serverPod *v1.Pod) {
+
+	const stopcmd = "/usr/sbin/rpc.nfsd 0"
+	ns := fmt.Sprintf("--namespace=%v", serverPod.Namespace)
+	framework.RunKubectlOrDie("exec", ns, serverPod.Name, "--", "/bin/sh", "-c", stopcmd)
+}
+
 // Creates a pod that mounts an nfs volume that is served by the nfs-server pod. The container
 // will execute the passed in shell cmd. Waits for the pod to start.
 // Note: the nfs plugin is defined inline, no PV or PVC.
@@ -196,18 +216,35 @@ func createPodUsingNfs(f *framework.Framework, c clientset.Interface, ns, nfsIP,
 			},
 		},
 	}
-	rtnPod, err := c.Core().Pods(ns).Create(pod)
+	rtnPod, err := c.CoreV1().Pods(ns).Create(pod)
 	Expect(err).NotTo(HaveOccurred())
 
 	err = f.WaitForPodReady(rtnPod.Name) // running & ready
 	Expect(err).NotTo(HaveOccurred())
 
-	rtnPod, err = c.Core().Pods(ns).Get(rtnPod.Name, metav1.GetOptions{}) // return fresh pod
+	rtnPod, err = c.CoreV1().Pods(ns).Get(rtnPod.Name, metav1.GetOptions{}) // return fresh pod
 	Expect(err).NotTo(HaveOccurred())
 
 	return rtnPod
 }
 
+// Move the passed-in pod's UID directory to /tmp.
+func movePodUidDir(c clientset.Interface, pod *v1.Pod) {
+
+	dest := "/tmp"
+	podDir := filepath.Join("/var/lib/kubelet/pods", string(pod.UID))
+	cmd := fmt.Sprintf("mv %v %v", podDir, dest)
+	// use ip rather than hostname in GCE
+	nodeIP, err := framework.GetHostExternalAddress(c, pod)
+	Expect(err).NotTo(HaveOccurred())
+
+	// execute cmd over ssh
+	result, _ := nodeExec(nodeIP, cmd)
+	framework.LogSSHResult(result)
+	Expect(result.Code).To(BeZero())
+	Expect(len(result.Stderr)).To(BeZero())
+}
+
 // Checks for a lingering nfs mount and/or uid directory on the pod's host. The host IP is used
 // so that this test runs in GCE, where it appears that SSH cannot resolve the hostname.
 // If expectClean is true then we expect the node to be cleaned up and thus commands like
@@ -218,42 +255,48 @@ func checkPodCleanup(c clientset.Interface, pod *v1.Pod, expectClean bool) {
 	timeout := 5 * time.Minute
 	poll := 20 * time.Second
 
-	podUID := string(pod.UID)
-	podDir := filepath.Join("/var/lib/kubelet/pods", podUID)
+	podDir := filepath.Join("/var/lib/kubelet/pods", string(pod.UID))
 	mountDir := filepath.Join(podDir, "volumes", "kubernetes.io~nfs")
 	// use ip rather than hostname in GCE
 	nodeIP, err := framework.GetHostExternalAddress(c, pod)
 	Expect(err).NotTo(HaveOccurred())
 
-	condMsg := map[bool]string{
-		true:  "deleted",
-		false: "present",
+	condMsg := "deleted"
+	if !expectClean {
+		condMsg = "present"
 	}
 
-	// table of host tests to perform
-	tests := map[string]string{ //["what-to-test"] "remote-command"
-		"pod UID directory": fmt.Sprintf("sudo ls %v", podDir),
-		"pod nfs mount":     fmt.Sprintf("sudo mount | grep %v", mountDir),
+	// table of host tests to perform (order may matter so not using a map)
+	type testT struct {
+		feature string // feature to test
+		cmd     string // remote command to execute on node
+	}
+	tests := []testT{
+		{
+			feature: "pod UID directory",
+			cmd:     fmt.Sprintf("sudo ls %v", podDir),
+		},
+		{
+			feature: "pod nfs mount",
+			cmd:     fmt.Sprintf("sudo mount | grep %v", mountDir),
+		},
 	}
 
-	for test, cmd := range tests {
-		framework.Logf("Wait up to %v for host's (%v) %q to be %v", timeout, nodeIP, test, condMsg[expectClean])
+	for _, test := range tests {
+		framework.Logf("Wait up to %v for host's (%v) %q to be %v", timeout, nodeIP, test.feature, condMsg)
 		err = wait.Poll(poll, timeout, func() (bool, error) {
-			result, _ := nodeExec(nodeIP, cmd)
+			result, _ := nodeExec(nodeIP, test.cmd)
 			framework.LogSSHResult(result)
-			sawFiles := result.Code == 0
-			if expectClean && sawFiles { // keep trying
+			ok := (result.Code == 0 && len(result.Stdout) > 0 && len(result.Stderr) == 0)
+			if expectClean && ok { // keep trying
 				return false, nil
 			}
-			if !expectClean && !sawFiles { // stop wait loop
-				return true, fmt.Errorf("%v is gone but expected to exist", test)
+			if !expectClean && !ok { // stop wait loop
+				return true, fmt.Errorf("%v is gone but expected to exist", test.feature)
 			}
 			return true, nil // done, host is as expected
 		})
-		if err != nil {
-			framework.Logf("Host (%v) cleanup error: %v. Expected %q to be %v", nodeIP, err, test, condMsg[expectClean])
-			Expect(err).NotTo(HaveOccurred())
-		}
+		Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Host (%v) cleanup error: %v. Expected %q to be %v", nodeIP, err, test.feature, condMsg))
 	}
 
 	if expectClean {
@@ -375,14 +418,20 @@ var _ = framework.KubeDescribe("kubelet", func() {
 		}
 	})
 
-	// Delete nfs server pod after another pods accesses the mounted nfs volume.
-	framework.KubeDescribe("host cleanup with volume mounts [HostCleanup][Flaky]", func() {
+	// Test host cleanup when disrupting the volume environment.
+	framework.KubeDescribe("host cleanup with volume mounts [Volume][HostCleanup][Flaky]", func() {
+
 		type hostCleanupTest struct {
 			itDescr string
 			podCmd  string
 		}
 
-		Context("Host cleanup after pod using NFS mount is deleted [Volume][NFS]", func() {
+		// Disrupt the nfs-server pod after a client pod accesses the nfs volume.
+		// Note: the nfs-server is stopped NOT deleted. This is done to preserve its ip addr.
+		// If the nfs-server pod is deleted the client pod's mount cannot be unmounted.
+		// If the nfs-server pod is deleted and re-created, due to having a different ip
+		// addr, the client pod's mount still cannot be unmounted.
+		Context("Host cleanup after disrupting NFS volume [NFS]", func() {
 			// issue #31272
 			var (
 				nfsServerPod *v1.Pod
@@ -394,16 +443,17 @@ var _ = framework.KubeDescribe("kubelet", func() {
 			// fill in test slice for this context
 			testTbl := []hostCleanupTest{
 				{
-					itDescr: "after deleting the nfs-server, the host should be cleaned-up when deleting sleeping pod which mounts an NFS vol",
-					podCmd:  "sleep 6000",
+					itDescr: "after stopping the nfs-server and deleting the (sleeping) client pod, the NFS mount and the pod's UID directory should be removed.",
+					podCmd:  "sleep 6000", // keep pod running
 				},
 				{
-					itDescr: "after deleting the nfs-server, the host should be cleaned-up when deleting a pod accessing the NFS vol",
-					podCmd:  "while true; do echo FeFieFoFum >>/mnt/SUCCESS; cat /mnt/SUCCESS; done",
+					itDescr: "after stopping the nfs-server and deleting the (active) client pod, the NFS mount and the pod's UID directory should be removed.",
+					podCmd:  "while true; do echo FeFieFoFum >>/mnt/SUCCESS; sleep 1; cat /mnt/SUCCESS; done",
 				},
 			}
 
 			BeforeEach(func() {
+				framework.SkipUnlessProviderIs(framework.ProvidersWithSSH...)
 				NFSconfig = framework.VolumeTestConfig{
 					Namespace: ns,
 					Prefix:    "nfs",
@@ -420,31 +470,46 @@
 			})
 
 			// execute It blocks from above table of tests
-			for _, test := range testTbl {
-				t := test // local copy for closure
-				It(fmt.Sprintf("%v [Serial]", t.itDescr), func() {
-					// create a pod which uses the nfs server's volume
+			for _, t := range testTbl {
+				It(t.itDescr, func() {
 					pod = createPodUsingNfs(f, c, ns, nfsIP, t.podCmd)
 
-					By("Delete the NFS server pod")
-					framework.DeletePodWithWait(f, c, nfsServerPod)
-					nfsServerPod = nil
+					By("Stop the NFS server")
+					stopNfsServer(nfsServerPod)
 
 					By("Delete the pod mounted to the NFS volume")
					framework.DeletePodWithWait(f, c, pod)
 					// pod object is now stale, but is intentionally not nil
 
-					By("Check if host running deleted pod has been cleaned up -- expect not")
-					// expect the pod's host *not* to be cleaned up
+					By("Check if pod's host has been cleaned up -- expect not")
 					checkPodCleanup(c, pod, false)
 
-					By("Recreate the nfs server pod")
-					nfsServerPod, nfsIP = createNfsServerPod(c, NFSconfig)
+					By("Restart the nfs server")
+					restartNfsServer(nfsServerPod)
+
 					By("Verify host running the deleted pod is now cleaned up")
-					// expect the pod's host to be cleaned up
 					checkPodCleanup(c, pod, true)
 				})
 			}
+
+			// Move a pod's uid dir to /tmp and delete the pod.
+			// Addresses issue #37657.
+			// Note: the pod's vol mount (as a side effect) ends up being moved to /tmp
+			// and can be unmounted via `umount -f`.
+			It("move NFS client pod's UID directory then delete pod", func() {
+				pod = createPodUsingNfs(f, c, ns, nfsIP, "sleep 6000")
+
+				By("Move pod's uid dir to /tmp")
+				movePodUidDir(c, pod)
+
+				By("Delete the pod mounted to the NFS volume")
+				framework.DeletePodWithWait(f, c, pod)
+				// pod object is now stale, but is intentionally not nil
+				// Note: the pod's nfs mount, now in /tmp, will not be unmounted
+
+				By("Verify host running the deleted pod is cleaned up")
+				checkPodCleanup(c, pod, true)
+			})
 		})
 	})
 })