From c95975d3f32e409f0443d041fd5be0e0438c23d7 Mon Sep 17 00:00:00 2001
From: Mauricio Poppe
Date: Mon, 8 Mar 2021 19:01:09 +0000
Subject: [PATCH] wait until the node is not using the volume before taking a
 snapshot

---
 test/e2e/storage/testsuites/provisioning.go  |  6 ++-
 test/e2e/storage/testsuites/snapshottable.go | 47 ++++++++++++++++++--
 2 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/test/e2e/storage/testsuites/provisioning.go b/test/e2e/storage/testsuites/provisioning.go
index 1ec3fb38f68..ae77dbf808d 100644
--- a/test/e2e/storage/testsuites/provisioning.go
+++ b/test/e2e/storage/testsuites/provisioning.go
@@ -674,10 +674,14 @@ func (t StorageClassTest) TestBindingWaitForFirstConsumerMultiPVC(claims []*v1.P
 
 // RunInPodWithVolume runs a command in a pod with given claim mounted to /mnt directory.
 // It starts, checks, collects output and stops it.
-func RunInPodWithVolume(c clientset.Interface, t *framework.TimeoutContext, ns, claimName, podName, command string, node e2epod.NodeSelection) {
+func RunInPodWithVolume(c clientset.Interface, t *framework.TimeoutContext, ns, claimName, podName, command string, node e2epod.NodeSelection) *v1.Pod {
 	pod := StartInPodWithVolume(c, ns, claimName, podName, command, node)
 	defer StopPod(c, pod)
 	framework.ExpectNoError(e2epod.WaitForPodSuccessInNamespaceTimeout(c, pod.Name, pod.Namespace, t.PodStartSlow))
+	// get the latest status of the pod
+	pod, err := c.CoreV1().Pods(pod.Namespace).Get(context.TODO(), pod.Name, metav1.GetOptions{})
+	framework.ExpectNoError(err)
+	return pod
 }
 
 // StartInPodWithVolume starts a command in a pod with given claim mounted to /mnt directory
diff --git a/test/e2e/storage/testsuites/snapshottable.go b/test/e2e/storage/testsuites/snapshottable.go
index 7273d8e46de..87b5f1393fe 100644
--- a/test/e2e/storage/testsuites/snapshottable.go
+++ b/test/e2e/storage/testsuites/snapshottable.go
@@ -19,9 +19,11 @@ package testsuites
 import (
 	"context"
 	"fmt"
+	"strings"
 	"time"
 
 	"github.com/onsi/ginkgo"
+	"github.com/onsi/gomega"
 
 	v1 "k8s.io/api/core/v1"
 	storagev1 "k8s.io/api/storage/v1"
@@ -143,7 +145,7 @@ func (s *snapshottableTestSuite) DefineTests(driver storageframework.TestDriver,
 			originalMntTestData = fmt.Sprintf("hello from %s namespace", pvc.GetNamespace())
 			command := fmt.Sprintf("echo '%s' > %s", originalMntTestData, datapath)
 
-			RunInPodWithVolume(cs, f.Timeouts, pvc.Namespace, pvc.Name, "pvc-snapshottable-tester", command, config.ClientNodeSelection)
+			pod := RunInPodWithVolume(cs, f.Timeouts, pvc.Namespace, pvc.Name, "pvc-snapshottable-tester", command, config.ClientNodeSelection)
 
 			err = e2epv.WaitForPersistentVolumeClaimPhase(v1.ClaimBound, cs, pvc.Namespace, pvc.Name, framework.Poll, f.Timeouts.ClaimProvision)
 			framework.ExpectNoError(err)
@@ -155,8 +157,47 @@ func (s *snapshottableTestSuite) DefineTests(driver storageframework.TestDriver,
 
 			// Get the bound PV
 			ginkgo.By("[init] checking the PV")
-			_, err = cs.CoreV1().PersistentVolumes().Get(context.TODO(), pvc.Spec.VolumeName, metav1.GetOptions{})
+			pv, err := cs.CoreV1().PersistentVolumes().Get(context.TODO(), pvc.Spec.VolumeName, metav1.GetOptions{})
 			framework.ExpectNoError(err)
+
+			// At this point we know that:
+			// - a pod was created with a PV that's supposed to have data
+			//
+			// However, there's a caching issue that @jinxu97 explained; it's related to the pod & volume
+			// lifecycle on Windows. To understand it, first consider what the volumemanager does:
+			// - when a pod is deleted, the volumemanager tries to clean up the volume mounts
+			//   - NodeUnpublishVolume: unbinds the bind mount from the container
+			//     - Linux: the data is flushed to disk
+			//     - Windows: we delete a symlink, the data is not yet flushed to disk
+			//   - NodeUnstageVolume: unmounts the global mount
+			//     - Linux: the disk is detached
+			//     - Windows: the data is flushed to disk and the disk is detached
+			//
+			// Pod deletion might not guarantee a data flush to disk; however, NodeUnstageVolume adds the logic
+			// to flush the data to disk (see #81690 for details).
+			//
+			// In the following code, by checking that the PV is no longer in the node.Status.VolumesInUse field,
+			// we ensure that the volume is not used by the node anymore (an indicator that NodeUnstageVolume has
+			// already finished).
+			if framework.NodeOSDistroIs("windows") {
+				nodeName := pod.Spec.NodeName
+				gomega.Expect(nodeName).NotTo(gomega.BeEmpty(), "pod.Spec.NodeName must not be empty")
+
+				ginkgo.By(fmt.Sprintf("[init] waiting until the node=%s is not using the volume=%s", nodeName, pv.Name))
+				success := storageutils.WaitUntil(framework.Poll, f.Timeouts.PVDelete, func() bool {
+					node, err := cs.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
+					framework.ExpectNoError(err)
+					volumesInUse := node.Status.VolumesInUse
+					framework.Logf("current volumes in use: %+v", volumesInUse)
+					for i := 0; i < len(volumesInUse); i++ {
+						if strings.HasSuffix(string(volumesInUse[i]), pv.Name) {
+							return false
+						}
+					}
+					return true
+				})
+				framework.ExpectEqual(success, true)
+			}
 		}
 
 		cleanup := func() {
@@ -200,8 +241,8 @@ func (s *snapshottableTestSuite) DefineTests(driver storageframework.TestDriver,
 			vsc = sr.Vsclass
 		})
 		ginkgo.It("should check snapshot fields, check restore correctly works after modifying source data, check deletion", func() {
-			ginkgo.By("checking the snapshot")
 			// Get new copy of the snapshot
+			ginkgo.By("checking the snapshot")
 			vs, err = dc.Resource(storageutils.SnapshotGVR).Namespace(vs.GetNamespace()).Get(context.TODO(), vs.GetName(), metav1.GetOptions{})
 			framework.ExpectNoError(err)
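
The check added in the snapshottable.go hunk above leans on e2e framework helpers (storageutils.WaitUntil, framework.Poll, f.Timeouts.PVDelete). For readers who want the same technique outside the test framework, below is a minimal sketch that polls node.Status.VolumesInUse with client-go and apimachinery's wait.PollImmediate until no entry ends with the PV name. The package name, the function name waitForVolumeDetached, and the poll/timeout parameters are illustrative assumptions, not part of this patch.

package example

import (
	"context"
	"strings"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
)

// waitForVolumeDetached polls the node object until none of the entries in
// node.Status.VolumesInUse ends with the PV name, an indicator that
// NodeUnstageVolume has finished and the node no longer uses the volume.
// (Illustrative helper, not part of the patch above.)
func waitForVolumeDetached(cs kubernetes.Interface, nodeName, pvName string, poll, timeout time.Duration) error {
	return wait.PollImmediate(poll, timeout, func() (bool, error) {
		node, err := cs.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
		if err != nil {
			return false, err
		}
		for _, volumeInUse := range node.Status.VolumesInUse {
			// Same suffix match as in the patch: an entry ending with the
			// PV name means the node still reports the volume as in use.
			if strings.HasSuffix(string(volumeInUse), pvName) {
				return false, nil
			}
		}
		return true, nil
	})
}

A caller would invoke it with the pod's node and the bound PV before taking the snapshot, e.g. err := waitForVolumeDetached(cs, pod.Spec.NodeName, pv.Name, 2*time.Second, 5*time.Minute), where the poll interval and timeout are example values.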