From 220163f67de05890dd57b2ed6f2b02362f633a78 Mon Sep 17 00:00:00 2001
From: Jan Safranek
Date: Tue, 2 Feb 2016 14:38:49 +0100
Subject: [PATCH] Fixed races in Cinder volume attach/detach.

Add a mutex to guard SetUpAt() and TearDownAt() calls - they should not
run in parallel. There is a race in these calls when there are two pods
using the same volume, one of them is dying and the other one starting.

TearDownAt() checks that a volume is not needed by any pods and detaches the
volume. It does so by counting how many times the volume is mounted (the
GetMountRefs() call below). When SetUpAt() of the starting pod has already
attached the volume but has not mounted it yet, TearDownAt() of the dying pod
will detach it - GetMountRefs() does not account for this volume.

These two threads run in parallel:

  dying pod.TearDownAt("myVolume")         starting pod.SetUpAt("myVolume")
    |                                        |
    |                                        AttachDisk("myVolume")
    refs, err := mount.GetMountRefs()        |
    Unmount("myDir")                         |
    if refs == 1 {                           |
    |                                        |
    |                                        Mount("myVolume", "myDir")
    |                                        |
    |                                        |
    DetachDisk("myVolume")                   |
    |                                        |
    |                                        start containers - OOPS! The volume is detached!
    |
    finish the pod cleanup

Also, add some logs to the cinder plugin for easier debugging in the future,
add a test and update the fake mounter to know about bind mounts.
---
 pkg/util/mount/fake.go           |  58 ++++++++++-
 pkg/volume/cinder/cinder.go      |  53 +++++++++-
 pkg/volume/cinder/cinder_test.go | 173 +++++++++++++++++++++++++++++--
 3 files changed, 269 insertions(+), 15 deletions(-)

diff --git a/pkg/util/mount/fake.go b/pkg/util/mount/fake.go
index 35c2f36aa5f..115293813b5 100644
--- a/pkg/util/mount/fake.go
+++ b/pkg/util/mount/fake.go
@@ -16,10 +16,19 @@ limitations under the License.
 
 package mount
 
+import (
+    "sync"
+
+    "github.com/golang/glog"
+)
+
 // FakeMounter implements mount.Interface for tests.
 type FakeMounter struct {
     MountPoints []MountPoint
     Log         []FakeAction
+    // Some tests run things in parallel, make sure the mounter does not produce
+    // any of golang's DATA RACE warnings.
+    mutex sync.Mutex
 }
 
 var _ Interface = &FakeMounter{}
@@ -37,21 +46,58 @@ type FakeAction struct {
 }
 
 func (f *FakeMounter) ResetLog() {
+    f.mutex.Lock()
+    defer f.mutex.Unlock()
+
     f.Log = []FakeAction{}
 }
 
 func (f *FakeMounter) Mount(source string, target string, fstype string, options []string) error {
+    f.mutex.Lock()
+    defer f.mutex.Unlock()
+
+    // find 'bind' option
+    for _, option := range options {
+        if option == "bind" {
+            // This is a bind-mount. In order to mimic linux behaviour, we must
+            // use the original device of the bind-mount as the real source.
+            // E.g. when mounted /dev/sda like this:
+            //   $ mount /dev/sda /mnt/test
+            //   $ mount -o bind /mnt/test /mnt/bound
+            // then /proc/mount contains:
+            //   /dev/sda /mnt/test
+            //   /dev/sda /mnt/bound
+            // (and not /mnt/test /mnt/bound)
+            // I.e. we must use /dev/sda as source instead of /mnt/test in the
+            // bind mount.
+            for _, mnt := range f.MountPoints {
+                if source == mnt.Path {
+                    source = mnt.Device
+                    break
+                }
+            }
+            break
+        }
+    }
+
     f.MountPoints = append(f.MountPoints, MountPoint{Device: source, Path: target, Type: fstype})
+    glog.V(5).Infof("Fake mounter: mounted %s to %s", source, target)
     f.Log = append(f.Log, FakeAction{Action: FakeActionMount, Target: target, Source: source, FSType: fstype})
     return nil
 }
 
 func (f *FakeMounter) Unmount(target string) error {
+    f.mutex.Lock()
+    defer f.mutex.Unlock()
+
     newMountpoints := []MountPoint{}
     for _, mp := range f.MountPoints {
-        if mp.Path != target {
-            newMountpoints = append(newMountpoints, MountPoint{Device: mp.Device, Path: mp.Path, Type: mp.Type})
+        if mp.Path == target {
+            glog.V(5).Infof("Fake mounter: unmounted %s from %s", mp.Device, target)
+            // Don't copy it to newMountpoints
+            continue
         }
+        newMountpoints = append(newMountpoints, MountPoint{Device: mp.Device, Path: mp.Path, Type: mp.Type})
     }
     f.MountPoints = newMountpoints
     f.Log = append(f.Log, FakeAction{Action: FakeActionUnmount, Target: target})
@@ -59,14 +105,22 @@ func (f *FakeMounter) Unmount(target string) error {
 }
 
 func (f *FakeMounter) List() ([]MountPoint, error) {
+    f.mutex.Lock()
+    defer f.mutex.Unlock()
+
     return f.MountPoints, nil
 }
 
 func (f *FakeMounter) IsLikelyNotMountPoint(file string) (bool, error) {
+    f.mutex.Lock()
+    defer f.mutex.Unlock()
+
     for _, mp := range f.MountPoints {
         if mp.Path == file {
+            glog.V(5).Infof("IsLikelyNotMountPoint for %s: mounted %s, false", file, mp.Path)
             return false, nil
         }
     }
+    glog.V(5).Infof("IsLikelyNotMountPoint for %s: true", file)
     return true, nil
 }
diff --git a/pkg/volume/cinder/cinder.go b/pkg/volume/cinder/cinder.go
index d8c222e837c..0356400bce7 100644
--- a/pkg/volume/cinder/cinder.go
+++ b/pkg/volume/cinder/cinder.go
@@ -28,6 +28,7 @@ import (
     "k8s.io/kubernetes/pkg/cloudprovider/providers/openstack"
     "k8s.io/kubernetes/pkg/types"
     "k8s.io/kubernetes/pkg/util/exec"
+    "k8s.io/kubernetes/pkg/util/keymutex"
     "k8s.io/kubernetes/pkg/util/mount"
     "k8s.io/kubernetes/pkg/util/strings"
     "k8s.io/kubernetes/pkg/volume"
@@ -35,11 +36,13 @@ import (
 
 // This is the primary entrypoint for volume plugins.
 func ProbeVolumePlugins() []volume.VolumePlugin {
-    return []volume.VolumePlugin{&cinderPlugin{nil}}
+    return []volume.VolumePlugin{&cinderPlugin{}}
 }
 
 type cinderPlugin struct {
     host volume.VolumeHost
+    // Guarding SetUp and TearDown operations
+    volumeLocks keymutex.KeyMutex
 }
 
 var _ volume.VolumePlugin = &cinderPlugin{}
@@ -53,6 +56,7 @@ const (
 
 func (plugin *cinderPlugin) Init(host volume.VolumeHost) error {
     plugin.host = host
+    plugin.volumeLocks = keymutex.NewKeyMutex()
     return nil
 }
 
@@ -228,19 +232,27 @@ func (b *cinderVolumeBuilder) SetUp(fsGroup *int64) error {
 
 // SetUp attaches the disk and bind mounts to the volume path.
 func (b *cinderVolumeBuilder) SetUpAt(dir string, fsGroup *int64) error {
+    glog.V(5).Infof("Cinder SetUp %s to %s", b.pdName, dir)
+
+    b.plugin.volumeLocks.LockKey(b.pdName)
+    defer b.plugin.volumeLocks.UnlockKey(b.pdName)
+
     // TODO: handle failed mounts here.
     notmnt, err := b.mounter.IsLikelyNotMountPoint(dir)
-    glog.V(4).Infof("PersistentDisk set up: %s %v %v", dir, !notmnt, err)
     if err != nil && !os.IsNotExist(err) {
+        glog.V(4).Infof("IsLikelyNotMountPoint failed: %v", err)
         return err
     }
     if !notmnt {
+        glog.V(4).Infof("Something is already mounted to target %s", dir)
         return nil
     }
     globalPDPath := makeGlobalPDName(b.plugin.host, b.pdName)
     if err := b.manager.AttachDisk(b, globalPDPath); err != nil {
+        glog.V(4).Infof("AttachDisk failed: %v", err)
         return err
     }
+    glog.V(3).Infof("Cinder volume %s attached", b.pdName)
 
     options := []string{"bind"}
     if b.readOnly {
@@ -249,6 +261,7 @@ func (b *cinderVolumeBuilder) SetUpAt(dir string, fsGroup *int64) error {
 
     if err := os.MkdirAll(dir, 0750); err != nil {
         // TODO: we should really eject the attach/detach out into its own control loop.
+        glog.V(4).Infof("Could not create directory %s: %v", dir, err)
         detachDiskLogError(b.cinderVolume)
         return err
     }
@@ -256,6 +269,7 @@ func (b *cinderVolumeBuilder) SetUpAt(dir string, fsGroup *int64) error {
     // Perform a bind mount to the full path to allow duplicate mounts of the same PD.
     err = b.mounter.Mount(globalPDPath, dir, "", options)
     if err != nil {
+        glog.V(4).Infof("Mount failed: %v", err)
         notmnt, mntErr := b.mounter.IsLikelyNotMountPoint(dir)
         if mntErr != nil {
             glog.Errorf("IsLikelyNotMountPoint check failed: %v", mntErr)
@@ -286,6 +300,7 @@ func (b *cinderVolumeBuilder) SetUpAt(dir string, fsGroup *int64) error {
     if !b.readOnly {
         volume.SetVolumeOwnership(b, fsGroup)
     }
+    glog.V(3).Infof("Cinder volume %s mounted to %s", b.pdName, dir)
 
     return nil
 }
@@ -312,37 +327,65 @@ func (c *cinderVolumeCleaner) TearDown() error {
 
 // Unmounts the bind mount, and detaches the disk only if the PD
 // resource was the last reference to that disk on the kubelet.
 func (c *cinderVolumeCleaner) TearDownAt(dir string) error {
+    glog.V(5).Infof("Cinder TearDown of %s", dir)
     notmnt, err := c.mounter.IsLikelyNotMountPoint(dir)
     if err != nil {
+        glog.V(4).Infof("IsLikelyNotMountPoint check failed: %v", err)
         return err
     }
     if notmnt {
+        glog.V(4).Infof("Nothing is mounted to %s, ignoring", dir)
         return os.Remove(dir)
     }
+
+    // Find the Cinder volumeID to lock the right volume
+    // TODO: refactor VolumePlugin.NewCleaner to get a full volume.Spec just like
+    // NewBuilder. We could then find the volumeID there without probing MountRefs.
     refs, err := mount.GetMountRefs(c.mounter, dir)
     if err != nil {
+        glog.V(4).Infof("GetMountRefs failed: %v", err)
+        return err
+    }
+    if len(refs) == 0 {
+        glog.V(4).Infof("Directory %s is not mounted", dir)
+        return fmt.Errorf("directory %s is not mounted", dir)
+    }
+    c.pdName = path.Base(refs[0])
+    glog.V(4).Infof("Found volume %s mounted to %s", c.pdName, dir)
+
+    // Lock the volume (and thus wait for any concurrent SetUpAt to finish)
+    c.plugin.volumeLocks.LockKey(c.pdName)
+    defer c.plugin.volumeLocks.UnlockKey(c.pdName)
+
+    // Reload the list of references; a SetUpAt may have finished in the meantime
+    refs, err = mount.GetMountRefs(c.mounter, dir)
+    if err != nil {
+        glog.V(4).Infof("GetMountRefs failed: %v", err)
         return err
     }
     if err := c.mounter.Unmount(dir); err != nil {
+        glog.V(4).Infof("Unmount failed: %v", err)
         return err
     }
-    glog.Infof("successfully unmounted: %s\n", dir)
+    glog.V(3).Infof("Successfully unmounted: %s\n", dir)
 
     // If refCount is 1, then all bind mounts have been removed, and the
     // remaining reference is the global mount. It is safe to detach.
     if len(refs) == 1 {
-        c.pdName = path.Base(refs[0])
         if err := c.manager.DetachDisk(c); err != nil {
+            glog.V(4).Infof("DetachDisk failed: %v", err)
             return err
         }
+        glog.V(3).Infof("Volume %s detached", c.pdName)
     }
     notmnt, mntErr := c.mounter.IsLikelyNotMountPoint(dir)
     if mntErr != nil {
         glog.Errorf("IsLikelyNotMountPoint check failed: %v", mntErr)
         return err
     }
-    if !notmnt {
+    if notmnt {
         if err := os.Remove(dir); err != nil {
+            glog.V(4).Infof("Failed to remove directory after unmount: %v", err)
             return err
         }
     }
diff --git a/pkg/volume/cinder/cinder_test.go b/pkg/volume/cinder/cinder_test.go
index 33a426ec480..d138ae920dc 100644
--- a/pkg/volume/cinder/cinder_test.go
+++ b/pkg/volume/cinder/cinder_test.go
@@ -18,9 +18,14 @@ package cinder
 
 import (
     "fmt"
+    "io/ioutil"
     "os"
     "path"
+    "sync/atomic"
     "testing"
+    "time"
+
+    "github.com/golang/glog"
 
     "k8s.io/kubernetes/pkg/api"
     "k8s.io/kubernetes/pkg/api/resource"
@@ -55,20 +60,60 @@ func TestCanSupport(t *testing.T) {
     }
 }
 
-type fakePDManager struct{}
+type fakePDManager struct {
+    // How long AttachDisk/DetachDisk should take - we need a slower AttachDisk in a test.
+    attachDetachDuration time.Duration
+}
+
+func getFakeDeviceName(host volume.VolumeHost, pdName string) string {
+    return path.Join(host.GetPluginDir(cinderVolumePluginName), "device", pdName)
+}
+
+// The real Cinder AttachDisk attaches a cinder volume. If it is not yet mounted,
+// it mounts it to globalPDPath.
+// We create a dummy directory (="device") and bind-mount it to globalPDPath.
 func (fake *fakePDManager) AttachDisk(b *cinderVolumeBuilder, globalPDPath string) error {
     globalPath := makeGlobalPDName(b.plugin.host, b.pdName)
-    err := os.MkdirAll(globalPath, 0750)
+    fakeDeviceName := getFakeDeviceName(b.plugin.host, b.pdName)
+    err := os.MkdirAll(fakeDeviceName, 0750)
     if err != nil {
         return err
     }
+    // Attaching a Cinder volume can be slow...
+    time.Sleep(fake.attachDetachDuration)
+
+    // The volume is "attached", bind-mount it if it's not mounted yet.
+    notmnt, err := b.mounter.IsLikelyNotMountPoint(globalPath)
+    if err != nil {
+        if os.IsNotExist(err) {
+            if err := os.MkdirAll(globalPath, 0750); err != nil {
+                return err
+            }
+            notmnt = true
+        } else {
+            return err
+        }
+    }
+    if notmnt {
+        err = b.mounter.Mount(fakeDeviceName, globalPath, "", []string{"bind"})
+        if err != nil {
+            return err
+        }
+    }
     return nil
 }
 
 func (fake *fakePDManager) DetachDisk(c *cinderVolumeCleaner) error {
     globalPath := makeGlobalPDName(c.plugin.host, c.pdName)
-    err := os.RemoveAll(globalPath)
+    fakeDeviceName := getFakeDeviceName(c.plugin.host, c.pdName)
+    // unmount the bind-mount - should be fast
+    err := c.mounter.Unmount(globalPath)
+    if err != nil {
+        return err
+    }
+
+    // "Detach" the fake "device"
+    err = os.RemoveAll(fakeDeviceName)
     if err != nil {
         return err
     }
@@ -108,7 +153,7 @@ func TestPlugin(t *testing.T) {
             },
         },
     }
-    builder, err := plug.(*cinderPlugin).newBuilderInternal(volume.NewSpecFromVolume(spec), types.UID("poduid"), &fakePDManager{}, &mount.FakeMounter{})
+    builder, err := plug.(*cinderPlugin).newBuilderInternal(volume.NewSpecFromVolume(spec), types.UID("poduid"), &fakePDManager{0}, &mount.FakeMounter{})
     if err != nil {
         t.Errorf("Failed to make a new Builder: %v", err)
     }
@@ -139,7 +184,7 @@ func TestPlugin(t *testing.T) {
         }
     }
 
-    cleaner, err := plug.(*cinderPlugin).newCleanerInternal("vol1", types.UID("poduid"), &fakePDManager{}, &mount.FakeMounter{})
+    cleaner, err := plug.(*cinderPlugin).newCleanerInternal("vol1", types.UID("poduid"), &fakePDManager{0}, &mount.FakeMounter{})
     if err != nil {
         t.Errorf("Failed to make a new Cleaner: %v", err)
     }
@@ -165,14 +210,14 @@ func TestPlugin(t *testing.T) {
         },
         PersistentVolumeReclaimPolicy: api.PersistentVolumeReclaimDelete,
     }
-    provisioner, err := plug.(*cinderPlugin).newProvisionerInternal(options, &fakePDManager{})
+    provisioner, err := plug.(*cinderPlugin).newProvisionerInternal(options, &fakePDManager{0})
     persistentSpec, err := provisioner.NewPersistentVolumeTemplate()
     if err != nil {
         t.Errorf("NewPersistentVolumeTemplate() failed: %v", err)
     }
 
     // get 2nd Provisioner - persistent volume controller will do the same
-    provisioner, err = plug.(*cinderPlugin).newProvisionerInternal(options, &fakePDManager{})
+    provisioner, err = plug.(*cinderPlugin).newProvisionerInternal(options, &fakePDManager{0})
     err = provisioner.Provision(persistentSpec)
     if err != nil {
         t.Errorf("Provision() failed: %v", err)
@@ -191,9 +236,121 @@ func TestPlugin(t *testing.T) {
     volSpec := &volume.Spec{
         PersistentVolume: persistentSpec,
     }
-    deleter, err := plug.(*cinderPlugin).newDeleterInternal(volSpec, &fakePDManager{})
+    deleter, err := plug.(*cinderPlugin).newDeleterInternal(volSpec, &fakePDManager{0})
     err = deleter.Delete()
     if err != nil {
         t.Errorf("Deleter() failed: %v", err)
     }
 }
+
+// Test a race when a volume is set up and torn down simultaneously
+func TestAttachDetachRace(t *testing.T) {
+    tmpDir, err := ioutil.TempDir(os.TempDir(), "cinderTest")
+    if err != nil {
+        t.Fatalf("can't make a temp dir: %v", err)
+    }
+    defer os.RemoveAll(tmpDir)
+    plugMgr := volume.VolumePluginMgr{}
+    host := volume.NewFakeVolumeHost(tmpDir, nil, nil)
+    plugMgr.InitPlugins(ProbeVolumePlugins(), host)
+
+    plug, err := plugMgr.FindPluginByName("kubernetes.io/cinder")
+    if err != nil {
+        t.Errorf("Can't find the plugin by name")
+    }
+    spec := &api.Volume{
+        Name: "vol1",
+        VolumeSource: api.VolumeSource{
+            Cinder: &api.CinderVolumeSource{
+                VolumeID: "pd",
+                FSType:   "ext4",
+            },
+        },
+    }
+    fakeMounter := &mount.FakeMounter{}
+    // SetUp the volume for the 1st time
+    builder, err := plug.(*cinderPlugin).newBuilderInternal(volume.NewSpecFromVolume(spec), types.UID("poduid"), &fakePDManager{time.Second}, fakeMounter)
+    if err != nil {
+        t.Errorf("Failed to make a new Builder: %v", err)
+    }
+    if builder == nil {
+        t.Errorf("Got a nil Builder")
+    }
+
+    if err := builder.SetUp(nil); err != nil {
+        t.Errorf("Expected success, got: %v", err)
+    }
+    path := builder.GetPath()
+
+    // TearDown the 1st volume and SetUp the 2nd volume (for a different pod) at the same time
+    builder, err = plug.(*cinderPlugin).newBuilderInternal(volume.NewSpecFromVolume(spec), types.UID("poduid2"), &fakePDManager{time.Second}, fakeMounter)
+    if err != nil {
+        t.Errorf("Failed to make a new Builder: %v", err)
+    }
+    if builder == nil {
+        t.Errorf("Got a nil Builder")
+    }
+
+    cleaner, err := plug.(*cinderPlugin).newCleanerInternal("vol1", types.UID("poduid"), &fakePDManager{time.Second}, fakeMounter)
+    if err != nil {
+        t.Errorf("Failed to make a new Cleaner: %v", err)
+    }
+
+    var buildComplete uint32 = 0
+
+    go func() {
+        glog.Infof("Attaching volume")
+        if err := builder.SetUp(nil); err != nil {
+            t.Errorf("Expected success, got: %v", err)
+        }
+        glog.Infof("Volume attached")
+        atomic.AddUint32(&buildComplete, 1)
+    }()
+
+    // The builder is attaching the volume, which takes 1 second. Detach it in the middle of this interval.
+    time.Sleep(time.Second / 2)
+
+    glog.Infof("Detaching volume")
+    if err = cleaner.TearDown(); err != nil {
+        t.Errorf("Expected success, got: %v", err)
+    }
+    glog.Infof("Volume detached")
+
+    // wait for the builder to finish
+    for atomic.LoadUint32(&buildComplete) == 0 {
+        time.Sleep(time.Millisecond * 100)
+    }
+
+    // The volume should still be attached
+    devicePath := getFakeDeviceName(host, "pd")
+    if _, err := os.Stat(devicePath); err != nil {
+        if os.IsNotExist(err) {
+            t.Errorf("SetUp() failed, volume detached by simultaneous TearDown: %s", path)
+        } else {
+            t.Errorf("SetUp() failed: %v", err)
+        }
+    }
+
+    // TearDown the 2nd volume
+    cleaner, err = plug.(*cinderPlugin).newCleanerInternal("vol1", types.UID("poduid2"), &fakePDManager{0}, fakeMounter)
+    if err != nil {
+        t.Errorf("Failed to make a new Cleaner: %v", err)
+    }
+    if cleaner == nil {
+        t.Errorf("Got a nil Cleaner")
+    }
+
+    if err := cleaner.TearDown(); err != nil {
+        t.Errorf("Expected success, got: %v", err)
+    }
+    if _, err := os.Stat(path); err == nil {
+        t.Errorf("TearDown() failed, volume path still exists: %s", path)
+    } else if !os.IsNotExist(err) {
+        t.Errorf("TearDown() failed: %v", err)
+    }
+    if _, err := os.Stat(devicePath); err == nil {
+        t.Errorf("TearDown() failed, volume is still attached: %s", devicePath)
+    } else if !os.IsNotExist(err) {
+        t.Errorf("TearDown() failed: %v", err)
+    }
+}
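
Note on the locking pattern (an illustrative reviewer sketch, not part of the patch):
the fix serializes SetUpAt() and TearDownAt() per volume by locking on the Cinder
volume name, using k8s.io/kubernetes/pkg/util/keymutex. The self-contained Go program
below shows the same idea with a map of per-key mutexes; the names volumeLocks, lock,
unlock, setUpAt and tearDownAt are hypothetical and chosen only for illustration, they
are not the real keymutex API.

package main

import (
    "fmt"
    "sync"
    "time"
)

// volumeLocks serializes operations per volume name, similar in spirit to the
// keymutex used by the patch.
type volumeLocks struct {
    mu    sync.Mutex
    locks map[string]*sync.Mutex
}

func newVolumeLocks() *volumeLocks {
    return &volumeLocks{locks: map[string]*sync.Mutex{}}
}

// lock acquires the mutex belonging to the given volume, creating it on first use.
func (v *volumeLocks) lock(volume string) {
    v.mu.Lock()
    m, ok := v.locks[volume]
    if !ok {
        m = &sync.Mutex{}
        v.locks[volume] = m
    }
    v.mu.Unlock()
    m.Lock()
}

// unlock releases the mutex belonging to the given volume. It assumes lock()
// was called for that volume before.
func (v *volumeLocks) unlock(volume string) {
    v.mu.Lock()
    m := v.locks[volume]
    v.mu.Unlock()
    m.Unlock()
}

func main() {
    locks := newVolumeLocks()
    var wg sync.WaitGroup

    // setUpAt and tearDownAt stand in for the plugin methods; while one of them
    // holds the lock for "myVolume", the other blocks until it is released.
    setUpAt := func() {
        defer wg.Done()
        locks.lock("myVolume")
        defer locks.unlock("myVolume")
        fmt.Println("SetUpAt: attach + mount")
        time.Sleep(100 * time.Millisecond) // simulate a slow attach
    }
    tearDownAt := func() {
        defer wg.Done()
        locks.lock("myVolume")
        defer locks.unlock("myVolume")
        fmt.Println("TearDownAt: re-check mount refs, then unmount/detach")
    }

    wg.Add(2)
    go setUpAt()
    go tearDownAt()
    wg.Wait()
}

Because both operations take the same per-volume lock, TearDownAt() re-reads the mount
references only after any in-flight SetUpAt() has finished, so it can no longer detach
a volume that a starting pod has just attached.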