Fixed races in Cinder volume attach/detach.

Add a mutex to guard SetUpAt() and TearDownAt() calls so they cannot run
in parallel. There is a race between these calls when two pods use the same
volume: one of them is dying while the other one is starting.

TearDownAt() checks whether the volume is still needed by any other pod and,
if not, detaches it. It does so by counting how many times the volume is
mounted (the GetMountRefs() call below).

When SetUpAt() of the starting pod has already attached the volume but has not
mounted it yet, TearDownAt() of the dying pod will detach it, because
GetMountRefs() does not yet account for this volume.

These two threads run in parallel:

 dying pod.TearDownAt("myVolume")          starting pod.SetUpAt("myVolume")
   |                                       |
   |                                       AttachDisk("myVolume")
   refs, err := mount.GetMountRefs()       |
   Unmount("myDir")                        |
   if refs == 1 {                          |
   |  |                                    Mount("myVolume", "myDir")
   |  |                                    |
   |  DetachDisk("myVolume")               |
   |                                       start containers - OOPS! The volume is detached!
   |
   finish the pod cleanup
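
The fix serializes the two paths with a per-volume lock. Below is a minimal
sketch of that pattern (not part of the diff), using the keymutex package the
plugin adopts; the plugin, newPlugin, setUpAt and tearDownAt names here are
illustrative, not the real plugin API:

package example

import "k8s.io/kubernetes/pkg/util/keymutex"

// Illustrative only; the real cinder plugin keeps the same kind of lock.
type plugin struct {
    volumeLocks keymutex.KeyMutex // one logical lock per volume name
}

func newPlugin() *plugin {
    return &plugin{volumeLocks: keymutex.NewKeyMutex()}
}

func (p *plugin) setUpAt(volumeName string) {
    p.volumeLocks.LockKey(volumeName)
    defer p.volumeLocks.UnlockKey(volumeName)
    // AttachDisk() and Mount() run here with the per-volume lock held.
}

func (p *plugin) tearDownAt(volumeName string) {
    p.volumeLocks.LockKey(volumeName)
    defer p.volumeLocks.UnlockKey(volumeName)
    // GetMountRefs(), Unmount() and the conditional DetachDisk() run here, so
    // the reference count can no longer change under a concurrent setUpAt() of
    // the same volume; different volumes still proceed in parallel.
}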


Also, add some logs to the cinder plugin for easier debugging in the future,
add a test, and update the fake mounter to know about bind mounts.
Jan Safranek 2016-02-02 14:38:49 +01:00
parent f7cab1fd3e
commit 220163f67d
3 changed files with 269 additions and 15 deletions

Changed file: fake mounter (package mount)

@@ -16,10 +16,19 @@ limitations under the License.
package mount
import (
"sync"
"github.com/golang/glog"
)
// FakeMounter implements mount.Interface for tests.
type FakeMounter struct {
MountPoints []MountPoint
Log []FakeAction
// Some tests run things in parallel; make sure the mounter does not produce
// any of Go's DATA RACE warnings.
mutex sync.Mutex
}
var _ Interface = &FakeMounter{}
@@ -37,21 +46,58 @@ type FakeAction struct {
}
func (f *FakeMounter) ResetLog() {
f.mutex.Lock()
defer f.mutex.Unlock()
f.Log = []FakeAction{}
}
func (f *FakeMounter) Mount(source string, target string, fstype string, options []string) error {
f.mutex.Lock()
defer f.mutex.Unlock()
// find 'bind' option
for _, option := range options {
if option == "bind" {
// This is a bind-mount. In order to mimic Linux behaviour, we must
// use the original device of the bind-mount as the real source.
// E.g. when /dev/sda is mounted like this:
// $ mount /dev/sda /mnt/test
// $ mount -o bind /mnt/test /mnt/bound
// then /proc/mounts contains:
// /dev/sda /mnt/test
// /dev/sda /mnt/bound
// (and not /mnt/test /mnt/bound)
// I.e. we must use /dev/sda as source instead of /mnt/test in the
// bind mount.
for _, mnt := range f.MountPoints {
if source == mnt.Path {
source = mnt.Device
break
}
}
break
}
}
f.MountPoints = append(f.MountPoints, MountPoint{Device: source, Path: target, Type: fstype})
glog.V(5).Infof("Fake mounter: mouted %s to %s", source, target)
f.Log = append(f.Log, FakeAction{Action: FakeActionMount, Target: target, Source: source, FSType: fstype})
return nil
}
func (f *FakeMounter) Unmount(target string) error {
f.mutex.Lock()
defer f.mutex.Unlock()
newMountpoints := []MountPoint{}
for _, mp := range f.MountPoints {
-if mp.Path != target {
-newMountpoints = append(newMountpoints, MountPoint{Device: mp.Device, Path: mp.Path, Type: mp.Type})
+if mp.Path == target {
+glog.V(5).Infof("Fake mounter: unmounted %s from %s", mp.Device, target)
+// Don't copy it to newMountpoints
+continue
}
+newMountpoints = append(newMountpoints, MountPoint{Device: mp.Device, Path: mp.Path, Type: mp.Type})
}
f.MountPoints = newMountpoints
f.Log = append(f.Log, FakeAction{Action: FakeActionUnmount, Target: target})
@@ -59,14 +105,22 @@ func (f *FakeMounter) Unmount(target string) error {
}
func (f *FakeMounter) List() ([]MountPoint, error) {
f.mutex.Lock()
defer f.mutex.Unlock()
return f.MountPoints, nil
}
func (f *FakeMounter) IsLikelyNotMountPoint(file string) (bool, error) {
f.mutex.Lock()
defer f.mutex.Unlock()
for _, mp := range f.MountPoints {
if mp.Path == file {
glog.V(5).Infof("isLikelyMountPoint for %s: monted %s, false", file, mp.Path)
return false, nil
}
}
glog.V(5).Infof("isLikelyMountPoint for %s: true", file)
return true, nil
}
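
As a usage sketch (not part of the diff), the bind handling in Mount() above
makes the fake record a bind mount under its original device, which is what
GetMountRefs() later relies on; the test name and paths below are made up:

package mount_test

import (
    "testing"

    "k8s.io/kubernetes/pkg/util/mount"
)

func TestFakeBindMountKeepsDevice(t *testing.T) {
    m := &mount.FakeMounter{}

    // A "real" mount of a device, then a bind mount of the resulting directory.
    m.Mount("/dev/sda", "/mnt/test", "ext4", nil)
    m.Mount("/mnt/test", "/mnt/bound", "", []string{"bind"})

    mps, _ := m.List()
    for _, mp := range mps {
        // As on Linux, both entries should report /dev/sda as their device.
        if mp.Device != "/dev/sda" {
            t.Errorf("expected device /dev/sda for %s, got %s", mp.Path, mp.Device)
        }
    }
}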

Changed file: Cinder volume plugin

@@ -28,6 +28,7 @@ import (
"k8s.io/kubernetes/pkg/cloudprovider/providers/openstack"
"k8s.io/kubernetes/pkg/types"
"k8s.io/kubernetes/pkg/util/exec"
"k8s.io/kubernetes/pkg/util/keymutex"
"k8s.io/kubernetes/pkg/util/mount"
"k8s.io/kubernetes/pkg/util/strings"
"k8s.io/kubernetes/pkg/volume"
@@ -35,11 +36,13 @@ import (
// This is the primary entrypoint for volume plugins.
func ProbeVolumePlugins() []volume.VolumePlugin {
return []volume.VolumePlugin{&cinderPlugin{nil}}
return []volume.VolumePlugin{&cinderPlugin{}}
}
type cinderPlugin struct {
host volume.VolumeHost
// Guarding SetUp and TearDown operations
volumeLocks keymutex.KeyMutex
}
var _ volume.VolumePlugin = &cinderPlugin{}
@@ -53,6 +56,7 @@ const (
func (plugin *cinderPlugin) Init(host volume.VolumeHost) error {
plugin.host = host
plugin.volumeLocks = keymutex.NewKeyMutex()
return nil
}
@@ -228,19 +232,27 @@ func (b *cinderVolumeBuilder) SetUp(fsGroup *int64) error {
// SetUp attaches the disk and bind mounts to the volume path.
func (b *cinderVolumeBuilder) SetUpAt(dir string, fsGroup *int64) error {
glog.V(5).Infof("Cinder SetUp %s to %s", b.pdName, dir)
b.plugin.volumeLocks.LockKey(b.pdName)
defer b.plugin.volumeLocks.UnlockKey(b.pdName)
// TODO: handle failed mounts here.
notmnt, err := b.mounter.IsLikelyNotMountPoint(dir)
glog.V(4).Infof("PersistentDisk set up: %s %v %v", dir, !notmnt, err)
if err != nil && !os.IsNotExist(err) {
glog.V(4).Infof("IsLikelyNotMountPoint failed: %v", err)
return err
}
if !notmnt {
glog.V(4).Infof("Something is already mounted to target %s", dir)
return nil
}
globalPDPath := makeGlobalPDName(b.plugin.host, b.pdName)
if err := b.manager.AttachDisk(b, globalPDPath); err != nil {
glog.V(4).Infof("AttachDisk failed: %v", err)
return err
}
glog.V(3).Infof("Cinder volume %s attached", b.pdName)
options := []string{"bind"}
if b.readOnly {
@@ -249,6 +261,7 @@ func (b *cinderVolumeBuilder) SetUpAt(dir string, fsGroup *int64) error {
if err := os.MkdirAll(dir, 0750); err != nil {
// TODO: we should really eject the attach/detach out into its own control loop.
glog.V(4).Infof("Could not create directory %s: %v", dir, err)
detachDiskLogError(b.cinderVolume)
return err
}
@@ -256,6 +269,7 @@ func (b *cinderVolumeBuilder) SetUpAt(dir string, fsGroup *int64) error {
// Perform a bind mount to the full path to allow duplicate mounts of the same PD.
err = b.mounter.Mount(globalPDPath, dir, "", options)
if err != nil {
glog.V(4).Infof("Mount failed: %v", err)
notmnt, mntErr := b.mounter.IsLikelyNotMountPoint(dir)
if mntErr != nil {
glog.Errorf("IsLikelyNotMountPoint check failed: %v", mntErr)
@@ -286,6 +300,7 @@ func (b *cinderVolumeBuilder) SetUpAt(dir string, fsGroup *int64) error {
if !b.readOnly {
volume.SetVolumeOwnership(b, fsGroup)
}
glog.V(3).Infof("Cinder volume %s mounted to %s", b.pdName, dir)
return nil
}
@@ -312,37 +327,65 @@ func (c *cinderVolumeCleaner) TearDown() error {
// Unmounts the bind mount, and detaches the disk only if the PD
// resource was the last reference to that disk on the kubelet.
func (c *cinderVolumeCleaner) TearDownAt(dir string) error {
glog.V(5).Infof("Cinder TearDown of %s", dir)
notmnt, err := c.mounter.IsLikelyNotMountPoint(dir)
if err != nil {
glog.V(4).Infof("IsLikelyNotMountPoint check failed: %v", err)
return err
}
if notmnt {
glog.V(4).Infof("Nothing is mounted to %s, ignoring", dir)
return os.Remove(dir)
}
// Find Cinder volumeID to lock the right volume
// TODO: refactor VolumePlugin.NewCleaner to get full volume.Spec just like
// NewBuilder. We could then find volumeID there without probing MountRefs.
refs, err := mount.GetMountRefs(c.mounter, dir)
if err != nil {
glog.V(4).Infof("GetMountRefs failed: %v", err)
return err
}
if len(refs) == 0 {
glog.V(4).Infof("Directory %s is not mounted", dir)
return fmt.Errorf("directory %s is not mounted", dir)
}
c.pdName = path.Base(refs[0])
glog.V(4).Infof("Found volume %s mounted to %s", c.pdName, dir)
// lock the volume (and thus wait for any concurrent SetUpAt to finish)
c.plugin.volumeLocks.LockKey(c.pdName)
defer c.plugin.volumeLocks.UnlockKey(c.pdName)
// Reload the list of references; a SetUpAt() might have finished in the meantime
refs, err = mount.GetMountRefs(c.mounter, dir)
if err != nil {
glog.V(4).Infof("GetMountRefs failed: %v", err)
return err
}
if err := c.mounter.Unmount(dir); err != nil {
glog.V(4).Infof("Unmount failed: %v", err)
return err
}
glog.Infof("successfully unmounted: %s\n", dir)
glog.V(3).Infof("Successfully unmounted: %s\n", dir)
// If refCount is 1, then all bind mounts have been removed, and the
// remaining reference is the global mount. It is safe to detach.
if len(refs) == 1 {
c.pdName = path.Base(refs[0])
if err := c.manager.DetachDisk(c); err != nil {
glog.V(4).Infof("DetachDisk failed: %v", err)
return err
}
glog.V(3).Infof("Volume %s detached", c.pdName)
}
notmnt, mntErr := c.mounter.IsLikelyNotMountPoint(dir)
if mntErr != nil {
glog.Errorf("IsLikelyNotMountPoint check failed: %v", mntErr)
return err
}
-if !notmnt {
+if notmnt {
if err := os.Remove(dir); err != nil {
glog.V(4).Infof("Failed to remove directory after unmount: %v", err)
return err
}
}
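
To make the refCount reasoning concrete, here is a sketch (not part of the
diff) of the mount layout this code relies on, exercised with the fake mounter.
The paths and the demoRefCounting name are illustrative, and it assumes
GetMountRefs() returns the other mount points that share the volume's device:

package example

import (
    "fmt"

    "k8s.io/kubernetes/pkg/util/mount"
)

func demoRefCounting() {
    m := &mount.FakeMounter{}

    // AttachDisk() mounts the device once under the plugin's global path...
    m.Mount("/dev/vdb", "/plugins/kubernetes.io~cinder/mounts/myVolume", "ext4", nil)
    // ...and every pod using the volume gets a bind mount of that global path.
    m.Mount("/plugins/kubernetes.io~cinder/mounts/myVolume", "/pods/poduid1/volumes/vol1", "", []string{"bind"})
    m.Mount("/plugins/kubernetes.io~cinder/mounts/myVolume", "/pods/poduid2/volumes/vol1", "", []string{"bind"})

    // Seen from pod1's directory: the global mount plus pod2's bind mount.
    refs, _ := mount.GetMountRefs(m, "/pods/poduid1/volumes/vol1")
    fmt.Println(len(refs)) // 2 -> another pod still uses the volume, do not detach

    // After pod2 is torn down, only the global mount remains.
    m.Unmount("/pods/poduid2/volumes/vol1")
    refs, _ = mount.GetMountRefs(m, "/pods/poduid1/volumes/vol1")
    fmt.Println(len(refs)) // 1 -> safe to Unmount() the dir and DetachDisk()
}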

Changed file: Cinder plugin tests (package cinder)

@@ -18,9 +18,14 @@ package cinder
import (
"fmt"
"io/ioutil"
"os"
"path"
"sync/atomic"
"testing"
"time"
"github.com/golang/glog"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/resource"
@@ -55,20 +60,60 @@ func TestCanSupport(t *testing.T) {
}
}
-type fakePDManager struct{}
+type fakePDManager struct {
+// How long AttachDisk/DetachDisk should take - we need a slower AttachDisk in one test.
+attachDetachDuration time.Duration
+}
func getFakeDeviceName(host volume.VolumeHost, pdName string) string {
return path.Join(host.GetPluginDir(cinderVolumePluginName), "device", pdName)
}
// Real Cinder AttachDisk attaches a cinder volume. If it is not yet mounted,
// it mounts it to globalPDPath.
// We create a dummy directory (="device") and bind-mount it to globalPDPath
func (fake *fakePDManager) AttachDisk(b *cinderVolumeBuilder, globalPDPath string) error {
globalPath := makeGlobalPDName(b.plugin.host, b.pdName)
-err := os.MkdirAll(globalPath, 0750)
+fakeDeviceName := getFakeDeviceName(b.plugin.host, b.pdName)
+err := os.MkdirAll(fakeDeviceName, 0750)
if err != nil {
return err
}
// Attaching a Cinder volume can be slow...
time.Sleep(fake.attachDetachDuration)
// The volume is "attached", bind-mount it if it's not mounted yet.
notmnt, err := b.mounter.IsLikelyNotMountPoint(globalPath)
if err != nil {
if os.IsNotExist(err) {
if err := os.MkdirAll(globalPath, 0750); err != nil {
return err
}
notmnt = true
} else {
return err
}
}
if notmnt {
err = b.mounter.Mount(fakeDeviceName, globalPath, "", []string{"bind"})
if err != nil {
return err
}
}
return nil
}
func (fake *fakePDManager) DetachDisk(c *cinderVolumeCleaner) error {
globalPath := makeGlobalPDName(c.plugin.host, c.pdName)
-err := os.RemoveAll(globalPath)
+fakeDeviceName := getFakeDeviceName(c.plugin.host, c.pdName)
+// unmount the bind-mount - should be fast
+err := c.mounter.Unmount(globalPath)
if err != nil {
return err
}
// "Detach" the fake "device"
err = os.RemoveAll(fakeDeviceName)
if err != nil {
return err
}
@@ -108,7 +153,7 @@ func TestPlugin(t *testing.T) {
},
},
}
-builder, err := plug.(*cinderPlugin).newBuilderInternal(volume.NewSpecFromVolume(spec), types.UID("poduid"), &fakePDManager{}, &mount.FakeMounter{})
+builder, err := plug.(*cinderPlugin).newBuilderInternal(volume.NewSpecFromVolume(spec), types.UID("poduid"), &fakePDManager{0}, &mount.FakeMounter{})
if err != nil {
t.Errorf("Failed to make a new Builder: %v", err)
}
@@ -139,7 +184,7 @@ func TestPlugin(t *testing.T) {
}
}
-cleaner, err := plug.(*cinderPlugin).newCleanerInternal("vol1", types.UID("poduid"), &fakePDManager{}, &mount.FakeMounter{})
+cleaner, err := plug.(*cinderPlugin).newCleanerInternal("vol1", types.UID("poduid"), &fakePDManager{0}, &mount.FakeMounter{})
if err != nil {
t.Errorf("Failed to make a new Cleaner: %v", err)
}
@@ -165,14 +210,14 @@ func TestPlugin(t *testing.T) {
},
PersistentVolumeReclaimPolicy: api.PersistentVolumeReclaimDelete,
}
-provisioner, err := plug.(*cinderPlugin).newProvisionerInternal(options, &fakePDManager{})
+provisioner, err := plug.(*cinderPlugin).newProvisionerInternal(options, &fakePDManager{0})
persistentSpec, err := provisioner.NewPersistentVolumeTemplate()
if err != nil {
t.Errorf("NewPersistentVolumeTemplate() failed: %v", err)
}
// get 2nd Provisioner - persistent volume controller will do the same
-provisioner, err = plug.(*cinderPlugin).newProvisionerInternal(options, &fakePDManager{})
+provisioner, err = plug.(*cinderPlugin).newProvisionerInternal(options, &fakePDManager{0})
err = provisioner.Provision(persistentSpec)
if err != nil {
t.Errorf("Provision() failed: %v", err)
@@ -191,9 +236,121 @@ func TestPlugin(t *testing.T) {
volSpec := &volume.Spec{
PersistentVolume: persistentSpec,
}
-deleter, err := plug.(*cinderPlugin).newDeleterInternal(volSpec, &fakePDManager{})
+deleter, err := plug.(*cinderPlugin).newDeleterInternal(volSpec, &fakePDManager{0})
err = deleter.Delete()
if err != nil {
t.Errorf("Deleter() failed: %v", err)
}
}
// Test a race when SetUp and TearDown of the same volume run simultaneously
func TestAttachDetachRace(t *testing.T) {
tmpDir, err := ioutil.TempDir(os.TempDir(), "cinderTest")
if err != nil {
t.Fatalf("can't make a temp dir: %v", err)
}
defer os.RemoveAll(tmpDir)
plugMgr := volume.VolumePluginMgr{}
host := volume.NewFakeVolumeHost(tmpDir, nil, nil)
plugMgr.InitPlugins(ProbeVolumePlugins(), host)
plug, err := plugMgr.FindPluginByName("kubernetes.io/cinder")
if err != nil {
t.Errorf("Can't find the plugin by name")
}
spec := &api.Volume{
Name: "vol1",
VolumeSource: api.VolumeSource{
Cinder: &api.CinderVolumeSource{
VolumeID: "pd",
FSType: "ext4",
},
},
}
fakeMounter := &mount.FakeMounter{}
// SetUp the volume for 1st time
builder, err := plug.(*cinderPlugin).newBuilderInternal(volume.NewSpecFromVolume(spec), types.UID("poduid"), &fakePDManager{time.Second}, fakeMounter)
if err != nil {
t.Errorf("Failed to make a new Builder: %v", err)
}
if builder == nil {
t.Errorf("Got a nil Builder: %v")
}
if err := builder.SetUp(nil); err != nil {
t.Errorf("Expected success, got: %v", err)
}
path := builder.GetPath()
// TearDown the 1st volume and SetUp the 2nd volume (to different pod) at the same time
builder, err = plug.(*cinderPlugin).newBuilderInternal(volume.NewSpecFromVolume(spec), types.UID("poduid2"), &fakePDManager{time.Second}, fakeMounter)
if err != nil {
t.Errorf("Failed to make a new Builder: %v", err)
}
if builder == nil {
t.Errorf("Got a nil Builder: %v")
}
cleaner, err := plug.(*cinderPlugin).newCleanerInternal("vol1", types.UID("poduid"), &fakePDManager{time.Second}, fakeMounter)
if err != nil {
t.Errorf("Failed to make a new Cleaner: %v", err)
}
var buildComplete uint32 = 0
go func() {
glog.Infof("Attaching volume")
if err := builder.SetUp(nil); err != nil {
t.Errorf("Expected success, got: %v", err)
}
glog.Infof("Volume attached")
atomic.AddUint32(&buildComplete, 1)
}()
// builder is attaching the volume, which takes 1 second. Detach it in the middle of this interval
time.Sleep(time.Second / 2)
glog.Infof("Detaching volume")
if err = cleaner.TearDown(); err != nil {
t.Errorf("Expected success, got: %v", err)
}
glog.Infof("Volume detached")
// wait for the builder to finish
for atomic.LoadUint32(&buildComplete) == 0 {
time.Sleep(time.Millisecond * 100)
}
// The volume should still be attached
devicePath := getFakeDeviceName(host, "pd")
if _, err := os.Stat(devicePath); err != nil {
if os.IsNotExist(err) {
t.Errorf("SetUp() failed, volume detached by simultaneous TearDown: %s", path)
} else {
t.Errorf("SetUp() failed: %v", err)
}
}
// TearDown the 2nd volume
cleaner, err = plug.(*cinderPlugin).newCleanerInternal("vol1", types.UID("poduid2"), &fakePDManager{0}, fakeMounter)
if err != nil {
t.Errorf("Failed to make a new Cleaner: %v", err)
}
if cleaner == nil {
t.Errorf("Got a nil Cleaner: %v")
}
if err := cleaner.TearDown(); err != nil {
t.Errorf("Expected success, got: %v", err)
}
if _, err := os.Stat(path); err == nil {
t.Errorf("TearDown() failed, volume path still exists: %s", path)
} else if !os.IsNotExist(err) {
t.Errorf("SetUp() failed: %v", err)
}
if _, err := os.Stat(devicePath); err == nil {
t.Errorf("TearDown() failed, volume is still attached: %s", devicePath)
} else if !os.IsNotExist(err) {
t.Errorf("SetUp() failed: %v", err)
}
}