RBD Plugin: Pass monitor addresses in a comma-separated list instead of

trying them one by one.

In production, monitors may crash (or have network problems). If we try
monitors one by one, the rbd command hangs for a long time whenever it is
given an unconnectable monitor (e.g. `rbd map -m <unconnectable_host_ip>`
on linux 4.4 timed out after 6 minutes). This is unacceptable.

Actually, we can simply pass a comma-separated list of monitor addresses
to the `rbd` command utility. The kernel rbd/libceph modules will pick a
monitor randomly and try them one by one, so the `rbd` command utility
succeeds quickly if there is a good one in the monitors list.
This commit is contained in:
Yecheng Fu 2017-12-21 20:11:36 +08:00
parent 754bb1350f
commit 08d1c0b412

View File

@ -25,7 +25,6 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"io/ioutil" "io/ioutil"
"math/rand"
"os" "os"
"os/exec" "os/exec"
"path" "path"
@ -133,6 +132,17 @@ func rbdErrors(runErr, resultErr error) error {
return resultErr return resultErr
} }
// kernelRBDMonitorsOpt builds the value handed to the 'rbd' utility's '-m'
// option by joining the given monitor addresses with commas.
//
// The 'rbd' utility simply passes the '-m <mon>' parameter on to the kernel
// rbd/libceph modules, which have taken a comma-separated list of one or more
// monitor addresses (e.g. ip1[:port1][,ip2[:port2]...]) since their first
// version in linux (see
// https://github.com/torvalds/linux/blob/602adf400201636e95c3fed9f31fba54a3d7e844/net/ceph/ceph_common.c#L239).
// libceph also chooses a monitor randomly, so all addresses can be passed
// as they are, without randomizing them here (see
// https://github.com/torvalds/linux/blob/602adf400201636e95c3fed9f31fba54a3d7e844/net/ceph/mon_client.c#L132).
func (util *RBDUtil) kernelRBDMonitorsOpt(mons []string) string {
	const separator = ","
	monitorList := strings.Join(mons, separator)
	return monitorList
}
// rbdLock acquires a lock on image if lock is true, otherwise releases if a // rbdLock acquires a lock on image if lock is true, otherwise releases if a
// lock is found on image. // lock is found on image.
func (util *RBDUtil) rbdLock(b rbdMounter, lock bool) error { func (util *RBDUtil) rbdLock(b rbdMounter, lock bool) error {
@ -156,88 +166,79 @@ func (util *RBDUtil) rbdLock(b rbdMounter, lock bool) error {
// construct lock id using host name and a magic prefix // construct lock id using host name and a magic prefix
lock_id := kubeLockMagic + node.GetHostname("") lock_id := kubeLockMagic + node.GetHostname("")
l := len(b.Mon) mon := util.kernelRBDMonitorsOpt(b.Mon)
// avoid mount storm, pick a host randomly
start := rand.Int() % l // cmd "rbd lock list" serves two purposes:
// iterate all hosts until mount succeeds. // for fencing, check if lock already held for this host
for i := start; i < start+l; i++ { // this edge case happens if host crashes in the middle of acquiring lock and mounting rbd
mon := b.Mon[i%l] // for defencing, get the locker name, something like "client.1234"
// cmd "rbd lock list" serves two purposes: args := []string{"lock", "list", b.Image, "--pool", b.Pool, "--id", b.Id, "-m", mon}
// for fencing, check if lock already held for this host args = append(args, secret_opt...)
// this edge case happens if host crashes in the middle of acquiring lock and mounting rbd cmd, err = b.exec.Run("rbd", args...)
// for defencing, get the locker name, something like "client.1234" output = string(cmd)
args := []string{"lock", "list", b.Image, "--pool", b.Pool, "--id", b.Id, "-m", mon} glog.Infof("lock list output %q", output)
args = append(args, secret_opt...) if err != nil {
cmd, err = b.exec.Run("rbd", args...) return err
output = string(cmd) }
glog.Infof("lock list output %q", output)
if err != nil { if lock {
continue // check if lock is already held for this host by matching lock_id and rbd lock id
if strings.Contains(output, lock_id) {
// this host already holds the lock, exit
glog.V(1).Infof("rbd: lock already held for %s", lock_id)
return nil
}
// clean up orphaned lock if no watcher on the image
used, rbdOutput, statusErr := util.rbdStatus(&b)
if statusErr != nil {
return fmt.Errorf("rbdStatus failed error %v, rbd output: %v", statusErr, rbdOutput)
}
if used {
// this image is already used by a node other than this node
return fmt.Errorf("rbd image: %s/%s is already used by a node other than this node, rbd output: %v", b.Image, b.Pool, output)
} }
if lock { // best effort clean up orphaned locked if not used
// check if lock is already held for this host by matching lock_id and rbd lock id locks := clientKubeLockMagicRe.FindAllStringSubmatch(output, -1)
if strings.Contains(output, lock_id) { for _, v := range locks {
// this host already holds the lock, exit if len(v) > 0 {
glog.V(1).Infof("rbd: lock already held for %s", lock_id) lockInfo := strings.Split(v[0], " ")
return nil if len(lockInfo) > 2 {
} args := []string{"lock", "remove", b.Image, lockInfo[1], lockInfo[0], "--pool", b.Pool, "--id", b.Id, "-m", mon}
// clean up orphaned lock if no watcher on the image args = append(args, secret_opt...)
used, rbdOutput, statusErr := util.rbdStatus(&b) cmd, err = b.exec.Run("rbd", args...)
if statusErr != nil { glog.Infof("remove orphaned locker %s from client %s: err %v, rbd output: %s", lockInfo[1], lockInfo[0], err, string(cmd))
return fmt.Errorf("rbdStatus failed error %v, rbd output: %v", statusErr, rbdOutput)
}
if used {
// this image is already used by a node other than this node
return fmt.Errorf("rbd image: %s/%s is already used by a node other than this node, rbd output: %v", b.Image, b.Pool, output)
}
// best effort clean up orphaned locked if not used
locks := clientKubeLockMagicRe.FindAllStringSubmatch(output, -1)
for _, v := range locks {
if len(v) > 0 {
lockInfo := strings.Split(v[0], " ")
if len(lockInfo) > 2 {
args := []string{"lock", "remove", b.Image, lockInfo[1], lockInfo[0], "--pool", b.Pool, "--id", b.Id, "-m", mon}
args = append(args, secret_opt...)
cmd, err = b.exec.Run("rbd", args...)
glog.Infof("remove orphaned locker %s from client %s: err %v, rbd output: %s", lockInfo[1], lockInfo[0], err, string(cmd))
}
} }
} }
}
// hold a lock: rbd lock add // hold a lock: rbd lock add
args := []string{"lock", "add", b.Image, lock_id, "--pool", b.Pool, "--id", b.Id, "-m", mon} args := []string{"lock", "add", b.Image, lock_id, "--pool", b.Pool, "--id", b.Id, "-m", mon}
args = append(args, secret_opt...)
cmd, err = b.exec.Run("rbd", args...)
if err == nil {
glog.V(4).Infof("rbd: successfully add lock (locker_id: %s) on image: %s/%s with id %s mon %s", lock_id, b.Pool, b.Image, b.Id, mon)
}
} else {
// defencing, find locker name
ind := strings.LastIndex(output, lock_id) - 1
for i := ind; i >= 0; i-- {
if output[i] == '\n' {
locker = output[(i + 1):ind]
break
}
}
// remove a lock if found: rbd lock remove
if len(locker) > 0 {
args := []string{"lock", "remove", b.Image, lock_id, locker, "--pool", b.Pool, "--id", b.Id, "-m", mon}
args = append(args, secret_opt...) args = append(args, secret_opt...)
cmd, err = b.exec.Run("rbd", args...) cmd, err = b.exec.Run("rbd", args...)
if err == nil { if err == nil {
glog.V(4).Infof("rbd: successfully add lock (locker_id: %s) on image: %s/%s with id %s mon %s", lock_id, b.Pool, b.Image, b.Id, mon) glog.V(4).Infof("rbd: successfully remove lock (locker_id: %s) on image: %s/%s with id %s mon %s", lock_id, b.Pool, b.Image, b.Id, mon)
} }
} else {
// defencing, find locker name
ind := strings.LastIndex(output, lock_id) - 1
for i := ind; i >= 0; i-- {
if output[i] == '\n' {
locker = output[(i + 1):ind]
break
}
}
// remove a lock if found: rbd lock remove
if len(locker) > 0 {
args := []string{"lock", "remove", b.Image, lock_id, locker, "--pool", b.Pool, "--id", b.Id, "-m", mon}
args = append(args, secret_opt...)
cmd, err = b.exec.Run("rbd", args...)
if err == nil {
glog.V(4).Infof("rbd: successfully remove lock (locker_id: %s) on image: %s/%s with id %s mon %s", lock_id, b.Pool, b.Image, b.Id, mon)
}
}
}
if err == nil {
// break if operation succeeds
break
} }
} }
return err return err
} }
@ -274,27 +275,17 @@ func (util *RBDUtil) AttachDisk(b rbdMounter) (string, error) {
return "", fmt.Errorf("rbd image %s/%s is still being used. rbd output: %s", b.Pool, b.Image, rbdOutput) return "", fmt.Errorf("rbd image %s/%s is still being used. rbd output: %s", b.Pool, b.Image, rbdOutput)
} }
// rbd map mon := util.kernelRBDMonitorsOpt(b.Mon)
l := len(b.Mon) glog.V(1).Infof("rbd: map mon %s", mon)
// avoid mount storm, pick a host randomly if b.Secret != "" {
start := rand.Int() % l output, err = b.exec.Run("rbd",
// iterate all hosts until mount succeeds. "map", b.Image, "--pool", b.Pool, "--id", b.Id, "-m", mon, "--key="+b.Secret)
for i := start; i < start+l; i++ { } else {
mon := b.Mon[i%l] output, err = b.exec.Run("rbd",
glog.V(1).Infof("rbd: map mon %s", mon) "map", b.Image, "--pool", b.Pool, "--id", b.Id, "-m", mon, "-k", b.Keyring)
if b.Secret != "" {
output, err = b.exec.Run("rbd",
"map", b.Image, "--pool", b.Pool, "--id", b.Id, "-m", mon, "--key="+b.Secret)
} else {
output, err = b.exec.Run("rbd",
"map", b.Image, "--pool", b.Pool, "--id", b.Id, "-m", mon, "-k", b.Keyring)
}
if err == nil {
break
}
glog.V(1).Infof("rbd: map error %v, rbd output: %s", err, string(output))
} }
if err != nil { if err != nil {
glog.V(1).Infof("rbd: map error %v, rbd output: %s", err, string(output))
return "", fmt.Errorf("rbd: map failed %v, rbd output: %s", err, string(output)) return "", fmt.Errorf("rbd: map failed %v, rbd output: %s", err, string(output))
} }
devicePath, found = waitForPath(b.Pool, b.Image, 10) devicePath, found = waitForPath(b.Pool, b.Image, 10)
@ -378,34 +369,23 @@ func (util *RBDUtil) CreateImage(p *rbdVolumeProvisioner) (r *v1.RBDPersistentVo
// convert to MB that rbd defaults on // convert to MB that rbd defaults on
sz := int(volume.RoundUpSize(volSizeBytes, 1024*1024)) sz := int(volume.RoundUpSize(volSizeBytes, 1024*1024))
volSz := fmt.Sprintf("%d", sz) volSz := fmt.Sprintf("%d", sz)
// rbd create mon := util.kernelRBDMonitorsOpt(p.Mon)
l := len(p.rbdMounter.Mon) if p.rbdMounter.imageFormat == rbdImageFormat2 {
// pick a mon randomly glog.V(4).Infof("rbd: create %s size %s format %s (features: %s) using mon %s, pool %s id %s key %s", p.rbdMounter.Image, volSz, p.rbdMounter.imageFormat, p.rbdMounter.imageFeatures, mon, p.rbdMounter.Pool, p.rbdMounter.adminId, p.rbdMounter.adminSecret)
start := rand.Int() % l } else {
// iterate all monitors until create succeeds. glog.V(4).Infof("rbd: create %s size %s format %s using mon %s, pool %s id %s key %s", p.rbdMounter.Image, volSz, p.rbdMounter.imageFormat, mon, p.rbdMounter.Pool, p.rbdMounter.adminId, p.rbdMounter.adminSecret)
for i := start; i < start+l; i++ {
mon := p.Mon[i%l]
if p.rbdMounter.imageFormat == rbdImageFormat2 {
glog.V(4).Infof("rbd: create %s size %s format %s (features: %s) using mon %s, pool %s id %s key %s", p.rbdMounter.Image, volSz, p.rbdMounter.imageFormat, p.rbdMounter.imageFeatures, mon, p.rbdMounter.Pool, p.rbdMounter.adminId, p.rbdMounter.adminSecret)
} else {
glog.V(4).Infof("rbd: create %s size %s format %s using mon %s, pool %s id %s key %s", p.rbdMounter.Image, volSz, p.rbdMounter.imageFormat, mon, p.rbdMounter.Pool, p.rbdMounter.adminId, p.rbdMounter.adminSecret)
}
args := []string{"create", p.rbdMounter.Image, "--size", volSz, "--pool", p.rbdMounter.Pool, "--id", p.rbdMounter.adminId, "-m", mon, "--key=" + p.rbdMounter.adminSecret, "--image-format", p.rbdMounter.imageFormat}
if p.rbdMounter.imageFormat == rbdImageFormat2 {
// if no image features is provided, it results in empty string
// which disable all RBD image format 2 features as we expected
features := strings.Join(p.rbdMounter.imageFeatures, ",")
args = append(args, "--image-feature", features)
}
output, err = p.exec.Run("rbd", args...)
if err == nil {
break
} else {
glog.Warningf("failed to create rbd image, output %v", string(output))
}
} }
args := []string{"create", p.rbdMounter.Image, "--size", volSz, "--pool", p.rbdMounter.Pool, "--id", p.rbdMounter.adminId, "-m", mon, "--key=" + p.rbdMounter.adminSecret, "--image-format", p.rbdMounter.imageFormat}
if p.rbdMounter.imageFormat == rbdImageFormat2 {
// if no image features is provided, it results in empty string
// which disable all RBD image format 2 features as we expected
features := strings.Join(p.rbdMounter.imageFeatures, ",")
args = append(args, "--image-feature", features)
}
output, err = p.exec.Run("rbd", args...)
if err != nil { if err != nil {
glog.Warningf("failed to create rbd image, output %v", string(output))
return nil, 0, fmt.Errorf("failed to create rbd image: %v, command output: %s", err, string(output)) return nil, 0, fmt.Errorf("failed to create rbd image: %v, command output: %s", err, string(output))
} }
@ -427,21 +407,15 @@ func (util *RBDUtil) DeleteImage(p *rbdVolumeDeleter) error {
return fmt.Errorf("rbd image %s/%s is still being used, rbd output: %v", p.rbdMounter.Pool, p.rbdMounter.Image, rbdOutput) return fmt.Errorf("rbd image %s/%s is still being used, rbd output: %v", p.rbdMounter.Pool, p.rbdMounter.Image, rbdOutput)
} }
// rbd rm // rbd rm
l := len(p.rbdMounter.Mon) mon := util.kernelRBDMonitorsOpt(p.rbdMounter.Mon)
// pick a mon randomly glog.V(4).Infof("rbd: rm %s using mon %s, pool %s id %s key %s", p.rbdMounter.Image, mon, p.rbdMounter.Pool, p.rbdMounter.adminId, p.rbdMounter.adminSecret)
start := rand.Int() % l output, err = p.exec.Run("rbd",
// iterate all monitors until rm succeeds. "rm", p.rbdMounter.Image, "--pool", p.rbdMounter.Pool, "--id", p.rbdMounter.adminId, "-m", mon, "--key="+p.rbdMounter.adminSecret)
for i := start; i < start+l; i++ { if err == nil {
mon := p.rbdMounter.Mon[i%l] return nil
glog.V(4).Infof("rbd: rm %s using mon %s, pool %s id %s key %s", p.rbdMounter.Image, mon, p.rbdMounter.Pool, p.rbdMounter.adminId, p.rbdMounter.adminSecret)
output, err = p.exec.Run("rbd",
"rm", p.rbdMounter.Image, "--pool", p.rbdMounter.Pool, "--id", p.rbdMounter.adminId, "-m", mon, "--key="+p.rbdMounter.adminSecret)
if err == nil {
return nil
} else {
glog.Errorf("failed to delete rbd image: %v, command output: %s", err, string(output))
}
} }
glog.Errorf("failed to delete rbd image: %v, command output: %s", err, string(output))
return fmt.Errorf("error %v, rbd output: %v", err, string(output)) return fmt.Errorf("error %v, rbd output: %v", err, string(output))
} }
@ -465,21 +439,15 @@ func (util *RBDUtil) ExpandImage(rbdExpander *rbdVolumeExpander, oldSize resourc
} }
// rbd resize // rbd resize
l := len(rbdExpander.rbdMounter.Mon) mon := util.kernelRBDMonitorsOpt(rbdExpander.rbdMounter.Mon)
// pick a mon randomly glog.V(4).Infof("rbd: resize %s using mon %s, pool %s id %s key %s", rbdExpander.rbdMounter.Image, mon, rbdExpander.rbdMounter.Pool, rbdExpander.rbdMounter.adminId, rbdExpander.rbdMounter.adminSecret)
start := rand.Int() % l output, err = rbdExpander.exec.Run("rbd",
// iterate all monitors until resize succeeds. "resize", rbdExpander.rbdMounter.Image, "--size", newVolSz, "--pool", rbdExpander.rbdMounter.Pool, "--id", rbdExpander.rbdMounter.adminId, "-m", mon, "--key="+rbdExpander.rbdMounter.adminSecret)
for i := start; i < start+l; i++ { if err == nil {
mon := rbdExpander.rbdMounter.Mon[i%l] return newSizeQuant, nil
glog.V(4).Infof("rbd: resize %s using mon %s, pool %s id %s key %s", rbdExpander.rbdMounter.Image, mon, rbdExpander.rbdMounter.Pool, rbdExpander.rbdMounter.adminId, rbdExpander.rbdMounter.adminSecret)
output, err = rbdExpander.exec.Run("rbd",
"resize", rbdExpander.rbdMounter.Image, "--size", newVolSz, "--pool", rbdExpander.rbdMounter.Pool, "--id", rbdExpander.rbdMounter.adminId, "-m", mon, "--key="+rbdExpander.rbdMounter.adminSecret)
if err == nil {
return newSizeQuant, nil
} else {
glog.Errorf("failed to resize rbd image: %v, command output: %s", err, string(output))
}
} }
glog.Errorf("failed to resize rbd image: %v, command output: %s", err, string(output))
return oldSize, err return oldSize, err
} }
@ -497,44 +465,34 @@ func (util *RBDUtil) rbdInfo(b *rbdMounter) (int, error) {
secret = b.Secret secret = b.Secret
} }
l := len(b.Mon) mon := util.kernelRBDMonitorsOpt(b.Mon)
start := rand.Int() % l // cmd "rbd info" get the image info with the following output:
// iterate all hosts until rbd command succeeds. //
for i := start; i < start+l; i++ { // # image exists (exit=0)
mon := b.Mon[i%l] // rbd info volume-4a5bcc8b-2b55-46da-ba04-0d3dc5227f08
// cmd "rbd info" get the image info with the following output: // size 1024 MB in 256 objects
// // order 22 (4096 kB objects)
// # image exists (exit=0) // block_name_prefix: rbd_data.1253ac238e1f29
// rbd info volume-4a5bcc8b-2b55-46da-ba04-0d3dc5227f08 // format: 2
// size 1024 MB in 256 objects // ...
// order 22 (4096 kB objects) //
// block_name_prefix: rbd_data.1253ac238e1f29 // rbd info volume-4a5bcc8b-2b55-46da-ba04-0d3dc5227f08 --format json
// format: 2 // {"name":"volume-4a5bcc8b-2b55-46da-ba04-0d3dc5227f08","size":1073741824,"objects":256,"order":22,"object_size":4194304,"block_name_prefix":"rbd_data.1253ac238e1f29","format":2,"features":["layering","exclusive-lock","object-map","fast-diff","deep-flatten"],"flags":[]}
// ... //
// //
// rbd info volume-4a5bcc8b-2b55-46da-ba04-0d3dc5227f08 --format json // # image does not exist (exit=2)
// {"name":"volume-4a5bcc8b-2b55-46da-ba04-0d3dc5227f08","size":1073741824,"objects":256,"order":22,"object_size":4194304,"block_name_prefix":"rbd_data.1253ac238e1f29","format":2,"features":["layering","exclusive-lock","object-map","fast-diff","deep-flatten"],"flags":[]} // rbd: error opening image 1234: (2) No such file or directory
// //
// glog.V(4).Infof("rbd: info %s using mon %s, pool %s id %s key %s", b.Image, mon, b.Pool, id, secret)
// # image does not exist (exit=2) cmd, err = b.exec.Run("rbd",
// rbd: error opening image 1234: (2) No such file or directory "info", b.Image, "--pool", b.Pool, "-m", mon, "--id", id, "--key="+secret)
// output = string(cmd)
glog.V(4).Infof("rbd: info %s using mon %s, pool %s id %s key %s", b.Image, mon, b.Pool, id, secret)
cmd, err = b.exec.Run("rbd",
"info", b.Image, "--pool", b.Pool, "-m", mon, "--id", id, "--key="+secret)
output = string(cmd)
// break if command succeeds if err, ok := err.(*exec.Error); ok {
if err == nil { if err.Err == exec.ErrNotFound {
break glog.Errorf("rbd cmd not found")
} // fail fast if command not found
return 0, err
if err, ok := err.(*exec.Error); ok {
if err.Err == exec.ErrNotFound {
glog.Errorf("rbd cmd not found")
// fail fast if command not found
return 0, err
}
} }
} }
@ -576,41 +534,31 @@ func (util *RBDUtil) rbdStatus(b *rbdMounter) (bool, string, error) {
secret = b.Secret secret = b.Secret
} }
l := len(b.Mon) mon := util.kernelRBDMonitorsOpt(b.Mon)
start := rand.Int() % l // cmd "rbd status" list the rbd client watch with the following output:
// iterate all hosts until rbd command succeeds. //
for i := start; i < start+l; i++ { // # there is a watcher (exit=0)
mon := b.Mon[i%l] // Watchers:
// cmd "rbd status" list the rbd client watch with the following output: // watcher=10.16.153.105:0/710245699 client.14163 cookie=1
// //
// # there is a watcher (exit=0) // # there is no watcher (exit=0)
// Watchers: // Watchers: none
// watcher=10.16.153.105:0/710245699 client.14163 cookie=1 //
// // Otherwise, exit is non-zero, for example:
// # there is no watcher (exit=0) //
// Watchers: none // # image does not exist (exit=2)
// // rbd: error opening image kubernetes-dynamic-pvc-<UUID>: (2) No such file or directory
// Otherwise, exit is non-zero, for example: //
// glog.V(4).Infof("rbd: status %s using mon %s, pool %s id %s key %s", b.Image, mon, b.Pool, id, secret)
// # image does not exist (exit=2) cmd, err = b.exec.Run("rbd",
// rbd: error opening image kubernetes-dynamic-pvc-<UUID>: (2) No such file or directory "status", b.Image, "--pool", b.Pool, "-m", mon, "--id", id, "--key="+secret)
// output = string(cmd)
glog.V(4).Infof("rbd: status %s using mon %s, pool %s id %s key %s", b.Image, mon, b.Pool, id, secret)
cmd, err = b.exec.Run("rbd",
"status", b.Image, "--pool", b.Pool, "-m", mon, "--id", id, "--key="+secret)
output = string(cmd)
// break if command succeeds if err, ok := err.(*exec.Error); ok {
if err == nil { if err.Err == exec.ErrNotFound {
break glog.Errorf("rbd cmd not found")
} // fail fast if command not found
return false, output, err
if err, ok := err.(*exec.Error); ok {
if err.Err == exec.ErrNotFound {
glog.Errorf("rbd cmd not found")
// fail fast if command not found
return false, output, err
}
} }
} }