Add comments & misc review fixes

Lots of comments describing the heuristics, how they fit together, and the
limitations.

In particular, we can't guarantee correct volume placement if the set of
zones changes between volume allocations.
Justin Santa Barbara 2016-06-21 15:22:16 -04:00
parent 9c2566572d
commit dd94997619
4 changed files with 34 additions and 3 deletions

View File

@@ -919,7 +919,10 @@ func (aws *AWSCloud) List(filter string) ([]string, error) {
// getAllZones retrieves a list of all the zones in which nodes are running
// It currently involves querying all instances
func (c *AWSCloud) getAllZones() (sets.String, error) {
-	// TODO: Caching, although we currently only use this in volume creation
+	// We don't currently cache this; it is used only in volume creation,
+	// which is expected to be a comparatively rare occurrence.
+	// TODO: Caching / expose api.Nodes to the cloud provider?
+	// TODO: We could also query for subnets, I think
	filters := []*ec2.Filter{newEc2Filter("instance-state-name", "running")}
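
For context, here is a minimal sketch of how the zone set can be aggregated from running instances; it is not the committed implementation. It assumes the aws-sdk-go EC2 client; newEc2Filter is the helper visible in the hunk above, and the function name is made up:

func zonesFromRunningInstances(svc *ec2.EC2) (sets.String, error) {
	zones := sets.NewString()
	input := &ec2.DescribeInstancesInput{
		Filters: []*ec2.Filter{newEc2Filter("instance-state-name", "running")},
	}
	// Page through all reservations, collecting each instance's availability zone.
	err := svc.DescribeInstancesPages(input, func(page *ec2.DescribeInstancesOutput, lastPage bool) bool {
		for _, res := range page.Reservations {
			for _, inst := range res.Instances {
				if inst.Placement != nil && inst.Placement.AvailabilityZone != nil {
					zones.Insert(*inst.Placement.AvailabilityZone)
				}
			}
		}
		return true
	})
	return zones, err
}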

View File

@@ -2070,6 +2070,7 @@ func (gce *GCECloud) List(filter string) ([]string, error) {
// GetAllZones returns all the zones in which nodes are running
func (gce *GCECloud) GetAllZones() (sets.String, error) {
+	// Fast-path for non-multizone
	if len(gce.managedZones) == 1 {
		return sets.NewString(gce.managedZones...), nil
	}
@@ -2084,6 +2085,15 @@ func (gce *GCECloud) GetAllZones() (sets.String, error) {
		listCall := gce.service.Instances.List(gce.projectID, zone)
		// No filter: We assume that a zone is either used or unused
+		// We could restrict to running nodes (as List above does), but if
+		// instances are starting we probably still want to consider them.
+		// I think we should wait until we have a reason to make the call one
+		// way or the other; we generally can't guarantee correct volume
+		// spreading if the set of zones is changing (and volume spreading is
+		// currently only a heuristic).
+		// Long term we want to replace GetAllZones (which primarily supports
+		// volume spreading) with a scheduler policy that is able to see the
+		// global state of volumes and the health of zones.
		// Just a minimal set of fields - we only care about existence
		listCall = listCall.Fields("items(name)")
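
A minimal sketch of the per-zone existence check this hunk is describing, assuming the google.golang.org/api/compute/v1 client (the function name is made up; the Fields restriction is the one shown above):

func zoneHasInstances(svc *compute.Service, projectID, zone string) (bool, error) {
	listCall := svc.Instances.List(projectID, zone)
	// Request only the instance names; we care about existence, not details.
	listCall = listCall.Fields("items(name)")
	res, err := listCall.Do()
	if err != nil {
		return false, err
	}
	// Any instance on the first page already proves the zone is in use.
	return len(res.Items) > 0, nil
}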
@@ -2258,6 +2268,12 @@ func (gce *GCECloud) GetAutoLabelsForPD(name string, zone string) (map[string]st
	var disk *gceDisk
	var err error
	if zone == "" {
+		// We would like to avoid this case as far as possible, because GCE
+		// doesn't guarantee that volumes are uniquely named per region, just
+		// per zone. However, creation of GCE PDs was originally done only by
+		// name, so we have to continue to support that. Wherever possible,
+		// the zone should be passed (and it is passed for most cases that we
+		// can control, e.g. dynamic volume provisioning).
		disk, err = gce.getDiskByNameUnknownZone(name)
		if err != nil {
			return nil, err
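
To illustrate why the zone-less path is best-effort, here is a hedged sketch of what a name-only lookup has to do; the function name is invented, and getDiskByName is assumed to be a per-zone lookup helper:

func findDiskAllZones(gce *GCECloud, name string) (*gceDisk, error) {
	var found *gceDisk
	for _, zone := range gce.managedZones {
		disk, err := gce.getDiskByName(name, zone) // assumed per-zone lookup
		if err != nil {
			// Sketch only: real code must distinguish "not found" from real errors.
			continue
		}
		if found != nil {
			// GCE only guarantees per-zone uniqueness, so a region-wide
			// search can be ambiguous.
			return nil, fmt.Errorf("GCE PD %q found in multiple zones", name)
		}
		found = disk
	}
	if found == nil {
		return nil, fmt.Errorf("GCE PD %q not found in any managed zone", name)
	}
	return found, nil
}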

View File

@@ -49,7 +49,7 @@ type VolumeOptions struct {
	// PV.Name of the appropriate PersistentVolume. Used to generate cloud
	// volume name.
	PVName string
-	// PVC.Name of the PersistentVolumeClaim, if we are auto-provisioning.
+	// PVC.Name of the PersistentVolumeClaim; only set during dynamic provisioning.
	PVCName string
	// Unique name of Kubernetes cluster.
	ClusterName string
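
A short sketch of how a dynamic provisioner might populate these fields; the values are invented, and VolumeOptions has more fields than this hunk shows:

options := VolumeOptions{
	PVName:      "pv-provisioned-abc123",
	PVCName:     "www-web-0", // only set during dynamic provisioning
	ClusterName: "mycluster",
}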

View File

@@ -194,6 +194,10 @@ func GenerateVolumeName(clusterName, pvName string, maxLength int) string {
}
// ChooseZoneForVolume implements our heuristics for choosing a zone for volume creation based on the volume name
+// Volumes are generally round-robined across all active zones, using the hash of the PVC name.
+// However, if the PVCName ends with `-<integer>`, we will hash the prefix and then add the integer to the hash.
+// This means that a PetSet's volumes (`claimname-petsetname-id`) will spread across available zones,
+// assuming the id values are consecutive (a worked example follows the last hunk below).
func ChooseZoneForVolume(zones sets.String, pvcName string) string {
	// We create the volume in a zone determined by the name
	// Eventually the scheduler will coordinate placement into an available zone
@@ -202,7 +206,7 @@ func ChooseZoneForVolume(zones sets.String, pvcName string) string {
	if pvcName == "" {
		// We should always be called with a name; this shouldn't happen
-		glog.Warningf("No Name defined during volume create; choosing random zone")
+		glog.Warningf("No name defined during volume create; choosing random zone")
		hash = rand.Uint32()
	} else {
@@ -230,6 +234,14 @@ func ChooseZoneForVolume(zones sets.String, pvcName string) string {
		hash = h.Sum32()
	}
+	// Zones.List returns zones in a consistent order (sorted)
+	// We do have a potential failure case where volumes will not be properly
+	// spread if the set of zones changes during PetSet volume creation.
+	// However, this is relatively unlikely: we expect the set of zones to be
+	// essentially static for clusters.
+	// Hopefully we can address this problem if/when we do full scheduler
+	// integration of PVC placement (which could also e.g. avoid putting
+	// volumes in overloaded or unhealthy zones).
	zoneSlice := zones.List()
	zone := zoneSlice[(hash+index)%uint32(len(zoneSlice))]
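
To make the suffix heuristic concrete, a hedged example of how a PetSet's claims would spread; the zone names are illustrative, and which claim lands in which zone depends on the hash of the shared prefix:

func ExampleChooseZoneForVolume() {
	zones := sets.NewString("us-east-1a", "us-east-1b", "us-east-1c")
	// The claims share the prefix "www-web-", so they share one base hash;
	// the trailing ordinal then walks the sorted zone list, placing three
	// consecutive ordinals in three distinct zones.
	for _, pvc := range []string{"www-web-0", "www-web-1", "www-web-2"} {
		fmt.Printf("%s -> %s\n", pvc, ChooseZoneForVolume(zones, pvc))
	}
}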