From dd949976199b4cb741ef0175353c2601dd9a8c90 Mon Sep 17 00:00:00 2001
From: Justin Santa Barbara
Date: Tue, 21 Jun 2016 15:22:16 -0400
Subject: [PATCH] Add comments & misc review fixes

Lots of comments describing the heuristics, how they fit together, and
their limitations.

In particular, we can't guarantee correct volume placement if the set of
zones changes between volume allocations.
---
 pkg/cloudprovider/providers/aws/aws.go |  5 ++++-
 pkg/cloudprovider/providers/gce/gce.go | 16 ++++++++++++++++
 pkg/volume/plugins.go                  |  2 +-
 pkg/volume/util.go                     | 14 +++++++++++++-
 4 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/pkg/cloudprovider/providers/aws/aws.go b/pkg/cloudprovider/providers/aws/aws.go
index ee53528e9dd..5c1e2a465df 100644
--- a/pkg/cloudprovider/providers/aws/aws.go
+++ b/pkg/cloudprovider/providers/aws/aws.go
@@ -919,7 +919,10 @@ func (aws *AWSCloud) List(filter string) ([]string, error) {
 // getAllZones retrieves a list of all the zones in which nodes are running
 // It currently involves querying all instances
 func (c *AWSCloud) getAllZones() (sets.String, error) {
-	// TODO: Caching, although we currently only use this in volume creation
+	// We don't currently cache this; it is currently used only in volume
+	// creation, which is expected to be a comparatively rare occurrence.
+
+	// TODO: Caching / expose api.Nodes to the cloud provider?
 	// TODO: We could also query for subnets, I think
 
 	filters := []*ec2.Filter{newEc2Filter("instance-state-name", "running")}
diff --git a/pkg/cloudprovider/providers/gce/gce.go b/pkg/cloudprovider/providers/gce/gce.go
index a2d3f5c5778..da8536342ca 100644
--- a/pkg/cloudprovider/providers/gce/gce.go
+++ b/pkg/cloudprovider/providers/gce/gce.go
@@ -2070,6 +2070,7 @@ func (gce *GCECloud) List(filter string) ([]string, error) {
 
 // GetAllZones returns all the zones in which nodes are running
 func (gce *GCECloud) GetAllZones() (sets.String, error) {
+	// Fast-path for non-multizone
 	if len(gce.managedZones) == 1 {
 		return sets.NewString(gce.managedZones...), nil
 	}
@@ -2084,6 +2085,15 @@ func (gce *GCECloud) GetAllZones() (sets.String, error) {
 		listCall := gce.service.Instances.List(gce.projectID, zone)
 
 		// No filter: We assume that a zone is either used or unused
+		// We could only consider running nodes (like we do in List above),
+		// but if instances are starting we probably still want to consider them.
+		// We should wait until we have a reason to make the call one way or
+		// the other; we generally can't guarantee correct volume spreading
+		// if the set of zones is changing (and volume spreading is currently
+		// only a heuristic).
+		// Long term we want to replace GetAllZones (which primarily supports
+		// volume spreading) with a scheduler policy that is able to see the
+		// global state of volumes and the health of zones.
 
 		// Just a minimal set of fields - we only care about existence
 		listCall = listCall.Fields("items(name)")
@@ -2258,6 +2268,12 @@ func (gce *GCECloud) GetAutoLabelsForPD(name string, zone string) (map[string]st
 	var disk *gceDisk
 	var err error
 	if zone == "" {
+		// We would like to avoid this case as far as possible, because GCE
+		// doesn't guarantee that volumes are uniquely named per region, just
+		// per zone. However, creation of GCE PDs was originally done only by
+		// name, so we have to continue to support that. Wherever possible,
+		// the zone should be passed (and it is passed for most cases that we
+		// can control, e.g. dynamic volume provisioning).
 		disk, err = gce.getDiskByNameUnknownZone(name)
 		if err != nil {
 			return nil, err
diff --git a/pkg/volume/plugins.go b/pkg/volume/plugins.go
index 70760133f45..6a58b68812d 100644
--- a/pkg/volume/plugins.go
+++ b/pkg/volume/plugins.go
@@ -49,7 +49,7 @@ type VolumeOptions struct {
 	// PV.Name of the appropriate PersistentVolume. Used to generate cloud
 	// volume name.
 	PVName string
-	// PVC.Name of the PersistentVolumeClaim, if we are auto-provisioning.
+	// PVC.Name of the PersistentVolumeClaim; only set during dynamic provisioning.
 	PVCName string
 	// Unique name of Kubernetes cluster.
 	ClusterName string
diff --git a/pkg/volume/util.go b/pkg/volume/util.go
index 40d6c872f87..9c2b352b931 100644
--- a/pkg/volume/util.go
+++ b/pkg/volume/util.go
@@ -194,6 +194,10 @@ func GenerateVolumeName(clusterName, pvName string, maxLength int) string {
 }
 
 // ChooseZone implements our heuristics for choosing a zone for volume creation based on the volume name
+// Volumes are generally round-robined across all active zones, using the hash of the PVC name.
+// However, if the PVCName ends with `-<integer>`, we will hash the prefix, and then add the integer to the hash.
+// This means that a PetSet's volumes (`claimname-petsetname-id`) will spread across available zones,
+// assuming the id values are consecutive.
 func ChooseZoneForVolume(zones sets.String, pvcName string) string {
 	// We create the volume in a zone determined by the name
 	// Eventually the scheduler will coordinate placement into an available zone
@@ -202,7 +206,7 @@ func ChooseZoneForVolume(zones sets.String, pvcName string) string {
 
 	if pvcName == "" {
 		// We should always be called with a name; this shouldn't happen
-		glog.Warningf("No Name defined during volume create; choosing random zone")
+		glog.Warningf("No name defined during volume create; choosing random zone")
 
 		hash = rand.Uint32()
 	} else {
@@ -230,6 +234,14 @@ func ChooseZoneForVolume(zones sets.String, pvcName string) string {
 		hash = h.Sum32()
 	}
 
+	// Zones.List returns zones in a consistent (sorted) order.
+	// We do have a potential failure case where volumes will not be properly
+	// spread if the set of zones changes during PetSet volume creation.
+	// However, this is probably relatively unlikely because we expect the set
+	// of zones to be essentially static for clusters.
+	// Hopefully we can address this problem if/when we do full scheduler
+	// integration of PVC placement (which could also e.g. avoid putting
+	// volumes in overloaded or unhealthy zones).
 	zoneSlice := zones.List()
 	zone := zoneSlice[(hash+index)%uint32(len(zoneSlice))]
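
For reviewers: a minimal standalone sketch of the zone-spreading heuristic that the
ChooseZoneForVolume comments above describe. The idea is to hash the claim name to pick a
zone from the sorted zone list; for names ending in a "-<integer>" suffix (as PetSet claims
are), hash only the prefix and add the integer afterwards, so consecutive indices land in
consecutive zones. The chooseZone helper and the example names below are illustrative only,
not the actual Kubernetes implementation.

package main

import (
	"fmt"
	"hash/fnv"
	"sort"
	"strconv"
	"strings"
)

// chooseZone is an illustrative sketch (not the real ChooseZoneForVolume):
// it spreads volumes round-robin across the sorted list of zones by hashing
// the claim name.
func chooseZone(zones []string, pvcName string) string {
	sorted := append([]string(nil), zones...)
	sort.Strings(sorted) // consistent (sorted) order, like sets.String.List()

	hashString := pvcName
	var index uint32

	// If the name ends in "-<integer>" (PetSet-style claims such as
	// "data-web-0"), hash only the prefix and add the integer afterwards,
	// so consecutive indices map to consecutive zones.
	if i := strings.LastIndex(pvcName, "-"); i != -1 {
		if n, err := strconv.Atoi(pvcName[i+1:]); err == nil && n >= 0 {
			hashString = pvcName[:i]
			index = uint32(n)
		}
	}

	h := fnv.New32()
	h.Write([]byte(hashString))
	hash := h.Sum32()

	return sorted[(hash+index)%uint32(len(sorted))]
}

func main() {
	zones := []string{"us-east-1a", "us-east-1b", "us-east-1c"}
	// Consecutive PetSet claims should land in consecutive zones.
	for _, name := range []string{"data-web-0", "data-web-1", "data-web-2"} {
		fmt.Printf("%s -> %s\n", name, chooseZone(zones, name))
	}
}

Note the limitation called out in the patch comments: the spreading is only a heuristic,
and it assumes the set of zones stays stable while a PetSet's volumes are being created.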