diff --git a/pkg/cloudprovider/providers/aws/aws.go b/pkg/cloudprovider/providers/aws/aws.go
index ee53528e9dd..5c1e2a465df 100644
--- a/pkg/cloudprovider/providers/aws/aws.go
+++ b/pkg/cloudprovider/providers/aws/aws.go
@@ -919,7 +919,10 @@ func (aws *AWSCloud) List(filter string) ([]string, error) {
 // getAllZones retrieves a list of all the zones in which nodes are running
 // It currently involves querying all instances
 func (c *AWSCloud) getAllZones() (sets.String, error) {
-	// TODO: Caching, although we currently only use this in volume creation
+	// We don't currently cache this; it is currently used only in volume
+	// creation, which is expected to be a comparatively rare occurrence.
+
+	// TODO: Caching / expose api.Nodes to the cloud provider?
 	// TODO: We could also query for subnets, I think
 
 	filters := []*ec2.Filter{newEc2Filter("instance-state-name", "running")}
diff --git a/pkg/cloudprovider/providers/gce/gce.go b/pkg/cloudprovider/providers/gce/gce.go
index a2d3f5c5778..da8536342ca 100644
--- a/pkg/cloudprovider/providers/gce/gce.go
+++ b/pkg/cloudprovider/providers/gce/gce.go
@@ -2070,6 +2070,7 @@ func (gce *GCECloud) List(filter string) ([]string, error) {
 
 // GetAllZones returns all the zones in which nodes are running
 func (gce *GCECloud) GetAllZones() (sets.String, error) {
+	// Fast-path for non-multizone
 	if len(gce.managedZones) == 1 {
 		return sets.NewString(gce.managedZones...), nil
 	}
@@ -2084,6 +2085,15 @@ func (gce *GCECloud) GetAllZones() (sets.String, error) {
 		listCall := gce.service.Instances.List(gce.projectID, zone)
 
 		// No filter: We assume that a zone is either used or unused
+		// We could consider only running nodes (as List does above), but if
+		// instances are starting we probably still want to consider them.
+		// I think we should wait until we have a reason to make the
+		// call one way or the other; we generally can't guarantee correct
+		// volume spreading if the set of zones is changing
+		// (and volume spreading is currently only a heuristic).
+		// Long term we want to replace GetAllZones (which primarily supports volume
+		// spreading) with a scheduler policy that is able to see the global state of
+		// volumes and the health of zones.
 
 		// Just a minimal set of fields - we only care about existence
 		listCall = listCall.Fields("items(name)")
@@ -2258,6 +2268,12 @@ func (gce *GCECloud) GetAutoLabelsForPD(name string, zone string) (map[string]st
 	var disk *gceDisk
 	var err error
 	if zone == "" {
+		// We would like as far as possible to avoid this case,
+		// because GCE doesn't guarantee that volumes are uniquely named per region,
+		// just per zone. However, creation of GCE PDs was originally done only
+		// by name, so we have to continue to support that.
+		// Wherever possible, though, the zone should be passed (and it is passed
+		// for most cases that we can control, e.g. dynamic volume provisioning).
 		disk, err = gce.getDiskByNameUnknownZone(name)
 		if err != nil {
 			return nil, err
diff --git a/pkg/volume/plugins.go b/pkg/volume/plugins.go
index 70760133f45..6a58b68812d 100644
--- a/pkg/volume/plugins.go
+++ b/pkg/volume/plugins.go
@@ -49,7 +49,7 @@ type VolumeOptions struct {
 	// PV.Name of the appropriate PersistentVolume. Used to generate cloud
 	// volume name.
 	PVName string
-	// PVC.Name of the PersistentVolumeClaim, if we are auto-provisioning.
+	// PVC.Name of the PersistentVolumeClaim; only set during dynamic provisioning.
 	PVCName string
 	// Unique name of Kubernetes cluster.
 	ClusterName string
diff --git a/pkg/volume/util.go b/pkg/volume/util.go
index 40d6c872f87..9c2b352b931 100644
--- a/pkg/volume/util.go
+++ b/pkg/volume/util.go
@@ -194,6 +194,10 @@ func GenerateVolumeName(clusterName, pvName string, maxLength int) string {
 }
 
 // ChooseZone implements our heuristics for choosing a zone for volume creation based on the volume name
+// Volumes are generally round-robined across all active zones, using the hash of the PVC Name.
+// However, if the PVCName ends with `-<integer>`, we will hash the prefix, and then add the integer to the hash.
+// This means that a PetSet's volumes (`claimname-petsetname-id`) will spread across available zones,
+// assuming the id values are consecutive.
 func ChooseZoneForVolume(zones sets.String, pvcName string) string {
 	// We create the volume in a zone determined by the name
 	// Eventually the scheduler will coordinate placement into an available zone
@@ -202,7 +206,7 @@ func ChooseZoneForVolume(zones sets.String, pvcName string) string {
 
 	if pvcName == "" {
 		// We should always be called with a name; this shouldn't happen
-		glog.Warningf("No Name defined during volume create; choosing random zone")
+		glog.Warningf("No name defined during volume create; choosing random zone")
 
 		hash = rand.Uint32()
 	} else {
@@ -230,6 +234,14 @@ func ChooseZoneForVolume(zones sets.String, pvcName string) string {
 		hash = h.Sum32()
 	}
 
+	// Zones.List returns zones in a consistent order (sorted)
+	// We do have a potential failure case where volumes will not be properly spread
+	// if the set of zones changes during PetSet volume creation. However, this is
+	// probably relatively unlikely because we expect the set of zones to be essentially
+	// static for clusters.
+	// Hopefully we can address this problem if/when we do full scheduler integration of
+	// PVC placement (which could also e.g. avoid putting volumes in overloaded or
+	// unhealthy zones).
 	zoneSlice := zones.List()
 	zone := zoneSlice[(hash+index)%uint32(len(zoneSlice))]
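
For reference, a minimal standalone Go sketch of the spreading heuristic the new ChooseZoneForVolume comments describe: hash the claim name, but when it ends in `-<integer>` hash only the prefix and carry the integer as an offset into the sorted zone list, so consecutive PetSet claims land in consecutive zones. The helper name chooseZoneSketch, the example zones, and the claim names are illustrative assumptions, not the actual implementation or its exact parsing rules.

// A sketch of the zone-spreading heuristic, assuming plain string slices
// rather than sets.String; not the real ChooseZoneForVolume code.
package main

import (
	"fmt"
	"hash/fnv"
	"sort"
	"strconv"
	"strings"
)

func chooseZoneSketch(zones []string, pvcName string) string {
	var index uint32
	name := pvcName

	// PetSet pattern: a trailing "-<integer>" becomes an offset, and only
	// the prefix is hashed, so claims 0, 1, 2, ... rotate through zones.
	if i := strings.LastIndex(pvcName, "-"); i != -1 {
		if n, err := strconv.ParseUint(pvcName[i+1:], 10, 32); err == nil {
			name = pvcName[:i]
			index = uint32(n)
		}
	}

	h := fnv.New32()
	h.Write([]byte(name))
	hash := h.Sum32()

	// Sort so that every caller sees the zones in the same order.
	sorted := append([]string(nil), zones...)
	sort.Strings(sorted)
	return sorted[(hash+index)%uint32(len(sorted))]
}

func main() {
	zones := []string{"us-central1-a", "us-central1-b", "us-central1-c"}
	for _, pvc := range []string{"data-web-0", "data-web-1", "data-web-2"} {
		fmt.Printf("%s -> %s\n", pvc, chooseZoneSketch(zones, pvc))
	}
}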