From 900567288b32742c185c3a16a16d7f6e0a126eb1 Mon Sep 17 00:00:00 2001
From: Justin Santa Barbara
Date: Fri, 22 Jan 2016 12:22:04 -0500
Subject: [PATCH 1/3] Ubernetes Lite: Label volumes with zone information

When volumes are labeled, they will only be scheduled onto nodes in the
same zone.
---
 pkg/cloudprovider/providers/gce/gce.go   | 28 +++++++++++
 .../persistentvolume/label/admission.go   | 50 ++++++++++++++++++-
 2 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/pkg/cloudprovider/providers/gce/gce.go b/pkg/cloudprovider/providers/gce/gce.go
index 9119dc6f965..8ce4b960e86 100644
--- a/pkg/cloudprovider/providers/gce/gce.go
+++ b/pkg/cloudprovider/providers/gce/gce.go
@@ -29,6 +29,7 @@ import (
 	"time"
 
 	"k8s.io/kubernetes/pkg/api"
+	"k8s.io/kubernetes/pkg/api/unversioned"
 	"k8s.io/kubernetes/pkg/cloudprovider"
 	utilerrors "k8s.io/kubernetes/pkg/util/errors"
 	"k8s.io/kubernetes/pkg/util/sets"
@@ -1644,6 +1645,33 @@ func (gce *GCECloud) DeleteDisk(diskToDelete string) error {
 	return gce.waitForZoneOp(deleteOp, disk.Zone)
 }
 
+// Builds the labels that should be automatically added to a PersistentVolume backed by a GCE PD
+// Specifically, this builds FailureDomain (zone) and Region labels.
+// The PersistentVolumeLabel admission controller calls this and adds the labels when a PV is created.
+func (gce *GCECloud) GetAutoLabelsForPD(name string) (map[string]string, error) {
+	disk, err := gce.getDiskByNameUnknownZone(name)
+	if err != nil {
+		return nil, err
+	}
+
+	zone := disk.Zone
+	region, err := GetGCERegion(zone)
+	if err != nil {
+		return nil, err
+	}
+
+	if zone == "" || region == "" {
+		// Unexpected, but sanity-check
+		return nil, fmt.Errorf("PD did not have zone/region information: %q", disk.Name)
+	}
+
+	labels := make(map[string]string)
+	labels[unversioned.LabelZoneFailureDomain] = zone
+	labels[unversioned.LabelZoneRegion] = region
+
+	return labels, nil
+}
+
 func (gce *GCECloud) AttachDisk(diskName, instanceID string, readOnly bool) error {
 	instance, err := gce.getInstanceByName(instanceID)
 	if err != nil {
diff --git a/plugin/pkg/admission/persistentvolume/label/admission.go b/plugin/pkg/admission/persistentvolume/label/admission.go
index 67004a112a6..de1b9240236 100644
--- a/plugin/pkg/admission/persistentvolume/label/admission.go
+++ b/plugin/pkg/admission/persistentvolume/label/admission.go
@@ -26,6 +26,7 @@ import (
 	client "k8s.io/kubernetes/pkg/client/unversioned"
 	"k8s.io/kubernetes/pkg/cloudprovider"
 	"k8s.io/kubernetes/pkg/cloudprovider/providers/aws"
+	"k8s.io/kubernetes/pkg/cloudprovider/providers/gce"
 )
 
 func init() {
@@ -40,8 +41,9 @@ var _ = admission.Interface(&persistentVolumeLabel{})
 
 type persistentVolumeLabel struct {
 	*admission.Handler
-	mutex      sync.Mutex
-	ebsVolumes aws.Volumes
+	mutex            sync.Mutex
+	ebsVolumes       aws.Volumes
+	gceCloudProvider *gce.GCECloud
 }
 
 // NewPersistentVolumeLabel returns an admission.Interface implementation which adds labels to PersistentVolume CREATE requests,
@@ -75,6 +77,13 @@ func (l *persistentVolumeLabel) Admit(a admission.Attributes) (err error) {
 		}
 		volumeLabels = labels
 	}
+	if volume.Spec.GCEPersistentDisk != nil {
+		labels, err := l.findGCEPDLabels(volume)
+		if err != nil {
+			return admission.NewForbidden(a, fmt.Errorf("error querying GCE PD volume %s: %v", volume.Spec.GCEPersistentDisk.PDName, err))
+		}
+		volumeLabels = labels
+	}
 
 	if len(volumeLabels) != 0 {
 		if volume.Labels == nil {
@@ -129,3 +138,40 @@ func (l *persistentVolumeLabel) getEBSVolumes() (aws.Volumes, error) {
 	}
 	return l.ebsVolumes, nil
 }
+
+func (l *persistentVolumeLabel) findGCEPDLabels(volume *api.PersistentVolume) (map[string]string, error) {
+	provider, err := l.getGCECloudProvider()
+	if err != nil {
+		return nil, err
+	}
+	if provider == nil {
+		return nil, fmt.Errorf("unable to build GCE cloud provider for PD")
+	}
+
+	labels, err := provider.GetAutoLabelsForPD(volume.Spec.GCEPersistentDisk.PDName)
+	if err != nil {
+		return nil, err
+	}
+
+	return labels, err
+}
+
+// getGCECloudProvider returns the GCE cloud provider, for use for querying volume labels
+func (l *persistentVolumeLabel) getGCECloudProvider() (*gce.GCECloud, error) {
+	l.mutex.Lock()
+	defer l.mutex.Unlock()
+
+	if l.gceCloudProvider == nil {
+		cloudProvider, err := cloudprovider.GetCloudProvider("gce", nil)
+		if err != nil || cloudProvider == nil {
+			return nil, err
+		}
+		gceCloudProvider, ok := cloudProvider.(*gce.GCECloud)
+		if !ok {
+			// GetCloudProvider has gone very wrong
+			return nil, fmt.Errorf("error retrieving GCE cloud provider")
+		}
+		l.gceCloudProvider = gceCloudProvider
+	}
+	return l.gceCloudProvider, nil
+}
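[Illustration, not part of the patch series]

With this patch and the PersistentVolumeLabel admission controller enabled, a
GCE PD-backed PersistentVolume comes back from the API server with
failure-domain labels attached. The sketch below only mimics what
GetAutoLabelsForPD produces: the zone "us-central1-a" and the disk are made-up
examples, regionFromZone is a stand-in for the provider's GetGCERegion helper,
and the literal label keys shown are assumed to be the values behind
unversioned.LabelZoneFailureDomain and unversioned.LabelZoneRegion at the time.

    package main

    import (
        "fmt"
        "strings"
    )

    // regionFromZone sketches what GetGCERegion does for GetAutoLabelsForPD:
    // a zone such as "us-central1-a" maps to its region "us-central1" by
    // dropping everything from the last '-' onward.
    func regionFromZone(zone string) (string, error) {
        ix := strings.LastIndex(zone, "-")
        if ix <= 0 {
            return "", fmt.Errorf("unexpected zone format: %q", zone)
        }
        return zone[:ix], nil
    }

    func main() {
        zone := "us-central1-a" // hypothetical zone of the PD
        region, err := regionFromZone(zone)
        if err != nil {
            panic(err)
        }

        // Roughly the labels the admission controller copies onto the PV;
        // the zone-aware scheduling predicate then keeps pods using this PV
        // on nodes in the same zone.
        labels := map[string]string{
            "failure-domain.beta.kubernetes.io/zone":   zone,
            "failure-domain.beta.kubernetes.io/region": region,
        }
        fmt.Println(labels)
    }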
From 1276675512f2289824e27057cd901a2c99df141f Mon Sep 17 00:00:00 2001
From: Justin Santa Barbara
Date: Fri, 22 Jan 2016 12:22:55 -0500
Subject: [PATCH 2/3] Ubernetes-Lite: Error if a PD name is ambiguous

We don't cope well if a PD is in multiple zones, but this is actually
fairly easy to detect.

This is probably justified purely on the basis that we never want to
delete the wrong volume (DeleteDisk), but also because this means that
we now warn on creation if a disk is in multiple zones (with the
labeling admission controller).

This also means that, with the scheduling predicate in place, many of our
volume problems "go away" in practice: you still can't create or delete a
volume when its name is ambiguous, but thereafter the volume will be
labeled with the zone, which will match it only to nodes in the same zone,
and then we query for the volume in that zone when we attach/detach it.
---
 pkg/cloudprovider/providers/gce/gce.go | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/pkg/cloudprovider/providers/gce/gce.go b/pkg/cloudprovider/providers/gce/gce.go
index 8ce4b960e86..9da69182411 100644
--- a/pkg/cloudprovider/providers/gce/gce.go
+++ b/pkg/cloudprovider/providers/gce/gce.go
@@ -1764,14 +1764,20 @@ func (gce *GCECloud) getDiskByNameUnknownZone(diskName string) (*gceDisk, error)
 	// "us-central1-a/mydisk".  We could do this for them as part of
 	// admission control, but that might be a little weird (values changing
 	// on create)
+
+	var found *gceDisk
 	for _, zone := range gce.managedZones {
 		disk, err := gce.findDiskByName(diskName, zone)
 		if err != nil {
 			return nil, err
 		}
-		if disk != nil {
-			return disk, nil
+		if found != nil {
+			return nil, fmt.Errorf("GCE persistent disk name was found in multiple zones: %q", diskName)
 		}
+		found = disk
+	}
+	if found != nil {
+		return found, nil
 	}
 	return nil, fmt.Errorf("GCE persistent disk not found: %q", diskName)
 }
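[Illustration, not part of the patch series]

This patch replaces "return the first zone that has the disk" with an
ambiguity check: remember the first match and fail if a second managed zone
also has a disk with that name. The sketch below reimplements the idea over a
plain map so it runs on its own; the zone and disk names are made up, and the
map stands in for findDiskByName, which in the real provider reports "not in
this zone" rather than an error, so only genuine matches should count toward
the duplicate check.

    package main

    import "fmt"

    // findDiskZone returns the single zone containing diskName, or an error if
    // the name is ambiguous (present in several zones) or not found at all.
    func findDiskZone(diskName string, disksByZone map[string][]string) (string, error) {
        foundZone := ""
        for zone, disks := range disksByZone {
            for _, d := range disks {
                if d != diskName {
                    continue // no match in this zone entry; keep scanning
                }
                if foundZone != "" {
                    return "", fmt.Errorf("GCE persistent disk name was found in multiple zones: %q", diskName)
                }
                foundZone = zone
            }
        }
        if foundZone == "" {
            return "", fmt.Errorf("GCE persistent disk not found: %q", diskName)
        }
        return foundZone, nil
    }

    func main() {
        disks := map[string][]string{
            "us-central1-a": {"pd-unique", "pd-duplicate"},
            "us-central1-b": {"pd-duplicate"},
        }
        fmt.Println(findDiskZone("pd-unique", disks))    // us-central1-a <nil>
        fmt.Println(findDiskZone("pd-duplicate", disks)) // error: found in multiple zones
        fmt.Println(findDiskZone("pd-missing", disks))   // error: not found
    }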
From 88eeec4f13579c3c80be1ee59143c66af7e50d91 Mon Sep 17 00:00:00 2001
From: Justin Santa Barbara
Date: Fri, 22 Jan 2016 12:26:55 -0500
Subject: [PATCH 3/3] GCE: Register the PersistentVolumeLabel admission controller

---
 cluster/gce/config-default.sh | 2 +-
 cluster/gce/config-test.sh    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh
index fcbcdc4dc29..dfd53a04a39 100755
--- a/cluster/gce/config-default.sh
+++ b/cluster/gce/config-default.sh
@@ -107,7 +107,7 @@ if [[ "${ENABLE_NODE_AUTOSCALER}" == "true" ]]; then
 fi
 
 # Admission Controllers to invoke prior to persisting objects in cluster
-ADMISSION_CONTROL=NamespaceLifecycle,LimitRanger,ServiceAccount,ResourceQuota
+ADMISSION_CONTROL=NamespaceLifecycle,LimitRanger,ServiceAccount,ResourceQuota,PersistentVolumeLabel
 
 # Optional: if set to true kube-up will automatically check for existing resources and clean them up.
 KUBE_UP_AUTOMATIC_CLEANUP=${KUBE_UP_AUTOMATIC_CLEANUP:-false}
diff --git a/cluster/gce/config-test.sh b/cluster/gce/config-test.sh
index 708c53d53a4..cb856194d76 100755
--- a/cluster/gce/config-test.sh
+++ b/cluster/gce/config-test.sh
@@ -120,7 +120,7 @@ if [[ "${ENABLE_NODE_AUTOSCALER}" == "true" ]]; then
   TARGET_NODE_UTILIZATION="${KUBE_TARGET_NODE_UTILIZATION:-0.7}"
 fi
 
-ADMISSION_CONTROL="${KUBE_ADMISSION_CONTROL:-NamespaceLifecycle,LimitRanger,SecurityContextDeny,ServiceAccount,ResourceQuota}"
+ADMISSION_CONTROL="${KUBE_ADMISSION_CONTROL:-NamespaceLifecycle,LimitRanger,SecurityContextDeny,ServiceAccount,ResourceQuota,PersistentVolumeLabel}"
 
 # Optional: if set to true kube-up will automatically check for existing resources and clean them up.
 KUBE_UP_AUTOMATIC_CLEANUP=${KUBE_UP_AUTOMATIC_CLEANUP:-false}
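[Illustration, not part of the patch series]

This last patch only touches the GCE kube-up configuration; the
ADMISSION_CONTROL value these scripts set is what ultimately becomes the
kube-apiserver admission-control list. For a cluster started some other way,
the equivalent change would be to append PersistentVolumeLabel to that list on
the API server command line (flag spelling as it existed in this era of
Kubernetes; other flags elided):

    kube-apiserver ... --admission-control=NamespaceLifecycle,LimitRanger,ServiceAccount,ResourceQuota,PersistentVolumeLabel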