diff --git a/pkg/controller/volume/events/event.go b/pkg/controller/volume/events/event.go index b5ac3fb7be8..c99c30c99a7 100644 --- a/pkg/controller/volume/events/event.go +++ b/pkg/controller/volume/events/event.go @@ -29,4 +29,5 @@ const ( ProvisioningFailed = "ProvisioningFailed" ProvisioningCleanupFailed = "ProvisioningCleanupFailed" ProvisioningSucceeded = "ProvisioningSucceeded" + WaitForFirstConsumer = "WaitForFirstConsumer" ) diff --git a/pkg/controller/volume/persistentvolume/BUILD b/pkg/controller/volume/persistentvolume/BUILD index 6fef2bf3839..01ab29857e6 100644 --- a/pkg/controller/volume/persistentvolume/BUILD +++ b/pkg/controller/volume/persistentvolume/BUILD @@ -12,6 +12,10 @@ go_library( "index.go", "pv_controller.go", "pv_controller_base.go", + "scheduler_assume_cache.go", + "scheduler_binder.go", + "scheduler_binder_cache.go", + "scheduler_binder_fake.go", "volume_host.go", ], importpath = "k8s.io/kubernetes/pkg/controller/volume/persistentvolume", @@ -63,12 +67,16 @@ go_test( "provision_test.go", "pv_controller_test.go", "recycle_test.go", + "scheduler_assume_cache_test.go", + "scheduler_binder_cache_test.go", + "scheduler_binder_test.go", ], importpath = "k8s.io/kubernetes/pkg/controller/volume/persistentvolume", library = ":go_default_library", deps = [ "//pkg/api/testapi:go_default_library", "//pkg/apis/core:go_default_library", + "//pkg/apis/core/v1/helper:go_default_library", "//pkg/controller:go_default_library", "//pkg/volume:go_default_library", "//vendor/github.com/golang/glog:go_default_library", diff --git a/pkg/controller/volume/persistentvolume/binder_test.go b/pkg/controller/volume/persistentvolume/binder_test.go index 8288cd0ca82..a34b595747f 100644 --- a/pkg/controller/volume/persistentvolume/binder_test.go +++ b/pkg/controller/volume/persistentvolume/binder_test.go @@ -21,6 +21,7 @@ import ( "k8s.io/api/core/v1" storage "k8s.io/api/storage/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" utilfeature "k8s.io/apiserver/pkg/util/feature" ) @@ -178,6 +179,25 @@ func TestSync(t *testing.T) { []string{"Normal FailedBinding"}, noerrors, testSyncClaim, }, + { + // syncClaim does not do anything when binding is delayed + "1-13 - delayed binding", + newVolumeArray("volume1-1", "1Gi", "", "", v1.VolumePending, v1.PersistentVolumeReclaimRetain, classWait), + newVolumeArray("volume1-1", "1Gi", "", "", v1.VolumePending, v1.PersistentVolumeReclaimRetain, classWait), + newClaimArray("claim1-1", "uid1-1", "1Gi", "", v1.ClaimPending, &classWait), + newClaimArray("claim1-1", "uid1-1", "1Gi", "", v1.ClaimPending, &classWait), + []string{"Normal WaitForFirstConsumer"}, + noerrors, testSyncClaim, + }, + { + // syncClaim binds when binding is delayed but PV is prebound to PVC + "1-14 - successful prebound PV", + newVolumeArray("volume1-1", "1Gi", "", "claim1-1", v1.VolumePending, v1.PersistentVolumeReclaimRetain, classWait), + newVolumeArray("volume1-1", "1Gi", "uid1-1", "claim1-1", v1.VolumeBound, v1.PersistentVolumeReclaimRetain, classWait), + newClaimArray("claim1-1", "uid1-1", "1Gi", "", v1.ClaimPending, &classWait), + newClaimArray("claim1-1", "uid1-1", "1Gi", "volume1-1", v1.ClaimBound, &classWait, annBoundByController, annBindCompleted), + noevents, noerrors, testSyncClaim, + }, // [Unit test set 2] User asked for a specific PV. 
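The two new rows above (1-13 and 1-14) only exercise the delayed path when the claim's StorageClass uses the WaitForFirstConsumer binding mode and the VolumeScheduling feature gate is enabled. A minimal sketch of that wiring, mirroring the runSyncTests and framework_test.go changes made further down in this patch (the local variable names here are illustrative only):

	// Enable the alpha feature gate, as the updated TestSync does below.
	utilfeature.DefaultFeatureGate.Set("VolumeScheduling=true")
	defer utilfeature.DefaultFeatureGate.Set("VolumeScheduling=false")

	// classWait is the new class-name constant added to framework_test.go.
	waitMode := storage.VolumeBindingWaitForFirstConsumer
	waitClass := &storage.StorageClass{
		ObjectMeta:        metav1.ObjectMeta{Name: classWait},
		VolumeBindingMode: &waitMode,
	}
	runSyncTests(t, tests, []*storage.StorageClass{waitClass})
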
// Test the binding when pv.ClaimRef is already set by controller or @@ -570,7 +590,15 @@ func TestSync(t *testing.T) { }, } - runSyncTests(t, tests, []*storage.StorageClass{}) + utilfeature.DefaultFeatureGate.Set("VolumeScheduling=true") + defer utilfeature.DefaultFeatureGate.Set("VolumeScheduling=false") + + runSyncTests(t, tests, []*storage.StorageClass{ + { + ObjectMeta: metav1.ObjectMeta{Name: classWait}, + VolumeBindingMode: &modeWait, + }, + }) } func TestSyncAlphaBlockVolume(t *testing.T) { diff --git a/pkg/controller/volume/persistentvolume/framework_test.go b/pkg/controller/volume/persistentvolume/framework_test.go index d826947191d..5c165abd946 100644 --- a/pkg/controller/volume/persistentvolume/framework_test.go +++ b/pkg/controller/volume/persistentvolume/framework_test.go @@ -196,6 +196,8 @@ func (r *volumeReactor) React(action core.Action) (handled bool, ret runtime.Obj if storedVer != requestedVer { return true, obj, versionConflictError } + // Don't modify the existing object + volume = volume.DeepCopy() volume.ResourceVersion = strconv.Itoa(storedVer + 1) } else { return true, nil, fmt.Errorf("Cannot update volume %s: volume not found", volume.Name) @@ -220,6 +222,8 @@ func (r *volumeReactor) React(action core.Action) (handled bool, ret runtime.Obj if storedVer != requestedVer { return true, obj, versionConflictError } + // Don't modify the existing object + claim = claim.DeepCopy() claim.ResourceVersion = strconv.Itoa(storedVer + 1) } else { return true, nil, fmt.Errorf("Cannot update claim %s: claim not found", claim.Name) @@ -301,7 +305,12 @@ func (r *volumeReactor) checkVolumes(expectedVolumes []*v1.PersistentVolume) err gotMap := make(map[string]*v1.PersistentVolume) // Clear any ResourceVersion from both sets for _, v := range expectedVolumes { + // Don't modify the existing object + v := v.DeepCopy() v.ResourceVersion = "" + if v.Spec.ClaimRef != nil { + v.Spec.ClaimRef.ResourceVersion = "" + } expectedMap[v.Name] = v } for _, v := range r.volumes { @@ -331,6 +340,8 @@ func (r *volumeReactor) checkClaims(expectedClaims []*v1.PersistentVolumeClaim) expectedMap := make(map[string]*v1.PersistentVolumeClaim) gotMap := make(map[string]*v1.PersistentVolumeClaim) for _, c := range expectedClaims { + // Don't modify the existing object + c = c.DeepCopy() c.ResourceVersion = "" expectedMap[c.Name] = c } @@ -822,6 +833,9 @@ var ( classUnknownInternal string = "unknown-internal" classUnsupportedMountOptions string = "unsupported-mountoptions" classLarge string = "large" + classWait string = "wait" + + modeWait = storage.VolumeBindingWaitForFirstConsumer ) // wrapTestWithPluginCalls returns a testCall that: diff --git a/pkg/controller/volume/persistentvolume/index.go b/pkg/controller/volume/persistentvolume/index.go index bf356f41f7c..dd652471d7a 100644 --- a/pkg/controller/volume/persistentvolume/index.go +++ b/pkg/controller/volume/persistentvolume/index.go @@ -29,6 +29,7 @@ import ( v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/volume" + volumeutil "k8s.io/kubernetes/pkg/volume/util" ) // persistentVolumeOrderedIndex is a cache.Store that keeps persistent volumes @@ -74,7 +75,7 @@ func (pvIndex *persistentVolumeOrderedIndex) listByAccessModes(modes []v1.Persis } // find returns the nearest PV from the ordered list or nil if a match is not found -func (pvIndex *persistentVolumeOrderedIndex) findByClaim(claim *v1.PersistentVolumeClaim) (*v1.PersistentVolume, error) { +func (pvIndex *persistentVolumeOrderedIndex) 
findByClaim(claim *v1.PersistentVolumeClaim, delayBinding bool) (*v1.PersistentVolume, error) { // PVs are indexed by their access modes to allow easier searching. Each // index is the string representation of a set of access modes. There is a // finite number of possible sets and PVs will only be indexed in one of @@ -90,6 +91,45 @@ func (pvIndex *persistentVolumeOrderedIndex) findByClaim(claim *v1.PersistentVol // example above). allPossibleModes := pvIndex.allPossibleMatchingAccessModes(claim.Spec.AccessModes) + for _, modes := range allPossibleModes { + volumes, err := pvIndex.listByAccessModes(modes) + if err != nil { + return nil, err + } + + bestVol, err := findMatchingVolume(claim, volumes, nil /* node for topology binding*/, nil /* exclusion map */, delayBinding) + if err != nil { + return nil, err + } + + if bestVol != nil { + return bestVol, nil + } + } + return nil, nil +} + +// findMatchingVolume goes through the list of volumes to find the best matching volume +// for the claim. +// +// This function is used by both the PV controller and scheduler. +// +// delayBinding is true only in the PV controller path. When set, prebound PVs are still returned +// as a match for the claim, but unbound PVs are skipped. +// +// node is set only in the scheduler path. When set, the PV node affinity is checked against +// the node's labels. +// +// excludedVolumes is only used in the scheduler path, and is needed for evaluating multiple +// unbound PVCs for a single Pod at one time. As each PVC finds a matching PV, the chosen +// PV needs to be excluded from future matching. +func findMatchingVolume( + claim *v1.PersistentVolumeClaim, + volumes []*v1.PersistentVolume, + node *v1.Node, + excludedVolumes map[string]*v1.PersistentVolume, + delayBinding bool) (*v1.PersistentVolume, error) { + var smallestVolume *v1.PersistentVolume var smallestVolumeQty resource.Quantity requestedQty := claim.Spec.Resources.Requests[v1.ResourceName(v1.ResourceStorage)] @@ -105,67 +145,90 @@ func (pvIndex *persistentVolumeOrderedIndex) findByClaim(claim *v1.PersistentVol selector = internalSelector } - for _, modes := range allPossibleModes { - volumes, err := pvIndex.listByAccessModes(modes) + // Go through all available volumes with two goals: + // - find a volume that is either pre-bound by user or dynamically + // provisioned for this claim. Because of this we need to loop through + // all volumes. + // - find the smallest matching one if there is no volume pre-bound to + // the claim. + for _, volume := range volumes { + if _, ok := excludedVolumes[volume.Name]; ok { + // Skip volumes in the excluded list + continue + } + + volumeQty := volume.Spec.Capacity[v1.ResourceStorage] + + // check if volumeModes do not match (Alpha and feature gate protected) + isMisMatch, err := checkVolumeModeMisMatches(&claim.Spec, &volume.Spec) if err != nil { - return nil, err + return nil, fmt.Errorf("error checking if volumeMode was a mismatch: %v", err) + } + // filter out mismatching volumeModes + if isMisMatch { + continue } - // Go through all available volumes with two goals: - // - find a volume that is either pre-bound by user or dynamically - // provisioned for this claim. Because of this we need to loop through - // all volumes. - // - find the smallest matching one if there is no volume pre-bound to - // the claim. 
- for _, volume := range volumes { - // check if volumeModes do not match (Alpha and feature gate protected) - isMisMatch, err := checkVolumeModeMisMatches(&claim.Spec, &volume.Spec) + if node != nil { + // Scheduler path, check that the PV NodeAffinity + // is satisfied by the node + err := volumeutil.CheckNodeAffinity(volume, node.Labels) if err != nil { - return nil, fmt.Errorf("error checking if volumeMode was a mismatch: %v", err) - } - // filter out mismatching volumeModes - if isMisMatch { continue } - - if isVolumeBoundToClaim(volume, claim) { - // this claim and volume are pre-bound; return - // the volume if the size request is satisfied, - // otherwise continue searching for a match - volumeQty := volume.Spec.Capacity[v1.ResourceStorage] - if volumeQty.Cmp(requestedQty) < 0 { - continue - } - return volume, nil - } - - // filter out: - // - volumes bound to another claim - // - volumes whose labels don't match the claim's selector, if specified - // - volumes in Class that is not requested - if volume.Spec.ClaimRef != nil { - continue - } else if selector != nil && !selector.Matches(labels.Set(volume.Labels)) { - continue - } - if v1helper.GetPersistentVolumeClass(volume) != requestedClass { - continue - } - - volumeQty := volume.Spec.Capacity[v1.ResourceStorage] - if volumeQty.Cmp(requestedQty) >= 0 { - if smallestVolume == nil || smallestVolumeQty.Cmp(volumeQty) > 0 { - smallestVolume = volume - smallestVolumeQty = volumeQty - } - } } - if smallestVolume != nil { - // Found a matching volume - return smallestVolume, nil + if isVolumeBoundToClaim(volume, claim) { + // this claim and volume are pre-bound; return + // the volume if the size request is satisfied, + // otherwise continue searching for a match + if volumeQty.Cmp(requestedQty) < 0 { + continue + } + return volume, nil + } + + if node == nil && delayBinding { + // PV controller does not bind this claim. 
+ // Scheduler will handle binding unbound volumes + // Scheduler path will have node != nil + continue + } + + // filter out: + // - volumes bound to another claim + // - volumes whose labels don't match the claim's selector, if specified + // - volumes in Class that is not requested + if volume.Spec.ClaimRef != nil { + continue + } else if selector != nil && !selector.Matches(labels.Set(volume.Labels)) { + continue + } + if v1helper.GetPersistentVolumeClass(volume) != requestedClass { + continue + } + + if node != nil { + // Scheduler path + // Check that the access modes match + if !checkAccessModes(claim, volume) { + continue + } + } + + if volumeQty.Cmp(requestedQty) >= 0 { + if smallestVolume == nil || smallestVolumeQty.Cmp(volumeQty) > 0 { + smallestVolume = volume + smallestVolumeQty = volumeQty + } } } + + if smallestVolume != nil { + // Found a matching volume + return smallestVolume, nil + } + return nil, nil } @@ -191,8 +254,8 @@ func checkVolumeModeMisMatches(pvcSpec *v1.PersistentVolumeClaimSpec, pvSpec *v1 } // findBestMatchForClaim is a convenience method that finds a volume by the claim's AccessModes and requests for Storage -func (pvIndex *persistentVolumeOrderedIndex) findBestMatchForClaim(claim *v1.PersistentVolumeClaim) (*v1.PersistentVolume, error) { - return pvIndex.findByClaim(claim) +func (pvIndex *persistentVolumeOrderedIndex) findBestMatchForClaim(claim *v1.PersistentVolumeClaim, delayBinding bool) (*v1.PersistentVolume, error) { + return pvIndex.findByClaim(claim, delayBinding) } // allPossibleMatchingAccessModes returns an array of AccessMode arrays that @@ -274,3 +337,19 @@ func claimToClaimKey(claim *v1.PersistentVolumeClaim) string { func claimrefToClaimKey(claimref *v1.ObjectReference) string { return fmt.Sprintf("%s/%s", claimref.Namespace, claimref.Name) } + +// Returns true if PV satisfies all the PVC's requested AccessModes +func checkAccessModes(claim *v1.PersistentVolumeClaim, volume *v1.PersistentVolume) bool { + pvModesMap := map[v1.PersistentVolumeAccessMode]bool{} + for _, mode := range volume.Spec.AccessModes { + pvModesMap[mode] = true + } + + for _, mode := range claim.Spec.AccessModes { + _, ok := pvModesMap[mode] + if !ok { + return false + } + } + return true +} diff --git a/pkg/controller/volume/persistentvolume/index_test.go b/pkg/controller/volume/persistentvolume/index_test.go index a1edc3bb286..b734a67a991 100644 --- a/pkg/controller/volume/persistentvolume/index_test.go +++ b/pkg/controller/volume/persistentvolume/index_test.go @@ -20,6 +20,8 @@ import ( "sort" "testing" + "github.com/golang/glog" + "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -27,6 +29,7 @@ import ( "k8s.io/client-go/kubernetes/scheme" ref "k8s.io/client-go/tools/reference" "k8s.io/kubernetes/pkg/api/testapi" + "k8s.io/kubernetes/pkg/apis/core/v1/helper" "k8s.io/kubernetes/pkg/volume" ) @@ -178,7 +181,7 @@ func TestMatchVolume(t *testing.T) { } for name, scenario := range scenarios { - volume, err := volList.findBestMatchForClaim(scenario.claim) + volume, err := volList.findBestMatchForClaim(scenario.claim, false) if err != nil { t.Errorf("Unexpected error matching volume by claim: %v", err) } @@ -249,7 +252,7 @@ func TestMatchingWithBoundVolumes(t *testing.T) { }, } - volume, err := volumeIndex.findBestMatchForClaim(claim) + volume, err := volumeIndex.findBestMatchForClaim(claim, false) if err != nil { t.Fatalf("Unexpected error matching volume by claim: %v", err) } @@ -372,27 +375,27 @@ func 
TestFindingVolumeWithDifferentAccessModes(t *testing.T) { index.store.Add(ebs) index.store.Add(nfs) - volume, _ := index.findBestMatchForClaim(claim) + volume, _ := index.findBestMatchForClaim(claim, false) if volume.Name != ebs.Name { t.Errorf("Expected %s but got volume %s instead", ebs.Name, volume.Name) } claim.Spec.AccessModes = []v1.PersistentVolumeAccessMode{v1.ReadWriteOnce, v1.ReadOnlyMany} - volume, _ = index.findBestMatchForClaim(claim) + volume, _ = index.findBestMatchForClaim(claim, false) if volume.Name != gce.Name { t.Errorf("Expected %s but got volume %s instead", gce.Name, volume.Name) } // order of the requested modes should not matter claim.Spec.AccessModes = []v1.PersistentVolumeAccessMode{v1.ReadWriteMany, v1.ReadWriteOnce, v1.ReadOnlyMany} - volume, _ = index.findBestMatchForClaim(claim) + volume, _ = index.findBestMatchForClaim(claim, false) if volume.Name != nfs.Name { t.Errorf("Expected %s but got volume %s instead", nfs.Name, volume.Name) } // fewer modes requested should still match claim.Spec.AccessModes = []v1.PersistentVolumeAccessMode{v1.ReadWriteMany} - volume, _ = index.findBestMatchForClaim(claim) + volume, _ = index.findBestMatchForClaim(claim, false) if volume.Name != nfs.Name { t.Errorf("Expected %s but got volume %s instead", nfs.Name, volume.Name) } @@ -400,7 +403,7 @@ func TestFindingVolumeWithDifferentAccessModes(t *testing.T) { // pretend the exact match is bound. should get the next level up of modes. ebs.Spec.ClaimRef = &v1.ObjectReference{} claim.Spec.AccessModes = []v1.PersistentVolumeAccessMode{v1.ReadWriteOnce} - volume, _ = index.findBestMatchForClaim(claim) + volume, _ = index.findBestMatchForClaim(claim, false) if volume.Name != gce.Name { t.Errorf("Expected %s but got volume %s instead", gce.Name, volume.Name) } @@ -408,7 +411,7 @@ func TestFindingVolumeWithDifferentAccessModes(t *testing.T) { // continue up the levels of modes. 
gce.Spec.ClaimRef = &v1.ObjectReference{} claim.Spec.AccessModes = []v1.PersistentVolumeAccessMode{v1.ReadWriteOnce} - volume, _ = index.findBestMatchForClaim(claim) + volume, _ = index.findBestMatchForClaim(claim, false) if volume.Name != nfs.Name { t.Errorf("Expected %s but got volume %s instead", nfs.Name, volume.Name) } @@ -416,7 +419,7 @@ func TestFindingVolumeWithDifferentAccessModes(t *testing.T) { // partial mode request gce.Spec.ClaimRef = nil claim.Spec.AccessModes = []v1.PersistentVolumeAccessMode{v1.ReadOnlyMany} - volume, _ = index.findBestMatchForClaim(claim) + volume, _ = index.findBestMatchForClaim(claim, false) if volume.Name != gce.Name { t.Errorf("Expected %s but got volume %s instead", gce.Name, volume.Name) } @@ -675,6 +678,87 @@ func createTestVolumes() []*v1.PersistentVolume { StorageClassName: classLarge, }, }, + { + ObjectMeta: metav1.ObjectMeta{ + UID: "affinity-pv", + Name: "affinity001", + Annotations: getAnnotationWithNodeAffinity("key1", "value1"), + }, + Spec: v1.PersistentVolumeSpec{ + Capacity: v1.ResourceList{ + v1.ResourceName(v1.ResourceStorage): resource.MustParse("100G"), + }, + PersistentVolumeSource: v1.PersistentVolumeSource{ + Local: &v1.LocalVolumeSource{}, + }, + AccessModes: []v1.PersistentVolumeAccessMode{ + v1.ReadWriteOnce, + v1.ReadOnlyMany, + }, + StorageClassName: classWait, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + UID: "affinity-pv2", + Name: "affinity002", + Annotations: getAnnotationWithNodeAffinity("key1", "value1"), + }, + Spec: v1.PersistentVolumeSpec{ + Capacity: v1.ResourceList{ + v1.ResourceName(v1.ResourceStorage): resource.MustParse("150G"), + }, + PersistentVolumeSource: v1.PersistentVolumeSource{ + Local: &v1.LocalVolumeSource{}, + }, + AccessModes: []v1.PersistentVolumeAccessMode{ + v1.ReadWriteOnce, + v1.ReadOnlyMany, + }, + StorageClassName: classWait, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + UID: "affinity-prebound", + Name: "affinity003", + Annotations: getAnnotationWithNodeAffinity("key1", "value1"), + }, + Spec: v1.PersistentVolumeSpec{ + Capacity: v1.ResourceList{ + v1.ResourceName(v1.ResourceStorage): resource.MustParse("100G"), + }, + PersistentVolumeSource: v1.PersistentVolumeSource{ + Local: &v1.LocalVolumeSource{}, + }, + AccessModes: []v1.PersistentVolumeAccessMode{ + v1.ReadWriteOnce, + v1.ReadOnlyMany, + }, + StorageClassName: classWait, + ClaimRef: &v1.ObjectReference{Name: "claim02", Namespace: "myns"}, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + UID: "affinity-pv3", + Name: "affinity003", + Annotations: getAnnotationWithNodeAffinity("key1", "value3"), + }, + Spec: v1.PersistentVolumeSpec{ + Capacity: v1.ResourceList{ + v1.ResourceName(v1.ResourceStorage): resource.MustParse("200G"), + }, + PersistentVolumeSource: v1.PersistentVolumeSource{ + Local: &v1.LocalVolumeSource{}, + }, + AccessModes: []v1.PersistentVolumeAccessMode{ + v1.ReadWriteOnce, + v1.ReadOnlyMany, + }, + StorageClassName: classWait, + }, + }, } } @@ -692,6 +776,32 @@ func testVolume(name, size string) *v1.PersistentVolume { } } +func getAnnotationWithNodeAffinity(key string, value string) map[string]string { + affinity := &v1.NodeAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{ + NodeSelectorTerms: []v1.NodeSelectorTerm{ + { + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: key, + Operator: v1.NodeSelectorOpIn, + Values: []string{value}, + }, + }, + }, + }, + }, + } + + annotations := map[string]string{} + err := helper.StorageNodeAffinityToAlphaAnnotation(annotations, affinity) + 
if err != nil { + glog.Fatalf("Failed to get node affinity annotation: %v", err) + } + + return annotations +} + func createVolumeModeBlockTestVolume() *v1.PersistentVolume { blockMode := v1.PersistentVolumeBlock @@ -919,7 +1029,7 @@ func TestAlphaFilteringVolumeModes(t *testing.T) { for name, scenario := range scenarios { toggleBlockVolumeFeature(scenario.enableBlock, t) - pvmatch, err := scenario.vol.findBestMatchForClaim(scenario.pvc) + pvmatch, err := scenario.vol.findBestMatchForClaim(scenario.pvc, false) // expected to match but either got an error or no returned pvmatch if pvmatch == nil && scenario.isExpectedMatch { t.Errorf("Unexpected failure for scenario, no matching volume: %s", name) @@ -972,14 +1082,14 @@ func TestFindingPreboundVolumes(t *testing.T) { index.store.Add(pvBadMode) // expected exact match on size - volume, _ := index.findBestMatchForClaim(claim) + volume, _ := index.findBestMatchForClaim(claim, false) if volume.Name != pv1.Name { t.Errorf("Expected %s but got volume %s instead", pv1.Name, volume.Name) } // pretend the exact match is pre-bound. should get the next size up. pv1.Spec.ClaimRef = &v1.ObjectReference{Name: "foo", Namespace: "bar"} - volume, _ = index.findBestMatchForClaim(claim) + volume, _ = index.findBestMatchForClaim(claim, false) if volume.Name != pv5.Name { t.Errorf("Expected %s but got volume %s instead", pv5.Name, volume.Name) } @@ -987,7 +1097,7 @@ func TestFindingPreboundVolumes(t *testing.T) { // pretend the exact match is available but the largest volume is pre-bound to the claim. pv1.Spec.ClaimRef = nil pv8.Spec.ClaimRef = claimRef - volume, _ = index.findBestMatchForClaim(claim) + volume, _ = index.findBestMatchForClaim(claim, false) if volume.Name != pv8.Name { t.Errorf("Expected %s but got volume %s instead", pv8.Name, volume.Name) } @@ -995,7 +1105,7 @@ func TestFindingPreboundVolumes(t *testing.T) { // pretend the volume with too small a size is pre-bound to the claim. should get the exact match. pv8.Spec.ClaimRef = nil pvBadSize.Spec.ClaimRef = claimRef - volume, _ = index.findBestMatchForClaim(claim) + volume, _ = index.findBestMatchForClaim(claim, false) if volume.Name != pv1.Name { t.Errorf("Expected %s but got volume %s instead", pv1.Name, volume.Name) } @@ -1003,12 +1113,186 @@ func TestFindingPreboundVolumes(t *testing.T) { // pretend the volume without the right access mode is pre-bound to the claim. should get the exact match. 
pvBadSize.Spec.ClaimRef = nil pvBadMode.Spec.ClaimRef = claimRef - volume, _ = index.findBestMatchForClaim(claim) + volume, _ = index.findBestMatchForClaim(claim, false) if volume.Name != pv1.Name { t.Errorf("Expected %s but got volume %s instead", pv1.Name, volume.Name) } } +func TestBestMatchDelayed(t *testing.T) { + volList := newPersistentVolumeOrderedIndex() + for _, pv := range createTestVolumes() { + volList.store.Add(pv) + } + + // binding through PV controller should be delayed + claim := makePVC("8G", nil) + volume, err := volList.findBestMatchForClaim(claim, true) + if err != nil { + t.Errorf("Unexpected error matching volume by claim: %v", err) + } + if volume != nil { + t.Errorf("Unexpected match with %q", volume.UID) + } +} + +func TestFindMatchVolumeWithNode(t *testing.T) { + volumes := createTestVolumes() + node1 := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"key1": "value1"}, + }, + } + node2 := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"key1": "value2"}, + }, + } + node3 := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"key1": "value3"}, + }, + } + + scenarios := map[string]struct { + expectedMatch string + claim *v1.PersistentVolumeClaim + node *v1.Node + excludedVolumes map[string]*v1.PersistentVolume + }{ + "success-match": { + expectedMatch: "affinity-pv", + claim: makePVC("100G", func(pvc *v1.PersistentVolumeClaim) { + pvc.Spec.AccessModes = []v1.PersistentVolumeAccessMode{v1.ReadWriteOnce} + pvc.Spec.StorageClassName = &classWait + }), + node: node1, + }, + "success-prebound": { + expectedMatch: "affinity-prebound", + claim: makePVC("100G", func(pvc *v1.PersistentVolumeClaim) { + pvc.Spec.AccessModes = []v1.PersistentVolumeAccessMode{v1.ReadWriteOnce} + pvc.Spec.StorageClassName = &classWait + pvc.Name = "claim02" + }), + node: node1, + }, + "success-exclusion": { + expectedMatch: "affinity-pv2", + claim: makePVC("100G", func(pvc *v1.PersistentVolumeClaim) { + pvc.Spec.AccessModes = []v1.PersistentVolumeAccessMode{v1.ReadWriteOnce} + pvc.Spec.StorageClassName = &classWait + }), + node: node1, + excludedVolumes: map[string]*v1.PersistentVolume{"affinity001": nil}, + }, + "fail-exclusion": { + expectedMatch: "", + claim: makePVC("100G", func(pvc *v1.PersistentVolumeClaim) { + pvc.Spec.AccessModes = []v1.PersistentVolumeAccessMode{v1.ReadWriteOnce} + pvc.Spec.StorageClassName = &classWait + }), + node: node1, + excludedVolumes: map[string]*v1.PersistentVolume{"affinity001": nil, "affinity002": nil}, + }, + "fail-accessmode": { + expectedMatch: "", + claim: makePVC("100G", func(pvc *v1.PersistentVolumeClaim) { + pvc.Spec.AccessModes = []v1.PersistentVolumeAccessMode{v1.ReadWriteMany} + pvc.Spec.StorageClassName = &classWait + }), + node: node1, + }, + "fail-nodeaffinity": { + expectedMatch: "", + claim: makePVC("100G", func(pvc *v1.PersistentVolumeClaim) { + pvc.Spec.AccessModes = []v1.PersistentVolumeAccessMode{v1.ReadWriteOnce} + pvc.Spec.StorageClassName = &classWait + }), + node: node2, + }, + "fail-prebound-node-affinity": { + expectedMatch: "", + claim: makePVC("100G", func(pvc *v1.PersistentVolumeClaim) { + pvc.Spec.AccessModes = []v1.PersistentVolumeAccessMode{v1.ReadWriteOnce} + pvc.Spec.StorageClassName = &classWait + pvc.Name = "claim02" + }), + node: node2, + }, + "success-bad-and-good-node-affinity": { + expectedMatch: "affinity-pv3", + claim: makePVC("100G", func(pvc *v1.PersistentVolumeClaim) { + pvc.Spec.AccessModes = []v1.PersistentVolumeAccessMode{v1.ReadWriteOnce} + 
pvc.Spec.StorageClassName = &classWait + pvc.Name = "claim03" + }), + node: node3, + }, + } + + for name, scenario := range scenarios { + volume, err := findMatchingVolume(scenario.claim, volumes, scenario.node, scenario.excludedVolumes, true) + if err != nil { + t.Errorf("Unexpected error matching volume by claim: %v", err) + } + if len(scenario.expectedMatch) != 0 && volume == nil { + t.Errorf("Expected match but received nil volume for scenario: %s", name) + } + if len(scenario.expectedMatch) != 0 && volume != nil && string(volume.UID) != scenario.expectedMatch { + t.Errorf("Expected %s but got volume %s in scenario %s", scenario.expectedMatch, volume.UID, name) + } + if len(scenario.expectedMatch) == 0 && volume != nil { + t.Errorf("Unexpected match for scenario: %s, matched with %s instead", name, volume.UID) + } + } +} + +func TestCheckAccessModes(t *testing.T) { + volume := &v1.PersistentVolume{ + Spec: v1.PersistentVolumeSpec{ + AccessModes: []v1.PersistentVolumeAccessMode{v1.ReadWriteOnce, v1.ReadWriteMany}, + }, + } + + scenarios := map[string]struct { + shouldSucceed bool + claim *v1.PersistentVolumeClaim + }{ + "success-single-mode": { + shouldSucceed: true, + claim: makePVC("100G", func(pvc *v1.PersistentVolumeClaim) { + pvc.Spec.AccessModes = []v1.PersistentVolumeAccessMode{v1.ReadWriteMany} + }), + }, + "success-many-modes": { + shouldSucceed: true, + claim: makePVC("100G", func(pvc *v1.PersistentVolumeClaim) { + pvc.Spec.AccessModes = []v1.PersistentVolumeAccessMode{v1.ReadWriteMany, v1.ReadWriteOnce} + }), + }, + "fail-single-mode": { + shouldSucceed: false, + claim: makePVC("100G", func(pvc *v1.PersistentVolumeClaim) { + pvc.Spec.AccessModes = []v1.PersistentVolumeAccessMode{v1.ReadOnlyMany} + }), + }, + "fail-many-modes": { + shouldSucceed: false, + claim: makePVC("100G", func(pvc *v1.PersistentVolumeClaim) { + pvc.Spec.AccessModes = []v1.PersistentVolumeAccessMode{v1.ReadWriteMany, v1.ReadOnlyMany} + }), + }, + } + + for name, scenario := range scenarios { + result := checkAccessModes(scenario.claim, volume) + if result != scenario.shouldSucceed { + t.Errorf("Test %q failed: Expected %v, got %v", name, scenario.shouldSucceed, result) + } + } +} + // byCapacity is used to order volumes by ascending storage size type byCapacity struct { volumes []*v1.PersistentVolume diff --git a/pkg/controller/volume/persistentvolume/pv_controller.go b/pkg/controller/volume/persistentvolume/pv_controller.go index 20e10a28b78..980d960c750 100644 --- a/pkg/controller/volume/persistentvolume/pv_controller.go +++ b/pkg/controller/volume/persistentvolume/pv_controller.go @@ -26,6 +26,7 @@ import ( storage "k8s.io/api/storage/v1" apierrs "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilfeature "k8s.io/apiserver/pkg/util/feature" clientset "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" corelisters "k8s.io/client-go/listers/core/v1" @@ -37,6 +38,7 @@ import ( v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" "k8s.io/kubernetes/pkg/cloudprovider" "k8s.io/kubernetes/pkg/controller/volume/events" + "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/util/goroutinemap" "k8s.io/kubernetes/pkg/util/goroutinemap/exponentialbackoff" vol "k8s.io/kubernetes/pkg/volume" @@ -254,6 +256,30 @@ func checkVolumeSatisfyClaim(volume *v1.PersistentVolume, claim *v1.PersistentVo return nil } +func (ctrl *PersistentVolumeController) shouldDelayBinding(claim *v1.PersistentVolumeClaim) (bool, error) { + if 
!utilfeature.DefaultFeatureGate.Enabled(features.VolumeScheduling) { + return false, nil + } + + className := v1helper.GetPersistentVolumeClaimClass(claim) + if className == "" { + return false, nil + } + + class, err := ctrl.classLister.Get(className) + if err != nil { + return false, nil + } + + if class.VolumeBindingMode == nil { + return false, fmt.Errorf("VolumeBindingMode not set for StorageClass %q", className) + } + + // TODO: add check to handle dynamic provisioning later + + return *class.VolumeBindingMode == storage.VolumeBindingWaitForFirstConsumer, nil +} + // syncUnboundClaim is the main controller method to decide what to do with an // unbound claim. func (ctrl *PersistentVolumeController) syncUnboundClaim(claim *v1.PersistentVolumeClaim) error { @@ -261,9 +287,13 @@ func (ctrl *PersistentVolumeController) syncUnboundClaim(claim *v1.PersistentVol // OBSERVATION: pvc is "Pending" if claim.Spec.VolumeName == "" { // User did not care which PV they get. + delayBinding, err := ctrl.shouldDelayBinding(claim) + if err != nil { + return err + } // [Unit test set 1] - volume, err := ctrl.volumes.findBestMatchForClaim(claim) + volume, err := ctrl.volumes.findBestMatchForClaim(claim, delayBinding) if err != nil { glog.V(2).Infof("synchronizing unbound PersistentVolumeClaim[%s]: Error finding PV for claim: %v", claimToClaimKey(claim), err) return fmt.Errorf("Error finding PV for claim %q: %v", claimToClaimKey(claim), err) @@ -272,15 +302,21 @@ func (ctrl *PersistentVolumeController) syncUnboundClaim(claim *v1.PersistentVol glog.V(4).Infof("synchronizing unbound PersistentVolumeClaim[%s]: no volume found", claimToClaimKey(claim)) // No PV could be found // OBSERVATION: pvc is "Pending", will retry - if v1helper.GetPersistentVolumeClaimClass(claim) != "" { + switch { + case delayBinding: + // TODO: Skip dynamic provisioning for now + ctrl.eventRecorder.Event(claim, v1.EventTypeNormal, events.WaitForFirstConsumer, "waiting for first consumer to be created before binding") + case v1helper.GetPersistentVolumeClaimClass(claim) != "": if err = ctrl.provisionClaim(claim); err != nil { return err } return nil + default: + ctrl.eventRecorder.Event(claim, v1.EventTypeNormal, events.FailedBinding, "no persistent volumes available for this claim and no storage class is set") } + // Mark the claim as Pending and try to find a match in the next // periodic syncClaim - ctrl.eventRecorder.Event(claim, v1.EventTypeNormal, events.FailedBinding, "no persistent volumes available for this claim and no storage class is set") if _, err = ctrl.updateClaimStatus(claim, v1.ClaimPending, nil); err != nil { return err } @@ -748,6 +784,42 @@ func (ctrl *PersistentVolumeController) updateVolumePhaseWithEvent(volume *v1.Pe func (ctrl *PersistentVolumeController) bindVolumeToClaim(volume *v1.PersistentVolume, claim *v1.PersistentVolumeClaim) (*v1.PersistentVolume, error) { glog.V(4).Infof("updating PersistentVolume[%s]: binding to %q", volume.Name, claimToClaimKey(claim)) + volumeClone, dirty, err := ctrl.getBindVolumeToClaim(volume, claim) + if err != nil { + return nil, err + } + + // Save the volume only if something was changed + if dirty { + return ctrl.updateBindVolumeToClaim(volumeClone, claim, true) + } + + glog.V(4).Infof("updating PersistentVolume[%s]: already bound to %q", volume.Name, claimToClaimKey(claim)) + return volume, nil +} + +// bindVolumeToClaim modifies given volume to be bound to a claim and saves it to +// API server. The claim is not modified in this method! 
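To make the refactor above easier to follow: bindVolumeToClaim is now split into getBindVolumeToClaim, which only computes the prebound PV object, and updateBindVolumeToClaim, which performs the API write and optionally refreshes the controller's cache. The sketch below is not part of the patch; it restates the controller-side call pattern, while the scheduler-side AssumePodVolumes/BindPodVolumes path later in this patch calls getBindVolumeToClaim, assumes the result in its PV cache, and defers updateBindVolumeToClaim(..., false) until binding.

	// Sketch only: how the PV controller path composes the two new helpers.
	func bindViaControllerSketch(ctrl *PersistentVolumeController, volume *v1.PersistentVolume, claim *v1.PersistentVolumeClaim) (*v1.PersistentVolume, error) {
		volumeClone, dirty, err := ctrl.getBindVolumeToClaim(volume, claim)
		if err != nil {
			return nil, err
		}
		if !dirty {
			// Already bound to this claim; nothing to write.
			return volume, nil
		}
		// Write the prebound PV and refresh the controller's internal cache.
		return ctrl.updateBindVolumeToClaim(volumeClone, claim, true /* updateCache */)
	}
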
+func (ctrl *PersistentVolumeController) updateBindVolumeToClaim(volumeClone *v1.PersistentVolume, claim *v1.PersistentVolumeClaim, updateCache bool) (*v1.PersistentVolume, error) { + glog.V(2).Infof("claim %q bound to volume %q", claimToClaimKey(claim), volumeClone.Name) + newVol, err := ctrl.kubeClient.Core().PersistentVolumes().Update(volumeClone) + if err != nil { + glog.V(4).Infof("updating PersistentVolume[%s]: binding to %q failed: %v", volumeClone.Name, claimToClaimKey(claim), err) + return newVol, err + } + if updateCache { + _, err = ctrl.storeVolumeUpdate(newVol) + if err != nil { + glog.V(4).Infof("updating PersistentVolume[%s]: cannot update internal cache: %v", volumeClone.Name, err) + return newVol, err + } + } + glog.V(4).Infof("updating PersistentVolume[%s]: bound to %q", newVol.Name, claimToClaimKey(claim)) + return newVol, nil +} + +// Get new PV object only, no API or cache update +func (ctrl *PersistentVolumeController) getBindVolumeToClaim(volume *v1.PersistentVolume, claim *v1.PersistentVolumeClaim) (*v1.PersistentVolume, bool, error) { dirty := false // Check if the volume was already bound (either by user or by controller) @@ -768,7 +840,7 @@ func (ctrl *PersistentVolumeController) bindVolumeToClaim(volume *v1.PersistentV claimRef, err := ref.GetReference(scheme.Scheme, claim) if err != nil { - return nil, fmt.Errorf("Unexpected error getting claim reference: %v", err) + return nil, false, fmt.Errorf("Unexpected error getting claim reference: %v", err) } volumeClone.Spec.ClaimRef = claimRef dirty = true @@ -780,25 +852,7 @@ func (ctrl *PersistentVolumeController) bindVolumeToClaim(volume *v1.PersistentV dirty = true } - // Save the volume only if something was changed - if dirty { - glog.V(2).Infof("claim %q bound to volume %q", claimToClaimKey(claim), volume.Name) - newVol, err := ctrl.kubeClient.CoreV1().PersistentVolumes().Update(volumeClone) - if err != nil { - glog.V(4).Infof("updating PersistentVolume[%s]: binding to %q failed: %v", volume.Name, claimToClaimKey(claim), err) - return newVol, err - } - _, err = ctrl.storeVolumeUpdate(newVol) - if err != nil { - glog.V(4).Infof("updating PersistentVolume[%s]: cannot update internal cache: %v", volume.Name, err) - return newVol, err - } - glog.V(4).Infof("updating PersistentVolume[%s]: bound to %q", newVol.Name, claimToClaimKey(claim)) - return newVol, nil - } - - glog.V(4).Infof("updating PersistentVolume[%s]: already bound to %q", volume.Name, claimToClaimKey(claim)) - return volume, nil + return volumeClone, dirty, nil } // bindClaimToVolume modifies the given claim to be bound to a volume and diff --git a/pkg/controller/volume/persistentvolume/pv_controller_test.go b/pkg/controller/volume/persistentvolume/pv_controller_test.go index 41d5ce4b831..5454fe26ece 100644 --- a/pkg/controller/volume/persistentvolume/pv_controller_test.go +++ b/pkg/controller/volume/persistentvolume/pv_controller_test.go @@ -21,8 +21,12 @@ import ( "time" "github.com/golang/glog" + "k8s.io/api/core/v1" + storagev1 "k8s.io/api/storage/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/watch" + utilfeature "k8s.io/apiserver/pkg/util/feature" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes/fake" core "k8s.io/client-go/testing" @@ -232,3 +236,106 @@ func addVolumeAnnotation(volume *v1.PersistentVolume, annName, annValue string) volume.Annotations[annName] = annValue return volume } + +func makePVCClass(scName *string) *v1.PersistentVolumeClaim { + return &v1.PersistentVolumeClaim{ + Spec: 
v1.PersistentVolumeClaimSpec{ + StorageClassName: scName, + }, + } +} + +func makeStorageClass(scName string, mode *storagev1.VolumeBindingMode) *storagev1.StorageClass { + return &storagev1.StorageClass{ + ObjectMeta: metav1.ObjectMeta{ + Name: scName, + }, + VolumeBindingMode: mode, + } +} + +func TestDelayBinding(t *testing.T) { + var ( + classNotHere = "not-here" + classNoMode = "no-mode" + classImmediateMode = "immediate-mode" + classWaitMode = "wait-mode" + + modeImmediate = storagev1.VolumeBindingImmediate + modeWait = storagev1.VolumeBindingWaitForFirstConsumer + ) + + tests := map[string]struct { + pvc *v1.PersistentVolumeClaim + shouldDelay bool + shouldFail bool + }{ + "nil-class": { + pvc: makePVCClass(nil), + shouldDelay: false, + }, + "class-not-found": { + pvc: makePVCClass(&classNotHere), + shouldDelay: false, + }, + "no-mode-class": { + pvc: makePVCClass(&classNoMode), + shouldDelay: false, + shouldFail: true, + }, + "immediate-mode-class": { + pvc: makePVCClass(&classImmediateMode), + shouldDelay: false, + }, + "wait-mode-class": { + pvc: makePVCClass(&classWaitMode), + shouldDelay: true, + }, + } + + classes := []*storagev1.StorageClass{ + makeStorageClass(classNoMode, nil), + makeStorageClass(classImmediateMode, &modeImmediate), + makeStorageClass(classWaitMode, &modeWait), + } + + client := &fake.Clientset{} + informerFactory := informers.NewSharedInformerFactory(client, controller.NoResyncPeriodFunc()) + classInformer := informerFactory.Storage().V1().StorageClasses() + ctrl := &PersistentVolumeController{ + classLister: classInformer.Lister(), + } + + for _, class := range classes { + if err := classInformer.Informer().GetIndexer().Add(class); err != nil { + t.Fatalf("Failed to add storage class %q: %v", class.Name, err) + } + } + + // When feature gate is disabled, should always be delayed + name := "feature-disabled" + shouldDelay, err := ctrl.shouldDelayBinding(makePVCClass(&classWaitMode)) + if err != nil { + t.Errorf("Test %q returned error: %v", name, err) + } + if shouldDelay { + t.Errorf("Test %q returned true, expected false", name) + } + + // Enable feature gate + utilfeature.DefaultFeatureGate.Set("VolumeScheduling=true") + defer utilfeature.DefaultFeatureGate.Set("VolumeScheduling=false") + + for name, test := range tests { + shouldDelay, err = ctrl.shouldDelayBinding(test.pvc) + if err != nil && !test.shouldFail { + t.Errorf("Test %q returned error: %v", name, err) + } + if err == nil && test.shouldFail { + t.Errorf("Test %q returned success, expected error", name) + } + if shouldDelay != test.shouldDelay { + t.Errorf("Test %q returned unexpected %v", name, test.shouldDelay) + } + } +} diff --git a/pkg/controller/volume/persistentvolume/scheduler_assume_cache.go b/pkg/controller/volume/persistentvolume/scheduler_assume_cache.go new file mode 100644 index 00000000000..28884004d7c --- /dev/null +++ b/pkg/controller/volume/persistentvolume/scheduler_assume_cache.go @@ -0,0 +1,318 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package persistentvolume + +import ( + "fmt" + "strconv" + "sync" + + "github.com/golang/glog" + + "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/client-go/tools/cache" +) + +// AssumeCache is a cache on top of the informer that allows for updating +// objects outside of informer events and also restoring the informer +// cache's version of the object. Objects are assumed to be +// Kubernetes API objects that implement meta.Interface +type AssumeCache interface { + // Assume updates the object in-memory only + Assume(obj interface{}) error + + // Restore the informer cache's version of the object + Restore(objName string) + + // Get the object by name + Get(objName string) (interface{}, error) + + // List all the objects in the cache + List() []interface{} +} + +type errWrongType struct { + typeName string + object interface{} +} + +func (e *errWrongType) Error() string { + return fmt.Sprintf("could not convert object to type %v: %+v", e.typeName, e.object) +} + +type errNotFound struct { + typeName string + objectName string +} + +func (e *errNotFound) Error() string { + return fmt.Sprintf("could not find %v %q", e.typeName, e.objectName) +} + +type errObjectName struct { + detailedErr error +} + +func (e *errObjectName) Error() string { + return fmt.Sprintf("failed to get object name: %v", e.detailedErr) +} + +// assumeCache stores two pointers to represent a single object: +// * The pointer to the informer object. +// * The pointer to the latest object, which could be the same as +// the informer object, or an in-memory object. +// +// An informer update always overrides the latest object pointer. +// +// Assume() only updates the latest object pointer. +// Restore() sets the latest object pointer back to the informer object. +// Get/List() always returns the latest object pointer. 
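A short sketch of the Assume/Restore lifecycle described by the comment above, written against the PVAssumeCache wrapper defined at the end of this file; the helper name, its parameters, and the error-handling shape are illustrative assumptions, not part of the patch.

	// assumeThenMaybeRestoreSketch shows the intended lifecycle for a PV that a
	// caller wants to prebind in memory before the API update completes.
	func assumeThenMaybeRestoreSketch(pvCache PVAssumeCache, updatedPV *v1.PersistentVolume, apiUpdate func(*v1.PersistentVolume) error) error {
		// Point the cache's "latest object" pointer at the in-memory copy so
		// that Get/List return the prebound PV from now on.
		if err := pvCache.Assume(updatedPV); err != nil {
			return err
		}
		// Attempt the real API update; if it fails, fall back to the informer's copy.
		if err := apiUpdate(updatedPV); err != nil {
			pvCache.Restore(updatedPV.Name)
			return err
		}
		// On success there is nothing to undo: the informer event for the new
		// ResourceVersion will replace both pointers when it arrives.
		return nil
	}
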
+type assumeCache struct { + mutex sync.Mutex + + // describes the object stored + description string + + // Stores objInfo pointers + store cache.Store +} + +type objInfo struct { + // name of the object + name string + + // Latest version of object could be cached-only or from informer + latestObj interface{} + + // Latest object from informer + apiObj interface{} +} + +func objInfoKeyFunc(obj interface{}) (string, error) { + objInfo, ok := obj.(*objInfo) + if !ok { + return "", &errWrongType{"objInfo", obj} + } + return objInfo.name, nil +} + +func NewAssumeCache(informer cache.SharedIndexInformer, description string) *assumeCache { + // TODO: index by storageclass + c := &assumeCache{store: cache.NewStore(objInfoKeyFunc), description: description} + + // Unit tests don't use informers + if informer != nil { + informer.AddEventHandler( + cache.ResourceEventHandlerFuncs{ + AddFunc: c.add, + UpdateFunc: c.update, + DeleteFunc: c.delete, + }, + ) + } + return c +} + +func (c *assumeCache) add(obj interface{}) { + if obj == nil { + return + } + + name, err := cache.MetaNamespaceKeyFunc(obj) + if err != nil { + glog.Errorf("add failed: %v", &errObjectName{err}) + return + } + + c.mutex.Lock() + defer c.mutex.Unlock() + + objInfo := &objInfo{name: name, latestObj: obj, apiObj: obj} + c.store.Update(objInfo) +} + +func (c *assumeCache) update(oldObj interface{}, newObj interface{}) { + c.add(newObj) +} + +func (c *assumeCache) delete(obj interface{}) { + if obj == nil { + return + } + + name, err := cache.MetaNamespaceKeyFunc(obj) + if err != nil { + glog.Errorf("delete failed: %v", &errObjectName{err}) + return + } + + c.mutex.Lock() + defer c.mutex.Unlock() + + objInfo := &objInfo{name: name} + err = c.store.Delete(objInfo) + if err != nil { + glog.Errorf("delete: failed to delete %v %v: %v", c.description, name, err) + } +} + +func (c *assumeCache) getObjVersion(name string, obj interface{}) (int64, error) { + objAccessor, err := meta.Accessor(obj) + if err != nil { + return -1, err + } + + objResourceVersion, err := strconv.ParseInt(objAccessor.GetResourceVersion(), 10, 64) + if err != nil { + return -1, fmt.Errorf("error parsing ResourceVersion %q for %v %q: %s", objAccessor.GetResourceVersion(), c.description, name, err) + } + return objResourceVersion, nil +} + +func (c *assumeCache) getObjInfo(name string) (*objInfo, error) { + obj, ok, err := c.store.GetByKey(name) + if err != nil { + return nil, err + } + if !ok { + return nil, &errNotFound{c.description, name} + } + + objInfo, ok := obj.(*objInfo) + if !ok { + return nil, &errWrongType{"objInfo", obj} + } + return objInfo, nil +} + +func (c *assumeCache) Get(objName string) (interface{}, error) { + c.mutex.Lock() + defer c.mutex.Unlock() + + objInfo, err := c.getObjInfo(objName) + if err != nil { + return nil, err + } + return objInfo.latestObj, nil +} + +func (c *assumeCache) List() []interface{} { + c.mutex.Lock() + defer c.mutex.Unlock() + + allObjs := []interface{}{} + for _, obj := range c.store.List() { + objInfo, ok := obj.(*objInfo) + if !ok { + glog.Errorf("list error: %v", &errWrongType{"objInfo", obj}) + continue + } + allObjs = append(allObjs, objInfo.latestObj) + } + return allObjs +} + +func (c *assumeCache) Assume(obj interface{}) error { + name, err := cache.MetaNamespaceKeyFunc(obj) + if err != nil { + return &errObjectName{err} + } + + c.mutex.Lock() + defer c.mutex.Unlock() + + objInfo, err := c.getObjInfo(name) + if err != nil { + return err + } + + newVersion, err := c.getObjVersion(name, obj) + if err != nil { 
+ return err + } + + storedVersion, err := c.getObjVersion(name, objInfo.latestObj) + if err != nil { + return err + } + + if newVersion < storedVersion { + return fmt.Errorf("%v %q is out of sync", c.description, name) + } + + // Only update the cached object + objInfo.latestObj = obj + glog.V(4).Infof("Assumed %v %q, version %v", c.description, name, newVersion) + return nil +} + +func (c *assumeCache) Restore(objName string) { + c.mutex.Lock() + defer c.mutex.Unlock() + + objInfo, err := c.getObjInfo(objName) + if err != nil { + // This could be expected if object got deleted + glog.V(5).Infof("Restore %v %q warning: %v", c.description, objName, err) + } else { + objInfo.latestObj = objInfo.apiObj + glog.V(4).Infof("Restored %v %q", c.description, objName) + } +} + +// PVAssumeCache is a AssumeCache for PersistentVolume objects +type PVAssumeCache interface { + AssumeCache + + GetPV(pvName string) (*v1.PersistentVolume, error) + ListPVs() []*v1.PersistentVolume +} + +type pvAssumeCache struct { + *assumeCache +} + +func NewPVAssumeCache(informer cache.SharedIndexInformer) PVAssumeCache { + return &pvAssumeCache{assumeCache: NewAssumeCache(informer, "v1.PersistentVolume")} +} + +func (c *pvAssumeCache) GetPV(pvName string) (*v1.PersistentVolume, error) { + obj, err := c.Get(pvName) + if err != nil { + return nil, err + } + + pv, ok := obj.(*v1.PersistentVolume) + if !ok { + return nil, &errWrongType{"v1.PersistentVolume", obj} + } + return pv, nil +} + +func (c *pvAssumeCache) ListPVs() []*v1.PersistentVolume { + objs := c.List() + pvs := []*v1.PersistentVolume{} + for _, obj := range objs { + pv, ok := obj.(*v1.PersistentVolume) + if !ok { + glog.Errorf("ListPVs: %v", &errWrongType{"v1.PersistentVolume", obj}) + } + pvs = append(pvs, pv) + } + return pvs +} diff --git a/pkg/controller/volume/persistentvolume/scheduler_assume_cache_test.go b/pkg/controller/volume/persistentvolume/scheduler_assume_cache_test.go new file mode 100644 index 00000000000..7332c4d474a --- /dev/null +++ b/pkg/controller/volume/persistentvolume/scheduler_assume_cache_test.go @@ -0,0 +1,212 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package persistentvolume + +import ( + "fmt" + "testing" + + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func makePV(name, version string) *v1.PersistentVolume { + return &v1.PersistentVolume{ObjectMeta: metav1.ObjectMeta{Name: name, ResourceVersion: version}} +} + +func TestAssumePV(t *testing.T) { + scenarios := map[string]struct { + oldPV *v1.PersistentVolume + newPV *v1.PersistentVolume + shouldSucceed bool + }{ + "success-same-version": { + oldPV: makePV("pv1", "5"), + newPV: makePV("pv1", "5"), + shouldSucceed: true, + }, + "success-new-higher-version": { + oldPV: makePV("pv1", "5"), + newPV: makePV("pv1", "6"), + shouldSucceed: true, + }, + "fail-old-not-found": { + oldPV: makePV("pv2", "5"), + newPV: makePV("pv1", "5"), + shouldSucceed: false, + }, + "fail-new-lower-version": { + oldPV: makePV("pv1", "5"), + newPV: makePV("pv1", "4"), + shouldSucceed: false, + }, + "fail-new-bad-version": { + oldPV: makePV("pv1", "5"), + newPV: makePV("pv1", "a"), + shouldSucceed: false, + }, + "fail-old-bad-version": { + oldPV: makePV("pv1", "a"), + newPV: makePV("pv1", "5"), + shouldSucceed: false, + }, + } + + for name, scenario := range scenarios { + cache := NewPVAssumeCache(nil) + internal_cache, ok := cache.(*pvAssumeCache) + if !ok { + t.Fatalf("Failed to get internal cache") + } + + // Add oldPV to cache + internal_cache.add(scenario.oldPV) + if err := getPV(cache, scenario.oldPV.Name, scenario.oldPV); err != nil { + t.Errorf("Failed to GetPV() after initial update: %v", err) + continue + } + + // Assume newPV + err := cache.Assume(scenario.newPV) + if scenario.shouldSucceed && err != nil { + t.Errorf("Test %q failed: Assume() returned error %v", name, err) + } + if !scenario.shouldSucceed && err == nil { + t.Errorf("Test %q failed: Assume() returned success but expected error", name) + } + + // Check that GetPV returns correct PV + expectedPV := scenario.newPV + if !scenario.shouldSucceed { + expectedPV = scenario.oldPV + } + if err := getPV(cache, scenario.oldPV.Name, expectedPV); err != nil { + t.Errorf("Failed to GetPV() after initial update: %v", err) + } + } +} + +func TestRestorePV(t *testing.T) { + cache := NewPVAssumeCache(nil) + internal_cache, ok := cache.(*pvAssumeCache) + if !ok { + t.Fatalf("Failed to get internal cache") + } + + oldPV := makePV("pv1", "5") + newPV := makePV("pv1", "5") + + // Restore PV that doesn't exist + cache.Restore("nothing") + + // Add oldPV to cache + internal_cache.add(oldPV) + if err := getPV(cache, oldPV.Name, oldPV); err != nil { + t.Fatalf("Failed to GetPV() after initial update: %v", err) + } + + // Restore PV + cache.Restore(oldPV.Name) + if err := getPV(cache, oldPV.Name, oldPV); err != nil { + t.Fatalf("Failed to GetPV() after iniital restore: %v", err) + } + + // Assume newPV + if err := cache.Assume(newPV); err != nil { + t.Fatalf("Assume() returned error %v", err) + } + if err := getPV(cache, oldPV.Name, newPV); err != nil { + t.Fatalf("Failed to GetPV() after Assume: %v", err) + } + + // Restore PV + cache.Restore(oldPV.Name) + if err := getPV(cache, oldPV.Name, oldPV); err != nil { + t.Fatalf("Failed to GetPV() after restore: %v", err) + } +} + +func TestBasicPVCache(t *testing.T) { + cache := NewPVAssumeCache(nil) + internal_cache, ok := cache.(*pvAssumeCache) + if !ok { + t.Fatalf("Failed to get internal cache") + } + + // Get object that doesn't exist + pv, err := cache.GetPV("nothere") + if err == nil { + t.Errorf("GetPV() returned unexpected success") + } + if pv != nil { + t.Errorf("GetPV() 
returned unexpected PV %q", pv.Name) + } + + // Add a bunch of PVs + pvs := map[string]*v1.PersistentVolume{} + for i := 0; i < 10; i++ { + pv := makePV(fmt.Sprintf("test-pv%v", i), "1") + pvs[pv.Name] = pv + internal_cache.add(pv) + } + + // List them + verifyListPVs(t, cache, pvs) + + // Update a PV + updatedPV := makePV("test-pv3", "2") + pvs[updatedPV.Name] = updatedPV + internal_cache.update(nil, updatedPV) + + // List them + verifyListPVs(t, cache, pvs) + + // Delete a PV + deletedPV := pvs["test-pv7"] + delete(pvs, deletedPV.Name) + internal_cache.delete(deletedPV) + + // List them + verifyListPVs(t, cache, pvs) +} + +func verifyListPVs(t *testing.T, cache PVAssumeCache, expectedPVs map[string]*v1.PersistentVolume) { + pvList := cache.ListPVs() + if len(pvList) != len(expectedPVs) { + t.Errorf("ListPVs() returned %v PVs, expected %v", len(pvList), len(expectedPVs)) + } + for _, pv := range pvList { + expectedPV, ok := expectedPVs[pv.Name] + if !ok { + t.Errorf("ListPVs() returned unexpected PV %q", pv.Name) + } + if expectedPV != pv { + t.Errorf("ListPVs() returned PV %p, expected %p", pv, expectedPV) + } + } +} + +func getPV(cache PVAssumeCache, name string, expectedPV *v1.PersistentVolume) error { + pv, err := cache.GetPV(name) + if err != nil { + return err + } + if pv != expectedPV { + return fmt.Errorf("GetPV() returned %p, expected %p", pv, expectedPV) + } + return nil +} diff --git a/pkg/controller/volume/persistentvolume/scheduler_binder.go b/pkg/controller/volume/persistentvolume/scheduler_binder.go new file mode 100644 index 00000000000..7edd1ef459d --- /dev/null +++ b/pkg/controller/volume/persistentvolume/scheduler_binder.go @@ -0,0 +1,420 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package persistentvolume + +import ( + "fmt" + "sort" + + "github.com/golang/glog" + + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + coreinformers "k8s.io/client-go/informers/core/v1" + storageinformers "k8s.io/client-go/informers/storage/v1" + clientset "k8s.io/client-go/kubernetes" + corelisters "k8s.io/client-go/listers/core/v1" + volumeutil "k8s.io/kubernetes/pkg/volume/util" +) + +// SchedulerVolumeBinder is used by the scheduler to handle PVC/PV binding +// and dynamic provisioning. The binding decisions are integrated into the pod scheduling +// workflow so that the PV NodeAffinity is also considered along with the pod's other +// scheduling requirements. +// +// This integrates into the existing default scheduler workflow as follows: +// 1. The scheduler takes a Pod off the scheduler queue and processes it serially: +// a. Invokes all predicate functions, parallelized across nodes. FindPodVolumes() is invoked here. +// b. Invokes all priority functions. Future/TBD +// c. Selects the best node for the Pod. +// d. Cache the node selection for the Pod. (Assume phase) +// i. If PVC binding is required, cache in-memory only: +// * Updated PV objects for prebinding to the corresponding PVCs. 
+// * For the pod, which PVs need API updates. +// AssumePodVolumes() is invoked here. Then BindPodVolumes() is called asynchronously by the +// scheduler. After BindPodVolumes() is complete, the Pod is added back to the scheduler queue +// to be processed again until all PVCs are bound. +// ii. If PVC binding is not required, cache the Pod->Node binding in the scheduler's pod cache, +// and asynchronously bind the Pod to the Node. This is handled in the scheduler and not here. +// 2. Once the assume operation is done, the scheduler processes the next Pod in the scheduler queue +// while the actual binding operation occurs in the background. +type SchedulerVolumeBinder interface { + // FindPodVolumes checks if all of a Pod's PVCs can be satisfied by the node. + // + // If a PVC is bound, it checks if the PV's NodeAffinity matches the Node. + // Otherwise, it tries to find an available PV to bind to the PVC. + // + // It returns true if there are matching PVs that can satisfy all of the Pod's PVCs, and returns true + // if bound volumes satisfy the PV NodeAffinity. + // + // This function is called by the volume binding scheduler predicate and can be called in parallel + FindPodVolumes(pod *v1.Pod, nodeName string) (unboundVolumesSatisified, boundVolumesSatisfied bool, err error) + + // AssumePodVolumes will take the PV matches for unbound PVCs and update the PV cache assuming + // that the PV is prebound to the PVC. + // + // It returns true if all volumes are fully bound, and returns true if any volume binding API operation needs + // to be done afterwards. + // + // This function will modify assumedPod with the node name. + // This function is called serially. + AssumePodVolumes(assumedPod *v1.Pod, nodeName string) (allFullyBound bool, bindingRequired bool, err error) + + // BindPodVolumes will initiate the volume binding by making the API call to prebind the PV + // to its matching PVC. + // + // This function can be called in parallel. + BindPodVolumes(assumedPod *v1.Pod) error + + // GetBindingsCache returns the cache used (if any) to store volume binding decisions. + GetBindingsCache() PodBindingCache +} + +type volumeBinder struct { + ctrl *PersistentVolumeController + + // TODO: Need AssumeCache for PVC for dynamic provisioning + pvcCache corelisters.PersistentVolumeClaimLister + nodeCache corelisters.NodeLister + pvCache PVAssumeCache + + // Stores binding decisions that were made in FindPodVolumes for use in AssumePodVolumes. + // AssumePodVolumes modifies the bindings again for use in BindPodVolumes. + podBindingCache PodBindingCache +} + +// NewVolumeBinder sets up all the caches needed for the scheduler to make volume binding decisions. +func NewVolumeBinder( + kubeClient clientset.Interface, + pvcInformer coreinformers.PersistentVolumeClaimInformer, + pvInformer coreinformers.PersistentVolumeInformer, + nodeInformer coreinformers.NodeInformer, + storageClassInformer storageinformers.StorageClassInformer) SchedulerVolumeBinder { + + // TODO: find better way... 
+ ctrl := &PersistentVolumeController{ + kubeClient: kubeClient, + classLister: storageClassInformer.Lister(), + } + + b := &volumeBinder{ + ctrl: ctrl, + pvcCache: pvcInformer.Lister(), + nodeCache: nodeInformer.Lister(), + pvCache: NewPVAssumeCache(pvInformer.Informer()), + podBindingCache: NewPodBindingCache(), + } + + return b +} + +func (b *volumeBinder) GetBindingsCache() PodBindingCache { + return b.podBindingCache +} + +// FindPodVolumes caches the matching PVs per node in podBindingCache +func (b *volumeBinder) FindPodVolumes(pod *v1.Pod, nodeName string) (unboundVolumesSatisfied, boundVolumesSatisfied bool, err error) { + podName := getPodName(pod) + + glog.V(4).Infof("FindPodVolumes for pod %q, node %q", podName, nodeName) + + // Initialize to true for pods that don't have volumes + unboundVolumesSatisfied = true + boundVolumesSatisfied = true + + node, err := b.nodeCache.Get(nodeName) + if node == nil || err != nil { + return false, false, fmt.Errorf("error getting node %q: %v", nodeName, err) + } + + // The pod's volumes need to be processed in one call to avoid the race condition where + // volumes can get bound in between calls. + boundClaims, unboundClaims, unboundClaimsImmediate, err := b.getPodVolumes(pod) + if err != nil { + return false, false, err + } + + // Immediate claims should be bound + if len(unboundClaimsImmediate) > 0 { + return false, false, fmt.Errorf("pod has unbound PersistentVolumeClaims") + } + + // Check PV node affinity on bound volumes + if len(boundClaims) > 0 { + boundVolumesSatisfied, err = b.checkBoundClaims(boundClaims, node, podName) + if err != nil { + return false, false, err + } + } + + // Find PVs for unbound volumes + if len(unboundClaims) > 0 { + unboundVolumesSatisfied, err = b.findMatchingVolumes(pod, unboundClaims, node) + if err != nil { + return false, false, err + } + } + + return unboundVolumesSatisfied, boundVolumesSatisfied, nil +} + +// AssumePodVolumes will take the cached matching PVs in podBindingCache for the chosen node +// and update the pvCache with the new prebound PV. It will update podBindingCache again +// with the PVs that need an API update. +func (b *volumeBinder) AssumePodVolumes(assumedPod *v1.Pod, nodeName string) (allFullyBound, bindingRequired bool, err error) { + podName := getPodName(assumedPod) + + glog.V(4).Infof("AssumePodVolumes for pod %q, node %q", podName, nodeName) + + if allBound := b.arePodVolumesBound(assumedPod); allBound { + glog.V(4).Infof("AssumePodVolumes: all PVCs bound and nothing to do") + return true, false, nil + } + + assumedPod.Spec.NodeName = nodeName + claimsToBind := b.podBindingCache.GetBindings(assumedPod, nodeName) + newBindings := []*bindingInfo{} + + for _, binding := range claimsToBind { + newPV, dirty, err := b.ctrl.getBindVolumeToClaim(binding.pv, binding.pvc) + glog.V(5).Infof("AssumePodVolumes: getBindVolumeToClaim for PV %q, PVC %q. newPV %p, dirty %v, err: %v", + binding.pv.Name, + binding.pvc.Name, + newPV, + dirty, + err) + if err != nil { + b.revertAssumedPVs(newBindings) + return false, true, err + } + if dirty { + err = b.pvCache.Assume(newPV) + if err != nil { + b.revertAssumedPVs(newBindings) + return false, true, err + } + + newBindings = append(newBindings, &bindingInfo{pv: newPV, pvc: binding.pvc}) + } + } + + if len(newBindings) == 0 { + // Don't update cached bindings if no API updates are needed. This can happen if we + // previously updated the PV object and are waiting for the PV controller to finish binding. 
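+		// Returning bindingRequired=false tells the scheduler that no further
+		// BindPodVolumes API calls are needed for this pod.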
+ glog.V(4).Infof("AssumePodVolumes: PVs already assumed") + return false, false, nil + } + b.podBindingCache.UpdateBindings(assumedPod, nodeName, newBindings) + + return false, true, nil +} + +// BindPodVolumes gets the cached bindings in podBindingCache and makes the API update for those PVs. +func (b *volumeBinder) BindPodVolumes(assumedPod *v1.Pod) error { + glog.V(4).Infof("BindPodVolumes for pod %q", getPodName(assumedPod)) + + bindings := b.podBindingCache.GetBindings(assumedPod, assumedPod.Spec.NodeName) + + // Do the actual prebinding. Let the PV controller take care of the rest + // There is no API rollback if the actual binding fails + for i, bindingInfo := range bindings { + _, err := b.ctrl.updateBindVolumeToClaim(bindingInfo.pv, bindingInfo.pvc, false) + if err != nil { + // only revert assumed cached updates for volumes we haven't successfully bound + b.revertAssumedPVs(bindings[i:]) + return err + } + } + + return nil +} + +func getPodName(pod *v1.Pod) string { + return pod.Namespace + "/" + pod.Name +} + +func getPVCName(pvc *v1.PersistentVolumeClaim) string { + return pvc.Namespace + "/" + pvc.Name +} + +func (b *volumeBinder) isVolumeBound(namespace string, vol *v1.Volume, checkFullyBound bool) (bool, *v1.PersistentVolumeClaim, error) { + if vol.PersistentVolumeClaim == nil { + return true, nil, nil + } + + pvcName := vol.PersistentVolumeClaim.ClaimName + pvc, err := b.pvcCache.PersistentVolumeClaims(namespace).Get(pvcName) + if err != nil || pvc == nil { + return false, nil, fmt.Errorf("error getting PVC %q: %v", pvcName, err) + } + + pvName := pvc.Spec.VolumeName + if pvName != "" { + if checkFullyBound { + if metav1.HasAnnotation(pvc.ObjectMeta, annBindCompleted) { + glog.V(5).Infof("PVC %q is fully bound to PV %q", getPVCName(pvc), pvName) + return true, pvc, nil + } else { + glog.V(5).Infof("PVC %q is not fully bound to PV %q", getPVCName(pvc), pvName) + return false, pvc, nil + } + } + glog.V(5).Infof("PVC %q is bound or prebound to PV %q", getPVCName(pvc), pvName) + return true, pvc, nil + } + + glog.V(5).Infof("PVC %q is not bound", getPVCName(pvc)) + return false, pvc, nil +} + +// arePodVolumesBound returns true if all volumes are fully bound +func (b *volumeBinder) arePodVolumesBound(pod *v1.Pod) bool { + for _, vol := range pod.Spec.Volumes { + if isBound, _, _ := b.isVolumeBound(pod.Namespace, &vol, true); !isBound { + // Pod has at least one PVC that needs binding + return false + } + } + return true +} + +// getPodVolumes returns a pod's PVCs separated into bound (including prebound), unbound with delayed binding, +// and unbound with immediate binding +func (b *volumeBinder) getPodVolumes(pod *v1.Pod) (boundClaims []*v1.PersistentVolumeClaim, unboundClaims []*bindingInfo, unboundClaimsImmediate []*v1.PersistentVolumeClaim, err error) { + boundClaims = []*v1.PersistentVolumeClaim{} + unboundClaimsImmediate = []*v1.PersistentVolumeClaim{} + unboundClaims = []*bindingInfo{} + + for _, vol := range pod.Spec.Volumes { + volumeBound, pvc, err := b.isVolumeBound(pod.Namespace, &vol, false) + if err != nil { + return nil, nil, nil, err + } + if pvc == nil { + continue + } + if volumeBound { + boundClaims = append(boundClaims, pvc) + } else { + delayBinding, err := b.ctrl.shouldDelayBinding(pvc) + if err != nil { + return nil, nil, nil, err + } + if delayBinding { + // Scheduler path + unboundClaims = append(unboundClaims, &bindingInfo{pvc: pvc}) + } else { + // Immediate binding should have already been bound + unboundClaimsImmediate = 
append(unboundClaimsImmediate, pvc)
+			}
+		}
+	}
+	return boundClaims, unboundClaims, unboundClaimsImmediate, nil
+}
+
+func (b *volumeBinder) checkBoundClaims(claims []*v1.PersistentVolumeClaim, node *v1.Node, podName string) (bool, error) {
+	for _, pvc := range claims {
+		pvName := pvc.Spec.VolumeName
+		pv, err := b.pvCache.GetPV(pvName)
+		if err != nil {
+			return false, err
+		}
+
+		err = volumeutil.CheckNodeAffinity(pv, node.Labels)
+		if err != nil {
+			glog.V(4).Infof("PersistentVolume %q, Node %q mismatch for Pod %q: %v", pvName, node.Name, podName, err.Error())
+			return false, nil
+		}
+		glog.V(5).Infof("PersistentVolume %q, Node %q matches for Pod %q", pvName, node.Name, podName)
+	}
+
+	glog.V(4).Infof("All volumes for Pod %q match with Node %q", podName, node.Name)
+	return true, nil
+}
+
+func (b *volumeBinder) findMatchingVolumes(pod *v1.Pod, claimsToBind []*bindingInfo, node *v1.Node) (foundMatches bool, err error) {
+	// Sort all the claims by increasing size request to get the smallest fits
+	sort.Sort(byPVCSize(claimsToBind))
+
+	allPVs := b.pvCache.ListPVs()
+	chosenPVs := map[string]*v1.PersistentVolume{}
+
+	for _, bindingInfo := range claimsToBind {
+		// Find a matching PV
+		bindingInfo.pv, err = findMatchingVolume(bindingInfo.pvc, allPVs, node, chosenPVs, true)
+		if err != nil {
+			return false, err
+		}
+		if bindingInfo.pv == nil {
+			glog.V(4).Infof("No matching volumes for PVC %q on node %q", getPVCName(bindingInfo.pvc), node.Name)
+			return false, nil
+		}
+
+		// matching PV needs to be excluded so we don't select it again
+		chosenPVs[bindingInfo.pv.Name] = bindingInfo.pv
+	}
+
+	// Mark cache with all the matches for each PVC for this node
+	b.podBindingCache.UpdateBindings(pod, node.Name, claimsToBind)
+	glog.V(4).Infof("Found matching volumes on node %q", node.Name)
+
+	return true, nil
+}
+
+func (b *volumeBinder) revertAssumedPVs(bindings []*bindingInfo) {
+	for _, bindingInfo := range bindings {
+		b.pvCache.Restore(bindingInfo.pv.Name)
+	}
+}
+
+type bindingInfo struct {
+	// Claim that needs to be bound
+	pvc *v1.PersistentVolumeClaim
+
+	// Proposed PV to bind to this claim
+	pv *v1.PersistentVolume
+}
+
+// Used in unit test errors
+func (b bindingInfo) String() string {
+	pvcName := ""
+	pvName := ""
+	if b.pvc != nil {
+		pvcName = getPVCName(b.pvc)
+	}
+	if b.pv != nil {
+		pvName = b.pv.Name
+	}
+	return fmt.Sprintf("[PVC %q, PV %q]", pvcName, pvName)
+}
+
+type byPVCSize []*bindingInfo
+
+func (a byPVCSize) Len() int {
+	return len(a)
+}
+
+func (a byPVCSize) Swap(i, j int) {
+	a[i], a[j] = a[j], a[i]
+}
+
+func (a byPVCSize) Less(i, j int) bool {
+	iSize := a[i].pvc.Spec.Resources.Requests[v1.ResourceStorage]
+	jSize := a[j].pvc.Spec.Resources.Requests[v1.ResourceStorage]
+	// return true if iSize is less than jSize
+	return iSize.Cmp(jSize) == -1
+}
diff --git a/pkg/controller/volume/persistentvolume/scheduler_binder_cache.go b/pkg/controller/volume/persistentvolume/scheduler_binder_cache.go
new file mode 100644
index 00000000000..8a0a7796085
--- /dev/null
+++ b/pkg/controller/volume/persistentvolume/scheduler_binder_cache.go
@@ -0,0 +1,87 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package persistentvolume + +import ( + "sync" + + "k8s.io/api/core/v1" +) + +// podBindingCache stores PV binding decisions per pod per node. +// Pod entries are removed when the Pod is deleted or updated to +// no longer be schedulable. +type PodBindingCache interface { + // UpdateBindings will update the cache with the given bindings for the + // pod and node. + UpdateBindings(pod *v1.Pod, node string, bindings []*bindingInfo) + + // DeleteBindings will remove all cached bindings for the given pod. + DeleteBindings(pod *v1.Pod) + + // GetBindings will return the cached bindings for the given pod and node. + GetBindings(pod *v1.Pod, node string) []*bindingInfo +} + +type podBindingCache struct { + mutex sync.Mutex + + // Key = pod name + // Value = nodeBindings + bindings map[string]nodeBindings +} + +// Key = nodeName +// Value = array of bindingInfo +type nodeBindings map[string][]*bindingInfo + +func NewPodBindingCache() PodBindingCache { + return &podBindingCache{bindings: map[string]nodeBindings{}} +} + +func (c *podBindingCache) DeleteBindings(pod *v1.Pod) { + c.mutex.Lock() + defer c.mutex.Unlock() + + podName := getPodName(pod) + delete(c.bindings, podName) +} + +func (c *podBindingCache) UpdateBindings(pod *v1.Pod, node string, bindings []*bindingInfo) { + c.mutex.Lock() + defer c.mutex.Unlock() + + podName := getPodName(pod) + nodeBinding, ok := c.bindings[podName] + if !ok { + nodeBinding = nodeBindings{} + c.bindings[podName] = nodeBinding + } + nodeBinding[node] = bindings +} + +func (c *podBindingCache) GetBindings(pod *v1.Pod, node string) []*bindingInfo { + c.mutex.Lock() + defer c.mutex.Unlock() + + podName := getPodName(pod) + nodeBindings, ok := c.bindings[podName] + if !ok { + return nil + } + return nodeBindings[node] +} diff --git a/pkg/controller/volume/persistentvolume/scheduler_binder_cache_test.go b/pkg/controller/volume/persistentvolume/scheduler_binder_cache_test.go new file mode 100644 index 00000000000..c73cea970d0 --- /dev/null +++ b/pkg/controller/volume/persistentvolume/scheduler_binder_cache_test.go @@ -0,0 +1,112 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package persistentvolume
+
+import (
+	"reflect"
+	"testing"
+
+	"k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+func TestUpdateGetBindings(t *testing.T) {
+	scenarios := map[string]struct {
+		updateBindings []*bindingInfo
+		updatePod      string
+		updateNode     string
+
+		getBindings []*bindingInfo
+		getPod      string
+		getNode     string
+	}{
+		"no-pod": {
+			getPod:  "pod1",
+			getNode: "node1",
+		},
+		"no-node": {
+			updatePod:      "pod1",
+			updateNode:     "node1",
+			updateBindings: []*bindingInfo{},
+			getPod:         "pod1",
+			getNode:        "node2",
+		},
+		"binding-exists": {
+			updatePod:      "pod1",
+			updateNode:     "node1",
+			updateBindings: []*bindingInfo{{pvc: &v1.PersistentVolumeClaim{ObjectMeta: metav1.ObjectMeta{Name: "pvc1"}}}},
+			getPod:         "pod1",
+			getNode:        "node1",
+			getBindings:    []*bindingInfo{{pvc: &v1.PersistentVolumeClaim{ObjectMeta: metav1.ObjectMeta{Name: "pvc1"}}}},
+		},
+	}
+
+	for name, scenario := range scenarios {
+		cache := NewPodBindingCache()
+
+		// Perform updates
+		updatePod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: scenario.updatePod, Namespace: "ns"}}
+		cache.UpdateBindings(updatePod, scenario.updateNode, scenario.updateBindings)
+
+		// Verify updated bindings
+		bindings := cache.GetBindings(updatePod, scenario.updateNode)
+		if !reflect.DeepEqual(bindings, scenario.updateBindings) {
+			t.Errorf("Test %v failed: returned bindings after update differ. Got %+v, expected %+v", name, bindings, scenario.updateBindings)
+		}
+
+		// Get bindings
+		getPod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: scenario.getPod, Namespace: "ns"}}
+		bindings = cache.GetBindings(getPod, scenario.getNode)
+		if !reflect.DeepEqual(bindings, scenario.getBindings) {
+			t.Errorf("Test %v failed: unexpected bindings returned. Got %+v, expected %+v", name, bindings, scenario.getBindings)
+		}
+	}
+}
+
+func TestDeleteBindings(t *testing.T) {
+	initialBindings := []*bindingInfo{{pvc: &v1.PersistentVolumeClaim{ObjectMeta: metav1.ObjectMeta{Name: "pvc1"}}}}
+	cache := NewPodBindingCache()
+
+	pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1", Namespace: "ns"}}
+
+	// Get nil bindings
+	bindings := cache.GetBindings(pod, "node1")
+	if bindings != nil {
+		t.Errorf("Test failed: expected initial nil bindings, got %+v", bindings)
+	}
+
+	// Delete nothing
+	cache.DeleteBindings(pod)
+
+	// Perform updates
+	cache.UpdateBindings(pod, "node1", initialBindings)
+
+	// Get bindings
+	bindings = cache.GetBindings(pod, "node1")
+	if !reflect.DeepEqual(bindings, initialBindings) {
+		t.Errorf("Test failed: expected bindings %+v, got %+v", initialBindings, bindings)
+	}
+
+	// Delete
+	cache.DeleteBindings(pod)
+
+	// Get bindings
+	bindings = cache.GetBindings(pod, "node1")
+	if bindings != nil {
+		t.Errorf("Test failed: expected nil bindings, got %+v", bindings)
+	}
+}
diff --git a/pkg/controller/volume/persistentvolume/scheduler_binder_fake.go b/pkg/controller/volume/persistentvolume/scheduler_binder_fake.go
new file mode 100644
index 00000000000..2810276b161
--- /dev/null
+++ b/pkg/controller/volume/persistentvolume/scheduler_binder_fake.go
@@ -0,0 +1,63 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package persistentvolume
+
+import (
+	"k8s.io/api/core/v1"
+)
+
+type FakeVolumeBinderConfig struct {
+	AllBound              bool
+	FindUnboundSatsified  bool
+	FindBoundSatsified    bool
+	FindErr               error
+	AssumeBindingRequired bool
+	AssumeErr             error
+	BindErr               error
+}
+
+// NewFakeVolumeBinder sets up a FakeVolumeBinder that returns the canned
+// results configured in config.
+func NewFakeVolumeBinder(config *FakeVolumeBinderConfig) *FakeVolumeBinder {
+	return &FakeVolumeBinder{
+		config: config,
+	}
+}
+
+type FakeVolumeBinder struct {
+	config       *FakeVolumeBinderConfig
+	AssumeCalled bool
+	BindCalled   bool
+}
+
+func (b *FakeVolumeBinder) FindPodVolumes(pod *v1.Pod, nodeName string) (unboundVolumesSatisfied, boundVolumesSatisfied bool, err error) {
+	return b.config.FindUnboundSatsified, b.config.FindBoundSatsified, b.config.FindErr
+}
+
+func (b *FakeVolumeBinder) AssumePodVolumes(assumedPod *v1.Pod, nodeName string) (bool, bool, error) {
+	b.AssumeCalled = true
+	return b.config.AllBound, b.config.AssumeBindingRequired, b.config.AssumeErr
+}
+
+func (b *FakeVolumeBinder) BindPodVolumes(assumedPod *v1.Pod) error {
+	b.BindCalled = true
+	return b.config.BindErr
+}
+
+func (b *FakeVolumeBinder) GetBindingsCache() PodBindingCache {
+	return nil
+}
diff --git a/pkg/controller/volume/persistentvolume/scheduler_binder_test.go b/pkg/controller/volume/persistentvolume/scheduler_binder_test.go
new file mode 100644
index 00000000000..c5f33f0409d
--- /dev/null
+++ b/pkg/controller/volume/persistentvolume/scheduler_binder_test.go
@@ -0,0 +1,755 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package persistentvolume + +import ( + "fmt" + "reflect" + "testing" + + "github.com/golang/glog" + + "k8s.io/api/core/v1" + storagev1 "k8s.io/api/storage/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/diff" + utilfeature "k8s.io/apiserver/pkg/util/feature" + "k8s.io/client-go/informers" + clientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/fake" + "k8s.io/client-go/tools/cache" + "k8s.io/kubernetes/pkg/api/testapi" + "k8s.io/kubernetes/pkg/controller" +) + +var ( + unboundPVC = makeTestPVC("unbound-pvc", "1G", pvcUnbound, "", &waitClass) + unboundPVC2 = makeTestPVC("unbound-pvc2", "5G", pvcUnbound, "", &waitClass) + preboundPVC = makeTestPVC("prebound-pvc", "1G", pvcPrebound, "pv-node1a", &waitClass) + boundPVC = makeTestPVC("bound-pvc", "1G", pvcBound, "pv-bound", &waitClass) + boundPVC2 = makeTestPVC("bound-pvc2", "1G", pvcBound, "pv-bound2", &waitClass) + badPVC = makeBadPVC() + immediateUnboundPVC = makeTestPVC("immediate-unbound-pvc", "1G", pvcUnbound, "", &immediateClass) + immediateBoundPVC = makeTestPVC("immediate-bound-pvc", "1G", pvcBound, "pv-bound-immediate", &immediateClass) + + pvNoNode = makeTestPV("pv-no-node", "", "1G", "1", nil, waitClass) + pvNode1a = makeTestPV("pv-node1a", "node1", "5G", "1", nil, waitClass) + pvNode1b = makeTestPV("pv-node1b", "node1", "10G", "1", nil, waitClass) + pvNode2 = makeTestPV("pv-node2", "node2", "1G", "1", nil, waitClass) + pvPrebound = makeTestPV("pv-prebound", "node1", "1G", "1", unboundPVC, waitClass) + pvBound = makeTestPV("pv-bound", "node1", "1G", "1", boundPVC, waitClass) + pvNode1aBound = makeTestPV("pv-node1a", "node1", "1G", "1", unboundPVC, waitClass) + pvNode1bBound = makeTestPV("pv-node1b", "node1", "5G", "1", unboundPVC2, waitClass) + pvNode1bBoundHigherVersion = makeTestPV("pv-node1b", "node1", "5G", "2", unboundPVC2, waitClass) + pvBoundImmediate = makeTestPV("pv-bound-immediate", "node1", "1G", "1", immediateBoundPVC, immediateClass) + pvBoundImmediateNode2 = makeTestPV("pv-bound-immediate", "node2", "1G", "1", immediateBoundPVC, immediateClass) + + binding1a = makeBinding(unboundPVC, pvNode1a) + binding1b = makeBinding(unboundPVC2, pvNode1b) + bindingNoNode = makeBinding(unboundPVC, pvNoNode) + bindingBad = makeBinding(badPVC, pvNode1b) + binding1aBound = makeBinding(unboundPVC, pvNode1aBound) + binding1bBound = makeBinding(unboundPVC2, pvNode1bBound) + + waitClass = "waitClass" + immediateClass = "immediateClass" +) + +type testEnv struct { + client clientset.Interface + reactor *volumeReactor + binder SchedulerVolumeBinder + internalBinder *volumeBinder + internalPVCache *pvAssumeCache + internalPVCCache cache.Indexer +} + +func newTestBinder(t *testing.T) *testEnv { + client := &fake.Clientset{} + reactor := newVolumeReactor(client, nil, nil, nil, nil) + informerFactory := informers.NewSharedInformerFactory(client, controller.NoResyncPeriodFunc()) + + pvcInformer := informerFactory.Core().V1().PersistentVolumeClaims() + nodeInformer := informerFactory.Core().V1().Nodes() + classInformer := informerFactory.Storage().V1().StorageClasses() + + binder := NewVolumeBinder( + client, + pvcInformer, + informerFactory.Core().V1().PersistentVolumes(), + nodeInformer, + classInformer) + + // Add a node + err := nodeInformer.Informer().GetIndexer().Add(&v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + Labels: map[string]string{"key1": "node1"}, + }, + }) + if err != nil { + 
t.Fatalf("Failed to add node to internal cache: %v", err) + } + + // Add storageclasses + waitMode := storagev1.VolumeBindingWaitForFirstConsumer + immediateMode := storagev1.VolumeBindingImmediate + classes := []*storagev1.StorageClass{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: waitClass, + }, + VolumeBindingMode: &waitMode, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: immediateClass, + }, + VolumeBindingMode: &immediateMode, + }, + } + for _, class := range classes { + if err = classInformer.Informer().GetIndexer().Add(class); err != nil { + t.Fatalf("Failed to add storage class to internal cache: %v", err) + } + } + + // Get internal types + internalBinder, ok := binder.(*volumeBinder) + if !ok { + t.Fatalf("Failed to convert to internal binder") + } + + pvCache := internalBinder.pvCache + internalPVCache, ok := pvCache.(*pvAssumeCache) + if !ok { + t.Fatalf("Failed to convert to internal PV cache") + } + + return &testEnv{ + client: client, + reactor: reactor, + binder: binder, + internalBinder: internalBinder, + internalPVCache: internalPVCache, + internalPVCCache: pvcInformer.Informer().GetIndexer(), + } +} + +func (env *testEnv) initClaims(t *testing.T, pvcs []*v1.PersistentVolumeClaim) { + for _, pvc := range pvcs { + err := env.internalPVCCache.Add(pvc) + if err != nil { + t.Fatalf("Failed to add PVC %q to internal cache: %v", pvc.Name, err) + } + env.reactor.claims[pvc.Name] = pvc + } +} + +func (env *testEnv) initVolumes(cachedPVs []*v1.PersistentVolume, apiPVs []*v1.PersistentVolume) { + internalPVCache := env.internalPVCache + for _, pv := range cachedPVs { + internalPVCache.add(pv) + if apiPVs == nil { + env.reactor.volumes[pv.Name] = pv + } + } + for _, pv := range apiPVs { + env.reactor.volumes[pv.Name] = pv + } + +} + +func (env *testEnv) assumeVolumes(t *testing.T, name, node string, pod *v1.Pod, bindings []*bindingInfo) { + pvCache := env.internalBinder.pvCache + for _, binding := range bindings { + if err := pvCache.Assume(binding.pv); err != nil { + t.Fatalf("Failed to setup test %q: error: %v", name, err) + } + } + + env.internalBinder.podBindingCache.UpdateBindings(pod, node, bindings) +} + +func (env *testEnv) initPodCache(pod *v1.Pod, node string, bindings []*bindingInfo) { + cache := env.internalBinder.podBindingCache + cache.UpdateBindings(pod, node, bindings) +} + +func (env *testEnv) validatePodCache(t *testing.T, name, node string, pod *v1.Pod, expectedBindings []*bindingInfo) { + cache := env.internalBinder.podBindingCache + bindings := cache.GetBindings(pod, node) + + if !reflect.DeepEqual(expectedBindings, bindings) { + t.Errorf("Test %q failed: Expected bindings %+v, got %+v", name, expectedBindings, bindings) + } +} + +func (env *testEnv) validateAssume(t *testing.T, name string, pod *v1.Pod, bindings []*bindingInfo) { + // TODO: Check binding cache + + // Check pv cache + pvCache := env.internalBinder.pvCache + for _, b := range bindings { + pv, err := pvCache.GetPV(b.pv.Name) + if err != nil { + t.Errorf("Test %q failed: GetPV %q returned error: %v", name, b.pv.Name, err) + continue + } + if pv.Spec.ClaimRef == nil { + t.Errorf("Test %q failed: PV %q ClaimRef is nil", name, b.pv.Name) + continue + } + if pv.Spec.ClaimRef.Name != b.pvc.Name { + t.Errorf("Test %q failed: expected PV.ClaimRef.Name %q, got %q", name, b.pvc.Name, pv.Spec.ClaimRef.Name) + } + if pv.Spec.ClaimRef.Namespace != b.pvc.Namespace { + t.Errorf("Test %q failed: expected PV.ClaimRef.Namespace %q, got %q", name, b.pvc.Namespace, pv.Spec.ClaimRef.Namespace) + } + } +} + +func 
(env *testEnv) validateFailedAssume(t *testing.T, name string, pod *v1.Pod, bindings []*bindingInfo) { + // All PVs have been unmodified in cache + pvCache := env.internalBinder.pvCache + for _, b := range bindings { + pv, _ := pvCache.GetPV(b.pv.Name) + // PV could be nil if it's missing from cache + if pv != nil && pv != b.pv { + t.Errorf("Test %q failed: PV %q was modified in cache", name, b.pv.Name) + } + } +} + +func (env *testEnv) validateBind( + t *testing.T, + name string, + pod *v1.Pod, + expectedPVs []*v1.PersistentVolume, + expectedAPIPVs []*v1.PersistentVolume) { + + // Check pv cache + pvCache := env.internalBinder.pvCache + for _, pv := range expectedPVs { + cachedPV, err := pvCache.GetPV(pv.Name) + if err != nil { + t.Errorf("Test %q failed: GetPV %q returned error: %v", name, pv.Name, err) + } + if !reflect.DeepEqual(cachedPV, pv) { + t.Errorf("Test %q failed: cached PV check failed [A-expected, B-got]:\n%s", name, diff.ObjectDiff(pv, cachedPV)) + } + } + + // Check reactor for API updates + if err := env.reactor.checkVolumes(expectedAPIPVs); err != nil { + t.Errorf("Test %q failed: API reactor validation failed: %v", name, err) + } +} + +const ( + pvcUnbound = iota + pvcPrebound + pvcBound +) + +func makeTestPVC(name, size string, pvcBoundState int, pvName string, className *string) *v1.PersistentVolumeClaim { + pvc := &v1.PersistentVolumeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: "testns", + UID: types.UID("pvc-uid"), + ResourceVersion: "1", + SelfLink: testapi.Default.SelfLink("pvc", name), + }, + Spec: v1.PersistentVolumeClaimSpec{ + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceName(v1.ResourceStorage): resource.MustParse(size), + }, + }, + StorageClassName: className, + }, + } + + switch pvcBoundState { + case pvcBound: + metav1.SetMetaDataAnnotation(&pvc.ObjectMeta, annBindCompleted, "yes") + fallthrough + case pvcPrebound: + pvc.Spec.VolumeName = pvName + } + return pvc +} + +func makeBadPVC() *v1.PersistentVolumeClaim { + return &v1.PersistentVolumeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "bad-pvc", + Namespace: "testns", + UID: types.UID("pvc-uid"), + ResourceVersion: "1", + // Don't include SefLink, so that GetReference will fail + }, + Spec: v1.PersistentVolumeClaimSpec{ + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceName(v1.ResourceStorage): resource.MustParse("1G"), + }, + }, + StorageClassName: &waitClass, + }, + } +} + +func makeTestPV(name, node, capacity, version string, boundToPVC *v1.PersistentVolumeClaim, className string) *v1.PersistentVolume { + pv := &v1.PersistentVolume{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + ResourceVersion: version, + }, + Spec: v1.PersistentVolumeSpec{ + Capacity: v1.ResourceList{ + v1.ResourceName(v1.ResourceStorage): resource.MustParse(capacity), + }, + StorageClassName: className, + }, + } + if node != "" { + pv.Annotations = getAnnotationWithNodeAffinity("key1", node) + } + + if boundToPVC != nil { + pv.Spec.ClaimRef = &v1.ObjectReference{ + Name: boundToPVC.Name, + Namespace: boundToPVC.Namespace, + UID: boundToPVC.UID, + } + metav1.SetMetaDataAnnotation(&pv.ObjectMeta, annBoundByController, "yes") + } + + return pv +} + +func makePod(pvcs []*v1.PersistentVolumeClaim) *v1.Pod { + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "testns", + }, + } + + volumes := []v1.Volume{} + for i, pvc := range pvcs { + pvcVol := v1.Volume{ + Name: fmt.Sprintf("vol%v", i), + VolumeSource: 
v1.VolumeSource{ + PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ + ClaimName: pvc.Name, + }, + }, + } + volumes = append(volumes, pvcVol) + } + pod.Spec.Volumes = volumes + pod.Spec.NodeName = "node1" + return pod +} + +func makePodWithoutPVC() *v1.Pod { + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "testns", + }, + Spec: v1.PodSpec{ + Volumes: []v1.Volume{ + { + VolumeSource: v1.VolumeSource{ + EmptyDir: &v1.EmptyDirVolumeSource{}, + }, + }, + }, + }, + } + return pod +} + +func makeBinding(pvc *v1.PersistentVolumeClaim, pv *v1.PersistentVolume) *bindingInfo { + return &bindingInfo{pvc: pvc, pv: pv} +} + +func makeStringPtr(str string) *string { + s := fmt.Sprintf("%v", str) + return &s +} + +func TestFindPodVolumes(t *testing.T) { + scenarios := map[string]struct { + // Inputs + pvs []*v1.PersistentVolume + podPVCs []*v1.PersistentVolumeClaim + // Defaults to node1 + node string + // If nil, use pod PVCs + cachePVCs []*v1.PersistentVolumeClaim + // If nil, makePod with podPVCs + pod *v1.Pod + + // Expected podBindingCache fields + expectedBindings []*bindingInfo + + // Expected return values + expectedUnbound bool + expectedBound bool + shouldFail bool + }{ + "no-volumes": { + pod: makePod(nil), + expectedUnbound: true, + expectedBound: true, + }, + "no-pvcs": { + pod: makePodWithoutPVC(), + expectedUnbound: true, + expectedBound: true, + }, + "pvc-not-found": { + cachePVCs: []*v1.PersistentVolumeClaim{}, + podPVCs: []*v1.PersistentVolumeClaim{boundPVC}, + expectedUnbound: false, + expectedBound: false, + shouldFail: true, + }, + "bound-pvc": { + podPVCs: []*v1.PersistentVolumeClaim{boundPVC}, + pvs: []*v1.PersistentVolume{pvBound}, + expectedUnbound: true, + expectedBound: true, + }, + "bound-pvc,pv-not-exists": { + podPVCs: []*v1.PersistentVolumeClaim{boundPVC}, + expectedUnbound: false, + expectedBound: false, + shouldFail: true, + }, + "prebound-pvc": { + podPVCs: []*v1.PersistentVolumeClaim{preboundPVC}, + pvs: []*v1.PersistentVolume{pvNode1aBound}, + expectedUnbound: true, + expectedBound: true, + }, + "unbound-pvc,node-not-exists": { + podPVCs: []*v1.PersistentVolumeClaim{unboundPVC}, + node: "node12", + expectedUnbound: false, + expectedBound: false, + shouldFail: true, + }, + "unbound-pvc,pv-same-node": { + podPVCs: []*v1.PersistentVolumeClaim{unboundPVC}, + pvs: []*v1.PersistentVolume{pvNode2, pvNode1a, pvNode1b}, + expectedBindings: []*bindingInfo{binding1a}, + expectedUnbound: true, + expectedBound: true, + }, + "unbound-pvc,pv-different-node": { + podPVCs: []*v1.PersistentVolumeClaim{unboundPVC}, + pvs: []*v1.PersistentVolume{pvNode2}, + expectedUnbound: false, + expectedBound: true, + }, + "two-unbound-pvcs": { + podPVCs: []*v1.PersistentVolumeClaim{unboundPVC, unboundPVC2}, + pvs: []*v1.PersistentVolume{pvNode1a, pvNode1b}, + expectedBindings: []*bindingInfo{binding1a, binding1b}, + expectedUnbound: true, + expectedBound: true, + }, + "two-unbound-pvcs,order-by-size": { + podPVCs: []*v1.PersistentVolumeClaim{unboundPVC2, unboundPVC}, + pvs: []*v1.PersistentVolume{pvNode1a, pvNode1b}, + expectedBindings: []*bindingInfo{binding1a, binding1b}, + expectedUnbound: true, + expectedBound: true, + }, + "two-unbound-pvcs,partial-match": { + podPVCs: []*v1.PersistentVolumeClaim{unboundPVC, unboundPVC2}, + pvs: []*v1.PersistentVolume{pvNode1a}, + expectedUnbound: false, + expectedBound: true, + }, + "one-bound,one-unbound": { + podPVCs: []*v1.PersistentVolumeClaim{unboundPVC, boundPVC}, + pvs: []*v1.PersistentVolume{pvBound, 
pvNode1a}, + expectedBindings: []*bindingInfo{binding1a}, + expectedUnbound: true, + expectedBound: true, + }, + "one-bound,one-unbound,no-match": { + podPVCs: []*v1.PersistentVolumeClaim{unboundPVC, boundPVC}, + pvs: []*v1.PersistentVolume{pvBound, pvNode2}, + expectedUnbound: false, + expectedBound: true, + }, + "one-prebound,one-unbound": { + podPVCs: []*v1.PersistentVolumeClaim{unboundPVC, preboundPVC}, + pvs: []*v1.PersistentVolume{pvNode1a, pvNode1b}, + expectedBindings: []*bindingInfo{binding1a}, + expectedUnbound: true, + expectedBound: true, + }, + "immediate-bound-pvc": { + podPVCs: []*v1.PersistentVolumeClaim{immediateBoundPVC}, + pvs: []*v1.PersistentVolume{pvBoundImmediate}, + expectedUnbound: true, + expectedBound: true, + }, + "immediate-bound-pvc-wrong-node": { + podPVCs: []*v1.PersistentVolumeClaim{immediateBoundPVC}, + pvs: []*v1.PersistentVolume{pvBoundImmediateNode2}, + expectedUnbound: true, + expectedBound: false, + }, + "immediate-unbound-pvc": { + podPVCs: []*v1.PersistentVolumeClaim{immediateUnboundPVC}, + expectedUnbound: false, + expectedBound: false, + shouldFail: true, + }, + "immediate-unbound-pvc,delayed-mode-bound": { + podPVCs: []*v1.PersistentVolumeClaim{immediateUnboundPVC, boundPVC}, + pvs: []*v1.PersistentVolume{pvBound}, + expectedUnbound: false, + expectedBound: false, + shouldFail: true, + }, + "immediate-unbound-pvc,delayed-mode-unbound": { + podPVCs: []*v1.PersistentVolumeClaim{immediateUnboundPVC, unboundPVC}, + expectedUnbound: false, + expectedBound: false, + shouldFail: true, + }, + } + + // Set feature gate + utilfeature.DefaultFeatureGate.Set("VolumeScheduling=true") + defer utilfeature.DefaultFeatureGate.Set("VolumeScheduling=false") + + for name, scenario := range scenarios { + glog.V(5).Infof("Running test case %q", name) + + // Setup + testEnv := newTestBinder(t) + testEnv.initVolumes(scenario.pvs, scenario.pvs) + if scenario.node == "" { + scenario.node = "node1" + } + + // a. Init pvc cache + if scenario.cachePVCs == nil { + scenario.cachePVCs = scenario.podPVCs + } + testEnv.initClaims(t, scenario.cachePVCs) + + // b. 
Generate pod with given claims + if scenario.pod == nil { + scenario.pod = makePod(scenario.podPVCs) + } + + // Execute + unboundSatisfied, boundSatisfied, err := testEnv.binder.FindPodVolumes(scenario.pod, scenario.node) + + // Validate + if !scenario.shouldFail && err != nil { + t.Errorf("Test %q failed: returned error: %v", name, err) + } + if scenario.shouldFail && err == nil { + t.Errorf("Test %q failed: returned success but expected error", name) + } + if boundSatisfied != scenario.expectedBound { + t.Errorf("Test %q failed: expected boundSatsified %v, got %v", name, scenario.expectedBound, boundSatisfied) + } + if unboundSatisfied != scenario.expectedUnbound { + t.Errorf("Test %q failed: expected unboundSatsified %v, got %v", name, scenario.expectedUnbound, unboundSatisfied) + } + testEnv.validatePodCache(t, name, scenario.node, scenario.pod, scenario.expectedBindings) + } +} + +func TestAssumePodVolumes(t *testing.T) { + scenarios := map[string]struct { + // Inputs + podPVCs []*v1.PersistentVolumeClaim + pvs []*v1.PersistentVolume + bindings []*bindingInfo + + // Expected return values + shouldFail bool + expectedBindingRequired bool + expectedAllBound bool + + // if nil, use bindings + expectedBindings []*bindingInfo + }{ + "all-bound": { + podPVCs: []*v1.PersistentVolumeClaim{boundPVC}, + pvs: []*v1.PersistentVolume{pvBound}, + expectedAllBound: true, + }, + "prebound-pvc": { + podPVCs: []*v1.PersistentVolumeClaim{preboundPVC}, + pvs: []*v1.PersistentVolume{pvNode1a}, + }, + "one-binding": { + podPVCs: []*v1.PersistentVolumeClaim{unboundPVC}, + bindings: []*bindingInfo{binding1a}, + pvs: []*v1.PersistentVolume{pvNode1a}, + expectedBindingRequired: true, + }, + "two-bindings": { + podPVCs: []*v1.PersistentVolumeClaim{unboundPVC, unboundPVC2}, + bindings: []*bindingInfo{binding1a, binding1b}, + pvs: []*v1.PersistentVolume{pvNode1a, pvNode1b}, + expectedBindingRequired: true, + }, + "pv-already-bound": { + podPVCs: []*v1.PersistentVolumeClaim{unboundPVC}, + bindings: []*bindingInfo{binding1aBound}, + pvs: []*v1.PersistentVolume{pvNode1aBound}, + expectedBindingRequired: false, + expectedBindings: []*bindingInfo{}, + }, + "claimref-failed": { + podPVCs: []*v1.PersistentVolumeClaim{unboundPVC}, + bindings: []*bindingInfo{binding1a, bindingBad}, + pvs: []*v1.PersistentVolume{pvNode1a, pvNode1b}, + shouldFail: true, + expectedBindingRequired: true, + }, + "tmpupdate-failed": { + podPVCs: []*v1.PersistentVolumeClaim{unboundPVC}, + bindings: []*bindingInfo{binding1a, binding1b}, + pvs: []*v1.PersistentVolume{pvNode1a}, + shouldFail: true, + expectedBindingRequired: true, + }, + } + + for name, scenario := range scenarios { + glog.V(5).Infof("Running test case %q", name) + + // Setup + testEnv := newTestBinder(t) + testEnv.initClaims(t, scenario.podPVCs) + pod := makePod(scenario.podPVCs) + testEnv.initPodCache(pod, "node1", scenario.bindings) + testEnv.initVolumes(scenario.pvs, scenario.pvs) + + // Execute + allBound, bindingRequired, err := testEnv.binder.AssumePodVolumes(pod, "node1") + + // Validate + if !scenario.shouldFail && err != nil { + t.Errorf("Test %q failed: returned error: %v", name, err) + } + if scenario.shouldFail && err == nil { + t.Errorf("Test %q failed: returned success but expected error", name) + } + if scenario.expectedBindingRequired != bindingRequired { + t.Errorf("Test %q failed: returned unexpected bindingRequired: %v", name, bindingRequired) + } + if scenario.expectedAllBound != allBound { + t.Errorf("Test %q failed: returned unexpected allBound: %v", name, 
allBound) + } + if scenario.expectedBindings == nil { + scenario.expectedBindings = scenario.bindings + } + if scenario.shouldFail { + testEnv.validateFailedAssume(t, name, pod, scenario.expectedBindings) + } else { + testEnv.validateAssume(t, name, pod, scenario.expectedBindings) + } + } +} + +func TestBindPodVolumes(t *testing.T) { + scenarios := map[string]struct { + // Inputs + bindings []*bindingInfo + cachedPVs []*v1.PersistentVolume + // if nil, use cachedPVs + apiPVs []*v1.PersistentVolume + + // Expected return values + shouldFail bool + expectedPVs []*v1.PersistentVolume + // if nil, use expectedPVs + expectedAPIPVs []*v1.PersistentVolume + }{ + "all-bound": {}, + "not-fully-bound": { + bindings: []*bindingInfo{}, + }, + "one-binding": { + bindings: []*bindingInfo{binding1aBound}, + cachedPVs: []*v1.PersistentVolume{pvNode1a}, + expectedPVs: []*v1.PersistentVolume{pvNode1aBound}, + }, + "two-bindings": { + bindings: []*bindingInfo{binding1aBound, binding1bBound}, + cachedPVs: []*v1.PersistentVolume{pvNode1a, pvNode1b}, + expectedPVs: []*v1.PersistentVolume{pvNode1aBound, pvNode1bBound}, + }, + "api-update-failed": { + bindings: []*bindingInfo{binding1aBound, binding1bBound}, + cachedPVs: []*v1.PersistentVolume{pvNode1a, pvNode1b}, + apiPVs: []*v1.PersistentVolume{pvNode1a, pvNode1bBoundHigherVersion}, + expectedPVs: []*v1.PersistentVolume{pvNode1aBound, pvNode1b}, + expectedAPIPVs: []*v1.PersistentVolume{pvNode1aBound, pvNode1bBoundHigherVersion}, + shouldFail: true, + }, + } + for name, scenario := range scenarios { + glog.V(5).Infof("Running test case %q", name) + + // Setup + testEnv := newTestBinder(t) + pod := makePod(nil) + if scenario.apiPVs == nil { + scenario.apiPVs = scenario.cachedPVs + } + testEnv.initVolumes(scenario.cachedPVs, scenario.apiPVs) + testEnv.assumeVolumes(t, name, "node1", pod, scenario.bindings) + + // Execute + err := testEnv.binder.BindPodVolumes(pod) + + // Validate + if !scenario.shouldFail && err != nil { + t.Errorf("Test %q failed: returned error: %v", name, err) + } + if scenario.shouldFail && err == nil { + t.Errorf("Test %q failed: returned success but expected error", name) + } + if scenario.expectedAPIPVs == nil { + scenario.expectedAPIPVs = scenario.expectedPVs + } + testEnv.validateBind(t, name, pod, scenario.expectedPVs, scenario.expectedAPIPVs) + } +} diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index db4e759ca2e..d9bd748dc98 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -180,6 +180,7 @@ const ( // alpha: v1.9 // // Extend the default scheduler to be aware of PV topology and handle PV binding + // Before moving to beta, resolve Kubernetes issue #56180 VolumeScheduling utilfeature.Feature = "VolumeScheduling" // owner: @vladimirvivien diff --git a/pkg/kubectl/.import-restrictions b/pkg/kubectl/.import-restrictions index cedbea71e7f..875f997dd52 100644 --- a/pkg/kubectl/.import-restrictions +++ b/pkg/kubectl/.import-restrictions @@ -120,6 +120,7 @@ "k8s.io/kubernetes/pkg/security/apparmor", "k8s.io/kubernetes/pkg/serviceaccount", "k8s.io/kubernetes/pkg/util/file", + "k8s.io/kubernetes/pkg/util/goroutinemap", "k8s.io/kubernetes/pkg/util/hash", "k8s.io/kubernetes/pkg/util/interrupt", "k8s.io/kubernetes/pkg/util/io", @@ -146,4 +147,4 @@ ], "ForbiddenPrefixes": [] }] -} \ No newline at end of file +} diff --git a/plugin/cmd/kube-scheduler/app/BUILD b/plugin/cmd/kube-scheduler/app/BUILD index 5edfadd024b..9de3152c18c 100644 --- a/plugin/cmd/kube-scheduler/app/BUILD +++ 
b/plugin/cmd/kube-scheduler/app/BUILD @@ -40,6 +40,7 @@ go_library( "//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library", "//vendor/k8s.io/client-go/informers:go_default_library", "//vendor/k8s.io/client-go/informers/core/v1:go_default_library", + "//vendor/k8s.io/client-go/informers/storage/v1:go_default_library", "//vendor/k8s.io/client-go/kubernetes:go_default_library", "//vendor/k8s.io/client-go/kubernetes/typed/core/v1:go_default_library", "//vendor/k8s.io/client-go/rest:go_default_library", diff --git a/plugin/cmd/kube-scheduler/app/server.go b/plugin/cmd/kube-scheduler/app/server.go index d7f7abe2cda..93982f898e3 100644 --- a/plugin/cmd/kube-scheduler/app/server.go +++ b/plugin/cmd/kube-scheduler/app/server.go @@ -40,6 +40,7 @@ import ( utilfeature "k8s.io/apiserver/pkg/util/feature" "k8s.io/client-go/informers" coreinformers "k8s.io/client-go/informers/core/v1" + storageinformers "k8s.io/client-go/informers/storage/v1" clientset "k8s.io/client-go/kubernetes" v1core "k8s.io/client-go/kubernetes/typed/core/v1" restclient "k8s.io/client-go/rest" @@ -625,6 +626,11 @@ func (s *SchedulerServer) Run(stop chan struct{}) error { // SchedulerConfig creates the scheduler configuration. This is exposed for use // by tests. func (s *SchedulerServer) SchedulerConfig() (*scheduler.Config, error) { + var storageClassInformer storageinformers.StorageClassInformer + if utilfeature.DefaultFeatureGate.Enabled(features.VolumeScheduling) { + storageClassInformer = s.InformerFactory.Storage().V1().StorageClasses() + } + // Set up the configurator which can create schedulers from configs. configurator := factory.NewConfigFactory( s.SchedulerName, @@ -638,6 +644,7 @@ func (s *SchedulerServer) SchedulerConfig() (*scheduler.Config, error) { s.InformerFactory.Apps().V1beta1().StatefulSets(), s.InformerFactory.Core().V1().Services(), s.InformerFactory.Policy().V1beta1().PodDisruptionBudgets(), + storageClassInformer, s.HardPodAffinitySymmetricWeight, utilfeature.DefaultFeatureGate.Enabled(features.EnableEquivalenceClassCache), ) diff --git a/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/policy.go b/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/policy.go index 29bd87f2a76..6fb6aefdb23 100644 --- a/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/policy.go +++ b/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/policy.go @@ -439,6 +439,18 @@ func ClusterRoles() []rbac.ClusterRole { }) } + if utilfeature.DefaultFeatureGate.Enabled(features.VolumeScheduling) { + // Find the scheduler role + for i, role := range roles { + if role.Name == "system:kube-scheduler" { + pvRule := rbac.NewRule("update").Groups(legacyGroup).Resources("persistentvolumes").RuleOrDie() + scRule := rbac.NewRule(Read...).Groups(storageGroup).Resources("storageclasses").RuleOrDie() + roles[i].Rules = append(role.Rules, pvRule, scRule) + break + } + } + } + addClusterRoleLabel(roles) return roles } diff --git a/plugin/pkg/scheduler/BUILD b/plugin/pkg/scheduler/BUILD index 41136590537..e20acec4060 100644 --- a/plugin/pkg/scheduler/BUILD +++ b/plugin/pkg/scheduler/BUILD @@ -13,18 +13,21 @@ go_test( library = ":go_default_library", deps = [ "//pkg/api/legacyscheme:go_default_library", + "//pkg/controller/volume/persistentvolume:go_default_library", "//plugin/pkg/scheduler/algorithm:go_default_library", "//plugin/pkg/scheduler/algorithm/predicates:go_default_library", "//plugin/pkg/scheduler/core:go_default_library", "//plugin/pkg/scheduler/schedulercache:go_default_library", "//plugin/pkg/scheduler/testing:go_default_library", 
"//plugin/pkg/scheduler/util:go_default_library", + "//plugin/pkg/scheduler/volumebinder:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library", "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/labels:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/diff:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library", + "//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library", "//vendor/k8s.io/client-go/tools/cache:go_default_library", "//vendor/k8s.io/client-go/tools/record:go_default_library", ], @@ -38,17 +41,21 @@ go_library( ], importpath = "k8s.io/kubernetes/plugin/pkg/scheduler", deps = [ + "//pkg/features:go_default_library", "//plugin/pkg/scheduler/algorithm:go_default_library", + "//plugin/pkg/scheduler/algorithm/predicates:go_default_library", "//plugin/pkg/scheduler/api:go_default_library", "//plugin/pkg/scheduler/core:go_default_library", "//plugin/pkg/scheduler/metrics:go_default_library", "//plugin/pkg/scheduler/schedulercache:go_default_library", "//plugin/pkg/scheduler/util:go_default_library", + "//plugin/pkg/scheduler/volumebinder:go_default_library", "//vendor/github.com/golang/glog:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library", + "//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library", "//vendor/k8s.io/client-go/kubernetes:go_default_library", "//vendor/k8s.io/client-go/listers/core/v1:go_default_library", "//vendor/k8s.io/client-go/tools/record:go_default_library", @@ -75,6 +82,7 @@ filegroup( "//plugin/pkg/scheduler/schedulercache:all-srcs", "//plugin/pkg/scheduler/testing:all-srcs", "//plugin/pkg/scheduler/util:all-srcs", + "//plugin/pkg/scheduler/volumebinder:all-srcs", ], tags = ["automanaged"], ) diff --git a/plugin/pkg/scheduler/algorithm/predicates/BUILD b/plugin/pkg/scheduler/algorithm/predicates/BUILD index 8b5dbfbb3c1..5ccf2d1f4c4 100644 --- a/plugin/pkg/scheduler/algorithm/predicates/BUILD +++ b/plugin/pkg/scheduler/algorithm/predicates/BUILD @@ -26,16 +26,18 @@ go_library( "//plugin/pkg/scheduler/algorithm/priorities/util:go_default_library", "//plugin/pkg/scheduler/schedulercache:go_default_library", "//plugin/pkg/scheduler/util:go_default_library", + "//plugin/pkg/scheduler/volumebinder:go_default_library", "//vendor/github.com/golang/glog:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library", + "//vendor/k8s.io/api/storage/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library", "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/labels:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/rand:go_default_library", "//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library", "//vendor/k8s.io/client-go/listers/core/v1:go_default_library", + "//vendor/k8s.io/client-go/listers/storage/v1:go_default_library", "//vendor/k8s.io/client-go/util/workqueue:go_default_library", - "//vendor/k8s.io/metrics/pkg/client/clientset_generated/clientset:go_default_library", ], ) @@ -56,9 +58,11 @@ go_test( "//plugin/pkg/scheduler/testing:go_default_library", "//plugin/pkg/scheduler/util:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library", + 
"//vendor/k8s.io/api/storage/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library", "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/labels:go_default_library", + "//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library", ], ) diff --git a/plugin/pkg/scheduler/algorithm/predicates/error.go b/plugin/pkg/scheduler/algorithm/predicates/error.go index b4cbc0bb59d..a4450bc5643 100644 --- a/plugin/pkg/scheduler/algorithm/predicates/error.go +++ b/plugin/pkg/scheduler/algorithm/predicates/error.go @@ -50,7 +50,8 @@ var ( ErrNodeNetworkUnavailable = newPredicateFailureError("NodeNetworkUnavailable") ErrNodeUnschedulable = newPredicateFailureError("NodeUnschedulable") ErrNodeUnknownCondition = newPredicateFailureError("NodeUnknownCondition") - ErrVolumeNodeConflict = newPredicateFailureError("NoVolumeNodeConflict") + ErrVolumeNodeConflict = newPredicateFailureError("VolumeNodeAffinityConflict") + ErrVolumeBindConflict = newPredicateFailureError("VolumeBindingNoMatch") // ErrFakePredicate is used for test only. The fake predicates returning false also returns error // as ErrFakePredicate. ErrFakePredicate = newPredicateFailureError("FakePredicateError") diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates.go b/plugin/pkg/scheduler/algorithm/predicates/predicates.go index 65d8f5c9ca5..917b35acf51 100644 --- a/plugin/pkg/scheduler/algorithm/predicates/predicates.go +++ b/plugin/pkg/scheduler/algorithm/predicates/predicates.go @@ -24,12 +24,14 @@ import ( "sync" "k8s.io/api/core/v1" + storagev1 "k8s.io/api/storage/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/util/rand" utilfeature "k8s.io/apiserver/pkg/util/feature" corelisters "k8s.io/client-go/listers/core/v1" + storagelisters "k8s.io/client-go/listers/storage/v1" "k8s.io/client-go/util/workqueue" v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" @@ -41,13 +43,14 @@ import ( priorityutil "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/priorities/util" "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache" schedutil "k8s.io/kubernetes/plugin/pkg/scheduler/util" - "k8s.io/metrics/pkg/client/clientset_generated/clientset" "github.com/golang/glog" + "k8s.io/kubernetes/plugin/pkg/scheduler/volumebinder" ) const ( MatchInterPodAffinity = "MatchInterPodAffinity" + CheckVolumeBinding = "CheckVolumeBinding" // DefaultMaxGCEPDVolumes defines the maximum number of PD Volumes for GCE // GCE instances can have up to 16 PD volumes attached. @@ -127,6 +130,19 @@ func (c *CachedNodeInfo) GetNodeInfo(id string) (*v1.Node, error) { return node, nil } +type StorageClassInfo interface { + GetStorageClassInfo(className string) (*storagev1.StorageClass, error) +} + +// CachedStorageClassInfo implements StorageClassInfo +type CachedStorageClassInfo struct { + storagelisters.StorageClassLister +} + +func (c *CachedStorageClassInfo) GetStorageClassInfo(className string) (*storagev1.StorageClass, error) { + return c.Get(className) +} + func isVolumeConflict(volume v1.Volume, pod *v1.Pod) bool { // fast path if there is no conflict checking targets. 
if volume.GCEPersistentDisk == nil && volume.AWSElasticBlockStore == nil && volume.RBD == nil && volume.ISCSI == nil { @@ -416,8 +432,9 @@ var AzureDiskVolumeFilter VolumeFilter = VolumeFilter{ } type VolumeZoneChecker struct { - pvInfo PersistentVolumeInfo - pvcInfo PersistentVolumeClaimInfo + pvInfo PersistentVolumeInfo + pvcInfo PersistentVolumeClaimInfo + classInfo StorageClassInfo } // NewVolumeZonePredicate evaluates if a pod can fit due to the volumes it requests, given @@ -434,10 +451,11 @@ type VolumeZoneChecker struct { // determining the zone of a volume during scheduling, and that is likely to // require calling out to the cloud provider. It seems that we are moving away // from inline volume declarations anyway. -func NewVolumeZonePredicate(pvInfo PersistentVolumeInfo, pvcInfo PersistentVolumeClaimInfo) algorithm.FitPredicate { +func NewVolumeZonePredicate(pvInfo PersistentVolumeInfo, pvcInfo PersistentVolumeClaimInfo, classInfo StorageClassInfo) algorithm.FitPredicate { c := &VolumeZoneChecker{ - pvInfo: pvInfo, - pvcInfo: pvcInfo, + pvInfo: pvInfo, + pvcInfo: pvcInfo, + classInfo: classInfo, } return c.predicate } @@ -489,6 +507,21 @@ func (c *VolumeZoneChecker) predicate(pod *v1.Pod, meta algorithm.PredicateMetad pvName := pvc.Spec.VolumeName if pvName == "" { + if utilfeature.DefaultFeatureGate.Enabled(features.VolumeScheduling) { + scName := pvc.Spec.StorageClassName + if scName != nil && len(*scName) > 0 { + class, _ := c.classInfo.GetStorageClassInfo(*scName) + if class != nil { + if class.VolumeBindingMode == nil { + return false, nil, fmt.Errorf("VolumeBindingMode not set for StorageClass %q", scName) + } + if *class.VolumeBindingMode == storagev1.VolumeBindingWaitForFirstConsumer { + // Skip unbound volumes + continue + } + } + } + } return false, nil, fmt.Errorf("PersistentVolumeClaim is not bound: %q", pvcName) } @@ -1403,33 +1436,30 @@ func CheckNodeConditionPredicate(pod *v1.Pod, meta algorithm.PredicateMetadata, return len(reasons) == 0, reasons, nil } -type VolumeNodeChecker struct { - pvInfo PersistentVolumeInfo - pvcInfo PersistentVolumeClaimInfo - client clientset.Interface +type VolumeBindingChecker struct { + binder *volumebinder.VolumeBinder } -// NewVolumeNodePredicate evaluates if a pod can fit due to the volumes it requests, given -// that some volumes have node topology constraints, particularly when using Local PVs. -// The requirement is that any pod that uses a PVC that is bound to a PV with topology constraints -// must be scheduled to a node that satisfies the PV's topology labels. -func NewVolumeNodePredicate(pvInfo PersistentVolumeInfo, pvcInfo PersistentVolumeClaimInfo, client clientset.Interface) algorithm.FitPredicate { - c := &VolumeNodeChecker{ - pvInfo: pvInfo, - pvcInfo: pvcInfo, - client: client, +// NewVolumeBindingPredicate evaluates if a pod can fit due to the volumes it requests, +// for both bound and unbound PVCs. +// +// For PVCs that are bound, then it checks that the corresponding PV's node affinity is +// satisfied by the given node. +// +// For PVCs that are unbound, it tries to find available PVs that can satisfy the PVC requirements +// and that the PV node affinity is satisfied by the given node. +// +// The predicate returns true if all bound PVCs have compatible PVs with the node, and if all unbound +// PVCs can be matched with an available and node-compatible PV. 
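+//
+// The actual binding of any matched PVs is deferred until after a node is selected,
+// via AssumePodVolumes and BindPodVolumes on the SchedulerVolumeBinder.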
+func NewVolumeBindingPredicate(binder *volumebinder.VolumeBinder) algorithm.FitPredicate {
+	c := &VolumeBindingChecker{
+		binder: binder,
+	}
 	return c.predicate
 }
 
-func (c *VolumeNodeChecker) predicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
-	if !utilfeature.DefaultFeatureGate.Enabled(features.PersistentLocalVolumes) {
-		return true, nil, nil
-	}
-
-	// If a pod doesn't have any volume attached to it, the predicate will always be true.
-	// Thus we make a fast path for it, to avoid unnecessary computations in this case.
-	if len(pod.Spec.Volumes) == 0 {
+func (c *VolumeBindingChecker) predicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
+	if !utilfeature.DefaultFeatureGate.Enabled(features.VolumeScheduling) {
 		return true, nil, nil
 	}
@@ -1438,45 +1468,27 @@ func (c *VolumeNodeChecker) predicate(pod *v1.Pod, meta algorithm.PredicateMetad
 		return false, nil, fmt.Errorf("node not found")
 	}
 
-	glog.V(2).Infof("Checking for prebound volumes with node affinity")
-	namespace := pod.Namespace
-	manifest := &(pod.Spec)
-	for i := range manifest.Volumes {
-		volume := &manifest.Volumes[i]
-		if volume.PersistentVolumeClaim == nil {
-			continue
-		}
-		pvcName := volume.PersistentVolumeClaim.ClaimName
-		if pvcName == "" {
-			return false, nil, fmt.Errorf("PersistentVolumeClaim had no name")
-		}
-		pvc, err := c.pvcInfo.GetPersistentVolumeClaimInfo(namespace, pvcName)
-		if err != nil {
-			return false, nil, err
-		}
-
-		if pvc == nil {
-			return false, nil, fmt.Errorf("PersistentVolumeClaim was not found: %q", pvcName)
-		}
-		pvName := pvc.Spec.VolumeName
-		if pvName == "" {
-			return false, nil, fmt.Errorf("PersistentVolumeClaim is not bound: %q", pvcName)
-		}
-
-		pv, err := c.pvInfo.GetPersistentVolumeInfo(pvName)
-		if err != nil {
-			return false, nil, err
-		}
-		if pv == nil {
-			return false, nil, fmt.Errorf("PersistentVolume not found: %q", pvName)
-		}
-
-		err = volumeutil.CheckNodeAffinity(pv, node.Labels)
-		if err != nil {
-			glog.V(2).Infof("Won't schedule pod %q onto node %q due to volume %q node mismatch: %v", pod.Name, node.Name, pvName, err.Error())
-			return false, []algorithm.PredicateFailureReason{ErrVolumeNodeConflict}, nil
-		}
-		glog.V(4).Infof("VolumeNode predicate allows node %q for pod %q due to volume %q", node.Name, pod.Name, pvName)
+	unboundSatisfied, boundSatisfied, err := c.binder.Binder.FindPodVolumes(pod, node.Name)
+	if err != nil {
+		return false, nil, err
 	}
+
+	failReasons := []algorithm.PredicateFailureReason{}
+	if !boundSatisfied {
+		glog.V(5).Infof("Bound PVs not satisfied for pod %v/%v, node %q", pod.Namespace, pod.Name, node.Name)
+		failReasons = append(failReasons, ErrVolumeNodeConflict)
+	}
+
+	if !unboundSatisfied {
+		glog.V(5).Infof("Couldn't find matching PVs for pod %v/%v, node %q", pod.Namespace, pod.Name, node.Name)
+		failReasons = append(failReasons, ErrVolumeBindConflict)
+	}
+
+	if len(failReasons) > 0 {
+		return false, failReasons, nil
+	}
+
+	// All volumes bound or matching PVs found for all unbound PVCs
+	glog.V(5).Infof("All PVCs found matches for pod %v/%v, node %q", pod.Namespace, pod.Name, node.Name)
 	return true, nil, nil
 }
diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go b/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go
index b521651a577..c9808dec6e0 100644
--- a/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go
+++ 
b/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go @@ -24,8 +24,10 @@ import ( "testing" "k8s.io/api/core/v1" + storagev1 "k8s.io/api/storage/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilfeature "k8s.io/apiserver/pkg/util/feature" v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis" "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm" @@ -74,6 +76,17 @@ func (pvs FakePersistentVolumeInfo) GetPersistentVolumeInfo(pvID string) (*v1.Pe return nil, fmt.Errorf("Unable to find persistent volume: %s", pvID) } +type FakeStorageClassInfo []storagev1.StorageClass + +func (classes FakeStorageClassInfo) GetStorageClassInfo(name string) (*storagev1.StorageClass, error) { + for _, sc := range classes { + if sc.Name == name { + return &sc, nil + } + } + return nil, fmt.Errorf("Unable to find storage class: %s", name) +} + var ( extendedResourceA = v1.ResourceName("example.com/aaa") extendedResourceB = v1.ResourceName("example.com/bbb") @@ -3834,7 +3847,7 @@ func TestVolumeZonePredicate(t *testing.T) { expectedFailureReasons := []algorithm.PredicateFailureReason{ErrVolumeZoneConflict} for _, test := range tests { - fit := NewVolumeZonePredicate(pvInfo, pvcInfo) + fit := NewVolumeZonePredicate(pvInfo, pvcInfo, nil) node := &schedulercache.NodeInfo{} node.SetNode(test.Node) @@ -3927,7 +3940,7 @@ func TestVolumeZonePredicateMultiZone(t *testing.T) { expectedFailureReasons := []algorithm.PredicateFailureReason{ErrVolumeZoneConflict} for _, test := range tests { - fit := NewVolumeZonePredicate(pvInfo, pvcInfo) + fit := NewVolumeZonePredicate(pvInfo, pvcInfo, nil) node := &schedulercache.NodeInfo{} node.SetNode(test.Node) @@ -3945,6 +3958,130 @@ func TestVolumeZonePredicateMultiZone(t *testing.T) { } } +func TestVolumeZonePredicateWithVolumeBinding(t *testing.T) { + var ( + modeWait = storagev1.VolumeBindingWaitForFirstConsumer + + class0 = "Class_0" + classWait = "Class_Wait" + classImmediate = "Class_Immediate" + ) + + classInfo := FakeStorageClassInfo{ + { + ObjectMeta: metav1.ObjectMeta{Name: classImmediate}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: classWait}, + VolumeBindingMode: &modeWait, + }, + } + + pvInfo := FakePersistentVolumeInfo{ + { + ObjectMeta: metav1.ObjectMeta{Name: "Vol_1", Labels: map[string]string{kubeletapis.LabelZoneFailureDomain: "us-west1-a"}}, + }, + } + + pvcInfo := FakePersistentVolumeClaimInfo{ + { + ObjectMeta: metav1.ObjectMeta{Name: "PVC_1", Namespace: "default"}, + Spec: v1.PersistentVolumeClaimSpec{VolumeName: "Vol_1"}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "PVC_NoSC", Namespace: "default"}, + Spec: v1.PersistentVolumeClaimSpec{StorageClassName: &class0}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "PVC_EmptySC", Namespace: "default"}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "PVC_WaitSC", Namespace: "default"}, + Spec: v1.PersistentVolumeClaimSpec{StorageClassName: &classWait}, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "PVC_ImmediateSC", Namespace: "default"}, + Spec: v1.PersistentVolumeClaimSpec{StorageClassName: &classImmediate}, + }, + } + + testNode := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "host1", + Labels: map[string]string{kubeletapis.LabelZoneFailureDomain: "us-west1-a", "uselessLabel": "none"}, + }, + } + + tests := []struct { + Name string + Pod *v1.Pod + Fits bool + Node *v1.Node + ExpectFailure bool + }{ + { + Name: "label zone failure domain matched", + Pod: createPodWithVolume("pod_1", "vol_1", "PVC_1"), + Node: 
testNode, + Fits: true, + }, + { + Name: "unbound volume empty storage class", + Pod: createPodWithVolume("pod_1", "vol_1", "PVC_EmptySC"), + Node: testNode, + Fits: false, + ExpectFailure: true, + }, + { + Name: "unbound volume no storage class", + Pod: createPodWithVolume("pod_1", "vol_1", "PVC_NoSC"), + Node: testNode, + Fits: false, + ExpectFailure: true, + }, + { + Name: "unbound volume immediate binding mode", + Pod: createPodWithVolume("pod_1", "vol_1", "PVC_ImmediateSC"), + Node: testNode, + Fits: false, + ExpectFailure: true, + }, + { + Name: "unbound volume wait binding mode", + Pod: createPodWithVolume("pod_1", "vol_1", "PVC_WaitSC"), + Node: testNode, + Fits: true, + }, + } + + err := utilfeature.DefaultFeatureGate.Set("VolumeScheduling=true") + if err != nil { + t.Fatalf("Failed to enable feature gate for VolumeScheduling: %v", err) + } + + for _, test := range tests { + fit := NewVolumeZonePredicate(pvInfo, pvcInfo, classInfo) + node := &schedulercache.NodeInfo{} + node.SetNode(test.Node) + + fits, _, err := fit(test.Pod, nil, node) + if !test.ExpectFailure && err != nil { + t.Errorf("%s: unexpected error: %v", test.Name, err) + } + if test.ExpectFailure && err == nil { + t.Errorf("%s: expected error, got success", test.Name) + } + if fits != test.Fits { + t.Errorf("%s: expected %v got %v", test.Name, test.Fits, fits) + } + } + + err = utilfeature.DefaultFeatureGate.Set("VolumeScheduling=false") + if err != nil { + t.Fatalf("Failed to disable feature gate for VolumeScheduling: %v", err) + } +} + func TestGetMaxVols(t *testing.T) { previousValue := os.Getenv(KubeMaxPDVols) defaultValue := 39 diff --git a/plugin/pkg/scheduler/algorithmprovider/defaults/BUILD b/plugin/pkg/scheduler/algorithmprovider/defaults/BUILD index 11bd8b92895..e72930b90f9 100644 --- a/plugin/pkg/scheduler/algorithmprovider/defaults/BUILD +++ b/plugin/pkg/scheduler/algorithmprovider/defaults/BUILD @@ -34,6 +34,7 @@ go_test( deps = [ "//pkg/api/legacyscheme:go_default_library", "//pkg/apis/core/install:go_default_library", + "//plugin/pkg/scheduler/algorithm/predicates:go_default_library", "//plugin/pkg/scheduler/api:go_default_library", "//plugin/pkg/scheduler/api/latest:go_default_library", "//plugin/pkg/scheduler/factory:go_default_library", diff --git a/plugin/pkg/scheduler/algorithmprovider/defaults/compatibility_test.go b/plugin/pkg/scheduler/algorithmprovider/defaults/compatibility_test.go index 71283193a35..332fb4f6796 100644 --- a/plugin/pkg/scheduler/algorithmprovider/defaults/compatibility_test.go +++ b/plugin/pkg/scheduler/algorithmprovider/defaults/compatibility_test.go @@ -337,8 +337,7 @@ func TestCompatibility_v1_Scheduler(t *testing.T) { {"name": "MatchInterPodAffinity"}, {"name": "GeneralPredicates"}, {"name": "TestServiceAffinity", "argument": {"serviceAffinity" : {"labels" : ["region"]}}}, - {"name": "TestLabelsPresence", "argument": {"labelsPresence" : {"labels" : ["foo"], "presence":true}}}, - {"name": "NoVolumeNodeConflict"} + {"name": "TestLabelsPresence", "argument": {"labelsPresence" : {"labels" : ["foo"], "presence":true}}} ],"priorities": [ {"name": "EqualPriority", "weight": 2}, {"name": "ImageLocalityPriority", "weight": 2}, @@ -370,7 +369,6 @@ func TestCompatibility_v1_Scheduler(t *testing.T) { {Name: "GeneralPredicates"}, {Name: "TestServiceAffinity", Argument: &schedulerapi.PredicateArgument{ServiceAffinity: &schedulerapi.ServiceAffinity{Labels: []string{"region"}}}}, {Name: "TestLabelsPresence", Argument: &schedulerapi.PredicateArgument{LabelsPresence: 
&schedulerapi.LabelsPresence{Labels: []string{"foo"}, Presence: true}}}, - {Name: "NoVolumeNodeConflict"}, }, Priorities: []schedulerapi.PriorityPolicy{ {Name: "EqualPriority", Weight: 2}, @@ -409,8 +407,7 @@ func TestCompatibility_v1_Scheduler(t *testing.T) { {"name": "MatchInterPodAffinity"}, {"name": "GeneralPredicates"}, {"name": "TestServiceAffinity", "argument": {"serviceAffinity" : {"labels" : ["region"]}}}, - {"name": "TestLabelsPresence", "argument": {"labelsPresence" : {"labels" : ["foo"], "presence":true}}}, - {"name": "NoVolumeNodeConflict"} + {"name": "TestLabelsPresence", "argument": {"labelsPresence" : {"labels" : ["foo"], "presence":true}}} ],"priorities": [ {"name": "EqualPriority", "weight": 2}, {"name": "ImageLocalityPriority", "weight": 2}, @@ -443,7 +440,80 @@ func TestCompatibility_v1_Scheduler(t *testing.T) { {Name: "GeneralPredicates"}, {Name: "TestServiceAffinity", Argument: &schedulerapi.PredicateArgument{ServiceAffinity: &schedulerapi.ServiceAffinity{Labels: []string{"region"}}}}, {Name: "TestLabelsPresence", Argument: &schedulerapi.PredicateArgument{LabelsPresence: &schedulerapi.LabelsPresence{Labels: []string{"foo"}, Presence: true}}}, - {Name: "NoVolumeNodeConflict"}, + }, + Priorities: []schedulerapi.PriorityPolicy{ + {Name: "EqualPriority", Weight: 2}, + {Name: "ImageLocalityPriority", Weight: 2}, + {Name: "LeastRequestedPriority", Weight: 2}, + {Name: "BalancedResourceAllocation", Weight: 2}, + {Name: "SelectorSpreadPriority", Weight: 2}, + {Name: "NodePreferAvoidPodsPriority", Weight: 2}, + {Name: "NodeAffinityPriority", Weight: 2}, + {Name: "TaintTolerationPriority", Weight: 2}, + {Name: "InterPodAffinityPriority", Weight: 2}, + {Name: "MostRequestedPriority", Weight: 2}, + }, + }, + }, + // Do not change this JSON after the corresponding release has been tagged. + // A failure indicates backwards compatibility with the specified release was broken. 
+ "1.9": { + JSON: `{ + "kind": "Policy", + "apiVersion": "v1", + "predicates": [ + {"name": "MatchNodeSelector"}, + {"name": "PodFitsResources"}, + {"name": "PodFitsHostPorts"}, + {"name": "HostName"}, + {"name": "NoDiskConflict"}, + {"name": "NoVolumeZoneConflict"}, + {"name": "PodToleratesNodeTaints"}, + {"name": "CheckNodeMemoryPressure"}, + {"name": "CheckNodeDiskPressure"}, + {"name": "CheckNodeCondition"}, + {"name": "MaxEBSVolumeCount"}, + {"name": "MaxGCEPDVolumeCount"}, + {"name": "MaxAzureDiskVolumeCount"}, + {"name": "MatchInterPodAffinity"}, + {"name": "GeneralPredicates"}, + {"name": "CheckVolumeBinding"}, + {"name": "TestServiceAffinity", "argument": {"serviceAffinity" : {"labels" : ["region"]}}}, + {"name": "TestLabelsPresence", "argument": {"labelsPresence" : {"labels" : ["foo"], "presence":true}}} + + ],"priorities": [ + {"name": "EqualPriority", "weight": 2}, + {"name": "ImageLocalityPriority", "weight": 2}, + {"name": "LeastRequestedPriority", "weight": 2}, + {"name": "BalancedResourceAllocation", "weight": 2}, + {"name": "SelectorSpreadPriority", "weight": 2}, + {"name": "NodePreferAvoidPodsPriority", "weight": 2}, + {"name": "NodeAffinityPriority", "weight": 2}, + {"name": "TaintTolerationPriority", "weight": 2}, + {"name": "InterPodAffinityPriority", "weight": 2}, + {"name": "MostRequestedPriority", "weight": 2} + ] + }`, + ExpectedPolicy: schedulerapi.Policy{ + Predicates: []schedulerapi.PredicatePolicy{ + {Name: "MatchNodeSelector"}, + {Name: "PodFitsResources"}, + {Name: "PodFitsHostPorts"}, + {Name: "HostName"}, + {Name: "NoDiskConflict"}, + {Name: "NoVolumeZoneConflict"}, + {Name: "PodToleratesNodeTaints"}, + {Name: "CheckNodeMemoryPressure"}, + {Name: "CheckNodeDiskPressure"}, + {Name: "CheckNodeCondition"}, + {Name: "MaxEBSVolumeCount"}, + {Name: "MaxGCEPDVolumeCount"}, + {Name: "MaxAzureDiskVolumeCount"}, + {Name: "MatchInterPodAffinity"}, + {Name: "GeneralPredicates"}, + {Name: "CheckVolumeBinding"}, + {Name: "TestServiceAffinity", Argument: &schedulerapi.PredicateArgument{ServiceAffinity: &schedulerapi.ServiceAffinity{Labels: []string{"region"}}}}, + {Name: "TestLabelsPresence", Argument: &schedulerapi.PredicateArgument{LabelsPresence: &schedulerapi.LabelsPresence{Labels: []string{"foo"}, Presence: true}}}, }, Priorities: []schedulerapi.PriorityPolicy{ {Name: "EqualPriority", Weight: 2}, @@ -506,6 +576,7 @@ func TestCompatibility_v1_Scheduler(t *testing.T) { informerFactory.Apps().V1beta1().StatefulSets(), informerFactory.Core().V1().Services(), informerFactory.Policy().V1beta1().PodDisruptionBudgets(), + informerFactory.Storage().V1().StorageClasses(), v1.DefaultHardPodAffinitySymmetricWeight, enableEquivalenceCache, ).CreateFromConfig(policy); err != nil { diff --git a/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go b/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go index 99d80e566d7..5bb5f136192 100644 --- a/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go +++ b/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go @@ -114,7 +114,7 @@ func defaultPredicates() sets.String { factory.RegisterFitPredicateFactory( "NoVolumeZoneConflict", func(args factory.PluginFactoryArgs) algorithm.FitPredicate { - return predicates.NewVolumeZonePredicate(args.PVInfo, args.PVCInfo) + return predicates.NewVolumeZonePredicate(args.PVInfo, args.PVCInfo, args.StorageClassInfo) }, ), // Fit is determined by whether or not there would be too many AWS EBS volumes attached to the node @@ -165,11 +165,11 @@ func defaultPredicates() sets.String 
{ // Fit is determined based on whether a pod can tolerate all of the node's taints factory.RegisterFitPredicate("PodToleratesNodeTaints", predicates.PodToleratesNodeTaints), - // Fit is determined by volume zone requirements. + // Fit is determined by volume topology requirements. factory.RegisterFitPredicateFactory( - "NoVolumeNodeConflict", + predicates.CheckVolumeBinding, func(args factory.PluginFactoryArgs) algorithm.FitPredicate { - return predicates.NewVolumeNodePredicate(args.PVInfo, args.PVCInfo, nil) + return predicates.NewVolumeBindingPredicate(args.VolumeBinder) }, ), ) diff --git a/plugin/pkg/scheduler/algorithmprovider/defaults/defaults_test.go b/plugin/pkg/scheduler/algorithmprovider/defaults/defaults_test.go index fe13e1b39dc..a6aa09aadfd 100644 --- a/plugin/pkg/scheduler/algorithmprovider/defaults/defaults_test.go +++ b/plugin/pkg/scheduler/algorithmprovider/defaults/defaults_test.go @@ -20,6 +20,7 @@ import ( "testing" "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/predicates" ) func TestCopyAndReplace(t *testing.T) { @@ -75,9 +76,9 @@ func TestDefaultPredicates(t *testing.T) { "GeneralPredicates", "CheckNodeMemoryPressure", "CheckNodeDiskPressure", - "NoVolumeNodeConflict", "CheckNodeCondition", "PodToleratesNodeTaints", + predicates.CheckVolumeBinding, ) if expected := defaultPredicates(); !result.Equal(expected) { diff --git a/plugin/pkg/scheduler/core/BUILD b/plugin/pkg/scheduler/core/BUILD index 60d7cde6902..203b09b36c9 100644 --- a/plugin/pkg/scheduler/core/BUILD +++ b/plugin/pkg/scheduler/core/BUILD @@ -53,6 +53,7 @@ go_library( "//plugin/pkg/scheduler/api:go_default_library", "//plugin/pkg/scheduler/schedulercache:go_default_library", "//plugin/pkg/scheduler/util:go_default_library", + "//plugin/pkg/scheduler/volumebinder:go_default_library", "//vendor/github.com/golang/glog:go_default_library", "//vendor/github.com/golang/groupcache/lru:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library", diff --git a/plugin/pkg/scheduler/core/extender_test.go b/plugin/pkg/scheduler/core/extender_test.go index fafd2ae6bd4..143ba795dd6 100644 --- a/plugin/pkg/scheduler/core/extender_test.go +++ b/plugin/pkg/scheduler/core/extender_test.go @@ -317,7 +317,7 @@ func TestGenericSchedulerWithExtenders(t *testing.T) { } queue := NewSchedulingQueue() scheduler := NewGenericScheduler( - cache, nil, queue, test.predicates, algorithm.EmptyPredicateMetadataProducer, test.prioritizers, algorithm.EmptyMetadataProducer, extenders) + cache, nil, queue, test.predicates, algorithm.EmptyPredicateMetadataProducer, test.prioritizers, algorithm.EmptyMetadataProducer, extenders, nil) podIgnored := &v1.Pod{} machine, err := scheduler.Schedule(podIgnored, schedulertesting.FakeNodeLister(makeNodeList(test.nodes))) if test.expectsErr { diff --git a/plugin/pkg/scheduler/core/generic_scheduler.go b/plugin/pkg/scheduler/core/generic_scheduler.go index 2c2f3e3dd6e..b50d475d82e 100644 --- a/plugin/pkg/scheduler/core/generic_scheduler.go +++ b/plugin/pkg/scheduler/core/generic_scheduler.go @@ -36,6 +36,7 @@ import ( "k8s.io/kubernetes/plugin/pkg/scheduler/util" "github.com/golang/glog" + "k8s.io/kubernetes/plugin/pkg/scheduler/volumebinder" ) type FailedPredicateMap map[string][]algorithm.PredicateFailureReason @@ -91,6 +92,7 @@ type genericScheduler struct { lastNodeIndex uint64 cachedNodeInfoMap map[string]*schedulercache.NodeInfo + volumeBinder *volumebinder.VolumeBinder } // Schedule tries to schedule the given pod to one of node in the node list. 
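Aside (illustrative sketch, not part of this patch): with the defaults change above, the predicate is registered under the CheckVolumeBinding name, which is also how the 1.9 compatibility fixture selects it. A custom scheduler policy can opt in the same way; the snippet below only assumes the v1 policy types (schedulerapi) already used by these tests, and the priority list shown is an arbitrary example:

package main

import (
	"encoding/json"
	"fmt"

	schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
)

func main() {
	// Predicates are selected purely by name, mirroring the compatibility fixture.
	policy := schedulerapi.Policy{
		Predicates: []schedulerapi.PredicatePolicy{
			{Name: "NoVolumeZoneConflict"},
			{Name: "CheckVolumeBinding"},
		},
		Priorities: []schedulerapi.PriorityPolicy{
			{Name: "LeastRequestedPriority", Weight: 1},
		},
	}
	out, _ := json.MarshalIndent(policy, "", "  ")
	fmt.Println(string(out))
}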
@@ -867,7 +869,10 @@ func nodesWherePreemptionMightHelp(pod *v1.Pod, nodes []*v1.Node, failedPredicat predicates.ErrNodeNotReady, predicates.ErrNodeNetworkUnavailable, predicates.ErrNodeUnschedulable, - predicates.ErrNodeUnknownCondition: + predicates.ErrNodeUnknownCondition, + predicates.ErrVolumeZoneConflict, + predicates.ErrVolumeNodeConflict, + predicates.ErrVolumeBindConflict: unresolvableReasonExist = true break // TODO(bsalamat): Please add affinity failure cases once we have specific affinity failure errors. @@ -909,7 +914,8 @@ func NewGenericScheduler( predicateMetaProducer algorithm.PredicateMetadataProducer, prioritizers []algorithm.PriorityConfig, priorityMetaProducer algorithm.MetadataProducer, - extenders []algorithm.SchedulerExtender) algorithm.ScheduleAlgorithm { + extenders []algorithm.SchedulerExtender, + volumeBinder *volumebinder.VolumeBinder) algorithm.ScheduleAlgorithm { return &genericScheduler{ cache: cache, equivalenceCache: eCache, @@ -920,5 +926,6 @@ func NewGenericScheduler( priorityMetaProducer: priorityMetaProducer, extenders: extenders, cachedNodeInfoMap: make(map[string]*schedulercache.NodeInfo), + volumeBinder: volumeBinder, } } diff --git a/plugin/pkg/scheduler/core/generic_scheduler_test.go b/plugin/pkg/scheduler/core/generic_scheduler_test.go index 99015676537..d5d9b341096 100644 --- a/plugin/pkg/scheduler/core/generic_scheduler_test.go +++ b/plugin/pkg/scheduler/core/generic_scheduler_test.go @@ -311,7 +311,7 @@ func TestGenericScheduler(t *testing.T) { } scheduler := NewGenericScheduler( - cache, nil, NewSchedulingQueue(), test.predicates, algorithm.EmptyPredicateMetadataProducer, test.prioritizers, algorithm.EmptyMetadataProducer, []algorithm.SchedulerExtender{}) + cache, nil, NewSchedulingQueue(), test.predicates, algorithm.EmptyPredicateMetadataProducer, test.prioritizers, algorithm.EmptyMetadataProducer, []algorithm.SchedulerExtender{}, nil) machine, err := scheduler.Schedule(test.pod, schedulertesting.FakeNodeLister(makeNodeList(test.nodes))) if !reflect.DeepEqual(err, test.wErr) { @@ -1190,7 +1190,7 @@ func TestPreempt(t *testing.T) { extenders = append(extenders, extender) } scheduler := NewGenericScheduler( - cache, nil, NewSchedulingQueue(), map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, algorithm.EmptyPredicateMetadataProducer, []algorithm.PriorityConfig{{Function: numericPriority, Weight: 1}}, algorithm.EmptyMetadataProducer, extenders) + cache, nil, NewSchedulingQueue(), map[string]algorithm.FitPredicate{"matches": algorithmpredicates.PodFitsResources}, algorithm.EmptyPredicateMetadataProducer, []algorithm.PriorityConfig{{Function: numericPriority, Weight: 1}}, algorithm.EmptyMetadataProducer, extenders, nil) // Call Preempt and check the expected results. 
node, victims, _, err := scheduler.Preempt(test.pod, schedulertesting.FakeNodeLister(makeNodeList(nodeNames)), error(&FitError{Pod: test.pod, FailedPredicates: failedPredMap})) if err != nil { diff --git a/plugin/pkg/scheduler/factory/BUILD b/plugin/pkg/scheduler/factory/BUILD index 62cdd123c70..c4fe5102bb5 100644 --- a/plugin/pkg/scheduler/factory/BUILD +++ b/plugin/pkg/scheduler/factory/BUILD @@ -16,6 +16,7 @@ go_library( deps = [ "//pkg/api/v1/pod:go_default_library", "//pkg/apis/core/helper:go_default_library", + "//pkg/features:go_default_library", "//pkg/kubelet/apis:go_default_library", "//plugin/pkg/scheduler:go_default_library", "//plugin/pkg/scheduler/algorithm:go_default_library", @@ -26,6 +27,7 @@ go_library( "//plugin/pkg/scheduler/core:go_default_library", "//plugin/pkg/scheduler/schedulercache:go_default_library", "//plugin/pkg/scheduler/util:go_default_library", + "//plugin/pkg/scheduler/volumebinder:go_default_library", "//vendor/github.com/golang/glog:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library", "//vendor/k8s.io/api/policy/v1beta1:go_default_library", @@ -38,15 +40,18 @@ go_library( "//vendor/k8s.io/apimachinery/pkg/types:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/runtime:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library", + "//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library", "//vendor/k8s.io/client-go/informers/apps/v1beta1:go_default_library", "//vendor/k8s.io/client-go/informers/core/v1:go_default_library", "//vendor/k8s.io/client-go/informers/extensions/v1beta1:go_default_library", "//vendor/k8s.io/client-go/informers/policy/v1beta1:go_default_library", + "//vendor/k8s.io/client-go/informers/storage/v1:go_default_library", "//vendor/k8s.io/client-go/kubernetes:go_default_library", "//vendor/k8s.io/client-go/listers/apps/v1beta1:go_default_library", "//vendor/k8s.io/client-go/listers/core/v1:go_default_library", "//vendor/k8s.io/client-go/listers/extensions/v1beta1:go_default_library", "//vendor/k8s.io/client-go/listers/policy/v1beta1:go_default_library", + "//vendor/k8s.io/client-go/listers/storage/v1:go_default_library", "//vendor/k8s.io/client-go/tools/cache:go_default_library", ], ) diff --git a/plugin/pkg/scheduler/factory/factory.go b/plugin/pkg/scheduler/factory/factory.go index e77d4958303..764f449ccc4 100644 --- a/plugin/pkg/scheduler/factory/factory.go +++ b/plugin/pkg/scheduler/factory/factory.go @@ -37,18 +37,22 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/sets" + utilfeature "k8s.io/apiserver/pkg/util/feature" appsinformers "k8s.io/client-go/informers/apps/v1beta1" coreinformers "k8s.io/client-go/informers/core/v1" extensionsinformers "k8s.io/client-go/informers/extensions/v1beta1" policyinformers "k8s.io/client-go/informers/policy/v1beta1" + storageinformers "k8s.io/client-go/informers/storage/v1" clientset "k8s.io/client-go/kubernetes" appslisters "k8s.io/client-go/listers/apps/v1beta1" corelisters "k8s.io/client-go/listers/core/v1" extensionslisters "k8s.io/client-go/listers/extensions/v1beta1" policylisters "k8s.io/client-go/listers/policy/v1beta1" + storagelisters "k8s.io/client-go/listers/storage/v1" "k8s.io/client-go/tools/cache" podutil "k8s.io/kubernetes/pkg/api/v1/pod" "k8s.io/kubernetes/pkg/apis/core/helper" + "k8s.io/kubernetes/pkg/features" kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis" "k8s.io/kubernetes/plugin/pkg/scheduler" "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm" @@ -58,6 
+62,7 @@ import ( "k8s.io/kubernetes/plugin/pkg/scheduler/core" "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache" "k8s.io/kubernetes/plugin/pkg/scheduler/util" + "k8s.io/kubernetes/plugin/pkg/scheduler/volumebinder" ) const ( @@ -98,6 +103,8 @@ type configFactory struct { statefulSetLister appslisters.StatefulSetLister // a means to list all PodDisruptionBudgets pdbLister policylisters.PodDisruptionBudgetLister + // a means to list all StorageClasses + storageClassLister storagelisters.StorageClassLister // Close this to stop all reflectors StopEverything chan struct{} @@ -120,6 +127,9 @@ type configFactory struct { // Enable equivalence class cache enableEquivalenceClassCache bool + + // Handles volume binding decisions + volumeBinder *volumebinder.VolumeBinder } // NewConfigFactory initializes the default implementation of a Configurator To encourage eventual privatization of the struct type, we only @@ -136,12 +146,19 @@ func NewConfigFactory( statefulSetInformer appsinformers.StatefulSetInformer, serviceInformer coreinformers.ServiceInformer, pdbInformer policyinformers.PodDisruptionBudgetInformer, + storageClassInformer storageinformers.StorageClassInformer, hardPodAffinitySymmetricWeight int32, enableEquivalenceClassCache bool, ) scheduler.Configurator { stopEverything := make(chan struct{}) schedulerCache := schedulercache.New(30*time.Second, stopEverything) + // storageClassInformer is only enabled through VolumeScheduling feature gate + var storageClassLister storagelisters.StorageClassLister + if storageClassInformer != nil { + storageClassLister = storageClassInformer.Lister() + } + c := &configFactory{ client: client, podLister: schedulerCache, @@ -153,6 +170,7 @@ func NewConfigFactory( replicaSetLister: replicaSetInformer.Lister(), statefulSetLister: statefulSetInformer.Lister(), pdbLister: pdbInformer.Lister(), + storageClassLister: storageClassLister, schedulerCache: schedulerCache, StopEverything: stopEverything, schedulerName: schedulerName, @@ -208,9 +226,14 @@ func NewConfigFactory( } }, DeleteFunc: func(obj interface{}) { - if err := c.podQueue.Delete(obj.(*v1.Pod)); err != nil { + pod := obj.(*v1.Pod) + if err := c.podQueue.Delete(pod); err != nil { runtime.HandleError(fmt.Errorf("unable to dequeue %T: %v", obj, err)) } + if c.volumeBinder != nil { + // Volume binder only wants to keep unassigned pods + c.volumeBinder.DeletePodBindings(pod) + } }, }, }, @@ -252,6 +275,7 @@ func NewConfigFactory( pvcInformer.Informer().AddEventHandler( cache.ResourceEventHandlerFuncs{ AddFunc: c.onPvcAdd, + UpdateFunc: c.onPvcUpdate, DeleteFunc: c.onPvcDelete, }, ) @@ -272,6 +296,11 @@ func NewConfigFactory( // Existing equivalence cache should not be affected by add/delete RC/Deployment etc, // it only make sense when pod is scheduled or deleted + if utilfeature.DefaultFeatureGate.Enabled(features.VolumeScheduling) { + // Setup volume binder + c.volumeBinder = volumebinder.NewVolumeBinder(client, pvcInformer, pvInformer, nodeInformer, storageClassInformer) + } + return c } @@ -365,6 +394,12 @@ func (c *configFactory) invalidatePredicatesForPv(pv *v1.PersistentVolume) { if pv.Spec.AzureDisk != nil { invalidPredicates.Insert("MaxAzureDiskVolumeCount") } + + if utilfeature.DefaultFeatureGate.Enabled(features.VolumeScheduling) { + // Add/delete impacts the available PVs to choose from + invalidPredicates.Insert(predicates.CheckVolumeBinding) + } + c.equivalencePodCache.InvalidateCachedPredicateItemOfAllNodes(invalidPredicates) } @@ -380,6 +415,27 @@ func (c *configFactory) 
onPvcAdd(obj interface{}) { c.podQueue.MoveAllToActiveQueue() } +func (c *configFactory) onPvcUpdate(old, new interface{}) { + if !utilfeature.DefaultFeatureGate.Enabled(features.VolumeScheduling) { + return + } + + if c.enableEquivalenceClassCache { + newPVC, ok := new.(*v1.PersistentVolumeClaim) + if !ok { + glog.Errorf("cannot convert to *v1.PersistentVolumeClaim: %v", new) + return + } + oldPVC, ok := old.(*v1.PersistentVolumeClaim) + if !ok { + glog.Errorf("cannot convert to *v1.PersistentVolumeClaim: %v", old) + return + } + c.invalidatePredicatesForPvcUpdate(oldPVC, newPVC) + } + c.podQueue.MoveAllToActiveQueue() +} + func (c *configFactory) onPvcDelete(obj interface{}) { if c.enableEquivalenceClassCache { var pvc *v1.PersistentVolumeClaim @@ -407,6 +463,21 @@ func (c *configFactory) invalidatePredicatesForPvc(pvc *v1.PersistentVolumeClaim } } +func (c *configFactory) invalidatePredicatesForPvcUpdate(old, new *v1.PersistentVolumeClaim) { + invalidPredicates := sets.NewString() + + if utilfeature.DefaultFeatureGate.Enabled(features.VolumeScheduling) { + if old.Spec.VolumeName != new.Spec.VolumeName { + // PVC volume binding has changed + invalidPredicates.Insert(predicates.CheckVolumeBinding) + } + } + + if invalidPredicates.Len() > 0 { + c.equivalencePodCache.InvalidateCachedPredicateItemOfAllNodes(invalidPredicates) + } +} + func (c *configFactory) onServiceAdd(obj interface{}) { if c.enableEquivalenceClassCache { c.equivalencePodCache.InvalidateCachedPredicateItemOfAllNodes(serviceAffinitySet) @@ -468,6 +539,7 @@ func (c *configFactory) addPodToCache(obj interface{}) { } c.podQueue.AssignedPodAdded(pod) + // NOTE: Updating equivalence cache of addPodToCache has been // handled optimistically in InvalidateCachedPredicateItemForPodAdd. } @@ -830,7 +902,8 @@ func (f *configFactory) CreateFromKeys(predicateKeys, priorityKeys sets.String, f.equivalencePodCache = core.NewEquivalenceCache(getEquivalencePodFunc) glog.Info("Created equivalence class cache") } - algo := core.NewGenericScheduler(f.schedulerCache, f.equivalencePodCache, f.podQueue, predicateFuncs, predicateMetaProducer, priorityConfigs, priorityMetaProducer, extenders) + + algo := core.NewGenericScheduler(f.schedulerCache, f.equivalencePodCache, f.podQueue, predicateFuncs, predicateMetaProducer, priorityConfigs, priorityMetaProducer, extenders, f.volumeBinder) podBackoff := util.CreateDefaultPodBackoff() return &scheduler.Config{ @@ -850,6 +923,7 @@ func (f *configFactory) CreateFromKeys(predicateKeys, priorityKeys sets.String, }, Error: f.MakeDefaultErrorFunc(podBackoff, f.podQueue), StopEverything: f.StopEverything, + VolumeBinder: f.volumeBinder, }, nil } @@ -898,15 +972,17 @@ func (f *configFactory) GetPredicates(predicateKeys sets.String) (map[string]alg func (f *configFactory) getPluginArgs() (*PluginFactoryArgs, error) { return &PluginFactoryArgs{ - PodLister: f.podLister, - ServiceLister: f.serviceLister, - ControllerLister: f.controllerLister, - ReplicaSetLister: f.replicaSetLister, - StatefulSetLister: f.statefulSetLister, - NodeLister: &nodeLister{f.nodeLister}, - NodeInfo: &predicates.CachedNodeInfo{NodeLister: f.nodeLister}, - PVInfo: &predicates.CachedPersistentVolumeInfo{PersistentVolumeLister: f.pVLister}, - PVCInfo: &predicates.CachedPersistentVolumeClaimInfo{PersistentVolumeClaimLister: f.pVCLister}, + PodLister: f.podLister, + ServiceLister: f.serviceLister, + ControllerLister: f.controllerLister, + ReplicaSetLister: f.replicaSetLister, + StatefulSetLister: f.statefulSetLister, + NodeLister: 
&nodeLister{f.nodeLister}, + NodeInfo: &predicates.CachedNodeInfo{NodeLister: f.nodeLister}, + PVInfo: &predicates.CachedPersistentVolumeInfo{PersistentVolumeLister: f.pVLister}, + PVCInfo: &predicates.CachedPersistentVolumeClaimInfo{PersistentVolumeClaimLister: f.pVCLister}, + StorageClassInfo: &predicates.CachedStorageClassInfo{StorageClassLister: f.storageClassLister}, + VolumeBinder: f.volumeBinder, HardPodAffinitySymmetricWeight: f.hardPodAffinitySymmetricWeight, }, nil } @@ -1047,6 +1123,7 @@ func (factory *configFactory) MakeDefaultErrorFunc(backoff *util.PodBackoff, pod Namespace: pod.Namespace, Name: pod.Name, } + origPod := pod // When pod priority is enabled, we would like to place an unschedulable // pod in the unschedulable queue. This ensures that if the pod is nominated @@ -1066,11 +1143,21 @@ func (factory *configFactory) MakeDefaultErrorFunc(backoff *util.PodBackoff, pod if err == nil { if len(pod.Spec.NodeName) == 0 { podQueue.AddUnschedulableIfNotPresent(pod) + } else { + if factory.volumeBinder != nil { + // Volume binder only wants to keep unassigned pods + factory.volumeBinder.DeletePodBindings(pod) + } } break } if errors.IsNotFound(err) { glog.Warningf("A pod %v no longer exists", podID) + + if factory.volumeBinder != nil { + // Volume binder only wants to keep unassigned pods + factory.volumeBinder.DeletePodBindings(origPod) + } return } glog.Errorf("Error getting pod %v for retry: %v; retrying...", podID, err) diff --git a/plugin/pkg/scheduler/factory/factory_test.go b/plugin/pkg/scheduler/factory/factory_test.go index db73370ac09..437e9d4d6a0 100644 --- a/plugin/pkg/scheduler/factory/factory_test.go +++ b/plugin/pkg/scheduler/factory/factory_test.go @@ -66,6 +66,7 @@ func TestCreate(t *testing.T) { informerFactory.Apps().V1beta1().StatefulSets(), informerFactory.Core().V1().Services(), informerFactory.Policy().V1beta1().PodDisruptionBudgets(), + informerFactory.Storage().V1().StorageClasses(), v1.DefaultHardPodAffinitySymmetricWeight, enableEquivalenceCache, ) @@ -99,6 +100,7 @@ func TestCreateFromConfig(t *testing.T) { informerFactory.Apps().V1beta1().StatefulSets(), informerFactory.Core().V1().Services(), informerFactory.Policy().V1beta1().PodDisruptionBudgets(), + informerFactory.Storage().V1().StorageClasses(), v1.DefaultHardPodAffinitySymmetricWeight, enableEquivalenceCache, ) @@ -159,6 +161,7 @@ func TestCreateFromConfigWithHardPodAffinitySymmetricWeight(t *testing.T) { informerFactory.Apps().V1beta1().StatefulSets(), informerFactory.Core().V1().Services(), informerFactory.Policy().V1beta1().PodDisruptionBudgets(), + informerFactory.Storage().V1().StorageClasses(), v1.DefaultHardPodAffinitySymmetricWeight, enableEquivalenceCache, ) @@ -220,6 +223,7 @@ func TestCreateFromEmptyConfig(t *testing.T) { informerFactory.Apps().V1beta1().StatefulSets(), informerFactory.Core().V1().Services(), informerFactory.Policy().V1beta1().PodDisruptionBudgets(), + informerFactory.Storage().V1().StorageClasses(), v1.DefaultHardPodAffinitySymmetricWeight, enableEquivalenceCache, ) @@ -278,6 +282,7 @@ func TestDefaultErrorFunc(t *testing.T) { informerFactory.Apps().V1beta1().StatefulSets(), informerFactory.Core().V1().Services(), informerFactory.Policy().V1beta1().PodDisruptionBudgets(), + informerFactory.Storage().V1().StorageClasses(), v1.DefaultHardPodAffinitySymmetricWeight, enableEquivalenceCache, ) @@ -388,6 +393,7 @@ func TestInvalidHardPodAffinitySymmetricWeight(t *testing.T) { informerFactory.Apps().V1beta1().StatefulSets(), informerFactory.Core().V1().Services(), 
informerFactory.Policy().V1beta1().PodDisruptionBudgets(), + informerFactory.Storage().V1().StorageClasses(), -1, enableEquivalenceCache, ) @@ -435,6 +441,7 @@ func TestInvalidFactoryArgs(t *testing.T) { informerFactory.Apps().V1beta1().StatefulSets(), informerFactory.Core().V1().Services(), informerFactory.Policy().V1beta1().PodDisruptionBudgets(), + informerFactory.Storage().V1().StorageClasses(), test.hardPodAffinitySymmetricWeight, enableEquivalenceCache, ) diff --git a/plugin/pkg/scheduler/factory/plugins.go b/plugin/pkg/scheduler/factory/plugins.go index 028546a71dc..a0de0f67fda 100644 --- a/plugin/pkg/scheduler/factory/plugins.go +++ b/plugin/pkg/scheduler/factory/plugins.go @@ -30,6 +30,7 @@ import ( schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api" "github.com/golang/glog" + "k8s.io/kubernetes/plugin/pkg/scheduler/volumebinder" ) // PluginFactoryArgs are passed to all plugin factory functions. @@ -43,6 +44,8 @@ type PluginFactoryArgs struct { NodeInfo predicates.NodeInfo PVInfo predicates.PersistentVolumeInfo PVCInfo predicates.PersistentVolumeClaimInfo + StorageClassInfo predicates.StorageClassInfo + VolumeBinder *volumebinder.VolumeBinder HardPodAffinitySymmetricWeight int32 } diff --git a/plugin/pkg/scheduler/scheduler.go b/plugin/pkg/scheduler/scheduler.go index d9af3d67c96..b69aacc8eaf 100644 --- a/plugin/pkg/scheduler/scheduler.go +++ b/plugin/pkg/scheduler/scheduler.go @@ -17,16 +17,20 @@ limitations under the License. package scheduler import ( + "fmt" "time" "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" + utilfeature "k8s.io/apiserver/pkg/util/feature" clientset "k8s.io/client-go/kubernetes" corelisters "k8s.io/client-go/listers/core/v1" "k8s.io/client-go/tools/record" + "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm" + "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/predicates" schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api" "k8s.io/kubernetes/plugin/pkg/scheduler/core" "k8s.io/kubernetes/plugin/pkg/scheduler/metrics" @@ -34,6 +38,7 @@ import ( "k8s.io/kubernetes/plugin/pkg/scheduler/util" "github.com/golang/glog" + "k8s.io/kubernetes/plugin/pkg/scheduler/volumebinder" ) // Binder knows how to write a binding. @@ -129,6 +134,9 @@ type Config struct { // Close this to shut down the scheduler. StopEverything chan struct{} + + // VolumeBinder handles PVC/PV binding for the pod. + VolumeBinder *volumebinder.VolumeBinder } // NewFromConfigurator returns a new scheduler that is created entirely by the Configurator. Assumes Create() is implemented. @@ -164,6 +172,10 @@ func (sched *Scheduler) Run() { return } + if utilfeature.DefaultFeatureGate.Enabled(features.VolumeScheduling) { + go sched.config.VolumeBinder.Run(sched.bindVolumesWorker, sched.config.StopEverything) + } + go wait.Until(sched.scheduleOne, 0, sched.config.StopEverything) } @@ -240,6 +252,114 @@ func (sched *Scheduler) preempt(preemptor *v1.Pod, scheduleErr error) (string, e return nodeName, err } +// assumeAndBindVolumes will update the volume cache and then asynchronously bind volumes if required. +// +// If volume binding is required, then the bind volumes routine will update the pod to send it back through +// the scheduler. +// +// Otherwise, return nil error and continue to assume the pod. +// +// This function modifies assumed if volume binding is required. 
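Aside (illustrative sketch, not part of this patch): assumeAndBindVolumes below and the VolumeBindingChecker predicate earlier only talk to the binder through a few methods. Inferred purely from those call sites, the surface looks roughly like the interface here; the authoritative definition is persistentvolume.SchedulerVolumeBinder in scheduler_binder.go, which this hunk does not show:

package volumebinding // hypothetical package for the sketch

import "k8s.io/api/core/v1"

// sketchedVolumeBinder mirrors the calls made through VolumeBinder.Binder in
// this patch; names and return order follow the call sites, not the real file.
type sketchedVolumeBinder interface {
	// FindPodVolumes reports whether matching PVs exist for the pod's unbound
	// PVCs on the given node, and whether its already-bound PVs fit that node.
	FindPodVolumes(pod *v1.Pod, nodeName string) (unboundSatisfied, boundSatisfied bool, err error)
	// AssumePodVolumes updates the scheduler-side cache and reports whether all
	// volumes are already bound and whether an API binding pass is still needed.
	AssumePodVolumes(pod *v1.Pod, nodeName string) (allBound, bindingRequired bool, err error)
	// BindPodVolumes issues the API updates for the bindings assumed above.
	BindPodVolumes(pod *v1.Pod) error
}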
+func (sched *Scheduler) assumeAndBindVolumes(assumed *v1.Pod, host string) error { + if utilfeature.DefaultFeatureGate.Enabled(features.VolumeScheduling) { + allBound, bindingRequired, err := sched.config.VolumeBinder.Binder.AssumePodVolumes(assumed, host) + if err != nil { + sched.config.Error(assumed, err) + sched.config.Recorder.Eventf(assumed, v1.EventTypeWarning, "FailedScheduling", "AssumePodVolumes failed: %v", err) + sched.config.PodConditionUpdater.Update(assumed, &v1.PodCondition{ + Type: v1.PodScheduled, + Status: v1.ConditionFalse, + Reason: "SchedulerError", + Message: err.Error(), + }) + return err + } + if !allBound { + err = fmt.Errorf("Volume binding started, waiting for completion") + if bindingRequired { + if sched.config.Ecache != nil { + invalidPredicates := sets.NewString(predicates.CheckVolumeBinding) + sched.config.Ecache.InvalidateCachedPredicateItemOfAllNodes(invalidPredicates) + } + + // bindVolumesWorker() will update the Pod object to put it back in the scheduler queue + sched.config.VolumeBinder.BindQueue.Add(assumed) + } else { + // We are just waiting for PV controller to finish binding, put it back in the + // scheduler queue + sched.config.Error(assumed, err) + sched.config.Recorder.Eventf(assumed, v1.EventTypeNormal, "FailedScheduling", "%v", err) + sched.config.PodConditionUpdater.Update(assumed, &v1.PodCondition{ + Type: v1.PodScheduled, + Status: v1.ConditionFalse, + Reason: "VolumeBindingWaiting", + }) + } + return err + } + } + return nil +} + +// bindVolumesWorker() processes pods queued in assumeAndBindVolumes() and tries to +// make the API update for volume binding. +// This function runs forever until the volume BindQueue is closed. +func (sched *Scheduler) bindVolumesWorker() { + workFunc := func() bool { + keyObj, quit := sched.config.VolumeBinder.BindQueue.Get() + if quit { + return true + } + defer sched.config.VolumeBinder.BindQueue.Done(keyObj) + + assumed, ok := keyObj.(*v1.Pod) + if !ok { + glog.V(4).Infof("Object is not a *v1.Pod") + return false + } + + // TODO: add metrics + var reason string + var eventType string + + glog.V(5).Infof("Trying to bind volumes for pod \"%v/%v\"", assumed.Namespace, assumed.Name) + + // The Pod is always sent back to the scheduler afterwards. + err := sched.config.VolumeBinder.Binder.BindPodVolumes(assumed) + if err != nil { + glog.V(1).Infof("Failed to bind volumes for pod \"%v/%v\": %v", assumed.Namespace, assumed.Name, err) + reason = "VolumeBindingFailed" + eventType = v1.EventTypeWarning + } else { + glog.V(4).Infof("Successfully bound volumes for pod \"%v/%v\"", assumed.Namespace, assumed.Name) + reason = "VolumeBindingWaiting" + eventType = v1.EventTypeNormal + err = fmt.Errorf("Volume binding started, waiting for completion") + } + + // Always fail scheduling regardless of binding success. + // The Pod needs to be sent back through the scheduler to: + // * Retry volume binding if it fails. + // * Retry volume binding if dynamic provisioning fails. + // * Bind the Pod to the Node once all volumes are bound. + sched.config.Error(assumed, err) + sched.config.Recorder.Eventf(assumed, eventType, "FailedScheduling", "%v", err) + sched.config.PodConditionUpdater.Update(assumed, &v1.PodCondition{ + Type: v1.PodScheduled, + Status: v1.ConditionFalse, + Reason: reason, + }) + return false + } + + for { + if quit := workFunc(); quit { + glog.V(4).Infof("bindVolumesWorker shutting down") + break + } + } +} + // assume signals to the cache that a pod is already in the cache, so that binding can be asynchronous.
// assume modifies `assumed`. func (sched *Scheduler) assume(assumed *v1.Pod, host string) error { @@ -334,15 +454,32 @@ func (sched *Scheduler) scheduleOne() { // Tell the cache to assume that a pod now is running on a given node, even though it hasn't been bound yet. // This allows us to keep scheduling without waiting on binding to occur. - assumedPod := *pod + assumedPod := pod.DeepCopy() + + // Assume volumes first before assuming the pod. + // + // If no volumes need binding, then nil is returned, and continue to assume the pod. + // + // Otherwise, error is returned and volume binding is started asynchronously for all of the pod's volumes. + // scheduleOne() returns immediately on error, so that it doesn't continue to assume the pod. + // + // After the asynchronous volume binding updates are made, it will send the pod back through the scheduler for + // subsequent passes until all volumes are fully bound. + // + // This function modifies 'assumedPod' if volume binding is required. + err = sched.assumeAndBindVolumes(assumedPod, suggestedHost) + if err != nil { + return + } + // assume modifies `assumedPod` by setting NodeName=suggestedHost - err = sched.assume(&assumedPod, suggestedHost) + err = sched.assume(assumedPod, suggestedHost) if err != nil { return } // bind the pod to its host asynchronously (we can do this b/c of the assumption step above). go func() { - err := sched.bind(&assumedPod, &v1.Binding{ + err := sched.bind(assumedPod, &v1.Binding{ ObjectMeta: metav1.ObjectMeta{Namespace: assumedPod.Namespace, Name: assumedPod.Name, UID: assumedPod.UID}, Target: v1.ObjectReference{ Kind: "Node", diff --git a/plugin/pkg/scheduler/scheduler_test.go b/plugin/pkg/scheduler/scheduler_test.go index e44bf2bbe9c..e08397723f7 100644 --- a/plugin/pkg/scheduler/scheduler_test.go +++ b/plugin/pkg/scheduler/scheduler_test.go @@ -29,15 +29,18 @@ import ( "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/util/diff" "k8s.io/apimachinery/pkg/util/wait" + utilfeature "k8s.io/apiserver/pkg/util/feature" clientcache "k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/record" "k8s.io/kubernetes/pkg/api/legacyscheme" + "k8s.io/kubernetes/pkg/controller/volume/persistentvolume" "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm" "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/predicates" "k8s.io/kubernetes/plugin/pkg/scheduler/core" "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache" schedulertesting "k8s.io/kubernetes/plugin/pkg/scheduler/testing" "k8s.io/kubernetes/plugin/pkg/scheduler/util" + "k8s.io/kubernetes/plugin/pkg/scheduler/volumebinder" ) type fakeBinder struct { @@ -420,7 +423,7 @@ func TestSchedulerErrorWithLongBinding(t *testing.T) { func setupTestSchedulerWithOnePodOnNode(t *testing.T, queuedPodStore *clientcache.FIFO, scache schedulercache.Cache, nodeLister schedulertesting.FakeNodeLister, predicateMap map[string]algorithm.FitPredicate, pod *v1.Pod, node *v1.Node) (*Scheduler, chan *v1.Binding, chan error) { - scheduler, bindingChan, errChan := setupTestScheduler(queuedPodStore, scache, nodeLister, predicateMap) + scheduler, bindingChan, errChan := setupTestScheduler(queuedPodStore, scache, nodeLister, predicateMap, nil) queuedPodStore.Add(pod) // queuedPodStore: [foo:8080] @@ -495,7 +498,7 @@ func TestSchedulerFailedSchedulingReasons(t *testing.T) { predicates.NewInsufficientResourceError(v1.ResourceMemory, 500, 0, 100), } } - scheduler, _, errChan := setupTestScheduler(queuedPodStore, scache, nodeLister, predicateMap) + scheduler, _, errChan := 
setupTestScheduler(queuedPodStore, scache, nodeLister, predicateMap, nil) queuedPodStore.Add(podWithTooBigResourceRequests) scheduler.scheduleOne() @@ -519,7 +522,7 @@ func TestSchedulerFailedSchedulingReasons(t *testing.T) { // queuedPodStore: pods queued before processing. // scache: scheduler cache that might contain assumed pods. -func setupTestScheduler(queuedPodStore *clientcache.FIFO, scache schedulercache.Cache, nodeLister schedulertesting.FakeNodeLister, predicateMap map[string]algorithm.FitPredicate) (*Scheduler, chan *v1.Binding, chan error) { +func setupTestScheduler(queuedPodStore *clientcache.FIFO, scache schedulercache.Cache, nodeLister schedulertesting.FakeNodeLister, predicateMap map[string]algorithm.FitPredicate, recorder record.EventRecorder) (*Scheduler, chan *v1.Binding, chan error) { algo := core.NewGenericScheduler( scache, nil, @@ -528,7 +531,8 @@ func setupTestScheduler(queuedPodStore *clientcache.FIFO, scache schedulercache. algorithm.EmptyPredicateMetadataProducer, []algorithm.PriorityConfig{}, algorithm.EmptyMetadataProducer, - []algorithm.SchedulerExtender{}) + []algorithm.SchedulerExtender{}, + nil) bindingChan := make(chan *v1.Binding, 1) errChan := make(chan error, 1) configurator := &FakeConfigurator{ @@ -552,6 +556,10 @@ func setupTestScheduler(queuedPodStore *clientcache.FIFO, scache schedulercache. }, } + if recorder != nil { + configurator.Config.Recorder = recorder + } + sched, _ := NewFromConfigurator(configurator, nil...) return sched, bindingChan, errChan @@ -566,7 +574,8 @@ func setupTestSchedulerLongBindingWithRetry(queuedPodStore *clientcache.FIFO, sc algorithm.EmptyPredicateMetadataProducer, []algorithm.PriorityConfig{}, algorithm.EmptyMetadataProducer, - []algorithm.SchedulerExtender{}) + []algorithm.SchedulerExtender{}, + nil) bindingChan := make(chan *v1.Binding, 2) configurator := &FakeConfigurator{ Config: &Config{ @@ -598,3 +607,205 @@ func setupTestSchedulerLongBindingWithRetry(queuedPodStore *clientcache.FIFO, sc return sched, bindingChan } + +func setupTestSchedulerWithVolumeBinding(fakeVolumeBinder *volumebinder.VolumeBinder, stop <-chan struct{}, broadcaster record.EventBroadcaster) (*Scheduler, chan *v1.Binding, chan error) { + testNode := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}} + nodeLister := schedulertesting.FakeNodeLister([]*v1.Node{&testNode}) + queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc) + queuedPodStore.Add(podWithID("foo", "")) + scache := schedulercache.New(10*time.Minute, stop) + scache.AddNode(&testNode) + + predicateMap := map[string]algorithm.FitPredicate{ + "VolumeBindingChecker": predicates.NewVolumeBindingPredicate(fakeVolumeBinder), + } + + recorder := broadcaster.NewRecorder(legacyscheme.Scheme, v1.EventSource{Component: "scheduler"}) + s, bindingChan, errChan := setupTestScheduler(queuedPodStore, scache, nodeLister, predicateMap, recorder) + s.config.VolumeBinder = fakeVolumeBinder + return s, bindingChan, errChan +} + +// This is a workaround because golint complains that errors cannot +// end with punctuation. However, the real predicate error message does +// end with a period. 
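Aside (illustrative sketch, not part of this patch): the table-driven test below exercises the full scheduleOne path. The predicate can also be driven directly against the fake binder; this smaller example assumes it lives in the same scheduler_test.go package, reusing its imports and the podWithID helper, and the FakeVolumeBinderConfig fields (including their existing "Satsified" spelling) from the table that follows:

func TestVolumeBindingPredicateSketch(t *testing.T) {
	// The predicate is a no-op unless the VolumeScheduling gate is enabled.
	if err := utilfeature.DefaultFeatureGate.Set("VolumeScheduling=true"); err != nil {
		t.Fatalf("Failed to enable feature gate for VolumeScheduling: %v", err)
	}
	defer utilfeature.DefaultFeatureGate.Set("VolumeScheduling=false")

	// Fake binder reporting that unbound PVCs can be matched and that bound
	// PVs are compatible with the node.
	binder := volumebinder.NewFakeVolumeBinder(&persistentvolume.FakeVolumeBinderConfig{
		FindUnboundSatsified: true,
		FindBoundSatsified:   true,
	})
	fit := predicates.NewVolumeBindingPredicate(binder)

	nodeInfo := &schedulercache.NodeInfo{}
	nodeInfo.SetNode(&v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "machine1"}})

	fits, reasons, err := fit(podWithID("foo", ""), nil, nodeInfo)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if !fits || len(reasons) != 0 {
		t.Errorf("expected pod to fit, got fits=%v reasons=%v", fits, reasons)
	}
}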
+func makePredicateError(failReason string) error { + s := fmt.Sprintf("0/1 nodes are available: %v.", failReason) + return fmt.Errorf(s) +} + +func TestSchedulerWithVolumeBinding(t *testing.T) { + findErr := fmt.Errorf("find err") + assumeErr := fmt.Errorf("assume err") + bindErr := fmt.Errorf("bind err") + + eventBroadcaster := record.NewBroadcaster() + eventBroadcaster.StartLogging(t.Logf).Stop() + + // This can be small because we wait for pod to finish scheduling first + chanTimeout := 2 * time.Second + + utilfeature.DefaultFeatureGate.Set("VolumeScheduling=true") + defer utilfeature.DefaultFeatureGate.Set("VolumeScheduling=false") + + table := map[string]struct { + expectError error + expectPodBind *v1.Binding + expectAssumeCalled bool + expectBindCalled bool + eventReason string + volumeBinderConfig *persistentvolume.FakeVolumeBinderConfig + }{ + "all-bound": { + volumeBinderConfig: &persistentvolume.FakeVolumeBinderConfig{ + AllBound: true, + FindUnboundSatsified: true, + FindBoundSatsified: true, + }, + expectAssumeCalled: true, + expectPodBind: &v1.Binding{ObjectMeta: metav1.ObjectMeta{Name: "foo"}, Target: v1.ObjectReference{Kind: "Node", Name: "machine1"}}, + eventReason: "Scheduled", + }, + "bound,invalid-pv-affinity": { + volumeBinderConfig: &persistentvolume.FakeVolumeBinderConfig{ + AllBound: true, + FindUnboundSatsified: true, + FindBoundSatsified: false, + }, + eventReason: "FailedScheduling", + expectError: makePredicateError("1 VolumeNodeAffinityConflict"), + }, + "unbound,no-matches": { + volumeBinderConfig: &persistentvolume.FakeVolumeBinderConfig{ + FindUnboundSatsified: false, + FindBoundSatsified: true, + }, + eventReason: "FailedScheduling", + expectError: makePredicateError("1 VolumeBindingNoMatch"), + }, + "bound-and-unbound-unsatisfied": { + volumeBinderConfig: &persistentvolume.FakeVolumeBinderConfig{ + FindUnboundSatsified: false, + FindBoundSatsified: false, + }, + eventReason: "FailedScheduling", + expectError: makePredicateError("1 VolumeBindingNoMatch, 1 VolumeNodeAffinityConflict"), + }, + "unbound,found-matches": { + volumeBinderConfig: &persistentvolume.FakeVolumeBinderConfig{ + FindUnboundSatsified: true, + FindBoundSatsified: true, + AssumeBindingRequired: true, + }, + expectAssumeCalled: true, + expectBindCalled: true, + eventReason: "FailedScheduling", + expectError: fmt.Errorf("Volume binding started, waiting for completion"), + }, + "unbound,found-matches,already-bound": { + volumeBinderConfig: &persistentvolume.FakeVolumeBinderConfig{ + FindUnboundSatsified: true, + FindBoundSatsified: true, + AssumeBindingRequired: false, + }, + expectAssumeCalled: true, + expectBindCalled: false, + eventReason: "FailedScheduling", + expectError: fmt.Errorf("Volume binding started, waiting for completion"), + }, + "predicate-error": { + volumeBinderConfig: &persistentvolume.FakeVolumeBinderConfig{ + FindErr: findErr, + }, + eventReason: "FailedScheduling", + expectError: findErr, + }, + "assume-error": { + volumeBinderConfig: &persistentvolume.FakeVolumeBinderConfig{ + FindUnboundSatsified: true, + FindBoundSatsified: true, + AssumeErr: assumeErr, + }, + expectAssumeCalled: true, + eventReason: "FailedScheduling", + expectError: assumeErr, + }, + "bind-error": { + volumeBinderConfig: &persistentvolume.FakeVolumeBinderConfig{ + FindUnboundSatsified: true, + FindBoundSatsified: true, + AssumeBindingRequired: true, + BindErr: bindErr, + }, + expectAssumeCalled: true, + expectBindCalled: true, + eventReason: "FailedScheduling", + expectError: bindErr, + }, + } + + 
for name, item := range table { + stop := make(chan struct{}) + fakeVolumeBinder := volumebinder.NewFakeVolumeBinder(item.volumeBinderConfig) + internalBinder, ok := fakeVolumeBinder.Binder.(*persistentvolume.FakeVolumeBinder) + if !ok { + t.Fatalf("Failed to get fake volume binder") + } + s, bindingChan, errChan := setupTestSchedulerWithVolumeBinding(fakeVolumeBinder, stop, eventBroadcaster) + + eventChan := make(chan struct{}) + events := eventBroadcaster.StartEventWatcher(func(e *v1.Event) { + if e, a := item.eventReason, e.Reason; e != a { + t.Errorf("%v: expected %v, got %v", name, e, a) + } + close(eventChan) + }) + + go fakeVolumeBinder.Run(s.bindVolumesWorker, stop) + + s.scheduleOne() + + // Wait for pod to succeed or fail scheduling + select { + case <-eventChan: + case <-time.After(wait.ForeverTestTimeout): + t.Fatalf("%v: scheduling timeout after %v", name, wait.ForeverTestTimeout) + } + + events.Stop() + + // Wait for scheduling to return an error + select { + case err := <-errChan: + if item.expectError == nil || !reflect.DeepEqual(item.expectError.Error(), err.Error()) { + t.Errorf("%v: \n err \nWANT=%+v,\nGOT=%+v", name, item.expectError, err) + } + case <-time.After(chanTimeout): + if item.expectError != nil { + t.Errorf("%v: did not receive error after %v", name, chanTimeout) + } + } + + // Wait for pod to succeed binding + select { + case b := <-bindingChan: + if !reflect.DeepEqual(item.expectPodBind, b) { + t.Errorf("%v: \n err \nWANT=%+v,\nGOT=%+v", name, item.expectPodBind, b) + } + case <-time.After(chanTimeout): + if item.expectPodBind != nil { + t.Errorf("%v: did not receive pod binding after %v", name, chanTimeout) + } + } + + if item.expectAssumeCalled != internalBinder.AssumeCalled { + t.Errorf("%v: expectedAssumeCall %v", name, item.expectAssumeCalled) + } + + if item.expectBindCalled != internalBinder.BindCalled { + t.Errorf("%v: expectedBindCall %v", name, item.expectBindCalled) + } + + close(stop) + } +} diff --git a/plugin/pkg/scheduler/volumebinder/BUILD b/plugin/pkg/scheduler/volumebinder/BUILD new file mode 100644 index 00000000000..f942bfecdb8 --- /dev/null +++ b/plugin/pkg/scheduler/volumebinder/BUILD @@ -0,0 +1,31 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "go_default_library", + srcs = ["volume_binder.go"], + importpath = "k8s.io/kubernetes/plugin/pkg/scheduler/volumebinder", + visibility = ["//visibility:public"], + deps = [ + "//pkg/controller/volume/persistentvolume:go_default_library", + "//vendor/k8s.io/api/core/v1:go_default_library", + "//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library", + "//vendor/k8s.io/client-go/informers/core/v1:go_default_library", + "//vendor/k8s.io/client-go/informers/storage/v1:go_default_library", + "//vendor/k8s.io/client-go/kubernetes:go_default_library", + "//vendor/k8s.io/client-go/util/workqueue:go_default_library", + ], +) + +filegroup( + name = "package-srcs", + srcs = glob(["**"]), + tags = ["automanaged"], + visibility = ["//visibility:private"], +) + +filegroup( + name = "all-srcs", + srcs = [":package-srcs"], + tags = ["automanaged"], + visibility = ["//visibility:public"], +) diff --git a/plugin/pkg/scheduler/volumebinder/volume_binder.go b/plugin/pkg/scheduler/volumebinder/volume_binder.go new file mode 100644 index 00000000000..957c4e18aac --- /dev/null +++ b/plugin/pkg/scheduler/volumebinder/volume_binder.go @@ -0,0 +1,74 @@ +/* +Copyright 2017 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package volumebinder + +import ( + "time" + + "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/wait" + coreinformers "k8s.io/client-go/informers/core/v1" + storageinformers "k8s.io/client-go/informers/storage/v1" + clientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/util/workqueue" + "k8s.io/kubernetes/pkg/controller/volume/persistentvolume" +) + +// VolumeBinder sets up the volume binding library and manages +// the volume binding operations with a queue. +type VolumeBinder struct { + Binder persistentvolume.SchedulerVolumeBinder + BindQueue *workqueue.Type +} + +// NewVolumeBinder sets up the volume binding library and binding queue +func NewVolumeBinder( + client clientset.Interface, + pvcInformer coreinformers.PersistentVolumeClaimInformer, + pvInformer coreinformers.PersistentVolumeInformer, + nodeInformer coreinformers.NodeInformer, + storageClassInformer storageinformers.StorageClassInformer) *VolumeBinder { + + return &VolumeBinder{ + Binder: persistentvolume.NewVolumeBinder(client, pvcInformer, pvInformer, nodeInformer, storageClassInformer), + BindQueue: workqueue.NewNamed("podsToBind"), + } +} + +// NewFakeVolumeBinder sets up a fake volume binder and binding queue +func NewFakeVolumeBinder(config *persistentvolume.FakeVolumeBinderConfig) *VolumeBinder { + return &VolumeBinder{ + Binder: persistentvolume.NewFakeVolumeBinder(config), + BindQueue: workqueue.NewNamed("podsToBind"), + } +} + +// Run starts a goroutine to handle the binding queue with the given function. +func (b *VolumeBinder) Run(bindWorkFunc func(), stopCh <-chan struct{}) { + go wait.Until(bindWorkFunc, time.Second, stopCh) + + <-stopCh + b.BindQueue.ShutDown() +} + +// DeletePodBindings will delete the cached volume bindings for the given pod. +func (b *VolumeBinder) DeletePodBindings(pod *v1.Pod) { + cache := b.Binder.GetBindingsCache() + if cache != nil && pod != nil { + cache.DeleteBindings(pod) + } +} diff --git a/test/e2e/storage/BUILD b/test/e2e/storage/BUILD index 5071d682e28..c65c4413f9b 100644 --- a/test/e2e/storage/BUILD +++ b/test/e2e/storage/BUILD @@ -62,6 +62,7 @@ go_library( "//vendor/github.com/vmware/govmomi/vim25/types:go_default_library", "//vendor/golang.org/x/net/context:go_default_library", "//vendor/google.golang.org/api/googleapi:go_default_library", + "//vendor/k8s.io/api/apps/v1beta1:go_default_library", "//vendor/k8s.io/api/batch/v1:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library", "//vendor/k8s.io/api/extensions/v1beta1:go_default_library", diff --git a/test/e2e/storage/persistent_volumes-local.go b/test/e2e/storage/persistent_volumes-local.go index 49b3dbe91d3..5dd4cba67d9 100644 --- a/test/e2e/storage/persistent_volumes-local.go +++ b/test/e2e/storage/persistent_volumes-local.go @@ -19,6 +19,7 @@ package storage import ( "encoding/json" "fmt" + "math/rand" "path" "path/filepath" "strconv" @@ -28,45 +29,51 @@ import ( . "github.com/onsi/ginkgo" . 
"github.com/onsi/gomega" + appsv1beta1 "k8s.io/api/apps/v1beta1" batchv1 "k8s.io/api/batch/v1" "k8s.io/api/core/v1" rbacv1beta1 "k8s.io/api/rbac/v1beta1" + storagev1 "k8s.io/api/storage/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/runtime/schema" utilerrors "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/uuid" + "k8s.io/apimachinery/pkg/util/wait" clientset "k8s.io/client-go/kubernetes" "k8s.io/kubernetes/test/e2e/framework" + imageutils "k8s.io/kubernetes/test/utils/image" ) type localTestConfig struct { - ns string - nodes *v1.NodeList - node0 *v1.Node - client clientset.Interface - scName string + ns string + nodes []v1.Node + node0 *v1.Node + client clientset.Interface + scName string + ssTester *framework.StatefulSetTester } -type LocalVolumeType string +type localVolumeType string const ( // default local volume type, aka a directory - DirectoryLocalVolumeType LocalVolumeType = "dir" + DirectoryLocalVolumeType localVolumeType = "dir" // creates a tmpfs and mounts it - TmpfsLocalVolumeType LocalVolumeType = "tmpfs" + TmpfsLocalVolumeType localVolumeType = "tmpfs" // tests based on local ssd at /mnt/disks/by-uuid/ - GCELocalSSDVolumeType LocalVolumeType = "gce-localssd-scsi-fs" + GCELocalSSDVolumeType localVolumeType = "gce-localssd-scsi-fs" ) -var setupLocalVolumeMap = map[LocalVolumeType]func(*localTestConfig) *localTestVolume{ +var setupLocalVolumeMap = map[localVolumeType]func(*localTestConfig, *v1.Node) *localTestVolume{ GCELocalSSDVolumeType: setupLocalVolumeGCELocalSSD, TmpfsLocalVolumeType: setupLocalVolumeTmpfs, DirectoryLocalVolumeType: setupLocalVolumeDirectory, } -var cleanupLocalVolumeMap = map[LocalVolumeType]func(*localTestConfig, *localTestVolume){ +var cleanupLocalVolumeMap = map[localVolumeType]func(*localTestConfig, *localTestVolume){ GCELocalSSDVolumeType: cleanupLocalVolumeGCELocalSSD, TmpfsLocalVolumeType: cleanupLocalVolumeTmpfs, DirectoryLocalVolumeType: cleanupLocalVolumeDirectory, @@ -82,7 +89,7 @@ type localTestVolume struct { // PV for this volume pv *v1.PersistentVolume // Type of local volume - localVolumeType LocalVolumeType + localVolumeType localVolumeType } const ( @@ -121,9 +128,15 @@ const ( testRequestSize = "10Mi" ) -// Common selinux labels -var selinuxLabel = &v1.SELinuxOptions{ - Level: "s0:c0,c1"} +var ( + // storage class volume binding modes + waitMode = storagev1.VolumeBindingWaitForFirstConsumer + immediateMode = storagev1.VolumeBindingImmediate + + // Common selinux labels + selinuxLabel = &v1.SELinuxOptions{ + Level: "s0:c0,c1"} +) var _ = SIGDescribe("PersistentVolumes-local [Feature:LocalPersistentVolumes] [Serial]", func() { f := framework.NewDefaultFramework("persistent-local-volumes-test") @@ -137,16 +150,18 @@ var _ = SIGDescribe("PersistentVolumes-local [Feature:LocalPersistentVolumes] [S // Get all the schedulable nodes nodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet) Expect(len(nodes.Items)).NotTo(BeZero(), "No available nodes for scheduling") - scName = fmt.Sprintf("%v-%v", testSCPrefix, f.Namespace.Name) + scName = fmt.Sprintf("%v-%v-%v", testSCPrefix, f.Namespace.Name, rand.Int()) // Choose the first node node0 := &nodes.Items[0] + ssTester := framework.NewStatefulSetTester(f.ClientSet) config = &localTestConfig{ - ns: f.Namespace.Name, - client: f.ClientSet, - nodes: nodes, - node0: node0, - scName: scName, + ns: f.Namespace.Name, + client: f.ClientSet, + 
nodes: nodes.Items, + node0: node0, + scName: scName, + ssTester: ssTester, } }) @@ -155,22 +170,21 @@ var _ = SIGDescribe("PersistentVolumes-local [Feature:LocalPersistentVolumes] [S var testVol *localTestVolume BeforeEach(func() { - testVol = setupLocalVolumePVCPV(config, DirectoryLocalVolumeType) + setupStorageClass(config, &waitMode) + testVols := setupLocalVolumesPVCsPVs(config, DirectoryLocalVolumeType, config.node0, 1, waitMode) + testVol = testVols[0] }) AfterEach(func() { - cleanupLocalVolume(config, testVol) + cleanupLocalVolumes(config, []*localTestVolume{testVol}) + cleanupStorageClass(config) }) It("should be able to mount volume and read from pod1", func() { By("Creating pod1") pod1, pod1Err := createLocalPod(config, testVol) Expect(pod1Err).NotTo(HaveOccurred()) - - pod1NodeName, pod1NodeNameErr := podNodeName(config, pod1) - Expect(pod1NodeNameErr).NotTo(HaveOccurred()) - framework.Logf("pod1 %q created on Node %q", pod1.Name, pod1NodeName) - Expect(pod1NodeName).To(Equal(config.node0.Name)) + verifyLocalPod(config, testVol, pod1, config.node0.Name) By("Reading in pod1") // testFileContent was written during setupLocalVolume @@ -184,11 +198,7 @@ var _ = SIGDescribe("PersistentVolumes-local [Feature:LocalPersistentVolumes] [S By("Creating pod1") pod1, pod1Err := createLocalPod(config, testVol) Expect(pod1Err).NotTo(HaveOccurred()) - - pod1NodeName, pod1NodeNameErr := podNodeName(config, pod1) - Expect(pod1NodeNameErr).NotTo(HaveOccurred()) - framework.Logf("pod1 %q created on Node %q", pod1.Name, pod1NodeName) - Expect(pod1NodeName).To(Equal(config.node0.Name)) + verifyLocalPod(config, testVol, pod1, config.node0.Name) // testFileContent was written during setupLocalVolume testReadFileContent(volumeDir, testFile, testFileContent, pod1) @@ -202,48 +212,48 @@ var _ = SIGDescribe("PersistentVolumes-local [Feature:LocalPersistentVolumes] [S }) }) - Context("when two pods request one prebound PVC one after other", func() { + localVolumeTypes := []localVolumeType{DirectoryLocalVolumeType, TmpfsLocalVolumeType, GCELocalSSDVolumeType} + for _, tempTestVolType := range localVolumeTypes { - var testVol *localTestVolume - - BeforeEach(func() { - testVol = setupLocalVolumePVCPV(config, DirectoryLocalVolumeType) - }) - - AfterEach(func() { - cleanupLocalVolume(config, testVol) - }) - }) - - LocalVolumeTypes := []LocalVolumeType{DirectoryLocalVolumeType, TmpfsLocalVolumeType, GCELocalSSDVolumeType} - for _, tempTestVolType := range LocalVolumeTypes { // New variable required for gingko test closures testVolType := tempTestVolType - ctxString := fmt.Sprintf("when using volume type %s", testVolType) + ctxString := fmt.Sprintf("[Volume type: %s]", testVolType) + testMode := immediateMode + Context(ctxString, func() { + BeforeEach(func() { if testVolType == GCELocalSSDVolumeType { SkipUnlessLocalSSDExists("scsi", "fs", config.node0) } + setupStorageClass(config, &testMode) + + }) + + AfterEach(func() { + cleanupStorageClass(config) }) Context("when two pods mount a local volume at the same time", func() { It("should be able to write from pod1 and read from pod2", func() { var testVol *localTestVolume - testVol = setupLocalVolumePVCPV(config, testVolType) + testVols := setupLocalVolumesPVCsPVs(config, testVolType, config.node0, 1, testMode) + testVol = testVols[0] twoPodsReadWriteTest(config, testVol) - cleanupLocalVolume(config, testVol) + cleanupLocalVolumes(config, testVols) }) - }) + Context("when two pods mount a local volume one after the other", func() { It("should be able to write from 
pod1 and read from pod2", func() { var testVol *localTestVolume - testVol = setupLocalVolumePVCPV(config, testVolType) + testVols := setupLocalVolumesPVCsPVs(config, testVolType, config.node0, 1, testMode) + testVol = testVols[0] twoPodsReadWriteSerialTest(config, testVol) - cleanupLocalVolume(config, testVol) + cleanupLocalVolumes(config, testVols) }) }) + Context("when pod using local volume with non-existant path", func() { ep := &eventPatterns{ @@ -259,17 +269,19 @@ var _ = SIGDescribe("PersistentVolumes-local [Feature:LocalPersistentVolumes] [S localVolumeType: testVolType, } By("Creating local PVC and PV") - createLocalPVCPV(config, testVol) + createLocalPVCsPVs(config, []*localTestVolume{testVol}, testMode) pod, err := createLocalPod(config, testVol) Expect(err).To(HaveOccurred()) checkPodEvents(config, pod.Name, ep) + verifyLocalVolume(config, testVol) + cleanupLocalPVCsPVs(config, []*localTestVolume{testVol}) }) - }) + Context("when pod's node is different from PV's NodeAffinity", func() { BeforeEach(func() { - if len(config.nodes.Items) < 2 { + if len(config.nodes) < 2 { framework.Skipf("Runs only when number of nodes >= 2") } }) @@ -278,20 +290,22 @@ var _ = SIGDescribe("PersistentVolumes-local [Feature:LocalPersistentVolumes] [S reason: "FailedScheduling", pattern: make([]string, 2)} ep.pattern = append(ep.pattern, "MatchNodeSelector") - ep.pattern = append(ep.pattern, "NoVolumeNodeConflict") + ep.pattern = append(ep.pattern, "VolumeNodeAffinityConflict") It("should not be able to mount due to different NodeAffinity", func() { - testPodWithNodeName(config, testVolType, ep, config.nodes.Items[1].Name, makeLocalPodWithNodeAffinity) + testPodWithNodeName(config, testVolType, ep, config.nodes[1].Name, makeLocalPodWithNodeAffinity, testMode) }) + It("should not be able to mount due to different NodeSelector", func() { - testPodWithNodeName(config, testVolType, ep, config.nodes.Items[1].Name, makeLocalPodWithNodeSelector) + testPodWithNodeName(config, testVolType, ep, config.nodes[1].Name, makeLocalPodWithNodeSelector, testMode) }) }) + Context("when pod's node is different from PV's NodeName", func() { BeforeEach(func() { - if len(config.nodes.Items) < 2 { + if len(config.nodes) < 2 { framework.Skipf("Runs only when number of nodes >= 2") } }) @@ -301,12 +315,11 @@ var _ = SIGDescribe("PersistentVolumes-local [Feature:LocalPersistentVolumes] [S pattern: make([]string, 2)} ep.pattern = append(ep.pattern, "NodeSelectorTerm") ep.pattern = append(ep.pattern, "Storage node affinity check failed") + It("should not be able to mount due to different NodeName", func() { - testPodWithNodeName(config, testVolType, ep, config.nodes.Items[1].Name, makeLocalPodWithNodeName) + testPodWithNodeName(config, testVolType, ep, config.nodes[1].Name, makeLocalPodWithNodeName, testMode) }) - }) - }) } @@ -314,6 +327,7 @@ var _ = SIGDescribe("PersistentVolumes-local [Feature:LocalPersistentVolumes] [S var volumePath string BeforeEach(func() { + setupStorageClass(config, &immediateMode) setupLocalVolumeProvisioner(config) volumePath = path.Join( hostBase, discoveryDir, fmt.Sprintf("vol-%v", string(uuid.NewUUID()))) @@ -321,6 +335,7 @@ var _ = SIGDescribe("PersistentVolumes-local [Feature:LocalPersistentVolumes] [S AfterEach(func() { cleanupLocalVolumeProvisioner(config, volumePath) + cleanupStorageClass(config) }) It("should create and recreate local persistent volume", func() { @@ -367,13 +382,52 @@ var _ = SIGDescribe("PersistentVolumes-local [Feature:LocalPersistentVolumes] [S 
Expect(err).NotTo(HaveOccurred()) }) }) + + Context("when StatefulSet has pod anti-affinity", func() { + var testVols map[string][]*localTestVolume + const ( + ssReplicas = 3 + volsPerNode = 2 + ) + + BeforeEach(func() { + if len(config.nodes) < ssReplicas { + framework.Skipf("Runs only when number of nodes >= %v", ssReplicas) + } + setupStorageClass(config, &waitMode) + + testVols = map[string][]*localTestVolume{} + for i, node := range config.nodes { + // The PVCs created here won't be used + By(fmt.Sprintf("Setting up local volumes on node %q", node.Name)) + vols := setupLocalVolumesPVCsPVs(config, DirectoryLocalVolumeType, &config.nodes[i], volsPerNode, waitMode) + testVols[node.Name] = vols + } + }) + + AfterEach(func() { + for _, vols := range testVols { + cleanupLocalVolumes(config, vols) + } + cleanupStorageClass(config) + }) + + It("should use volumes spread across nodes", func() { + By("Creating a StatefulSet with pod anti-affinity on nodes") + ss := createStatefulSet(config, ssReplicas, volsPerNode) + validateStatefulSet(config, ss) + }) + }) + + // TODO: add stress test that creates many pods in parallel across multiple nodes }) type makeLocalPodWith func(config *localTestConfig, volume *localTestVolume, nodeName string) *v1.Pod -func testPodWithNodeName(config *localTestConfig, testVolType LocalVolumeType, ep *eventPatterns, nodeName string, makeLocalPodFunc makeLocalPodWith) { - var testVol *localTestVolume - testVol = setupLocalVolumePVCPV(config, testVolType) +func testPodWithNodeName(config *localTestConfig, testVolType localVolumeType, ep *eventPatterns, nodeName string, makeLocalPodFunc makeLocalPodWith, bindingMode storagev1.VolumeBindingMode) { + By(fmt.Sprintf("local-volume-type: %s", testVolType)) + testVols := setupLocalVolumesPVCsPVs(config, testVolType, config.node0, 1, bindingMode) + testVol := testVols[0] pod := makeLocalPodFunc(config, testVol, nodeName) pod, err := config.client.CoreV1().Pods(config.ns).Create(pod) @@ -381,7 +435,8 @@ func testPodWithNodeName(config *localTestConfig, testVolType LocalVolumeType, e err = framework.WaitForPodRunningInNamespace(config.client, pod) Expect(err).To(HaveOccurred()) checkPodEvents(config, pod.Name, ep) - cleanupLocalVolume(config, testVol) + + cleanupLocalVolumes(config, []*localTestVolume{testVol}) } type eventPatterns struct { @@ -413,12 +468,7 @@ func twoPodsReadWriteTest(config *localTestConfig, testVol *localTestVolume) { By("Creating pod1 to write to the PV") pod1, pod1Err := createLocalPod(config, testVol) Expect(pod1Err).NotTo(HaveOccurred()) - - framework.ExpectNoError(framework.WaitForPodRunningInNamespace(config.client, pod1)) - pod1NodeName, pod1NodeNameErr := podNodeName(config, pod1) - Expect(pod1NodeNameErr).NotTo(HaveOccurred()) - framework.Logf("Pod1 %q created on Node %q", pod1.Name, pod1NodeName) - Expect(pod1NodeName).To(Equal(config.node0.Name)) + verifyLocalPod(config, testVol, pod1, config.node0.Name) // testFileContent was written during setupLocalVolume testReadFileContent(volumeDir, testFile, testFileContent, pod1) @@ -426,12 +476,7 @@ func twoPodsReadWriteTest(config *localTestConfig, testVol *localTestVolume) { By("Creating pod2 to read from the PV") pod2, pod2Err := createLocalPod(config, testVol) Expect(pod2Err).NotTo(HaveOccurred()) - - framework.ExpectNoError(framework.WaitForPodRunningInNamespace(config.client, pod2)) - pod2NodeName, pod2NodeNameErr := podNodeName(config, pod2) - Expect(pod2NodeNameErr).NotTo(HaveOccurred()) - framework.Logf("Pod2 %q created on Node %q", pod2.Name, 
pod2NodeName) - Expect(pod2NodeName).To(Equal(config.node0.Name)) + verifyLocalPod(config, testVol, pod2, config.node0.Name) // testFileContent was written during setupLocalVolume testReadFileContent(volumeDir, testFile, testFileContent, pod2) @@ -455,12 +500,7 @@ func twoPodsReadWriteSerialTest(config *localTestConfig, testVol *localTestVolum By("Creating pod1") pod1, pod1Err := createLocalPod(config, testVol) Expect(pod1Err).NotTo(HaveOccurred()) - - framework.ExpectNoError(framework.WaitForPodRunningInNamespace(config.client, pod1)) - pod1NodeName, pod1NodeNameErr := podNodeName(config, pod1) - Expect(pod1NodeNameErr).NotTo(HaveOccurred()) - framework.Logf("Pod1 %q created on Node %q", pod1.Name, pod1NodeName) - Expect(pod1NodeName).To(Equal(config.node0.Name)) + verifyLocalPod(config, testVol, pod1, config.node0.Name) // testFileContent was written during setupLocalVolume testReadFileContent(volumeDir, testFile, testFileContent, pod1) @@ -476,12 +516,7 @@ func twoPodsReadWriteSerialTest(config *localTestConfig, testVol *localTestVolum By("Creating pod2") pod2, pod2Err := createLocalPod(config, testVol) Expect(pod2Err).NotTo(HaveOccurred()) - - framework.ExpectNoError(framework.WaitForPodRunningInNamespace(config.client, pod2)) - pod2NodeName, pod2NodeNameErr := podNodeName(config, pod2) - Expect(pod2NodeNameErr).NotTo(HaveOccurred()) - framework.Logf("Pod2 %q created on Node %q", pod2.Name, pod2NodeName) - Expect(pod2NodeName).To(Equal(config.node0.Name)) + verifyLocalPod(config, testVol, pod2, config.node0.Name) By("Reading in pod2") testReadFileContent(volumeDir, testFile, testVol.hostDir, pod2) @@ -490,78 +525,123 @@ func twoPodsReadWriteSerialTest(config *localTestConfig, testVol *localTestVolum framework.DeletePodOrFail(config.client, config.ns, pod2.Name) } +func setupStorageClass(config *localTestConfig, mode *storagev1.VolumeBindingMode) { + sc := &storagev1.StorageClass{ + ObjectMeta: metav1.ObjectMeta{ + Name: config.scName, + }, + Provisioner: "kubernetes.io/no-provisioner", + VolumeBindingMode: mode, + } + + sc, err := config.client.StorageV1().StorageClasses().Create(sc) + Expect(err).NotTo(HaveOccurred()) +} + +func cleanupStorageClass(config *localTestConfig) { + framework.ExpectNoError(config.client.StorageV1().StorageClasses().Delete(config.scName, nil)) +} + // podNode wraps RunKubectl to get node where pod is running func podNodeName(config *localTestConfig, pod *v1.Pod) (string, error) { runtimePod, runtimePodErr := config.client.CoreV1().Pods(pod.Namespace).Get(pod.Name, metav1.GetOptions{}) return runtimePod.Spec.NodeName, runtimePodErr } -func setupWriteTestFile(hostDir string, config *localTestConfig, localVolumeType LocalVolumeType) *localTestVolume { +// setupLocalVolumes sets up directories to use for local PV +func setupLocalVolumes(config *localTestConfig, localVolumeType localVolumeType, node *v1.Node, count int) []*localTestVolume { + vols := []*localTestVolume{} + for i := 0; i < count; i++ { + setupLocalVolume, ok := setupLocalVolumeMap[localVolumeType] + Expect(ok).To(BeTrue()) + testVol := setupLocalVolume(config, node) + vols = append(vols, testVol) + } + return vols +} + +func cleanupLocalPVCsPVs(config *localTestConfig, volumes []*localTestVolume) { + for _, volume := range volumes { + By("Cleaning up PVC and PV") + errs := framework.PVPVCCleanup(config.client, config.ns, volume.pv, volume.pvc) + if len(errs) > 0 { + framework.Failf("Failed to delete PV and/or PVC: %v", utilerrors.NewAggregate(errs)) + } + } +} + +// Deletes the PVC/PV, and 
launches a pod with hostpath volume to remove the test directory +func cleanupLocalVolumes(config *localTestConfig, volumes []*localTestVolume) { + cleanupLocalPVCsPVs(config, volumes) + + for _, volume := range volumes { + cleanup := cleanupLocalVolumeMap[volume.localVolumeType] + cleanup(config, volume) + } +} + +func setupWriteTestFile(hostDir string, config *localTestConfig, localVolumeType localVolumeType, node *v1.Node) *localTestVolume { writeCmd, _ := createWriteAndReadCmds(hostDir, testFile, testFileContent) - By(fmt.Sprintf("Creating local volume on node %q at path %q", config.node0.Name, hostDir)) - err := framework.IssueSSHCommand(writeCmd, framework.TestContext.Provider, config.node0) + By(fmt.Sprintf("Creating local volume on node %q at path %q", node.Name, hostDir)) + err := framework.IssueSSHCommand(writeCmd, framework.TestContext.Provider, node) Expect(err).NotTo(HaveOccurred()) return &localTestVolume{ - node: config.node0, + node: node, hostDir: hostDir, localVolumeType: localVolumeType, } } -func setupLocalVolumeTmpfs(config *localTestConfig) *localTestVolume { +func setupLocalVolumeTmpfs(config *localTestConfig, node *v1.Node) *localTestVolume { testDirName := "local-volume-test-" + string(uuid.NewUUID()) hostDir := filepath.Join(hostBase, testDirName) - createAndMountTmpfsLocalVolume(config, hostDir) + createAndMountTmpfsLocalVolume(config, hostDir, node) // populate volume with testFile containing testFileContent - return setupWriteTestFile(hostDir, config, TmpfsLocalVolumeType) + return setupWriteTestFile(hostDir, config, TmpfsLocalVolumeType, node) } -func setupLocalVolumeGCELocalSSD(config *localTestConfig) *localTestVolume { - res, err := framework.IssueSSHCommandWithResult("ls /mnt/disks/by-uuid/google-local-ssds-scsi-fs/", framework.TestContext.Provider, config.node0) +func setupLocalVolumeGCELocalSSD(config *localTestConfig, node *v1.Node) *localTestVolume { + res, err := framework.IssueSSHCommandWithResult("ls /mnt/disks/by-uuid/google-local-ssds-scsi-fs/", framework.TestContext.Provider, node) Expect(err).NotTo(HaveOccurred()) dirName := strings.Fields(res.Stdout)[0] hostDir := "/mnt/disks/by-uuid/google-local-ssds-scsi-fs/" + dirName // populate volume with testFile containing testFileContent - return setupWriteTestFile(hostDir, config, GCELocalSSDVolumeType) + return setupWriteTestFile(hostDir, config, GCELocalSSDVolumeType, node) } -func setupLocalVolumeDirectory(config *localTestConfig) *localTestVolume { +func setupLocalVolumeDirectory(config *localTestConfig, node *v1.Node) *localTestVolume { testDirName := "local-volume-test-" + string(uuid.NewUUID()) hostDir := filepath.Join(hostBase, testDirName) // populate volume with testFile containing testFileContent - return setupWriteTestFile(hostDir, config, DirectoryLocalVolumeType) + return setupWriteTestFile(hostDir, config, DirectoryLocalVolumeType, node) } -func cleanupLocalVolume(config *localTestConfig, volume *localTestVolume) { - if volume == nil { - return - } +func verifyLocalVolume(config *localTestConfig, volume *localTestVolume) { + framework.ExpectNoError(framework.WaitOnPVandPVC(config.client, config.ns, volume.pv, volume.pvc)) +} - By("Cleaning up PVC and PV") - errs := framework.PVPVCCleanup(config.client, config.ns, volume.pv, volume.pvc) - if len(errs) > 0 { - framework.Failf("Failed to delete PV and/or PVC: %v", utilerrors.NewAggregate(errs)) - } - - cleanup := cleanupLocalVolumeMap[volume.localVolumeType] - cleanup(config, volume) +func verifyLocalPod(config *localTestConfig, volume 
*localTestVolume, pod *v1.Pod, expectedNodeName string) { + podNodeName, err := podNodeName(config, pod) + Expect(err).NotTo(HaveOccurred()) + framework.Logf("pod %q created on Node %q", pod.Name, podNodeName) + Expect(podNodeName).To(Equal(expectedNodeName)) } // Deletes the PVC/PV, and launches a pod with hostpath volume to remove the test directory func cleanupLocalVolumeGCELocalSSD(config *localTestConfig, volume *localTestVolume) { By("Removing the test directory") removeCmd := fmt.Sprintf("rm %s", volume.hostDir+"/"+testFile) - err := framework.IssueSSHCommand(removeCmd, framework.TestContext.Provider, config.node0) + err := framework.IssueSSHCommand(removeCmd, framework.TestContext.Provider, volume.node) Expect(err).NotTo(HaveOccurred()) } // Deletes the PVC/PV, and launches a pod with hostpath volume to remove the test directory func cleanupLocalVolumeTmpfs(config *localTestConfig, volume *localTestVolume) { - unmountTmpfsLocalVolume(config, volume.hostDir) + unmountTmpfsLocalVolume(config, volume.hostDir, volume.node) By("Removing the test directory") removeCmd := fmt.Sprintf("rm -r %s", volume.hostDir) - err := framework.IssueSSHCommand(removeCmd, framework.TestContext.Provider, config.node0) + err := framework.IssueSSHCommand(removeCmd, framework.TestContext.Provider, volume.node) Expect(err).NotTo(HaveOccurred()) } @@ -569,7 +649,7 @@ func cleanupLocalVolumeTmpfs(config *localTestConfig, volume *localTestVolume) { func cleanupLocalVolumeDirectory(config *localTestConfig, volume *localTestVolume) { By("Removing the test directory") removeCmd := fmt.Sprintf("rm -r %s", volume.hostDir) - err := framework.IssueSSHCommand(removeCmd, framework.TestContext.Provider, config.node0) + err := framework.IssueSSHCommand(removeCmd, framework.TestContext.Provider, volume.node) Expect(err).NotTo(HaveOccurred()) } @@ -618,13 +698,33 @@ func makeLocalPVConfig(config *localTestConfig, volume *localTestVolume) framewo } // Creates a PVC and PV with prebinding -func createLocalPVCPV(config *localTestConfig, volume *localTestVolume) { - pvcConfig := makeLocalPVCConfig(config) - pvConfig := makeLocalPVConfig(config, volume) +func createLocalPVCsPVs(config *localTestConfig, volumes []*localTestVolume, mode storagev1.VolumeBindingMode) { var err error - volume.pv, volume.pvc, err = framework.CreatePVPVC(config.client, pvConfig, pvcConfig, config.ns, true) - framework.ExpectNoError(err) - framework.ExpectNoError(framework.WaitOnPVandPVC(config.client, config.ns, volume.pv, volume.pvc)) + + for _, volume := range volumes { + pvcConfig := makeLocalPVCConfig(config) + pvConfig := makeLocalPVConfig(config, volume) + volume.pv, volume.pvc, err = framework.CreatePVPVC(config.client, pvConfig, pvcConfig, config.ns, false) + framework.ExpectNoError(err) + } + + if mode == storagev1.VolumeBindingImmediate { + for _, volume := range volumes { + verifyLocalVolume(config, volume) + } + } else { + // Verify PVCs are not bound + // There isn't really a great way to verify this without making the test be slow... 
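+ // The poll callback below always returns false, so PollImmediate is expected to hit its 10s timeout; each iteration asserts the PVCs are still Pending, and the resulting timeout error is what the Expect(err).To(HaveOccurred()) afterwards treats as success.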
+ err = wait.PollImmediate(time.Second, 10*time.Second, func() (done bool, err error) { + for _, volume := range volumes { + pvc, err := config.client.CoreV1().PersistentVolumeClaims(volume.pvc.Namespace).Get(volume.pvc.Name, metav1.GetOptions{}) + framework.ExpectNoError(err) + Expect(pvc.Status.Phase).To(Equal(v1.ClaimPending)) + } + return false, nil + }) + Expect(err).To(HaveOccurred()) + } } func makeLocalPod(config *localTestConfig, volume *localTestVolume, cmd string) *v1.Pod { @@ -692,15 +792,15 @@ func createLocalPod(config *localTestConfig, volume *localTestVolume) (*v1.Pod, return framework.CreateSecPod(config.client, config.ns, []*v1.PersistentVolumeClaim{volume.pvc}, false, "", false, false, selinuxLabel) } -func createAndMountTmpfsLocalVolume(config *localTestConfig, dir string) { - By(fmt.Sprintf("Creating tmpfs mount point on node %q at path %q", config.node0.Name, dir)) - err := framework.IssueSSHCommand(fmt.Sprintf("mkdir -p %q && sudo mount -t tmpfs -o size=1m tmpfs-%q %q", dir, dir, dir), framework.TestContext.Provider, config.node0) +func createAndMountTmpfsLocalVolume(config *localTestConfig, dir string, node *v1.Node) { + By(fmt.Sprintf("Creating tmpfs mount point on node %q at path %q", node.Name, dir)) + err := framework.IssueSSHCommand(fmt.Sprintf("mkdir -p %q && sudo mount -t tmpfs -o size=1m tmpfs-%q %q", dir, dir, dir), framework.TestContext.Provider, node) Expect(err).NotTo(HaveOccurred()) } -func unmountTmpfsLocalVolume(config *localTestConfig, dir string) { - By(fmt.Sprintf("Unmount tmpfs mount point on node %q at path %q", config.node0.Name, dir)) - err := framework.IssueSSHCommand(fmt.Sprintf("sudo umount %q", dir), framework.TestContext.Provider, config.node0) +func unmountTmpfsLocalVolume(config *localTestConfig, dir string, node *v1.Node) { + By(fmt.Sprintf("Unmount tmpfs mount point on node %q at path %q", node.Name, dir)) + err := framework.IssueSSHCommand(fmt.Sprintf("sudo umount %q", dir), framework.TestContext.Provider, node) Expect(err).NotTo(HaveOccurred()) } @@ -745,16 +845,20 @@ func podRWCmdExec(pod *v1.Pod, cmd string) string { // Initialize test volume on node // and create local PVC and PV -func setupLocalVolumePVCPV(config *localTestConfig, localVolumeType LocalVolumeType) *localTestVolume { - By("Initializing test volume") - setupLocalVolume, ok := setupLocalVolumeMap[localVolumeType] - Expect(ok).To(BeTrue()) - testVol := setupLocalVolume(config) +func setupLocalVolumesPVCsPVs( + config *localTestConfig, + localVolumeType localVolumeType, + node *v1.Node, + count int, + mode storagev1.VolumeBindingMode) []*localTestVolume { - By("Creating local PVC and PV") - createLocalPVCPV(config, testVol) + By("Initializing test volumes") + testVols := setupLocalVolumes(config, localVolumeType, node, count) - return testVol + By("Creating local PVCs and PVs") + createLocalPVCsPVs(config, testVols, mode) + + return testVols } func setupLocalVolumeProvisioner(config *localTestConfig) { @@ -925,6 +1029,29 @@ func newLocalClaim(config *localTestConfig) *v1.PersistentVolumeClaim { return &claim } +// newLocalClaim creates a new persistent volume claim. 
+func newLocalClaimWithName(config *localTestConfig, name string) *v1.PersistentVolumeClaim { + claim := v1.PersistentVolumeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: config.ns, + }, + Spec: v1.PersistentVolumeClaimSpec{ + StorageClassName: &config.scName, + AccessModes: []v1.PersistentVolumeAccessMode{ + v1.ReadWriteOnce, + }, + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceName(v1.ResourceStorage): resource.MustParse(testRequestSize), + }, + }, + }, + } + + return &claim +} + // waitForLocalPersistentVolume waits a local persistent volume with 'volumePath' to be available. func waitForLocalPersistentVolume(c clientset.Interface, volumePath string) (*v1.PersistentVolume, error) { var pv *v1.PersistentVolume @@ -968,6 +1095,87 @@ func findLocalPersistentVolume(c clientset.Interface, volumePath string) (*v1.Pe return nil, fmt.Errorf("Unable to find local persistent volume with path %v", volumePath) } +func createStatefulSet(config *localTestConfig, ssReplicas int32, volumeCount int) *appsv1beta1.StatefulSet { + mounts := []v1.VolumeMount{} + claims := []v1.PersistentVolumeClaim{} + for i := 0; i < volumeCount; i++ { + name := fmt.Sprintf("vol%v", i+1) + pvc := newLocalClaimWithName(config, name) + mounts = append(mounts, v1.VolumeMount{Name: name, MountPath: "/" + name}) + claims = append(claims, *pvc) + } + + affinity := v1.Affinity{ + PodAntiAffinity: &v1.PodAntiAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{ + { + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "app", + Operator: metav1.LabelSelectorOpIn, + Values: []string{"local-volume-test"}, + }, + }, + }, + TopologyKey: "kubernetes.io/hostname", + }, + }, + }, + } + + labels := map[string]string{"app": "local-volume-test"} + spec := &appsv1beta1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "local-volume-statefulset", + Namespace: config.ns, + }, + Spec: appsv1beta1.StatefulSetSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "local-volume-test"}, + }, + Replicas: &ssReplicas, + Template: v1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: labels, + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "nginx", + Image: imageutils.GetE2EImage(imageutils.NginxSlim), + VolumeMounts: mounts, + }, + }, + Affinity: &affinity, + }, + }, + VolumeClaimTemplates: claims, + ServiceName: "test-service", + }, + } + + ss, err := config.client.AppsV1beta1().StatefulSets(config.ns).Create(spec) + Expect(err).NotTo(HaveOccurred()) + + config.ssTester.WaitForRunningAndReady(ssReplicas, ss) + return ss +} + +func validateStatefulSet(config *localTestConfig, ss *appsv1beta1.StatefulSet) { + pods := config.ssTester.GetPodList(ss) + + // Verify that each pod is on a different node + nodes := sets.NewString() + for _, pod := range pods.Items { + nodes.Insert(pod.Spec.NodeName) + } + + Expect(nodes.Len()).To(Equal(len(pods.Items))) + + // TODO: validate all PVCs are bound +} + // SkipUnlessLocalSSDExists takes in an ssdInterface (scsi/nvme) and a filesystemType (fs/block) // and skips if a disk of that type does not exist on the node func SkipUnlessLocalSSDExists(ssdInterface, filesystemType string, node *v1.Node) { diff --git a/test/integration/scheduler/BUILD b/test/integration/scheduler/BUILD index ce0f9874c47..2e01d918b6f 100644 --- a/test/integration/scheduler/BUILD +++ b/test/integration/scheduler/BUILD @@ -17,6 +17,7 @@ go_test( 
"priorities_test.go", "scheduler_test.go", "taint_test.go", + "volume_binding_test.go", ], importpath = "k8s.io/kubernetes/test/integration/scheduler", library = ":go_default_library", @@ -25,10 +26,12 @@ go_test( "//pkg/api/legacyscheme:go_default_library", "//pkg/api/testapi:go_default_library", "//pkg/apis/componentconfig:go_default_library", + "//pkg/apis/core/v1/helper:go_default_library", "//pkg/client/clientset_generated/internalclientset:go_default_library", "//pkg/client/informers/informers_generated/internalversion:go_default_library", "//pkg/controller/node:go_default_library", "//pkg/controller/node/ipam:go_default_library", + "//pkg/controller/volume/persistentvolume:go_default_library", "//pkg/features:go_default_library", "//pkg/kubeapiserver/admission:go_default_library", "//plugin/cmd/kube-scheduler/app:go_default_library", @@ -47,6 +50,7 @@ go_test( "//vendor/github.com/golang/glog:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library", "//vendor/k8s.io/api/policy/v1beta1:go_default_library", + "//vendor/k8s.io/api/storage/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library", "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library", "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", diff --git a/test/integration/scheduler/extender_test.go b/test/integration/scheduler/extender_test.go index 0cd189ae45d..780c3a0376c 100644 --- a/test/integration/scheduler/extender_test.go +++ b/test/integration/scheduler/extender_test.go @@ -369,6 +369,7 @@ func TestSchedulerExtender(t *testing.T) { informerFactory.Apps().V1beta1().StatefulSets(), informerFactory.Core().V1().Services(), informerFactory.Policy().V1beta1().PodDisruptionBudgets(), + informerFactory.Storage().V1().StorageClasses(), v1.DefaultHardPodAffinitySymmetricWeight, enableEquivalenceCache, ) diff --git a/test/integration/scheduler/scheduler_test.go b/test/integration/scheduler/scheduler_test.go index 5d71637aa67..a6e851e5499 100644 --- a/test/integration/scheduler/scheduler_test.go +++ b/test/integration/scheduler/scheduler_test.go @@ -469,6 +469,7 @@ func TestMultiScheduler(t *testing.T) { informerFactory2.Apps().V1beta1().StatefulSets(), informerFactory2.Core().V1().Services(), informerFactory2.Policy().V1beta1().PodDisruptionBudgets(), + informerFactory2.Storage().V1().StorageClasses(), v1.DefaultHardPodAffinitySymmetricWeight, enableEquivalenceCache, ) diff --git a/test/integration/scheduler/taint_test.go b/test/integration/scheduler/taint_test.go index b0ad30457eb..da8fc51aa02 100644 --- a/test/integration/scheduler/taint_test.go +++ b/test/integration/scheduler/taint_test.go @@ -131,6 +131,7 @@ func TestTaintNodeByCondition(t *testing.T) { informers.Apps().V1beta1().StatefulSets(), informers.Core().V1().Services(), informers.Policy().V1beta1().PodDisruptionBudgets(), + informers.Storage().V1().StorageClasses(), v1.DefaultHardPodAffinitySymmetricWeight, true, // Enable EqualCache by default. 
) diff --git a/test/integration/scheduler/util.go b/test/integration/scheduler/util.go index da50d8113aa..4e66e0855cf 100644 --- a/test/integration/scheduler/util.go +++ b/test/integration/scheduler/util.go @@ -78,6 +78,7 @@ func initTest(t *testing.T, nsPrefix string) *TestContext { context.informerFactory.Apps().V1beta1().StatefulSets(), context.informerFactory.Core().V1().Services(), context.informerFactory.Policy().V1beta1().PodDisruptionBudgets(), + context.informerFactory.Storage().V1().StorageClasses(), v1.DefaultHardPodAffinitySymmetricWeight, true, ) diff --git a/test/integration/scheduler/volume_binding_test.go b/test/integration/scheduler/volume_binding_test.go new file mode 100644 index 00000000000..e185ce72b77 --- /dev/null +++ b/test/integration/scheduler/volume_binding_test.go @@ -0,0 +1,494 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scheduler + +// This file tests the VolumeScheduling feature. + +import ( + "fmt" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/golang/glog" + + "k8s.io/api/core/v1" + storagev1 "k8s.io/api/storage/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilfeature "k8s.io/apiserver/pkg/util/feature" + "k8s.io/client-go/informers" + clientset "k8s.io/client-go/kubernetes" + restclient "k8s.io/client-go/rest" + "k8s.io/client-go/tools/record" + "k8s.io/kubernetes/pkg/api/testapi" + "k8s.io/kubernetes/pkg/apis/core/v1/helper" + "k8s.io/kubernetes/pkg/controller/volume/persistentvolume" + "k8s.io/kubernetes/plugin/pkg/scheduler" + "k8s.io/kubernetes/plugin/pkg/scheduler/factory" + "k8s.io/kubernetes/test/integration/framework" +) + +type testConfig struct { + client clientset.Interface + ns string + stop <-chan struct{} + teardown func() +} + +var ( + // Delete API objects immediately + deletePeriod = int64(0) + deleteOption = &metav1.DeleteOptions{GracePeriodSeconds: &deletePeriod} + + modeWait = storagev1.VolumeBindingWaitForFirstConsumer + modeImmediate = storagev1.VolumeBindingImmediate + + classWait = "wait" + classImmediate = "immediate" +) + +const ( + labelKey = "test-label" + labelValue = "test-value" + nodeName = "node1" + podLimit = 100 + volsPerPod = 5 +) + +func TestVolumeBinding(t *testing.T) { + config := setup(t, "volume-scheduling") + defer config.teardown() + + cases := map[string]struct { + pod *v1.Pod + pvs []*v1.PersistentVolume + pvcs []*v1.PersistentVolumeClaim + }{ + "immediate can bind": { + pod: makePod("pod-i-canbind", config.ns, []string{"pvc-i-canbind"}), + pvs: []*v1.PersistentVolume{makePV(t, "pv-i-canbind", classImmediate, "", "")}, + pvcs: []*v1.PersistentVolumeClaim{makePVC("pvc-i-canbind", config.ns, &classImmediate, "")}, + }, + "immediate pvc prebound": { + pod: makePod("pod-i-pvc-prebound", config.ns, []string{"pvc-i-prebound"}), + pvs: []*v1.PersistentVolume{makePV(t, "pv-i-pvc-prebound", classImmediate, "", "")}, + pvcs: []*v1.PersistentVolumeClaim{makePVC("pvc-i-prebound", config.ns, &classImmediate, 
"pv-i-pvc-prebound")}, + }, + "immediate pv prebound": { + pod: makePod("pod-i-pv-prebound", config.ns, []string{"pvc-i-pv-prebound"}), + pvs: []*v1.PersistentVolume{makePV(t, "pv-i-prebound", classImmediate, "pvc-i-pv-prebound", config.ns)}, + pvcs: []*v1.PersistentVolumeClaim{makePVC("pvc-i-pv-prebound", config.ns, &classImmediate, "")}, + }, + "wait can bind": { + pod: makePod("pod-w-canbind", config.ns, []string{"pvc-w-canbind"}), + pvs: []*v1.PersistentVolume{makePV(t, "pv-w-canbind", classWait, "", "")}, + pvcs: []*v1.PersistentVolumeClaim{makePVC("pvc-w-canbind", config.ns, &classWait, "")}, + }, + "wait pvc prebound": { + pod: makePod("pod-w-pvc-prebound", config.ns, []string{"pvc-w-prebound"}), + pvs: []*v1.PersistentVolume{makePV(t, "pv-w-pvc-prebound", classWait, "", "")}, + pvcs: []*v1.PersistentVolumeClaim{makePVC("pvc-w-prebound", config.ns, &classWait, "pv-w-pvc-prebound")}, + }, + "wait pv prebound": { + pod: makePod("pod-w-pv-prebound", config.ns, []string{"pvc-w-pv-prebound"}), + pvs: []*v1.PersistentVolume{makePV(t, "pv-w-prebound", classWait, "pvc-w-pv-prebound", config.ns)}, + pvcs: []*v1.PersistentVolumeClaim{makePVC("pvc-w-pv-prebound", config.ns, &classWait, "")}, + }, + "wait can bind two": { + pod: makePod("pod-w-canbind-2", config.ns, []string{"pvc-w-canbind-2", "pvc-w-canbind-3"}), + pvs: []*v1.PersistentVolume{ + makePV(t, "pv-w-canbind-2", classWait, "", ""), + makePV(t, "pv-w-canbind-3", classWait, "", ""), + }, + pvcs: []*v1.PersistentVolumeClaim{ + makePVC("pvc-w-canbind-2", config.ns, &classWait, ""), + makePVC("pvc-w-canbind-3", config.ns, &classWait, ""), + }, + }, + "mix immediate and wait": { + pod: makePod("pod-mix-bound", config.ns, []string{"pvc-w-canbind-4", "pvc-i-canbind-2"}), + pvs: []*v1.PersistentVolume{ + makePV(t, "pv-w-canbind-4", classWait, "", ""), + makePV(t, "pv-i-canbind-2", classImmediate, "", ""), + }, + pvcs: []*v1.PersistentVolumeClaim{ + makePVC("pvc-w-canbind-4", config.ns, &classWait, ""), + makePVC("pvc-i-canbind-2", config.ns, &classImmediate, ""), + }, + }, + // TODO: + // immediate mode - PVC cannot bound + // wait mode - PVC cannot bind + // wait mode - 2 PVCs, 1 cannot bind + } + + for name, test := range cases { + glog.Infof("Running test %v", name) + + // Create PVs + for _, pv := range test.pvs { + if _, err := config.client.CoreV1().PersistentVolumes().Create(pv); err != nil { + t.Fatalf("Failed to create PersistentVolume %q: %v", pv.Name, err) + } + } + + // Create PVCs + for _, pvc := range test.pvcs { + if _, err := config.client.CoreV1().PersistentVolumeClaims(config.ns).Create(pvc); err != nil { + t.Fatalf("Failed to create PersistentVolumeClaim %q: %v", pvc.Name, err) + } + } + + // Create Pod + if _, err := config.client.CoreV1().Pods(config.ns).Create(test.pod); err != nil { + t.Fatalf("Failed to create Pod %q: %v", test.pod.Name, err) + } + if err := waitForPodToSchedule(config.client, test.pod); err != nil { + t.Errorf("Failed to schedule Pod %q: %v", test.pod.Name, err) + } + + // Validate PVC/PV binding + for _, pvc := range test.pvcs { + validatePVCPhase(t, config.client, pvc, v1.ClaimBound) + } + for _, pv := range test.pvs { + validatePVPhase(t, config.client, pv, v1.VolumeBound) + } + + // TODO: validate events on Pods and PVCs + + config.client.CoreV1().Pods(config.ns).DeleteCollection(deleteOption, metav1.ListOptions{}) + config.client.CoreV1().PersistentVolumeClaims(config.ns).DeleteCollection(deleteOption, metav1.ListOptions{}) + 
config.client.CoreV1().PersistentVolumes().DeleteCollection(deleteOption, metav1.ListOptions{}) + } +} + +// TestVolumeBindingStress creates pods, each with unbound PVCs. +func TestVolumeBindingStress(t *testing.T) { + config := setup(t, "volume-binding-stress") + defer config.teardown() + + // Create enough PVs and PVCs for all the pods + pvs := []*v1.PersistentVolume{} + pvcs := []*v1.PersistentVolumeClaim{} + for i := 0; i < podLimit*volsPerPod; i++ { + pv := makePV(t, fmt.Sprintf("pv-stress-%v", i), classWait, "", "") + pvc := makePVC(fmt.Sprintf("pvc-stress-%v", i), config.ns, &classWait, "") + + if pv, err := config.client.CoreV1().PersistentVolumes().Create(pv); err != nil { + t.Fatalf("Failed to create PersistentVolume %q: %v", pv.Name, err) + } + if pvc, err := config.client.CoreV1().PersistentVolumeClaims(config.ns).Create(pvc); err != nil { + t.Fatalf("Failed to create PersistentVolumeClaim %q: %v", pvc.Name, err) + } + + pvs = append(pvs, pv) + pvcs = append(pvcs, pvc) + } + + pods := []*v1.Pod{} + for i := 0; i < podLimit; i++ { + // Generate string of all the PVCs for the pod + podPvcs := []string{} + for j := i * volsPerPod; j < (i+1)*volsPerPod; j++ { + podPvcs = append(podPvcs, pvcs[j].Name) + } + + pod := makePod(fmt.Sprintf("pod%v", i), config.ns, podPvcs) + if pod, err := config.client.CoreV1().Pods(config.ns).Create(pod); err != nil { + t.Fatalf("Failed to create Pod %q: %v", pod.Name, err) + } + pods = append(pods, pod) + } + + // Validate Pods scheduled + for _, pod := range pods { + if err := waitForPodToSchedule(config.client, pod); err != nil { + t.Errorf("Failed to schedule Pod %q: %v", pod.Name, err) + } + } + + // Validate PVC/PV binding + for _, pvc := range pvcs { + validatePVCPhase(t, config.client, pvc, v1.ClaimBound) + } + for _, pv := range pvs { + validatePVPhase(t, config.client, pv, v1.VolumeBound) + } + + // TODO: validate events on Pods and PVCs +} + +func setup(t *testing.T, nsName string) *testConfig { + h := &framework.MasterHolder{Initialized: make(chan struct{})} + s := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { + <-h.Initialized + h.M.GenericAPIServer.Handler.ServeHTTP(w, req) + })) + + // Enable feature gates + utilfeature.DefaultFeatureGate.Set("VolumeScheduling=true,PersistentLocalVolumes=true") + + // Build clientset and informers for controllers. + clientset := clientset.NewForConfigOrDie(&restclient.Config{QPS: -1, Host: s.URL, ContentConfig: restclient.ContentConfig{GroupVersion: testapi.Groups[v1.GroupName].GroupVersion()}}) + informers := informers.NewSharedInformerFactory(clientset, time.Second) + + // Start master + masterConfig := framework.NewIntegrationTestMasterConfig() + _, _, closeFn := framework.RunAMasterUsingServer(masterConfig, s, h) + ns := framework.CreateTestingNamespace(nsName, s, t).Name + + controllerCh := make(chan struct{}) + + // Start PV controller for volume binding. 
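+ // With VolumeScheduling enabled, claims whose StorageClass uses WaitForFirstConsumer are left unbound by this controller until the scheduler selects a node for the pod; the wait-mode cases above rely on that behavior.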
+ params := persistentvolume.ControllerParameters{ + KubeClient: clientset, + SyncPeriod: time.Hour, // test shouldn't need to resync + VolumePlugins: nil, // TODO: need later for dynamic provisioning + Cloud: nil, + ClusterName: "volume-test-cluster", + VolumeInformer: informers.Core().V1().PersistentVolumes(), + ClaimInformer: informers.Core().V1().PersistentVolumeClaims(), + ClassInformer: informers.Storage().V1().StorageClasses(), + EventRecorder: nil, // TODO: add one so we can test PV events + EnableDynamicProvisioning: true, + } + ctrl, err := persistentvolume.NewController(params) + if err != nil { + t.Fatalf("Failed to create PV controller: %v", err) + } + go ctrl.Run(controllerCh) + + // Start scheduler + configurator := factory.NewConfigFactory( + v1.DefaultSchedulerName, + clientset, + informers.Core().V1().Nodes(), + informers.Core().V1().Pods(), + informers.Core().V1().PersistentVolumes(), + informers.Core().V1().PersistentVolumeClaims(), + informers.Core().V1().ReplicationControllers(), + informers.Extensions().V1beta1().ReplicaSets(), + informers.Apps().V1beta1().StatefulSets(), + informers.Core().V1().Services(), + informers.Policy().V1beta1().PodDisruptionBudgets(), + informers.Storage().V1().StorageClasses(), + v1.DefaultHardPodAffinitySymmetricWeight, + true, // Enable EqualCache by default. + ) + + sched, err := scheduler.NewFromConfigurator(configurator, func(cfg *scheduler.Config) { + cfg.StopEverything = controllerCh + cfg.Recorder = &record.FakeRecorder{} + }) + if err != nil { + t.Fatalf("Failed to create scheduler: %v.", err) + } + go sched.Run() + + // Waiting for all controller sync. + informers.Start(controllerCh) + informers.WaitForCacheSync(controllerCh) + + // Create shared objects + // Create node + testNode := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{labelKey: labelValue}, + }, + Spec: v1.NodeSpec{Unschedulable: false}, + Status: v1.NodeStatus{ + Capacity: v1.ResourceList{ + v1.ResourcePods: *resource.NewQuantity(podLimit, resource.DecimalSI), + }, + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + Reason: fmt.Sprintf("schedulable condition"), + LastHeartbeatTime: metav1.Time{Time: time.Now()}, + }, + }, + }, + } + if _, err := clientset.CoreV1().Nodes().Create(testNode); err != nil { + t.Fatalf("Failed to create Node %q: %v", testNode.Name, err) + } + + // Create SCs + scs := []*storagev1.StorageClass{ + makeStorageClass(classWait, &modeWait), + makeStorageClass(classImmediate, &modeImmediate), + } + for _, sc := range scs { + if _, err := clientset.StorageV1().StorageClasses().Create(sc); err != nil { + t.Fatalf("Failed to create StorageClass %q: %v", sc.Name, err) + } + } + + return &testConfig{ + client: clientset, + ns: ns, + stop: controllerCh, + teardown: func() { + clientset.CoreV1().Pods(ns).DeleteCollection(nil, metav1.ListOptions{}) + clientset.CoreV1().PersistentVolumeClaims(ns).DeleteCollection(nil, metav1.ListOptions{}) + clientset.CoreV1().PersistentVolumes().DeleteCollection(nil, metav1.ListOptions{}) + clientset.StorageV1().StorageClasses().DeleteCollection(nil, metav1.ListOptions{}) + clientset.CoreV1().Nodes().DeleteCollection(nil, metav1.ListOptions{}) + close(controllerCh) + closeFn() + utilfeature.DefaultFeatureGate.Set("VolumeScheduling=false,PersistentLocalVolumes=false") + }, + } +} + +func makeStorageClass(name string, mode *storagev1.VolumeBindingMode) *storagev1.StorageClass { + return &storagev1.StorageClass{ + ObjectMeta: metav1.ObjectMeta{
+ Name: name, + }, + Provisioner: "kubernetes.io/no-provisioner", + VolumeBindingMode: mode, + } +} + +func makePV(t *testing.T, name, scName, pvcName, ns string) *v1.PersistentVolume { + pv := &v1.PersistentVolume{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Annotations: map[string]string{}, + }, + Spec: v1.PersistentVolumeSpec{ + Capacity: v1.ResourceList{ + v1.ResourceName(v1.ResourceStorage): resource.MustParse("5Gi"), + }, + AccessModes: []v1.PersistentVolumeAccessMode{ + v1.ReadWriteOnce, + }, + StorageClassName: scName, + PersistentVolumeSource: v1.PersistentVolumeSource{ + Local: &v1.LocalVolumeSource{ + Path: "/test-path", + }, + }, + }, + } + + if pvcName != "" { + pv.Spec.ClaimRef = &v1.ObjectReference{Name: pvcName, Namespace: ns} + } + + testNodeAffinity := &v1.NodeAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{ + NodeSelectorTerms: []v1.NodeSelectorTerm{ + { + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: labelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{labelValue}, + }, + }, + }, + }, + }, + } + err := helper.StorageNodeAffinityToAlphaAnnotation(pv.Annotations, testNodeAffinity) + if err != nil { + t.Fatalf("Setting storage node affinity failed: %v", err) + } + return pv +} + +func makePVC(name, ns string, scName *string, volumeName string) *v1.PersistentVolumeClaim { + return &v1.PersistentVolumeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: ns, + }, + Spec: v1.PersistentVolumeClaimSpec{ + AccessModes: []v1.PersistentVolumeAccessMode{ + v1.ReadWriteOnce, + }, + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceName(v1.ResourceStorage): resource.MustParse("5Gi"), + }, + }, + StorageClassName: scName, + VolumeName: volumeName, + }, + } +} + +func makePod(name, ns string, pvcs []string) *v1.Pod { + volumes := []v1.Volume{} + for i, pvc := range pvcs { + volumes = append(volumes, v1.Volume{ + Name: fmt.Sprintf("vol%v", i), + VolumeSource: v1.VolumeSource{ + PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ + ClaimName: pvc, + }, + }, + }) + } + + return &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: ns, + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "write-pod", + Image: "gcr.io/google_containers/busybox:1.24", + Command: []string{"/bin/sh"}, + Args: []string{"-c", "while true; do sleep 1; done"}, + }, + }, + Volumes: volumes, + }, + } +} + +func validatePVCPhase(t *testing.T, client clientset.Interface, pvc *v1.PersistentVolumeClaim, phase v1.PersistentVolumeClaimPhase) { + claim, err := client.CoreV1().PersistentVolumeClaims(pvc.Namespace).Get(pvc.Name, metav1.GetOptions{}) + if err != nil { + t.Errorf("Failed to get PVC %v/%v: %v", pvc.Namespace, pvc.Name, err) + } + + if claim.Status.Phase != phase { + t.Errorf("PVC %v/%v phase not %v, got %v", pvc.Namespace, pvc.Name, phase, claim.Status.Phase) + } +} + +func validatePVPhase(t *testing.T, client clientset.Interface, pv *v1.PersistentVolume, phase v1.PersistentVolumePhase) { + pv, err := client.CoreV1().PersistentVolumes().Get(pv.Name, metav1.GetOptions{}) + if err != nil { + t.Errorf("Failed to get PV %v: %v", pv.Name, err) + } + + if pv.Status.Phase != phase { + t.Errorf("PV %v phase not %v, got %v", pv.Name, phase, pv.Status.Phase) + } +} diff --git a/test/integration/scheduler_perf/util.go b/test/integration/scheduler_perf/util.go index 3fc29da0ea6..4e6a9025c9f 100644 --- a/test/integration/scheduler_perf/util.go +++ b/test/integration/scheduler_perf/util.go 
@@ -75,6 +75,7 @@ func mustSetupScheduler() (schedulerConfigurator scheduler.Configurator, destroy informerFactory.Apps().V1beta1().StatefulSets(), informerFactory.Core().V1().Services(), informerFactory.Policy().V1beta1().PodDisruptionBudgets(), + informerFactory.Storage().V1().StorageClasses(), v1.DefaultHardPodAffinitySymmetricWeight, enableEquivalenceCache, )
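For reference, a minimal sketch (not part of the patch) of how the new plugin/pkg/scheduler/volumebinder package is meant to be consumed: build the VolumeBinder from shared informers, run its bind worker until shutdown, and drop cached bindings when a pod goes away. The helpers wireVolumeBinder, bindVolumesWorker, and onPodDelete are placeholder names for the scheduler's own wiring, which is not shown in this diff; only the volumebinder calls themselves come from the code above.

package example

import (
	"k8s.io/api/core/v1"
	"k8s.io/client-go/informers"
	clientset "k8s.io/client-go/kubernetes"

	"k8s.io/kubernetes/plugin/pkg/scheduler/volumebinder"
)

// wireVolumeBinder is a hypothetical caller; the real wiring lives in the
// scheduler's configurator and is not part of this hunk.
func wireVolumeBinder(client clientset.Interface, f informers.SharedInformerFactory, stopCh <-chan struct{}) *volumebinder.VolumeBinder {
	vb := volumebinder.NewVolumeBinder(
		client,
		f.Core().V1().PersistentVolumeClaims(),
		f.Core().V1().PersistentVolumes(),
		f.Core().V1().Nodes(),
		f.Storage().V1().StorageClasses(),
	)

	// The scheduler supplies the worker that drains vb.BindQueue and binds the
	// assumed volumes for each queued pod (bindVolumesWorker in the test above).
	bindVolumesWorker := func() {
		// placeholder: pop a pod from vb.BindQueue and trigger its volume binding
	}

	// Run blocks until stopCh is closed and then shuts the queue down, so it is
	// started on its own goroutine, mirroring the scheduler unit test.
	go vb.Run(bindVolumesWorker, stopCh)
	return vb
}

// onPodDelete shows where DeletePodBindings fits: cached PV/PVC bindings for a
// deleted pod must be released so other pods can be offered those volumes.
func onPodDelete(vb *volumebinder.VolumeBinder, pod *v1.Pod) {
	vb.DeletePodBindings(pod)
}

The fake binder returned by volumebinder.NewFakeVolumeBinder exposes the same Run and DeletePodBindings surface, which is how the scheduler unit test at the top of this section drives it without a real API server.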