From f5ab6d5ad4ad0827870b685347df953e80c02d0d Mon Sep 17 00:00:00 2001 From: Jonathan Basseri Date: Wed, 24 Jan 2018 17:02:12 -0800 Subject: [PATCH 1/5] [PATCH] Fix equiv. cache invalidation of Node condition. Equivalence cache for CheckNodeConditionPred becomes invalid when Node.Spec.Unschedulable changes. This can happen even if Node.Status.Conditions does not change, so move the logic around. This logic is covered by integration test "test/integration/scheduler".TestUnschedulableNodes but equivalence cache is currently skipped when test pods have no OwnerReference. Add benchmark for equivalence hashing. Change equivalence hash function. This changes the equivalence class hashing function to use as inputs all the Pod fields which are read by FitPredicates. Before we used a combination of OwnerReference and PersistentVolumeClaim info, which was a close approximation. The new method ensures that hashing remains correct regardless of controller behavior. The PVCSet field can be removed from equivalencePod because it is implicitly included in the Volume list. Tests are now broken. Move equivalence class hash code. This moves the equivalence hashing code from algorithm/predicates/utils.go to core/equivalence_cache.go. In the process, making the hashing function and hashing function factory both injectable dependencies is removed. Fix equivalence cache hash tests. Co-authored-by: Jonathan Basseri Co-authored-by: Harry Zhang --- pkg/scheduler/algorithm/predicates/BUILD | 1 - pkg/scheduler/algorithm/predicates/utils.go | 67 --- pkg/scheduler/algorithm/types.go | 3 - .../algorithmprovider/defaults/defaults.go | 7 - pkg/scheduler/core/equivalence_cache.go | 72 ++- pkg/scheduler/core/equivalence_cache_test.go | 410 ++++++++---------- pkg/scheduler/factory/factory.go | 12 +- pkg/scheduler/factory/plugins.go | 11 - 8 files changed, 255 insertions(+), 328 deletions(-) diff --git a/pkg/scheduler/algorithm/predicates/BUILD b/pkg/scheduler/algorithm/predicates/BUILD index d85c822872c..09b94dbed0b 100644 --- a/pkg/scheduler/algorithm/predicates/BUILD +++ b/pkg/scheduler/algorithm/predicates/BUILD @@ -34,7 +34,6 @@ go_library( "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/labels:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/rand:go_default_library", - "//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library", "//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library", "//vendor/k8s.io/client-go/listers/core/v1:go_default_library", "//vendor/k8s.io/client-go/listers/storage/v1:go_default_library", diff --git a/pkg/scheduler/algorithm/predicates/utils.go b/pkg/scheduler/algorithm/predicates/utils.go index 2e8761279ca..2b9c94f9dc8 100644 --- a/pkg/scheduler/algorithm/predicates/utils.go +++ b/pkg/scheduler/algorithm/predicates/utils.go @@ -17,12 +17,8 @@ limitations under the License. package predicates import ( - "github.com/golang/glog" "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" - "k8s.io/apimachinery/pkg/util/sets" - "k8s.io/kubernetes/pkg/scheduler/algorithm" schedutil "k8s.io/kubernetes/pkg/scheduler/util" ) @@ -70,69 +66,6 @@ func CreateSelectorFromLabels(aL map[string]string) labels.Selector { return labels.Set(aL).AsSelector() } -// EquivalencePodGenerator is a generator of equivalence class for pod with consideration of PVC info. -type EquivalencePodGenerator struct { - pvcInfo PersistentVolumeClaimInfo -} - -// NewEquivalencePodGenerator returns a getEquivalencePod method with consideration of PVC info. -func NewEquivalencePodGenerator(pvcInfo PersistentVolumeClaimInfo) algorithm.GetEquivalencePodFunc { - g := &EquivalencePodGenerator{ - pvcInfo: pvcInfo, - } - return g.getEquivalencePod -} - -// GetEquivalencePod returns a EquivalencePod which contains a group of pod attributes which can be reused. -func (e *EquivalencePodGenerator) getEquivalencePod(pod *v1.Pod) interface{} { - // For now we only consider pods: - // 1. OwnerReferences is Controller - // 2. with same OwnerReferences - // 3. with same PVC claim - // to be equivalent - for _, ref := range pod.OwnerReferences { - if ref.Controller != nil && *ref.Controller { - pvcSet, err := e.getPVCSet(pod) - if err == nil { - // A pod can only belongs to one controller, so let's return. - return &EquivalencePod{ - ControllerRef: ref, - PVCSet: pvcSet, - } - } - - // If error encountered, log warning and return nil (i.e. no equivalent pod found) - glog.Warningf("[EquivalencePodGenerator] for pod: %v failed due to: %v", pod.GetName(), err) - return nil - } - } - return nil -} - -// getPVCSet returns a set of PVC UIDs of given pod. -func (e *EquivalencePodGenerator) getPVCSet(pod *v1.Pod) (sets.String, error) { - result := sets.NewString() - for _, volume := range pod.Spec.Volumes { - if volume.PersistentVolumeClaim == nil { - continue - } - pvcName := volume.PersistentVolumeClaim.ClaimName - pvc, err := e.pvcInfo.GetPersistentVolumeClaimInfo(pod.GetNamespace(), pvcName) - if err != nil { - return nil, err - } - result.Insert(string(pvc.UID)) - } - - return result, nil -} - -// EquivalencePod is a group of pod attributes which can be reused as equivalence to schedule other pods. -type EquivalencePod struct { - ControllerRef metav1.OwnerReference - PVCSet sets.String -} - // portsConflict check whether existingPorts and wantPorts conflict with each other // return true if we have a conflict func portsConflict(existingPorts schedutil.HostPortInfo, wantPorts []*v1.ContainerPort) bool { diff --git a/pkg/scheduler/algorithm/types.go b/pkg/scheduler/algorithm/types.go index 60ad348c9ab..1ebf50d5c9b 100644 --- a/pkg/scheduler/algorithm/types.go +++ b/pkg/scheduler/algorithm/types.go @@ -78,9 +78,6 @@ type PredicateFailureReason interface { GetReason() string } -// GetEquivalencePodFunc is a function that gets a EquivalencePod from a pod. -type GetEquivalencePodFunc func(pod *v1.Pod) interface{} - // NodeLister interface represents anything that can list nodes for a scheduler. type NodeLister interface { // We explicitly return []*v1.Node, instead of v1.NodeList, to avoid diff --git a/pkg/scheduler/algorithmprovider/defaults/defaults.go b/pkg/scheduler/algorithmprovider/defaults/defaults.go index 6ac2f1849ad..812a52efc65 100644 --- a/pkg/scheduler/algorithmprovider/defaults/defaults.go +++ b/pkg/scheduler/algorithmprovider/defaults/defaults.go @@ -77,13 +77,6 @@ func init() { // Fit is determined by node selector query. factory.RegisterFitPredicate(predicates.MatchNodeSelectorPred, predicates.PodMatchNodeSelector) - // Use equivalence class to speed up heavy predicates phase. - factory.RegisterGetEquivalencePodFunction( - func(args factory.PluginFactoryArgs) algorithm.GetEquivalencePodFunc { - return predicates.NewEquivalencePodGenerator(args.PVCInfo) - }, - ) - // ServiceSpreadingPriority is a priority config factory that spreads pods by minimizing // the number of pods (belonging to the same service) on the same node. // Register the factory so that it's available, but do not include it as part of the default priorities diff --git a/pkg/scheduler/core/equivalence_cache.go b/pkg/scheduler/core/equivalence_cache.go index 3eebe9dcb67..a8f67f5af90 100644 --- a/pkg/scheduler/core/equivalence_cache.go +++ b/pkg/scheduler/core/equivalence_cache.go @@ -37,8 +37,7 @@ const maxCacheEntries = 100 // 2. function to get equivalence pod type EquivalenceCache struct { sync.RWMutex - getEquivalencePod algorithm.GetEquivalencePodFunc - algorithmCache map[string]AlgorithmCache + algorithmCache map[string]AlgorithmCache } // The AlgorithmCache stores PredicateMap with predicate name as key @@ -62,11 +61,9 @@ func newAlgorithmCache() AlgorithmCache { } } -// NewEquivalenceCache creates a EquivalenceCache object. -func NewEquivalenceCache(getEquivalencePodFunc algorithm.GetEquivalencePodFunc) *EquivalenceCache { +func NewEquivalenceCache() *EquivalenceCache { return &EquivalenceCache{ - getEquivalencePod: getEquivalencePodFunc, - algorithmCache: make(map[string]AlgorithmCache), + algorithmCache: make(map[string]AlgorithmCache), } } @@ -219,9 +216,11 @@ type equivalenceClassInfo struct { hash uint64 } -// getEquivalenceClassInfo returns the equivalence class of given pod. +// getEquivalenceClassInfo returns a hash of the given pod. +// The hashing function returns the same value for any two pods that are +// equivalent from the perspective of scheduling. func (ec *EquivalenceCache) getEquivalenceClassInfo(pod *v1.Pod) *equivalenceClassInfo { - equivalencePod := ec.getEquivalencePod(pod) + equivalencePod := getEquivalenceHash(pod) if equivalencePod != nil { hash := fnv.New32a() hashutil.DeepHashObject(hash, equivalencePod) @@ -231,3 +230,60 @@ func (ec *EquivalenceCache) getEquivalenceClassInfo(pod *v1.Pod) *equivalenceCla } return nil } + +// equivalencePod is the set of pod attributes which must match for two pods to +// be considered equivalent for scheduling purposes. For correctness, this must +// include any Pod field which is used by a FitPredicate. +// +// NOTE: For equivalence hash to be formally correct, lists and maps in the +// equivalencePod should be normalized. (e.g. by sorting them) However, the +// vast majority of equivalent pod classes are expected to be created from a +// single pod template, so they will all have the same ordering. +type equivalencePod struct { + Namespace *string + Labels map[string]string + Affinity *v1.Affinity + Containers []v1.Container // See note about ordering + InitContainers []v1.Container // See note about ordering + NodeName *string + NodeSelector map[string]string + Tolerations []v1.Toleration + Volumes []v1.Volume // See note about ordering +} + +// getEquivalenceHash returns the equivalencePod for a Pod. +func getEquivalenceHash(pod *v1.Pod) *equivalencePod { + ep := &equivalencePod{ + Namespace: &pod.Namespace, + Labels: pod.Labels, + Affinity: pod.Spec.Affinity, + Containers: pod.Spec.Containers, + InitContainers: pod.Spec.InitContainers, + NodeName: &pod.Spec.NodeName, + NodeSelector: pod.Spec.NodeSelector, + Tolerations: pod.Spec.Tolerations, + Volumes: pod.Spec.Volumes, + } + // DeepHashObject considers nil and empty slices to be different. Normalize them. + if len(ep.Containers) == 0 { + ep.Containers = nil + } + if len(ep.InitContainers) == 0 { + ep.InitContainers = nil + } + if len(ep.Tolerations) == 0 { + ep.Tolerations = nil + } + if len(ep.Volumes) == 0 { + ep.Volumes = nil + } + // Normalize empty maps also. + if len(ep.Labels) == 0 { + ep.Labels = nil + } + if len(ep.NodeSelector) == 0 { + ep.NodeSelector = nil + } + // TODO(misterikkit): Also normalize nested maps and slices. + return ep +} diff --git a/pkg/scheduler/core/equivalence_cache_test.go b/pkg/scheduler/core/equivalence_cache_test.go index fcb2c9455a7..199fe7ace92 100644 --- a/pkg/scheduler/core/equivalence_cache_test.go +++ b/pkg/scheduler/core/equivalence_cache_test.go @@ -21,12 +21,132 @@ import ( "testing" "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/kubernetes/pkg/scheduler/algorithm" "k8s.io/kubernetes/pkg/scheduler/algorithm/predicates" ) +// makeBasicPod returns a Pod object with many of the fields populated. +func makeBasicPod(name string) *v1.Pod { + isController := true + return &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: "test-ns", + Labels: map[string]string{"app": "web", "env": "prod"}, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "v1", + Kind: "ReplicationController", + Name: "rc", + UID: "123", + Controller: &isController, + }, + }, + }, + Spec: v1.PodSpec{ + Affinity: &v1.Affinity{ + NodeAffinity: &v1.NodeAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{ + NodeSelectorTerms: []v1.NodeSelectorTerm{ + { + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: "failure-domain.beta.kubernetes.io/zone", + Operator: "Exists", + }, + }, + }, + }, + }, + }, + PodAffinity: &v1.PodAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{ + { + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "db"}}, + TopologyKey: "kubernetes.io/hostname", + }, + }, + }, + PodAntiAffinity: &v1.PodAntiAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{ + { + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "web"}}, + TopologyKey: "kubernetes.io/hostname", + }, + }, + }, + }, + InitContainers: []v1.Container{ + { + Name: "init-pause", + Image: "gcr.io/google_containers/pause", + Resources: v1.ResourceRequirements{ + Limits: v1.ResourceList{ + "cpu": resource.MustParse("1"), + "mem": resource.MustParse("100Mi"), + }, + }, + }, + }, + Containers: []v1.Container{ + { + Name: "pause", + Image: "gcr.io/google_containers/pause", + Resources: v1.ResourceRequirements{ + Limits: v1.ResourceList{ + "cpu": resource.MustParse("1"), + "mem": resource.MustParse("100Mi"), + }, + }, + VolumeMounts: []v1.VolumeMount{ + { + Name: "nfs", + MountPath: "/srv/data", + }, + }, + }, + }, + NodeSelector: map[string]string{"node-type": "awesome"}, + Tolerations: []v1.Toleration{ + { + Effect: "NoSchedule", + Key: "experimental", + Operator: "Exists", + }, + }, + Volumes: []v1.Volume{ + { + VolumeSource: v1.VolumeSource{ + PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ + ClaimName: "someEBSVol1", + }, + }, + }, + { + VolumeSource: v1.VolumeSource{ + PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ + ClaimName: "someEBSVol2", + }, + }, + }, + { + Name: "nfs", + VolumeSource: v1.VolumeSource{ + NFS: &v1.NFSVolumeSource{ + Server: "nfs.corp.example.com", + }, + }, + }, + }, + }, + } +} + type predicateItemType struct { fit bool reasons []algorithm.PredicateFailureReason @@ -70,9 +190,7 @@ func TestUpdateCachedPredicateItem(t *testing.T) { }, } for _, test := range tests { - // this case does not need to calculate equivalence hash, just pass an empty function - fakeGetEquivalencePodFunc := func(pod *v1.Pod) interface{} { return nil } - ecache := NewEquivalenceCache(fakeGetEquivalencePodFunc) + ecache := NewEquivalenceCache() if test.expectPredicateMap { ecache.algorithmCache[test.nodeName] = newAlgorithmCache() predicateItem := HostPredicate{ @@ -191,9 +309,7 @@ func TestPredicateWithECache(t *testing.T) { } for _, test := range tests { - // this case does not need to calculate equivalence hash, just pass an empty function - fakeGetEquivalencePodFunc := func(pod *v1.Pod) interface{} { return nil } - ecache := NewEquivalenceCache(fakeGetEquivalencePodFunc) + ecache := NewEquivalenceCache() // set cached item to equivalence cache ecache.UpdateCachedPredicateItem( test.podName, @@ -240,205 +356,46 @@ func TestPredicateWithECache(t *testing.T) { } } -func TestGetHashEquivalencePod(t *testing.T) { +func TestGetEquivalenceHash(t *testing.T) { - testNamespace := "test" + ecache := NewEquivalenceCache() - pvcInfo := predicates.FakePersistentVolumeClaimInfo{ + pod1 := makeBasicPod("pod1") + pod2 := makeBasicPod("pod2") + + pod3 := makeBasicPod("pod3") + pod3.Spec.Volumes = []v1.Volume{ { - ObjectMeta: metav1.ObjectMeta{UID: "someEBSVol1", Name: "someEBSVol1", Namespace: testNamespace}, - Spec: v1.PersistentVolumeClaimSpec{VolumeName: "someEBSVol1"}, + VolumeSource: v1.VolumeSource{ + PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ + ClaimName: "someEBSVol111", + }, + }, }, + } + + pod4 := makeBasicPod("pod4") + pod4.Spec.Volumes = []v1.Volume{ { - ObjectMeta: metav1.ObjectMeta{UID: "someEBSVol2", Name: "someEBSVol2", Namespace: testNamespace}, - Spec: v1.PersistentVolumeClaimSpec{VolumeName: "someNonEBSVol"}, - }, - { - ObjectMeta: metav1.ObjectMeta{UID: "someEBSVol3-0", Name: "someEBSVol3-0", Namespace: testNamespace}, - Spec: v1.PersistentVolumeClaimSpec{VolumeName: "pvcWithDeletedPV"}, - }, - { - ObjectMeta: metav1.ObjectMeta{UID: "someEBSVol3-1", Name: "someEBSVol3-1", Namespace: testNamespace}, - Spec: v1.PersistentVolumeClaimSpec{VolumeName: "anotherPVCWithDeletedPV"}, - }, - } - - // use default equivalence class generator - ecache := NewEquivalenceCache(predicates.NewEquivalencePodGenerator(pvcInfo)) - - isController := true - - pod1 := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod1", - Namespace: testNamespace, - OwnerReferences: []metav1.OwnerReference{ - { - APIVersion: "v1", - Kind: "ReplicationController", - Name: "rc", - UID: "123", - Controller: &isController, - }, - }, - }, - Spec: v1.PodSpec{ - Volumes: []v1.Volume{ - { - VolumeSource: v1.VolumeSource{ - PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ - ClaimName: "someEBSVol1", - }, - }, - }, - { - VolumeSource: v1.VolumeSource{ - PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ - ClaimName: "someEBSVol2", - }, - }, + VolumeSource: v1.VolumeSource{ + PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ + ClaimName: "someEBSVol222", }, }, }, } - pod2 := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod2", - Namespace: testNamespace, - OwnerReferences: []metav1.OwnerReference{ - { - APIVersion: "v1", - Kind: "ReplicationController", - Name: "rc", - UID: "123", - Controller: &isController, - }, - }, - }, - Spec: v1.PodSpec{ - Volumes: []v1.Volume{ - { - VolumeSource: v1.VolumeSource{ - PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ - ClaimName: "someEBSVol2", - }, - }, - }, - { - VolumeSource: v1.VolumeSource{ - PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ - ClaimName: "someEBSVol1", - }, - }, - }, - }, - }, - } + pod5 := makeBasicPod("pod5") + pod5.Spec.Volumes = []v1.Volume{} - pod3 := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod3", - Namespace: testNamespace, - OwnerReferences: []metav1.OwnerReference{ - { - APIVersion: "v1", - Kind: "ReplicationController", - Name: "rc", - UID: "567", - Controller: &isController, - }, - }, - }, - Spec: v1.PodSpec{ - Volumes: []v1.Volume{ - { - VolumeSource: v1.VolumeSource{ - PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ - ClaimName: "someEBSVol3-1", - }, - }, - }, - }, - }, - } + pod6 := makeBasicPod("pod6") + pod6.Spec.Volumes = nil - pod4 := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod4", - Namespace: testNamespace, - OwnerReferences: []metav1.OwnerReference{ - { - APIVersion: "v1", - Kind: "ReplicationController", - Name: "rc", - UID: "567", - Controller: &isController, - }, - }, - }, - Spec: v1.PodSpec{ - Volumes: []v1.Volume{ - { - VolumeSource: v1.VolumeSource{ - PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ - ClaimName: "someEBSVol3-0", - }, - }, - }, - }, - }, - } + pod7 := makeBasicPod("pod7") + pod7.Spec.NodeSelector = nil - pod5 := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod5", - Namespace: testNamespace, - }, - } - - pod6 := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod6", - Namespace: testNamespace, - OwnerReferences: []metav1.OwnerReference{ - { - APIVersion: "v1", - Kind: "ReplicationController", - Name: "rc", - UID: "567", - Controller: &isController, - }, - }, - }, - Spec: v1.PodSpec{ - Volumes: []v1.Volume{ - { - VolumeSource: v1.VolumeSource{ - PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ - ClaimName: "no-exists-pvc", - }, - }, - }, - }, - }, - } - - pod7 := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod7", - Namespace: testNamespace, - OwnerReferences: []metav1.OwnerReference{ - { - APIVersion: "v1", - Kind: "ReplicationController", - Name: "rc", - UID: "567", - Controller: &isController, - }, - }, - }, - } + pod8 := makeBasicPod("pod8") + pod8.Spec.NodeSelector = make(map[string]string) type podInfo struct { pod *v1.Pod @@ -446,39 +403,41 @@ func TestGetHashEquivalencePod(t *testing.T) { } tests := []struct { + name string podInfoList []podInfo isEquivalent bool }{ - // pods with same controllerRef and same pvc claim { + name: "pods with everything the same except name", podInfoList: []podInfo{ {pod: pod1, hashIsValid: true}, {pod: pod2, hashIsValid: true}, }, isEquivalent: true, }, - // pods with same controllerRef but different pvc claim { + name: "pods that only differ in their PVC volume sources", podInfoList: []podInfo{ {pod: pod3, hashIsValid: true}, {pod: pod4, hashIsValid: true}, }, isEquivalent: false, }, - // pod without controllerRef { + name: "pods that have no volumes, but one uses nil and one uses an empty slice", podInfoList: []podInfo{ - {pod: pod5, hashIsValid: false}, + {pod: pod5, hashIsValid: true}, + {pod: pod6, hashIsValid: true}, }, - isEquivalent: false, + isEquivalent: true, }, - // pods with same controllerRef but one has non-exists pvc claim { + name: "pods that have no NodeSelector, but one uses nil and one uses an empty map", podInfoList: []podInfo{ - {pod: pod6, hashIsValid: false}, {pod: pod7, hashIsValid: true}, + {pod: pod8, hashIsValid: true}, }, - isEquivalent: false, + isEquivalent: true, }, } @@ -488,28 +447,30 @@ func TestGetHashEquivalencePod(t *testing.T) { ) for _, test := range tests { - for i, podInfo := range test.podInfoList { - testPod := podInfo.pod - eclassInfo := ecache.getEquivalenceClassInfo(testPod) - if eclassInfo == nil && podInfo.hashIsValid { - t.Errorf("Failed: pod %v is expected to have valid hash", testPod) - } + t.Run(test.name, func(t *testing.T) { + for i, podInfo := range test.podInfoList { + testPod := podInfo.pod + eclassInfo := ecache.getEquivalenceClassInfo(testPod) + if eclassInfo == nil && podInfo.hashIsValid { + t.Errorf("Failed: pod %v is expected to have valid hash", testPod) + } - if eclassInfo != nil { - // NOTE(harry): the first element will be used as target so - // this logic can't verify more than two inequivalent pods - if i == 0 { - targetHash = eclassInfo.hash - targetPodInfo = podInfo - } else { - if targetHash != eclassInfo.hash { - if test.isEquivalent { - t.Errorf("Failed: pod: %v is expected to be equivalent to: %v", testPod, targetPodInfo.pod) + if eclassInfo != nil { + // NOTE(harry): the first element will be used as target so + // this logic can't verify more than two inequivalent pods + if i == 0 { + targetHash = eclassInfo.hash + targetPodInfo = podInfo + } else { + if targetHash != eclassInfo.hash { + if test.isEquivalent { + t.Errorf("Failed: pod: %v is expected to be equivalent to: %v", testPod, targetPodInfo.pod) + } } } } } - } + }) } } @@ -554,9 +515,7 @@ func TestInvalidateCachedPredicateItemOfAllNodes(t *testing.T) { }, }, } - // this case does not need to calculate equivalence hash, just pass an empty function - fakeGetEquivalencePodFunc := func(pod *v1.Pod) interface{} { return nil } - ecache := NewEquivalenceCache(fakeGetEquivalencePodFunc) + ecache := NewEquivalenceCache() for _, test := range tests { // set cached item to equivalence cache @@ -623,9 +582,7 @@ func TestInvalidateAllCachedPredicateItemOfNode(t *testing.T) { }, }, } - // this case does not need to calculate equivalence hash, just pass an empty function - fakeGetEquivalencePodFunc := func(pod *v1.Pod) interface{} { return nil } - ecache := NewEquivalenceCache(fakeGetEquivalencePodFunc) + ecache := NewEquivalenceCache() for _, test := range tests { // set cached item to equivalence cache @@ -649,3 +606,10 @@ func TestInvalidateAllCachedPredicateItemOfNode(t *testing.T) { } } } + +func BenchmarkEquivalenceHash(b *testing.B) { + pod := makeBasicPod("test") + for i := 0; i < b.N; i++ { + getEquivalenceHash(pod) + } +} diff --git a/pkg/scheduler/factory/factory.go b/pkg/scheduler/factory/factory.go index 407d5b85c12..8bfccb2f5dd 100644 --- a/pkg/scheduler/factory/factory.go +++ b/pkg/scheduler/factory/factory.go @@ -134,6 +134,8 @@ type configFactory struct { alwaysCheckAllPredicates bool } +var _ scheduler.Configurator = &configFactory{} + // NewConfigFactory initializes the default implementation of a Configurator To encourage eventual privatization of the struct type, we only // return the interface. func NewConfigFactory( @@ -1046,14 +1048,8 @@ func (c *configFactory) CreateFromKeys(predicateKeys, priorityKeys sets.String, } // Init equivalence class cache - if c.enableEquivalenceClassCache && getEquivalencePodFuncFactory != nil { - pluginArgs, err := c.getPluginArgs() - if err != nil { - return nil, err - } - c.equivalencePodCache = core.NewEquivalenceCache( - getEquivalencePodFuncFactory(*pluginArgs), - ) + if c.enableEquivalenceClassCache { + c.equivalencePodCache = core.NewEquivalenceCache() glog.Info("Created equivalence class cache") } diff --git a/pkg/scheduler/factory/plugins.go b/pkg/scheduler/factory/plugins.go index bd33dad00e9..ba481dde93c 100644 --- a/pkg/scheduler/factory/plugins.go +++ b/pkg/scheduler/factory/plugins.go @@ -75,9 +75,6 @@ type PriorityConfigFactory struct { Weight int } -// EquivalencePodFuncFactory produces a function to get equivalence class for given pod. -type EquivalencePodFuncFactory func(PluginFactoryArgs) algorithm.GetEquivalencePodFunc - var ( schedulerFactoryMutex sync.Mutex @@ -90,9 +87,6 @@ var ( // Registered metadata producers priorityMetadataProducer PriorityMetadataProducerFactory predicateMetadataProducer PredicateMetadataProducerFactory - - // get equivalence pod function - getEquivalencePodFuncFactory EquivalencePodFuncFactory ) const ( @@ -346,11 +340,6 @@ func RegisterCustomPriorityFunction(policy schedulerapi.PriorityPolicy) string { return RegisterPriorityConfigFactory(policy.Name, *pcf) } -// RegisterGetEquivalencePodFunction registers equivalenceFuncFactory to produce equivalence class for given pod. -func RegisterGetEquivalencePodFunction(equivalenceFuncFactory EquivalencePodFuncFactory) { - getEquivalencePodFuncFactory = equivalenceFuncFactory -} - // IsPriorityFunctionRegistered is useful for testing providers. func IsPriorityFunctionRegistered(name string) bool { schedulerFactoryMutex.Lock() From c292af8f7b414d9fc3f5adf6e26402ea5c185577 Mon Sep 17 00:00:00 2001 From: Harry Zhang Date: Sun, 4 Mar 2018 14:35:57 -0800 Subject: [PATCH 2/5] Use const in equiv class --- pkg/scheduler/core/equivalence_cache.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pkg/scheduler/core/equivalence_cache.go b/pkg/scheduler/core/equivalence_cache.go index a8f67f5af90..bfdd77b78f5 100644 --- a/pkg/scheduler/core/equivalence_cache.go +++ b/pkg/scheduler/core/equivalence_cache.go @@ -23,6 +23,7 @@ import ( "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/kubernetes/pkg/scheduler/algorithm" + "k8s.io/kubernetes/pkg/scheduler/algorithm/predicates" hashutil "k8s.io/kubernetes/pkg/util/hash" "github.com/golang/glog" @@ -188,21 +189,21 @@ func (ec *EquivalenceCache) InvalidateCachedPredicateItemForPodAdd(pod *v1.Pod, // it will also fits to equivalence class of existing pods // GeneralPredicates: will always be affected by adding a new pod - invalidPredicates := sets.NewString("GeneralPredicates") + invalidPredicates := sets.NewString(predicates.GeneralPred) // MaxPDVolumeCountPredicate: we check the volumes of pod to make decision. for _, vol := range pod.Spec.Volumes { if vol.PersistentVolumeClaim != nil { - invalidPredicates.Insert("MaxEBSVolumeCount", "MaxGCEPDVolumeCount", "MaxAzureDiskVolumeCount") + invalidPredicates.Insert(predicates.MaxEBSVolumeCountPred, predicates.MaxGCEPDVolumeCountPred, predicates.MaxAzureDiskVolumeCountPred) } else { if vol.AWSElasticBlockStore != nil { - invalidPredicates.Insert("MaxEBSVolumeCount") + invalidPredicates.Insert(predicates.MaxEBSVolumeCountPred) } if vol.GCEPersistentDisk != nil { - invalidPredicates.Insert("MaxGCEPDVolumeCount") + invalidPredicates.Insert(predicates.MaxGCEPDVolumeCountPred) } if vol.AzureDisk != nil { - invalidPredicates.Insert("MaxAzureDiskVolumeCount") + invalidPredicates.Insert(predicates.MaxAzureDiskVolumeCountPred) } } } From 6380a7548489686c2c5ec2680e7cb99549a7ddbf Mon Sep 17 00:00:00 2001 From: Harry Zhang Date: Sun, 4 Mar 2018 14:37:20 -0800 Subject: [PATCH 3/5] Update generated files --- pkg/scheduler/algorithm/predicates/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/scheduler/algorithm/predicates/BUILD b/pkg/scheduler/algorithm/predicates/BUILD index 09b94dbed0b..d85c822872c 100644 --- a/pkg/scheduler/algorithm/predicates/BUILD +++ b/pkg/scheduler/algorithm/predicates/BUILD @@ -34,6 +34,7 @@ go_library( "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/labels:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/rand:go_default_library", + "//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library", "//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library", "//vendor/k8s.io/client-go/listers/core/v1:go_default_library", "//vendor/k8s.io/client-go/listers/storage/v1:go_default_library", From 4e5901f9476e194336d2f9f368d2334615d3f8e8 Mon Sep 17 00:00:00 2001 From: Harry Zhang Date: Sun, 4 Mar 2018 17:12:09 -0800 Subject: [PATCH 4/5] Fixe golints of equiv class --- pkg/scheduler/core/equivalence_cache.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/scheduler/core/equivalence_cache.go b/pkg/scheduler/core/equivalence_cache.go index bfdd77b78f5..2976a3c654b 100644 --- a/pkg/scheduler/core/equivalence_cache.go +++ b/pkg/scheduler/core/equivalence_cache.go @@ -62,6 +62,8 @@ func newAlgorithmCache() AlgorithmCache { } } +// NewEquivalenceCache returns EquivalenceCache to speed up predicates by caching +// result from previous scheduling. func NewEquivalenceCache() *EquivalenceCache { return &EquivalenceCache{ algorithmCache: make(map[string]AlgorithmCache), From 5cc841a337fe2a8bcfe29450a6bc4650d9e348c2 Mon Sep 17 00:00:00 2001 From: Harry Zhang Date: Thu, 8 Mar 2018 16:01:54 -0800 Subject: [PATCH 5/5] Use inline func to fix deadlock --- pkg/scheduler/core/generic_scheduler.go | 78 +++++++++++++++---------- 1 file changed, 46 insertions(+), 32 deletions(-) diff --git a/pkg/scheduler/core/generic_scheduler.go b/pkg/scheduler/core/generic_scheduler.go index f7e9cddfd23..d93c59fb45a 100644 --- a/pkg/scheduler/core/generic_scheduler.go +++ b/pkg/scheduler/core/generic_scheduler.go @@ -430,10 +430,6 @@ func podFitsOnNode( var ( eCacheAvailable bool failedPredicates []algorithm.PredicateFailureReason - invalid bool - fit bool - reasons []algorithm.PredicateFailureReason - err error ) predicateResults := make(map[string]HostPredicate) @@ -469,38 +465,54 @@ func podFitsOnNode( // when pods are nominated or their nominations change. eCacheAvailable = equivCacheInfo != nil && !podsAdded for _, predicateKey := range predicates.Ordering() { + var ( + fit bool + reasons []algorithm.PredicateFailureReason + err error + ) //TODO (yastij) : compute average predicate restrictiveness to export it as Prometheus metric if predicate, exist := predicateFuncs[predicateKey]; exist { - if eCacheAvailable { - // Lock ecache here to avoid a race condition against cache invalidation invoked - // in event handlers. This race has existed despite locks in eCache implementation. - ecache.Lock() - // PredicateWithECache will return its cached predicate results. - fit, reasons, invalid = ecache.PredicateWithECache(pod.GetName(), info.Node().GetName(), predicateKey, equivCacheInfo.hash, false) - } - - if !eCacheAvailable || invalid { - // we need to execute predicate functions since equivalence cache does not work - fit, reasons, err = predicate(pod, metaToUse, nodeInfoToUse) - if err != nil { - return false, []algorithm.PredicateFailureReason{}, err - } + // Use an in-line function to guarantee invocation of ecache.Unlock() + // when the in-line function returns. + func() { + var invalid bool if eCacheAvailable { - // Store data to update eCache after this loop. - if res, exists := predicateResults[predicateKey]; exists { - res.Fit = res.Fit && fit - res.FailReasons = append(res.FailReasons, reasons...) - predicateResults[predicateKey] = res - } else { - predicateResults[predicateKey] = HostPredicate{Fit: fit, FailReasons: reasons} - } - result := predicateResults[predicateKey] - ecache.UpdateCachedPredicateItem(pod.GetName(), info.Node().GetName(), predicateKey, result.Fit, result.FailReasons, equivCacheInfo.hash, false) + // Lock ecache here to avoid a race condition against cache invalidation invoked + // in event handlers. This race has existed despite locks in equivClassCacheimplementation. + ecache.Lock() + defer ecache.Unlock() + // PredicateWithECache will return its cached predicate results. + fit, reasons, invalid = ecache.PredicateWithECache( + pod.GetName(), info.Node().GetName(), + predicateKey, equivCacheInfo.hash, false) } - } - if eCacheAvailable { - ecache.Unlock() + if !eCacheAvailable || invalid { + // we need to execute predicate functions since equivalence cache does not work + fit, reasons, err = predicate(pod, metaToUse, nodeInfoToUse) + if err != nil { + return + } + + if eCacheAvailable { + // Store data to update equivClassCacheafter this loop. + if res, exists := predicateResults[predicateKey]; exists { + res.Fit = res.Fit && fit + res.FailReasons = append(res.FailReasons, reasons...) + predicateResults[predicateKey] = res + } else { + predicateResults[predicateKey] = HostPredicate{Fit: fit, FailReasons: reasons} + } + result := predicateResults[predicateKey] + ecache.UpdateCachedPredicateItem( + pod.GetName(), info.Node().GetName(), + predicateKey, result.Fit, result.FailReasons, equivCacheInfo.hash, false) + } + } + }() + + if err != nil { + return false, []algorithm.PredicateFailureReason{}, err } if !fit { @@ -508,7 +520,9 @@ func podFitsOnNode( failedPredicates = append(failedPredicates, reasons...) // if alwaysCheckAllPredicates is false, short circuit all predicates when one predicate fails. if !alwaysCheckAllPredicates { - glog.V(5).Infoln("since alwaysCheckAllPredicates has not been set, the predicate evaluation is short circuited and there are chances of other predicates failing as well.") + glog.V(5).Infoln("since alwaysCheckAllPredicates has not been set, the predicate" + + "evaluation is short circuited and there are chances" + + "of other predicates failing as well.") break } }