diff --git a/pkg/scheduler/factory.go b/pkg/scheduler/factory.go deleted file mode 100644 index d8321009925..00000000000 --- a/pkg/scheduler/factory.go +++ /dev/null @@ -1,142 +0,0 @@ -/* -Copyright 2014 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package scheduler - -import ( - "context" - "fmt" - - v1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - clientset "k8s.io/client-go/kubernetes" - corelisters "k8s.io/client-go/listers/core/v1" - "k8s.io/klog/v2" - schedulerapi "k8s.io/kubernetes/pkg/scheduler/apis/config" - "k8s.io/kubernetes/pkg/scheduler/framework" - "k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources" - internalcache "k8s.io/kubernetes/pkg/scheduler/internal/cache" - internalqueue "k8s.io/kubernetes/pkg/scheduler/internal/queue" -) - -func buildExtenders(extenders []schedulerapi.Extender, profiles []schedulerapi.KubeSchedulerProfile) ([]framework.Extender, error) { - var fExtenders []framework.Extender - if len(extenders) == 0 { - return nil, nil - } - - var ignoredExtendedResources []string - var ignorableExtenders []framework.Extender - for i := range extenders { - klog.V(2).InfoS("Creating extender", "extender", extenders[i]) - extender, err := NewHTTPExtender(&extenders[i]) - if err != nil { - return nil, err - } - if !extender.IsIgnorable() { - fExtenders = append(fExtenders, extender) - } else { - ignorableExtenders = append(ignorableExtenders, extender) - } - for _, r := range extenders[i].ManagedResources { - if r.IgnoredByScheduler { - ignoredExtendedResources = append(ignoredExtendedResources, r.Name) - } - } - } - // place ignorable extenders to the tail of extenders - fExtenders = append(fExtenders, ignorableExtenders...) - - // If there are any extended resources found from the Extenders, append them to the pluginConfig for each profile. - // This should only have an effect on ComponentConfig, where it is possible to configure Extenders and - // plugin args (and in which case the extender ignored resources take precedence). 
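// A minimal standalone sketch of the ordering rule applied in the deleted buildExtenders
// above: non-ignorable extenders stay at the front and ignorable ones are appended at the
// tail, preserving relative order within each group. sketchExtender is a simplified
// stand-in, not the scheduler framework's Extender interface.
type sketchExtender struct {
	name      string
	ignorable bool
}

func orderExtenders(in []sketchExtender) []sketchExtender {
	var front, tail []sketchExtender
	for _, e := range in {
		if e.ignorable {
			tail = append(tail, e) // ignorable extenders are consulted last
		} else {
			front = append(front, e)
		}
	}
	return append(front, tail...)
}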
- if len(ignoredExtendedResources) == 0 { - return fExtenders, nil - } - - for i := range profiles { - prof := &profiles[i] - var found = false - for k := range prof.PluginConfig { - if prof.PluginConfig[k].Name == noderesources.Name { - // Update the existing args - pc := &prof.PluginConfig[k] - args, ok := pc.Args.(*schedulerapi.NodeResourcesFitArgs) - if !ok { - return nil, fmt.Errorf("want args to be of type NodeResourcesFitArgs, got %T", pc.Args) - } - args.IgnoredResources = ignoredExtendedResources - found = true - break - } - } - if !found { - return nil, fmt.Errorf("can't find NodeResourcesFitArgs in plugin config") - } - } - return fExtenders, nil -} - -// MakeDefaultErrorFunc construct a function to handle pod scheduler error -func MakeDefaultErrorFunc(client clientset.Interface, podLister corelisters.PodLister, podQueue internalqueue.SchedulingQueue, schedulerCache internalcache.Cache) func(*framework.QueuedPodInfo, error) { - return func(podInfo *framework.QueuedPodInfo, err error) { - pod := podInfo.Pod - if err == ErrNoNodesAvailable { - klog.V(2).InfoS("Unable to schedule pod; no nodes are registered to the cluster; waiting", "pod", klog.KObj(pod)) - } else if fitError, ok := err.(*framework.FitError); ok { - // Inject UnschedulablePlugins to PodInfo, which will be used later for moving Pods between queues efficiently. - podInfo.UnschedulablePlugins = fitError.Diagnosis.UnschedulablePlugins - klog.V(2).InfoS("Unable to schedule pod; no fit; waiting", "pod", klog.KObj(pod), "err", err) - } else if apierrors.IsNotFound(err) { - klog.V(2).InfoS("Unable to schedule pod, possibly due to node not found; waiting", "pod", klog.KObj(pod), "err", err) - if errStatus, ok := err.(apierrors.APIStatus); ok && errStatus.Status().Details.Kind == "node" { - nodeName := errStatus.Status().Details.Name - // when node is not found, We do not remove the node right away. Trying again to get - // the node and if the node is still not found, then remove it from the scheduler cache. - _, err := client.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{}) - if err != nil && apierrors.IsNotFound(err) { - node := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}} - if err := schedulerCache.RemoveNode(&node); err != nil { - klog.V(4).InfoS("Node is not found; failed to remove it from the cache", "node", node.Name) - } - } - } - } else { - klog.ErrorS(err, "Error scheduling pod; retrying", "pod", klog.KObj(pod)) - } - - // Check if the Pod exists in informer cache. - cachedPod, err := podLister.Pods(pod.Namespace).Get(pod.Name) - if err != nil { - klog.InfoS("Pod doesn't exist in informer cache", "pod", klog.KObj(pod), "err", err) - return - } - - // In the case of extender, the pod may have been bound successfully, but timed out returning its response to the scheduler. - // It could result in the live version to carry .spec.nodeName, and that's inconsistent with the internal-queued version. - if len(cachedPod.Spec.NodeName) != 0 { - klog.InfoS("Pod has been assigned to node. Abort adding it back to queue.", "pod", klog.KObj(pod), "node", cachedPod.Spec.NodeName) - return - } - - // As is from SharedInformer, we need to do a DeepCopy() here. 
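// A standalone sketch of the "re-check before evicting" pattern in the deleted error
// handler above: on a not-found error the node is looked up once more, and only if it is
// still missing is it dropped from the local cache. getNode and the cache map are
// hypothetical stand-ins for the client-go Get call and the scheduler cache.
func evictNodeIfGone(name string, getNode func(string) (bool, error), cache map[string]struct{}) error {
	found, err := getNode(name) // second lookup against the source of truth
	if err != nil {
		return err
	}
	if !found {
		delete(cache, name) // still missing, so it is safe to drop locally
	}
	return nil
}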
- podInfo.PodInfo = framework.NewPodInfo(cachedPod.DeepCopy()) - if err := podQueue.AddUnschedulableIfNotPresent(podInfo, podQueue.SchedulingCycle()); err != nil { - klog.ErrorS(err, "Error occurred") - } - } -} diff --git a/pkg/scheduler/factory_test.go b/pkg/scheduler/factory_test.go deleted file mode 100644 index 8cdce31e37a..00000000000 --- a/pkg/scheduler/factory_test.go +++ /dev/null @@ -1,322 +0,0 @@ -/* -Copyright 2014 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package scheduler - -import ( - "context" - "errors" - "fmt" - "testing" - "time" - - "github.com/google/go-cmp/cmp" - v1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/sets" - "k8s.io/client-go/informers" - "k8s.io/client-go/kubernetes/fake" - "k8s.io/client-go/tools/cache" - extenderv1 "k8s.io/kube-scheduler/extender/v1" - "k8s.io/kubernetes/pkg/scheduler/framework" - internalcache "k8s.io/kubernetes/pkg/scheduler/internal/cache" - internalqueue "k8s.io/kubernetes/pkg/scheduler/internal/queue" - testingclock "k8s.io/utils/clock/testing" -) - -const ( - podInitialBackoffDurationSeconds = 1 - podMaxBackoffDurationSeconds = 10 - testSchedulerName = "test-scheduler" -) - -func TestDefaultErrorFunc(t *testing.T) { - testPod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-pod", Namespace: "default"}} - testPodUpdated := testPod.DeepCopy() - testPodUpdated.Labels = map[string]string{"foo": ""} - - tests := []struct { - name string - injectErr error - podUpdatedDuringScheduling bool // pod is updated during a scheduling cycle - podDeletedDuringScheduling bool // pod is deleted during a scheduling cycle - expect *v1.Pod - }{ - { - name: "pod is updated during a scheduling cycle", - injectErr: nil, - podUpdatedDuringScheduling: true, - expect: testPodUpdated, - }, - { - name: "pod is not updated during a scheduling cycle", - injectErr: nil, - expect: testPod, - }, - { - name: "pod is deleted during a scheduling cycle", - injectErr: nil, - podDeletedDuringScheduling: true, - expect: nil, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - stopCh := make(chan struct{}) - defer close(stopCh) - - client := fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testPod}}) - informerFactory := informers.NewSharedInformerFactory(client, 0) - podInformer := informerFactory.Core().V1().Pods() - // Need to add/update/delete testPod to the store. 
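// A small sketch of why the DeepCopy above matters: objects handed out by a shared
// informer store are shared state, so any mutation has to happen on a copy. sketchPod
// and its hand-written copy are trivial stand-ins for the real Pod type.
type sketchPod struct {
	Name   string
	Labels map[string]string
}

func (p *sketchPod) deepCopy() *sketchPod {
	c := *p
	c.Labels = make(map[string]string, len(p.Labels))
	for k, v := range p.Labels {
		c.Labels[k] = v
	}
	return &c
}

// annotateForRequeue mutates only the copy, leaving the cached object untouched.
func annotateForRequeue(cached *sketchPod) *sketchPod {
	p := cached.deepCopy()
	p.Labels["requeued"] = "true"
	return p
}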
- podInformer.Informer().GetStore().Add(testPod) - - queue := internalqueue.NewPriorityQueue(nil, informerFactory, internalqueue.WithClock(testingclock.NewFakeClock(time.Now()))) - schedulerCache := internalcache.New(30*time.Second, stopCh) - - queue.Add(testPod) - queue.Pop() - - if tt.podUpdatedDuringScheduling { - podInformer.Informer().GetStore().Update(testPodUpdated) - queue.Update(testPod, testPodUpdated) - } - if tt.podDeletedDuringScheduling { - podInformer.Informer().GetStore().Delete(testPod) - queue.Delete(testPod) - } - - testPodInfo := &framework.QueuedPodInfo{PodInfo: framework.NewPodInfo(testPod)} - errFunc := MakeDefaultErrorFunc(client, podInformer.Lister(), queue, schedulerCache) - errFunc(testPodInfo, tt.injectErr) - - var got *v1.Pod - if tt.podUpdatedDuringScheduling { - head, e := queue.Pop() - if e != nil { - t.Fatalf("Cannot pop pod from the activeQ: %v", e) - } - got = head.Pod - } else { - got = getPodFromPriorityQueue(queue, testPod) - } - - if diff := cmp.Diff(tt.expect, got); diff != "" { - t.Errorf("Unexpected pod (-want, +got): %s", diff) - } - }) - } -} - -func TestDefaultErrorFunc_NodeNotFound(t *testing.T) { - nodeFoo := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "foo"}} - nodeBar := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "bar"}} - testPod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-pod", Namespace: "default"}} - tests := []struct { - name string - nodes []v1.Node - nodeNameToDelete string - injectErr error - expectNodeNames sets.String - }{ - { - name: "node is deleted during a scheduling cycle", - nodes: []v1.Node{*nodeFoo, *nodeBar}, - nodeNameToDelete: "foo", - injectErr: apierrors.NewNotFound(v1.Resource("node"), nodeFoo.Name), - expectNodeNames: sets.NewString("bar"), - }, - { - name: "node is not deleted but NodeNotFound is received incorrectly", - nodes: []v1.Node{*nodeFoo, *nodeBar}, - injectErr: apierrors.NewNotFound(v1.Resource("node"), nodeFoo.Name), - expectNodeNames: sets.NewString("foo", "bar"), - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - stopCh := make(chan struct{}) - defer close(stopCh) - - client := fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testPod}}, &v1.NodeList{Items: tt.nodes}) - informerFactory := informers.NewSharedInformerFactory(client, 0) - podInformer := informerFactory.Core().V1().Pods() - // Need to add testPod to the store. - podInformer.Informer().GetStore().Add(testPod) - - queue := internalqueue.NewPriorityQueue(nil, informerFactory, internalqueue.WithClock(testingclock.NewFakeClock(time.Now()))) - schedulerCache := internalcache.New(30*time.Second, stopCh) - - for i := range tt.nodes { - node := tt.nodes[i] - // Add node to schedulerCache no matter it's deleted in API server or not. 
- schedulerCache.AddNode(&node) - if node.Name == tt.nodeNameToDelete { - client.CoreV1().Nodes().Delete(context.TODO(), node.Name, metav1.DeleteOptions{}) - } - } - - testPodInfo := &framework.QueuedPodInfo{PodInfo: framework.NewPodInfo(testPod)} - errFunc := MakeDefaultErrorFunc(client, podInformer.Lister(), queue, schedulerCache) - errFunc(testPodInfo, tt.injectErr) - - gotNodes := schedulerCache.Dump().Nodes - gotNodeNames := sets.NewString() - for _, nodeInfo := range gotNodes { - gotNodeNames.Insert(nodeInfo.Node().Name) - } - if diff := cmp.Diff(tt.expectNodeNames, gotNodeNames); diff != "" { - t.Errorf("Unexpected nodes (-want, +got): %s", diff) - } - }) - } -} - -func TestDefaultErrorFunc_PodAlreadyBound(t *testing.T) { - stopCh := make(chan struct{}) - defer close(stopCh) - - nodeFoo := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "foo"}} - testPod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-pod", Namespace: "default"}, Spec: v1.PodSpec{NodeName: "foo"}} - - client := fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testPod}}, &v1.NodeList{Items: []v1.Node{nodeFoo}}) - informerFactory := informers.NewSharedInformerFactory(client, 0) - podInformer := informerFactory.Core().V1().Pods() - // Need to add testPod to the store. - podInformer.Informer().GetStore().Add(testPod) - - queue := internalqueue.NewPriorityQueue(nil, informerFactory, internalqueue.WithClock(testingclock.NewFakeClock(time.Now()))) - schedulerCache := internalcache.New(30*time.Second, stopCh) - - // Add node to schedulerCache no matter it's deleted in API server or not. - schedulerCache.AddNode(&nodeFoo) - - testPodInfo := &framework.QueuedPodInfo{PodInfo: framework.NewPodInfo(testPod)} - errFunc := MakeDefaultErrorFunc(client, podInformer.Lister(), queue, schedulerCache) - errFunc(testPodInfo, fmt.Errorf("binding rejected: timeout")) - - pod := getPodFromPriorityQueue(queue, testPod) - if pod != nil { - t.Fatalf("Unexpected pod: %v should not be in PriorityQueue when the NodeName of pod is not empty", pod.Name) - } -} - -// getPodFromPriorityQueue is the function used in the TestDefaultErrorFunc test to get -// the specific pod from the given priority queue. It returns the found pod in the priority queue. 
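// A standalone sketch of the lookup performed by getPodFromPriorityQueue below: the query
// object and each pending candidate are reduced to the same "namespace/name" key and
// compared, mirroring what cache.MetaNamespaceKeyFunc produces. The types are simplified
// stand-ins.
type sketchRef struct{ Namespace, Name string }

func metaKey(namespace, name string) string {
	if namespace == "" {
		return name
	}
	return namespace + "/" + name
}

func findPendingByKey(pending []sketchRef, query sketchRef) (sketchRef, bool) {
	want := metaKey(query.Namespace, query.Name)
	for _, p := range pending {
		if metaKey(p.Namespace, p.Name) == want {
			return p, true
		}
	}
	return sketchRef{}, false
}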
-func getPodFromPriorityQueue(queue *internalqueue.PriorityQueue, pod *v1.Pod) *v1.Pod { - podList := queue.PendingPods() - if len(podList) == 0 { - return nil - } - - queryPodKey, err := cache.MetaNamespaceKeyFunc(pod) - if err != nil { - return nil - } - - for _, foundPod := range podList { - foundPodKey, err := cache.MetaNamespaceKeyFunc(foundPod) - if err != nil { - return nil - } - - if foundPodKey == queryPodKey { - return foundPod - } - } - - return nil -} - -type fakeExtender struct { - isBinder bool - interestedPodName string - ignorable bool - gotBind bool -} - -func (f *fakeExtender) Name() string { - return "fakeExtender" -} - -func (f *fakeExtender) IsIgnorable() bool { - return f.ignorable -} - -func (f *fakeExtender) ProcessPreemption( - _ *v1.Pod, - _ map[string]*extenderv1.Victims, - _ framework.NodeInfoLister, -) (map[string]*extenderv1.Victims, error) { - return nil, nil -} - -func (f *fakeExtender) SupportsPreemption() bool { - return false -} - -func (f *fakeExtender) Filter(pod *v1.Pod, nodes []*v1.Node) ([]*v1.Node, extenderv1.FailedNodesMap, extenderv1.FailedNodesMap, error) { - return nil, nil, nil, nil -} - -func (f *fakeExtender) Prioritize( - _ *v1.Pod, - _ []*v1.Node, -) (hostPriorities *extenderv1.HostPriorityList, weight int64, err error) { - return nil, 0, nil -} - -func (f *fakeExtender) Bind(binding *v1.Binding) error { - if f.isBinder { - f.gotBind = true - return nil - } - return errors.New("not a binder") -} - -func (f *fakeExtender) IsBinder() bool { - return f.isBinder -} - -func (f *fakeExtender) IsInterested(pod *v1.Pod) bool { - return pod != nil && pod.Name == f.interestedPodName -} - -type TestPlugin struct { - name string -} - -var _ framework.ScorePlugin = &TestPlugin{} -var _ framework.FilterPlugin = &TestPlugin{} - -func (t *TestPlugin) Name() string { - return t.name -} - -func (t *TestPlugin) Score(ctx context.Context, state *framework.CycleState, p *v1.Pod, nodeName string) (int64, *framework.Status) { - return 1, nil -} - -func (t *TestPlugin) ScoreExtensions() framework.ScoreExtensions { - return nil -} - -func (t *TestPlugin) Filter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status { - return nil -} diff --git a/pkg/scheduler/schedule_one.go b/pkg/scheduler/schedule_one.go new file mode 100644 index 00000000000..cdbdff3ffb0 --- /dev/null +++ b/pkg/scheduler/schedule_one.go @@ -0,0 +1,859 @@ +/* +Copyright 2014 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package scheduler + +import ( + "context" + "fmt" + "math/rand" + "strconv" + "sync" + "sync/atomic" + "time" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/sets" + clientset "k8s.io/client-go/kubernetes" + "k8s.io/klog/v2" + extenderv1 "k8s.io/kube-scheduler/extender/v1" + podutil "k8s.io/kubernetes/pkg/api/v1/pod" + "k8s.io/kubernetes/pkg/apis/core/validation" + "k8s.io/kubernetes/pkg/scheduler/framework" + "k8s.io/kubernetes/pkg/scheduler/framework/parallelize" + frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime" + internalqueue "k8s.io/kubernetes/pkg/scheduler/internal/queue" + "k8s.io/kubernetes/pkg/scheduler/metrics" + "k8s.io/kubernetes/pkg/scheduler/util" + utiltrace "k8s.io/utils/trace" +) + +const ( + // SchedulerError is the reason recorded for events when an error occurs during scheduling a pod. + SchedulerError = "SchedulerError" + // Percentage of plugin metrics to be sampled. + pluginMetricsSamplePercent = 10 + // minFeasibleNodesToFind is the minimum number of nodes that would be scored + // in each scheduling cycle. This is a semi-arbitrary value to ensure that a + // certain minimum of nodes are checked for feasibility. This in turn helps + // ensure a minimum level of spreading. + minFeasibleNodesToFind = 100 + // minFeasibleNodesPercentageToFind is the minimum percentage of nodes that + // would be scored in each scheduling cycle. This is a semi-arbitrary value + // to ensure that a certain minimum of nodes are checked for feasibility. + // This in turn helps ensure a minimum level of spreading. + minFeasibleNodesPercentageToFind = 5 +) + +var clearNominatedNode = &framework.NominatingInfo{NominatingMode: framework.ModeOverride, NominatedNodeName: ""} + +// scheduleOne does the entire scheduling workflow for a single pod. It is serialized on the scheduling algorithm's host fitting. +func (sched *Scheduler) scheduleOne(ctx context.Context) { + podInfo := sched.NextPod() + // pod could be nil when schedulerQueue is closed + if podInfo == nil || podInfo.Pod == nil { + return + } + pod := podInfo.Pod + fwk, err := sched.frameworkForPod(pod) + if err != nil { + // This shouldn't happen, because we only accept for scheduling the pods + // which specify a scheduler name that matches one of the profiles. + klog.ErrorS(err, "Error occurred") + return + } + if sched.skipPodSchedule(fwk, pod) { + return + } + + klog.V(3).InfoS("Attempting to schedule pod", "pod", klog.KObj(pod)) + + // Synchronously attempt to find a fit for the pod. + start := time.Now() + state := framework.NewCycleState() + state.SetRecordPluginMetrics(rand.Intn(100) < pluginMetricsSamplePercent) + // Initialize an empty podsToActivate struct, which will be filled up by plugins or stay empty. + podsToActivate := framework.NewPodsToActivate() + state.Write(framework.PodsToActivateKey, podsToActivate) + + schedulingCycleCtx, cancel := context.WithCancel(ctx) + defer cancel() + scheduleResult, err := sched.SchedulePod(schedulingCycleCtx, fwk, state, pod) + if err != nil { + // SchedulePod() may have failed because the pod would not fit on any host, so we try to + // preempt, with the expectation that the next time the pod is tried for scheduling it + // will fit due to the preemption. It is also possible that a different pod will schedule + // into the resources that were preempted, but this is harmless. 
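// An aside on the SetRecordPluginMetrics gate above: rand.Intn(100) returns a uniform
// value in [0, 100), so the comparison against pluginMetricsSamplePercent is true for
// roughly that percentage of scheduling cycles, which keeps per-plugin metric overhead
// bounded. A standalone copy of the gate (uses math/rand):
func shouldSamplePluginMetrics(percent int) bool {
	return rand.Intn(100) < percent // true with probability percent/100
}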
+ var nominatingInfo *framework.NominatingInfo + if fitError, ok := err.(*framework.FitError); ok { + if !fwk.HasPostFilterPlugins() { + klog.V(3).InfoS("No PostFilter plugins are registered, so no preemption will be performed") + } else { + // Run PostFilter plugins to try to make the pod schedulable in a future scheduling cycle. + result, status := fwk.RunPostFilterPlugins(ctx, state, pod, fitError.Diagnosis.NodeToStatusMap) + if status.Code() == framework.Error { + klog.ErrorS(nil, "Status after running PostFilter plugins for pod", "pod", klog.KObj(pod), "status", status) + } else { + fitError.Diagnosis.PostFilterMsg = status.Message() + klog.V(5).InfoS("Status after running PostFilter plugins for pod", "pod", klog.KObj(pod), "status", status) + } + if result != nil { + nominatingInfo = result.NominatingInfo + } + } + // Pod did not fit anywhere, so it is counted as a failure. If preemption + // succeeds, the pod should get counted as a success the next time we try to + // schedule it. (hopefully) + metrics.PodUnschedulable(fwk.ProfileName(), metrics.SinceInSeconds(start)) + } else if err == ErrNoNodesAvailable { + nominatingInfo = clearNominatedNode + // No nodes available is counted as unschedulable rather than an error. + metrics.PodUnschedulable(fwk.ProfileName(), metrics.SinceInSeconds(start)) + } else { + nominatingInfo = clearNominatedNode + klog.ErrorS(err, "Error selecting node for pod", "pod", klog.KObj(pod)) + metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start)) + } + sched.handleSchedulingFailure(fwk, podInfo, err, v1.PodReasonUnschedulable, nominatingInfo) + return + } + metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInSeconds(start)) + // Tell the cache to assume that a pod now is running on a given node, even though it hasn't been bound yet. + // This allows us to keep scheduling without waiting on binding to occur. + assumedPodInfo := podInfo.DeepCopy() + assumedPod := assumedPodInfo.Pod + // assume modifies `assumedPod` by setting NodeName=scheduleResult.SuggestedHost + err = sched.assume(assumedPod, scheduleResult.SuggestedHost) + if err != nil { + metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start)) + // This is most probably result of a BUG in retrying logic. + // We report an error here so that pod scheduling can be retried. + // This relies on the fact that Error will check if the pod has been bound + // to a node and if so will not add it back to the unscheduled pods queue + // (otherwise this would cause an infinite loop). + sched.handleSchedulingFailure(fwk, assumedPodInfo, err, SchedulerError, clearNominatedNode) + return + } + + // Run the Reserve method of reserve plugins. + if sts := fwk.RunReservePluginsReserve(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost); !sts.IsSuccess() { + metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start)) + // trigger un-reserve to clean up state associated with the reserved Pod + fwk.RunReservePluginsUnreserve(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) + if forgetErr := sched.Cache.ForgetPod(assumedPod); forgetErr != nil { + klog.ErrorS(forgetErr, "Scheduler cache ForgetPod failed") + } + sched.handleSchedulingFailure(fwk, assumedPodInfo, sts.AsError(), SchedulerError, clearNominatedNode) + return + } + + // Run "permit" plugins. 
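// A minimal standalone sketch of the optimistic "assume" step used above: the pod is
// recorded against its chosen node in an in-memory cache before the API binding happens,
// so the next pod's scheduling cycle can start immediately, and a failed binding later
// "forgets" the entry. assumeCache is a hypothetical stand-in for the scheduler's
// internal cache (uses sync).
type assumeCache struct {
	mu      sync.Mutex
	assumed map[string]string // pod UID -> node name
}

func newAssumeCache() *assumeCache {
	return &assumeCache{assumed: make(map[string]string)}
}

func (c *assumeCache) Assume(podUID, node string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.assumed[podUID] = node
}

func (c *assumeCache) Forget(podUID string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	delete(c.assumed, podUID)
}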
+ runPermitStatus := fwk.RunPermitPlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) + if runPermitStatus.Code() != framework.Wait && !runPermitStatus.IsSuccess() { + var reason string + if runPermitStatus.IsUnschedulable() { + metrics.PodUnschedulable(fwk.ProfileName(), metrics.SinceInSeconds(start)) + reason = v1.PodReasonUnschedulable + } else { + metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start)) + reason = SchedulerError + } + // One of the plugins returned status different than success or wait. + fwk.RunReservePluginsUnreserve(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) + if forgetErr := sched.Cache.ForgetPod(assumedPod); forgetErr != nil { + klog.ErrorS(forgetErr, "Scheduler cache ForgetPod failed") + } + sched.handleSchedulingFailure(fwk, assumedPodInfo, runPermitStatus.AsError(), reason, clearNominatedNode) + return + } + + // At the end of a successful scheduling cycle, pop and move up Pods if needed. + if len(podsToActivate.Map) != 0 { + sched.SchedulingQueue.Activate(podsToActivate.Map) + // Clear the entries after activation. + podsToActivate.Map = make(map[string]*v1.Pod) + } + + // bind the pod to its host asynchronously (we can do this b/c of the assumption step above). + go func() { + bindingCycleCtx, cancel := context.WithCancel(ctx) + defer cancel() + metrics.SchedulerGoroutines.WithLabelValues(metrics.Binding).Inc() + defer metrics.SchedulerGoroutines.WithLabelValues(metrics.Binding).Dec() + + waitOnPermitStatus := fwk.WaitOnPermit(bindingCycleCtx, assumedPod) + if !waitOnPermitStatus.IsSuccess() { + var reason string + if waitOnPermitStatus.IsUnschedulable() { + metrics.PodUnschedulable(fwk.ProfileName(), metrics.SinceInSeconds(start)) + reason = v1.PodReasonUnschedulable + } else { + metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start)) + reason = SchedulerError + } + // trigger un-reserve plugins to clean up state associated with the reserved Pod + fwk.RunReservePluginsUnreserve(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) + if forgetErr := sched.Cache.ForgetPod(assumedPod); forgetErr != nil { + klog.ErrorS(forgetErr, "scheduler cache ForgetPod failed") + } else { + // "Forget"ing an assumed Pod in binding cycle should be treated as a PodDelete event, + // as the assumed Pod had occupied a certain amount of resources in scheduler cache. + // TODO(#103853): de-duplicate the logic. + // Avoid moving the assumed Pod itself as it's always Unschedulable. + // It's intentional to "defer" this operation; otherwise MoveAllToActiveOrBackoffQueue() would + // update `q.moveRequest` and thus move the assumed pod to backoffQ anyways. + defer sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(internalqueue.AssignedPodDelete, func(pod *v1.Pod) bool { + return assumedPod.UID != pod.UID + }) + } + sched.handleSchedulingFailure(fwk, assumedPodInfo, waitOnPermitStatus.AsError(), reason, clearNominatedNode) + return + } + + // Run "prebind" plugins. 
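// A sketch of the binding-cycle shape used above: the bind work runs in its own goroutine
// with a cancellable context, and every failure path both undoes the optimistic
// assumption and reports the error so the pod can be retried. bindFn, forget and report
// are hypothetical stand-ins (uses context).
func bindAsync(ctx context.Context, bindFn func(context.Context) error, forget func(), report func(error)) {
	go func() {
		bindCtx, cancel := context.WithCancel(ctx)
		defer cancel()
		if err := bindFn(bindCtx); err != nil {
			forget()    // free the assumed pod's resources in the cache
			report(err) // requeue / record the scheduling failure
		}
	}()
}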
+ preBindStatus := fwk.RunPreBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) + if !preBindStatus.IsSuccess() { + metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start)) + // trigger un-reserve plugins to clean up state associated with the reserved Pod + fwk.RunReservePluginsUnreserve(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) + if forgetErr := sched.Cache.ForgetPod(assumedPod); forgetErr != nil { + klog.ErrorS(forgetErr, "scheduler cache ForgetPod failed") + } else { + // "Forget"ing an assumed Pod in binding cycle should be treated as a PodDelete event, + // as the assumed Pod had occupied a certain amount of resources in scheduler cache. + // TODO(#103853): de-duplicate the logic. + sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(internalqueue.AssignedPodDelete, nil) + } + sched.handleSchedulingFailure(fwk, assumedPodInfo, preBindStatus.AsError(), SchedulerError, clearNominatedNode) + return + } + + err := sched.bind(bindingCycleCtx, fwk, assumedPod, scheduleResult.SuggestedHost, state) + if err != nil { + metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start)) + // trigger un-reserve plugins to clean up state associated with the reserved Pod + fwk.RunReservePluginsUnreserve(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) + if err := sched.Cache.ForgetPod(assumedPod); err != nil { + klog.ErrorS(err, "scheduler cache ForgetPod failed") + } else { + // "Forget"ing an assumed Pod in binding cycle should be treated as a PodDelete event, + // as the assumed Pod had occupied a certain amount of resources in scheduler cache. + // TODO(#103853): de-duplicate the logic. + sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(internalqueue.AssignedPodDelete, nil) + } + sched.handleSchedulingFailure(fwk, assumedPodInfo, fmt.Errorf("binding rejected: %w", err), SchedulerError, clearNominatedNode) + return + } + // Calculating nodeResourceString can be heavy. Avoid it if klog verbosity is below 2. + klog.V(2).InfoS("Successfully bound pod to node", "pod", klog.KObj(pod), "node", scheduleResult.SuggestedHost, "evaluatedNodes", scheduleResult.EvaluatedNodes, "feasibleNodes", scheduleResult.FeasibleNodes) + metrics.PodScheduled(fwk.ProfileName(), metrics.SinceInSeconds(start)) + metrics.PodSchedulingAttempts.Observe(float64(podInfo.Attempts)) + metrics.PodSchedulingDuration.WithLabelValues(getAttemptsLabel(podInfo)).Observe(metrics.SinceInSeconds(podInfo.InitialAttemptTimestamp)) + + // Run "postbind" plugins. + fwk.RunPostBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) + + // At the end of a successful binding cycle, move up Pods if needed. + if len(podsToActivate.Map) != 0 { + sched.SchedulingQueue.Activate(podsToActivate.Map) + // Unlike the logic in scheduling cycle, we don't bother deleting the entries + // as `podsToActivate.Map` is no longer consumed. + } + }() +} + +func (sched *Scheduler) frameworkForPod(pod *v1.Pod) (framework.Framework, error) { + fwk, ok := sched.Profiles[pod.Spec.SchedulerName] + if !ok { + return nil, fmt.Errorf("profile not found for scheduler name %q", pod.Spec.SchedulerName) + } + return fwk, nil +} + +// skipPodSchedule returns true if we could skip scheduling the pod for specified cases. +func (sched *Scheduler) skipPodSchedule(fwk framework.Framework, pod *v1.Pod) bool { + // Case 1: pod is being deleted. 
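// A standalone sketch of the lookup in frameworkForPod above: profiles are keyed by
// scheduler name, and a pod naming an unknown scheduler is rejected with an error rather
// than being handled by an arbitrary profile. The string value is a stand-in for the
// framework object (uses fmt).
func profileForSchedulerName(profiles map[string]string, schedulerName string) (string, error) {
	p, ok := profiles[schedulerName]
	if !ok {
		return "", fmt.Errorf("profile not found for scheduler name %q", schedulerName)
	}
	return p, nil
}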
+ if pod.DeletionTimestamp != nil { + fwk.EventRecorder().Eventf(pod, nil, v1.EventTypeWarning, "FailedScheduling", "Scheduling", "skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name) + klog.V(3).InfoS("Skip schedule deleting pod", "pod", klog.KObj(pod)) + return true + } + + // Case 2: pod that has been assumed could be skipped. + // An assumed pod can be added again to the scheduling queue if it got an update event + // during its previous scheduling cycle but before getting assumed. + isAssumed, err := sched.Cache.IsAssumedPod(pod) + if err != nil { + utilruntime.HandleError(fmt.Errorf("failed to check whether pod %s/%s is assumed: %v", pod.Namespace, pod.Name, err)) + return false + } + return isAssumed +} + +// schedulePod tries to schedule the given pod to one of the nodes in the node list. +// If it succeeds, it will return the name of the node. +// If it fails, it will return a FitError with reasons. +func (sched *Scheduler) schedulePod(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) (result ScheduleResult, err error) { + trace := utiltrace.New("Scheduling", utiltrace.Field{Key: "namespace", Value: pod.Namespace}, utiltrace.Field{Key: "name", Value: pod.Name}) + defer trace.LogIfLong(100 * time.Millisecond) + + if err := sched.Cache.UpdateSnapshot(sched.nodeInfoSnapshot); err != nil { + return result, err + } + trace.Step("Snapshotting scheduler cache and node infos done") + + if sched.nodeInfoSnapshot.NumNodes() == 0 { + return result, ErrNoNodesAvailable + } + + feasibleNodes, diagnosis, err := sched.findNodesThatFitPod(ctx, fwk, state, pod) + if err != nil { + return result, err + } + trace.Step("Computing predicates done") + + if len(feasibleNodes) == 0 { + return result, &framework.FitError{ + Pod: pod, + NumAllNodes: sched.nodeInfoSnapshot.NumNodes(), + Diagnosis: diagnosis, + } + } + + // When only one node after predicate, just use it. + if len(feasibleNodes) == 1 { + return ScheduleResult{ + SuggestedHost: feasibleNodes[0].Name, + EvaluatedNodes: 1 + len(diagnosis.NodeToStatusMap), + FeasibleNodes: 1, + }, nil + } + + priorityList, err := prioritizeNodes(ctx, sched.Extenders, fwk, state, pod, feasibleNodes) + if err != nil { + return result, err + } + + host, err := selectHost(priorityList) + trace.Step("Prioritizing done") + + return ScheduleResult{ + SuggestedHost: host, + EvaluatedNodes: len(feasibleNodes) + len(diagnosis.NodeToStatusMap), + FeasibleNodes: len(feasibleNodes), + }, err +} + +// Filters the nodes to find the ones that fit the pod based on the framework +// filter plugins and filter extenders. +func (sched *Scheduler) findNodesThatFitPod(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) ([]*v1.Node, framework.Diagnosis, error) { + diagnosis := framework.Diagnosis{ + NodeToStatusMap: make(framework.NodeToStatusMap), + UnschedulablePlugins: sets.NewString(), + } + + // Run "prefilter" plugins. + preRes, s := fwk.RunPreFilterPlugins(ctx, state, pod) + allNodes, err := sched.nodeInfoSnapshot.NodeInfos().List() + if err != nil { + return nil, diagnosis, err + } + if !s.IsSuccess() { + if !s.IsUnschedulable() { + return nil, diagnosis, s.AsError() + } + // All nodes will have the same status. Some non trivial refactoring is + // needed to avoid this copy. + for _, n := range allNodes { + diagnosis.NodeToStatusMap[n.Node().Name] = s + } + // Status satisfying IsUnschedulable() gets injected into diagnosis.UnschedulablePlugins. 
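// A condensed standalone sketch of the schedulePod flow above: filter to the feasible
// set, short-circuit when exactly one node remains (scoring cannot change the outcome),
// and otherwise score and select a host. The function values are hypothetical stand-ins
// for the filter plugins, score plugins and extenders (uses errors).
func pickHost(nodes []string, filter func([]string) []string, scoreAndSelect func([]string) (string, error)) (string, error) {
	if len(nodes) == 0 {
		return "", errors.New("no nodes available")
	}
	feasible := filter(nodes)
	if len(feasible) == 0 {
		return "", errors.New("no node fits the pod")
	}
	if len(feasible) == 1 {
		return feasible[0], nil
	}
	return scoreAndSelect(feasible)
}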
+ if s.FailedPlugin() != "" { + diagnosis.UnschedulablePlugins.Insert(s.FailedPlugin()) + } + return nil, diagnosis, nil + } + + // "NominatedNodeName" can potentially be set in a previous scheduling cycle as a result of preemption. + // This node is likely the only candidate that will fit the pod, and hence we try it first before iterating over all nodes. + if len(pod.Status.NominatedNodeName) > 0 { + feasibleNodes, err := sched.evaluateNominatedNode(ctx, pod, fwk, state, diagnosis) + if err != nil { + klog.ErrorS(err, "Evaluation failed on nominated node", "pod", klog.KObj(pod), "node", pod.Status.NominatedNodeName) + } + // Nominated node passes all the filters, scheduler is good to assign this node to the pod. + if len(feasibleNodes) != 0 { + return feasibleNodes, diagnosis, nil + } + } + + nodes := allNodes + if !preRes.AllNodes() { + nodes = make([]*framework.NodeInfo, 0, len(preRes.NodeNames)) + for n := range preRes.NodeNames { + nInfo, err := sched.nodeInfoSnapshot.NodeInfos().Get(n) + if err != nil { + return nil, diagnosis, err + } + nodes = append(nodes, nInfo) + } + } + feasibleNodes, err := sched.findNodesThatPassFilters(ctx, fwk, state, pod, diagnosis, nodes) + if err != nil { + return nil, diagnosis, err + } + + feasibleNodes, err = findNodesThatPassExtenders(sched.Extenders, pod, feasibleNodes, diagnosis.NodeToStatusMap) + if err != nil { + return nil, diagnosis, err + } + return feasibleNodes, diagnosis, nil +} + +func (sched *Scheduler) evaluateNominatedNode(ctx context.Context, pod *v1.Pod, fwk framework.Framework, state *framework.CycleState, diagnosis framework.Diagnosis) ([]*v1.Node, error) { + nnn := pod.Status.NominatedNodeName + nodeInfo, err := sched.nodeInfoSnapshot.Get(nnn) + if err != nil { + return nil, err + } + node := []*framework.NodeInfo{nodeInfo} + feasibleNodes, err := sched.findNodesThatPassFilters(ctx, fwk, state, pod, diagnosis, node) + if err != nil { + return nil, err + } + + feasibleNodes, err = findNodesThatPassExtenders(sched.Extenders, pod, feasibleNodes, diagnosis.NodeToStatusMap) + if err != nil { + return nil, err + } + + return feasibleNodes, nil +} + +// findNodesThatPassFilters finds the nodes that fit the filter plugins. +func (sched *Scheduler) findNodesThatPassFilters( + ctx context.Context, + fwk framework.Framework, + state *framework.CycleState, + pod *v1.Pod, + diagnosis framework.Diagnosis, + nodes []*framework.NodeInfo) ([]*v1.Node, error) { + numNodesToFind := sched.numFeasibleNodesToFind(int32(len(nodes))) + + // Create feasible list with enough space to avoid growing it + // and allow assigning. + feasibleNodes := make([]*v1.Node, numNodesToFind) + + if !fwk.HasFilterPlugins() { + length := len(nodes) + for i := range feasibleNodes { + feasibleNodes[i] = nodes[(sched.nextStartNodeIndex+i)%length].Node() + } + sched.nextStartNodeIndex = (sched.nextStartNodeIndex + len(feasibleNodes)) % length + return feasibleNodes, nil + } + + errCh := parallelize.NewErrorChannel() + var statusesLock sync.Mutex + var feasibleNodesLen int32 + ctx, cancel := context.WithCancel(ctx) + checkNode := func(i int) { + // We check the nodes starting from where we left off in the previous scheduling cycle, + // this is to make sure all nodes have the same chance of being examined across pods. 
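// A small sketch of the rotation described above: each cycle starts filtering at a saved
// offset, wraps around with a modulo, and then advances the offset by the number of nodes
// it actually processed, so no fixed prefix of the node list is favored across pods.
func nodesInRotation(nodes []string, start, count int) (visited []string, nextStart int) {
	if len(nodes) == 0 {
		return nil, 0
	}
	for i := 0; i < count && i < len(nodes); i++ {
		visited = append(visited, nodes[(start+i)%len(nodes)])
	}
	return visited, (start + len(visited)) % len(nodes)
}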
+ nodeInfo := nodes[(sched.nextStartNodeIndex+i)%len(nodes)] + status := fwk.RunFilterPluginsWithNominatedPods(ctx, state, pod, nodeInfo) + if status.Code() == framework.Error { + errCh.SendErrorWithCancel(status.AsError(), cancel) + return + } + if status.IsSuccess() { + length := atomic.AddInt32(&feasibleNodesLen, 1) + if length > numNodesToFind { + cancel() + atomic.AddInt32(&feasibleNodesLen, -1) + } else { + feasibleNodes[length-1] = nodeInfo.Node() + } + } else { + statusesLock.Lock() + diagnosis.NodeToStatusMap[nodeInfo.Node().Name] = status + diagnosis.UnschedulablePlugins.Insert(status.FailedPlugin()) + statusesLock.Unlock() + } + } + + beginCheckNode := time.Now() + statusCode := framework.Success + defer func() { + // We record Filter extension point latency here instead of in framework.go because framework.RunFilterPlugins + // function is called for each node, whereas we want to have an overall latency for all nodes per scheduling cycle. + // Note that this latency also includes latency for `addNominatedPods`, which calls framework.RunPreFilterAddPod. + metrics.FrameworkExtensionPointDuration.WithLabelValues(frameworkruntime.Filter, statusCode.String(), fwk.ProfileName()).Observe(metrics.SinceInSeconds(beginCheckNode)) + }() + + // Stops searching for more nodes once the configured number of feasible nodes + // are found. + fwk.Parallelizer().Until(ctx, len(nodes), checkNode) + processedNodes := int(feasibleNodesLen) + len(diagnosis.NodeToStatusMap) + sched.nextStartNodeIndex = (sched.nextStartNodeIndex + processedNodes) % len(nodes) + + feasibleNodes = feasibleNodes[:feasibleNodesLen] + if err := errCh.ReceiveError(); err != nil { + statusCode = framework.Error + return nil, err + } + return feasibleNodes, nil +} + +// numFeasibleNodesToFind returns the number of feasible nodes that once found, the scheduler stops +// its search for more feasible nodes. +func (sched *Scheduler) numFeasibleNodesToFind(numAllNodes int32) (numNodes int32) { + if numAllNodes < minFeasibleNodesToFind || sched.percentageOfNodesToScore >= 100 { + return numAllNodes + } + + adaptivePercentage := sched.percentageOfNodesToScore + if adaptivePercentage <= 0 { + basePercentageOfNodesToScore := int32(50) + adaptivePercentage = basePercentageOfNodesToScore - numAllNodes/125 + if adaptivePercentage < minFeasibleNodesPercentageToFind { + adaptivePercentage = minFeasibleNodesPercentageToFind + } + } + + numNodes = numAllNodes * adaptivePercentage / 100 + if numNodes < minFeasibleNodesToFind { + return minFeasibleNodesToFind + } + + return numNodes +} + +func findNodesThatPassExtenders(extenders []framework.Extender, pod *v1.Pod, feasibleNodes []*v1.Node, statuses framework.NodeToStatusMap) ([]*v1.Node, error) { + // Extenders are called sequentially. + // Nodes in original feasibleNodes can be excluded in one extender, and pass on to the next + // extender in a decreasing manner. + for _, extender := range extenders { + if len(feasibleNodes) == 0 { + break + } + if !extender.IsInterested(pod) { + continue + } + + // Status of failed nodes in failedAndUnresolvableMap will be added or overwritten in , + // so that the scheduler framework can respect the UnschedulableAndUnresolvable status for + // particular nodes, and this may eventually improve preemption efficiency. + // Note: users are recommended to configure the extenders that may return UnschedulableAndUnresolvable + // status ahead of others. 
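// A worked example of numFeasibleNodesToFind above, assuming percentageOfNodesToScore is
// left at 0: with 5000 nodes the adaptive percentage is 50 - 5000/125 = 10, so filtering
// stops after 5000*10/100 = 500 feasible nodes; with 100 nodes or fewer the whole list is
// always checked. A standalone copy of the same arithmetic:
func numNodesToFind(numAllNodes, configuredPercentage int32) int32 {
	const minNodes, minPercentage = 100, 5
	if numAllNodes < minNodes || configuredPercentage >= 100 {
		return numAllNodes
	}
	p := configuredPercentage
	if p <= 0 {
		p = 50 - numAllNodes/125
		if p < minPercentage {
			p = minPercentage
		}
	}
	if n := numAllNodes * p / 100; n >= minNodes {
		return n
	}
	return minNodes
}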
+ feasibleList, failedMap, failedAndUnresolvableMap, err := extender.Filter(pod, feasibleNodes) + if err != nil { + if extender.IsIgnorable() { + klog.InfoS("Skipping extender as it returned error and has ignorable flag set", "extender", extender, "err", err) + continue + } + return nil, err + } + + for failedNodeName, failedMsg := range failedAndUnresolvableMap { + var aggregatedReasons []string + if _, found := statuses[failedNodeName]; found { + aggregatedReasons = statuses[failedNodeName].Reasons() + } + aggregatedReasons = append(aggregatedReasons, failedMsg) + statuses[failedNodeName] = framework.NewStatus(framework.UnschedulableAndUnresolvable, aggregatedReasons...) + } + + for failedNodeName, failedMsg := range failedMap { + if _, found := failedAndUnresolvableMap[failedNodeName]; found { + // failedAndUnresolvableMap takes precedence over failedMap + // note that this only happens if the extender returns the node in both maps + continue + } + if _, found := statuses[failedNodeName]; !found { + statuses[failedNodeName] = framework.NewStatus(framework.Unschedulable, failedMsg) + } else { + statuses[failedNodeName].AppendReason(failedMsg) + } + } + + feasibleNodes = feasibleList + } + return feasibleNodes, nil +} + +// prioritizeNodes prioritizes the nodes by running the score plugins, +// which return a score for each node from the call to RunScorePlugins(). +// The scores from each plugin are added together to make the score for that node, then +// any extenders are run as well. +// All scores are finally combined (added) to get the total weighted scores of all nodes +func prioritizeNodes( + ctx context.Context, + extenders []framework.Extender, + fwk framework.Framework, + state *framework.CycleState, + pod *v1.Pod, + nodes []*v1.Node, +) (framework.NodeScoreList, error) { + // If no priority configs are provided, then all nodes will have a score of one. + // This is required to generate the priority list in the required format + if len(extenders) == 0 && !fwk.HasScorePlugins() { + result := make(framework.NodeScoreList, 0, len(nodes)) + for i := range nodes { + result = append(result, framework.NodeScore{ + Name: nodes[i].Name, + Score: 1, + }) + } + return result, nil + } + + // Run PreScore plugins. + preScoreStatus := fwk.RunPreScorePlugins(ctx, state, pod, nodes) + if !preScoreStatus.IsSuccess() { + return nil, preScoreStatus.AsError() + } + + // Run the Score plugins. + scoresMap, scoreStatus := fwk.RunScorePlugins(ctx, state, pod, nodes) + if !scoreStatus.IsSuccess() { + return nil, scoreStatus.AsError() + } + + // Additional details logged at level 10 if enabled. + klogV := klog.V(10) + if klogV.Enabled() { + for plugin, nodeScoreList := range scoresMap { + for _, nodeScore := range nodeScoreList { + klogV.InfoS("Plugin scored node for pod", "pod", klog.KObj(pod), "plugin", plugin, "node", nodeScore.Name, "score", nodeScore.Score) + } + } + } + + // Summarize all scores. 
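// A standalone sketch of the two summation steps that follow: per-plugin scores for a
// node are simply added, while each extender's score is multiplied by that extender's
// weight and rescaled from the extender range to the plugin range before being added.
// The constants assume the usual MaxExtenderPriority of 10 and MaxNodeScore of 100.
const (
	sketchMaxExtenderPriority = 10
	sketchMaxNodeScore        = 100
)

func totalNodeScore(pluginScores []int64, weightedExtenderScore int64) int64 {
	var sum int64
	for _, s := range pluginScores {
		sum += s // plugin scores already use the 0..MaxNodeScore scale
	}
	// Extender scores use a smaller range, so scale them up before adding.
	return sum + weightedExtenderScore*(sketchMaxNodeScore/sketchMaxExtenderPriority)
}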
+ result := make(framework.NodeScoreList, 0, len(nodes)) + + for i := range nodes { + result = append(result, framework.NodeScore{Name: nodes[i].Name, Score: 0}) + for j := range scoresMap { + result[i].Score += scoresMap[j][i].Score + } + } + + if len(extenders) != 0 && nodes != nil { + var mu sync.Mutex + var wg sync.WaitGroup + combinedScores := make(map[string]int64, len(nodes)) + for i := range extenders { + if !extenders[i].IsInterested(pod) { + continue + } + wg.Add(1) + go func(extIndex int) { + metrics.SchedulerGoroutines.WithLabelValues(metrics.PrioritizingExtender).Inc() + defer func() { + metrics.SchedulerGoroutines.WithLabelValues(metrics.PrioritizingExtender).Dec() + wg.Done() + }() + prioritizedList, weight, err := extenders[extIndex].Prioritize(pod, nodes) + if err != nil { + // Prioritization errors from extender can be ignored, let k8s/other extenders determine the priorities + klog.V(5).InfoS("Failed to run extender's priority function. No score given by this extender.", "error", err, "pod", klog.KObj(pod), "extender", extenders[extIndex].Name()) + return + } + mu.Lock() + for i := range *prioritizedList { + host, score := (*prioritizedList)[i].Host, (*prioritizedList)[i].Score + if klogV.Enabled() { + klogV.InfoS("Extender scored node for pod", "pod", klog.KObj(pod), "extender", extenders[extIndex].Name(), "node", host, "score", score) + } + combinedScores[host] += score * weight + } + mu.Unlock() + }(i) + } + // wait for all go routines to finish + wg.Wait() + for i := range result { + // MaxExtenderPriority may diverge from the max priority used in the scheduler and defined by MaxNodeScore, + // therefore we need to scale the score returned by extenders to the score range used by the scheduler. + result[i].Score += combinedScores[result[i].Name] * (framework.MaxNodeScore / extenderv1.MaxExtenderPriority) + } + } + + if klogV.Enabled() { + for i := range result { + klogV.InfoS("Calculated node's final score for pod", "pod", klog.KObj(pod), "node", result[i].Name, "score", result[i].Score) + } + } + return result, nil +} + +// selectHost takes a prioritized list of nodes and then picks one +// in a reservoir sampling manner from the nodes that had the highest score. +func selectHost(nodeScoreList framework.NodeScoreList) (string, error) { + if len(nodeScoreList) == 0 { + return "", fmt.Errorf("empty priorityList") + } + maxScore := nodeScoreList[0].Score + selected := nodeScoreList[0].Name + cntOfMaxScore := 1 + for _, ns := range nodeScoreList[1:] { + if ns.Score > maxScore { + maxScore = ns.Score + selected = ns.Name + cntOfMaxScore = 1 + } else if ns.Score == maxScore { + cntOfMaxScore++ + if rand.Intn(cntOfMaxScore) == 0 { + // Replace the candidate with probability of 1/cntOfMaxScore + selected = ns.Name + } + } + } + return selected, nil +} + +// assume signals to the cache that a pod is already in the cache, so that binding can be asynchronous. +// assume modifies `assumed`. +func (sched *Scheduler) assume(assumed *v1.Pod, host string) error { + // Optimistically assume that the binding will succeed and send it to apiserver + // in the background. + // If the binding fails, scheduler will release resources allocated to assumed pod + // immediately. 
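// A standalone sketch of the tie-breaking in selectHost above: while scanning the scored
// list, each node that ties the current maximum replaces the selection with probability
// 1/countOfTies, which picks uniformly among all top-scoring nodes in a single pass
// (reservoir sampling). Assumes the two slices are non-empty and of equal length
// (uses math/rand).
func pickTopScorer(names []string, scores []int64) string {
	best, maxScore, ties := names[0], scores[0], 1
	for i := 1; i < len(names); i++ {
		switch {
		case scores[i] > maxScore:
			best, maxScore, ties = names[i], scores[i], 1
		case scores[i] == maxScore:
			ties++
			if rand.Intn(ties) == 0 { // keep each tied candidate with probability 1/ties
				best = names[i]
			}
		}
	}
	return best
}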
+ assumed.Spec.NodeName = host + + if err := sched.Cache.AssumePod(assumed); err != nil { + klog.ErrorS(err, "Scheduler cache AssumePod failed") + return err + } + // if "assumed" is a nominated pod, we should remove it from internal cache + if sched.SchedulingQueue != nil { + sched.SchedulingQueue.DeleteNominatedPodIfExists(assumed) + } + + return nil +} + +// bind binds a pod to a given node defined in a binding object. +// The precedence for binding is: (1) extenders and (2) framework plugins. +// We expect this to run asynchronously, so we handle binding metrics internally. +func (sched *Scheduler) bind(ctx context.Context, fwk framework.Framework, assumed *v1.Pod, targetNode string, state *framework.CycleState) (err error) { + defer func() { + sched.finishBinding(fwk, assumed, targetNode, err) + }() + + bound, err := sched.extendersBinding(assumed, targetNode) + if bound { + return err + } + bindStatus := fwk.RunBindPlugins(ctx, state, assumed, targetNode) + if bindStatus.IsSuccess() { + return nil + } + if bindStatus.Code() == framework.Error { + return bindStatus.AsError() + } + return fmt.Errorf("bind status: %s, %v", bindStatus.Code().String(), bindStatus.Message()) +} + +// TODO(#87159): Move this to a Plugin. +func (sched *Scheduler) extendersBinding(pod *v1.Pod, node string) (bool, error) { + for _, extender := range sched.Extenders { + if !extender.IsBinder() || !extender.IsInterested(pod) { + continue + } + return true, extender.Bind(&v1.Binding{ + ObjectMeta: metav1.ObjectMeta{Namespace: pod.Namespace, Name: pod.Name, UID: pod.UID}, + Target: v1.ObjectReference{Kind: "Node", Name: node}, + }) + } + return false, nil +} + +func (sched *Scheduler) finishBinding(fwk framework.Framework, assumed *v1.Pod, targetNode string, err error) { + if finErr := sched.Cache.FinishBinding(assumed); finErr != nil { + klog.ErrorS(finErr, "Scheduler cache FinishBinding failed") + } + if err != nil { + klog.V(1).InfoS("Failed to bind pod", "pod", klog.KObj(assumed)) + return + } + + fwk.EventRecorder().Eventf(assumed, nil, v1.EventTypeNormal, "Scheduled", "Binding", "Successfully assigned %v/%v to %v", assumed.Namespace, assumed.Name, targetNode) +} + +func getAttemptsLabel(p *framework.QueuedPodInfo) string { + // We breakdown the pod scheduling duration by attempts capped to a limit + // to avoid ending up with a high cardinality metric. + if p.Attempts >= 15 { + return "15+" + } + return strconv.Itoa(p.Attempts) +} + +// handleSchedulingFailure records an event for the pod that indicates the +// pod has failed to schedule. Also, update the pod condition and nominated node name if set. +func (sched *Scheduler) handleSchedulingFailure(fwk framework.Framework, podInfo *framework.QueuedPodInfo, err error, reason string, nominatingInfo *framework.NominatingInfo) { + sched.Error(podInfo, err) + + // Update the scheduling queue with the nominated pod information. Without + // this, there would be a race condition between the next scheduling cycle + // and the time the scheduler receives a Pod Update for the nominated pod. + // Here we check for nil only for tests. 
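// A small sketch of the bind precedence stated above (extenders first, then the
// framework's bind plugins): the first binder extender interested in the pod handles the
// binding and ends the search; if none is interested, the caller falls back to the bind
// plugins. sketchBinder is a hypothetical stand-in interface.
type sketchBinder interface {
	IsBinder() bool
	IsInterested(podName string) bool
	Bind(podName, node string) error
}

func bindViaExtenders(binders []sketchBinder, podName, node string) (handled bool, err error) {
	for _, b := range binders {
		if !b.IsBinder() || !b.IsInterested(podName) {
			continue
		}
		// The first matching binder wins; its result is final even on error.
		return true, b.Bind(podName, node)
	}
	return false, nil
}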
+ if sched.SchedulingQueue != nil { + sched.SchedulingQueue.AddNominatedPod(podInfo.PodInfo, nominatingInfo) + } + + pod := podInfo.Pod + msg := truncateMessage(err.Error()) + fwk.EventRecorder().Eventf(pod, nil, v1.EventTypeWarning, "FailedScheduling", "Scheduling", msg) + if err := updatePod(sched.client, pod, &v1.PodCondition{ + Type: v1.PodScheduled, + Status: v1.ConditionFalse, + Reason: reason, + Message: err.Error(), + }, nominatingInfo); err != nil { + klog.ErrorS(err, "Error updating pod", "pod", klog.KObj(pod)) + } +} + +// truncateMessage truncates a message if it hits the NoteLengthLimit. +func truncateMessage(message string) string { + max := validation.NoteLengthLimit + if len(message) <= max { + return message + } + suffix := " ..." + return message[:max-len(suffix)] + suffix +} + +func updatePod(client clientset.Interface, pod *v1.Pod, condition *v1.PodCondition, nominatingInfo *framework.NominatingInfo) error { + klog.V(3).InfoS("Updating pod condition", "pod", klog.KObj(pod), "conditionType", condition.Type, "conditionStatus", condition.Status, "conditionReason", condition.Reason) + podStatusCopy := pod.Status.DeepCopy() + // NominatedNodeName is updated only if we are trying to set it, and the value is + // different from the existing one. + nnnNeedsUpdate := nominatingInfo.Mode() == framework.ModeOverride && pod.Status.NominatedNodeName != nominatingInfo.NominatedNodeName + if !podutil.UpdatePodCondition(podStatusCopy, condition) && !nnnNeedsUpdate { + return nil + } + if nnnNeedsUpdate { + podStatusCopy.NominatedNodeName = nominatingInfo.NominatedNodeName + } + return util.PatchPodStatus(client, pod, podStatusCopy) +} diff --git a/pkg/scheduler/schedule_one_test.go b/pkg/scheduler/schedule_one_test.go new file mode 100644 index 00000000000..5f4801946f7 --- /dev/null +++ b/pkg/scheduler/schedule_one_test.go @@ -0,0 +1,2878 @@ +/* +Copyright 2014 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package scheduler + +import ( + "context" + "errors" + "fmt" + "math" + "reflect" + "regexp" + "strconv" + "sync" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + v1 "k8s.io/api/core/v1" + eventsv1 "k8s.io/api/events/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/informers" + clientsetfake "k8s.io/client-go/kubernetes/fake" + "k8s.io/client-go/kubernetes/scheme" + clienttesting "k8s.io/client-go/testing" + clientcache "k8s.io/client-go/tools/cache" + "k8s.io/client-go/tools/events" + "k8s.io/component-helpers/storage/volume" + extenderv1 "k8s.io/kube-scheduler/extender/v1" + schedulerapi "k8s.io/kubernetes/pkg/scheduler/apis/config" + "k8s.io/kubernetes/pkg/scheduler/framework" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultbinder" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeports" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/podtopologyspread" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/queuesort" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/selectorspread" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding" + frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime" + internalcache "k8s.io/kubernetes/pkg/scheduler/internal/cache" + fakecache "k8s.io/kubernetes/pkg/scheduler/internal/cache/fake" + internalqueue "k8s.io/kubernetes/pkg/scheduler/internal/queue" + "k8s.io/kubernetes/pkg/scheduler/profile" + st "k8s.io/kubernetes/pkg/scheduler/testing" + schedutil "k8s.io/kubernetes/pkg/scheduler/util" + "k8s.io/utils/pointer" +) + +const ( + testSchedulerName = "test-scheduler" +) + +var ( + emptySnapshot = internalcache.NewEmptySnapshot() + podTopologySpreadFunc = frameworkruntime.FactoryAdapter(feature.Features{}, podtopologyspread.New) + errPrioritize = fmt.Errorf("priority map encounters an error") +) + +type mockScheduleResult struct { + result ScheduleResult + err error +} + +type fakeExtender struct { + isBinder bool + interestedPodName string + ignorable bool + gotBind bool +} + +func (f *fakeExtender) Name() string { + return "fakeExtender" +} + +func (f *fakeExtender) IsIgnorable() bool { + return f.ignorable +} + +func (f *fakeExtender) ProcessPreemption( + _ *v1.Pod, + _ map[string]*extenderv1.Victims, + _ framework.NodeInfoLister, +) (map[string]*extenderv1.Victims, error) { + return nil, nil +} + +func (f *fakeExtender) SupportsPreemption() bool { + return false +} + +func (f *fakeExtender) Filter(pod *v1.Pod, nodes []*v1.Node) ([]*v1.Node, extenderv1.FailedNodesMap, extenderv1.FailedNodesMap, error) { + return nil, nil, nil, nil +} + +func (f *fakeExtender) Prioritize( + _ *v1.Pod, + _ []*v1.Node, +) (hostPriorities *extenderv1.HostPriorityList, weight int64, err error) { + return nil, 0, nil +} + +func (f *fakeExtender) Bind(binding *v1.Binding) error { + if f.isBinder { + f.gotBind = true + return nil + } + return errors.New("not a binder") +} + +func (f *fakeExtender) IsBinder() bool { + return f.isBinder +} + +func (f *fakeExtender) IsInterested(pod *v1.Pod) bool { + return pod != nil && pod.Name == f.interestedPodName +} + +type falseMapPlugin struct{} + +func newFalseMapPlugin() frameworkruntime.PluginFactory { + return func(_ runtime.Object, 
_ framework.Handle) (framework.Plugin, error) { + return &falseMapPlugin{}, nil + } +} + +func (pl *falseMapPlugin) Name() string { + return "FalseMap" +} + +func (pl *falseMapPlugin) Score(_ context.Context, _ *framework.CycleState, _ *v1.Pod, _ string) (int64, *framework.Status) { + return 0, framework.AsStatus(errPrioritize) +} + +func (pl *falseMapPlugin) ScoreExtensions() framework.ScoreExtensions { + return nil +} + +type numericMapPlugin struct{} + +func newNumericMapPlugin() frameworkruntime.PluginFactory { + return func(_ runtime.Object, _ framework.Handle) (framework.Plugin, error) { + return &numericMapPlugin{}, nil + } +} + +func (pl *numericMapPlugin) Name() string { + return "NumericMap" +} + +func (pl *numericMapPlugin) Score(_ context.Context, _ *framework.CycleState, _ *v1.Pod, nodeName string) (int64, *framework.Status) { + score, err := strconv.Atoi(nodeName) + if err != nil { + return 0, framework.NewStatus(framework.Error, fmt.Sprintf("Error converting nodename to int: %+v", nodeName)) + } + return int64(score), nil +} + +func (pl *numericMapPlugin) ScoreExtensions() framework.ScoreExtensions { + return nil +} + +// NewNoPodsFilterPlugin initializes a noPodsFilterPlugin and returns it. +func NewNoPodsFilterPlugin(_ runtime.Object, _ framework.Handle) (framework.Plugin, error) { + return &noPodsFilterPlugin{}, nil +} + +type reverseNumericMapPlugin struct{} + +func (pl *reverseNumericMapPlugin) Name() string { + return "ReverseNumericMap" +} + +func (pl *reverseNumericMapPlugin) Score(_ context.Context, _ *framework.CycleState, _ *v1.Pod, nodeName string) (int64, *framework.Status) { + score, err := strconv.Atoi(nodeName) + if err != nil { + return 0, framework.NewStatus(framework.Error, fmt.Sprintf("Error converting nodename to int: %+v", nodeName)) + } + return int64(score), nil +} + +func (pl *reverseNumericMapPlugin) ScoreExtensions() framework.ScoreExtensions { + return pl +} + +func (pl *reverseNumericMapPlugin) NormalizeScore(_ context.Context, _ *framework.CycleState, _ *v1.Pod, nodeScores framework.NodeScoreList) *framework.Status { + var maxScore float64 + minScore := math.MaxFloat64 + + for _, hostPriority := range nodeScores { + maxScore = math.Max(maxScore, float64(hostPriority.Score)) + minScore = math.Min(minScore, float64(hostPriority.Score)) + } + for i, hostPriority := range nodeScores { + nodeScores[i] = framework.NodeScore{ + Name: hostPriority.Name, + Score: int64(maxScore + minScore - float64(hostPriority.Score)), + } + } + return nil +} + +func newReverseNumericMapPlugin() frameworkruntime.PluginFactory { + return func(_ runtime.Object, _ framework.Handle) (framework.Plugin, error) { + return &reverseNumericMapPlugin{}, nil + } +} + +type trueMapPlugin struct{} + +func (pl *trueMapPlugin) Name() string { + return "TrueMap" +} + +func (pl *trueMapPlugin) Score(_ context.Context, _ *framework.CycleState, _ *v1.Pod, _ string) (int64, *framework.Status) { + return 1, nil +} + +func (pl *trueMapPlugin) ScoreExtensions() framework.ScoreExtensions { + return pl +} + +func (pl *trueMapPlugin) NormalizeScore(_ context.Context, _ *framework.CycleState, _ *v1.Pod, nodeScores framework.NodeScoreList) *framework.Status { + for _, host := range nodeScores { + if host.Name == "" { + return framework.NewStatus(framework.Error, "unexpected empty host name") + } + } + return nil +} + +func newTrueMapPlugin() frameworkruntime.PluginFactory { + return func(_ runtime.Object, _ framework.Handle) (framework.Plugin, error) { + return &trueMapPlugin{}, nil + } +} + +type 
noPodsFilterPlugin struct{} + +// Name returns name of the plugin. +func (pl *noPodsFilterPlugin) Name() string { + return "NoPodsFilter" +} + +// Filter invoked at the filter extension point. +func (pl *noPodsFilterPlugin) Filter(_ context.Context, _ *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status { + if len(nodeInfo.Pods) == 0 { + return nil + } + return framework.NewStatus(framework.Unschedulable, st.ErrReasonFake) +} + +type fakeNodeSelectorArgs struct { + NodeName string `json:"nodeName"` +} + +type fakeNodeSelector struct { + fakeNodeSelectorArgs +} + +func (s *fakeNodeSelector) Name() string { + return "FakeNodeSelector" +} + +func (s *fakeNodeSelector) Filter(_ context.Context, _ *framework.CycleState, _ *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status { + if nodeInfo.Node().Name != s.NodeName { + return framework.NewStatus(framework.UnschedulableAndUnresolvable) + } + return nil +} + +func newFakeNodeSelector(args runtime.Object, _ framework.Handle) (framework.Plugin, error) { + pl := &fakeNodeSelector{} + if err := frameworkruntime.DecodeInto(args, &pl.fakeNodeSelectorArgs); err != nil { + return nil, err + } + return pl, nil +} + +type TestPlugin struct { + name string +} + +var _ framework.ScorePlugin = &TestPlugin{} +var _ framework.FilterPlugin = &TestPlugin{} + +func (t *TestPlugin) Name() string { + return t.name +} + +func (t *TestPlugin) Score(ctx context.Context, state *framework.CycleState, p *v1.Pod, nodeName string) (int64, *framework.Status) { + return 1, nil +} + +func (t *TestPlugin) ScoreExtensions() framework.ScoreExtensions { + return nil +} + +func (t *TestPlugin) Filter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status { + return nil +} + +func TestSchedulerMultipleProfilesScheduling(t *testing.T) { + nodes := []runtime.Object{ + st.MakeNode().Name("machine1").UID("machine1").Obj(), + st.MakeNode().Name("machine2").UID("machine2").Obj(), + st.MakeNode().Name("machine3").UID("machine3").Obj(), + } + pods := []*v1.Pod{ + st.MakePod().Name("pod1").UID("pod1").SchedulerName("match-machine3").Obj(), + st.MakePod().Name("pod2").UID("pod2").SchedulerName("match-machine2").Obj(), + st.MakePod().Name("pod3").UID("pod3").SchedulerName("match-machine2").Obj(), + st.MakePod().Name("pod4").UID("pod4").SchedulerName("match-machine3").Obj(), + } + wantBindings := map[string]string{ + "pod1": "machine3", + "pod2": "machine2", + "pod3": "machine2", + "pod4": "machine3", + } + wantControllers := map[string]string{ + "pod1": "match-machine3", + "pod2": "match-machine2", + "pod3": "match-machine2", + "pod4": "match-machine3", + } + + // Set up scheduler for the 3 nodes. + // We use a fake filter that only allows one particular node. We create two + // profiles, each with a different node in the filter configuration. + objs := append([]runtime.Object{ + &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: ""}}}, nodes...) + client := clientsetfake.NewSimpleClientset(objs...) 
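+	// Note: the fake clientset above serves as the API server for this test: it seeds the test objects (namespace and nodes), backs the event broadcaster created below, and receives the binding requests intercepted by the reactor.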
+ broadcaster := events.NewBroadcaster(&events.EventSinkImpl{Interface: client.EventsV1()}) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + informerFactory := informers.NewSharedInformerFactory(client, 0) + sched, err := New( + client, + informerFactory, + nil, + profile.NewRecorderFactory(broadcaster), + ctx.Done(), + WithProfiles( + schedulerapi.KubeSchedulerProfile{SchedulerName: "match-machine2", + Plugins: &schedulerapi.Plugins{ + Filter: schedulerapi.PluginSet{Enabled: []schedulerapi.Plugin{{Name: "FakeNodeSelector"}}}, + QueueSort: schedulerapi.PluginSet{Enabled: []schedulerapi.Plugin{{Name: "PrioritySort"}}}, + Bind: schedulerapi.PluginSet{Enabled: []schedulerapi.Plugin{{Name: "DefaultBinder"}}}, + }, + PluginConfig: []schedulerapi.PluginConfig{ + { + Name: "FakeNodeSelector", + Args: &runtime.Unknown{Raw: []byte(`{"nodeName":"machine2"}`)}, + }, + }, + }, + schedulerapi.KubeSchedulerProfile{ + SchedulerName: "match-machine3", + Plugins: &schedulerapi.Plugins{ + Filter: schedulerapi.PluginSet{Enabled: []schedulerapi.Plugin{{Name: "FakeNodeSelector"}}}, + QueueSort: schedulerapi.PluginSet{Enabled: []schedulerapi.Plugin{{Name: "PrioritySort"}}}, + Bind: schedulerapi.PluginSet{Enabled: []schedulerapi.Plugin{{Name: "DefaultBinder"}}}, + }, + PluginConfig: []schedulerapi.PluginConfig{ + { + Name: "FakeNodeSelector", + Args: &runtime.Unknown{Raw: []byte(`{"nodeName":"machine3"}`)}, + }, + }, + }, + ), + WithFrameworkOutOfTreeRegistry(frameworkruntime.Registry{ + "FakeNodeSelector": newFakeNodeSelector, + }), + ) + if err != nil { + t.Fatal(err) + } + + // Capture the bindings and events' controllers. + var wg sync.WaitGroup + wg.Add(2 * len(pods)) + bindings := make(map[string]string) + client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + if action.GetSubresource() != "binding" { + return false, nil, nil + } + binding := action.(clienttesting.CreateAction).GetObject().(*v1.Binding) + bindings[binding.Name] = binding.Target.Name + wg.Done() + return true, binding, nil + }) + controllers := make(map[string]string) + stopFn := broadcaster.StartEventWatcher(func(obj runtime.Object) { + e, ok := obj.(*eventsv1.Event) + if !ok || e.Reason != "Scheduled" { + return + } + controllers[e.Regarding.Name] = e.ReportingController + wg.Done() + }) + defer stopFn() + + // Run scheduler. + informerFactory.Start(ctx.Done()) + informerFactory.WaitForCacheSync(ctx.Done()) + go sched.Run(ctx) + + // Send pods to be scheduled. + for _, p := range pods { + _, err := client.CoreV1().Pods("").Create(ctx, p, metav1.CreateOptions{}) + if err != nil { + t.Fatal(err) + } + } + wg.Wait() + + // Verify correct bindings and reporting controllers. 
+ if diff := cmp.Diff(wantBindings, bindings); diff != "" { + t.Errorf("pods were scheduled incorrectly (-want, +got):\n%s", diff) + } + if diff := cmp.Diff(wantControllers, controllers); diff != "" { + t.Errorf("events were reported with wrong controllers (-want, +got):\n%s", diff) + } +} + +func TestSchedulerScheduleOne(t *testing.T) { + testNode := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}} + client := clientsetfake.NewSimpleClientset(&testNode) + eventBroadcaster := events.NewBroadcaster(&events.EventSinkImpl{Interface: client.EventsV1()}) + errS := errors.New("scheduler") + errB := errors.New("binder") + preBindErr := errors.New("on PreBind") + + table := []struct { + name string + injectBindError error + sendPod *v1.Pod + registerPluginFuncs []st.RegisterPluginFunc + expectErrorPod *v1.Pod + expectForgetPod *v1.Pod + expectAssumedPod *v1.Pod + expectError error + expectBind *v1.Binding + eventReason string + mockResult mockScheduleResult + }{ + { + name: "error reserve pod", + sendPod: podWithID("foo", ""), + mockResult: mockScheduleResult{ScheduleResult{SuggestedHost: testNode.Name, EvaluatedNodes: 1, FeasibleNodes: 1}, nil}, + registerPluginFuncs: []st.RegisterPluginFunc{ + st.RegisterReservePlugin("FakeReserve", st.NewFakeReservePlugin(framework.NewStatus(framework.Error, "reserve error"))), + }, + expectErrorPod: podWithID("foo", testNode.Name), + expectForgetPod: podWithID("foo", testNode.Name), + expectAssumedPod: podWithID("foo", testNode.Name), + expectError: fmt.Errorf(`running Reserve plugin "FakeReserve": %w`, errors.New("reserve error")), + eventReason: "FailedScheduling", + }, + { + name: "error permit pod", + sendPod: podWithID("foo", ""), + mockResult: mockScheduleResult{ScheduleResult{SuggestedHost: testNode.Name, EvaluatedNodes: 1, FeasibleNodes: 1}, nil}, + registerPluginFuncs: []st.RegisterPluginFunc{ + st.RegisterPermitPlugin("FakePermit", st.NewFakePermitPlugin(framework.NewStatus(framework.Error, "permit error"), time.Minute)), + }, + expectErrorPod: podWithID("foo", testNode.Name), + expectForgetPod: podWithID("foo", testNode.Name), + expectAssumedPod: podWithID("foo", testNode.Name), + expectError: fmt.Errorf(`running Permit plugin "FakePermit": %w`, errors.New("permit error")), + eventReason: "FailedScheduling", + }, + { + name: "error prebind pod", + sendPod: podWithID("foo", ""), + mockResult: mockScheduleResult{ScheduleResult{SuggestedHost: testNode.Name, EvaluatedNodes: 1, FeasibleNodes: 1}, nil}, + registerPluginFuncs: []st.RegisterPluginFunc{ + st.RegisterPreBindPlugin("FakePreBind", st.NewFakePreBindPlugin(framework.AsStatus(preBindErr))), + }, + expectErrorPod: podWithID("foo", testNode.Name), + expectForgetPod: podWithID("foo", testNode.Name), + expectAssumedPod: podWithID("foo", testNode.Name), + expectError: fmt.Errorf(`running PreBind plugin "FakePreBind": %w`, preBindErr), + eventReason: "FailedScheduling", + }, + { + name: "bind assumed pod scheduled", + sendPod: podWithID("foo", ""), + mockResult: mockScheduleResult{ScheduleResult{SuggestedHost: testNode.Name, EvaluatedNodes: 1, FeasibleNodes: 1}, nil}, + expectBind: &v1.Binding{ObjectMeta: metav1.ObjectMeta{Name: "foo", UID: types.UID("foo")}, Target: v1.ObjectReference{Kind: "Node", Name: testNode.Name}}, + expectAssumedPod: podWithID("foo", testNode.Name), + eventReason: "Scheduled", + }, + { + name: "error pod failed scheduling", + sendPod: podWithID("foo", ""), + mockResult: mockScheduleResult{ScheduleResult{SuggestedHost: testNode.Name, 
EvaluatedNodes: 1, FeasibleNodes: 1}, errS}, + expectError: errS, + expectErrorPod: podWithID("foo", ""), + eventReason: "FailedScheduling", + }, + { + name: "error bind forget pod failed scheduling", + sendPod: podWithID("foo", ""), + mockResult: mockScheduleResult{ScheduleResult{SuggestedHost: testNode.Name, EvaluatedNodes: 1, FeasibleNodes: 1}, nil}, + expectBind: &v1.Binding{ObjectMeta: metav1.ObjectMeta{Name: "foo", UID: types.UID("foo")}, Target: v1.ObjectReference{Kind: "Node", Name: testNode.Name}}, + expectAssumedPod: podWithID("foo", testNode.Name), + injectBindError: errB, + expectError: fmt.Errorf(`binding rejected: %w`, fmt.Errorf("running Bind plugin %q: %w", "DefaultBinder", errors.New("binder"))), + expectErrorPod: podWithID("foo", testNode.Name), + expectForgetPod: podWithID("foo", testNode.Name), + eventReason: "FailedScheduling", + }, + { + name: "deleting pod", + sendPod: deletingPod("foo"), + mockResult: mockScheduleResult{ScheduleResult{}, nil}, + eventReason: "FailedScheduling", + }, + } + + for _, item := range table { + t.Run(item.name, func(t *testing.T) { + var gotError error + var gotPod *v1.Pod + var gotForgetPod *v1.Pod + var gotAssumedPod *v1.Pod + var gotBinding *v1.Binding + cache := &fakecache.Cache{ + ForgetFunc: func(pod *v1.Pod) { + gotForgetPod = pod + }, + AssumeFunc: func(pod *v1.Pod) { + gotAssumedPod = pod + }, + IsAssumedPodFunc: func(pod *v1.Pod) bool { + if pod == nil || gotAssumedPod == nil { + return false + } + return pod.UID == gotAssumedPod.UID + }, + } + client := clientsetfake.NewSimpleClientset(item.sendPod) + client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + if action.GetSubresource() != "binding" { + return false, nil, nil + } + gotBinding = action.(clienttesting.CreateAction).GetObject().(*v1.Binding) + return true, gotBinding, item.injectBindError + }) + registerPluginFuncs := append(item.registerPluginFuncs, + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + ) + fwk, err := st.NewFramework(registerPluginFuncs, + testSchedulerName, + frameworkruntime.WithClientSet(client), + frameworkruntime.WithEventRecorder(eventBroadcaster.NewRecorder(scheme.Scheme, testSchedulerName))) + if err != nil { + t.Fatal(err) + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + s := newScheduler( + cache, + nil, + func() *framework.QueuedPodInfo { + return &framework.QueuedPodInfo{PodInfo: framework.NewPodInfo(item.sendPod)} + }, + func(p *framework.QueuedPodInfo, err error) { + gotPod = p.Pod + gotError = err + }, + nil, + internalqueue.NewTestQueue(ctx, nil), + profile.Map{ + testSchedulerName: fwk, + }, + client, + nil, + 0) + s.SchedulePod = func(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) (ScheduleResult, error) { + return item.mockResult.result, item.mockResult.err + } + called := make(chan struct{}) + stopFunc := eventBroadcaster.StartEventWatcher(func(obj runtime.Object) { + e, _ := obj.(*eventsv1.Event) + if e.Reason != item.eventReason { + t.Errorf("got event %v, want %v", e.Reason, item.eventReason) + } + close(called) + }) + s.scheduleOne(ctx) + <-called + if e, a := item.expectAssumedPod, gotAssumedPod; !reflect.DeepEqual(e, a) { + t.Errorf("assumed pod: wanted %v, got %v", e, a) + } + if e, a := item.expectErrorPod, gotPod; !reflect.DeepEqual(e, a) { + t.Errorf("error pod: wanted %v, got %v", e, a) + } + if e, a := 
item.expectForgetPod, gotForgetPod; !reflect.DeepEqual(e, a) { + t.Errorf("forget pod: wanted %v, got %v", e, a) + } + if e, a := item.expectError, gotError; !reflect.DeepEqual(e, a) { + t.Errorf("error: wanted %v, got %v", e, a) + } + if diff := cmp.Diff(item.expectBind, gotBinding); diff != "" { + t.Errorf("got binding diff (-want, +got): %s", diff) + } + stopFunc() + }) + } +} + +func TestSchedulerNoPhantomPodAfterExpire(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc) + scache := internalcache.New(100*time.Millisecond, ctx.Done()) + pod := podWithPort("pod.Name", "", 8080) + node := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}} + scache.AddNode(&node) + + fns := []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + st.RegisterPluginAsExtensions(nodeports.Name, nodeports.New, "Filter", "PreFilter"), + } + scheduler, bindingChan, errChan := setupTestSchedulerWithOnePodOnNode(ctx, t, queuedPodStore, scache, pod, &node, fns...) + + waitPodExpireChan := make(chan struct{}) + timeout := make(chan struct{}) + go func() { + for { + select { + case <-timeout: + return + default: + } + pods, err := scache.PodCount() + if err != nil { + errChan <- fmt.Errorf("cache.List failed: %v", err) + return + } + if pods == 0 { + close(waitPodExpireChan) + return + } + time.Sleep(100 * time.Millisecond) + } + }() + // Wait for the assumed pod to expire. + select { + case err := <-errChan: + t.Fatal(err) + case <-waitPodExpireChan: + case <-time.After(wait.ForeverTestTimeout): + close(timeout) + t.Fatalf("timeout while waiting for pod to expire after %v", wait.ForeverTestTimeout) + } + + // We use conflicting pod ports to incur a fit predicate failure if the first pod is not removed. + secondPod := podWithPort("bar", "", 8080) + queuedPodStore.Add(secondPod) + scheduler.scheduleOne(ctx) + select { + case b := <-bindingChan: + expectBinding := &v1.Binding{ + ObjectMeta: metav1.ObjectMeta{Name: "bar", UID: types.UID("bar")}, + Target: v1.ObjectReference{Kind: "Node", Name: node.Name}, + } + if !reflect.DeepEqual(expectBinding, b) { + t.Errorf("binding want=%v, got=%v", expectBinding, b) + } + case <-time.After(wait.ForeverTestTimeout): + t.Fatalf("timeout in binding after %v", wait.ForeverTestTimeout) + } +} + +func TestSchedulerNoPhantomPodAfterDelete(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc) + scache := internalcache.New(10*time.Minute, ctx.Done()) + firstPod := podWithPort("pod.Name", "", 8080) + node := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}} + scache.AddNode(&node) + fns := []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + st.RegisterPluginAsExtensions(nodeports.Name, nodeports.New, "Filter", "PreFilter"), + } + scheduler, bindingChan, errChan := setupTestSchedulerWithOnePodOnNode(ctx, t, queuedPodStore, scache, firstPod, &node, fns...) + + // We use conflicting pod ports to incur a fit predicate failure. 
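+	// Both pods request port 8080, so the nodeports plugin should reject the second pod for as long as the first (assumed) pod is still present in the scheduler cache.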
+ secondPod := podWithPort("bar", "", 8080) + queuedPodStore.Add(secondPod) + // queuedPodStore: [bar:8080] + // cache: [(assumed)foo:8080] + + scheduler.scheduleOne(ctx) + select { + case err := <-errChan: + expectErr := &framework.FitError{ + Pod: secondPod, + NumAllNodes: 1, + Diagnosis: framework.Diagnosis{ + NodeToStatusMap: framework.NodeToStatusMap{ + node.Name: framework.NewStatus(framework.Unschedulable, nodeports.ErrReason).WithFailedPlugin(nodeports.Name), + }, + UnschedulablePlugins: sets.NewString(nodeports.Name), + }, + } + if !reflect.DeepEqual(expectErr, err) { + t.Errorf("err want=%v, got=%v", expectErr, err) + } + case <-time.After(wait.ForeverTestTimeout): + t.Fatalf("timeout in fitting after %v", wait.ForeverTestTimeout) + } + + // We mimic the cache's behavior when a pod is removed by the user. + // Note: if the cache's TTL were very short, the first pod would expire + // and be removed on its own (without any explicit action on the cache). Even in that case, + // explicitly calling AddPod still corrects the behavior. + firstPod.Spec.NodeName = node.Name + if err := scache.AddPod(firstPod); err != nil { + t.Fatalf("err: %v", err) + } + if err := scache.RemovePod(firstPod); err != nil { + t.Fatalf("err: %v", err) + } + + queuedPodStore.Add(secondPod) + scheduler.scheduleOne(ctx) + select { + case b := <-bindingChan: + expectBinding := &v1.Binding{ + ObjectMeta: metav1.ObjectMeta{Name: "bar", UID: types.UID("bar")}, + Target: v1.ObjectReference{Kind: "Node", Name: node.Name}, + } + if !reflect.DeepEqual(expectBinding, b) { + t.Errorf("binding want=%v, got=%v", expectBinding, b) + } + case <-time.After(wait.ForeverTestTimeout): + t.Fatalf("timeout in binding after %v", wait.ForeverTestTimeout) + } +} + +func TestSchedulerFailedSchedulingReasons(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc) + scache := internalcache.New(10*time.Minute, ctx.Done()) + + // Design the baseline for the pods; we will make nodes that don't fit it later. + var cpu = int64(4) + var mem = int64(500) + podWithTooBigResourceRequests := podWithResources("bar", "", v1.ResourceList{ + v1.ResourceCPU: *(resource.NewQuantity(cpu, resource.DecimalSI)), + v1.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)), + }, v1.ResourceList{ + v1.ResourceCPU: *(resource.NewQuantity(cpu, resource.DecimalSI)), + v1.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)), + }) + + // Create several nodes that cannot schedule the above pod. + var nodes []*v1.Node + var objects []runtime.Object + for i := 0; i < 100; i++ { + uid := fmt.Sprintf("machine%v", i) + node := v1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: uid, UID: types.UID(uid)}, + Status: v1.NodeStatus{ + Capacity: v1.ResourceList{ + v1.ResourceCPU: *(resource.NewQuantity(cpu/2, resource.DecimalSI)), + v1.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)), + v1.ResourcePods: *(resource.NewQuantity(10, resource.DecimalSI)), + }, + Allocatable: v1.ResourceList{ + v1.ResourceCPU: *(resource.NewQuantity(cpu/2, resource.DecimalSI)), + v1.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)), + v1.ResourcePods: *(resource.NewQuantity(10, resource.DecimalSI)), + }}, + } + scache.AddNode(&node) + nodes = append(nodes, &node) + objects = append(objects, &node) + } + + // Create expected failure reasons for all the nodes. 
Hopefully they will get rolled up into a non-spammy summary. + failedNodeStatuses := framework.NodeToStatusMap{} + for _, node := range nodes { + failedNodeStatuses[node.Name] = framework.NewStatus( + framework.Unschedulable, + fmt.Sprintf("Insufficient %v", v1.ResourceCPU), + fmt.Sprintf("Insufficient %v", v1.ResourceMemory), + ).WithFailedPlugin(noderesources.Name) + } + fns := []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + st.RegisterPluginAsExtensions(noderesources.Name, frameworkruntime.FactoryAdapter(feature.Features{}, noderesources.NewFit), "Filter", "PreFilter"), + } + + informerFactory := informers.NewSharedInformerFactory(clientsetfake.NewSimpleClientset(objects...), 0) + scheduler, _, errChan := setupTestScheduler(ctx, queuedPodStore, scache, informerFactory, nil, fns...) + + queuedPodStore.Add(podWithTooBigResourceRequests) + scheduler.scheduleOne(ctx) + select { + case err := <-errChan: + expectErr := &framework.FitError{ + Pod: podWithTooBigResourceRequests, + NumAllNodes: len(nodes), + Diagnosis: framework.Diagnosis{ + NodeToStatusMap: failedNodeStatuses, + UnschedulablePlugins: sets.NewString(noderesources.Name), + }, + } + if len(fmt.Sprint(expectErr)) > 150 { + t.Errorf("message is too spammy! %v", len(fmt.Sprint(expectErr))) + } + if !reflect.DeepEqual(expectErr, err) { + t.Errorf("\n err \nWANT=%+v,\nGOT=%+v", expectErr, err) + } + case <-time.After(wait.ForeverTestTimeout): + t.Fatalf("timeout after %v", wait.ForeverTestTimeout) + } +} + +func TestSchedulerWithVolumeBinding(t *testing.T) { + findErr := fmt.Errorf("find err") + assumeErr := fmt.Errorf("assume err") + bindErr := fmt.Errorf("bind err") + client := clientsetfake.NewSimpleClientset() + + eventBroadcaster := events.NewBroadcaster(&events.EventSinkImpl{Interface: client.EventsV1()}) + + // This can be small because we wait for the pod to finish scheduling first. + chanTimeout := 2 * time.Second + + table := []struct { + name string + expectError error + expectPodBind *v1.Binding + expectAssumeCalled bool + expectBindCalled bool + eventReason string + volumeBinderConfig *volumebinding.FakeVolumeBinderConfig + }{ + { + name: "all bound", + volumeBinderConfig: &volumebinding.FakeVolumeBinderConfig{ + AllBound: true, + }, + expectAssumeCalled: true, + expectPodBind: &v1.Binding{ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "foo-ns", UID: types.UID("foo")}, Target: v1.ObjectReference{Kind: "Node", Name: "machine1"}}, + eventReason: "Scheduled", + }, + { + name: "bound/invalid pv affinity", + volumeBinderConfig: &volumebinding.FakeVolumeBinderConfig{ + AllBound: true, + FindReasons: volumebinding.ConflictReasons{volumebinding.ErrReasonNodeConflict}, + }, + eventReason: "FailedScheduling", + expectError: makePredicateError("1 node(s) had volume node affinity conflict"), + }, + { + name: "unbound/no matches", + volumeBinderConfig: &volumebinding.FakeVolumeBinderConfig{ + FindReasons: volumebinding.ConflictReasons{volumebinding.ErrReasonBindConflict}, + }, + eventReason: "FailedScheduling", + expectError: makePredicateError("1 node(s) didn't find available persistent volumes to bind"), + }, + { + name: "bound and unbound unsatisfied", + volumeBinderConfig: &volumebinding.FakeVolumeBinderConfig{ + FindReasons: volumebinding.ConflictReasons{volumebinding.ErrReasonBindConflict, volumebinding.ErrReasonNodeConflict}, + }, + eventReason: "FailedScheduling", + expectError: makePredicateError("1 node(s) didn't find 
available persistent volumes to bind, 1 node(s) had volume node affinity conflict"), + }, + { + name: "unbound/found matches/bind succeeds", + volumeBinderConfig: &volumebinding.FakeVolumeBinderConfig{}, + expectAssumeCalled: true, + expectBindCalled: true, + expectPodBind: &v1.Binding{ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "foo-ns", UID: types.UID("foo")}, Target: v1.ObjectReference{Kind: "Node", Name: "machine1"}}, + eventReason: "Scheduled", + }, + { + name: "predicate error", + volumeBinderConfig: &volumebinding.FakeVolumeBinderConfig{ + FindErr: findErr, + }, + eventReason: "FailedScheduling", + expectError: fmt.Errorf("running %q filter plugin: %v", volumebinding.Name, findErr), + }, + { + name: "assume error", + volumeBinderConfig: &volumebinding.FakeVolumeBinderConfig{ + AssumeErr: assumeErr, + }, + expectAssumeCalled: true, + eventReason: "FailedScheduling", + expectError: fmt.Errorf("running Reserve plugin %q: %w", volumebinding.Name, assumeErr), + }, + { + name: "bind error", + volumeBinderConfig: &volumebinding.FakeVolumeBinderConfig{ + BindErr: bindErr, + }, + expectAssumeCalled: true, + expectBindCalled: true, + eventReason: "FailedScheduling", + expectError: fmt.Errorf("running PreBind plugin %q: %w", volumebinding.Name, bindErr), + }, + } + + for _, item := range table { + t.Run(item.name, func(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + fakeVolumeBinder := volumebinding.NewFakeVolumeBinder(item.volumeBinderConfig) + s, bindingChan, errChan := setupTestSchedulerWithVolumeBinding(ctx, fakeVolumeBinder, eventBroadcaster) + eventChan := make(chan struct{}) + stopFunc := eventBroadcaster.StartEventWatcher(func(obj runtime.Object) { + e, _ := obj.(*eventsv1.Event) + if e, a := item.eventReason, e.Reason; e != a { + t.Errorf("expected %v, got %v", e, a) + } + close(eventChan) + }) + s.scheduleOne(ctx) + // Wait for pod to succeed or fail scheduling + select { + case <-eventChan: + case <-time.After(wait.ForeverTestTimeout): + t.Fatalf("scheduling timeout after %v", wait.ForeverTestTimeout) + } + stopFunc() + // Wait for scheduling to return an error or succeed binding. + var ( + gotErr error + gotBind *v1.Binding + ) + select { + case gotErr = <-errChan: + case gotBind = <-bindingChan: + case <-time.After(chanTimeout): + t.Fatalf("did not receive pod binding or error after %v", chanTimeout) + } + if item.expectError != nil { + if gotErr == nil || item.expectError.Error() != gotErr.Error() { + t.Errorf("err \nWANT=%+v,\nGOT=%+v", item.expectError, gotErr) + } + } else if gotErr != nil { + t.Errorf("err \nWANT=%+v,\nGOT=%+v", item.expectError, gotErr) + } + if !cmp.Equal(item.expectPodBind, gotBind) { + t.Errorf("err \nWANT=%+v,\nGOT=%+v", item.expectPodBind, gotBind) + } + + if item.expectAssumeCalled != fakeVolumeBinder.AssumeCalled { + t.Errorf("expectedAssumeCall %v", item.expectAssumeCalled) + } + + if item.expectBindCalled != fakeVolumeBinder.BindCalled { + t.Errorf("expectedBindCall %v", item.expectBindCalled) + } + }) + } +} + +func TestSchedulerBinding(t *testing.T) { + table := []struct { + podName string + extenders []framework.Extender + wantBinderID int + name string + }{ + { + name: "the extender is not a binder", + podName: "pod0", + extenders: []framework.Extender{ + &fakeExtender{isBinder: false, interestedPodName: "pod0"}, + }, + wantBinderID: -1, // default binding. 
+ }, + { + name: "one of the extenders is a binder and interested in pod", + podName: "pod0", + extenders: []framework.Extender{ + &fakeExtender{isBinder: false, interestedPodName: "pod0"}, + &fakeExtender{isBinder: true, interestedPodName: "pod0"}, + }, + wantBinderID: 1, + }, + { + name: "one of the extenders is a binder, but not interested in pod", + podName: "pod1", + extenders: []framework.Extender{ + &fakeExtender{isBinder: false, interestedPodName: "pod1"}, + &fakeExtender{isBinder: true, interestedPodName: "pod0"}, + }, + wantBinderID: -1, // default binding. + }, + } + + for _, test := range table { + t.Run(test.name, func(t *testing.T) { + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: test.podName, + }, + } + defaultBound := false + client := clientsetfake.NewSimpleClientset(pod) + client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + if action.GetSubresource() == "binding" { + defaultBound = true + } + return false, nil, nil + }) + fwk, err := st.NewFramework([]st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, "", frameworkruntime.WithClientSet(client), frameworkruntime.WithEventRecorder(&events.FakeRecorder{})) + if err != nil { + t.Fatal(err) + } + stop := make(chan struct{}) + defer close(stop) + sched := &Scheduler{ + Extenders: test.extenders, + Cache: internalcache.New(100*time.Millisecond, stop), + nodeInfoSnapshot: nil, + percentageOfNodesToScore: 0, + } + err = sched.bind(context.Background(), fwk, pod, "node", nil) + if err != nil { + t.Error(err) + } + + // Checking default binding. + if wantBound := test.wantBinderID == -1; defaultBound != wantBound { + t.Errorf("got bound with default binding: %v, want %v", defaultBound, wantBound) + } + + // Checking extenders binding. 
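+	// At most one binder should have been invoked: the extender at wantBinderID, or the default binder when wantBinderID is -1.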
+ for i, ext := range test.extenders { + wantBound := i == test.wantBinderID + if gotBound := ext.(*fakeExtender).gotBind; gotBound != wantBound { + t.Errorf("got bound with extender #%d: %v, want %v", i, gotBound, wantBound) + } + } + + }) + } +} + +func TestUpdatePod(t *testing.T) { + tests := []struct { + name string + currentPodConditions []v1.PodCondition + newPodCondition *v1.PodCondition + currentNominatedNodeName string + newNominatingInfo *framework.NominatingInfo + expectedPatchRequests int + expectedPatchDataPattern string + }{ + { + name: "Should make patch request to add pod condition when there are none currently", + currentPodConditions: []v1.PodCondition{}, + newPodCondition: &v1.PodCondition{ + Type: "newType", + Status: "newStatus", + LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 1, 1, 1, 1, time.UTC)), + LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 1, 1, 1, 1, time.UTC)), + Reason: "newReason", + Message: "newMessage", + }, + expectedPatchRequests: 1, + expectedPatchDataPattern: `{"status":{"conditions":\[{"lastProbeTime":"2020-05-13T01:01:01Z","lastTransitionTime":".*","message":"newMessage","reason":"newReason","status":"newStatus","type":"newType"}]}}`, + }, + { + name: "Should make patch request to add a new pod condition when there is already one with another type", + currentPodConditions: []v1.PodCondition{ + { + Type: "someOtherType", + Status: "someOtherTypeStatus", + LastProbeTime: metav1.NewTime(time.Date(2020, 5, 11, 0, 0, 0, 0, time.UTC)), + LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 10, 0, 0, 0, 0, time.UTC)), + Reason: "someOtherTypeReason", + Message: "someOtherTypeMessage", + }, + }, + newPodCondition: &v1.PodCondition{ + Type: "newType", + Status: "newStatus", + LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 1, 1, 1, 1, time.UTC)), + LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 1, 1, 1, 1, time.UTC)), + Reason: "newReason", + Message: "newMessage", + }, + expectedPatchRequests: 1, + expectedPatchDataPattern: `{"status":{"\$setElementOrder/conditions":\[{"type":"someOtherType"},{"type":"newType"}],"conditions":\[{"lastProbeTime":"2020-05-13T01:01:01Z","lastTransitionTime":".*","message":"newMessage","reason":"newReason","status":"newStatus","type":"newType"}]}}`, + }, + { + name: "Should make patch request to update an existing pod condition", + currentPodConditions: []v1.PodCondition{ + { + Type: "currentType", + Status: "currentStatus", + LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 0, 0, 0, 0, time.UTC)), + LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 0, 0, 0, 0, time.UTC)), + Reason: "currentReason", + Message: "currentMessage", + }, + }, + newPodCondition: &v1.PodCondition{ + Type: "currentType", + Status: "newStatus", + LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 1, 1, 1, 1, time.UTC)), + LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 1, 1, 1, 1, time.UTC)), + Reason: "newReason", + Message: "newMessage", + }, + expectedPatchRequests: 1, + expectedPatchDataPattern: `{"status":{"\$setElementOrder/conditions":\[{"type":"currentType"}],"conditions":\[{"lastProbeTime":"2020-05-13T01:01:01Z","lastTransitionTime":".*","message":"newMessage","reason":"newReason","status":"newStatus","type":"currentType"}]}}`, + }, + { + name: "Should make patch request to update an existing pod condition, but the transition time should remain unchanged because the status is the same", + currentPodConditions: []v1.PodCondition{ + { + Type: "currentType", + Status: "currentStatus", + 
LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 0, 0, 0, 0, time.UTC)), + LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 0, 0, 0, 0, time.UTC)), + Reason: "currentReason", + Message: "currentMessage", + }, + }, + newPodCondition: &v1.PodCondition{ + Type: "currentType", + Status: "currentStatus", + LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 1, 1, 1, 1, time.UTC)), + LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 0, 0, 0, 0, time.UTC)), + Reason: "newReason", + Message: "newMessage", + }, + expectedPatchRequests: 1, + expectedPatchDataPattern: `{"status":{"\$setElementOrder/conditions":\[{"type":"currentType"}],"conditions":\[{"lastProbeTime":"2020-05-13T01:01:01Z","message":"newMessage","reason":"newReason","type":"currentType"}]}}`, + }, + { + name: "Should not make patch request if pod condition already exists and is identical and nominated node name is not set", + currentPodConditions: []v1.PodCondition{ + { + Type: "currentType", + Status: "currentStatus", + LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 0, 0, 0, 0, time.UTC)), + LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 0, 0, 0, 0, time.UTC)), + Reason: "currentReason", + Message: "currentMessage", + }, + }, + newPodCondition: &v1.PodCondition{ + Type: "currentType", + Status: "currentStatus", + LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 0, 0, 0, 0, time.UTC)), + LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 0, 0, 0, 0, time.UTC)), + Reason: "currentReason", + Message: "currentMessage", + }, + currentNominatedNodeName: "node1", + expectedPatchRequests: 0, + }, + { + name: "Should make patch request if pod condition already exists and is identical but nominated node name is set and different", + currentPodConditions: []v1.PodCondition{ + { + Type: "currentType", + Status: "currentStatus", + LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 0, 0, 0, 0, time.UTC)), + LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 0, 0, 0, 0, time.UTC)), + Reason: "currentReason", + Message: "currentMessage", + }, + }, + newPodCondition: &v1.PodCondition{ + Type: "currentType", + Status: "currentStatus", + LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 0, 0, 0, 0, time.UTC)), + LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 0, 0, 0, 0, time.UTC)), + Reason: "currentReason", + Message: "currentMessage", + }, + newNominatingInfo: &framework.NominatingInfo{NominatingMode: framework.ModeOverride, NominatedNodeName: "node1"}, + expectedPatchRequests: 1, + expectedPatchDataPattern: `{"status":{"nominatedNodeName":"node1"}}`, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + actualPatchRequests := 0 + var actualPatchData string + cs := &clientsetfake.Clientset{} + cs.AddReactor("patch", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + actualPatchRequests++ + patch := action.(clienttesting.PatchAction) + actualPatchData = string(patch.GetPatch()) + // For this test, we don't care about the result of the patched pod, just that we got the expected + // patch request, so just returning &v1.Pod{} here is OK because scheduler doesn't use the response. 
+ return true, &v1.Pod{}, nil + }) + + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "foo"}, + Status: v1.PodStatus{ + Conditions: test.currentPodConditions, + NominatedNodeName: test.currentNominatedNodeName, + }, + } + + if err := updatePod(cs, pod, test.newPodCondition, test.newNominatingInfo); err != nil { + t.Fatalf("Error calling update: %v", err) + } + + if actualPatchRequests != test.expectedPatchRequests { + t.Fatalf("Actual patch requests (%d) does not equal expected patch requests (%d), actual patch data: %v", actualPatchRequests, test.expectedPatchRequests, actualPatchData) + } + + regex, err := regexp.Compile(test.expectedPatchDataPattern) + if err != nil { + t.Fatalf("Error compiling regexp for %v: %v", test.expectedPatchDataPattern, err) + } + + if test.expectedPatchRequests > 0 && !regex.MatchString(actualPatchData) { + t.Fatalf("Patch data mismatch: Actual was %v, but expected to match regexp %v", actualPatchData, test.expectedPatchDataPattern) + } + }) + } +} + +func TestSelectHost(t *testing.T) { + tests := []struct { + name string + list framework.NodeScoreList + possibleHosts sets.String + expectsErr bool + }{ + { + name: "unique properly ordered scores", + list: []framework.NodeScore{ + {Name: "machine1.1", Score: 1}, + {Name: "machine2.1", Score: 2}, + }, + possibleHosts: sets.NewString("machine2.1"), + expectsErr: false, + }, + { + name: "equal scores", + list: []framework.NodeScore{ + {Name: "machine1.1", Score: 1}, + {Name: "machine1.2", Score: 2}, + {Name: "machine1.3", Score: 2}, + {Name: "machine2.1", Score: 2}, + }, + possibleHosts: sets.NewString("machine1.2", "machine1.3", "machine2.1"), + expectsErr: false, + }, + { + name: "out of order scores", + list: []framework.NodeScore{ + {Name: "machine1.1", Score: 3}, + {Name: "machine1.2", Score: 3}, + {Name: "machine2.1", Score: 2}, + {Name: "machine3.1", Score: 1}, + {Name: "machine1.3", Score: 3}, + }, + possibleHosts: sets.NewString("machine1.1", "machine1.2", "machine1.3"), + expectsErr: false, + }, + { + name: "empty priority list", + list: []framework.NodeScore{}, + possibleHosts: sets.NewString(), + expectsErr: true, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + // increase the randomness + for i := 0; i < 10; i++ { + got, err := selectHost(test.list) + if test.expectsErr { + if err == nil { + t.Error("Unexpected non-error") + } + } else { + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if !test.possibleHosts.Has(got) { + t.Errorf("got %s is not in the possible map %v", got, test.possibleHosts) + } + } + } + }) + } +} + +func TestFindNodesThatPassExtenders(t *testing.T) { + tests := []struct { + name string + extenders []st.FakeExtender + nodes []*v1.Node + filteredNodesStatuses framework.NodeToStatusMap + expectsErr bool + expectedNodes []*v1.Node + expectedStatuses framework.NodeToStatusMap + }{ + { + name: "error", + extenders: []st.FakeExtender{ + { + ExtenderName: "FakeExtender1", + Predicates: []st.FitPredicate{st.ErrorPredicateExtender}, + }, + }, + nodes: makeNodeList([]string{"a"}), + filteredNodesStatuses: make(framework.NodeToStatusMap), + expectsErr: true, + }, + { + name: "success", + extenders: []st.FakeExtender{ + { + ExtenderName: "FakeExtender1", + Predicates: []st.FitPredicate{st.TruePredicateExtender}, + }, + }, + nodes: makeNodeList([]string{"a"}), + filteredNodesStatuses: make(framework.NodeToStatusMap), + expectsErr: false, + expectedNodes: makeNodeList([]string{"a"}), + expectedStatuses: make(framework.NodeToStatusMap), + 
}, + { + name: "unschedulable", + extenders: []st.FakeExtender{ + { + ExtenderName: "FakeExtender1", + Predicates: []st.FitPredicate{func(pod *v1.Pod, node *v1.Node) *framework.Status { + if node.Name == "a" { + return framework.NewStatus(framework.Success) + } + return framework.NewStatus(framework.Unschedulable, fmt.Sprintf("node %q is not allowed", node.Name)) + }}, + }, + }, + nodes: makeNodeList([]string{"a", "b"}), + filteredNodesStatuses: make(framework.NodeToStatusMap), + expectsErr: false, + expectedNodes: makeNodeList([]string{"a"}), + expectedStatuses: framework.NodeToStatusMap{ + "b": framework.NewStatus(framework.Unschedulable, fmt.Sprintf("FakeExtender: node %q failed", "b")), + }, + }, + { + name: "unschedulable and unresolvable", + extenders: []st.FakeExtender{ + { + ExtenderName: "FakeExtender1", + Predicates: []st.FitPredicate{func(pod *v1.Pod, node *v1.Node) *framework.Status { + if node.Name == "a" { + return framework.NewStatus(framework.Success) + } + if node.Name == "b" { + return framework.NewStatus(framework.Unschedulable, fmt.Sprintf("node %q is not allowed", node.Name)) + } + return framework.NewStatus(framework.UnschedulableAndUnresolvable, fmt.Sprintf("node %q is not allowed", node.Name)) + }}, + }, + }, + nodes: makeNodeList([]string{"a", "b", "c"}), + filteredNodesStatuses: make(framework.NodeToStatusMap), + expectsErr: false, + expectedNodes: makeNodeList([]string{"a"}), + expectedStatuses: framework.NodeToStatusMap{ + "b": framework.NewStatus(framework.Unschedulable, fmt.Sprintf("FakeExtender: node %q failed", "b")), + "c": framework.NewStatus(framework.UnschedulableAndUnresolvable, fmt.Sprintf("FakeExtender: node %q failed and unresolvable", "c")), + }, + }, + { + name: "extender may overwrite the statuses", + extenders: []st.FakeExtender{ + { + ExtenderName: "FakeExtender1", + Predicates: []st.FitPredicate{func(pod *v1.Pod, node *v1.Node) *framework.Status { + if node.Name == "a" { + return framework.NewStatus(framework.Success) + } + if node.Name == "b" { + return framework.NewStatus(framework.Unschedulable, fmt.Sprintf("node %q is not allowed", node.Name)) + } + return framework.NewStatus(framework.UnschedulableAndUnresolvable, fmt.Sprintf("node %q is not allowed", node.Name)) + }}, + }, + }, + nodes: makeNodeList([]string{"a", "b", "c"}), + filteredNodesStatuses: framework.NodeToStatusMap{ + "c": framework.NewStatus(framework.Unschedulable, fmt.Sprintf("FakeFilterPlugin: node %q failed", "c")), + }, + expectsErr: false, + expectedNodes: makeNodeList([]string{"a"}), + expectedStatuses: framework.NodeToStatusMap{ + "b": framework.NewStatus(framework.Unschedulable, fmt.Sprintf("FakeExtender: node %q failed", "b")), + "c": framework.NewStatus(framework.UnschedulableAndUnresolvable, fmt.Sprintf("FakeFilterPlugin: node %q failed", "c"), fmt.Sprintf("FakeExtender: node %q failed and unresolvable", "c")), + }, + }, + { + name: "multiple extenders", + extenders: []st.FakeExtender{ + { + ExtenderName: "FakeExtender1", + Predicates: []st.FitPredicate{func(pod *v1.Pod, node *v1.Node) *framework.Status { + if node.Name == "a" { + return framework.NewStatus(framework.Success) + } + if node.Name == "b" { + return framework.NewStatus(framework.Unschedulable, fmt.Sprintf("node %q is not allowed", node.Name)) + } + return framework.NewStatus(framework.UnschedulableAndUnresolvable, fmt.Sprintf("node %q is not allowed", node.Name)) + }}, + }, + { + ExtenderName: "FakeExtender1", + Predicates: []st.FitPredicate{func(pod *v1.Pod, node *v1.Node) *framework.Status { + if 
node.Name == "a" { + return framework.NewStatus(framework.Success) + } + return framework.NewStatus(framework.Unschedulable, fmt.Sprintf("node %q is not allowed", node.Name)) + }}, + }, + }, + nodes: makeNodeList([]string{"a", "b", "c"}), + filteredNodesStatuses: make(framework.NodeToStatusMap), + expectsErr: false, + expectedNodes: makeNodeList([]string{"a"}), + expectedStatuses: framework.NodeToStatusMap{ + "b": framework.NewStatus(framework.Unschedulable, fmt.Sprintf("FakeExtender: node %q failed", "b")), + "c": framework.NewStatus(framework.UnschedulableAndUnresolvable, fmt.Sprintf("FakeExtender: node %q failed and unresolvable", "c")), + }, + }, + } + + cmpOpts := []cmp.Option{ + cmp.Comparer(func(s1 framework.Status, s2 framework.Status) bool { + return s1.Code() == s2.Code() && reflect.DeepEqual(s1.Reasons(), s2.Reasons()) + }), + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var extenders []framework.Extender + for ii := range tt.extenders { + extenders = append(extenders, &tt.extenders[ii]) + } + + pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "1", UID: types.UID("1")}} + got, err := findNodesThatPassExtenders(extenders, pod, tt.nodes, tt.filteredNodesStatuses) + if tt.expectsErr { + if err == nil { + t.Error("Unexpected non-error") + } + } else { + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if diff := cmp.Diff(tt.expectedNodes, got); diff != "" { + t.Errorf("filtered nodes (-want,+got):\n%s", diff) + } + if diff := cmp.Diff(tt.expectedStatuses, tt.filteredNodesStatuses, cmpOpts...); diff != "" { + t.Errorf("filtered statuses (-want,+got):\n%s", diff) + } + } + }) + } +} + +func TestSchedulerSchedulePod(t *testing.T) { + fts := feature.Features{} + tests := []struct { + name string + registerPlugins []st.RegisterPluginFunc + nodes []string + pvcs []v1.PersistentVolumeClaim + pod *v1.Pod + pods []*v1.Pod + wantNodes sets.String + wantEvaluatedNodes *int32 + wErr error + }{ + { + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterFilterPlugin("FalseFilter", st.NewFalseFilterPlugin), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "2", UID: types.UID("2")}}, + name: "test 1", + wErr: &framework.FitError{ + Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "2", UID: types.UID("2")}}, + NumAllNodes: 2, + Diagnosis: framework.Diagnosis{ + NodeToStatusMap: framework.NodeToStatusMap{ + "machine1": framework.NewStatus(framework.Unschedulable, st.ErrReasonFake).WithFailedPlugin("FalseFilter"), + "machine2": framework.NewStatus(framework.Unschedulable, st.ErrReasonFake).WithFailedPlugin("FalseFilter"), + }, + UnschedulablePlugins: sets.NewString("FalseFilter"), + }, + }, + }, + { + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "ignore", UID: types.UID("ignore")}}, + wantNodes: sets.NewString("machine1", "machine2"), + name: "test 2", + wErr: nil, + }, + { + // Fits on a machine where the pod ID matches the machine name + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterFilterPlugin("MatchFilter", 
st.NewMatchFilterPlugin), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine2", UID: types.UID("machine2")}}, + wantNodes: sets.NewString("machine2"), + name: "test 3", + wErr: nil, + }, + { + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), + st.RegisterScorePlugin("NumericMap", newNumericMapPlugin(), 1), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"3", "2", "1"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "ignore", UID: types.UID("ignore")}}, + wantNodes: sets.NewString("3"), + name: "test 4", + wErr: nil, + }, + { + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterFilterPlugin("MatchFilter", st.NewMatchFilterPlugin), + st.RegisterScorePlugin("NumericMap", newNumericMapPlugin(), 1), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"3", "2", "1"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "2", UID: types.UID("2")}}, + wantNodes: sets.NewString("2"), + name: "test 5", + wErr: nil, + }, + { + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), + st.RegisterScorePlugin("NumericMap", newNumericMapPlugin(), 1), + st.RegisterScorePlugin("ReverseNumericMap", newReverseNumericMapPlugin(), 2), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"3", "2", "1"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "2", UID: types.UID("2")}}, + wantNodes: sets.NewString("1"), + name: "test 6", + wErr: nil, + }, + { + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), + st.RegisterFilterPlugin("FalseFilter", st.NewFalseFilterPlugin), + st.RegisterScorePlugin("NumericMap", newNumericMapPlugin(), 1), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"3", "2", "1"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "2", UID: types.UID("2")}}, + name: "test 7", + wErr: &framework.FitError{ + Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "2", UID: types.UID("2")}}, + NumAllNodes: 3, + Diagnosis: framework.Diagnosis{ + NodeToStatusMap: framework.NodeToStatusMap{ + "3": framework.NewStatus(framework.Unschedulable, st.ErrReasonFake).WithFailedPlugin("FalseFilter"), + "2": framework.NewStatus(framework.Unschedulable, st.ErrReasonFake).WithFailedPlugin("FalseFilter"), + "1": framework.NewStatus(framework.Unschedulable, st.ErrReasonFake).WithFailedPlugin("FalseFilter"), + }, + UnschedulablePlugins: sets.NewString("FalseFilter"), + }, + }, + }, + { + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterFilterPlugin("NoPodsFilter", NewNoPodsFilterPlugin), + st.RegisterFilterPlugin("MatchFilter", st.NewMatchFilterPlugin), + st.RegisterScorePlugin("NumericMap", newNumericMapPlugin(), 1), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + pods: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{Name: "2", UID: types.UID("2")}, + Spec: v1.PodSpec{ + NodeName: "2", + }, + Status: v1.PodStatus{ + Phase: v1.PodRunning, + }, + }, + }, + pod: 
&v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "2", UID: types.UID("2")}}, + nodes: []string{"1", "2"}, + name: "test 8", + wErr: &framework.FitError{ + Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "2", UID: types.UID("2")}}, + NumAllNodes: 2, + Diagnosis: framework.Diagnosis{ + NodeToStatusMap: framework.NodeToStatusMap{ + "1": framework.NewStatus(framework.Unschedulable, st.ErrReasonFake).WithFailedPlugin("MatchFilter"), + "2": framework.NewStatus(framework.Unschedulable, st.ErrReasonFake).WithFailedPlugin("NoPodsFilter"), + }, + UnschedulablePlugins: sets.NewString("MatchFilter", "NoPodsFilter"), + }, + }, + }, + { + // Pod with existing PVC + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPreFilterPlugin(volumebinding.Name, frameworkruntime.FactoryAdapter(fts, volumebinding.New)), + st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2"}, + pvcs: []v1.PersistentVolumeClaim{ + { + ObjectMeta: metav1.ObjectMeta{Name: "existingPVC", UID: types.UID("existingPVC"), Namespace: v1.NamespaceDefault}, + Spec: v1.PersistentVolumeClaimSpec{VolumeName: "existingPV"}, + }, + }, + pod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "ignore", UID: types.UID("ignore"), Namespace: v1.NamespaceDefault}, + Spec: v1.PodSpec{ + Volumes: []v1.Volume{ + { + VolumeSource: v1.VolumeSource{ + PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ + ClaimName: "existingPVC", + }, + }, + }, + }, + }, + }, + wantNodes: sets.NewString("machine1", "machine2"), + name: "existing PVC", + wErr: nil, + }, + { + // Pod with non existing PVC + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPreFilterPlugin(volumebinding.Name, frameworkruntime.FactoryAdapter(fts, volumebinding.New)), + st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "ignore", UID: types.UID("ignore")}, + Spec: v1.PodSpec{ + Volumes: []v1.Volume{ + { + VolumeSource: v1.VolumeSource{ + PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ + ClaimName: "unknownPVC", + }, + }, + }, + }, + }, + }, + name: "unknown PVC", + wErr: &framework.FitError{ + Pod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "ignore", UID: types.UID("ignore")}, + Spec: v1.PodSpec{ + Volumes: []v1.Volume{ + { + VolumeSource: v1.VolumeSource{ + PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ + ClaimName: "unknownPVC", + }, + }, + }, + }, + }, + }, + NumAllNodes: 2, + Diagnosis: framework.Diagnosis{ + NodeToStatusMap: framework.NodeToStatusMap{ + "machine1": framework.NewStatus(framework.UnschedulableAndUnresolvable, `persistentvolumeclaim "unknownPVC" not found`).WithFailedPlugin(volumebinding.Name), + "machine2": framework.NewStatus(framework.UnschedulableAndUnresolvable, `persistentvolumeclaim "unknownPVC" not found`).WithFailedPlugin(volumebinding.Name), + }, + UnschedulablePlugins: sets.NewString(volumebinding.Name), + }, + }, + }, + { + // Pod with deleting PVC + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPreFilterPlugin(volumebinding.Name, frameworkruntime.FactoryAdapter(fts, volumebinding.New)), + st.RegisterFilterPlugin("TrueFilter", 
st.NewTrueFilterPlugin), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2"}, + pvcs: []v1.PersistentVolumeClaim{{ObjectMeta: metav1.ObjectMeta{Name: "existingPVC", UID: types.UID("existingPVC"), Namespace: v1.NamespaceDefault, DeletionTimestamp: &metav1.Time{}}}}, + pod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "ignore", UID: types.UID("ignore"), Namespace: v1.NamespaceDefault}, + Spec: v1.PodSpec{ + Volumes: []v1.Volume{ + { + VolumeSource: v1.VolumeSource{ + PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ + ClaimName: "existingPVC", + }, + }, + }, + }, + }, + }, + name: "deleted PVC", + wErr: &framework.FitError{ + Pod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "ignore", UID: types.UID("ignore"), Namespace: v1.NamespaceDefault}, + Spec: v1.PodSpec{ + Volumes: []v1.Volume{ + { + VolumeSource: v1.VolumeSource{ + PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ + ClaimName: "existingPVC", + }, + }, + }, + }, + }, + }, + NumAllNodes: 2, + Diagnosis: framework.Diagnosis{ + NodeToStatusMap: framework.NodeToStatusMap{ + "machine1": framework.NewStatus(framework.UnschedulableAndUnresolvable, `persistentvolumeclaim "existingPVC" is being deleted`).WithFailedPlugin(volumebinding.Name), + "machine2": framework.NewStatus(framework.UnschedulableAndUnresolvable, `persistentvolumeclaim "existingPVC" is being deleted`).WithFailedPlugin(volumebinding.Name), + }, + UnschedulablePlugins: sets.NewString(volumebinding.Name), + }, + }, + }, + { + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), + st.RegisterScorePlugin("FalseMap", newFalseMapPlugin(), 1), + st.RegisterScorePlugin("TrueMap", newTrueMapPlugin(), 2), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"2", "1"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "2"}}, + name: "test error with priority map", + wErr: fmt.Errorf("running Score plugins: %w", fmt.Errorf(`plugin "FalseMap" failed with: %w`, errPrioritize)), + }, + { + name: "test podtopologyspread plugin - 2 nodes with maxskew=1", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions( + podtopologyspread.Name, + podTopologySpreadFunc, + "PreFilter", + "Filter", + ), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2"}, + pod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "p", UID: types.UID("p"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{ + TopologySpreadConstraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: "hostname", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "foo", + Operator: metav1.LabelSelectorOpExists, + }, + }, + }, + }, + }, + }, + }, + pods: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{ + NodeName: "machine1", + }, + Status: v1.PodStatus{ + Phase: v1.PodRunning, + }, + }, + }, + wantNodes: sets.NewString("machine2"), + wErr: nil, + }, + { + name: "test podtopologyspread plugin - 3 nodes with maxskew=2", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPluginAsExtensions( + 
podtopologyspread.Name, + podTopologySpreadFunc, + "PreFilter", + "Filter", + ), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"machine1", "machine2", "machine3"}, + pod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "p", UID: types.UID("p"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{ + TopologySpreadConstraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 2, + TopologyKey: "hostname", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "foo", + Operator: metav1.LabelSelectorOpExists, + }, + }, + }, + }, + }, + }, + }, + pods: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{Name: "pod1a", UID: types.UID("pod1a"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{ + NodeName: "machine1", + }, + Status: v1.PodStatus{ + Phase: v1.PodRunning, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "pod1b", UID: types.UID("pod1b"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{ + NodeName: "machine1", + }, + Status: v1.PodStatus{ + Phase: v1.PodRunning, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "pod2", UID: types.UID("pod2"), Labels: map[string]string{"foo": ""}}, + Spec: v1.PodSpec{ + NodeName: "machine2", + }, + Status: v1.PodStatus{ + Phase: v1.PodRunning, + }, + }, + }, + wantNodes: sets.NewString("machine2", "machine3"), + wErr: nil, + }, + { + name: "test with filter plugin returning Unschedulable status", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterFilterPlugin( + "FakeFilter", + st.NewFakeFilterPlugin(map[string]framework.Code{"3": framework.Unschedulable}), + ), + st.RegisterScorePlugin("NumericMap", newNumericMapPlugin(), 1), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"3"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-filter", UID: types.UID("test-filter")}}, + wantNodes: nil, + wErr: &framework.FitError{ + Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-filter", UID: types.UID("test-filter")}}, + NumAllNodes: 1, + Diagnosis: framework.Diagnosis{ + NodeToStatusMap: framework.NodeToStatusMap{ + "3": framework.NewStatus(framework.Unschedulable, "injecting failure for pod test-filter").WithFailedPlugin("FakeFilter"), + }, + UnschedulablePlugins: sets.NewString("FakeFilter"), + }, + }, + }, + { + name: "test with filter plugin returning UnschedulableAndUnresolvable status", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterFilterPlugin( + "FakeFilter", + st.NewFakeFilterPlugin(map[string]framework.Code{"3": framework.UnschedulableAndUnresolvable}), + ), + st.RegisterScorePlugin("NumericMap", newNumericMapPlugin(), 1), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"3"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-filter", UID: types.UID("test-filter")}}, + wantNodes: nil, + wErr: &framework.FitError{ + Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-filter", UID: types.UID("test-filter")}}, + NumAllNodes: 1, + Diagnosis: framework.Diagnosis{ + NodeToStatusMap: framework.NodeToStatusMap{ + "3": framework.NewStatus(framework.UnschedulableAndUnresolvable, "injecting failure for pod test-filter").WithFailedPlugin("FakeFilter"), + }, + UnschedulablePlugins: sets.NewString("FakeFilter"), + }, + }, + }, + { + name: "test with partial failed filter plugin", + 
registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterFilterPlugin( + "FakeFilter", + st.NewFakeFilterPlugin(map[string]framework.Code{"1": framework.Unschedulable}), + ), + st.RegisterScorePlugin("NumericMap", newNumericMapPlugin(), 1), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"1", "2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-filter", UID: types.UID("test-filter")}}, + wantNodes: nil, + wErr: nil, + }, + { + name: "test prefilter plugin returning Unschedulable status", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPreFilterPlugin( + "FakePreFilter", + st.NewFakePreFilterPlugin("FakePreFilter", nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, "injected unschedulable status")), + ), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"1", "2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-prefilter", UID: types.UID("test-prefilter")}}, + wantNodes: nil, + wErr: &framework.FitError{ + Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-prefilter", UID: types.UID("test-prefilter")}}, + NumAllNodes: 2, + Diagnosis: framework.Diagnosis{ + NodeToStatusMap: framework.NodeToStatusMap{ + "1": framework.NewStatus(framework.UnschedulableAndUnresolvable, "injected unschedulable status").WithFailedPlugin("FakePreFilter"), + "2": framework.NewStatus(framework.UnschedulableAndUnresolvable, "injected unschedulable status").WithFailedPlugin("FakePreFilter"), + }, + UnschedulablePlugins: sets.NewString("FakePreFilter"), + }, + }, + }, + { + name: "test prefilter plugin returning error status", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPreFilterPlugin( + "FakePreFilter", + st.NewFakePreFilterPlugin("FakePreFilter", nil, framework.NewStatus(framework.Error, "injected error status")), + ), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"1", "2"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-prefilter", UID: types.UID("test-prefilter")}}, + wantNodes: nil, + wErr: fmt.Errorf(`running PreFilter plugin "FakePreFilter": %w`, errors.New("injected error status")), + }, + { + name: "test prefilter plugin returning node", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPreFilterPlugin( + "FakePreFilter1", + st.NewFakePreFilterPlugin("FakePreFilter1", nil, nil), + ), + st.RegisterPreFilterPlugin( + "FakePreFilter2", + st.NewFakePreFilterPlugin("FakePreFilter2", &framework.PreFilterResult{NodeNames: sets.NewString("node2")}, nil), + ), + st.RegisterPreFilterPlugin( + "FakePreFilter3", + st.NewFakePreFilterPlugin("FakePreFilter3", &framework.PreFilterResult{NodeNames: sets.NewString("node1", "node2")}, nil), + ), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"node1", "node2", "node3"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-prefilter", UID: types.UID("test-prefilter")}}, + wantNodes: sets.NewString("node2"), + wantEvaluatedNodes: pointer.Int32Ptr(1), + }, + { + name: "test prefilter plugin returning non-intersecting nodes", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPreFilterPlugin( + "FakePreFilter1", + 
st.NewFakePreFilterPlugin("FakePreFilter1", nil, nil), + ), + st.RegisterPreFilterPlugin( + "FakePreFilter2", + st.NewFakePreFilterPlugin("FakePreFilter2", &framework.PreFilterResult{NodeNames: sets.NewString("node2")}, nil), + ), + st.RegisterPreFilterPlugin( + "FakePreFilter3", + st.NewFakePreFilterPlugin("FakePreFilter3", &framework.PreFilterResult{NodeNames: sets.NewString("node1")}, nil), + ), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"node1", "node2", "node3"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-prefilter", UID: types.UID("test-prefilter")}}, + wErr: &framework.FitError{ + Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-prefilter", UID: types.UID("test-prefilter")}}, + NumAllNodes: 3, + Diagnosis: framework.Diagnosis{ + NodeToStatusMap: framework.NodeToStatusMap{ + "node1": framework.NewStatus(framework.Unschedulable, "node(s) didn't satisfy plugin(s) [FakePreFilter2 FakePreFilter3] simultaneously"), + "node2": framework.NewStatus(framework.Unschedulable, "node(s) didn't satisfy plugin(s) [FakePreFilter2 FakePreFilter3] simultaneously"), + "node3": framework.NewStatus(framework.Unschedulable, "node(s) didn't satisfy plugin(s) [FakePreFilter2 FakePreFilter3] simultaneously"), + }, + UnschedulablePlugins: sets.String{}, + }, + }, + }, + { + name: "test prefilter plugin returning empty node set", + registerPlugins: []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterPreFilterPlugin( + "FakePreFilter1", + st.NewFakePreFilterPlugin("FakePreFilter1", nil, nil), + ), + st.RegisterPreFilterPlugin( + "FakePreFilter2", + st.NewFakePreFilterPlugin("FakePreFilter2", &framework.PreFilterResult{NodeNames: sets.NewString()}, nil), + ), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + nodes: []string{"node1"}, + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-prefilter", UID: types.UID("test-prefilter")}}, + wErr: &framework.FitError{ + Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-prefilter", UID: types.UID("test-prefilter")}}, + NumAllNodes: 1, + Diagnosis: framework.Diagnosis{ + NodeToStatusMap: framework.NodeToStatusMap{ + "node1": framework.NewStatus(framework.Unschedulable, "node(s) didn't satisfy plugin FakePreFilter2"), + }, + UnschedulablePlugins: sets.String{}, + }, + }, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + cache := internalcache.New(time.Duration(0), wait.NeverStop) + for _, pod := range test.pods { + cache.AddPod(pod) + } + var nodes []*v1.Node + for _, name := range test.nodes { + node := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: name, Labels: map[string]string{"hostname": name}}} + nodes = append(nodes, node) + cache.AddNode(node) + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + cs := clientsetfake.NewSimpleClientset() + informerFactory := informers.NewSharedInformerFactory(cs, 0) + for _, pvc := range test.pvcs { + metav1.SetMetaDataAnnotation(&pvc.ObjectMeta, volume.AnnBindCompleted, "true") + cs.CoreV1().PersistentVolumeClaims(pvc.Namespace).Create(ctx, &pvc, metav1.CreateOptions{}) + if pvName := pvc.Spec.VolumeName; pvName != "" { + pv := v1.PersistentVolume{ObjectMeta: metav1.ObjectMeta{Name: pvName}} + cs.CoreV1().PersistentVolumes().Create(ctx, &pv, metav1.CreateOptions{}) + } + } + snapshot := internalcache.NewSnapshot(test.pods, nodes) + fwk, err := st.NewFramework( + test.registerPlugins, "", + 
frameworkruntime.WithSnapshotSharedLister(snapshot), + frameworkruntime.WithInformerFactory(informerFactory), + frameworkruntime.WithPodNominator(internalqueue.NewPodNominator(informerFactory.Core().V1().Pods().Lister())), + ) + if err != nil { + t.Fatal(err) + } + + scheduler := newScheduler( + cache, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + snapshot, + schedulerapi.DefaultPercentageOfNodesToScore) + informerFactory.Start(ctx.Done()) + informerFactory.WaitForCacheSync(ctx.Done()) + + result, err := scheduler.SchedulePod(ctx, fwk, framework.NewCycleState(), test.pod) + if err != test.wErr { + gotFitErr, gotOK := err.(*framework.FitError) + wantFitErr, wantOK := test.wErr.(*framework.FitError) + if gotOK != wantOK { + t.Errorf("Expected err to be FitError: %v, but got %v", wantOK, gotOK) + } else if gotOK { + if diff := cmp.Diff(gotFitErr, wantFitErr); diff != "" { + t.Errorf("Unexpected fitErr: (-want, +got): %s", diff) + } + } + } + if test.wantNodes != nil && !test.wantNodes.Has(result.SuggestedHost) { + t.Errorf("Expected: %s, got: %s", test.wantNodes, result.SuggestedHost) + } + wantEvaluatedNodes := len(test.nodes) + if test.wantEvaluatedNodes != nil { + wantEvaluatedNodes = int(*test.wantEvaluatedNodes) + } + if test.wErr == nil && wantEvaluatedNodes != result.EvaluatedNodes { + t.Errorf("Expected EvaluatedNodes: %d, got: %d", wantEvaluatedNodes, result.EvaluatedNodes) + } + }) + } +} + +func TestFindFitAllError(t *testing.T) { + nodes := makeNodeList([]string{"3", "2", "1"}) + scheduler := makeScheduler(nodes) + fwk, err := st.NewFramework( + []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), + st.RegisterFilterPlugin("MatchFilter", st.NewMatchFilterPlugin), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + "", + frameworkruntime.WithPodNominator(internalqueue.NewPodNominator(nil)), + ) + if err != nil { + t.Fatal(err) + } + + _, diagnosis, err := scheduler.findNodesThatFitPod(context.Background(), fwk, framework.NewCycleState(), &v1.Pod{}) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + + expected := framework.Diagnosis{ + NodeToStatusMap: framework.NodeToStatusMap{ + "1": framework.NewStatus(framework.Unschedulable, st.ErrReasonFake).WithFailedPlugin("MatchFilter"), + "2": framework.NewStatus(framework.Unschedulable, st.ErrReasonFake).WithFailedPlugin("MatchFilter"), + "3": framework.NewStatus(framework.Unschedulable, st.ErrReasonFake).WithFailedPlugin("MatchFilter"), + }, + UnschedulablePlugins: sets.NewString("MatchFilter"), + } + if diff := cmp.Diff(diagnosis, expected); diff != "" { + t.Errorf("Unexpected diagnosis: (-want, +got): %s", diff) + } +} + +func TestFindFitSomeError(t *testing.T) { + nodes := makeNodeList([]string{"3", "2", "1"}) + scheduler := makeScheduler(nodes) + fwk, err := st.NewFramework( + []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), + st.RegisterFilterPlugin("MatchFilter", st.NewMatchFilterPlugin), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + "", + frameworkruntime.WithPodNominator(internalqueue.NewPodNominator(nil)), + ) + if err != nil { + t.Fatal(err) + } + + pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "1", UID: types.UID("1")}} + _, diagnosis, err := scheduler.findNodesThatFitPod(context.Background(), fwk, framework.NewCycleState(), pod) + if err != nil { + 
t.Errorf("unexpected error: %v", err) + } + + if len(diagnosis.NodeToStatusMap) != len(nodes)-1 { + t.Errorf("unexpected failed status map: %v", diagnosis.NodeToStatusMap) + } + + if diff := cmp.Diff(sets.NewString("MatchFilter"), diagnosis.UnschedulablePlugins); diff != "" { + t.Errorf("Unexpected unschedulablePlugins: (-want, +got): %s", diagnosis.UnschedulablePlugins) + } + + for _, node := range nodes { + if node.Name == pod.Name { + continue + } + t.Run(node.Name, func(t *testing.T) { + status, found := diagnosis.NodeToStatusMap[node.Name] + if !found { + t.Errorf("failed to find node %v in %v", node.Name, diagnosis.NodeToStatusMap) + } + reasons := status.Reasons() + if len(reasons) != 1 || reasons[0] != st.ErrReasonFake { + t.Errorf("unexpected failures: %v", reasons) + } + }) + } +} + +func TestFindFitPredicateCallCounts(t *testing.T) { + tests := []struct { + name string + pod *v1.Pod + expectedCount int32 + }{ + { + name: "nominated pods have lower priority, predicate is called once", + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "1", UID: types.UID("1")}, Spec: v1.PodSpec{Priority: &highPriority}}, + expectedCount: 1, + }, + { + name: "nominated pods have higher priority, predicate is called twice", + pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "1", UID: types.UID("1")}, Spec: v1.PodSpec{Priority: &lowPriority}}, + expectedCount: 2, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + nodes := makeNodeList([]string{"1"}) + + plugin := st.FakeFilterPlugin{} + registerFakeFilterFunc := st.RegisterFilterPlugin( + "FakeFilter", + func(_ runtime.Object, fh framework.Handle) (framework.Plugin, error) { + return &plugin, nil + }, + ) + registerPlugins := []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + registerFakeFilterFunc, + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + } + fwk, err := st.NewFramework( + registerPlugins, "", + frameworkruntime.WithPodNominator(internalqueue.NewPodNominator(nil)), + ) + if err != nil { + t.Fatal(err) + } + + scheduler := makeScheduler(nodes) + if err := scheduler.Cache.UpdateSnapshot(scheduler.nodeInfoSnapshot); err != nil { + t.Fatal(err) + } + fwk.AddNominatedPod(framework.NewPodInfo(&v1.Pod{ObjectMeta: metav1.ObjectMeta{UID: "nominated"}, Spec: v1.PodSpec{Priority: &midPriority}}), + &framework.NominatingInfo{NominatingMode: framework.ModeOverride, NominatedNodeName: "1"}) + + _, _, err = scheduler.findNodesThatFitPod(context.Background(), fwk, framework.NewCycleState(), test.pod) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if test.expectedCount != plugin.NumFilterCalled { + t.Errorf("predicate was called %d times, expected is %d", plugin.NumFilterCalled, test.expectedCount) + } + }) + } +} + +// The point of this test is to show that you: +// - get the same priority for a zero-request pod as for a pod with the defaults requests, +// both when the zero-request pod is already on the machine and when the zero-request pod +// is the one being scheduled. +// - don't get the same score no matter what we schedule. +func TestZeroRequest(t *testing.T) { + // A pod with no resources. We expect spreading to count it as having the default resources. + noResources := v1.PodSpec{ + Containers: []v1.Container{ + {}, + }, + } + noResources1 := noResources + noResources1.NodeName = "machine1" + // A pod with the same resources as a 0-request pod gets by default as its resources (for spreading). 
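The "defaults" mentioned in the comment above are not defined in this test file. As a rough sketch (the constant values here are assumptions, not taken from this patch), the spreading logic treats a container with empty requests as if it had asked for small default amounts of CPU and memory, which is why the zero-request pod and the "small" pod built below are expected to score identically:

// nonZeroRequests is a hedged illustration of how empty requests are
// normalized for spreading purposes: missing CPU/memory requests are replaced
// by assumed defaults (commonly 100 milli-CPU and 200 MiB). The constant
// values and the helper name are illustrative, not part of this patch.
func nonZeroRequests(milliCPU, memoryBytes int64) (int64, int64) {
	const (
		defaultMilliCPURequest int64 = 100               // assumed default
		defaultMemoryRequest   int64 = 200 * 1024 * 1024 // assumed default, in bytes
	)
	if milliCPU == 0 {
		milliCPU = defaultMilliCPURequest
	}
	if memoryBytes == 0 {
		memoryBytes = defaultMemoryRequest
	}
	return milliCPU, memoryBytes
}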
+ small := v1.PodSpec{ + Containers: []v1.Container{ + { + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse( + strconv.FormatInt(schedutil.DefaultMilliCPURequest, 10) + "m"), + v1.ResourceMemory: resource.MustParse( + strconv.FormatInt(schedutil.DefaultMemoryRequest, 10)), + }, + }, + }, + }, + } + small2 := small + small2.NodeName = "machine2" + // A larger pod. + large := v1.PodSpec{ + Containers: []v1.Container{ + { + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse( + strconv.FormatInt(schedutil.DefaultMilliCPURequest*3, 10) + "m"), + v1.ResourceMemory: resource.MustParse( + strconv.FormatInt(schedutil.DefaultMemoryRequest*3, 10)), + }, + }, + }, + }, + } + large1 := large + large1.NodeName = "machine1" + large2 := large + large2.NodeName = "machine2" + tests := []struct { + pod *v1.Pod + pods []*v1.Pod + nodes []*v1.Node + name string + expectedScore int64 + }{ + // The point of these next two tests is to show you get the same priority for a zero-request pod + // as for a pod with the defaults requests, both when the zero-request pod is already on the machine + // and when the zero-request pod is the one being scheduled. + { + pod: &v1.Pod{Spec: noResources}, + nodes: []*v1.Node{makeNode("machine1", 1000, schedutil.DefaultMemoryRequest*10), makeNode("machine2", 1000, schedutil.DefaultMemoryRequest*10)}, + name: "test priority of zero-request pod with machine with zero-request pod", + pods: []*v1.Pod{ + {Spec: large1}, {Spec: noResources1}, + {Spec: large2}, {Spec: small2}, + }, + expectedScore: 250, + }, + { + pod: &v1.Pod{Spec: small}, + nodes: []*v1.Node{makeNode("machine1", 1000, schedutil.DefaultMemoryRequest*10), makeNode("machine2", 1000, schedutil.DefaultMemoryRequest*10)}, + name: "test priority of nonzero-request pod with machine with zero-request pod", + pods: []*v1.Pod{ + {Spec: large1}, {Spec: noResources1}, + {Spec: large2}, {Spec: small2}, + }, + expectedScore: 250, + }, + // The point of this test is to verify that we're not just getting the same score no matter what we schedule. 
+ { + pod: &v1.Pod{Spec: large}, + nodes: []*v1.Node{makeNode("machine1", 1000, schedutil.DefaultMemoryRequest*10), makeNode("machine2", 1000, schedutil.DefaultMemoryRequest*10)}, + name: "test priority of larger pod with machine with zero-request pod", + pods: []*v1.Pod{ + {Spec: large1}, {Spec: noResources1}, + {Spec: large2}, {Spec: small2}, + }, + expectedScore: 230, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + client := clientsetfake.NewSimpleClientset() + informerFactory := informers.NewSharedInformerFactory(client, 0) + + snapshot := internalcache.NewSnapshot(test.pods, test.nodes) + fts := feature.Features{} + pluginRegistrations := []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterScorePlugin(noderesources.Name, frameworkruntime.FactoryAdapter(fts, noderesources.NewFit), 1), + st.RegisterScorePlugin(noderesources.BalancedAllocationName, frameworkruntime.FactoryAdapter(fts, noderesources.NewBalancedAllocation), 1), + st.RegisterScorePlugin(selectorspread.Name, selectorspread.New, 1), + st.RegisterPreScorePlugin(selectorspread.Name, selectorspread.New), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + } + fwk, err := st.NewFramework( + pluginRegistrations, "", + frameworkruntime.WithInformerFactory(informerFactory), + frameworkruntime.WithSnapshotSharedLister(snapshot), + frameworkruntime.WithClientSet(client), + frameworkruntime.WithPodNominator(internalqueue.NewPodNominator(informerFactory.Core().V1().Pods().Lister())), + ) + if err != nil { + t.Fatalf("error creating framework: %+v", err) + } + + scheduler := newScheduler( + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + snapshot, + schedulerapi.DefaultPercentageOfNodesToScore) + + ctx := context.Background() + state := framework.NewCycleState() + _, _, err = scheduler.findNodesThatFitPod(ctx, fwk, state, test.pod) + if err != nil { + t.Fatalf("error filtering nodes: %+v", err) + } + fwk.RunPreScorePlugins(ctx, state, test.pod, test.nodes) + list, err := prioritizeNodes(ctx, nil, fwk, state, test.pod, test.nodes) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + for _, hp := range list { + if hp.Score != test.expectedScore { + t.Errorf("expected %d for all priorities, got list %#v", test.expectedScore, list) + } + } + }) + } +} + +var lowPriority, midPriority, highPriority = int32(0), int32(100), int32(1000) + +func TestNumFeasibleNodesToFind(t *testing.T) { + tests := []struct { + name string + percentageOfNodesToScore int32 + numAllNodes int32 + wantNumNodes int32 + }{ + { + name: "not set percentageOfNodesToScore and nodes number not more than 50", + numAllNodes: 10, + wantNumNodes: 10, + }, + { + name: "set percentageOfNodesToScore and nodes number not more than 50", + percentageOfNodesToScore: 40, + numAllNodes: 10, + wantNumNodes: 10, + }, + { + name: "not set percentageOfNodesToScore and nodes number more than 50", + numAllNodes: 1000, + wantNumNodes: 420, + }, + { + name: "set percentageOfNodesToScore and nodes number more than 50", + percentageOfNodesToScore: 40, + numAllNodes: 1000, + wantNumNodes: 400, + }, + { + name: "not set percentageOfNodesToScore and nodes number more than 50*125", + numAllNodes: 6000, + wantNumNodes: 300, + }, + { + name: "set percentageOfNodesToScore and nodes number more than 50*125", + percentageOfNodesToScore: 40, + numAllNodes: 6000, + wantNumNodes: 2400, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + sched := &Scheduler{ + 
percentageOfNodesToScore: tt.percentageOfNodesToScore, + } + if gotNumNodes := sched.numFeasibleNodesToFind(tt.numAllNodes); gotNumNodes != tt.wantNumNodes { + t.Errorf("Scheduler.numFeasibleNodesToFind() = %v, want %v", gotNumNodes, tt.wantNumNodes) + } + }) + } +} + +func TestFairEvaluationForNodes(t *testing.T) { + numAllNodes := 500 + nodeNames := make([]string, 0, numAllNodes) + for i := 0; i < numAllNodes; i++ { + nodeNames = append(nodeNames, strconv.Itoa(i)) + } + nodes := makeNodeList(nodeNames) + sched := makeScheduler(nodes) + fwk, err := st.NewFramework( + []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + "", + frameworkruntime.WithPodNominator(internalqueue.NewPodNominator(nil)), + ) + if err != nil { + t.Fatal(err) + } + + // To make numAllNodes % nodesToFind != 0 + sched.percentageOfNodesToScore = 30 + nodesToFind := int(sched.numFeasibleNodesToFind(int32(numAllNodes))) + + // Iterating over all nodes more than twice + for i := 0; i < 2*(numAllNodes/nodesToFind+1); i++ { + nodesThatFit, _, err := sched.findNodesThatFitPod(context.Background(), fwk, framework.NewCycleState(), &v1.Pod{}) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if len(nodesThatFit) != nodesToFind { + t.Errorf("got %d nodes filtered, want %d", len(nodesThatFit), nodesToFind) + } + if sched.nextStartNodeIndex != (i+1)*nodesToFind%numAllNodes { + t.Errorf("got %d lastProcessedNodeIndex, want %d", sched.nextStartNodeIndex, (i+1)*nodesToFind%numAllNodes) + } + } +} + +func TestPreferNominatedNodeFilterCallCounts(t *testing.T) { + tests := []struct { + name string + pod *v1.Pod + nodeReturnCodeMap map[string]framework.Code + expectedCount int32 + expectedPatchRequests int + }{ + { + name: "pod has the nominated node set, filter is called only once", + pod: st.MakePod().Name("p_with_nominated_node").UID("p").Priority(highPriority).NominatedNodeName("node1").Obj(), + expectedCount: 1, + }, + { + name: "pod without the nominated pod, filter is called for each node", + pod: st.MakePod().Name("p_without_nominated_node").UID("p").Priority(highPriority).Obj(), + expectedCount: 3, + }, + { + name: "nominated pod cannot pass the filter, filter is called for each node", + pod: st.MakePod().Name("p_with_nominated_node").UID("p").Priority(highPriority).NominatedNodeName("node1").Obj(), + nodeReturnCodeMap: map[string]framework.Code{"node1": framework.Unschedulable}, + expectedCount: 4, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + // create three nodes in the cluster. 
+ nodes := makeNodeList([]string{"node1", "node2", "node3"}) + client := clientsetfake.NewSimpleClientset(test.pod) + informerFactory := informers.NewSharedInformerFactory(client, 0) + cache := internalcache.New(time.Duration(0), wait.NeverStop) + for _, n := range nodes { + cache.AddNode(n) + } + plugin := st.FakeFilterPlugin{FailedNodeReturnCodeMap: test.nodeReturnCodeMap} + registerFakeFilterFunc := st.RegisterFilterPlugin( + "FakeFilter", + func(_ runtime.Object, fh framework.Handle) (framework.Plugin, error) { + return &plugin, nil + }, + ) + registerPlugins := []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + registerFakeFilterFunc, + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + } + fwk, err := st.NewFramework( + registerPlugins, "", + frameworkruntime.WithClientSet(client), + frameworkruntime.WithPodNominator(internalqueue.NewPodNominator(informerFactory.Core().V1().Pods().Lister())), + ) + if err != nil { + t.Fatal(err) + } + snapshot := internalcache.NewSnapshot(nil, nodes) + scheduler := newScheduler( + cache, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + snapshot, + schedulerapi.DefaultPercentageOfNodesToScore) + + _, _, err = scheduler.findNodesThatFitPod(context.Background(), fwk, framework.NewCycleState(), test.pod) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if test.expectedCount != plugin.NumFilterCalled { + t.Errorf("predicate was called %d times, expected is %d", plugin.NumFilterCalled, test.expectedCount) + } + }) + } +} + +func podWithID(id, desiredHost string) *v1.Pod { + return &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: id, + UID: types.UID(id), + }, + Spec: v1.PodSpec{ + NodeName: desiredHost, + SchedulerName: testSchedulerName, + }, + } +} + +func deletingPod(id string) *v1.Pod { + deletionTimestamp := metav1.Now() + return &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: id, + UID: types.UID(id), + DeletionTimestamp: &deletionTimestamp, + }, + Spec: v1.PodSpec{ + NodeName: "", + SchedulerName: testSchedulerName, + }, + } +} + +func podWithPort(id, desiredHost string, port int) *v1.Pod { + pod := podWithID(id, desiredHost) + pod.Spec.Containers = []v1.Container{ + {Name: "ctr", Ports: []v1.ContainerPort{{HostPort: int32(port)}}}, + } + return pod +} + +func podWithResources(id, desiredHost string, limits v1.ResourceList, requests v1.ResourceList) *v1.Pod { + pod := podWithID(id, desiredHost) + pod.Spec.Containers = []v1.Container{ + {Name: "ctr", Resources: v1.ResourceRequirements{Limits: limits, Requests: requests}}, + } + return pod +} + +func makeNodeList(nodeNames []string) []*v1.Node { + result := make([]*v1.Node, 0, len(nodeNames)) + for _, nodeName := range nodeNames { + result = append(result, &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}}) + } + return result +} + +// makeScheduler makes a simple Scheduler for testing. 
+func makeScheduler(nodes []*v1.Node) *Scheduler { + cache := internalcache.New(time.Duration(0), wait.NeverStop) + for _, n := range nodes { + cache.AddNode(n) + } + + s := newScheduler( + cache, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + emptySnapshot, + schedulerapi.DefaultPercentageOfNodesToScore) + cache.UpdateSnapshot(s.nodeInfoSnapshot) + return s +} + +func makeNode(node string, milliCPU, memory int64) *v1.Node { + return &v1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: node}, + Status: v1.NodeStatus{ + Capacity: v1.ResourceList{ + v1.ResourceCPU: *resource.NewMilliQuantity(milliCPU, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(memory, resource.BinarySI), + "pods": *resource.NewQuantity(100, resource.DecimalSI), + }, + Allocatable: v1.ResourceList{ + + v1.ResourceCPU: *resource.NewMilliQuantity(milliCPU, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(memory, resource.BinarySI), + "pods": *resource.NewQuantity(100, resource.DecimalSI), + }, + }, + } +} + +// queuedPodStore: pods queued before processing. +// cache: scheduler cache that might contain assumed pods. +func setupTestSchedulerWithOnePodOnNode(ctx context.Context, t *testing.T, queuedPodStore *clientcache.FIFO, scache internalcache.Cache, + pod *v1.Pod, node *v1.Node, fns ...st.RegisterPluginFunc) (*Scheduler, chan *v1.Binding, chan error) { + scheduler, bindingChan, errChan := setupTestScheduler(ctx, queuedPodStore, scache, nil, nil, fns...) + + queuedPodStore.Add(pod) + // queuedPodStore: [foo:8080] + // cache: [] + + scheduler.scheduleOne(ctx) + // queuedPodStore: [] + // cache: [(assumed)foo:8080] + + select { + case b := <-bindingChan: + expectBinding := &v1.Binding{ + ObjectMeta: metav1.ObjectMeta{Name: pod.Name, UID: types.UID(pod.Name)}, + Target: v1.ObjectReference{Kind: "Node", Name: node.Name}, + } + if !reflect.DeepEqual(expectBinding, b) { + t.Errorf("binding want=%v, get=%v", expectBinding, b) + } + case <-time.After(wait.ForeverTestTimeout): + t.Fatalf("timeout after %v", wait.ForeverTestTimeout) + } + return scheduler, bindingChan, errChan +} + +// queuedPodStore: pods queued before processing. +// scache: scheduler cache that might contain assumed pods. 
+func setupTestScheduler(ctx context.Context, queuedPodStore *clientcache.FIFO, cache internalcache.Cache, informerFactory informers.SharedInformerFactory, broadcaster events.EventBroadcaster, fns ...st.RegisterPluginFunc) (*Scheduler, chan *v1.Binding, chan error) { + bindingChan := make(chan *v1.Binding, 1) + client := clientsetfake.NewSimpleClientset() + client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + var b *v1.Binding + if action.GetSubresource() == "binding" { + b := action.(clienttesting.CreateAction).GetObject().(*v1.Binding) + bindingChan <- b + } + return true, b, nil + }) + + var recorder events.EventRecorder + if broadcaster != nil { + recorder = broadcaster.NewRecorder(scheme.Scheme, testSchedulerName) + } else { + recorder = &events.FakeRecorder{} + } + + if informerFactory == nil { + informerFactory = informers.NewSharedInformerFactory(clientsetfake.NewSimpleClientset(), 0) + } + schedulingQueue := internalqueue.NewTestQueueWithInformerFactory(ctx, nil, informerFactory) + + fwk, _ := st.NewFramework( + fns, + testSchedulerName, + frameworkruntime.WithClientSet(client), + frameworkruntime.WithEventRecorder(recorder), + frameworkruntime.WithInformerFactory(informerFactory), + frameworkruntime.WithPodNominator(internalqueue.NewPodNominator(informerFactory.Core().V1().Pods().Lister())), + ) + + errChan := make(chan error, 1) + sched := newScheduler( + cache, + nil, + func() *framework.QueuedPodInfo { + return &framework.QueuedPodInfo{PodInfo: framework.NewPodInfo(clientcache.Pop(queuedPodStore).(*v1.Pod))} + }, + func(p *framework.QueuedPodInfo, err error) { + errChan <- err + }, + nil, + schedulingQueue, + profile.Map{ + testSchedulerName: fwk, + }, + client, + internalcache.NewEmptySnapshot(), + schedulerapi.DefaultPercentageOfNodesToScore) + return sched, bindingChan, errChan +} + +func setupTestSchedulerWithVolumeBinding(ctx context.Context, volumeBinder volumebinding.SchedulerVolumeBinder, broadcaster events.EventBroadcaster) (*Scheduler, chan *v1.Binding, chan error) { + testNode := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}} + queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc) + pod := podWithID("foo", "") + pod.Namespace = "foo-ns" + pod.Spec.Volumes = append(pod.Spec.Volumes, v1.Volume{Name: "testVol", + VolumeSource: v1.VolumeSource{PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ClaimName: "testPVC"}}}) + queuedPodStore.Add(pod) + scache := internalcache.New(10*time.Minute, ctx.Done()) + scache.AddNode(&testNode) + testPVC := v1.PersistentVolumeClaim{ObjectMeta: metav1.ObjectMeta{Name: "testPVC", Namespace: pod.Namespace, UID: types.UID("testPVC")}} + client := clientsetfake.NewSimpleClientset(&testNode, &testPVC) + informerFactory := informers.NewSharedInformerFactory(client, 0) + pvcInformer := informerFactory.Core().V1().PersistentVolumeClaims() + pvcInformer.Informer().GetStore().Add(&testPVC) + + fns := []st.RegisterPluginFunc{ + st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + st.RegisterPluginAsExtensions(volumebinding.Name, func(plArgs runtime.Object, handle framework.Handle) (framework.Plugin, error) { + return &volumebinding.VolumeBinding{Binder: volumeBinder, PVCLister: pvcInformer.Lister()}, nil + }, "PreFilter", "Filter", "Reserve", "PreBind"), + } + s, bindingChan, errChan := setupTestScheduler(ctx, queuedPodStore, scache, informerFactory, 
broadcaster, fns...) + return s, bindingChan, errChan +} + +// This is a workaround because golint complains that errors cannot +// end with punctuation. However, the real predicate error message does +// end with a period. +func makePredicateError(failReason string) error { + s := fmt.Sprintf("0/1 nodes are available: %v.", failReason) + return fmt.Errorf(s) +} diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index 703e8416685..0559f393bdc 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -20,62 +20,40 @@ import ( "context" "errors" "fmt" - "math/rand" - "strconv" - "sync" - "sync/atomic" "time" v1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/dynamic/dynamicinformer" "k8s.io/client-go/informers" coreinformers "k8s.io/client-go/informers/core/v1" clientset "k8s.io/client-go/kubernetes" + corelisters "k8s.io/client-go/listers/core/v1" restclient "k8s.io/client-go/rest" "k8s.io/client-go/tools/cache" "k8s.io/klog/v2" "k8s.io/kube-scheduler/config/v1beta3" - extenderv1 "k8s.io/kube-scheduler/extender/v1" - podutil "k8s.io/kubernetes/pkg/api/v1/pod" - "k8s.io/kubernetes/pkg/apis/core/validation" - "k8s.io/kubernetes/pkg/scheduler/apis/config" schedulerapi "k8s.io/kubernetes/pkg/scheduler/apis/config" "k8s.io/kubernetes/pkg/scheduler/apis/config/scheme" "k8s.io/kubernetes/pkg/scheduler/framework" "k8s.io/kubernetes/pkg/scheduler/framework/parallelize" frameworkplugins "k8s.io/kubernetes/pkg/scheduler/framework/plugins" + "k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources" frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime" internalcache "k8s.io/kubernetes/pkg/scheduler/internal/cache" cachedebugger "k8s.io/kubernetes/pkg/scheduler/internal/cache/debugger" internalqueue "k8s.io/kubernetes/pkg/scheduler/internal/queue" "k8s.io/kubernetes/pkg/scheduler/metrics" "k8s.io/kubernetes/pkg/scheduler/profile" - "k8s.io/kubernetes/pkg/scheduler/util" - utiltrace "k8s.io/utils/trace" ) const ( - // SchedulerError is the reason recorded for events when an error occurs during scheduling a pod. - SchedulerError = "SchedulerError" - // Percentage of plugin metrics to be sampled. - pluginMetricsSamplePercent = 10 // Duration the scheduler will wait before expiring an assumed pod. // See issue #106361 for more details about this parameter and its value. durationToExpireAssumedPod = 15 * time.Minute - // minFeasibleNodesToFind is the minimum number of nodes that would be scored - // in each scheduling cycle. This is a semi-arbitrary value to ensure that a - // certain minimum of nodes are checked for feasibility. This in turn helps - // ensure a minimum level of spreading. - minFeasibleNodesToFind = 100 - // minFeasibleNodesPercentageToFind is the minimum percentage of nodes that - // would be scored in each scheduling cycle. This is a semi-arbitrary value - // to ensure that a certain minimum of nodes are checked for feasibility. - // This in turn helps ensure a minimum level of spreading. - minFeasibleNodesPercentageToFind = 5 ) // ErrNoNodesAvailable is used to describe the error that no nodes available to schedule pods. @@ -252,34 +230,6 @@ var defaultSchedulerOptions = schedulerOptions{ applyDefaultProfile: true, } -// newScheduler creates a Scheduler object. 
-func newScheduler( - cache internalcache.Cache, - extenders []framework.Extender, - nextPod func() *framework.QueuedPodInfo, - Error func(*framework.QueuedPodInfo, error), - stopEverything <-chan struct{}, - schedulingQueue internalqueue.SchedulingQueue, - profiles profile.Map, - client clientset.Interface, - nodeInfoSnapshot *internalcache.Snapshot, - percentageOfNodesToScore int32) *Scheduler { - sched := Scheduler{ - Cache: cache, - Extenders: extenders, - NextPod: nextPod, - Error: Error, - StopEverything: stopEverything, - SchedulingQueue: schedulingQueue, - Profiles: profiles, - client: client, - nodeInfoSnapshot: nodeInfoSnapshot, - percentageOfNodesToScore: percentageOfNodesToScore, - } - sched.SchedulePod = sched.schedulePod - return &sched -} - // New returns a Scheduler func New(client clientset.Interface, informerFactory informers.SharedInformerFactory, @@ -301,7 +251,7 @@ func New(client clientset.Interface, if options.applyDefaultProfile { var versionedCfg v1beta3.KubeSchedulerConfiguration scheme.Scheme.Default(&versionedCfg) - cfg := config.KubeSchedulerConfiguration{} + cfg := schedulerapi.KubeSchedulerConfiguration{} if err := scheme.Scheme.Convert(&versionedCfg, &cfg, nil); err != nil { return nil, err } @@ -382,6 +332,157 @@ func New(client clientset.Interface, return sched, nil } +// Run begins watching and scheduling. It starts scheduling and blocked until the context is done. +func (sched *Scheduler) Run(ctx context.Context) { + sched.SchedulingQueue.Run() + wait.UntilWithContext(ctx, sched.scheduleOne, 0) + sched.SchedulingQueue.Close() +} + +// MakeDefaultErrorFunc construct a function to handle pod scheduler error +func MakeDefaultErrorFunc(client clientset.Interface, podLister corelisters.PodLister, podQueue internalqueue.SchedulingQueue, schedulerCache internalcache.Cache) func(*framework.QueuedPodInfo, error) { + return func(podInfo *framework.QueuedPodInfo, err error) { + pod := podInfo.Pod + if err == ErrNoNodesAvailable { + klog.V(2).InfoS("Unable to schedule pod; no nodes are registered to the cluster; waiting", "pod", klog.KObj(pod)) + } else if fitError, ok := err.(*framework.FitError); ok { + // Inject UnschedulablePlugins to PodInfo, which will be used later for moving Pods between queues efficiently. + podInfo.UnschedulablePlugins = fitError.Diagnosis.UnschedulablePlugins + klog.V(2).InfoS("Unable to schedule pod; no fit; waiting", "pod", klog.KObj(pod), "err", err) + } else if apierrors.IsNotFound(err) { + klog.V(2).InfoS("Unable to schedule pod, possibly due to node not found; waiting", "pod", klog.KObj(pod), "err", err) + if errStatus, ok := err.(apierrors.APIStatus); ok && errStatus.Status().Details.Kind == "node" { + nodeName := errStatus.Status().Details.Name + // when node is not found, We do not remove the node right away. Trying again to get + // the node and if the node is still not found, then remove it from the scheduler cache. + _, err := client.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{}) + if err != nil && apierrors.IsNotFound(err) { + node := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}} + if err := schedulerCache.RemoveNode(&node); err != nil { + klog.V(4).InfoS("Node is not found; failed to remove it from the cache", "node", node.Name) + } + } + } + } else { + klog.ErrorS(err, "Error scheduling pod; retrying", "pod", klog.KObj(pod)) + } + + // Check if the Pod exists in informer cache. 
+ cachedPod, err := podLister.Pods(pod.Namespace).Get(pod.Name) + if err != nil { + klog.InfoS("Pod doesn't exist in informer cache", "pod", klog.KObj(pod), "err", err) + return + } + + // In the case of extender, the pod may have been bound successfully, but timed out returning its response to the scheduler. + // It could result in the live version to carry .spec.nodeName, and that's inconsistent with the internal-queued version. + if len(cachedPod.Spec.NodeName) != 0 { + klog.InfoS("Pod has been assigned to node. Abort adding it back to queue.", "pod", klog.KObj(pod), "node", cachedPod.Spec.NodeName) + return + } + + // As is from SharedInformer, we need to do a DeepCopy() here. + podInfo.PodInfo = framework.NewPodInfo(cachedPod.DeepCopy()) + if err := podQueue.AddUnschedulableIfNotPresent(podInfo, podQueue.SchedulingCycle()); err != nil { + klog.ErrorS(err, "Error occurred") + } + } +} + +// NewInformerFactory creates a SharedInformerFactory and initializes a scheduler specific +// in-place podInformer. +func NewInformerFactory(cs clientset.Interface, resyncPeriod time.Duration) informers.SharedInformerFactory { + informerFactory := informers.NewSharedInformerFactory(cs, resyncPeriod) + informerFactory.InformerFor(&v1.Pod{}, newPodInformer) + return informerFactory +} + +func buildExtenders(extenders []schedulerapi.Extender, profiles []schedulerapi.KubeSchedulerProfile) ([]framework.Extender, error) { + var fExtenders []framework.Extender + if len(extenders) == 0 { + return nil, nil + } + + var ignoredExtendedResources []string + var ignorableExtenders []framework.Extender + for i := range extenders { + klog.V(2).InfoS("Creating extender", "extender", extenders[i]) + extender, err := NewHTTPExtender(&extenders[i]) + if err != nil { + return nil, err + } + if !extender.IsIgnorable() { + fExtenders = append(fExtenders, extender) + } else { + ignorableExtenders = append(ignorableExtenders, extender) + } + for _, r := range extenders[i].ManagedResources { + if r.IgnoredByScheduler { + ignoredExtendedResources = append(ignoredExtendedResources, r.Name) + } + } + } + // place ignorable extenders to the tail of extenders + fExtenders = append(fExtenders, ignorableExtenders...) + + // If there are any extended resources found from the Extenders, append them to the pluginConfig for each profile. + // This should only have an effect on ComponentConfig, where it is possible to configure Extenders and + // plugin args (and in which case the extender ignored resources take precedence). + if len(ignoredExtendedResources) == 0 { + return fExtenders, nil + } + + for i := range profiles { + prof := &profiles[i] + var found = false + for k := range prof.PluginConfig { + if prof.PluginConfig[k].Name == noderesources.Name { + // Update the existing args + pc := &prof.PluginConfig[k] + args, ok := pc.Args.(*schedulerapi.NodeResourcesFitArgs) + if !ok { + return nil, fmt.Errorf("want args to be of type NodeResourcesFitArgs, got %T", pc.Args) + } + args.IgnoredResources = ignoredExtendedResources + found = true + break + } + } + if !found { + return nil, fmt.Errorf("can't find NodeResourcesFitArgs in plugin config") + } + } + return fExtenders, nil +} + +// newScheduler creates a Scheduler object. 
+func newScheduler( + cache internalcache.Cache, + extenders []framework.Extender, + nextPod func() *framework.QueuedPodInfo, + Error func(*framework.QueuedPodInfo, error), + stopEverything <-chan struct{}, + schedulingQueue internalqueue.SchedulingQueue, + profiles profile.Map, + client clientset.Interface, + nodeInfoSnapshot *internalcache.Snapshot, + percentageOfNodesToScore int32) *Scheduler { + sched := Scheduler{ + Cache: cache, + Extenders: extenders, + NextPod: nextPod, + Error: Error, + StopEverything: stopEverything, + SchedulingQueue: schedulingQueue, + Profiles: profiles, + client: client, + nodeInfoSnapshot: nodeInfoSnapshot, + percentageOfNodesToScore: percentageOfNodesToScore, + } + sched.SchedulePod = sched.schedulePod + return &sched +} + func unionedGVKs(m map[framework.ClusterEvent]sets.String) map[framework.GVK]framework.ActionType { gvkMap := make(map[framework.GVK]framework.ActionType) for evt := range m { @@ -394,397 +495,6 @@ func unionedGVKs(m map[framework.ClusterEvent]sets.String) map[framework.GVK]fra return gvkMap } -// Run begins watching and scheduling. It starts scheduling and blocked until the context is done. -func (sched *Scheduler) Run(ctx context.Context) { - sched.SchedulingQueue.Run() - wait.UntilWithContext(ctx, sched.scheduleOne, 0) - sched.SchedulingQueue.Close() -} - -// handleSchedulingFailure records an event for the pod that indicates the -// pod has failed to schedule. Also, update the pod condition and nominated node name if set. -func (sched *Scheduler) handleSchedulingFailure(fwk framework.Framework, podInfo *framework.QueuedPodInfo, err error, reason string, nominatingInfo *framework.NominatingInfo) { - sched.Error(podInfo, err) - - // Update the scheduling queue with the nominated pod information. Without - // this, there would be a race condition between the next scheduling cycle - // and the time the scheduler receives a Pod Update for the nominated pod. - // Here we check for nil only for tests. - if sched.SchedulingQueue != nil { - sched.SchedulingQueue.AddNominatedPod(podInfo.PodInfo, nominatingInfo) - } - - pod := podInfo.Pod - msg := truncateMessage(err.Error()) - fwk.EventRecorder().Eventf(pod, nil, v1.EventTypeWarning, "FailedScheduling", "Scheduling", msg) - if err := updatePod(sched.client, pod, &v1.PodCondition{ - Type: v1.PodScheduled, - Status: v1.ConditionFalse, - Reason: reason, - Message: err.Error(), - }, nominatingInfo); err != nil { - klog.ErrorS(err, "Error updating pod", "pod", klog.KObj(pod)) - } -} - -// truncateMessage truncates a message if it hits the NoteLengthLimit. -func truncateMessage(message string) string { - max := validation.NoteLengthLimit - if len(message) <= max { - return message - } - suffix := " ..." - return message[:max-len(suffix)] + suffix -} - -func updatePod(client clientset.Interface, pod *v1.Pod, condition *v1.PodCondition, nominatingInfo *framework.NominatingInfo) error { - klog.V(3).InfoS("Updating pod condition", "pod", klog.KObj(pod), "conditionType", condition.Type, "conditionStatus", condition.Status, "conditionReason", condition.Reason) - podStatusCopy := pod.Status.DeepCopy() - // NominatedNodeName is updated only if we are trying to set it, and the value is - // different from the existing one. 
- nnnNeedsUpdate := nominatingInfo.Mode() == framework.ModeOverride && pod.Status.NominatedNodeName != nominatingInfo.NominatedNodeName - if !podutil.UpdatePodCondition(podStatusCopy, condition) && !nnnNeedsUpdate { - return nil - } - if nnnNeedsUpdate { - podStatusCopy.NominatedNodeName = nominatingInfo.NominatedNodeName - } - return util.PatchPodStatus(client, pod, podStatusCopy) -} - -// assume signals to the cache that a pod is already in the cache, so that binding can be asynchronous. -// assume modifies `assumed`. -func (sched *Scheduler) assume(assumed *v1.Pod, host string) error { - // Optimistically assume that the binding will succeed and send it to apiserver - // in the background. - // If the binding fails, scheduler will release resources allocated to assumed pod - // immediately. - assumed.Spec.NodeName = host - - if err := sched.Cache.AssumePod(assumed); err != nil { - klog.ErrorS(err, "Scheduler cache AssumePod failed") - return err - } - // if "assumed" is a nominated pod, we should remove it from internal cache - if sched.SchedulingQueue != nil { - sched.SchedulingQueue.DeleteNominatedPodIfExists(assumed) - } - - return nil -} - -// bind binds a pod to a given node defined in a binding object. -// The precedence for binding is: (1) extenders and (2) framework plugins. -// We expect this to run asynchronously, so we handle binding metrics internally. -func (sched *Scheduler) bind(ctx context.Context, fwk framework.Framework, assumed *v1.Pod, targetNode string, state *framework.CycleState) (err error) { - defer func() { - sched.finishBinding(fwk, assumed, targetNode, err) - }() - - bound, err := sched.extendersBinding(assumed, targetNode) - if bound { - return err - } - bindStatus := fwk.RunBindPlugins(ctx, state, assumed, targetNode) - if bindStatus.IsSuccess() { - return nil - } - if bindStatus.Code() == framework.Error { - return bindStatus.AsError() - } - return fmt.Errorf("bind status: %s, %v", bindStatus.Code().String(), bindStatus.Message()) -} - -// TODO(#87159): Move this to a Plugin. -func (sched *Scheduler) extendersBinding(pod *v1.Pod, node string) (bool, error) { - for _, extender := range sched.Extenders { - if !extender.IsBinder() || !extender.IsInterested(pod) { - continue - } - return true, extender.Bind(&v1.Binding{ - ObjectMeta: metav1.ObjectMeta{Namespace: pod.Namespace, Name: pod.Name, UID: pod.UID}, - Target: v1.ObjectReference{Kind: "Node", Name: node}, - }) - } - return false, nil -} - -func (sched *Scheduler) finishBinding(fwk framework.Framework, assumed *v1.Pod, targetNode string, err error) { - if finErr := sched.Cache.FinishBinding(assumed); finErr != nil { - klog.ErrorS(finErr, "Scheduler cache FinishBinding failed") - } - if err != nil { - klog.V(1).InfoS("Failed to bind pod", "pod", klog.KObj(assumed)) - return - } - - fwk.EventRecorder().Eventf(assumed, nil, v1.EventTypeNormal, "Scheduled", "Binding", "Successfully assigned %v/%v to %v", assumed.Namespace, assumed.Name, targetNode) -} - -var ( - clearNominatedNode = &framework.NominatingInfo{NominatingMode: framework.ModeOverride, NominatedNodeName: ""} -) - -// scheduleOne does the entire scheduling workflow for a single pod. It is serialized on the scheduling algorithm's host fitting. 
-func (sched *Scheduler) scheduleOne(ctx context.Context) { - podInfo := sched.NextPod() - // pod could be nil when schedulerQueue is closed - if podInfo == nil || podInfo.Pod == nil { - return - } - pod := podInfo.Pod - fwk, err := sched.frameworkForPod(pod) - if err != nil { - // This shouldn't happen, because we only accept for scheduling the pods - // which specify a scheduler name that matches one of the profiles. - klog.ErrorS(err, "Error occurred") - return - } - if sched.skipPodSchedule(fwk, pod) { - return - } - - klog.V(3).InfoS("Attempting to schedule pod", "pod", klog.KObj(pod)) - - // Synchronously attempt to find a fit for the pod. - start := time.Now() - state := framework.NewCycleState() - state.SetRecordPluginMetrics(rand.Intn(100) < pluginMetricsSamplePercent) - // Initialize an empty podsToActivate struct, which will be filled up by plugins or stay empty. - podsToActivate := framework.NewPodsToActivate() - state.Write(framework.PodsToActivateKey, podsToActivate) - - schedulingCycleCtx, cancel := context.WithCancel(ctx) - defer cancel() - scheduleResult, err := sched.SchedulePod(schedulingCycleCtx, fwk, state, pod) - if err != nil { - // SchedulePod() may have failed because the pod would not fit on any host, so we try to - // preempt, with the expectation that the next time the pod is tried for scheduling it - // will fit due to the preemption. It is also possible that a different pod will schedule - // into the resources that were preempted, but this is harmless. - var nominatingInfo *framework.NominatingInfo - if fitError, ok := err.(*framework.FitError); ok { - if !fwk.HasPostFilterPlugins() { - klog.V(3).InfoS("No PostFilter plugins are registered, so no preemption will be performed") - } else { - // Run PostFilter plugins to try to make the pod schedulable in a future scheduling cycle. - result, status := fwk.RunPostFilterPlugins(ctx, state, pod, fitError.Diagnosis.NodeToStatusMap) - if status.Code() == framework.Error { - klog.ErrorS(nil, "Status after running PostFilter plugins for pod", "pod", klog.KObj(pod), "status", status) - } else { - fitError.Diagnosis.PostFilterMsg = status.Message() - klog.V(5).InfoS("Status after running PostFilter plugins for pod", "pod", klog.KObj(pod), "status", status) - } - if result != nil { - nominatingInfo = result.NominatingInfo - } - } - // Pod did not fit anywhere, so it is counted as a failure. If preemption - // succeeds, the pod should get counted as a success the next time we try to - // schedule it. (hopefully) - metrics.PodUnschedulable(fwk.ProfileName(), metrics.SinceInSeconds(start)) - } else if err == ErrNoNodesAvailable { - nominatingInfo = clearNominatedNode - // No nodes available is counted as unschedulable rather than an error. - metrics.PodUnschedulable(fwk.ProfileName(), metrics.SinceInSeconds(start)) - } else { - nominatingInfo = clearNominatedNode - klog.ErrorS(err, "Error selecting node for pod", "pod", klog.KObj(pod)) - metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start)) - } - sched.handleSchedulingFailure(fwk, podInfo, err, v1.PodReasonUnschedulable, nominatingInfo) - return - } - metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInSeconds(start)) - // Tell the cache to assume that a pod now is running on a given node, even though it hasn't been bound yet. - // This allows us to keep scheduling without waiting on binding to occur. 
- assumedPodInfo := podInfo.DeepCopy() - assumedPod := assumedPodInfo.Pod - // assume modifies `assumedPod` by setting NodeName=scheduleResult.SuggestedHost - err = sched.assume(assumedPod, scheduleResult.SuggestedHost) - if err != nil { - metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start)) - // This is most probably result of a BUG in retrying logic. - // We report an error here so that pod scheduling can be retried. - // This relies on the fact that Error will check if the pod has been bound - // to a node and if so will not add it back to the unscheduled pods queue - // (otherwise this would cause an infinite loop). - sched.handleSchedulingFailure(fwk, assumedPodInfo, err, SchedulerError, clearNominatedNode) - return - } - - // Run the Reserve method of reserve plugins. - if sts := fwk.RunReservePluginsReserve(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost); !sts.IsSuccess() { - metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start)) - // trigger un-reserve to clean up state associated with the reserved Pod - fwk.RunReservePluginsUnreserve(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) - if forgetErr := sched.Cache.ForgetPod(assumedPod); forgetErr != nil { - klog.ErrorS(forgetErr, "Scheduler cache ForgetPod failed") - } - sched.handleSchedulingFailure(fwk, assumedPodInfo, sts.AsError(), SchedulerError, clearNominatedNode) - return - } - - // Run "permit" plugins. - runPermitStatus := fwk.RunPermitPlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) - if runPermitStatus.Code() != framework.Wait && !runPermitStatus.IsSuccess() { - var reason string - if runPermitStatus.IsUnschedulable() { - metrics.PodUnschedulable(fwk.ProfileName(), metrics.SinceInSeconds(start)) - reason = v1.PodReasonUnschedulable - } else { - metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start)) - reason = SchedulerError - } - // One of the plugins returned status different than success or wait. - fwk.RunReservePluginsUnreserve(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) - if forgetErr := sched.Cache.ForgetPod(assumedPod); forgetErr != nil { - klog.ErrorS(forgetErr, "Scheduler cache ForgetPod failed") - } - sched.handleSchedulingFailure(fwk, assumedPodInfo, runPermitStatus.AsError(), reason, clearNominatedNode) - return - } - - // At the end of a successful scheduling cycle, pop and move up Pods if needed. - if len(podsToActivate.Map) != 0 { - sched.SchedulingQueue.Activate(podsToActivate.Map) - // Clear the entries after activation. - podsToActivate.Map = make(map[string]*v1.Pod) - } - - // bind the pod to its host asynchronously (we can do this b/c of the assumption step above). 
- go func() { - bindingCycleCtx, cancel := context.WithCancel(ctx) - defer cancel() - metrics.SchedulerGoroutines.WithLabelValues(metrics.Binding).Inc() - defer metrics.SchedulerGoroutines.WithLabelValues(metrics.Binding).Dec() - - waitOnPermitStatus := fwk.WaitOnPermit(bindingCycleCtx, assumedPod) - if !waitOnPermitStatus.IsSuccess() { - var reason string - if waitOnPermitStatus.IsUnschedulable() { - metrics.PodUnschedulable(fwk.ProfileName(), metrics.SinceInSeconds(start)) - reason = v1.PodReasonUnschedulable - } else { - metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start)) - reason = SchedulerError - } - // trigger un-reserve plugins to clean up state associated with the reserved Pod - fwk.RunReservePluginsUnreserve(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) - if forgetErr := sched.Cache.ForgetPod(assumedPod); forgetErr != nil { - klog.ErrorS(forgetErr, "scheduler cache ForgetPod failed") - } else { - // "Forget"ing an assumed Pod in binding cycle should be treated as a PodDelete event, - // as the assumed Pod had occupied a certain amount of resources in scheduler cache. - // TODO(#103853): de-duplicate the logic. - // Avoid moving the assumed Pod itself as it's always Unschedulable. - // It's intentional to "defer" this operation; otherwise MoveAllToActiveOrBackoffQueue() would - // update `q.moveRequest` and thus move the assumed pod to backoffQ anyways. - defer sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(internalqueue.AssignedPodDelete, func(pod *v1.Pod) bool { - return assumedPod.UID != pod.UID - }) - } - sched.handleSchedulingFailure(fwk, assumedPodInfo, waitOnPermitStatus.AsError(), reason, clearNominatedNode) - return - } - - // Run "prebind" plugins. - preBindStatus := fwk.RunPreBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) - if !preBindStatus.IsSuccess() { - metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start)) - // trigger un-reserve plugins to clean up state associated with the reserved Pod - fwk.RunReservePluginsUnreserve(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) - if forgetErr := sched.Cache.ForgetPod(assumedPod); forgetErr != nil { - klog.ErrorS(forgetErr, "scheduler cache ForgetPod failed") - } else { - // "Forget"ing an assumed Pod in binding cycle should be treated as a PodDelete event, - // as the assumed Pod had occupied a certain amount of resources in scheduler cache. - // TODO(#103853): de-duplicate the logic. - sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(internalqueue.AssignedPodDelete, nil) - } - sched.handleSchedulingFailure(fwk, assumedPodInfo, preBindStatus.AsError(), SchedulerError, clearNominatedNode) - return - } - - err := sched.bind(bindingCycleCtx, fwk, assumedPod, scheduleResult.SuggestedHost, state) - if err != nil { - metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start)) - // trigger un-reserve plugins to clean up state associated with the reserved Pod - fwk.RunReservePluginsUnreserve(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) - if err := sched.Cache.ForgetPod(assumedPod); err != nil { - klog.ErrorS(err, "scheduler cache ForgetPod failed") - } else { - // "Forget"ing an assumed Pod in binding cycle should be treated as a PodDelete event, - // as the assumed Pod had occupied a certain amount of resources in scheduler cache. - // TODO(#103853): de-duplicate the logic. 
- sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(internalqueue.AssignedPodDelete, nil) - } - sched.handleSchedulingFailure(fwk, assumedPodInfo, fmt.Errorf("binding rejected: %w", err), SchedulerError, clearNominatedNode) - return - } - // Calculating nodeResourceString can be heavy. Avoid it if klog verbosity is below 2. - klog.V(2).InfoS("Successfully bound pod to node", "pod", klog.KObj(pod), "node", scheduleResult.SuggestedHost, "evaluatedNodes", scheduleResult.EvaluatedNodes, "feasibleNodes", scheduleResult.FeasibleNodes) - metrics.PodScheduled(fwk.ProfileName(), metrics.SinceInSeconds(start)) - metrics.PodSchedulingAttempts.Observe(float64(podInfo.Attempts)) - metrics.PodSchedulingDuration.WithLabelValues(getAttemptsLabel(podInfo)).Observe(metrics.SinceInSeconds(podInfo.InitialAttemptTimestamp)) - - // Run "postbind" plugins. - fwk.RunPostBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost) - - // At the end of a successful binding cycle, move up Pods if needed. - if len(podsToActivate.Map) != 0 { - sched.SchedulingQueue.Activate(podsToActivate.Map) - // Unlike the logic in scheduling cycle, we don't bother deleting the entries - // as `podsToActivate.Map` is no longer consumed. - } - }() -} - -func getAttemptsLabel(p *framework.QueuedPodInfo) string { - // We breakdown the pod scheduling duration by attempts capped to a limit - // to avoid ending up with a high cardinality metric. - if p.Attempts >= 15 { - return "15+" - } - return strconv.Itoa(p.Attempts) -} - -func (sched *Scheduler) frameworkForPod(pod *v1.Pod) (framework.Framework, error) { - fwk, ok := sched.Profiles[pod.Spec.SchedulerName] - if !ok { - return nil, fmt.Errorf("profile not found for scheduler name %q", pod.Spec.SchedulerName) - } - return fwk, nil -} - -// skipPodSchedule returns true if we could skip scheduling the pod for specified cases. -func (sched *Scheduler) skipPodSchedule(fwk framework.Framework, pod *v1.Pod) bool { - // Case 1: pod is being deleted. - if pod.DeletionTimestamp != nil { - fwk.EventRecorder().Eventf(pod, nil, v1.EventTypeWarning, "FailedScheduling", "Scheduling", "skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name) - klog.V(3).InfoS("Skip schedule deleting pod", "pod", klog.KObj(pod)) - return true - } - - // Case 2: pod that has been assumed could be skipped. - // An assumed pod can be added again to the scheduling queue if it got an update event - // during its previous scheduling cycle but before getting assumed. - isAssumed, err := sched.Cache.IsAssumedPod(pod) - if err != nil { - utilruntime.HandleError(fmt.Errorf("failed to check whether pod %s/%s is assumed: %v", pod.Namespace, pod.Name, err)) - return false - } - return isAssumed -} - -// NewInformerFactory creates a SharedInformerFactory and initializes a scheduler specific -// in-place podInformer. -func NewInformerFactory(cs clientset.Interface, resyncPeriod time.Duration) informers.SharedInformerFactory { - informerFactory := informers.NewSharedInformerFactory(cs, resyncPeriod) - informerFactory.InformerFor(&v1.Pod{}, newPodInformer) - return informerFactory -} - // newPodInformer creates a shared index informer that returns only non-terminal pods. 
func newPodInformer(cs clientset.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { selector := fmt.Sprintf("status.phase!=%v,status.phase!=%v", v1.PodSucceeded, v1.PodFailed) @@ -793,434 +503,3 @@ func newPodInformer(cs clientset.Interface, resyncPeriod time.Duration) cache.Sh } return coreinformers.NewFilteredPodInformer(cs, metav1.NamespaceAll, resyncPeriod, nil, tweakListOptions) } - -// snapshot snapshots scheduler cache and node infos for all fit and priority -// functions. -func (sched *Scheduler) snapshot() error { - // Used for all fit and priority funcs. - return sched.Cache.UpdateSnapshot(sched.nodeInfoSnapshot) -} - -// schedulePod tries to schedule the given pod to one of the nodes in the node list. -// If it succeeds, it will return the name of the node. -// If it fails, it will return a FitError with reasons. -func (sched *Scheduler) schedulePod(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) (result ScheduleResult, err error) { - trace := utiltrace.New("Scheduling", utiltrace.Field{Key: "namespace", Value: pod.Namespace}, utiltrace.Field{Key: "name", Value: pod.Name}) - defer trace.LogIfLong(100 * time.Millisecond) - - if err := sched.snapshot(); err != nil { - return result, err - } - trace.Step("Snapshotting scheduler cache and node infos done") - - if sched.nodeInfoSnapshot.NumNodes() == 0 { - return result, ErrNoNodesAvailable - } - - feasibleNodes, diagnosis, err := sched.findNodesThatFitPod(ctx, fwk, state, pod) - if err != nil { - return result, err - } - trace.Step("Computing predicates done") - - if len(feasibleNodes) == 0 { - return result, &framework.FitError{ - Pod: pod, - NumAllNodes: sched.nodeInfoSnapshot.NumNodes(), - Diagnosis: diagnosis, - } - } - - // When only one node after predicate, just use it. - if len(feasibleNodes) == 1 { - return ScheduleResult{ - SuggestedHost: feasibleNodes[0].Name, - EvaluatedNodes: 1 + len(diagnosis.NodeToStatusMap), - FeasibleNodes: 1, - }, nil - } - - priorityList, err := prioritizeNodes(ctx, sched.Extenders, fwk, state, pod, feasibleNodes) - if err != nil { - return result, err - } - - host, err := selectHost(priorityList) - trace.Step("Prioritizing done") - - return ScheduleResult{ - SuggestedHost: host, - EvaluatedNodes: len(feasibleNodes) + len(diagnosis.NodeToStatusMap), - FeasibleNodes: len(feasibleNodes), - }, err -} - -// selectHost takes a prioritized list of nodes and then picks one -// in a reservoir sampling manner from the nodes that had the highest score. -func selectHost(nodeScoreList framework.NodeScoreList) (string, error) { - if len(nodeScoreList) == 0 { - return "", fmt.Errorf("empty priorityList") - } - maxScore := nodeScoreList[0].Score - selected := nodeScoreList[0].Name - cntOfMaxScore := 1 - for _, ns := range nodeScoreList[1:] { - if ns.Score > maxScore { - maxScore = ns.Score - selected = ns.Name - cntOfMaxScore = 1 - } else if ns.Score == maxScore { - cntOfMaxScore++ - if rand.Intn(cntOfMaxScore) == 0 { - // Replace the candidate with probability of 1/cntOfMaxScore - selected = ns.Name - } - } - } - return selected, nil -} - -// numFeasibleNodesToFind returns the number of feasible nodes that once found, the scheduler stops -// its search for more feasible nodes. 
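Editor's aside (numFeasibleNodesToFind itself follows below): selectHost above breaks ties among top-scoring nodes with single-pass reservoir sampling — the k-th node seen at the current maximum replaces the candidate with probability 1/k, which gives every max-score node an equal chance. A standalone sketch of the same idea, with illustrative names rather than the scheduler's types:

package main

import (
	"fmt"
	"math/rand"
)

type scored struct {
	name  string
	score int64
}

// pickMax returns one name chosen uniformly at random among the entries that
// share the highest score, visiting the slice exactly once (reservoir
// sampling with a reservoir of size one).
func pickMax(list []scored) string {
	if len(list) == 0 {
		return ""
	}
	maxScore, selected, cnt := list[0].score, list[0].name, 1
	for _, s := range list[1:] {
		switch {
		case s.score > maxScore:
			maxScore, selected, cnt = s.score, s.name, 1
		case s.score == maxScore:
			cnt++
			if rand.Intn(cnt) == 0 { // replace with probability 1/cnt
				selected = s.name
			}
		}
	}
	return selected
}

func main() {
	// "b" or "c", each with probability 1/2; "a" is never picked.
	fmt.Println(pickMax([]scored{{"a", 90}, {"b", 100}, {"c", 100}}))
}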
-func (sched *Scheduler) numFeasibleNodesToFind(numAllNodes int32) (numNodes int32) { - if numAllNodes < minFeasibleNodesToFind || sched.percentageOfNodesToScore >= 100 { - return numAllNodes - } - - adaptivePercentage := sched.percentageOfNodesToScore - if adaptivePercentage <= 0 { - basePercentageOfNodesToScore := int32(50) - adaptivePercentage = basePercentageOfNodesToScore - numAllNodes/125 - if adaptivePercentage < minFeasibleNodesPercentageToFind { - adaptivePercentage = minFeasibleNodesPercentageToFind - } - } - - numNodes = numAllNodes * adaptivePercentage / 100 - if numNodes < minFeasibleNodesToFind { - return minFeasibleNodesToFind - } - - return numNodes -} - -func (sched *Scheduler) evaluateNominatedNode(ctx context.Context, pod *v1.Pod, fwk framework.Framework, state *framework.CycleState, diagnosis framework.Diagnosis) ([]*v1.Node, error) { - nnn := pod.Status.NominatedNodeName - nodeInfo, err := sched.nodeInfoSnapshot.Get(nnn) - if err != nil { - return nil, err - } - node := []*framework.NodeInfo{nodeInfo} - feasibleNodes, err := sched.findNodesThatPassFilters(ctx, fwk, state, pod, diagnosis, node) - if err != nil { - return nil, err - } - - feasibleNodes, err = findNodesThatPassExtenders(sched.Extenders, pod, feasibleNodes, diagnosis.NodeToStatusMap) - if err != nil { - return nil, err - } - - return feasibleNodes, nil -} - -// Filters the nodes to find the ones that fit the pod based on the framework -// filter plugins and filter extenders. -func (sched *Scheduler) findNodesThatFitPod(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) ([]*v1.Node, framework.Diagnosis, error) { - diagnosis := framework.Diagnosis{ - NodeToStatusMap: make(framework.NodeToStatusMap), - UnschedulablePlugins: sets.NewString(), - } - - // Run "prefilter" plugins. - preRes, s := fwk.RunPreFilterPlugins(ctx, state, pod) - allNodes, err := sched.nodeInfoSnapshot.NodeInfos().List() - if err != nil { - return nil, diagnosis, err - } - if !s.IsSuccess() { - if !s.IsUnschedulable() { - return nil, diagnosis, s.AsError() - } - // All nodes will have the same status. Some non trivial refactoring is - // needed to avoid this copy. - for _, n := range allNodes { - diagnosis.NodeToStatusMap[n.Node().Name] = s - } - // Status satisfying IsUnschedulable() gets injected into diagnosis.UnschedulablePlugins. - if s.FailedPlugin() != "" { - diagnosis.UnschedulablePlugins.Insert(s.FailedPlugin()) - } - return nil, diagnosis, nil - } - - // "NominatedNodeName" can potentially be set in a previous scheduling cycle as a result of preemption. - // This node is likely the only candidate that will fit the pod, and hence we try it first before iterating over all nodes. - if len(pod.Status.NominatedNodeName) > 0 { - feasibleNodes, err := sched.evaluateNominatedNode(ctx, pod, fwk, state, diagnosis) - if err != nil { - klog.ErrorS(err, "Evaluation failed on nominated node", "pod", klog.KObj(pod), "node", pod.Status.NominatedNodeName) - } - // Nominated node passes all the filters, scheduler is good to assign this node to the pod. 
- if len(feasibleNodes) != 0 { - return feasibleNodes, diagnosis, nil - } - } - - nodes := allNodes - if !preRes.AllNodes() { - nodes = make([]*framework.NodeInfo, 0, len(preRes.NodeNames)) - for n := range preRes.NodeNames { - nInfo, err := sched.nodeInfoSnapshot.NodeInfos().Get(n) - if err != nil { - return nil, diagnosis, err - } - nodes = append(nodes, nInfo) - } - } - feasibleNodes, err := sched.findNodesThatPassFilters(ctx, fwk, state, pod, diagnosis, nodes) - if err != nil { - return nil, diagnosis, err - } - - feasibleNodes, err = findNodesThatPassExtenders(sched.Extenders, pod, feasibleNodes, diagnosis.NodeToStatusMap) - if err != nil { - return nil, diagnosis, err - } - return feasibleNodes, diagnosis, nil -} - -// findNodesThatPassFilters finds the nodes that fit the filter plugins. -func (sched *Scheduler) findNodesThatPassFilters( - ctx context.Context, - fwk framework.Framework, - state *framework.CycleState, - pod *v1.Pod, - diagnosis framework.Diagnosis, - nodes []*framework.NodeInfo) ([]*v1.Node, error) { - numNodesToFind := sched.numFeasibleNodesToFind(int32(len(nodes))) - - // Create feasible list with enough space to avoid growing it - // and allow assigning. - feasibleNodes := make([]*v1.Node, numNodesToFind) - - if !fwk.HasFilterPlugins() { - length := len(nodes) - for i := range feasibleNodes { - feasibleNodes[i] = nodes[(sched.nextStartNodeIndex+i)%length].Node() - } - sched.nextStartNodeIndex = (sched.nextStartNodeIndex + len(feasibleNodes)) % length - return feasibleNodes, nil - } - - errCh := parallelize.NewErrorChannel() - var statusesLock sync.Mutex - var feasibleNodesLen int32 - ctx, cancel := context.WithCancel(ctx) - checkNode := func(i int) { - // We check the nodes starting from where we left off in the previous scheduling cycle, - // this is to make sure all nodes have the same chance of being examined across pods. - nodeInfo := nodes[(sched.nextStartNodeIndex+i)%len(nodes)] - status := fwk.RunFilterPluginsWithNominatedPods(ctx, state, pod, nodeInfo) - if status.Code() == framework.Error { - errCh.SendErrorWithCancel(status.AsError(), cancel) - return - } - if status.IsSuccess() { - length := atomic.AddInt32(&feasibleNodesLen, 1) - if length > numNodesToFind { - cancel() - atomic.AddInt32(&feasibleNodesLen, -1) - } else { - feasibleNodes[length-1] = nodeInfo.Node() - } - } else { - statusesLock.Lock() - diagnosis.NodeToStatusMap[nodeInfo.Node().Name] = status - diagnosis.UnschedulablePlugins.Insert(status.FailedPlugin()) - statusesLock.Unlock() - } - } - - beginCheckNode := time.Now() - statusCode := framework.Success - defer func() { - // We record Filter extension point latency here instead of in framework.go because framework.RunFilterPlugins - // function is called for each node, whereas we want to have an overall latency for all nodes per scheduling cycle. - // Note that this latency also includes latency for `addNominatedPods`, which calls framework.RunPreFilterAddPod. - metrics.FrameworkExtensionPointDuration.WithLabelValues(frameworkruntime.Filter, statusCode.String(), fwk.ProfileName()).Observe(metrics.SinceInSeconds(beginCheckNode)) - }() - - // Stops searching for more nodes once the configured number of feasible nodes - // are found. 
- fwk.Parallelizer().Until(ctx, len(nodes), checkNode) - processedNodes := int(feasibleNodesLen) + len(diagnosis.NodeToStatusMap) - sched.nextStartNodeIndex = (sched.nextStartNodeIndex + processedNodes) % len(nodes) - - feasibleNodes = feasibleNodes[:feasibleNodesLen] - if err := errCh.ReceiveError(); err != nil { - statusCode = framework.Error - return nil, err - } - return feasibleNodes, nil -} - -func findNodesThatPassExtenders(extenders []framework.Extender, pod *v1.Pod, feasibleNodes []*v1.Node, statuses framework.NodeToStatusMap) ([]*v1.Node, error) { - // Extenders are called sequentially. - // Nodes in original feasibleNodes can be excluded in one extender, and pass on to the next - // extender in a decreasing manner. - for _, extender := range extenders { - if len(feasibleNodes) == 0 { - break - } - if !extender.IsInterested(pod) { - continue - } - - // Status of failed nodes in failedAndUnresolvableMap will be added or overwritten in , - // so that the scheduler framework can respect the UnschedulableAndUnresolvable status for - // particular nodes, and this may eventually improve preemption efficiency. - // Note: users are recommended to configure the extenders that may return UnschedulableAndUnresolvable - // status ahead of others. - feasibleList, failedMap, failedAndUnresolvableMap, err := extender.Filter(pod, feasibleNodes) - if err != nil { - if extender.IsIgnorable() { - klog.InfoS("Skipping extender as it returned error and has ignorable flag set", "extender", extender, "err", err) - continue - } - return nil, err - } - - for failedNodeName, failedMsg := range failedAndUnresolvableMap { - var aggregatedReasons []string - if _, found := statuses[failedNodeName]; found { - aggregatedReasons = statuses[failedNodeName].Reasons() - } - aggregatedReasons = append(aggregatedReasons, failedMsg) - statuses[failedNodeName] = framework.NewStatus(framework.UnschedulableAndUnresolvable, aggregatedReasons...) - } - - for failedNodeName, failedMsg := range failedMap { - if _, found := failedAndUnresolvableMap[failedNodeName]; found { - // failedAndUnresolvableMap takes precedence over failedMap - // note that this only happens if the extender returns the node in both maps - continue - } - if _, found := statuses[failedNodeName]; !found { - statuses[failedNodeName] = framework.NewStatus(framework.Unschedulable, failedMsg) - } else { - statuses[failedNodeName].AppendReason(failedMsg) - } - } - - feasibleNodes = feasibleList - } - return feasibleNodes, nil -} - -// prioritizeNodes prioritizes the nodes by running the score plugins, -// which return a score for each node from the call to RunScorePlugins(). -// The scores from each plugin are added together to make the score for that node, then -// any extenders are run as well. -// All scores are finally combined (added) to get the total weighted scores of all nodes -func prioritizeNodes( - ctx context.Context, - extenders []framework.Extender, - fwk framework.Framework, - state *framework.CycleState, - pod *v1.Pod, - nodes []*v1.Node, -) (framework.NodeScoreList, error) { - // If no priority configs are provided, then all nodes will have a score of one. - // This is required to generate the priority list in the required format - if len(extenders) == 0 && !fwk.HasScorePlugins() { - result := make(framework.NodeScoreList, 0, len(nodes)) - for i := range nodes { - result = append(result, framework.NodeScore{ - Name: nodes[i].Name, - Score: 1, - }) - } - return result, nil - } - - // Run PreScore plugins. 
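Editor's aside (the PreScore call follows below): findNodesThatPassExtenders above merges each extender's two failure maps into the per-node status map with a fixed precedence — an UnschedulableAndUnresolvable verdict wins over a plain Unschedulable one for the same node, and reasons accumulate otherwise. The sketch below illustrates only that precedence; the types are simplified stand-ins, not the framework's Status machinery.

package main

import "fmt"

type verdict string

const (
	unschedulable             verdict = "Unschedulable"
	unschedulableUnresolvable verdict = "UnschedulableAndUnresolvable"
)

// merge records hard (unresolvable) failures first, then soft failures,
// skipping any node that already carries the stronger verdict.
func merge(statuses map[string]verdict, failed, failedAndUnresolvable map[string]string) {
	for node := range failedAndUnresolvable {
		statuses[node] = unschedulableUnresolvable
	}
	for node := range failed {
		if _, hard := failedAndUnresolvable[node]; hard {
			continue // the unresolvable verdict takes precedence
		}
		if _, ok := statuses[node]; !ok {
			statuses[node] = unschedulable
		}
	}
}

func main() {
	statuses := map[string]verdict{}
	merge(statuses,
		map[string]string{"node-a": "too many pods", "node-b": "port conflict"},
		map[string]string{"node-a": "missing device"})
	// map[node-a:UnschedulableAndUnresolvable node-b:Unschedulable]
	fmt.Println(statuses)
}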
- preScoreStatus := fwk.RunPreScorePlugins(ctx, state, pod, nodes) - if !preScoreStatus.IsSuccess() { - return nil, preScoreStatus.AsError() - } - - // Run the Score plugins. - scoresMap, scoreStatus := fwk.RunScorePlugins(ctx, state, pod, nodes) - if !scoreStatus.IsSuccess() { - return nil, scoreStatus.AsError() - } - - // Additional details logged at level 10 if enabled. - klogV := klog.V(10) - if klogV.Enabled() { - for plugin, nodeScoreList := range scoresMap { - for _, nodeScore := range nodeScoreList { - klogV.InfoS("Plugin scored node for pod", "pod", klog.KObj(pod), "plugin", plugin, "node", nodeScore.Name, "score", nodeScore.Score) - } - } - } - - // Summarize all scores. - result := make(framework.NodeScoreList, 0, len(nodes)) - - for i := range nodes { - result = append(result, framework.NodeScore{Name: nodes[i].Name, Score: 0}) - for j := range scoresMap { - result[i].Score += scoresMap[j][i].Score - } - } - - if len(extenders) != 0 && nodes != nil { - var mu sync.Mutex - var wg sync.WaitGroup - combinedScores := make(map[string]int64, len(nodes)) - for i := range extenders { - if !extenders[i].IsInterested(pod) { - continue - } - wg.Add(1) - go func(extIndex int) { - metrics.SchedulerGoroutines.WithLabelValues(metrics.PrioritizingExtender).Inc() - defer func() { - metrics.SchedulerGoroutines.WithLabelValues(metrics.PrioritizingExtender).Dec() - wg.Done() - }() - prioritizedList, weight, err := extenders[extIndex].Prioritize(pod, nodes) - if err != nil { - // Prioritization errors from extender can be ignored, let k8s/other extenders determine the priorities - klog.V(5).InfoS("Failed to run extender's priority function. No score given by this extender.", "error", err, "pod", klog.KObj(pod), "extender", extenders[extIndex].Name()) - return - } - mu.Lock() - for i := range *prioritizedList { - host, score := (*prioritizedList)[i].Host, (*prioritizedList)[i].Score - if klogV.Enabled() { - klogV.InfoS("Extender scored node for pod", "pod", klog.KObj(pod), "extender", extenders[extIndex].Name(), "node", host, "score", score) - } - combinedScores[host] += score * weight - } - mu.Unlock() - }(i) - } - // wait for all go routines to finish - wg.Wait() - for i := range result { - // MaxExtenderPriority may diverge from the max priority used in the scheduler and defined by MaxNodeScore, - // therefore we need to scale the score returned by extenders to the score range used by the scheduler. 
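Editor's aside (the scaling itself is applied on the next line): a quick arithmetic check of the extender score scaling discussed in the comment above. The constants are assumed to be their usual values, MaxNodeScore = 100 and MaxExtenderPriority = 10, both defined outside this file.

package main

import "fmt"

func main() {
	// With the assumed constants, an extender's weighted score of 7 lands at
	// 70 on the scheduler's 0-100 scale before being added to the plugin total.
	const maxNodeScore, maxExtenderPriority = int64(100), int64(10)
	combined := int64(7) // score * weight as returned by the extender
	fmt.Println(combined * (maxNodeScore / maxExtenderPriority)) // 70
}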
- result[i].Score += combinedScores[result[i].Name] * (framework.MaxNodeScore / extenderv1.MaxExtenderPriority) - } - } - - if klogV.Enabled() { - for i := range result { - klogV.InfoS("Calculated node's final score for pod", "pod", klog.KObj(pod), "node", result[i].Name, "score", result[i].Score) - } - } - return result, nil -} diff --git a/pkg/scheduler/scheduler_test.go b/pkg/scheduler/scheduler_test.go index b1133a50faf..9ded8203db1 100644 --- a/pkg/scheduler/scheduler_test.go +++ b/pkg/scheduler/scheduler_test.go @@ -18,254 +18,31 @@ package scheduler import ( "context" - "errors" "fmt" - "math" - "reflect" - "regexp" "sort" - "strconv" "strings" - "sync" "testing" "time" "github.com/google/go-cmp/cmp" v1 "k8s.io/api/core/v1" - eventsv1 "k8s.io/api/events/v1" - "k8s.io/apimachinery/pkg/api/resource" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/sets" - "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes/fake" - clientsetfake "k8s.io/client-go/kubernetes/fake" - "k8s.io/client-go/kubernetes/scheme" - clienttesting "k8s.io/client-go/testing" - clientcache "k8s.io/client-go/tools/cache" + "k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/events" - "k8s.io/component-helpers/storage/volume" schedulerapi "k8s.io/kubernetes/pkg/scheduler/apis/config" "k8s.io/kubernetes/pkg/scheduler/framework" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultbinder" - "k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature" - "k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeports" - "k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources" - "k8s.io/kubernetes/pkg/scheduler/framework/plugins/podtopologyspread" - "k8s.io/kubernetes/pkg/scheduler/framework/plugins/queuesort" - "k8s.io/kubernetes/pkg/scheduler/framework/plugins/selectorspread" - "k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding" frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime" internalcache "k8s.io/kubernetes/pkg/scheduler/internal/cache" - fakecache "k8s.io/kubernetes/pkg/scheduler/internal/cache/fake" internalqueue "k8s.io/kubernetes/pkg/scheduler/internal/queue" "k8s.io/kubernetes/pkg/scheduler/profile" - st "k8s.io/kubernetes/pkg/scheduler/testing" - schedutil "k8s.io/kubernetes/pkg/scheduler/util" - "k8s.io/utils/pointer" + testingclock "k8s.io/utils/clock/testing" ) -var podTopologySpreadFunc = frameworkruntime.FactoryAdapter(feature.Features{}, podtopologyspread.New) - -func podWithID(id, desiredHost string) *v1.Pod { - return &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: id, - UID: types.UID(id), - }, - Spec: v1.PodSpec{ - NodeName: desiredHost, - SchedulerName: testSchedulerName, - }, - } -} - -func deletingPod(id string) *v1.Pod { - deletionTimestamp := metav1.Now() - return &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: id, - UID: types.UID(id), - DeletionTimestamp: &deletionTimestamp, - }, - Spec: v1.PodSpec{ - NodeName: "", - SchedulerName: testSchedulerName, - }, - } -} - -func podWithPort(id, desiredHost string, port int) *v1.Pod { - pod := podWithID(id, desiredHost) - pod.Spec.Containers = []v1.Container{ - {Name: "ctr", Ports: []v1.ContainerPort{{HostPort: int32(port)}}}, - } - return pod -} - -func podWithResources(id, desiredHost string, limits v1.ResourceList, requests v1.ResourceList) *v1.Pod { - pod := podWithID(id, desiredHost) - pod.Spec.Containers = 
[]v1.Container{ - {Name: "ctr", Resources: v1.ResourceRequirements{Limits: limits, Requests: requests}}, - } - return pod -} - -type mockScheduleResult struct { - result ScheduleResult - err error -} - -var ( - errPrioritize = fmt.Errorf("priority map encounters an error") -) - -type noPodsFilterPlugin struct{} - -// Name returns name of the plugin. -func (pl *noPodsFilterPlugin) Name() string { - return "NoPodsFilter" -} - -// Filter invoked at the filter extension point. -func (pl *noPodsFilterPlugin) Filter(_ context.Context, _ *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status { - if len(nodeInfo.Pods) == 0 { - return nil - } - return framework.NewStatus(framework.Unschedulable, st.ErrReasonFake) -} - -// NewNoPodsFilterPlugin initializes a noPodsFilterPlugin and returns it. -func NewNoPodsFilterPlugin(_ runtime.Object, _ framework.Handle) (framework.Plugin, error) { - return &noPodsFilterPlugin{}, nil -} - -type numericMapPlugin struct{} - -func newNumericMapPlugin() frameworkruntime.PluginFactory { - return func(_ runtime.Object, _ framework.Handle) (framework.Plugin, error) { - return &numericMapPlugin{}, nil - } -} - -func (pl *numericMapPlugin) Name() string { - return "NumericMap" -} - -func (pl *numericMapPlugin) Score(_ context.Context, _ *framework.CycleState, _ *v1.Pod, nodeName string) (int64, *framework.Status) { - score, err := strconv.Atoi(nodeName) - if err != nil { - return 0, framework.NewStatus(framework.Error, fmt.Sprintf("Error converting nodename to int: %+v", nodeName)) - } - return int64(score), nil -} - -func (pl *numericMapPlugin) ScoreExtensions() framework.ScoreExtensions { - return nil -} - -type reverseNumericMapPlugin struct{} - -func newReverseNumericMapPlugin() frameworkruntime.PluginFactory { - return func(_ runtime.Object, _ framework.Handle) (framework.Plugin, error) { - return &reverseNumericMapPlugin{}, nil - } -} - -func (pl *reverseNumericMapPlugin) Name() string { - return "ReverseNumericMap" -} - -func (pl *reverseNumericMapPlugin) Score(_ context.Context, _ *framework.CycleState, _ *v1.Pod, nodeName string) (int64, *framework.Status) { - score, err := strconv.Atoi(nodeName) - if err != nil { - return 0, framework.NewStatus(framework.Error, fmt.Sprintf("Error converting nodename to int: %+v", nodeName)) - } - return int64(score), nil -} - -func (pl *reverseNumericMapPlugin) ScoreExtensions() framework.ScoreExtensions { - return pl -} - -func (pl *reverseNumericMapPlugin) NormalizeScore(_ context.Context, _ *framework.CycleState, _ *v1.Pod, nodeScores framework.NodeScoreList) *framework.Status { - var maxScore float64 - minScore := math.MaxFloat64 - - for _, hostPriority := range nodeScores { - maxScore = math.Max(maxScore, float64(hostPriority.Score)) - minScore = math.Min(minScore, float64(hostPriority.Score)) - } - for i, hostPriority := range nodeScores { - nodeScores[i] = framework.NodeScore{ - Name: hostPriority.Name, - Score: int64(maxScore + minScore - float64(hostPriority.Score)), - } - } - return nil -} - -type trueMapPlugin struct{} - -func newTrueMapPlugin() frameworkruntime.PluginFactory { - return func(_ runtime.Object, _ framework.Handle) (framework.Plugin, error) { - return &trueMapPlugin{}, nil - } -} - -func (pl *trueMapPlugin) Name() string { - return "TrueMap" -} - -func (pl *trueMapPlugin) Score(_ context.Context, _ *framework.CycleState, _ *v1.Pod, _ string) (int64, *framework.Status) { - return 1, nil -} - -func (pl *trueMapPlugin) ScoreExtensions() framework.ScoreExtensions { - return pl -} 
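Editor's aside: the reverseNumericMapPlugin above normalizes by mapping each score to max + min - score, which reverses the ordering while keeping the scores inside the original range. A tiny check of that inversion, independent of the test fixtures:

package main

import "fmt"

func main() {
	// Scores 1, 3, 5 with min = 1 and max = 5 invert to 5, 3, 1.
	scores := []int64{1, 3, 5}
	lo, hi := int64(1), int64(5)
	for _, s := range scores {
		fmt.Print(hi+lo-s, " ") // 5 3 1
	}
	fmt.Println()
}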
- -func (pl *trueMapPlugin) NormalizeScore(_ context.Context, _ *framework.CycleState, _ *v1.Pod, nodeScores framework.NodeScoreList) *framework.Status { - for _, host := range nodeScores { - if host.Name == "" { - return framework.NewStatus(framework.Error, "unexpected empty host name") - } - } - return nil -} - -type falseMapPlugin struct{} - -func newFalseMapPlugin() frameworkruntime.PluginFactory { - return func(_ runtime.Object, _ framework.Handle) (framework.Plugin, error) { - return &falseMapPlugin{}, nil - } -} - -func (pl *falseMapPlugin) Name() string { - return "FalseMap" -} - -func (pl *falseMapPlugin) Score(_ context.Context, _ *framework.CycleState, _ *v1.Pod, _ string) (int64, *framework.Status) { - return 0, framework.AsStatus(errPrioritize) -} - -func (pl *falseMapPlugin) ScoreExtensions() framework.ScoreExtensions { - return nil -} - -var emptySnapshot = internalcache.NewEmptySnapshot() - -func makeNodeList(nodeNames []string) []*v1.Node { - result := make([]*v1.Node, 0, len(nodeNames)) - for _, nodeName := range nodeNames { - result = append(result, &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}}) - } - return result -} - func TestSchedulerCreation(t *testing.T) { invalidRegistry := map[string]frameworkruntime.PluginFactory{ defaultbinder.Name: defaultbinder.New, @@ -385,7 +162,7 @@ func TestSchedulerCreation(t *testing.T) { for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { - client := clientsetfake.NewSimpleClientset() + client := fake.NewSimpleClientset() informerFactory := informers.NewSharedInformerFactory(client, 0) eventBroadcaster := events.NewBroadcaster(&events.EventSinkImpl{Interface: client.EventsV1()}) @@ -448,2536 +225,202 @@ func TestSchedulerCreation(t *testing.T) { } } -func TestSchedulerScheduleOne(t *testing.T) { - testNode := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}} - client := clientsetfake.NewSimpleClientset(&testNode) - eventBroadcaster := events.NewBroadcaster(&events.EventSinkImpl{Interface: client.EventsV1()}) - errS := errors.New("scheduler") - errB := errors.New("binder") - preBindErr := errors.New("on PreBind") +func TestDefaultErrorFunc(t *testing.T) { + testPod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-pod", Namespace: "default"}} + testPodUpdated := testPod.DeepCopy() + testPodUpdated.Labels = map[string]string{"foo": ""} - table := []struct { - name string - injectBindError error - sendPod *v1.Pod - registerPluginFuncs []st.RegisterPluginFunc - expectErrorPod *v1.Pod - expectForgetPod *v1.Pod - expectAssumedPod *v1.Pod - expectError error - expectBind *v1.Binding - eventReason string - mockResult mockScheduleResult + tests := []struct { + name string + injectErr error + podUpdatedDuringScheduling bool // pod is updated during a scheduling cycle + podDeletedDuringScheduling bool // pod is deleted during a scheduling cycle + expect *v1.Pod }{ { - name: "error reserve pod", - sendPod: podWithID("foo", ""), - mockResult: mockScheduleResult{ScheduleResult{SuggestedHost: testNode.Name, EvaluatedNodes: 1, FeasibleNodes: 1}, nil}, - registerPluginFuncs: []st.RegisterPluginFunc{ - st.RegisterReservePlugin("FakeReserve", st.NewFakeReservePlugin(framework.NewStatus(framework.Error, "reserve error"))), - }, - expectErrorPod: podWithID("foo", testNode.Name), - expectForgetPod: podWithID("foo", testNode.Name), - expectAssumedPod: podWithID("foo", testNode.Name), - expectError: fmt.Errorf(`running Reserve plugin "FakeReserve": %w`, errors.New("reserve error")), - eventReason: 
"FailedScheduling", + name: "pod is updated during a scheduling cycle", + injectErr: nil, + podUpdatedDuringScheduling: true, + expect: testPodUpdated, }, { - name: "error permit pod", - sendPod: podWithID("foo", ""), - mockResult: mockScheduleResult{ScheduleResult{SuggestedHost: testNode.Name, EvaluatedNodes: 1, FeasibleNodes: 1}, nil}, - registerPluginFuncs: []st.RegisterPluginFunc{ - st.RegisterPermitPlugin("FakePermit", st.NewFakePermitPlugin(framework.NewStatus(framework.Error, "permit error"), time.Minute)), - }, - expectErrorPod: podWithID("foo", testNode.Name), - expectForgetPod: podWithID("foo", testNode.Name), - expectAssumedPod: podWithID("foo", testNode.Name), - expectError: fmt.Errorf(`running Permit plugin "FakePermit": %w`, errors.New("permit error")), - eventReason: "FailedScheduling", + name: "pod is not updated during a scheduling cycle", + injectErr: nil, + expect: testPod, }, { - name: "error prebind pod", - sendPod: podWithID("foo", ""), - mockResult: mockScheduleResult{ScheduleResult{SuggestedHost: testNode.Name, EvaluatedNodes: 1, FeasibleNodes: 1}, nil}, - registerPluginFuncs: []st.RegisterPluginFunc{ - st.RegisterPreBindPlugin("FakePreBind", st.NewFakePreBindPlugin(framework.AsStatus(preBindErr))), - }, - expectErrorPod: podWithID("foo", testNode.Name), - expectForgetPod: podWithID("foo", testNode.Name), - expectAssumedPod: podWithID("foo", testNode.Name), - expectError: fmt.Errorf(`running PreBind plugin "FakePreBind": %w`, preBindErr), - eventReason: "FailedScheduling", - }, - { - name: "bind assumed pod scheduled", - sendPod: podWithID("foo", ""), - mockResult: mockScheduleResult{ScheduleResult{SuggestedHost: testNode.Name, EvaluatedNodes: 1, FeasibleNodes: 1}, nil}, - expectBind: &v1.Binding{ObjectMeta: metav1.ObjectMeta{Name: "foo", UID: types.UID("foo")}, Target: v1.ObjectReference{Kind: "Node", Name: testNode.Name}}, - expectAssumedPod: podWithID("foo", testNode.Name), - eventReason: "Scheduled", - }, - { - name: "error pod failed scheduling", - sendPod: podWithID("foo", ""), - mockResult: mockScheduleResult{ScheduleResult{SuggestedHost: testNode.Name, EvaluatedNodes: 1, FeasibleNodes: 1}, errS}, - expectError: errS, - expectErrorPod: podWithID("foo", ""), - eventReason: "FailedScheduling", - }, - { - name: "error bind forget pod failed scheduling", - sendPod: podWithID("foo", ""), - mockResult: mockScheduleResult{ScheduleResult{SuggestedHost: testNode.Name, EvaluatedNodes: 1, FeasibleNodes: 1}, nil}, - expectBind: &v1.Binding{ObjectMeta: metav1.ObjectMeta{Name: "foo", UID: types.UID("foo")}, Target: v1.ObjectReference{Kind: "Node", Name: testNode.Name}}, - expectAssumedPod: podWithID("foo", testNode.Name), - injectBindError: errB, - expectError: fmt.Errorf(`binding rejected: %w`, fmt.Errorf("running Bind plugin %q: %w", "DefaultBinder", errors.New("binder"))), - expectErrorPod: podWithID("foo", testNode.Name), - expectForgetPod: podWithID("foo", testNode.Name), - eventReason: "FailedScheduling", - }, - { - name: "deleting pod", - sendPod: deletingPod("foo"), - mockResult: mockScheduleResult{ScheduleResult{}, nil}, - eventReason: "FailedScheduling", + name: "pod is deleted during a scheduling cycle", + injectErr: nil, + podDeletedDuringScheduling: true, + expect: nil, }, } - for _, item := range table { - t.Run(item.name, func(t *testing.T) { - var gotError error - var gotPod *v1.Pod - var gotForgetPod *v1.Pod - var gotAssumedPod *v1.Pod - var gotBinding *v1.Binding - cache := &fakecache.Cache{ - ForgetFunc: func(pod *v1.Pod) { - gotForgetPod = pod - }, - 
AssumeFunc: func(pod *v1.Pod) { - gotAssumedPod = pod - }, - IsAssumedPodFunc: func(pod *v1.Pod) bool { - if pod == nil || gotAssumedPod == nil { - return false - } - return pod.UID == gotAssumedPod.UID - }, + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + stopCh := make(chan struct{}) + defer close(stopCh) + + client := fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testPod}}) + informerFactory := informers.NewSharedInformerFactory(client, 0) + podInformer := informerFactory.Core().V1().Pods() + // Need to add/update/delete testPod to the store. + podInformer.Informer().GetStore().Add(testPod) + + queue := internalqueue.NewPriorityQueue(nil, informerFactory, internalqueue.WithClock(testingclock.NewFakeClock(time.Now()))) + schedulerCache := internalcache.New(30*time.Second, stopCh) + + queue.Add(testPod) + queue.Pop() + + if tt.podUpdatedDuringScheduling { + podInformer.Informer().GetStore().Update(testPodUpdated) + queue.Update(testPod, testPodUpdated) } - client := clientsetfake.NewSimpleClientset(item.sendPod) - client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { - if action.GetSubresource() != "binding" { - return false, nil, nil - } - gotBinding = action.(clienttesting.CreateAction).GetObject().(*v1.Binding) - return true, gotBinding, item.injectBindError - }) - registerPluginFuncs := append(item.registerPluginFuncs, - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - ) - fwk, err := st.NewFramework(registerPluginFuncs, - testSchedulerName, - frameworkruntime.WithClientSet(client), - frameworkruntime.WithEventRecorder(eventBroadcaster.NewRecorder(scheme.Scheme, testSchedulerName))) - if err != nil { - t.Fatal(err) + if tt.podDeletedDuringScheduling { + podInformer.Informer().GetStore().Delete(testPod) + queue.Delete(testPod) } - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() + testPodInfo := &framework.QueuedPodInfo{PodInfo: framework.NewPodInfo(testPod)} + errFunc := MakeDefaultErrorFunc(client, podInformer.Lister(), queue, schedulerCache) + errFunc(testPodInfo, tt.injectErr) - s := newScheduler( - cache, - nil, - func() *framework.QueuedPodInfo { - return &framework.QueuedPodInfo{PodInfo: framework.NewPodInfo(item.sendPod)} - }, - func(p *framework.QueuedPodInfo, err error) { - gotPod = p.Pod - gotError = err - }, - nil, - internalqueue.NewTestQueue(ctx, nil), - profile.Map{ - testSchedulerName: fwk, - }, - client, - nil, - 0) - s.SchedulePod = func(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) (ScheduleResult, error) { - return item.mockResult.result, item.mockResult.err - } - called := make(chan struct{}) - stopFunc := eventBroadcaster.StartEventWatcher(func(obj runtime.Object) { - e, _ := obj.(*eventsv1.Event) - if e.Reason != item.eventReason { - t.Errorf("got event %v, want %v", e.Reason, item.eventReason) + var got *v1.Pod + if tt.podUpdatedDuringScheduling { + head, e := queue.Pop() + if e != nil { + t.Fatalf("Cannot pop pod from the activeQ: %v", e) } - close(called) - }) - s.scheduleOne(ctx) - <-called - if e, a := item.expectAssumedPod, gotAssumedPod; !reflect.DeepEqual(e, a) { - t.Errorf("assumed pod: wanted %v, got %v", e, a) + got = head.Pod + } else { + got = getPodFromPriorityQueue(queue, testPod) } - if e, a := item.expectErrorPod, gotPod; !reflect.DeepEqual(e, a) { - t.Errorf("error pod: wanted %v, got %v", e, a) + + if diff := 
cmp.Diff(tt.expect, got); diff != "" { + t.Errorf("Unexpected pod (-want, +got): %s", diff) } - if e, a := item.expectForgetPod, gotForgetPod; !reflect.DeepEqual(e, a) { - t.Errorf("forget pod: wanted %v, got %v", e, a) - } - if e, a := item.expectError, gotError; !reflect.DeepEqual(e, a) { - t.Errorf("error: wanted %v, got %v", e, a) - } - if diff := cmp.Diff(item.expectBind, gotBinding); diff != "" { - t.Errorf("got binding diff (-want, +got): %s", diff) - } - stopFunc() }) } } -type fakeNodeSelectorArgs struct { - NodeName string `json:"nodeName"` -} - -type fakeNodeSelector struct { - fakeNodeSelectorArgs -} - -func newFakeNodeSelector(args runtime.Object, _ framework.Handle) (framework.Plugin, error) { - pl := &fakeNodeSelector{} - if err := frameworkruntime.DecodeInto(args, &pl.fakeNodeSelectorArgs); err != nil { - return nil, err +func TestDefaultErrorFunc_NodeNotFound(t *testing.T) { + nodeFoo := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "foo"}} + nodeBar := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "bar"}} + testPod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-pod", Namespace: "default"}} + tests := []struct { + name string + nodes []v1.Node + nodeNameToDelete string + injectErr error + expectNodeNames sets.String + }{ + { + name: "node is deleted during a scheduling cycle", + nodes: []v1.Node{*nodeFoo, *nodeBar}, + nodeNameToDelete: "foo", + injectErr: apierrors.NewNotFound(v1.Resource("node"), nodeFoo.Name), + expectNodeNames: sets.NewString("bar"), + }, + { + name: "node is not deleted but NodeNotFound is received incorrectly", + nodes: []v1.Node{*nodeFoo, *nodeBar}, + injectErr: apierrors.NewNotFound(v1.Resource("node"), nodeFoo.Name), + expectNodeNames: sets.NewString("foo", "bar"), + }, } - return pl, nil -} -func (s *fakeNodeSelector) Name() string { - return "FakeNodeSelector" -} + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + stopCh := make(chan struct{}) + defer close(stopCh) -func (s *fakeNodeSelector) Filter(_ context.Context, _ *framework.CycleState, _ *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status { - if nodeInfo.Node().Name != s.NodeName { - return framework.NewStatus(framework.UnschedulableAndUnresolvable) + client := fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testPod}}, &v1.NodeList{Items: tt.nodes}) + informerFactory := informers.NewSharedInformerFactory(client, 0) + podInformer := informerFactory.Core().V1().Pods() + // Need to add testPod to the store. + podInformer.Informer().GetStore().Add(testPod) + + queue := internalqueue.NewPriorityQueue(nil, informerFactory, internalqueue.WithClock(testingclock.NewFakeClock(time.Now()))) + schedulerCache := internalcache.New(30*time.Second, stopCh) + + for i := range tt.nodes { + node := tt.nodes[i] + // Add node to schedulerCache no matter it's deleted in API server or not. 
+ schedulerCache.AddNode(&node) + if node.Name == tt.nodeNameToDelete { + client.CoreV1().Nodes().Delete(context.TODO(), node.Name, metav1.DeleteOptions{}) + } + } + + testPodInfo := &framework.QueuedPodInfo{PodInfo: framework.NewPodInfo(testPod)} + errFunc := MakeDefaultErrorFunc(client, podInformer.Lister(), queue, schedulerCache) + errFunc(testPodInfo, tt.injectErr) + + gotNodes := schedulerCache.Dump().Nodes + gotNodeNames := sets.NewString() + for _, nodeInfo := range gotNodes { + gotNodeNames.Insert(nodeInfo.Node().Name) + } + if diff := cmp.Diff(tt.expectNodeNames, gotNodeNames); diff != "" { + t.Errorf("Unexpected nodes (-want, +got): %s", diff) + } + }) } +} + +func TestDefaultErrorFunc_PodAlreadyBound(t *testing.T) { + stopCh := make(chan struct{}) + defer close(stopCh) + + nodeFoo := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "foo"}} + testPod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-pod", Namespace: "default"}, Spec: v1.PodSpec{NodeName: "foo"}} + + client := fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testPod}}, &v1.NodeList{Items: []v1.Node{nodeFoo}}) + informerFactory := informers.NewSharedInformerFactory(client, 0) + podInformer := informerFactory.Core().V1().Pods() + // Need to add testPod to the store. + podInformer.Informer().GetStore().Add(testPod) + + queue := internalqueue.NewPriorityQueue(nil, informerFactory, internalqueue.WithClock(testingclock.NewFakeClock(time.Now()))) + schedulerCache := internalcache.New(30*time.Second, stopCh) + + // Add node to schedulerCache no matter it's deleted in API server or not. + schedulerCache.AddNode(&nodeFoo) + + testPodInfo := &framework.QueuedPodInfo{PodInfo: framework.NewPodInfo(testPod)} + errFunc := MakeDefaultErrorFunc(client, podInformer.Lister(), queue, schedulerCache) + errFunc(testPodInfo, fmt.Errorf("binding rejected: timeout")) + + pod := getPodFromPriorityQueue(queue, testPod) + if pod != nil { + t.Fatalf("Unexpected pod: %v should not be in PriorityQueue when the NodeName of pod is not empty", pod.Name) + } +} + +// getPodFromPriorityQueue is the function used in the TestDefaultErrorFunc test to get +// the specific pod from the given priority queue. It returns the found pod in the priority queue. 
+func getPodFromPriorityQueue(queue *internalqueue.PriorityQueue, pod *v1.Pod) *v1.Pod { + podList := queue.PendingPods() + if len(podList) == 0 { + return nil + } + + queryPodKey, err := cache.MetaNamespaceKeyFunc(pod) + if err != nil { + return nil + } + + for _, foundPod := range podList { + foundPodKey, err := cache.MetaNamespaceKeyFunc(foundPod) + if err != nil { + return nil + } + + if foundPodKey == queryPodKey { + return foundPod + } + } + return nil } - -func TestSchedulerMultipleProfilesScheduling(t *testing.T) { - nodes := []runtime.Object{ - st.MakeNode().Name("machine1").UID("machine1").Obj(), - st.MakeNode().Name("machine2").UID("machine2").Obj(), - st.MakeNode().Name("machine3").UID("machine3").Obj(), - } - pods := []*v1.Pod{ - st.MakePod().Name("pod1").UID("pod1").SchedulerName("match-machine3").Obj(), - st.MakePod().Name("pod2").UID("pod2").SchedulerName("match-machine2").Obj(), - st.MakePod().Name("pod3").UID("pod3").SchedulerName("match-machine2").Obj(), - st.MakePod().Name("pod4").UID("pod4").SchedulerName("match-machine3").Obj(), - } - wantBindings := map[string]string{ - "pod1": "machine3", - "pod2": "machine2", - "pod3": "machine2", - "pod4": "machine3", - } - wantControllers := map[string]string{ - "pod1": "match-machine3", - "pod2": "match-machine2", - "pod3": "match-machine2", - "pod4": "match-machine3", - } - - // Set up scheduler for the 3 nodes. - // We use a fake filter that only allows one particular node. We create two - // profiles, each with a different node in the filter configuration. - objs := append([]runtime.Object{ - &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: ""}}}, nodes...) - client := clientsetfake.NewSimpleClientset(objs...) - broadcaster := events.NewBroadcaster(&events.EventSinkImpl{Interface: client.EventsV1()}) - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - informerFactory := informers.NewSharedInformerFactory(client, 0) - sched, err := New( - client, - informerFactory, - nil, - profile.NewRecorderFactory(broadcaster), - ctx.Done(), - WithProfiles( - schedulerapi.KubeSchedulerProfile{SchedulerName: "match-machine2", - Plugins: &schedulerapi.Plugins{ - Filter: schedulerapi.PluginSet{Enabled: []schedulerapi.Plugin{{Name: "FakeNodeSelector"}}}, - QueueSort: schedulerapi.PluginSet{Enabled: []schedulerapi.Plugin{{Name: "PrioritySort"}}}, - Bind: schedulerapi.PluginSet{Enabled: []schedulerapi.Plugin{{Name: "DefaultBinder"}}}, - }, - PluginConfig: []schedulerapi.PluginConfig{ - { - Name: "FakeNodeSelector", - Args: &runtime.Unknown{Raw: []byte(`{"nodeName":"machine2"}`)}, - }, - }, - }, - schedulerapi.KubeSchedulerProfile{ - SchedulerName: "match-machine3", - Plugins: &schedulerapi.Plugins{ - Filter: schedulerapi.PluginSet{Enabled: []schedulerapi.Plugin{{Name: "FakeNodeSelector"}}}, - QueueSort: schedulerapi.PluginSet{Enabled: []schedulerapi.Plugin{{Name: "PrioritySort"}}}, - Bind: schedulerapi.PluginSet{Enabled: []schedulerapi.Plugin{{Name: "DefaultBinder"}}}, - }, - PluginConfig: []schedulerapi.PluginConfig{ - { - Name: "FakeNodeSelector", - Args: &runtime.Unknown{Raw: []byte(`{"nodeName":"machine3"}`)}, - }, - }, - }, - ), - WithFrameworkOutOfTreeRegistry(frameworkruntime.Registry{ - "FakeNodeSelector": newFakeNodeSelector, - }), - ) - if err != nil { - t.Fatal(err) - } - - // Capture the bindings and events' controllers. 
- var wg sync.WaitGroup - wg.Add(2 * len(pods)) - bindings := make(map[string]string) - client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { - if action.GetSubresource() != "binding" { - return false, nil, nil - } - binding := action.(clienttesting.CreateAction).GetObject().(*v1.Binding) - bindings[binding.Name] = binding.Target.Name - wg.Done() - return true, binding, nil - }) - controllers := make(map[string]string) - stopFn := broadcaster.StartEventWatcher(func(obj runtime.Object) { - e, ok := obj.(*eventsv1.Event) - if !ok || e.Reason != "Scheduled" { - return - } - controllers[e.Regarding.Name] = e.ReportingController - wg.Done() - }) - defer stopFn() - - // Run scheduler. - informerFactory.Start(ctx.Done()) - informerFactory.WaitForCacheSync(ctx.Done()) - go sched.Run(ctx) - - // Send pods to be scheduled. - for _, p := range pods { - _, err := client.CoreV1().Pods("").Create(ctx, p, metav1.CreateOptions{}) - if err != nil { - t.Fatal(err) - } - } - wg.Wait() - - // Verify correct bindings and reporting controllers. - if diff := cmp.Diff(wantBindings, bindings); diff != "" { - t.Errorf("pods were scheduled incorrectly (-want, +got):\n%s", diff) - } - if diff := cmp.Diff(wantControllers, controllers); diff != "" { - t.Errorf("events were reported with wrong controllers (-want, +got):\n%s", diff) - } -} - -func TestSchedulerNoPhantomPodAfterExpire(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc) - scache := internalcache.New(100*time.Millisecond, ctx.Done()) - pod := podWithPort("pod.Name", "", 8080) - node := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}} - scache.AddNode(&node) - - fns := []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - st.RegisterPluginAsExtensions(nodeports.Name, nodeports.New, "Filter", "PreFilter"), - } - scheduler, bindingChan, errChan := setupTestSchedulerWithOnePodOnNode(ctx, t, queuedPodStore, scache, pod, &node, fns...) - - waitPodExpireChan := make(chan struct{}) - timeout := make(chan struct{}) - go func() { - for { - select { - case <-timeout: - return - default: - } - pods, err := scache.PodCount() - if err != nil { - errChan <- fmt.Errorf("cache.List failed: %v", err) - return - } - if pods == 0 { - close(waitPodExpireChan) - return - } - time.Sleep(100 * time.Millisecond) - } - }() - // waiting for the assumed pod to expire - select { - case err := <-errChan: - t.Fatal(err) - case <-waitPodExpireChan: - case <-time.After(wait.ForeverTestTimeout): - close(timeout) - t.Fatalf("timeout timeout in waiting pod expire after %v", wait.ForeverTestTimeout) - } - - // We use conflicted pod ports to incur fit predicate failure if first pod not removed. 
- secondPod := podWithPort("bar", "", 8080) - queuedPodStore.Add(secondPod) - scheduler.scheduleOne(ctx) - select { - case b := <-bindingChan: - expectBinding := &v1.Binding{ - ObjectMeta: metav1.ObjectMeta{Name: "bar", UID: types.UID("bar")}, - Target: v1.ObjectReference{Kind: "Node", Name: node.Name}, - } - if !reflect.DeepEqual(expectBinding, b) { - t.Errorf("binding want=%v, get=%v", expectBinding, b) - } - case <-time.After(wait.ForeverTestTimeout): - t.Fatalf("timeout in binding after %v", wait.ForeverTestTimeout) - } -} - -func TestSchedulerNoPhantomPodAfterDelete(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc) - scache := internalcache.New(10*time.Minute, ctx.Done()) - firstPod := podWithPort("pod.Name", "", 8080) - node := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}} - scache.AddNode(&node) - fns := []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - st.RegisterPluginAsExtensions(nodeports.Name, nodeports.New, "Filter", "PreFilter"), - } - scheduler, bindingChan, errChan := setupTestSchedulerWithOnePodOnNode(ctx, t, queuedPodStore, scache, firstPod, &node, fns...) - - // We use conflicted pod ports to incur fit predicate failure. - secondPod := podWithPort("bar", "", 8080) - queuedPodStore.Add(secondPod) - // queuedPodStore: [bar:8080] - // cache: [(assumed)foo:8080] - - scheduler.scheduleOne(ctx) - select { - case err := <-errChan: - expectErr := &framework.FitError{ - Pod: secondPod, - NumAllNodes: 1, - Diagnosis: framework.Diagnosis{ - NodeToStatusMap: framework.NodeToStatusMap{ - node.Name: framework.NewStatus(framework.Unschedulable, nodeports.ErrReason).WithFailedPlugin(nodeports.Name), - }, - UnschedulablePlugins: sets.NewString(nodeports.Name), - }, - } - if !reflect.DeepEqual(expectErr, err) { - t.Errorf("err want=%v, get=%v", expectErr, err) - } - case <-time.After(wait.ForeverTestTimeout): - t.Fatalf("timeout in fitting after %v", wait.ForeverTestTimeout) - } - - // We mimic the workflow of cache behavior when a pod is removed by user. - // Note: if the schedulernodeinfo timeout would be super short, the first pod would expire - // and would be removed itself (without any explicit actions on schedulernodeinfo). Even in that case, - // explicitly AddPod will as well correct the behavior. - firstPod.Spec.NodeName = node.Name - if err := scache.AddPod(firstPod); err != nil { - t.Fatalf("err: %v", err) - } - if err := scache.RemovePod(firstPod); err != nil { - t.Fatalf("err: %v", err) - } - - queuedPodStore.Add(secondPod) - scheduler.scheduleOne(ctx) - select { - case b := <-bindingChan: - expectBinding := &v1.Binding{ - ObjectMeta: metav1.ObjectMeta{Name: "bar", UID: types.UID("bar")}, - Target: v1.ObjectReference{Kind: "Node", Name: node.Name}, - } - if !reflect.DeepEqual(expectBinding, b) { - t.Errorf("binding want=%v, get=%v", expectBinding, b) - } - case <-time.After(wait.ForeverTestTimeout): - t.Fatalf("timeout in binding after %v", wait.ForeverTestTimeout) - } -} - -// queuedPodStore: pods queued before processing. -// cache: scheduler cache that might contain assumed pods. 
-func setupTestSchedulerWithOnePodOnNode(ctx context.Context, t *testing.T, queuedPodStore *clientcache.FIFO, scache internalcache.Cache, - pod *v1.Pod, node *v1.Node, fns ...st.RegisterPluginFunc) (*Scheduler, chan *v1.Binding, chan error) { - scheduler, bindingChan, errChan := setupTestScheduler(ctx, queuedPodStore, scache, nil, nil, fns...) - - queuedPodStore.Add(pod) - // queuedPodStore: [foo:8080] - // cache: [] - - scheduler.scheduleOne(ctx) - // queuedPodStore: [] - // cache: [(assumed)foo:8080] - - select { - case b := <-bindingChan: - expectBinding := &v1.Binding{ - ObjectMeta: metav1.ObjectMeta{Name: pod.Name, UID: types.UID(pod.Name)}, - Target: v1.ObjectReference{Kind: "Node", Name: node.Name}, - } - if !reflect.DeepEqual(expectBinding, b) { - t.Errorf("binding want=%v, get=%v", expectBinding, b) - } - case <-time.After(wait.ForeverTestTimeout): - t.Fatalf("timeout after %v", wait.ForeverTestTimeout) - } - return scheduler, bindingChan, errChan -} - -func TestSchedulerFailedSchedulingReasons(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc) - scache := internalcache.New(10*time.Minute, ctx.Done()) - - // Design the baseline for the pods, and we will make nodes that don't fit it later. - var cpu = int64(4) - var mem = int64(500) - podWithTooBigResourceRequests := podWithResources("bar", "", v1.ResourceList{ - v1.ResourceCPU: *(resource.NewQuantity(cpu, resource.DecimalSI)), - v1.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)), - }, v1.ResourceList{ - v1.ResourceCPU: *(resource.NewQuantity(cpu, resource.DecimalSI)), - v1.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)), - }) - - // create several nodes which cannot schedule the above pod - var nodes []*v1.Node - var objects []runtime.Object - for i := 0; i < 100; i++ { - uid := fmt.Sprintf("machine%v", i) - node := v1.Node{ - ObjectMeta: metav1.ObjectMeta{Name: uid, UID: types.UID(uid)}, - Status: v1.NodeStatus{ - Capacity: v1.ResourceList{ - v1.ResourceCPU: *(resource.NewQuantity(cpu/2, resource.DecimalSI)), - v1.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)), - v1.ResourcePods: *(resource.NewQuantity(10, resource.DecimalSI)), - }, - Allocatable: v1.ResourceList{ - v1.ResourceCPU: *(resource.NewQuantity(cpu/2, resource.DecimalSI)), - v1.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)), - v1.ResourcePods: *(resource.NewQuantity(10, resource.DecimalSI)), - }}, - } - scache.AddNode(&node) - nodes = append(nodes, &node) - objects = append(objects, &node) - } - - // Create expected failure reasons for all the nodes. Hopefully they will get rolled up into a non-spammy summary. 
- failedNodeStatues := framework.NodeToStatusMap{} - for _, node := range nodes { - failedNodeStatues[node.Name] = framework.NewStatus( - framework.Unschedulable, - fmt.Sprintf("Insufficient %v", v1.ResourceCPU), - fmt.Sprintf("Insufficient %v", v1.ResourceMemory), - ).WithFailedPlugin(noderesources.Name) - } - fns := []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - st.RegisterPluginAsExtensions(noderesources.Name, frameworkruntime.FactoryAdapter(feature.Features{}, noderesources.NewFit), "Filter", "PreFilter"), - } - - informerFactory := informers.NewSharedInformerFactory(fake.NewSimpleClientset(objects...), 0) - scheduler, _, errChan := setupTestScheduler(ctx, queuedPodStore, scache, informerFactory, nil, fns...) - - queuedPodStore.Add(podWithTooBigResourceRequests) - scheduler.scheduleOne(ctx) - select { - case err := <-errChan: - expectErr := &framework.FitError{ - Pod: podWithTooBigResourceRequests, - NumAllNodes: len(nodes), - Diagnosis: framework.Diagnosis{ - NodeToStatusMap: failedNodeStatues, - UnschedulablePlugins: sets.NewString(noderesources.Name), - }, - } - if len(fmt.Sprint(expectErr)) > 150 { - t.Errorf("message is too spammy ! %v ", len(fmt.Sprint(expectErr))) - } - if !reflect.DeepEqual(expectErr, err) { - t.Errorf("\n err \nWANT=%+v,\nGOT=%+v", expectErr, err) - } - case <-time.After(wait.ForeverTestTimeout): - t.Fatalf("timeout after %v", wait.ForeverTestTimeout) - } -} - -// queuedPodStore: pods queued before processing. -// scache: scheduler cache that might contain assumed pods. -func setupTestScheduler(ctx context.Context, queuedPodStore *clientcache.FIFO, cache internalcache.Cache, informerFactory informers.SharedInformerFactory, broadcaster events.EventBroadcaster, fns ...st.RegisterPluginFunc) (*Scheduler, chan *v1.Binding, chan error) { - bindingChan := make(chan *v1.Binding, 1) - client := clientsetfake.NewSimpleClientset() - client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { - var b *v1.Binding - if action.GetSubresource() == "binding" { - b := action.(clienttesting.CreateAction).GetObject().(*v1.Binding) - bindingChan <- b - } - return true, b, nil - }) - - var recorder events.EventRecorder - if broadcaster != nil { - recorder = broadcaster.NewRecorder(scheme.Scheme, testSchedulerName) - } else { - recorder = &events.FakeRecorder{} - } - - if informerFactory == nil { - informerFactory = informers.NewSharedInformerFactory(fake.NewSimpleClientset(), 0) - } - schedulingQueue := internalqueue.NewTestQueueWithInformerFactory(ctx, nil, informerFactory) - - fwk, _ := st.NewFramework( - fns, - testSchedulerName, - frameworkruntime.WithClientSet(client), - frameworkruntime.WithEventRecorder(recorder), - frameworkruntime.WithInformerFactory(informerFactory), - frameworkruntime.WithPodNominator(internalqueue.NewPodNominator(informerFactory.Core().V1().Pods().Lister())), - ) - - errChan := make(chan error, 1) - sched := newScheduler( - cache, - nil, - func() *framework.QueuedPodInfo { - return &framework.QueuedPodInfo{PodInfo: framework.NewPodInfo(clientcache.Pop(queuedPodStore).(*v1.Pod))} - }, - func(p *framework.QueuedPodInfo, err error) { - errChan <- err - }, - nil, - schedulingQueue, - profile.Map{ - testSchedulerName: fwk, - }, - client, - internalcache.NewEmptySnapshot(), - schedulerapi.DefaultPercentageOfNodesToScore) - return sched, bindingChan, errChan -} - -func setupTestSchedulerWithVolumeBinding(ctx 
context.Context, volumeBinder volumebinding.SchedulerVolumeBinder, broadcaster events.EventBroadcaster) (*Scheduler, chan *v1.Binding, chan error) { - testNode := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "machine1", UID: types.UID("machine1")}} - queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc) - pod := podWithID("foo", "") - pod.Namespace = "foo-ns" - pod.Spec.Volumes = append(pod.Spec.Volumes, v1.Volume{Name: "testVol", - VolumeSource: v1.VolumeSource{PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ClaimName: "testPVC"}}}) - queuedPodStore.Add(pod) - scache := internalcache.New(10*time.Minute, ctx.Done()) - scache.AddNode(&testNode) - testPVC := v1.PersistentVolumeClaim{ObjectMeta: metav1.ObjectMeta{Name: "testPVC", Namespace: pod.Namespace, UID: types.UID("testPVC")}} - client := clientsetfake.NewSimpleClientset(&testNode, &testPVC) - informerFactory := informers.NewSharedInformerFactory(client, 0) - pvcInformer := informerFactory.Core().V1().PersistentVolumeClaims() - pvcInformer.Informer().GetStore().Add(&testPVC) - - fns := []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - st.RegisterPluginAsExtensions(volumebinding.Name, func(plArgs runtime.Object, handle framework.Handle) (framework.Plugin, error) { - return &volumebinding.VolumeBinding{Binder: volumeBinder, PVCLister: pvcInformer.Lister()}, nil - }, "PreFilter", "Filter", "Reserve", "PreBind"), - } - s, bindingChan, errChan := setupTestScheduler(ctx, queuedPodStore, scache, informerFactory, broadcaster, fns...) - return s, bindingChan, errChan -} - -// This is a workaround because golint complains that errors cannot -// end with punctuation. However, the real predicate error message does -// end with a period. 
-func makePredicateError(failReason string) error { - s := fmt.Sprintf("0/1 nodes are available: %v.", failReason) - return fmt.Errorf(s) -} - -func TestSchedulerWithVolumeBinding(t *testing.T) { - findErr := fmt.Errorf("find err") - assumeErr := fmt.Errorf("assume err") - bindErr := fmt.Errorf("bind err") - client := clientsetfake.NewSimpleClientset() - - eventBroadcaster := events.NewBroadcaster(&events.EventSinkImpl{Interface: client.EventsV1()}) - - // This can be small because we wait for pod to finish scheduling first - chanTimeout := 2 * time.Second - - table := []struct { - name string - expectError error - expectPodBind *v1.Binding - expectAssumeCalled bool - expectBindCalled bool - eventReason string - volumeBinderConfig *volumebinding.FakeVolumeBinderConfig - }{ - { - name: "all bound", - volumeBinderConfig: &volumebinding.FakeVolumeBinderConfig{ - AllBound: true, - }, - expectAssumeCalled: true, - expectPodBind: &v1.Binding{ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "foo-ns", UID: types.UID("foo")}, Target: v1.ObjectReference{Kind: "Node", Name: "machine1"}}, - eventReason: "Scheduled", - }, - { - name: "bound/invalid pv affinity", - volumeBinderConfig: &volumebinding.FakeVolumeBinderConfig{ - AllBound: true, - FindReasons: volumebinding.ConflictReasons{volumebinding.ErrReasonNodeConflict}, - }, - eventReason: "FailedScheduling", - expectError: makePredicateError("1 node(s) had volume node affinity conflict"), - }, - { - name: "unbound/no matches", - volumeBinderConfig: &volumebinding.FakeVolumeBinderConfig{ - FindReasons: volumebinding.ConflictReasons{volumebinding.ErrReasonBindConflict}, - }, - eventReason: "FailedScheduling", - expectError: makePredicateError("1 node(s) didn't find available persistent volumes to bind"), - }, - { - name: "bound and unbound unsatisfied", - volumeBinderConfig: &volumebinding.FakeVolumeBinderConfig{ - FindReasons: volumebinding.ConflictReasons{volumebinding.ErrReasonBindConflict, volumebinding.ErrReasonNodeConflict}, - }, - eventReason: "FailedScheduling", - expectError: makePredicateError("1 node(s) didn't find available persistent volumes to bind, 1 node(s) had volume node affinity conflict"), - }, - { - name: "unbound/found matches/bind succeeds", - volumeBinderConfig: &volumebinding.FakeVolumeBinderConfig{}, - expectAssumeCalled: true, - expectBindCalled: true, - expectPodBind: &v1.Binding{ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "foo-ns", UID: types.UID("foo")}, Target: v1.ObjectReference{Kind: "Node", Name: "machine1"}}, - eventReason: "Scheduled", - }, - { - name: "predicate error", - volumeBinderConfig: &volumebinding.FakeVolumeBinderConfig{ - FindErr: findErr, - }, - eventReason: "FailedScheduling", - expectError: fmt.Errorf("running %q filter plugin: %v", volumebinding.Name, findErr), - }, - { - name: "assume error", - volumeBinderConfig: &volumebinding.FakeVolumeBinderConfig{ - AssumeErr: assumeErr, - }, - expectAssumeCalled: true, - eventReason: "FailedScheduling", - expectError: fmt.Errorf("running Reserve plugin %q: %w", volumebinding.Name, assumeErr), - }, - { - name: "bind error", - volumeBinderConfig: &volumebinding.FakeVolumeBinderConfig{ - BindErr: bindErr, - }, - expectAssumeCalled: true, - expectBindCalled: true, - eventReason: "FailedScheduling", - expectError: fmt.Errorf("running PreBind plugin %q: %w", volumebinding.Name, bindErr), - }, - } - - for _, item := range table { - t.Run(item.name, func(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - 
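// Each case below wires a FakeVolumeBinder into the volumebinding plugin,
// runs a single scheduling cycle, and then checks three signals independently:
// the event reason broadcast for the pod, the binding or error received on the
// channels, and whether Assume/Bind were actually invoked on the fake binder.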
fakeVolumeBinder := volumebinding.NewFakeVolumeBinder(item.volumeBinderConfig) - s, bindingChan, errChan := setupTestSchedulerWithVolumeBinding(ctx, fakeVolumeBinder, eventBroadcaster) - eventChan := make(chan struct{}) - stopFunc := eventBroadcaster.StartEventWatcher(func(obj runtime.Object) { - e, _ := obj.(*eventsv1.Event) - if e, a := item.eventReason, e.Reason; e != a { - t.Errorf("expected %v, got %v", e, a) - } - close(eventChan) - }) - s.scheduleOne(ctx) - // Wait for pod to succeed or fail scheduling - select { - case <-eventChan: - case <-time.After(wait.ForeverTestTimeout): - t.Fatalf("scheduling timeout after %v", wait.ForeverTestTimeout) - } - stopFunc() - // Wait for scheduling to return an error or succeed binding. - var ( - gotErr error - gotBind *v1.Binding - ) - select { - case gotErr = <-errChan: - case gotBind = <-bindingChan: - case <-time.After(chanTimeout): - t.Fatalf("did not receive pod binding or error after %v", chanTimeout) - } - if item.expectError != nil { - if gotErr == nil || item.expectError.Error() != gotErr.Error() { - t.Errorf("err \nWANT=%+v,\nGOT=%+v", item.expectError, gotErr) - } - } else if gotErr != nil { - t.Errorf("err \nWANT=%+v,\nGOT=%+v", item.expectError, gotErr) - } - if !cmp.Equal(item.expectPodBind, gotBind) { - t.Errorf("err \nWANT=%+v,\nGOT=%+v", item.expectPodBind, gotBind) - } - - if item.expectAssumeCalled != fakeVolumeBinder.AssumeCalled { - t.Errorf("expectedAssumeCall %v", item.expectAssumeCalled) - } - - if item.expectBindCalled != fakeVolumeBinder.BindCalled { - t.Errorf("expectedBindCall %v", item.expectBindCalled) - } - }) - } -} - -func TestSchedulerBinding(t *testing.T) { - table := []struct { - podName string - extenders []framework.Extender - wantBinderID int - name string - }{ - { - name: "the extender is not a binder", - podName: "pod0", - extenders: []framework.Extender{ - &fakeExtender{isBinder: false, interestedPodName: "pod0"}, - }, - wantBinderID: -1, // default binding. - }, - { - name: "one of the extenders is a binder and interested in pod", - podName: "pod0", - extenders: []framework.Extender{ - &fakeExtender{isBinder: false, interestedPodName: "pod0"}, - &fakeExtender{isBinder: true, interestedPodName: "pod0"}, - }, - wantBinderID: 1, - }, - { - name: "one of the extenders is a binder, but not interested in pod", - podName: "pod1", - extenders: []framework.Extender{ - &fakeExtender{isBinder: false, interestedPodName: "pod1"}, - &fakeExtender{isBinder: true, interestedPodName: "pod0"}, - }, - wantBinderID: -1, // default binding. 
- }, - } - - for _, test := range table { - t.Run(test.name, func(t *testing.T) { - pod := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: test.podName, - }, - } - defaultBound := false - client := clientsetfake.NewSimpleClientset(pod) - client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { - if action.GetSubresource() == "binding" { - defaultBound = true - } - return false, nil, nil - }) - fwk, err := st.NewFramework([]st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, "", frameworkruntime.WithClientSet(client), frameworkruntime.WithEventRecorder(&events.FakeRecorder{})) - if err != nil { - t.Fatal(err) - } - stop := make(chan struct{}) - defer close(stop) - sched := &Scheduler{ - Extenders: test.extenders, - Cache: internalcache.New(100*time.Millisecond, stop), - nodeInfoSnapshot: nil, - percentageOfNodesToScore: 0, - } - err = sched.bind(context.Background(), fwk, pod, "node", nil) - if err != nil { - t.Error(err) - } - - // Checking default binding. - if wantBound := test.wantBinderID == -1; defaultBound != wantBound { - t.Errorf("got bound with default binding: %v, want %v", defaultBound, wantBound) - } - - // Checking extenders binding. - for i, ext := range test.extenders { - wantBound := i == test.wantBinderID - if gotBound := ext.(*fakeExtender).gotBind; gotBound != wantBound { - t.Errorf("got bound with extender #%d: %v, want %v", i, gotBound, wantBound) - } - } - - }) - } -} - -func TestUpdatePod(t *testing.T) { - tests := []struct { - name string - currentPodConditions []v1.PodCondition - newPodCondition *v1.PodCondition - currentNominatedNodeName string - newNominatingInfo *framework.NominatingInfo - expectedPatchRequests int - expectedPatchDataPattern string - }{ - { - name: "Should make patch request to add pod condition when there are none currently", - currentPodConditions: []v1.PodCondition{}, - newPodCondition: &v1.PodCondition{ - Type: "newType", - Status: "newStatus", - LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 1, 1, 1, 1, time.UTC)), - LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 1, 1, 1, 1, time.UTC)), - Reason: "newReason", - Message: "newMessage", - }, - expectedPatchRequests: 1, - expectedPatchDataPattern: `{"status":{"conditions":\[{"lastProbeTime":"2020-05-13T01:01:01Z","lastTransitionTime":".*","message":"newMessage","reason":"newReason","status":"newStatus","type":"newType"}]}}`, - }, - { - name: "Should make patch request to add a new pod condition when there is already one with another type", - currentPodConditions: []v1.PodCondition{ - { - Type: "someOtherType", - Status: "someOtherTypeStatus", - LastProbeTime: metav1.NewTime(time.Date(2020, 5, 11, 0, 0, 0, 0, time.UTC)), - LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 10, 0, 0, 0, 0, time.UTC)), - Reason: "someOtherTypeReason", - Message: "someOtherTypeMessage", - }, - }, - newPodCondition: &v1.PodCondition{ - Type: "newType", - Status: "newStatus", - LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 1, 1, 1, 1, time.UTC)), - LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 1, 1, 1, 1, time.UTC)), - Reason: "newReason", - Message: "newMessage", - }, - expectedPatchRequests: 1, - expectedPatchDataPattern: 
`{"status":{"\$setElementOrder/conditions":\[{"type":"someOtherType"},{"type":"newType"}],"conditions":\[{"lastProbeTime":"2020-05-13T01:01:01Z","lastTransitionTime":".*","message":"newMessage","reason":"newReason","status":"newStatus","type":"newType"}]}}`, - }, - { - name: "Should make patch request to update an existing pod condition", - currentPodConditions: []v1.PodCondition{ - { - Type: "currentType", - Status: "currentStatus", - LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 0, 0, 0, 0, time.UTC)), - LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 0, 0, 0, 0, time.UTC)), - Reason: "currentReason", - Message: "currentMessage", - }, - }, - newPodCondition: &v1.PodCondition{ - Type: "currentType", - Status: "newStatus", - LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 1, 1, 1, 1, time.UTC)), - LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 1, 1, 1, 1, time.UTC)), - Reason: "newReason", - Message: "newMessage", - }, - expectedPatchRequests: 1, - expectedPatchDataPattern: `{"status":{"\$setElementOrder/conditions":\[{"type":"currentType"}],"conditions":\[{"lastProbeTime":"2020-05-13T01:01:01Z","lastTransitionTime":".*","message":"newMessage","reason":"newReason","status":"newStatus","type":"currentType"}]}}`, - }, - { - name: "Should make patch request to update an existing pod condition, but the transition time should remain unchanged because the status is the same", - currentPodConditions: []v1.PodCondition{ - { - Type: "currentType", - Status: "currentStatus", - LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 0, 0, 0, 0, time.UTC)), - LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 0, 0, 0, 0, time.UTC)), - Reason: "currentReason", - Message: "currentMessage", - }, - }, - newPodCondition: &v1.PodCondition{ - Type: "currentType", - Status: "currentStatus", - LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 1, 1, 1, 1, time.UTC)), - LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 0, 0, 0, 0, time.UTC)), - Reason: "newReason", - Message: "newMessage", - }, - expectedPatchRequests: 1, - expectedPatchDataPattern: `{"status":{"\$setElementOrder/conditions":\[{"type":"currentType"}],"conditions":\[{"lastProbeTime":"2020-05-13T01:01:01Z","message":"newMessage","reason":"newReason","type":"currentType"}]}}`, - }, - { - name: "Should not make patch request if pod condition already exists and is identical and nominated node name is not set", - currentPodConditions: []v1.PodCondition{ - { - Type: "currentType", - Status: "currentStatus", - LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 0, 0, 0, 0, time.UTC)), - LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 0, 0, 0, 0, time.UTC)), - Reason: "currentReason", - Message: "currentMessage", - }, - }, - newPodCondition: &v1.PodCondition{ - Type: "currentType", - Status: "currentStatus", - LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 0, 0, 0, 0, time.UTC)), - LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 0, 0, 0, 0, time.UTC)), - Reason: "currentReason", - Message: "currentMessage", - }, - currentNominatedNodeName: "node1", - expectedPatchRequests: 0, - }, - { - name: "Should make patch request if pod condition already exists and is identical but nominated node name is set and different", - currentPodConditions: []v1.PodCondition{ - { - Type: "currentType", - Status: "currentStatus", - LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 0, 0, 0, 0, time.UTC)), - LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 0, 0, 0, 0, time.UTC)), - Reason: 
"currentReason", - Message: "currentMessage", - }, - }, - newPodCondition: &v1.PodCondition{ - Type: "currentType", - Status: "currentStatus", - LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 0, 0, 0, 0, time.UTC)), - LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 0, 0, 0, 0, time.UTC)), - Reason: "currentReason", - Message: "currentMessage", - }, - newNominatingInfo: &framework.NominatingInfo{NominatingMode: framework.ModeOverride, NominatedNodeName: "node1"}, - expectedPatchRequests: 1, - expectedPatchDataPattern: `{"status":{"nominatedNodeName":"node1"}}`, - }, - } - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - actualPatchRequests := 0 - var actualPatchData string - cs := &clientsetfake.Clientset{} - cs.AddReactor("patch", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { - actualPatchRequests++ - patch := action.(clienttesting.PatchAction) - actualPatchData = string(patch.GetPatch()) - // For this test, we don't care about the result of the patched pod, just that we got the expected - // patch request, so just returning &v1.Pod{} here is OK because scheduler doesn't use the response. - return true, &v1.Pod{}, nil - }) - - pod := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{Name: "foo"}, - Status: v1.PodStatus{ - Conditions: test.currentPodConditions, - NominatedNodeName: test.currentNominatedNodeName, - }, - } - - if err := updatePod(cs, pod, test.newPodCondition, test.newNominatingInfo); err != nil { - t.Fatalf("Error calling update: %v", err) - } - - if actualPatchRequests != test.expectedPatchRequests { - t.Fatalf("Actual patch requests (%d) does not equal expected patch requests (%d), actual patch data: %v", actualPatchRequests, test.expectedPatchRequests, actualPatchData) - } - - regex, err := regexp.Compile(test.expectedPatchDataPattern) - if err != nil { - t.Fatalf("Error compiling regexp for %v: %v", test.expectedPatchDataPattern, err) - } - - if test.expectedPatchRequests > 0 && !regex.MatchString(actualPatchData) { - t.Fatalf("Patch data mismatch: Actual was %v, but expected to match regexp %v", actualPatchData, test.expectedPatchDataPattern) - } - }) - } -} - -func TestSelectHost(t *testing.T) { - tests := []struct { - name string - list framework.NodeScoreList - possibleHosts sets.String - expectsErr bool - }{ - { - name: "unique properly ordered scores", - list: []framework.NodeScore{ - {Name: "machine1.1", Score: 1}, - {Name: "machine2.1", Score: 2}, - }, - possibleHosts: sets.NewString("machine2.1"), - expectsErr: false, - }, - { - name: "equal scores", - list: []framework.NodeScore{ - {Name: "machine1.1", Score: 1}, - {Name: "machine1.2", Score: 2}, - {Name: "machine1.3", Score: 2}, - {Name: "machine2.1", Score: 2}, - }, - possibleHosts: sets.NewString("machine1.2", "machine1.3", "machine2.1"), - expectsErr: false, - }, - { - name: "out of order scores", - list: []framework.NodeScore{ - {Name: "machine1.1", Score: 3}, - {Name: "machine1.2", Score: 3}, - {Name: "machine2.1", Score: 2}, - {Name: "machine3.1", Score: 1}, - {Name: "machine1.3", Score: 3}, - }, - possibleHosts: sets.NewString("machine1.1", "machine1.2", "machine1.3"), - expectsErr: false, - }, - { - name: "empty priority list", - list: []framework.NodeScore{}, - possibleHosts: sets.NewString(), - expectsErr: true, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - // increase the randomness - for i := 0; i < 10; i++ { - got, err := selectHost(test.list) - if test.expectsErr { - if err == nil { - 
t.Error("Unexpected non-error") - } - } else { - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if !test.possibleHosts.Has(got) { - t.Errorf("got %s is not in the possible map %v", got, test.possibleHosts) - } - } - } - }) - } -} - -func TestFindNodesThatPassExtenders(t *testing.T) { - tests := []struct { - name string - extenders []st.FakeExtender - nodes []*v1.Node - filteredNodesStatuses framework.NodeToStatusMap - expectsErr bool - expectedNodes []*v1.Node - expectedStatuses framework.NodeToStatusMap - }{ - { - name: "error", - extenders: []st.FakeExtender{ - { - ExtenderName: "FakeExtender1", - Predicates: []st.FitPredicate{st.ErrorPredicateExtender}, - }, - }, - nodes: makeNodeList([]string{"a"}), - filteredNodesStatuses: make(framework.NodeToStatusMap), - expectsErr: true, - }, - { - name: "success", - extenders: []st.FakeExtender{ - { - ExtenderName: "FakeExtender1", - Predicates: []st.FitPredicate{st.TruePredicateExtender}, - }, - }, - nodes: makeNodeList([]string{"a"}), - filteredNodesStatuses: make(framework.NodeToStatusMap), - expectsErr: false, - expectedNodes: makeNodeList([]string{"a"}), - expectedStatuses: make(framework.NodeToStatusMap), - }, - { - name: "unschedulable", - extenders: []st.FakeExtender{ - { - ExtenderName: "FakeExtender1", - Predicates: []st.FitPredicate{func(pod *v1.Pod, node *v1.Node) *framework.Status { - if node.Name == "a" { - return framework.NewStatus(framework.Success) - } - return framework.NewStatus(framework.Unschedulable, fmt.Sprintf("node %q is not allowed", node.Name)) - }}, - }, - }, - nodes: makeNodeList([]string{"a", "b"}), - filteredNodesStatuses: make(framework.NodeToStatusMap), - expectsErr: false, - expectedNodes: makeNodeList([]string{"a"}), - expectedStatuses: framework.NodeToStatusMap{ - "b": framework.NewStatus(framework.Unschedulable, fmt.Sprintf("FakeExtender: node %q failed", "b")), - }, - }, - { - name: "unschedulable and unresolvable", - extenders: []st.FakeExtender{ - { - ExtenderName: "FakeExtender1", - Predicates: []st.FitPredicate{func(pod *v1.Pod, node *v1.Node) *framework.Status { - if node.Name == "a" { - return framework.NewStatus(framework.Success) - } - if node.Name == "b" { - return framework.NewStatus(framework.Unschedulable, fmt.Sprintf("node %q is not allowed", node.Name)) - } - return framework.NewStatus(framework.UnschedulableAndUnresolvable, fmt.Sprintf("node %q is not allowed", node.Name)) - }}, - }, - }, - nodes: makeNodeList([]string{"a", "b", "c"}), - filteredNodesStatuses: make(framework.NodeToStatusMap), - expectsErr: false, - expectedNodes: makeNodeList([]string{"a"}), - expectedStatuses: framework.NodeToStatusMap{ - "b": framework.NewStatus(framework.Unschedulable, fmt.Sprintf("FakeExtender: node %q failed", "b")), - "c": framework.NewStatus(framework.UnschedulableAndUnresolvable, fmt.Sprintf("FakeExtender: node %q failed and unresolvable", "c")), - }, - }, - { - name: "extender may overwrite the statuses", - extenders: []st.FakeExtender{ - { - ExtenderName: "FakeExtender1", - Predicates: []st.FitPredicate{func(pod *v1.Pod, node *v1.Node) *framework.Status { - if node.Name == "a" { - return framework.NewStatus(framework.Success) - } - if node.Name == "b" { - return framework.NewStatus(framework.Unschedulable, fmt.Sprintf("node %q is not allowed", node.Name)) - } - return framework.NewStatus(framework.UnschedulableAndUnresolvable, fmt.Sprintf("node %q is not allowed", node.Name)) - }}, - }, - }, - nodes: makeNodeList([]string{"a", "b", "c"}), - filteredNodesStatuses: 
framework.NodeToStatusMap{ - "c": framework.NewStatus(framework.Unschedulable, fmt.Sprintf("FakeFilterPlugin: node %q failed", "c")), - }, - expectsErr: false, - expectedNodes: makeNodeList([]string{"a"}), - expectedStatuses: framework.NodeToStatusMap{ - "b": framework.NewStatus(framework.Unschedulable, fmt.Sprintf("FakeExtender: node %q failed", "b")), - "c": framework.NewStatus(framework.UnschedulableAndUnresolvable, fmt.Sprintf("FakeFilterPlugin: node %q failed", "c"), fmt.Sprintf("FakeExtender: node %q failed and unresolvable", "c")), - }, - }, - { - name: "multiple extenders", - extenders: []st.FakeExtender{ - { - ExtenderName: "FakeExtender1", - Predicates: []st.FitPredicate{func(pod *v1.Pod, node *v1.Node) *framework.Status { - if node.Name == "a" { - return framework.NewStatus(framework.Success) - } - if node.Name == "b" { - return framework.NewStatus(framework.Unschedulable, fmt.Sprintf("node %q is not allowed", node.Name)) - } - return framework.NewStatus(framework.UnschedulableAndUnresolvable, fmt.Sprintf("node %q is not allowed", node.Name)) - }}, - }, - { - ExtenderName: "FakeExtender1", - Predicates: []st.FitPredicate{func(pod *v1.Pod, node *v1.Node) *framework.Status { - if node.Name == "a" { - return framework.NewStatus(framework.Success) - } - return framework.NewStatus(framework.Unschedulable, fmt.Sprintf("node %q is not allowed", node.Name)) - }}, - }, - }, - nodes: makeNodeList([]string{"a", "b", "c"}), - filteredNodesStatuses: make(framework.NodeToStatusMap), - expectsErr: false, - expectedNodes: makeNodeList([]string{"a"}), - expectedStatuses: framework.NodeToStatusMap{ - "b": framework.NewStatus(framework.Unschedulable, fmt.Sprintf("FakeExtender: node %q failed", "b")), - "c": framework.NewStatus(framework.UnschedulableAndUnresolvable, fmt.Sprintf("FakeExtender: node %q failed and unresolvable", "c")), - }, - }, - } - - cmpOpts := []cmp.Option{ - cmp.Comparer(func(s1 framework.Status, s2 framework.Status) bool { - return s1.Code() == s2.Code() && reflect.DeepEqual(s1.Reasons(), s2.Reasons()) - }), - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - var extenders []framework.Extender - for ii := range tt.extenders { - extenders = append(extenders, &tt.extenders[ii]) - } - - pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "1", UID: types.UID("1")}} - got, err := findNodesThatPassExtenders(extenders, pod, tt.nodes, tt.filteredNodesStatuses) - if tt.expectsErr { - if err == nil { - t.Error("Unexpected non-error") - } - } else { - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if diff := cmp.Diff(tt.expectedNodes, got); diff != "" { - t.Errorf("filtered nodes (-want,+got):\n%s", diff) - } - if diff := cmp.Diff(tt.expectedStatuses, tt.filteredNodesStatuses, cmpOpts...); diff != "" { - t.Errorf("filtered statuses (-want,+got):\n%s", diff) - } - } - }) - } -} - -func TestSchedulerSchedulePod(t *testing.T) { - fts := feature.Features{} - tests := []struct { - name string - registerPlugins []st.RegisterPluginFunc - nodes []string - pvcs []v1.PersistentVolumeClaim - pod *v1.Pod - pods []*v1.Pod - wantNodes sets.String - wantEvaluatedNodes *int32 - wErr error - }{ - { - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterFilterPlugin("FalseFilter", st.NewFalseFilterPlugin), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "2", UID: types.UID("2")}}, - name: "test 
1", - wErr: &framework.FitError{ - Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "2", UID: types.UID("2")}}, - NumAllNodes: 2, - Diagnosis: framework.Diagnosis{ - NodeToStatusMap: framework.NodeToStatusMap{ - "machine1": framework.NewStatus(framework.Unschedulable, st.ErrReasonFake).WithFailedPlugin("FalseFilter"), - "machine2": framework.NewStatus(framework.Unschedulable, st.ErrReasonFake).WithFailedPlugin("FalseFilter"), - }, - UnschedulablePlugins: sets.NewString("FalseFilter"), - }, - }, - }, - { - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "ignore", UID: types.UID("ignore")}}, - wantNodes: sets.NewString("machine1", "machine2"), - name: "test 2", - wErr: nil, - }, - { - // Fits on a machine where the pod ID matches the machine name - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterFilterPlugin("MatchFilter", st.NewMatchFilterPlugin), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "machine2", UID: types.UID("machine2")}}, - wantNodes: sets.NewString("machine2"), - name: "test 3", - wErr: nil, - }, - { - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), - st.RegisterScorePlugin("NumericMap", newNumericMapPlugin(), 1), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"3", "2", "1"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "ignore", UID: types.UID("ignore")}}, - wantNodes: sets.NewString("3"), - name: "test 4", - wErr: nil, - }, - { - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterFilterPlugin("MatchFilter", st.NewMatchFilterPlugin), - st.RegisterScorePlugin("NumericMap", newNumericMapPlugin(), 1), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"3", "2", "1"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "2", UID: types.UID("2")}}, - wantNodes: sets.NewString("2"), - name: "test 5", - wErr: nil, - }, - { - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), - st.RegisterScorePlugin("NumericMap", newNumericMapPlugin(), 1), - st.RegisterScorePlugin("ReverseNumericMap", newReverseNumericMapPlugin(), 2), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"3", "2", "1"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "2", UID: types.UID("2")}}, - wantNodes: sets.NewString("1"), - name: "test 6", - wErr: nil, - }, - { - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), - st.RegisterFilterPlugin("FalseFilter", st.NewFalseFilterPlugin), - st.RegisterScorePlugin("NumericMap", newNumericMapPlugin(), 1), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"3", "2", "1"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "2", UID: types.UID("2")}}, - 
name: "test 7", - wErr: &framework.FitError{ - Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "2", UID: types.UID("2")}}, - NumAllNodes: 3, - Diagnosis: framework.Diagnosis{ - NodeToStatusMap: framework.NodeToStatusMap{ - "3": framework.NewStatus(framework.Unschedulable, st.ErrReasonFake).WithFailedPlugin("FalseFilter"), - "2": framework.NewStatus(framework.Unschedulable, st.ErrReasonFake).WithFailedPlugin("FalseFilter"), - "1": framework.NewStatus(framework.Unschedulable, st.ErrReasonFake).WithFailedPlugin("FalseFilter"), - }, - UnschedulablePlugins: sets.NewString("FalseFilter"), - }, - }, - }, - { - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterFilterPlugin("NoPodsFilter", NewNoPodsFilterPlugin), - st.RegisterFilterPlugin("MatchFilter", st.NewMatchFilterPlugin), - st.RegisterScorePlugin("NumericMap", newNumericMapPlugin(), 1), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - pods: []*v1.Pod{ - { - ObjectMeta: metav1.ObjectMeta{Name: "2", UID: types.UID("2")}, - Spec: v1.PodSpec{ - NodeName: "2", - }, - Status: v1.PodStatus{ - Phase: v1.PodRunning, - }, - }, - }, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "2", UID: types.UID("2")}}, - nodes: []string{"1", "2"}, - name: "test 8", - wErr: &framework.FitError{ - Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "2", UID: types.UID("2")}}, - NumAllNodes: 2, - Diagnosis: framework.Diagnosis{ - NodeToStatusMap: framework.NodeToStatusMap{ - "1": framework.NewStatus(framework.Unschedulable, st.ErrReasonFake).WithFailedPlugin("MatchFilter"), - "2": framework.NewStatus(framework.Unschedulable, st.ErrReasonFake).WithFailedPlugin("NoPodsFilter"), - }, - UnschedulablePlugins: sets.NewString("MatchFilter", "NoPodsFilter"), - }, - }, - }, - { - // Pod with existing PVC - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPreFilterPlugin(volumebinding.Name, frameworkruntime.FactoryAdapter(fts, volumebinding.New)), - st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2"}, - pvcs: []v1.PersistentVolumeClaim{ - { - ObjectMeta: metav1.ObjectMeta{Name: "existingPVC", UID: types.UID("existingPVC"), Namespace: v1.NamespaceDefault}, - Spec: v1.PersistentVolumeClaimSpec{VolumeName: "existingPV"}, - }, - }, - pod: &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{Name: "ignore", UID: types.UID("ignore"), Namespace: v1.NamespaceDefault}, - Spec: v1.PodSpec{ - Volumes: []v1.Volume{ - { - VolumeSource: v1.VolumeSource{ - PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ - ClaimName: "existingPVC", - }, - }, - }, - }, - }, - }, - wantNodes: sets.NewString("machine1", "machine2"), - name: "existing PVC", - wErr: nil, - }, - { - // Pod with non existing PVC - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPreFilterPlugin(volumebinding.Name, frameworkruntime.FactoryAdapter(fts, volumebinding.New)), - st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2"}, - pod: &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{Name: "ignore", UID: types.UID("ignore")}, - Spec: v1.PodSpec{ - Volumes: []v1.Volume{ - { - VolumeSource: v1.VolumeSource{ - PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ - ClaimName: 
"unknownPVC", - }, - }, - }, - }, - }, - }, - name: "unknown PVC", - wErr: &framework.FitError{ - Pod: &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{Name: "ignore", UID: types.UID("ignore")}, - Spec: v1.PodSpec{ - Volumes: []v1.Volume{ - { - VolumeSource: v1.VolumeSource{ - PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ - ClaimName: "unknownPVC", - }, - }, - }, - }, - }, - }, - NumAllNodes: 2, - Diagnosis: framework.Diagnosis{ - NodeToStatusMap: framework.NodeToStatusMap{ - "machine1": framework.NewStatus(framework.UnschedulableAndUnresolvable, `persistentvolumeclaim "unknownPVC" not found`).WithFailedPlugin(volumebinding.Name), - "machine2": framework.NewStatus(framework.UnschedulableAndUnresolvable, `persistentvolumeclaim "unknownPVC" not found`).WithFailedPlugin(volumebinding.Name), - }, - UnschedulablePlugins: sets.NewString(volumebinding.Name), - }, - }, - }, - { - // Pod with deleting PVC - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPreFilterPlugin(volumebinding.Name, frameworkruntime.FactoryAdapter(fts, volumebinding.New)), - st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2"}, - pvcs: []v1.PersistentVolumeClaim{{ObjectMeta: metav1.ObjectMeta{Name: "existingPVC", UID: types.UID("existingPVC"), Namespace: v1.NamespaceDefault, DeletionTimestamp: &metav1.Time{}}}}, - pod: &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{Name: "ignore", UID: types.UID("ignore"), Namespace: v1.NamespaceDefault}, - Spec: v1.PodSpec{ - Volumes: []v1.Volume{ - { - VolumeSource: v1.VolumeSource{ - PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ - ClaimName: "existingPVC", - }, - }, - }, - }, - }, - }, - name: "deleted PVC", - wErr: &framework.FitError{ - Pod: &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{Name: "ignore", UID: types.UID("ignore"), Namespace: v1.NamespaceDefault}, - Spec: v1.PodSpec{ - Volumes: []v1.Volume{ - { - VolumeSource: v1.VolumeSource{ - PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ - ClaimName: "existingPVC", - }, - }, - }, - }, - }, - }, - NumAllNodes: 2, - Diagnosis: framework.Diagnosis{ - NodeToStatusMap: framework.NodeToStatusMap{ - "machine1": framework.NewStatus(framework.UnschedulableAndUnresolvable, `persistentvolumeclaim "existingPVC" is being deleted`).WithFailedPlugin(volumebinding.Name), - "machine2": framework.NewStatus(framework.UnschedulableAndUnresolvable, `persistentvolumeclaim "existingPVC" is being deleted`).WithFailedPlugin(volumebinding.Name), - }, - UnschedulablePlugins: sets.NewString(volumebinding.Name), - }, - }, - }, - { - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), - st.RegisterScorePlugin("FalseMap", newFalseMapPlugin(), 1), - st.RegisterScorePlugin("TrueMap", newTrueMapPlugin(), 2), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"2", "1"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "2"}}, - name: "test error with priority map", - wErr: fmt.Errorf("running Score plugins: %w", fmt.Errorf(`plugin "FalseMap" failed with: %w`, errPrioritize)), - }, - { - name: "test podtopologyspread plugin - 2 nodes with maxskew=1", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions( - 
podtopologyspread.Name, - podTopologySpreadFunc, - "PreFilter", - "Filter", - ), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2"}, - pod: &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{Name: "p", UID: types.UID("p"), Labels: map[string]string{"foo": ""}}, - Spec: v1.PodSpec{ - TopologySpreadConstraints: []v1.TopologySpreadConstraint{ - { - MaxSkew: 1, - TopologyKey: "hostname", - WhenUnsatisfiable: v1.DoNotSchedule, - LabelSelector: &metav1.LabelSelector{ - MatchExpressions: []metav1.LabelSelectorRequirement{ - { - Key: "foo", - Operator: metav1.LabelSelectorOpExists, - }, - }, - }, - }, - }, - }, - }, - pods: []*v1.Pod{ - { - ObjectMeta: metav1.ObjectMeta{Name: "pod1", UID: types.UID("pod1"), Labels: map[string]string{"foo": ""}}, - Spec: v1.PodSpec{ - NodeName: "machine1", - }, - Status: v1.PodStatus{ - Phase: v1.PodRunning, - }, - }, - }, - wantNodes: sets.NewString("machine2"), - wErr: nil, - }, - { - name: "test podtopologyspread plugin - 3 nodes with maxskew=2", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPluginAsExtensions( - podtopologyspread.Name, - podTopologySpreadFunc, - "PreFilter", - "Filter", - ), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"machine1", "machine2", "machine3"}, - pod: &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{Name: "p", UID: types.UID("p"), Labels: map[string]string{"foo": ""}}, - Spec: v1.PodSpec{ - TopologySpreadConstraints: []v1.TopologySpreadConstraint{ - { - MaxSkew: 2, - TopologyKey: "hostname", - WhenUnsatisfiable: v1.DoNotSchedule, - LabelSelector: &metav1.LabelSelector{ - MatchExpressions: []metav1.LabelSelectorRequirement{ - { - Key: "foo", - Operator: metav1.LabelSelectorOpExists, - }, - }, - }, - }, - }, - }, - }, - pods: []*v1.Pod{ - { - ObjectMeta: metav1.ObjectMeta{Name: "pod1a", UID: types.UID("pod1a"), Labels: map[string]string{"foo": ""}}, - Spec: v1.PodSpec{ - NodeName: "machine1", - }, - Status: v1.PodStatus{ - Phase: v1.PodRunning, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{Name: "pod1b", UID: types.UID("pod1b"), Labels: map[string]string{"foo": ""}}, - Spec: v1.PodSpec{ - NodeName: "machine1", - }, - Status: v1.PodStatus{ - Phase: v1.PodRunning, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{Name: "pod2", UID: types.UID("pod2"), Labels: map[string]string{"foo": ""}}, - Spec: v1.PodSpec{ - NodeName: "machine2", - }, - Status: v1.PodStatus{ - Phase: v1.PodRunning, - }, - }, - }, - wantNodes: sets.NewString("machine2", "machine3"), - wErr: nil, - }, - { - name: "test with filter plugin returning Unschedulable status", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterFilterPlugin( - "FakeFilter", - st.NewFakeFilterPlugin(map[string]framework.Code{"3": framework.Unschedulable}), - ), - st.RegisterScorePlugin("NumericMap", newNumericMapPlugin(), 1), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"3"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-filter", UID: types.UID("test-filter")}}, - wantNodes: nil, - wErr: &framework.FitError{ - Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-filter", UID: types.UID("test-filter")}}, - NumAllNodes: 1, - Diagnosis: framework.Diagnosis{ - NodeToStatusMap: framework.NodeToStatusMap{ - "3": framework.NewStatus(framework.Unschedulable, "injecting failure for pod test-filter").WithFailedPlugin("FakeFilter"), - }, 
- UnschedulablePlugins: sets.NewString("FakeFilter"), - }, - }, - }, - { - name: "test with filter plugin returning UnschedulableAndUnresolvable status", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterFilterPlugin( - "FakeFilter", - st.NewFakeFilterPlugin(map[string]framework.Code{"3": framework.UnschedulableAndUnresolvable}), - ), - st.RegisterScorePlugin("NumericMap", newNumericMapPlugin(), 1), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"3"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-filter", UID: types.UID("test-filter")}}, - wantNodes: nil, - wErr: &framework.FitError{ - Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-filter", UID: types.UID("test-filter")}}, - NumAllNodes: 1, - Diagnosis: framework.Diagnosis{ - NodeToStatusMap: framework.NodeToStatusMap{ - "3": framework.NewStatus(framework.UnschedulableAndUnresolvable, "injecting failure for pod test-filter").WithFailedPlugin("FakeFilter"), - }, - UnschedulablePlugins: sets.NewString("FakeFilter"), - }, - }, - }, - { - name: "test with partial failed filter plugin", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterFilterPlugin( - "FakeFilter", - st.NewFakeFilterPlugin(map[string]framework.Code{"1": framework.Unschedulable}), - ), - st.RegisterScorePlugin("NumericMap", newNumericMapPlugin(), 1), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"1", "2"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-filter", UID: types.UID("test-filter")}}, - wantNodes: nil, - wErr: nil, - }, - { - name: "test prefilter plugin returning Unschedulable status", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPreFilterPlugin( - "FakePreFilter", - st.NewFakePreFilterPlugin("FakePreFilter", nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, "injected unschedulable status")), - ), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"1", "2"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-prefilter", UID: types.UID("test-prefilter")}}, - wantNodes: nil, - wErr: &framework.FitError{ - Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-prefilter", UID: types.UID("test-prefilter")}}, - NumAllNodes: 2, - Diagnosis: framework.Diagnosis{ - NodeToStatusMap: framework.NodeToStatusMap{ - "1": framework.NewStatus(framework.UnschedulableAndUnresolvable, "injected unschedulable status").WithFailedPlugin("FakePreFilter"), - "2": framework.NewStatus(framework.UnschedulableAndUnresolvable, "injected unschedulable status").WithFailedPlugin("FakePreFilter"), - }, - UnschedulablePlugins: sets.NewString("FakePreFilter"), - }, - }, - }, - { - name: "test prefilter plugin returning error status", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPreFilterPlugin( - "FakePreFilter", - st.NewFakePreFilterPlugin("FakePreFilter", nil, framework.NewStatus(framework.Error, "injected error status")), - ), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"1", "2"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-prefilter", UID: types.UID("test-prefilter")}}, - wantNodes: nil, - wErr: fmt.Errorf(`running PreFilter plugin "FakePreFilter": %w`, errors.New("injected error status")), - }, - { - 
name: "test prefilter plugin returning node", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPreFilterPlugin( - "FakePreFilter1", - st.NewFakePreFilterPlugin("FakePreFilter1", nil, nil), - ), - st.RegisterPreFilterPlugin( - "FakePreFilter2", - st.NewFakePreFilterPlugin("FakePreFilter2", &framework.PreFilterResult{NodeNames: sets.NewString("node2")}, nil), - ), - st.RegisterPreFilterPlugin( - "FakePreFilter3", - st.NewFakePreFilterPlugin("FakePreFilter3", &framework.PreFilterResult{NodeNames: sets.NewString("node1", "node2")}, nil), - ), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"node1", "node2", "node3"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-prefilter", UID: types.UID("test-prefilter")}}, - wantNodes: sets.NewString("node2"), - wantEvaluatedNodes: pointer.Int32Ptr(1), - }, - { - name: "test prefilter plugin returning non-intersecting nodes", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPreFilterPlugin( - "FakePreFilter1", - st.NewFakePreFilterPlugin("FakePreFilter1", nil, nil), - ), - st.RegisterPreFilterPlugin( - "FakePreFilter2", - st.NewFakePreFilterPlugin("FakePreFilter2", &framework.PreFilterResult{NodeNames: sets.NewString("node2")}, nil), - ), - st.RegisterPreFilterPlugin( - "FakePreFilter3", - st.NewFakePreFilterPlugin("FakePreFilter3", &framework.PreFilterResult{NodeNames: sets.NewString("node1")}, nil), - ), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"node1", "node2", "node3"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-prefilter", UID: types.UID("test-prefilter")}}, - wErr: &framework.FitError{ - Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-prefilter", UID: types.UID("test-prefilter")}}, - NumAllNodes: 3, - Diagnosis: framework.Diagnosis{ - NodeToStatusMap: framework.NodeToStatusMap{ - "node1": framework.NewStatus(framework.Unschedulable, "node(s) didn't satisfy plugin(s) [FakePreFilter2 FakePreFilter3] simultaneously"), - "node2": framework.NewStatus(framework.Unschedulable, "node(s) didn't satisfy plugin(s) [FakePreFilter2 FakePreFilter3] simultaneously"), - "node3": framework.NewStatus(framework.Unschedulable, "node(s) didn't satisfy plugin(s) [FakePreFilter2 FakePreFilter3] simultaneously"), - }, - UnschedulablePlugins: sets.String{}, - }, - }, - }, - { - name: "test prefilter plugin returning empty node set", - registerPlugins: []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterPreFilterPlugin( - "FakePreFilter1", - st.NewFakePreFilterPlugin("FakePreFilter1", nil, nil), - ), - st.RegisterPreFilterPlugin( - "FakePreFilter2", - st.NewFakePreFilterPlugin("FakePreFilter2", &framework.PreFilterResult{NodeNames: sets.NewString()}, nil), - ), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - nodes: []string{"node1"}, - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-prefilter", UID: types.UID("test-prefilter")}}, - wErr: &framework.FitError{ - Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-prefilter", UID: types.UID("test-prefilter")}}, - NumAllNodes: 1, - Diagnosis: framework.Diagnosis{ - NodeToStatusMap: framework.NodeToStatusMap{ - "node1": framework.NewStatus(framework.Unschedulable, "node(s) didn't satisfy plugin FakePreFilter2"), - }, - UnschedulablePlugins: sets.String{}, - }, - }, - }, - } - for _, test := range tests 
{ - t.Run(test.name, func(t *testing.T) { - cache := internalcache.New(time.Duration(0), wait.NeverStop) - for _, pod := range test.pods { - cache.AddPod(pod) - } - var nodes []*v1.Node - for _, name := range test.nodes { - node := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: name, Labels: map[string]string{"hostname": name}}} - nodes = append(nodes, node) - cache.AddNode(node) - } - - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - cs := clientsetfake.NewSimpleClientset() - informerFactory := informers.NewSharedInformerFactory(cs, 0) - for _, pvc := range test.pvcs { - metav1.SetMetaDataAnnotation(&pvc.ObjectMeta, volume.AnnBindCompleted, "true") - cs.CoreV1().PersistentVolumeClaims(pvc.Namespace).Create(ctx, &pvc, metav1.CreateOptions{}) - if pvName := pvc.Spec.VolumeName; pvName != "" { - pv := v1.PersistentVolume{ObjectMeta: metav1.ObjectMeta{Name: pvName}} - cs.CoreV1().PersistentVolumes().Create(ctx, &pv, metav1.CreateOptions{}) - } - } - snapshot := internalcache.NewSnapshot(test.pods, nodes) - fwk, err := st.NewFramework( - test.registerPlugins, "", - frameworkruntime.WithSnapshotSharedLister(snapshot), - frameworkruntime.WithInformerFactory(informerFactory), - frameworkruntime.WithPodNominator(internalqueue.NewPodNominator(informerFactory.Core().V1().Pods().Lister())), - ) - if err != nil { - t.Fatal(err) - } - - scheduler := newScheduler( - cache, - nil, - nil, - nil, - nil, - nil, - nil, - nil, - snapshot, - schedulerapi.DefaultPercentageOfNodesToScore) - informerFactory.Start(ctx.Done()) - informerFactory.WaitForCacheSync(ctx.Done()) - - result, err := scheduler.SchedulePod(ctx, fwk, framework.NewCycleState(), test.pod) - if err != test.wErr { - gotFitErr, gotOK := err.(*framework.FitError) - wantFitErr, wantOK := test.wErr.(*framework.FitError) - if gotOK != wantOK { - t.Errorf("Expected err to be FitError: %v, but got %v", wantOK, gotOK) - } else if gotOK { - if diff := cmp.Diff(gotFitErr, wantFitErr); diff != "" { - t.Errorf("Unexpected fitErr: (-want, +got): %s", diff) - } - } - } - if test.wantNodes != nil && !test.wantNodes.Has(result.SuggestedHost) { - t.Errorf("Expected: %s, got: %s", test.wantNodes, result.SuggestedHost) - } - wantEvaluatedNodes := len(test.nodes) - if test.wantEvaluatedNodes != nil { - wantEvaluatedNodes = int(*test.wantEvaluatedNodes) - } - if test.wErr == nil && wantEvaluatedNodes != result.EvaluatedNodes { - t.Errorf("Expected EvaluatedNodes: %d, got: %d", wantEvaluatedNodes, result.EvaluatedNodes) - } - }) - } -} - -// makeScheduler makes a simple Scheduler for testing. 
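// It seeds the scheduler cache with the given nodes, leaves the client, queue,
// and profiles nil, and refreshes the node snapshot so that
// findNodesThatFitPod can be exercised directly in the tests below.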
-func makeScheduler(nodes []*v1.Node) *Scheduler { - cache := internalcache.New(time.Duration(0), wait.NeverStop) - for _, n := range nodes { - cache.AddNode(n) - } - - s := newScheduler( - cache, - nil, - nil, - nil, - nil, - nil, - nil, - nil, - emptySnapshot, - schedulerapi.DefaultPercentageOfNodesToScore) - cache.UpdateSnapshot(s.nodeInfoSnapshot) - return s -} - -func TestFindFitAllError(t *testing.T) { - nodes := makeNodeList([]string{"3", "2", "1"}) - scheduler := makeScheduler(nodes) - fwk, err := st.NewFramework( - []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), - st.RegisterFilterPlugin("MatchFilter", st.NewMatchFilterPlugin), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - "", - frameworkruntime.WithPodNominator(internalqueue.NewPodNominator(nil)), - ) - if err != nil { - t.Fatal(err) - } - - _, diagnosis, err := scheduler.findNodesThatFitPod(context.Background(), fwk, framework.NewCycleState(), &v1.Pod{}) - if err != nil { - t.Errorf("unexpected error: %v", err) - } - - expected := framework.Diagnosis{ - NodeToStatusMap: framework.NodeToStatusMap{ - "1": framework.NewStatus(framework.Unschedulable, st.ErrReasonFake).WithFailedPlugin("MatchFilter"), - "2": framework.NewStatus(framework.Unschedulable, st.ErrReasonFake).WithFailedPlugin("MatchFilter"), - "3": framework.NewStatus(framework.Unschedulable, st.ErrReasonFake).WithFailedPlugin("MatchFilter"), - }, - UnschedulablePlugins: sets.NewString("MatchFilter"), - } - if diff := cmp.Diff(diagnosis, expected); diff != "" { - t.Errorf("Unexpected diagnosis: (-want, +got): %s", diff) - } -} - -func TestFindFitSomeError(t *testing.T) { - nodes := makeNodeList([]string{"3", "2", "1"}) - scheduler := makeScheduler(nodes) - fwk, err := st.NewFramework( - []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin), - st.RegisterFilterPlugin("MatchFilter", st.NewMatchFilterPlugin), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - "", - frameworkruntime.WithPodNominator(internalqueue.NewPodNominator(nil)), - ) - if err != nil { - t.Fatal(err) - } - - pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "1", UID: types.UID("1")}} - _, diagnosis, err := scheduler.findNodesThatFitPod(context.Background(), fwk, framework.NewCycleState(), pod) - if err != nil { - t.Errorf("unexpected error: %v", err) - } - - if len(diagnosis.NodeToStatusMap) != len(nodes)-1 { - t.Errorf("unexpected failed status map: %v", diagnosis.NodeToStatusMap) - } - - if diff := cmp.Diff(sets.NewString("MatchFilter"), diagnosis.UnschedulablePlugins); diff != "" { - t.Errorf("Unexpected unschedulablePlugins: (-want, +got): %s", diagnosis.UnschedulablePlugins) - } - - for _, node := range nodes { - if node.Name == pod.Name { - continue - } - t.Run(node.Name, func(t *testing.T) { - status, found := diagnosis.NodeToStatusMap[node.Name] - if !found { - t.Errorf("failed to find node %v in %v", node.Name, diagnosis.NodeToStatusMap) - } - reasons := status.Reasons() - if len(reasons) != 1 || reasons[0] != st.ErrReasonFake { - t.Errorf("unexpected failures: %v", reasons) - } - }) - } -} - -func TestFindFitPredicateCallCounts(t *testing.T) { - tests := []struct { - name string - pod *v1.Pod - expectedCount int32 - }{ - { - name: "nominated pods have lower priority, predicate is called once", - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "1", UID: 
types.UID("1")}, Spec: v1.PodSpec{Priority: &highPriority}}, - expectedCount: 1, - }, - { - name: "nominated pods have higher priority, predicate is called twice", - pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "1", UID: types.UID("1")}, Spec: v1.PodSpec{Priority: &lowPriority}}, - expectedCount: 2, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - nodes := makeNodeList([]string{"1"}) - - plugin := st.FakeFilterPlugin{} - registerFakeFilterFunc := st.RegisterFilterPlugin( - "FakeFilter", - func(_ runtime.Object, fh framework.Handle) (framework.Plugin, error) { - return &plugin, nil - }, - ) - registerPlugins := []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - registerFakeFilterFunc, - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - } - fwk, err := st.NewFramework( - registerPlugins, "", - frameworkruntime.WithPodNominator(internalqueue.NewPodNominator(nil)), - ) - if err != nil { - t.Fatal(err) - } - - scheduler := makeScheduler(nodes) - if err := scheduler.Cache.UpdateSnapshot(scheduler.nodeInfoSnapshot); err != nil { - t.Fatal(err) - } - fwk.AddNominatedPod(framework.NewPodInfo(&v1.Pod{ObjectMeta: metav1.ObjectMeta{UID: "nominated"}, Spec: v1.PodSpec{Priority: &midPriority}}), - &framework.NominatingInfo{NominatingMode: framework.ModeOverride, NominatedNodeName: "1"}) - - _, _, err = scheduler.findNodesThatFitPod(context.Background(), fwk, framework.NewCycleState(), test.pod) - if err != nil { - t.Errorf("unexpected error: %v", err) - } - if test.expectedCount != plugin.NumFilterCalled { - t.Errorf("predicate was called %d times, expected is %d", plugin.NumFilterCalled, test.expectedCount) - } - }) - } -} - -func makeNode(node string, milliCPU, memory int64) *v1.Node { - return &v1.Node{ - ObjectMeta: metav1.ObjectMeta{Name: node}, - Status: v1.NodeStatus{ - Capacity: v1.ResourceList{ - v1.ResourceCPU: *resource.NewMilliQuantity(milliCPU, resource.DecimalSI), - v1.ResourceMemory: *resource.NewQuantity(memory, resource.BinarySI), - "pods": *resource.NewQuantity(100, resource.DecimalSI), - }, - Allocatable: v1.ResourceList{ - - v1.ResourceCPU: *resource.NewMilliQuantity(milliCPU, resource.DecimalSI), - v1.ResourceMemory: *resource.NewQuantity(memory, resource.BinarySI), - "pods": *resource.NewQuantity(100, resource.DecimalSI), - }, - }, - } -} - -// The point of this test is to show that you: -// - get the same priority for a zero-request pod as for a pod with the defaults requests, -// both when the zero-request pod is already on the machine and when the zero-request pod -// is the one being scheduled. -// - don't get the same score no matter what we schedule. -func TestZeroRequest(t *testing.T) { - // A pod with no resources. We expect spreading to count it as having the default resources. - noResources := v1.PodSpec{ - Containers: []v1.Container{ - {}, - }, - } - noResources1 := noResources - noResources1.NodeName = "machine1" - // A pod with the same resources as a 0-request pod gets by default as its resources (for spreading). - small := v1.PodSpec{ - Containers: []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - v1.ResourceCPU: resource.MustParse( - strconv.FormatInt(schedutil.DefaultMilliCPURequest, 10) + "m"), - v1.ResourceMemory: resource.MustParse( - strconv.FormatInt(schedutil.DefaultMemoryRequest, 10)), - }, - }, - }, - }, - } - small2 := small - small2.NodeName = "machine2" - // A larger pod. 
- large := v1.PodSpec{ - Containers: []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - v1.ResourceCPU: resource.MustParse( - strconv.FormatInt(schedutil.DefaultMilliCPURequest*3, 10) + "m"), - v1.ResourceMemory: resource.MustParse( - strconv.FormatInt(schedutil.DefaultMemoryRequest*3, 10)), - }, - }, - }, - }, - } - large1 := large - large1.NodeName = "machine1" - large2 := large - large2.NodeName = "machine2" - tests := []struct { - pod *v1.Pod - pods []*v1.Pod - nodes []*v1.Node - name string - expectedScore int64 - }{ - // The point of these next two tests is to show you get the same priority for a zero-request pod - // as for a pod with the defaults requests, both when the zero-request pod is already on the machine - // and when the zero-request pod is the one being scheduled. - { - pod: &v1.Pod{Spec: noResources}, - nodes: []*v1.Node{makeNode("machine1", 1000, schedutil.DefaultMemoryRequest*10), makeNode("machine2", 1000, schedutil.DefaultMemoryRequest*10)}, - name: "test priority of zero-request pod with machine with zero-request pod", - pods: []*v1.Pod{ - {Spec: large1}, {Spec: noResources1}, - {Spec: large2}, {Spec: small2}, - }, - expectedScore: 250, - }, - { - pod: &v1.Pod{Spec: small}, - nodes: []*v1.Node{makeNode("machine1", 1000, schedutil.DefaultMemoryRequest*10), makeNode("machine2", 1000, schedutil.DefaultMemoryRequest*10)}, - name: "test priority of nonzero-request pod with machine with zero-request pod", - pods: []*v1.Pod{ - {Spec: large1}, {Spec: noResources1}, - {Spec: large2}, {Spec: small2}, - }, - expectedScore: 250, - }, - // The point of this test is to verify that we're not just getting the same score no matter what we schedule. - { - pod: &v1.Pod{Spec: large}, - nodes: []*v1.Node{makeNode("machine1", 1000, schedutil.DefaultMemoryRequest*10), makeNode("machine2", 1000, schedutil.DefaultMemoryRequest*10)}, - name: "test priority of larger pod with machine with zero-request pod", - pods: []*v1.Pod{ - {Spec: large1}, {Spec: noResources1}, - {Spec: large2}, {Spec: small2}, - }, - expectedScore: 230, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - client := clientsetfake.NewSimpleClientset() - informerFactory := informers.NewSharedInformerFactory(client, 0) - - snapshot := internalcache.NewSnapshot(test.pods, test.nodes) - fts := feature.Features{} - pluginRegistrations := []st.RegisterPluginFunc{ - st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - st.RegisterScorePlugin(noderesources.Name, frameworkruntime.FactoryAdapter(fts, noderesources.NewFit), 1), - st.RegisterScorePlugin(noderesources.BalancedAllocationName, frameworkruntime.FactoryAdapter(fts, noderesources.NewBalancedAllocation), 1), - st.RegisterScorePlugin(selectorspread.Name, selectorspread.New, 1), - st.RegisterPreScorePlugin(selectorspread.Name, selectorspread.New), - st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - } - fwk, err := st.NewFramework( - pluginRegistrations, "", - frameworkruntime.WithInformerFactory(informerFactory), - frameworkruntime.WithSnapshotSharedLister(snapshot), - frameworkruntime.WithClientSet(client), - frameworkruntime.WithPodNominator(internalqueue.NewPodNominator(informerFactory.Core().V1().Pods().Lister())), - ) - if err != nil { - t.Fatalf("error creating framework: %+v", err) - } - - scheduler := newScheduler( - nil, - nil, - nil, - nil, - nil, - nil, - nil, - nil, - snapshot, - schedulerapi.DefaultPercentageOfNodesToScore) - - ctx := context.Background() - state := 
-			state := framework.NewCycleState()
-			_, _, err = scheduler.findNodesThatFitPod(ctx, fwk, state, test.pod)
-			if err != nil {
-				t.Fatalf("error filtering nodes: %+v", err)
-			}
-			fwk.RunPreScorePlugins(ctx, state, test.pod, test.nodes)
-			list, err := prioritizeNodes(ctx, nil, fwk, state, test.pod, test.nodes)
-			if err != nil {
-				t.Errorf("unexpected error: %v", err)
-			}
-			for _, hp := range list {
-				if hp.Score != test.expectedScore {
-					t.Errorf("expected %d for all priorities, got list %#v", test.expectedScore, list)
-				}
-			}
-		})
-	}
-}
-
-var lowPriority, midPriority, highPriority = int32(0), int32(100), int32(1000)
-
-func TestNumFeasibleNodesToFind(t *testing.T) {
-	tests := []struct {
-		name                     string
-		percentageOfNodesToScore int32
-		numAllNodes              int32
-		wantNumNodes             int32
-	}{
-		{
-			name:         "not set percentageOfNodesToScore and nodes number not more than 50",
-			numAllNodes:  10,
-			wantNumNodes: 10,
-		},
-		{
-			name:                     "set percentageOfNodesToScore and nodes number not more than 50",
-			percentageOfNodesToScore: 40,
-			numAllNodes:              10,
-			wantNumNodes:             10,
-		},
-		{
-			name:         "not set percentageOfNodesToScore and nodes number more than 50",
-			numAllNodes:  1000,
-			wantNumNodes: 420,
-		},
-		{
-			name:                     "set percentageOfNodesToScore and nodes number more than 50",
-			percentageOfNodesToScore: 40,
-			numAllNodes:              1000,
-			wantNumNodes:             400,
-		},
-		{
-			name:         "not set percentageOfNodesToScore and nodes number more than 50*125",
-			numAllNodes:  6000,
-			wantNumNodes: 300,
-		},
-		{
-			name:                     "set percentageOfNodesToScore and nodes number more than 50*125",
-			percentageOfNodesToScore: 40,
-			numAllNodes:              6000,
-			wantNumNodes:             2400,
-		},
-	}
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			sched := &Scheduler{
-				percentageOfNodesToScore: tt.percentageOfNodesToScore,
-			}
-			if gotNumNodes := sched.numFeasibleNodesToFind(tt.numAllNodes); gotNumNodes != tt.wantNumNodes {
-				t.Errorf("Scheduler.numFeasibleNodesToFind() = %v, want %v", gotNumNodes, tt.wantNumNodes)
-			}
-		})
-	}
-}
-
-func TestFairEvaluationForNodes(t *testing.T) {
-	numAllNodes := 500
-	nodeNames := make([]string, 0, numAllNodes)
-	for i := 0; i < numAllNodes; i++ {
-		nodeNames = append(nodeNames, strconv.Itoa(i))
-	}
-	nodes := makeNodeList(nodeNames)
-	sched := makeScheduler(nodes)
-	fwk, err := st.NewFramework(
-		[]st.RegisterPluginFunc{
-			st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New),
-			st.RegisterFilterPlugin("TrueFilter", st.NewTrueFilterPlugin),
-			st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New),
-		},
-		"",
-		frameworkruntime.WithPodNominator(internalqueue.NewPodNominator(nil)),
-	)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	// To make numAllNodes % nodesToFind != 0
-	sched.percentageOfNodesToScore = 30
-	nodesToFind := int(sched.numFeasibleNodesToFind(int32(numAllNodes)))
-
-	// Iterating over all nodes more than twice
-	for i := 0; i < 2*(numAllNodes/nodesToFind+1); i++ {
-		nodesThatFit, _, err := sched.findNodesThatFitPod(context.Background(), fwk, framework.NewCycleState(), &v1.Pod{})
-		if err != nil {
-			t.Errorf("unexpected error: %v", err)
-		}
-		if len(nodesThatFit) != nodesToFind {
-			t.Errorf("got %d nodes filtered, want %d", len(nodesThatFit), nodesToFind)
-		}
-		if sched.nextStartNodeIndex != (i+1)*nodesToFind%numAllNodes {
-			t.Errorf("got %d lastProcessedNodeIndex, want %d", sched.nextStartNodeIndex, (i+1)*nodesToFind%numAllNodes)
-		}
-	}
-}
-
-func TestPreferNominatedNodeFilterCallCounts(t *testing.T) {
-	tests := []struct {
-		name                  string
-		pod                   *v1.Pod
-		nodeReturnCodeMap     map[string]framework.Code
-		expectedCount         int32
-		expectedPatchRequests int
-	}{
-		{
-			name:          "pod has the nominated node set, filter is called only once",
-			pod:           st.MakePod().Name("p_with_nominated_node").UID("p").Priority(highPriority).NominatedNodeName("node1").Obj(),
-			expectedCount: 1,
-		},
-		{
-			name:          "pod without the nominated node, filter is called for each node",
-			pod:           st.MakePod().Name("p_without_nominated_node").UID("p").Priority(highPriority).Obj(),
-			expectedCount: 3,
-		},
-		{
-			name:              "nominated pod cannot pass the filter, filter is called for each node",
-			pod:               st.MakePod().Name("p_with_nominated_node").UID("p").Priority(highPriority).NominatedNodeName("node1").Obj(),
-			nodeReturnCodeMap: map[string]framework.Code{"node1": framework.Unschedulable},
-			expectedCount:     4,
-		},
-	}
-
-	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
-			// Create three nodes in the cluster.
-			nodes := makeNodeList([]string{"node1", "node2", "node3"})
-			client := clientsetfake.NewSimpleClientset(test.pod)
-			informerFactory := informers.NewSharedInformerFactory(client, 0)
-			cache := internalcache.New(time.Duration(0), wait.NeverStop)
-			for _, n := range nodes {
-				cache.AddNode(n)
-			}
-			plugin := st.FakeFilterPlugin{FailedNodeReturnCodeMap: test.nodeReturnCodeMap}
-			registerFakeFilterFunc := st.RegisterFilterPlugin(
-				"FakeFilter",
-				func(_ runtime.Object, fh framework.Handle) (framework.Plugin, error) {
-					return &plugin, nil
-				},
-			)
-			registerPlugins := []st.RegisterPluginFunc{
-				st.RegisterQueueSortPlugin(queuesort.Name, queuesort.New),
-				registerFakeFilterFunc,
-				st.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New),
-			}
-			fwk, err := st.NewFramework(
-				registerPlugins, "",
-				frameworkruntime.WithClientSet(client),
-				frameworkruntime.WithPodNominator(internalqueue.NewPodNominator(informerFactory.Core().V1().Pods().Lister())),
-			)
-			if err != nil {
-				t.Fatal(err)
-			}
-			snapshot := internalcache.NewSnapshot(nil, nodes)
-			scheduler := newScheduler(
-				cache,
-				nil,
-				nil,
-				nil,
-				nil,
-				nil,
-				nil,
-				nil,
-				snapshot,
-				schedulerapi.DefaultPercentageOfNodesToScore)
-
-			_, _, err = scheduler.findNodesThatFitPod(context.Background(), fwk, framework.NewCycleState(), test.pod)
-			if err != nil {
-				t.Errorf("unexpected error: %v", err)
-			}
-			if test.expectedCount != plugin.NumFilterCalled {
-				t.Errorf("predicate was called %d times, expected is %d", plugin.NumFilterCalled, test.expectedCount)
-			}
-		})
-	}
-}
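Note on TestZeroRequest above: the equal expected scores (250 for both the zero-request and the default-request pod) hinge on the scheduler treating a container with no requests as if it had asked for the default CPU and memory. Below is a minimal standalone sketch of that substitution; the constant values stand in for schedutil.DefaultMilliCPURequest and schedutil.DefaultMemoryRequest and the helper name is illustrative, not the deleted implementation verbatim.

// Assumed stand-ins for schedutil.DefaultMilliCPURequest / DefaultMemoryRequest.
const (
	defaultMilliCPURequest int64 = 100               // 0.1 core
	defaultMemoryRequest   int64 = 200 * 1024 * 1024 // 200 MB
)

// nonZeroRequests mirrors the idea behind the scheduler's "non-zero request"
// accounting: a container that requests nothing is scored as if it requested
// the defaults, so a zero-request pod and a pod requesting exactly those
// defaults spread and score identically.
func nonZeroRequests(milliCPU, memory int64) (int64, int64) {
	if milliCPU == 0 {
		milliCPU = defaultMilliCPURequest
	}
	if memory == 0 {
		memory = defaultMemoryRequest
	}
	return milliCPU, memory
}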
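The expected values in TestNumFeasibleNodesToFind follow an adaptive-percentage rule. The sketch below is consistent with that table (420 for 1000 nodes with no percentage set, 300 for 6000 nodes, 2400 for 6000 nodes at 40%); the function name and the exact clamping thresholds (100-node minimum, 5% floor) are inferred from those expectations rather than quoted from the deleted code.

// Assumed thresholds, inferred from the expected values in the test table.
const (
	minFeasibleNodesToFind           int32 = 100
	minFeasibleNodesPercentageToFind int32 = 5
)

// numFeasibleNodesToFindSketch returns how many feasible nodes the scheduler
// looks for before it stops filtering; percentageOfNodesToScore == 0 means
// "not set".
func numFeasibleNodesToFindSketch(numAllNodes, percentageOfNodesToScore int32) int32 {
	// Small clusters are always filtered exhaustively.
	if numAllNodes < minFeasibleNodesToFind || percentageOfNodesToScore >= 100 {
		return numAllNodes
	}

	adaptivePercentage := percentageOfNodesToScore
	if adaptivePercentage <= 0 {
		// Default: start at 50% and shrink as the cluster grows,
		// never dropping below the minimum percentage.
		adaptivePercentage = 50 - numAllNodes/125
		if adaptivePercentage < minFeasibleNodesPercentageToFind {
			adaptivePercentage = minFeasibleNodesPercentageToFind
		}
	}

	numNodes := numAllNodes * adaptivePercentage / 100
	if numNodes < minFeasibleNodesToFind {
		return minFeasibleNodesToFind
	}
	return numNodes
}

For example, with 1000 nodes and no percentage configured, 50 - 1000/125 = 42, so 420 nodes are inspected; with 6000 nodes the default would drop to 2 and is clamped to 5%, giving 300, matching the table.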
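TestFairEvaluationForNodes asserts that consecutive scheduling cycles start filtering where the previous cycle stopped, so nextStartNodeIndex equals (i+1)*nodesToFind % numAllNodes after i+1 cycles. A rough sketch of that round-robin bookkeeping, under the assumption that a single index is advanced modulo the cluster size; type and method names here are illustrative.

// fairScanSketch models the fairness the test checks: each cycle inspects
// nodesToFind nodes starting at nextStartNodeIndex, then advances the index
// modulo the cluster size so every node is eventually visited equally often.
type fairScanSketch struct {
	nextStartNodeIndex int
}

func (s *fairScanSketch) scan(allNodes []string, nodesToFind int) []string {
	visited := make([]string, 0, nodesToFind)
	for i := 0; i < nodesToFind; i++ {
		visited = append(visited, allNodes[(s.nextStartNodeIndex+i)%len(allNodes)])
	}
	s.nextStartNodeIndex = (s.nextStartNodeIndex + nodesToFind) % len(allNodes)
	return visited
}

After the k-th call, nextStartNodeIndex is k*nodesToFind % len(allNodes), which is exactly the value the test compares against.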
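Finally, the filter-call counts expected by TestPreferNominatedNodeFilterCallCounts (1, 3, and 4 against three nodes) follow from checking the nominated node first and only falling back to the full node list when that check fails. The sketch below illustrates that control flow under those assumptions; it is a simplification of the real preference logic, and the helper name is made up.

// filterWithNominationSketch counts filter invocations the way the test
// expects: a feasible nominated node short-circuits the search (1 call),
// no nomination filters every node (3 calls for 3 nodes), and an infeasible
// nominated node costs its own check plus a full pass (1 + 3 = 4 calls).
func filterWithNominationSketch(nominated string, nodes []string, feasible func(string) bool) (fits []string, filterCalls int) {
	if nominated != "" {
		filterCalls++
		if feasible(nominated) {
			return []string{nominated}, filterCalls
		}
	}
	for _, n := range nodes {
		filterCalls++
		if feasible(n) {
			fits = append(fits, n)
		}
	}
	return fits, filterCalls
}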