From 479f01d340b917ac3544f094f38edaed0ab44128 Mon Sep 17 00:00:00 2001
From: Clayton Coleman
Date: Wed, 17 May 2017 00:03:25 -0400
Subject: [PATCH] Scheduler should use shared informer for pods

Previously, the scheduler created two separate list watchers. This
changes the scheduler to leverage a shared informer, whether passed in
externally or spawned using the new in-place method. This removes the
last use of a "special" informer in the codebase.

Allows someone wrapping the scheduler to use a shared informer if they
have more information available.
---
 plugin/cmd/kube-scheduler/app/BUILD           |   1 +
 plugin/cmd/kube-scheduler/app/configurator.go |   2 +
 plugin/cmd/kube-scheduler/app/server.go       |   7 +
 .../defaults/compatibility_test.go            |   1 +
 plugin/pkg/scheduler/factory/factory.go       | 131 +++++++++++++-----
 plugin/pkg/scheduler/factory/factory_test.go  |   8 ++
 plugin/pkg/scheduler/scheduler.go             |  15 +-
 test/integration/scheduler/extender_test.go   |   1 +
 test/integration/scheduler/scheduler_test.go  |   7 +
 test/integration/scheduler_perf/util.go       |   1 +
 10 files changed, 136 insertions(+), 38 deletions(-)

diff --git a/plugin/cmd/kube-scheduler/app/BUILD b/plugin/cmd/kube-scheduler/app/BUILD
index 2b95dc68e51..72a3c6b35b0 100644
--- a/plugin/cmd/kube-scheduler/app/BUILD
+++ b/plugin/cmd/kube-scheduler/app/BUILD
@@ -24,6 +24,7 @@ go_library(
         "//pkg/client/informers/informers_generated/externalversions/extensions/v1beta1:go_default_library",
         "//pkg/client/leaderelection:go_default_library",
         "//pkg/client/leaderelection/resourcelock:go_default_library",
+        "//pkg/controller:go_default_library",
         "//pkg/util/configz:go_default_library",
         "//plugin/cmd/kube-scheduler/app/options:go_default_library",
         "//plugin/pkg/scheduler:go_default_library",
diff --git a/plugin/cmd/kube-scheduler/app/configurator.go b/plugin/cmd/kube-scheduler/app/configurator.go
index f5f23c3f280..a40dbbb7976 100644
--- a/plugin/cmd/kube-scheduler/app/configurator.go
+++ b/plugin/cmd/kube-scheduler/app/configurator.go
@@ -77,6 +77,7 @@ func CreateScheduler(
 	s *options.SchedulerServer,
 	kubecli *clientset.Clientset,
 	nodeInformer coreinformers.NodeInformer,
+	podInformer coreinformers.PodInformer,
 	pvInformer coreinformers.PersistentVolumeInformer,
 	pvcInformer coreinformers.PersistentVolumeClaimInformer,
 	replicationControllerInformer coreinformers.ReplicationControllerInformer,
@@ -89,6 +90,7 @@ func CreateScheduler(
 		s.SchedulerName,
 		kubecli,
 		nodeInformer,
+		podInformer,
 		pvInformer,
 		pvcInformer,
 		replicationControllerInformer,
diff --git a/plugin/cmd/kube-scheduler/app/server.go b/plugin/cmd/kube-scheduler/app/server.go
index c4ddae780c3..653e3398f49 100644
--- a/plugin/cmd/kube-scheduler/app/server.go
+++ b/plugin/cmd/kube-scheduler/app/server.go
@@ -31,9 +31,11 @@ import (
 	informers "k8s.io/kubernetes/pkg/client/informers/informers_generated/externalversions"
 	"k8s.io/kubernetes/pkg/client/leaderelection"
 	"k8s.io/kubernetes/pkg/client/leaderelection/resourcelock"
+	"k8s.io/kubernetes/pkg/controller"
 	"k8s.io/kubernetes/pkg/util/configz"
 	"k8s.io/kubernetes/plugin/cmd/kube-scheduler/app/options"
 	_ "k8s.io/kubernetes/plugin/pkg/scheduler/algorithmprovider"
+	"k8s.io/kubernetes/plugin/pkg/scheduler/factory"

 	"github.com/golang/glog"
 	"github.com/prometheus/client_golang/prometheus"
@@ -71,11 +73,14 @@ func Run(s *options.SchedulerServer) error {
 	recorder := createRecorder(kubecli, s)

 	informerFactory := informers.NewSharedInformerFactory(kubecli, 0)
+	// cache only non-terminal pods
+	podInformer := factory.NewPodInformer(kubecli, 0)
 	sched, err := CreateScheduler(
 		s,
 		kubecli,
 		informerFactory.Core().V1().Nodes(),
+		podInformer,
 		informerFactory.Core().V1().PersistentVolumes(),
 		informerFactory.Core().V1().PersistentVolumeClaims(),
 		informerFactory.Core().V1().ReplicationControllers(),
@@ -92,9 +97,11 @@ func Run(s *options.SchedulerServer) error {
 	stop := make(chan struct{})
 	defer close(stop)

+	go podInformer.Informer().Run(stop)
 	informerFactory.Start(stop)
 	// Waiting for all cache to sync before scheduling.
 	informerFactory.WaitForCacheSync(stop)
+	controller.WaitForCacheSync("scheduler", stop, podInformer.Informer().HasSynced)

 	run := func(_ <-chan struct{}) {
 		sched.Run()
diff --git a/plugin/pkg/scheduler/algorithmprovider/defaults/compatibility_test.go b/plugin/pkg/scheduler/algorithmprovider/defaults/compatibility_test.go
index 267ba65cd4c..ccc57efc50b 100644
--- a/plugin/pkg/scheduler/algorithmprovider/defaults/compatibility_test.go
+++ b/plugin/pkg/scheduler/algorithmprovider/defaults/compatibility_test.go
@@ -352,6 +352,7 @@ func TestCompatibility_v1_Scheduler(t *testing.T) {
 		"some-scheduler-name",
 		client,
 		informerFactory.Core().V1().Nodes(),
+		informerFactory.Core().V1().Pods(),
 		informerFactory.Core().V1().PersistentVolumes(),
 		informerFactory.Core().V1().PersistentVolumeClaims(),
 		informerFactory.Core().V1().ReplicationControllers(),
diff --git a/plugin/pkg/scheduler/factory/factory.go b/plugin/pkg/scheduler/factory/factory.go
index 312276f122d..0409e2b0e2d 100644
--- a/plugin/pkg/scheduler/factory/factory.go
+++ b/plugin/pkg/scheduler/factory/factory.go
@@ -82,7 +82,7 @@ type ConfigFactory struct {
 	// Close this to stop all reflectors
 	StopEverything chan struct{}

-	scheduledPodPopulator cache.Controller
+	scheduledPodsHasSynced cache.InformerSynced

 	schedulerCache schedulercache.Cache

@@ -105,6 +105,7 @@ func NewConfigFactory(
 	schedulerName string,
 	client clientset.Interface,
 	nodeInformer coreinformers.NodeInformer,
+	podInformer coreinformers.PodInformer,
 	pvInformer coreinformers.PersistentVolumeInformer,
 	pvcInformer coreinformers.PersistentVolumeClaimInformer,
 	replicationControllerInformer coreinformers.ReplicationControllerInformer,
@@ -132,23 +133,60 @@ func NewConfigFactory(
 		hardPodAffinitySymmetricWeight: hardPodAffinitySymmetricWeight,
 	}

-	// On add/delete to the scheduled pods, remove from the assumed pods.
-	// We construct this here instead of in CreateFromKeys because
-	// ScheduledPodLister is something we provide to plug in functions that
-	// they may need to call.
-	var scheduledPodIndexer cache.Indexer
-	scheduledPodIndexer, c.scheduledPodPopulator = cache.NewIndexerInformer(
-		c.createAssignedNonTerminatedPodLW(),
-		&v1.Pod{},
-		0,
-		cache.ResourceEventHandlerFuncs{
-			AddFunc:    c.addPodToCache,
-			UpdateFunc: c.updatePodInCache,
-			DeleteFunc: c.deletePodFromCache,
+	c.scheduledPodsHasSynced = podInformer.Informer().HasSynced
+	// scheduled pod cache
+	podInformer.Informer().AddEventHandler(
+		cache.FilteringResourceEventHandler{
+			FilterFunc: func(obj interface{}) bool {
+				switch t := obj.(type) {
+				case *v1.Pod:
+					return assignedNonTerminatedPod(t)
+				default:
+					runtime.HandleError(fmt.Errorf("unable to handle object in %T: %T", c, obj))
+					return false
+				}
+			},
+			Handler: cache.ResourceEventHandlerFuncs{
+				AddFunc:    c.addPodToCache,
+				UpdateFunc: c.updatePodInCache,
+				DeleteFunc: c.deletePodFromCache,
+			},
 		},
-		cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc},
 	)
-	c.scheduledPodLister = corelisters.NewPodLister(scheduledPodIndexer)
+	// unscheduled pod queue
+	podInformer.Informer().AddEventHandler(
+		cache.FilteringResourceEventHandler{
+			FilterFunc: func(obj interface{}) bool {
+				switch t := obj.(type) {
+				case *v1.Pod:
+					return unassignedNonTerminatedPod(t)
+				default:
+					runtime.HandleError(fmt.Errorf("unable to handle object in %T: %T", c, obj))
+					return false
+				}
+			},
+			Handler: cache.ResourceEventHandlerFuncs{
+				AddFunc: func(obj interface{}) {
+					if err := c.podQueue.Add(obj); err != nil {
+						runtime.HandleError(fmt.Errorf("unable to queue %T: %v", obj, err))
+					}
+				},
+				UpdateFunc: func(oldObj, newObj interface{}) {
+					if err := c.podQueue.Update(newObj); err != nil {
+						runtime.HandleError(fmt.Errorf("unable to update %T: %v", newObj, err))
+					}
+				},
+				DeleteFunc: func(obj interface{}) {
+					if err := c.podQueue.Delete(obj); err != nil {
+						runtime.HandleError(fmt.Errorf("unable to dequeue %T: %v", obj, err))
+					}
+				},
+			},
+		},
+	)
+	// ScheduledPodLister is something we provide to plug-in functions that
+	// they may need to call.
+	c.scheduledPodLister = podInformer.Lister()

 	// Only nodes in the "Ready" condition with status == "True" are schedulable
 	nodeInformer.Informer().AddEventHandlerWithResyncPeriod(
@@ -369,7 +407,6 @@ func (f *ConfigFactory) CreateFromKeys(predicateKeys, priorityKeys sets.String,
 		return nil, err
 	}

-	f.Run()
 	// TODO(resouer) use equivalence cache instead of nil here when #36238 get merged
 	algo := core.NewGenericScheduler(f.schedulerCache, nil, predicateFuncs, predicateMetaProducer, priorityConfigs, priorityMetaProducer, extenders)
 	podBackoff := util.CreateDefaultPodBackoff()
@@ -381,7 +418,7 @@ func (f *ConfigFactory) CreateFromKeys(predicateKeys, priorityKeys sets.String,
 		Binder:              &binder{f.client},
 		PodConditionUpdater: &podConditionUpdater{f.client},
 		WaitForCacheSync: func() bool {
-			return cache.WaitForCacheSync(f.StopEverything, f.scheduledPodPopulator.HasSynced)
+			return cache.WaitForCacheSync(f.StopEverything, f.scheduledPodsHasSynced)
 		},
 		NextPod: func() *v1.Pod {
 			return f.getNextPod()
@@ -450,14 +487,6 @@ func (f *ConfigFactory) getPluginArgs() (*PluginFactoryArgs, error) {
 	}, nil
 }

-func (f *ConfigFactory) Run() {
-	// Watch and queue pods that need scheduling.
-	cache.NewReflector(f.createUnassignedNonTerminatedPodLW(), &v1.Pod{}, f.podQueue, 0).RunUntil(f.StopEverything)
-
-	// Begin populating scheduled pods.
-	go f.scheduledPodPopulator.Run(f.StopEverything)
-}
-
 func (f *ConfigFactory) getNextPod() *v1.Pod {
 	for {
 		pod := cache.Pop(f.podQueue).(*v1.Pod)
@@ -500,19 +529,47 @@ func getNodeConditionPredicate() corelisters.NodeConditionPredicate {
 	}
 }

-// Returns a cache.ListWatch that finds all pods that need to be
-// scheduled.
-func (factory *ConfigFactory) createUnassignedNonTerminatedPodLW() *cache.ListWatch {
-	selector := fields.ParseSelectorOrDie("spec.nodeName==" + "" + ",status.phase!=" + string(v1.PodSucceeded) + ",status.phase!=" + string(v1.PodFailed))
-	return cache.NewListWatchFromClient(factory.client.Core().RESTClient(), "pods", metav1.NamespaceAll, selector)
+// unassignedNonTerminatedPod selects pods that are unassigned and non-terminal.
+func unassignedNonTerminatedPod(pod *v1.Pod) bool {
+	if len(pod.Spec.NodeName) != 0 {
+		return false
+	}
+	if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
+		return false
+	}
+	return true
 }

-// Returns a cache.ListWatch that finds all pods that are
-// already scheduled.
-// TODO: return a ListerWatcher interface instead?
-func (factory *ConfigFactory) createAssignedNonTerminatedPodLW() *cache.ListWatch {
-	selector := fields.ParseSelectorOrDie("spec.nodeName!=" + "" + ",status.phase!=" + string(v1.PodSucceeded) + ",status.phase!=" + string(v1.PodFailed))
-	return cache.NewListWatchFromClient(factory.client.Core().RESTClient(), "pods", metav1.NamespaceAll, selector)
+// assignedNonTerminatedPod selects pods that are assigned and non-terminal (scheduled and running).
+func assignedNonTerminatedPod(pod *v1.Pod) bool {
+	if len(pod.Spec.NodeName) == 0 {
+		return false
+	}
+	if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
+		return false
+	}
+	return true
+}
+
+type podInformer struct {
+	informer cache.SharedIndexInformer
+}
+
+func (i *podInformer) Informer() cache.SharedIndexInformer {
+	return i.informer
+}
+
+func (i *podInformer) Lister() corelisters.PodLister {
+	return corelisters.NewPodLister(i.informer.GetIndexer())
+}
+
+// NewPodInformer creates a shared index informer that returns only non-terminal pods.
+func NewPodInformer(client clientset.Interface, resyncPeriod time.Duration) coreinformers.PodInformer {
+	selector := fields.ParseSelectorOrDie("status.phase!=" + string(v1.PodSucceeded) + ",status.phase!=" + string(v1.PodFailed))
+	lw := cache.NewListWatchFromClient(client.Core().RESTClient(), "pods", metav1.NamespaceAll, selector)
+	return &podInformer{
+		informer: cache.NewSharedIndexInformer(lw, &v1.Pod{}, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}),
+	}
 }

 func (factory *ConfigFactory) MakeDefaultErrorFunc(backoff *util.PodBackoff, podQueue *cache.FIFO) func(pod *v1.Pod, err error) {
diff --git a/plugin/pkg/scheduler/factory/factory_test.go b/plugin/pkg/scheduler/factory/factory_test.go
index 572703ec726..d72e873a3c5 100644
--- a/plugin/pkg/scheduler/factory/factory_test.go
+++ b/plugin/pkg/scheduler/factory/factory_test.go
@@ -55,6 +55,7 @@ func TestCreate(t *testing.T) {
 		v1.DefaultSchedulerName,
 		client,
 		informerFactory.Core().V1().Nodes(),
+		informerFactory.Core().V1().Pods(),
 		informerFactory.Core().V1().PersistentVolumes(),
 		informerFactory.Core().V1().PersistentVolumeClaims(),
 		informerFactory.Core().V1().ReplicationControllers(),
@@ -85,6 +86,7 @@ func TestCreateFromConfig(t *testing.T) {
 		v1.DefaultSchedulerName,
 		client,
 		informerFactory.Core().V1().Nodes(),
+		informerFactory.Core().V1().Pods(),
 		informerFactory.Core().V1().PersistentVolumes(),
 		informerFactory.Core().V1().PersistentVolumeClaims(),
 		informerFactory.Core().V1().ReplicationControllers(),
@@ -138,6 +140,7 @@ func TestCreateFromEmptyConfig(t *testing.T) {
 		v1.DefaultSchedulerName,
 		client,
 		informerFactory.Core().V1().Nodes(),
+		informerFactory.Core().V1().Pods(),
 		informerFactory.Core().V1().PersistentVolumes(),
 		informerFactory.Core().V1().PersistentVolumeClaims(),
 		informerFactory.Core().V1().ReplicationControllers(),
@@ -193,6 +196,7 @@ func TestDefaultErrorFunc(t *testing.T) {
 		v1.DefaultSchedulerName,
 		client,
 		informerFactory.Core().V1().Nodes(),
+		informerFactory.Core().V1().Pods(),
 		informerFactory.Core().V1().PersistentVolumes(),
 		informerFactory.Core().V1().PersistentVolumeClaims(),
 		informerFactory.Core().V1().ReplicationControllers(),
@@ -304,6 +308,7 @@ func TestResponsibleForPod(t *testing.T) {
 		v1.DefaultSchedulerName,
 		client,
 		informerFactory.Core().V1().Nodes(),
+		informerFactory.Core().V1().Pods(),
 		informerFactory.Core().V1().PersistentVolumes(),
 		informerFactory.Core().V1().PersistentVolumeClaims(),
 		informerFactory.Core().V1().ReplicationControllers(),
@@ -317,6 +322,7 @@
 		"foo-scheduler",
 		client,
 		informerFactory.Core().V1().Nodes(),
+		informerFactory.Core().V1().Pods(),
 		informerFactory.Core().V1().PersistentVolumes(),
 		informerFactory.Core().V1().PersistentVolumeClaims(),
 		informerFactory.Core().V1().ReplicationControllers(),
@@ -385,6 +391,7 @@ func TestInvalidHardPodAffinitySymmetricWeight(t *testing.T) {
 		v1.DefaultSchedulerName,
 		client,
 		informerFactory.Core().V1().Nodes(),
+		informerFactory.Core().V1().Pods(),
 		informerFactory.Core().V1().PersistentVolumes(),
 		informerFactory.Core().V1().PersistentVolumeClaims(),
 		informerFactory.Core().V1().ReplicationControllers(),
@@ -429,6 +436,7 @@ func TestInvalidFactoryArgs(t *testing.T) {
 		v1.DefaultSchedulerName,
 		client,
 		informerFactory.Core().V1().Nodes(),
+		informerFactory.Core().V1().Pods(),
 		informerFactory.Core().V1().PersistentVolumes(),
 		informerFactory.Core().V1().PersistentVolumeClaims(),
 		informerFactory.Core().V1().ReplicationControllers(),
diff --git a/plugin/pkg/scheduler/scheduler.go b/plugin/pkg/scheduler/scheduler.go
index bebcc454fb9..8570427dc1a 100644
--- a/plugin/pkg/scheduler/scheduler.go
+++ b/plugin/pkg/scheduler/scheduler.go
@@ -20,10 +20,12 @@ import (
 	"time"

 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/runtime"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/client-go/tools/cache"
 	"k8s.io/client-go/tools/record"
+	"k8s.io/kubernetes/pkg/api"
 	"k8s.io/kubernetes/pkg/api/v1"
 	"k8s.io/kubernetes/pkg/client/clientset_generated/clientset"
 	corelisters "k8s.io/kubernetes/pkg/client/listers/core/v1"
@@ -79,7 +81,6 @@ type Configurator interface {
 	GetNodeLister() corelisters.NodeLister
 	GetClient() clientset.Interface
 	GetScheduledPodLister() corelisters.PodLister
-	Run()

 	Create() (*Config, error)
 	CreateFromProvider(providerName string) (*Config, error)
@@ -171,6 +172,12 @@ func (sched *Scheduler) scheduleOne() {
 	dest, err := sched.config.Algorithm.Schedule(pod, sched.config.NodeLister)
 	if err != nil {
 		glog.V(1).Infof("Failed to schedule pod: %v/%v", pod.Namespace, pod.Name)
+		copied, cerr := api.Scheme.Copy(pod)
+		if cerr != nil {
+			runtime.HandleError(cerr)
+			return
+		}
+		pod = copied.(*v1.Pod)
 		sched.config.Error(pod, err)
 		sched.config.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", "%v", err)
 		sched.config.PodConditionUpdater.Update(pod, &v1.PodCondition{
@@ -232,6 +239,12 @@ func (sched *Scheduler) scheduleOne() {
 		if err := sched.config.SchedulerCache.ForgetPod(&assumed); err != nil {
 			glog.Errorf("scheduler cache ForgetPod failed: %v", err)
 		}
+		copied, cerr := api.Scheme.Copy(pod)
+		if cerr != nil {
+			runtime.HandleError(cerr)
+			return
+		}
+		pod = copied.(*v1.Pod)
 		sched.config.Error(pod, err)
 		sched.config.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", "Binding rejected: %v", err)
 		sched.config.PodConditionUpdater.Update(pod, &v1.PodCondition{
diff --git a/test/integration/scheduler/extender_test.go b/test/integration/scheduler/extender_test.go
index 4a8a2b732fd..1c758a9bc57 100644
--- a/test/integration/scheduler/extender_test.go
+++ b/test/integration/scheduler/extender_test.go
@@ -326,6 +326,7 @@ func TestSchedulerExtender(t *testing.T) {
 		v1.DefaultSchedulerName,
 		clientSet,
 		informerFactory.Core().V1().Nodes(),
+		informerFactory.Core().V1().Pods(),
 		informerFactory.Core().V1().PersistentVolumes(),
 		informerFactory.Core().V1().PersistentVolumeClaims(),
 		informerFactory.Core().V1().ReplicationControllers(),
diff --git a/test/integration/scheduler/scheduler_test.go b/test/integration/scheduler/scheduler_test.go
index ae41c990dfd..0d02e8a9e92 100644
--- a/test/integration/scheduler/scheduler_test.go
+++ b/test/integration/scheduler/scheduler_test.go
@@ -122,6 +122,7 @@ func TestSchedulerCreationFromConfigMap(t *testing.T) {
 	ss.PolicyConfigMapName = configPolicyName
 	sched, err := app.CreateScheduler(ss, clientSet,
 		informerFactory.Core().V1().Nodes(),
+		informerFactory.Core().V1().Pods(),
 		informerFactory.Core().V1().PersistentVolumes(),
 		informerFactory.Core().V1().PersistentVolumeClaims(),
 		informerFactory.Core().V1().ReplicationControllers(),
@@ -174,6 +175,7 @@ func TestSchedulerCreationFromNonExistentConfigMap(t *testing.T) {

 	_, err := app.CreateScheduler(ss, clientSet,
 		informerFactory.Core().V1().Nodes(),
+		informerFactory.Core().V1().Pods(),
 		informerFactory.Core().V1().PersistentVolumes(),
 		informerFactory.Core().V1().PersistentVolumeClaims(),
 		informerFactory.Core().V1().ReplicationControllers(),
@@ -211,6 +213,7 @@ func TestSchedulerCreationInLegacyMode(t *testing.T) {

 	sched, err := app.CreateScheduler(ss, clientSet,
 		informerFactory.Core().V1().Nodes(),
+		informerFactory.Core().V1().Pods(),
 		informerFactory.Core().V1().PersistentVolumes(),
 		informerFactory.Core().V1().PersistentVolumeClaims(),
 		informerFactory.Core().V1().ReplicationControllers(),
@@ -245,6 +248,7 @@ func TestUnschedulableNodes(t *testing.T) {
 		v1.DefaultSchedulerName,
 		clientSet,
 		informerFactory.Core().V1().Nodes(),
+		informerFactory.Core().V1().Pods(),
 		informerFactory.Core().V1().PersistentVolumes(),
 		informerFactory.Core().V1().PersistentVolumeClaims(),
 		informerFactory.Core().V1().ReplicationControllers(),
@@ -527,6 +531,7 @@ func TestMultiScheduler(t *testing.T) {
 		v1.DefaultSchedulerName,
 		clientSet,
 		informerFactory.Core().V1().Nodes(),
+		informerFactory.Core().V1().Pods(),
 		informerFactory.Core().V1().PersistentVolumes(),
 		informerFactory.Core().V1().PersistentVolumeClaims(),
 		informerFactory.Core().V1().ReplicationControllers(),
@@ -612,6 +617,7 @@ func TestMultiScheduler(t *testing.T) {
 		"foo-scheduler",
 		clientSet2,
 		informerFactory.Core().V1().Nodes(),
+		informerFactory.Core().V1().Pods(),
 		informerFactory.Core().V1().PersistentVolumes(),
 		informerFactory.Core().V1().PersistentVolumeClaims(),
 		informerFactory.Core().V1().ReplicationControllers(),
@@ -721,6 +727,7 @@ func TestAllocatable(t *testing.T) {
 		v1.DefaultSchedulerName,
 		clientSet,
 		informerFactory.Core().V1().Nodes(),
+		informerFactory.Core().V1().Pods(),
 		informerFactory.Core().V1().PersistentVolumes(),
 		informerFactory.Core().V1().PersistentVolumeClaims(),
 		informerFactory.Core().V1().ReplicationControllers(),
diff --git a/test/integration/scheduler_perf/util.go b/test/integration/scheduler_perf/util.go
index 66e03925aea..2bf5c547a57 100644
--- a/test/integration/scheduler_perf/util.go
+++ b/test/integration/scheduler_perf/util.go
@@ -65,6 +65,7 @@ func mustSetupScheduler() (schedulerConfigurator scheduler.Configurator, destroy
 		v1.DefaultSchedulerName,
 		clientSet,
 		informerFactory.Core().V1().Nodes(),
+		informerFactory.Core().V1().Pods(),
 		informerFactory.Core().V1().PersistentVolumes(),
 		informerFactory.Core().V1().PersistentVolumeClaims(),
 		informerFactory.Core().V1().ReplicationControllers(),
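
A note on the startup wiring in server.go: the pod informer is constructed outside
the shared informer factory (via factory.NewPodInformer) so that its list/watch can
carry the non-terminal field selector. That is why Run() starts it explicitly with
go podInformer.Informer().Run(stop) alongside informerFactory.Start(stop), and waits
on both caches — informerFactory.WaitForCacheSync(stop) for the generated informers
and controller.WaitForCacheSync("scheduler", stop, podInformer.Informer().HasSynced)
for the standalone one — before the run closure invokes sched.Run().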
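
On the two api.Scheme.Copy blocks added to scheduleOne: with the old private list
watchers the scheduler effectively owned the pod objects it popped, but a shared
informer hands out pointers that the informer cache and other event handlers may
read concurrently, so shared objects have to be treated as read-only. Deep-copying
the pod before the failure path passes it to sched.config.Error, the event recorder,
and PodConditionUpdater is presumably there to protect that invariant; the commit
itself does not state this, so treat it as an editorial reading.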
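
For anyone wrapping the scheduler, the same pattern can be reproduced against a
plain shared informer: one server-side field selector that excludes terminal pods,
plus two filtering event handlers layered client-side. The sketch below is
illustrative only — it uses present-day client-go import paths rather than the
vendored k8s.io/kubernetes packages in this tree, assumes $KUBECONFIG points at a
reachable cluster, and the Println handlers are placeholders standing in for the
scheduler cache and queue:

package main

import (
	"fmt"
	"os"
	"time"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/clientcmd"
)

// newNonTerminalPodInformer mirrors NewPodInformer above: the field selector
// filters Succeeded/Failed pods on the server side, so the shared cache only
// ever holds pods the scheduler could care about.
func newNonTerminalPodInformer(client kubernetes.Interface, resync time.Duration) cache.SharedIndexInformer {
	selector := fields.ParseSelectorOrDie(
		"status.phase!=" + string(v1.PodSucceeded) + ",status.phase!=" + string(v1.PodFailed))
	lw := cache.NewListWatchFromClient(client.CoreV1().RESTClient(), "pods", metav1.NamespaceAll, selector)
	return cache.NewSharedIndexInformer(lw, &v1.Pod{}, resync,
		cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc})
}

// addSchedulerHandlers mirrors the two AddEventHandler calls in
// NewConfigFactory: one informer, two client-side filters.
func addSchedulerHandlers(informer cache.SharedIndexInformer) {
	// Assigned pods would feed the scheduler's cache of scheduled pods.
	informer.AddEventHandler(cache.FilteringResourceEventHandler{
		FilterFunc: func(obj interface{}) bool {
			pod, ok := obj.(*v1.Pod)
			return ok && len(pod.Spec.NodeName) != 0
		},
		Handler: cache.ResourceEventHandlerFuncs{
			AddFunc: func(obj interface{}) { fmt.Println("cache:", obj.(*v1.Pod).Name) },
		},
	})
	// Unassigned pods would feed the scheduling queue.
	informer.AddEventHandler(cache.FilteringResourceEventHandler{
		FilterFunc: func(obj interface{}) bool {
			pod, ok := obj.(*v1.Pod)
			return ok && len(pod.Spec.NodeName) == 0
		},
		Handler: cache.ResourceEventHandlerFuncs{
			AddFunc: func(obj interface{}) { fmt.Println("queue:", obj.(*v1.Pod).Name) },
		},
	})
}

func main() {
	cfg, err := clientcmd.BuildConfigFromFlags("", os.Getenv("KUBECONFIG"))
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(cfg)

	informer := newNonTerminalPodInformer(client, 10*time.Minute)
	addSchedulerHandlers(informer)

	stop := make(chan struct{})
	go informer.Run(stop)
	if !cache.WaitForCacheSync(stop, informer.HasSynced) {
		panic("pod cache never synced")
	}
	time.Sleep(time.Minute) // print events for a while, then shut down
	close(stop)
}

Because both handlers hang off one SharedIndexInformer, a pod that gets bound moves
from the "queue" filter to the "cache" filter on its next update with no second
watch connection — the same economy the patch buys for the scheduler itself.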