/*
Copyright 2014 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package scheduler

import (
	"fmt"
	"time"

	"k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	clientset "k8s.io/client-go/kubernetes"
	corelisters "k8s.io/client-go/listers/core/v1"
	"k8s.io/client-go/tools/record"
	"k8s.io/kubernetes/pkg/features"
	"k8s.io/kubernetes/pkg/scheduler/algorithm"
	"k8s.io/kubernetes/pkg/scheduler/algorithm/predicates"
	schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
	schedulercache "k8s.io/kubernetes/pkg/scheduler/cache"
	"k8s.io/kubernetes/pkg/scheduler/core"
	"k8s.io/kubernetes/pkg/scheduler/core/equivalence"
	"k8s.io/kubernetes/pkg/scheduler/metrics"
	"k8s.io/kubernetes/pkg/scheduler/util"
	"k8s.io/kubernetes/pkg/scheduler/volumebinder"

	"github.com/golang/glog"
)

// Binder knows how to write a binding.
type Binder interface {
	Bind(binding *v1.Binding) error
}
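
// Example (illustrative sketch, not part of the original file): a minimal
// Binder backed by a client-go clientset. The name exampleBinder is
// hypothetical; the scheduler's real binder is constructed in factory.go.
type exampleBinder struct {
	client clientset.Interface
}

// Bind writes the binding to the pod's /binding subresource on the apiserver.
func (b exampleBinder) Bind(binding *v1.Binding) error {
	return b.client.CoreV1().Pods(binding.Namespace).Bind(binding)
}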

// PodConditionUpdater updates the condition of a pod based on the passed
// PodCondition
type PodConditionUpdater interface {
	Update(pod *v1.Pod, podCondition *v1.PodCondition) error
}
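
// Example (illustrative sketch, not part of the original file): a minimal
// PodConditionUpdater that writes the condition through the pod's status
// subresource. examplePodConditionUpdater is hypothetical; the real updater
// lives in factory.go.
type examplePodConditionUpdater struct {
	client clientset.Interface
}

// Update replaces an existing condition of the same type, or appends a new
// one, then persists the pod's status.
func (u examplePodConditionUpdater) Update(pod *v1.Pod, podCondition *v1.PodCondition) error {
	podCopy := pod.DeepCopy()
	replaced := false
	for i := range podCopy.Status.Conditions {
		if podCopy.Status.Conditions[i].Type == podCondition.Type {
			podCopy.Status.Conditions[i] = *podCondition
			replaced = true
			break
		}
	}
	if !replaced {
		podCopy.Status.Conditions = append(podCopy.Status.Conditions, *podCondition)
	}
	_, err := u.client.CoreV1().Pods(podCopy.Namespace).UpdateStatus(podCopy)
	return err
}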

// PodPreemptor has methods needed to delete a pod and to update the
// nominated node name of the preemptor pod.
type PodPreemptor interface {
	GetUpdatedPod(pod *v1.Pod) (*v1.Pod, error)
	DeletePod(pod *v1.Pod) error
	SetNominatedNodeName(pod *v1.Pod, nominatedNode string) error
	RemoveNominatedNodeName(pod *v1.Pod) error
}
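
// Example (illustrative sketch, not part of the original file): a minimal
// PodPreemptor backed by a clientset. examplePodPreemptor is hypothetical and
// assumes the nominated node is recorded in Status.NominatedNodeName; the
// real implementation lives in factory.go.
type examplePodPreemptor struct {
	client clientset.Interface
}

func (p examplePodPreemptor) GetUpdatedPod(pod *v1.Pod) (*v1.Pod, error) {
	return p.client.CoreV1().Pods(pod.Namespace).Get(pod.Name, metav1.GetOptions{})
}

func (p examplePodPreemptor) DeletePod(pod *v1.Pod) error {
	return p.client.CoreV1().Pods(pod.Namespace).Delete(pod.Name, &metav1.DeleteOptions{})
}

func (p examplePodPreemptor) SetNominatedNodeName(pod *v1.Pod, nominatedNode string) error {
	podCopy := pod.DeepCopy()
	podCopy.Status.NominatedNodeName = nominatedNode
	_, err := p.client.CoreV1().Pods(podCopy.Namespace).UpdateStatus(podCopy)
	return err
}

func (p examplePodPreemptor) RemoveNominatedNodeName(pod *v1.Pod) error {
	if pod.Status.NominatedNodeName == "" {
		return nil
	}
	return p.SetNominatedNodeName(pod, "")
}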

// Scheduler watches for new unscheduled pods. It attempts to find
// nodes that they fit on and writes bindings back to the api server.
type Scheduler struct {
	config *Config
}

// StopEverything closes the scheduler config's StopEverything channel, to shut
// down the Scheduler.
func (sched *Scheduler) StopEverything() {
	close(sched.config.StopEverything)
}

// Configurator defines I/O, caching, and other functionality needed to
// construct a new scheduler. An implementation of this can be seen in
// factory.go.
type Configurator interface {
	GetPriorityFunctionConfigs(priorityKeys sets.String) ([]algorithm.PriorityConfig, error)
	GetPriorityMetadataProducer() (algorithm.PriorityMetadataProducer, error)
	GetPredicateMetadataProducer() (algorithm.PredicateMetadataProducer, error)
	GetPredicates(predicateKeys sets.String) (map[string]algorithm.FitPredicate, error)
	GetHardPodAffinitySymmetricWeight() int32
	GetSchedulerName() string
	MakeDefaultErrorFunc(backoff *util.PodBackoff, podQueue core.SchedulingQueue) func(pod *v1.Pod, err error)

	// Needs to be exposed for things like integration tests where we want to make fake nodes.
	GetNodeLister() corelisters.NodeLister
	GetClient() clientset.Interface
	GetScheduledPodLister() corelisters.PodLister

	Create() (*Config, error)
	CreateFromProvider(providerName string) (*Config, error)
	CreateFromConfig(policy schedulerapi.Policy) (*Config, error)
	CreateFromKeys(predicateKeys, priorityKeys sets.String, extenders []algorithm.SchedulerExtender) (*Config, error)
}
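
// Example (illustrative sketch, not part of the original file): driving a
// Configurator from the registered default algorithm provider. The helper
// name is hypothetical; "DefaultProvider" is the provider name registered by
// the default algorithm provider package.
func exampleNewSchedulerFromProvider(c Configurator) (*Scheduler, error) {
	cfg, err := c.CreateFromProvider("DefaultProvider")
	if err != nil {
		return nil, err
	}
	return NewFromConfig(cfg), nil
}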

// Config is an implementation of the Scheduler's configured input data.
// TODO over time we should make this struct a hidden implementation detail of the scheduler.
type Config struct {
	// It is expected that changes made via SchedulerCache will be observed
	// by NodeLister and Algorithm.
	SchedulerCache schedulercache.Cache
	// Ecache is used for optimistically invalidating affected cache items
	// after successfully binding a pod.
	Ecache     *equivalence.Cache
	NodeLister algorithm.NodeLister
	Algorithm  algorithm.ScheduleAlgorithm
	GetBinder  func(pod *v1.Pod) Binder
	// PodConditionUpdater is used only in case of scheduling errors. If we succeed
	// with scheduling, the PodScheduled condition will be updated in the apiserver by the
	// /bind handler, so that binding and setting the PodCondition are atomic.
	PodConditionUpdater PodConditionUpdater
	// PodPreemptor is used to evict pods and update pod annotations.
	PodPreemptor PodPreemptor

	// NextPod should be a function that blocks until the next pod
	// is available. We don't use a channel for this, because scheduling
	// a pod may take some amount of time and we don't want pods to get
	// stale while they sit in a channel. (A sketch of this wiring appears
	// after this struct.)
	NextPod func() *v1.Pod

	// WaitForCacheSync waits for the scheduler cache to populate.
	// It returns true if it was successful, false if the controller should shut down.
	WaitForCacheSync func() bool

	// Error is called if there is an error. It is passed the pod in
	// question, and the error.
	Error func(*v1.Pod, error)

	// Recorder is the EventRecorder to use.
	Recorder record.EventRecorder

	// Close this to shut down the scheduler.
	StopEverything chan struct{}

	// VolumeBinder handles PVC/PV binding for the pod.
	VolumeBinder *volumebinder.VolumeBinder

	// DisablePreemption indicates whether pod preemption is disabled.
	DisablePreemption bool
}
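
// Example (illustrative sketch, not part of the original file): NextPod's
// contract is to block until a pod is available, so a function closed over a
// blocking queue satisfies it. Hypothetical wiring, assuming a podQueue whose
// Pop blocks until a pod is enqueued:
//
//	cfg.NextPod = func() *v1.Pod {
//		return podQueue.Pop()
//	}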

// NewFromConfigurator returns a new scheduler that is created entirely by the Configurator. Assumes Create() is implemented.
// Supports intermediate Config mutation for now, if you provide modifier functions which run after the Config is created.
func NewFromConfigurator(c Configurator, modifiers ...func(c *Config)) (*Scheduler, error) {
	cfg, err := c.Create()
	if err != nil {
		return nil, err
	}
	// Mutate it if any functions were provided; changes might be required for certain types of tests (e.g. to change the recorder).
	for _, modifier := range modifiers {
		modifier(cfg)
	}
	// From this point on the config is immutable to the outside.
	s := &Scheduler{
		config: cfg,
	}
	metrics.Register()
	return s, nil
}
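
// Example (illustrative sketch, not part of the original file): using a
// modifier with NewFromConfigurator, as a test might to swap the event
// recorder before the config becomes immutable. The helper name is
// hypothetical.
func exampleNewSchedulerWithFakeRecorder(c Configurator) (*Scheduler, error) {
	return NewFromConfigurator(c, func(cfg *Config) {
		cfg.Recorder = &record.FakeRecorder{}
	})
}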

// NewFromConfig returns a new scheduler using the provided Config.
func NewFromConfig(config *Config) *Scheduler {
	metrics.Register()
	return &Scheduler{
		config: config,
	}
}

// Run begins watching and scheduling. It waits for cache to be synced, then starts a goroutine and returns immediately.
func (sched *Scheduler) Run() {
	if !sched.config.WaitForCacheSync() {
		return
	}

	if utilfeature.DefaultFeatureGate.Enabled(features.VolumeScheduling) {
		go sched.config.VolumeBinder.Run(sched.bindVolumesWorker, sched.config.StopEverything)
	}

	go wait.Until(sched.scheduleOne, 0, sched.config.StopEverything)
}
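
// Example (illustrative sketch, not part of the original file): typical
// startup, assuming a fully built Config. Run returns immediately, so the
// caller blocks on the stop channel.
func exampleRunUntilStopped(cfg *Config) {
	sched := NewFromConfig(cfg)
	sched.Run()
	<-cfg.StopEverything // block until StopEverything is closed
}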

// Config returns the scheduler's config pointer. It is exposed for testing purposes.
func (sched *Scheduler) Config() *Config {
	return sched.config
}

// schedule implements the scheduling algorithm and returns the suggested host.
func (sched *Scheduler) schedule(pod *v1.Pod) (string, error) {
	host, err := sched.config.Algorithm.Schedule(pod, sched.config.NodeLister)
	if err != nil {
		glog.V(1).Infof("Failed to schedule pod: %v/%v", pod.Namespace, pod.Name)
		pod = pod.DeepCopy()
		sched.config.Error(pod, err)
		sched.config.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", "%v", err)
		sched.config.PodConditionUpdater.Update(pod, &v1.PodCondition{
			Type:    v1.PodScheduled,
			Status:  v1.ConditionFalse,
			Reason:  v1.PodReasonUnschedulable,
			Message: err.Error(),
		})
		return "", err
	}
	return host, err
}

// preempt tries to create room for a pod that has failed to schedule, by preempting lower-priority pods if possible.
// If it succeeds, it records the name of the node where preemption happened as the preemptor's nominated node name.
// It returns the node name and an error, if any.
func (sched *Scheduler) preempt(preemptor *v1.Pod, scheduleErr error) (string, error) {
	if !util.PodPriorityEnabled() || sched.config.DisablePreemption {
		glog.V(3).Infof("Pod priority feature is not enabled or preemption is disabled by scheduler configuration." +
			" No preemption is performed.")
		return "", nil
	}
	preemptor, err := sched.config.PodPreemptor.GetUpdatedPod(preemptor)
	if err != nil {
		glog.Errorf("Error getting the updated preemptor pod object: %v", err)
		return "", err
	}

	node, victims, nominatedPodsToClear, err := sched.config.Algorithm.Preempt(preemptor, sched.config.NodeLister, scheduleErr)
	metrics.PreemptionVictims.Set(float64(len(victims)))
	if err != nil {
		glog.Errorf("Error preempting victims to make room for %v/%v.", preemptor.Namespace, preemptor.Name)
		return "", err
	}
	var nodeName = ""
	if node != nil {
		nodeName = node.Name
		err = sched.config.PodPreemptor.SetNominatedNodeName(preemptor, nodeName)
		if err != nil {
			glog.Errorf("Error in preemption process. Cannot update pod %v annotations: %v", preemptor.Name, err)
			return "", err
		}
		for _, victim := range victims {
			if err := sched.config.PodPreemptor.DeletePod(victim); err != nil {
				glog.Errorf("Error preempting pod %v/%v: %v", victim.Namespace, victim.Name, err)
				return "", err
			}
			sched.config.Recorder.Eventf(victim, v1.EventTypeNormal, "Preempted", "by %v/%v on node %v", preemptor.Namespace, preemptor.Name, nodeName)
		}
	}
	// Clearing nominated pods should happen outside of "if node != nil". Node could
	// be nil when a pod with a nominated node name is eligible to preempt again,
	// but the preemption logic does not find any node for it. In that case the Preempt()
	// function of generic_scheduler.go returns the pod itself, so that its nominated node name is cleared.
	for _, p := range nominatedPodsToClear {
		rErr := sched.config.PodPreemptor.RemoveNominatedNodeName(p)
		if rErr != nil {
			glog.Errorf("Cannot remove nominated node annotation of pod: %v", rErr)
			// We do not return as this error is not critical.
		}
	}
	return nodeName, err
}

// assumeAndBindVolumes will update the volume cache and then asynchronously bind volumes if required.
//
// If volume binding is required, then the bind volumes routine will update the pod to send it back through
// the scheduler.
//
// Otherwise, return nil error and continue to assume the pod.
//
// This function modifies assumed if volume binding is required.
func (sched *Scheduler) assumeAndBindVolumes(assumed *v1.Pod, host string) error {
	if utilfeature.DefaultFeatureGate.Enabled(features.VolumeScheduling) {
		allBound, bindingRequired, err := sched.config.VolumeBinder.Binder.AssumePodVolumes(assumed, host)
		if err != nil {
			sched.config.Error(assumed, err)
			sched.config.Recorder.Eventf(assumed, v1.EventTypeWarning, "FailedScheduling", "AssumePodVolumes failed: %v", err)
			sched.config.PodConditionUpdater.Update(assumed, &v1.PodCondition{
				Type:    v1.PodScheduled,
				Status:  v1.ConditionFalse,
				Reason:  "SchedulerError",
				Message: err.Error(),
			})
			return err
		}
		if !allBound {
			err = fmt.Errorf("Volume binding started, waiting for completion")
			if bindingRequired {
				if sched.config.Ecache != nil {
					invalidPredicates := sets.NewString(predicates.CheckVolumeBindingPred)
					sched.config.Ecache.InvalidatePredicates(invalidPredicates)
				}

				// bindVolumesWorker() will update the Pod object to put it back in the scheduler queue
				sched.config.VolumeBinder.BindQueue.Add(assumed)
			} else {
				// We are just waiting for the PV controller to finish binding, put it back in the
				// scheduler queue
				sched.config.Error(assumed, err)
				sched.config.Recorder.Eventf(assumed, v1.EventTypeNormal, "FailedScheduling", "%v", err)
				sched.config.PodConditionUpdater.Update(assumed, &v1.PodCondition{
					Type:   v1.PodScheduled,
					Status: v1.ConditionFalse,
					Reason: "VolumeBindingWaiting",
				})
			}
			return err
		}
	}
	return nil
}

// bindVolumesWorker() processes pods queued in assumeAndBindVolumes() and tries to
// make the API update for volume binding.
// This function runs forever until the volume BindQueue is closed.
func (sched *Scheduler) bindVolumesWorker() {
	workFunc := func() bool {
		keyObj, quit := sched.config.VolumeBinder.BindQueue.Get()
		if quit {
			return true
		}
		defer sched.config.VolumeBinder.BindQueue.Done(keyObj)

		assumed, ok := keyObj.(*v1.Pod)
		if !ok {
			glog.V(4).Infof("Object is not a *v1.Pod")
			return false
		}

		// TODO: add metrics
		var reason string
		var eventType string

		glog.V(5).Infof("Trying to bind volumes for pod \"%v/%v\"", assumed.Namespace, assumed.Name)

		// The Pod is always sent back to the scheduler afterwards.
		err := sched.config.VolumeBinder.Binder.BindPodVolumes(assumed)
		if err != nil {
			glog.V(1).Infof("Failed to bind volumes for pod \"%v/%v\": %v", assumed.Namespace, assumed.Name, err)
			reason = "VolumeBindingFailed"
			eventType = v1.EventTypeWarning
		} else {
			glog.V(4).Infof("Successfully bound volumes for pod \"%v/%v\"", assumed.Namespace, assumed.Name)
			reason = "VolumeBindingWaiting"
			eventType = v1.EventTypeNormal
			err = fmt.Errorf("Volume binding started, waiting for completion")
		}

		// Always fail scheduling regardless of binding success.
		// The Pod needs to be sent back through the scheduler to:
		// * Retry volume binding if it fails.
		// * Retry volume binding if dynamic provisioning fails.
		// * Bind the Pod to the Node once all volumes are bound.
		sched.config.Error(assumed, err)
		sched.config.Recorder.Eventf(assumed, eventType, "FailedScheduling", "%v", err)
		sched.config.PodConditionUpdater.Update(assumed, &v1.PodCondition{
			Type:   v1.PodScheduled,
			Status: v1.ConditionFalse,
			Reason: reason,
		})
		return false
	}

	for {
		if quit := workFunc(); quit {
			glog.V(4).Infof("bindVolumesWorker shutting down")
			break
		}
	}
}

// assume signals to the cache that a pod is already in the cache, so that binding can be asynchronous.
// assume modifies `assumed`.
func (sched *Scheduler) assume(assumed *v1.Pod, host string) error {
	// Optimistically assume that the binding will succeed and send it to the apiserver
	// in the background.
	// If the binding fails, the scheduler will release resources allocated to the assumed pod
	// immediately.
	assumed.Spec.NodeName = host
	// NOTE: Because the scheduler uses snapshots of SchedulerCache and the live
	// version of Ecache, updates must be written to SchedulerCache before
	// invalidating Ecache.
	if err := sched.config.SchedulerCache.AssumePod(assumed); err != nil {
		glog.Errorf("scheduler cache AssumePod failed: %v", err)

		// This is most probably the result of a BUG in the retrying logic.
		// We report an error here so that pod scheduling can be retried.
		// This relies on the fact that Error will check if the pod has been bound
		// to a node and if so will not add it back to the unscheduled pods queue
		// (otherwise this would cause an infinite loop).
		sched.config.Error(assumed, err)
		sched.config.Recorder.Eventf(assumed, v1.EventTypeWarning, "FailedScheduling", "AssumePod failed: %v", err)
		sched.config.PodConditionUpdater.Update(assumed, &v1.PodCondition{
			Type:    v1.PodScheduled,
			Status:  v1.ConditionFalse,
			Reason:  "SchedulerError",
			Message: err.Error(),
		})
		return err
	}

	// Optimistically assume that the binding will succeed, so we need to invalidate affected
	// predicates in the equivalence cache.
	// If the binding fails, these invalidated items will not break anything.
	if sched.config.Ecache != nil {
		sched.config.Ecache.InvalidateCachedPredicateItemForPodAdd(assumed, host)
	}
	return nil
}

// bind binds a pod to a given node defined in a binding object. We expect this to run asynchronously, so we
// handle binding metrics internally.
func (sched *Scheduler) bind(assumed *v1.Pod, b *v1.Binding) error {
	bindingStart := time.Now()
	// If binding succeeded then the PodScheduled condition will be updated in the apiserver so that
	// it's atomic with setting the host.
	err := sched.config.GetBinder(assumed).Bind(b)
	if err := sched.config.SchedulerCache.FinishBinding(assumed); err != nil {
		glog.Errorf("scheduler cache FinishBinding failed: %v", err)
	}
	if err != nil {
		glog.V(1).Infof("Failed to bind pod: %v/%v", assumed.Namespace, assumed.Name)
		if err := sched.config.SchedulerCache.ForgetPod(assumed); err != nil {
			glog.Errorf("scheduler cache ForgetPod failed: %v", err)
		}
		sched.config.Error(assumed, err)
		sched.config.Recorder.Eventf(assumed, v1.EventTypeWarning, "FailedScheduling", "Binding rejected: %v", err)
		sched.config.PodConditionUpdater.Update(assumed, &v1.PodCondition{
			Type:   v1.PodScheduled,
			Status: v1.ConditionFalse,
			Reason: "BindingRejected",
		})
		return err
	}

	metrics.BindingLatency.Observe(metrics.SinceInMicroseconds(bindingStart))
	metrics.SchedulingLatency.WithLabelValues(metrics.Binding).Observe(metrics.SinceInSeconds(bindingStart))
	sched.config.Recorder.Eventf(assumed, v1.EventTypeNormal, "Scheduled", "Successfully assigned %v/%v to %v", assumed.Namespace, assumed.Name, b.Target.Name)
	return nil
}

// scheduleOne does the entire scheduling workflow for a single pod. It is serialized on the scheduling algorithm's host fitting.
func (sched *Scheduler) scheduleOne() {
	pod := sched.config.NextPod()
	if pod.DeletionTimestamp != nil {
		sched.config.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", "skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name)
		glog.V(3).Infof("Skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name)
		return
	}

	glog.V(3).Infof("Attempting to schedule pod: %v/%v", pod.Namespace, pod.Name)

	// Synchronously attempt to find a fit for the pod.
	start := time.Now()
	suggestedHost, err := sched.schedule(pod)
	if err != nil {
		// schedule() may have failed because the pod would not fit on any host, so we try to
		// preempt, with the expectation that the next time the pod is tried for scheduling it
		// will fit due to the preemption. It is also possible that a different pod will schedule
		// into the resources that were preempted, but this is harmless.
		if fitError, ok := err.(*core.FitError); ok {
			preemptionStartTime := time.Now()
			sched.preempt(pod, fitError)
			metrics.PreemptionAttempts.Inc()
			metrics.SchedulingAlgorithmPremptionEvaluationDuration.Observe(metrics.SinceInMicroseconds(preemptionStartTime))
			metrics.SchedulingLatency.WithLabelValues(metrics.PreemptionEvaluation).Observe(metrics.SinceInSeconds(preemptionStartTime))
		}
		return
	}
	metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInMicroseconds(start))
	// Tell the cache to assume that a pod now is running on a given node, even though it hasn't been bound yet.
	// This allows us to keep scheduling without waiting on binding to occur.
	assumedPod := pod.DeepCopy()

	// Assume volumes first before assuming the pod.
	//
	// If no volumes need binding, then nil is returned, and we continue to assume the pod.
	//
	// Otherwise, an error is returned and volume binding is started asynchronously for all of the pod's volumes.
	// scheduleOne() returns immediately on error, so that it doesn't continue to assume the pod.
	//
	// After the asynchronous volume binding updates are made, the pod is sent back through the scheduler for
	// subsequent passes until all volumes are fully bound.
	//
	// This function modifies 'assumedPod' if volume binding is required.
	err = sched.assumeAndBindVolumes(assumedPod, suggestedHost)
	if err != nil {
		return
	}

	// assume modifies `assumedPod` by setting NodeName=suggestedHost
	err = sched.assume(assumedPod, suggestedHost)
	if err != nil {
		return
	}
	// bind the pod to its host asynchronously (we can do this b/c of the assumption step above).
	go func() {
		err := sched.bind(assumedPod, &v1.Binding{
			ObjectMeta: metav1.ObjectMeta{Namespace: assumedPod.Namespace, Name: assumedPod.Name, UID: assumedPod.UID},
			Target: v1.ObjectReference{
				Kind: "Node",
				Name: suggestedHost,
			},
		})
		metrics.E2eSchedulingLatency.Observe(metrics.SinceInMicroseconds(start))
		if err != nil {
			glog.Errorf("Internal error binding pod: (%v)", err)
		}
	}()
}