Scheduler changes to assume volume and pod together, and then bind volume and pod asynchronously afterwards.

This will also make it easier to migrate to the scheduler framework.
Michelle Au 2018-08-17 17:45:51 -07:00
parent 37d46a1e3f
commit 01d83fa104
4 changed files with 55 additions and 135 deletions
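In rough terms, the commit changes scheduleOne to the flow sketched below: the scheduler assumes the volumes and the pod synchronously on the scheduling thread, then a single goroutine binds the volumes (if any still need binding) followed by the pod. The sketch is a simplified, self-contained illustration of that pattern; the pod type and the helper stubs (assumeVolumes, assumePod, bindVolumes, bindPod) are hypothetical stand-ins that mirror names in the diff, not the real scheduler API.

package main

// Hypothetical sketch of the "assume together, bind asynchronously" pattern
// introduced by this commit. Types and helpers are stand-ins, not the real
// kubernetes scheduler internals.

import (
	"fmt"
	"sync"
)

type pod struct{ name string }

// assumeVolumes updates an in-memory volume cache with the chosen bindings
// and reports whether all of the pod's volumes are already bound.
func assumeVolumes(p *pod, node string) (allBound bool, err error) {
	return false, nil // pretend at least one volume still needs binding
}

// assumePod records the pod as placed in the scheduler cache.
func assumePod(p *pod, node string) error { return nil }

// bindVolumes issues the API updates for the assumed bindings and waits for
// the PV controller to finish; an error means the pod is retried.
func bindVolumes(p *pod) error { return nil }

// bindPod binds the pod to the node via the API server.
func bindPod(p *pod, node string) error { return nil }

func scheduleOne(p *pod, node string, wg *sync.WaitGroup) {
	allBound, err := assumeVolumes(p, node)
	if err != nil {
		return // send the pod back through the scheduler
	}
	if err := assumePod(p, node); err != nil {
		return
	}
	wg.Add(1)
	go func() {
		defer wg.Done()
		// Bind volumes first, then the pod, in one asynchronous step.
		if !allBound {
			if err := bindVolumes(p); err != nil {
				return // pod is re-queued for scheduling
			}
		}
		if err := bindPod(p, node); err != nil {
			fmt.Printf("binding %s failed: %v\n", p.name, err)
			return
		}
		fmt.Printf("bound %s to %s\n", p.name, node)
	}()
}

func main() {
	var wg sync.WaitGroup
	scheduleOne(&pod{name: "foo"}, "machine1", &wg)
	wg.Wait()
}

Because the pod is already assumed when the goroutine starts, a failed volume bind only needs to forget the pod from the scheduler cache and send it back through scheduling, which is what the bindVolumes error path in the diff below does.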

View File

@@ -17,7 +17,6 @@ limitations under the License.
package scheduler
import (
"fmt"
"time"
"k8s.io/api/core/v1"
@@ -184,10 +183,6 @@ func (sched *Scheduler) Run() {
return
}
if utilfeature.DefaultFeatureGate.Enabled(features.VolumeScheduling) {
go sched.config.VolumeBinder.Run(sched.bindVolumesWorker, sched.config.StopEverything)
}
go wait.Until(sched.scheduleOne, 0, sched.config.StopEverything)
}
@@ -265,17 +260,12 @@ func (sched *Scheduler) preempt(preemptor *v1.Pod, scheduleErr error) (string, e
return nodeName, err
}
// assumeAndBindVolumes will update the volume cache and then asynchronously bind volumes if required.
//
// If volume binding is required, then the bind volumes routine will update the pod to send it back through
// the scheduler.
//
// Otherwise, return nil error and continue to assume the pod.
// assumeVolumes will update the volume cache with the chosen bindings
//
// This function modifies assumed if volume binding is required.
func (sched *Scheduler) assumeAndBindVolumes(assumed *v1.Pod, host string) error {
func (sched *Scheduler) assumeVolumes(assumed *v1.Pod, host string) (allBound bool, err error) {
if utilfeature.DefaultFeatureGate.Enabled(features.VolumeScheduling) {
allBound, bindingRequired, err := sched.config.VolumeBinder.Binder.AssumePodVolumes(assumed, host)
allBound, err = sched.config.VolumeBinder.Binder.AssumePodVolumes(assumed, host)
if err != nil {
sched.config.Error(assumed, err)
sched.config.Recorder.Eventf(assumed, v1.EventTypeWarning, "FailedScheduling", "AssumePodVolumes failed: %v", err)
@@ -285,76 +275,38 @@ func (sched *Scheduler) assumeAndBindVolumes(assumed *v1.Pod, host string) error
Reason: "SchedulerError",
Message: err.Error(),
})
return err
}
if !allBound {
err = fmt.Errorf("Volume binding started, waiting for completion")
if bindingRequired {
if sched.config.Ecache != nil {
invalidPredicates := sets.NewString(predicates.CheckVolumeBindingPred)
sched.config.Ecache.InvalidatePredicates(invalidPredicates)
}
// bindVolumesWorker() will update the Pod object to put it back in the scheduler queue
sched.config.VolumeBinder.BindQueue.Add(assumed)
} else {
// We are just waiting for PV controller to finish binding, put it back in the
// scheduler queue
sched.config.Error(assumed, err)
sched.config.Recorder.Eventf(assumed, v1.EventTypeNormal, "FailedScheduling", "%v", err)
sched.config.PodConditionUpdater.Update(assumed, &v1.PodCondition{
Type: v1.PodScheduled,
Status: v1.ConditionFalse,
Reason: "VolumeBindingWaiting",
})
}
return err
// Invalidate ecache because assumed volumes could have affected the cached
// pvs for other pods
if sched.config.Ecache != nil {
invalidPredicates := sets.NewString(predicates.CheckVolumeBindingPred)
sched.config.Ecache.InvalidatePredicates(invalidPredicates)
}
}
return nil
return
}
// bindVolumesWorker() processes pods queued in assumeAndBindVolumes() and tries to
// make the API update for volume binding.
// This function runs forever until the volume BindQueue is closed.
func (sched *Scheduler) bindVolumesWorker() {
workFunc := func() bool {
keyObj, quit := sched.config.VolumeBinder.BindQueue.Get()
if quit {
return true
}
defer sched.config.VolumeBinder.BindQueue.Done(keyObj)
// bindVolumes will make the API update with the assumed bindings and wait until
// the PV controller has completely finished the binding operation.
//
// If binding errors, times out or gets undone, then an error will be returned to
// retry scheduling.
func (sched *Scheduler) bindVolumes(assumed *v1.Pod) error {
var reason string
var eventType string
assumed, ok := keyObj.(*v1.Pod)
if !ok {
glog.V(4).Infof("Object is not a *v1.Pod")
return false
glog.V(5).Infof("Trying to bind volumes for pod \"%v/%v\"", assumed.Namespace, assumed.Name)
err := sched.config.VolumeBinder.Binder.BindPodVolumes(assumed)
if err != nil {
glog.V(1).Infof("Failed to bind volumes for pod \"%v/%v\": %v", assumed.Namespace, assumed.Name, err)
// Unassume the Pod and retry scheduling
if forgetErr := sched.config.SchedulerCache.ForgetPod(assumed); forgetErr != nil {
glog.Errorf("scheduler cache ForgetPod failed: %v", forgetErr)
}
// TODO: add metrics
var reason string
var eventType string
glog.V(5).Infof("Trying to bind volumes for pod \"%v/%v\"", assumed.Namespace, assumed.Name)
// The Pod is always sent back to the scheduler afterwards.
err := sched.config.VolumeBinder.Binder.BindPodVolumes(assumed)
if err != nil {
glog.V(1).Infof("Failed to bind volumes for pod \"%v/%v\": %v", assumed.Namespace, assumed.Name, err)
reason = "VolumeBindingFailed"
eventType = v1.EventTypeWarning
} else {
glog.V(4).Infof("Successfully bound volumes for pod \"%v/%v\"", assumed.Namespace, assumed.Name)
reason = "VolumeBindingWaiting"
eventType = v1.EventTypeNormal
err = fmt.Errorf("Volume binding started, waiting for completion")
}
// Always fail scheduling regardless of binding success.
// The Pod needs to be sent back through the scheduler to:
// * Retry volume binding if it fails.
// * Retry volume binding if dynamic provisioning fails.
// * Bind the Pod to the Node once all volumes are bound.
reason = "VolumeBindingFailed"
eventType = v1.EventTypeWarning
sched.config.Error(assumed, err)
sched.config.Recorder.Eventf(assumed, eventType, "FailedScheduling", "%v", err)
sched.config.PodConditionUpdater.Update(assumed, &v1.PodCondition{
@@ -362,15 +314,11 @@ func (sched *Scheduler) bindVolumesWorker() {
Status: v1.ConditionFalse,
Reason: reason,
})
return false
return err
}
for {
if quit := workFunc(); quit {
glog.V(4).Infof("bindVolumesWorker shutting down")
break
}
}
glog.V(5).Infof("Success binding volumes for pod \"%v/%v\"", assumed.Namespace, assumed.Name)
return nil
}
// assume signals to the cache that a pod is already in the cache, so that binding can be asynchronous.
@@ -478,16 +426,12 @@ func (sched *Scheduler) scheduleOne() {
// Assume volumes first before assuming the pod.
//
// If no volumes need binding, then nil is returned, and continue to assume the pod.
// If all volumes are completely bound, then allBound is true and binding will be skipped.
//
// Otherwise, error is returned and volume binding is started asynchronously for all of the pod's volumes.
// scheduleOne() returns immediately on error, so that it doesn't continue to assume the pod.
//
// After the asynchronous volume binding updates are made, it will send the pod back through the scheduler for
// subsequent passes until all volumes are fully bound.
// Otherwise, binding of volumes is started after the pod is assumed, but before pod binding.
//
// This function modifies 'assumedPod' if volume binding is required.
err = sched.assumeAndBindVolumes(assumedPod, suggestedHost)
allBound, err := sched.assumeVolumes(assumedPod, suggestedHost)
if err != nil {
return
}
@@ -499,6 +443,14 @@ func (sched *Scheduler) scheduleOne() {
}
// bind the pod to its host asynchronously (we can do this b/c of the assumption step above).
go func() {
// Bind volumes first before Pod
if !allBound {
err = sched.bindVolumes(assumedPod)
if err != nil {
return
}
}
err := sched.bind(assumedPod, &v1.Binding{
ObjectMeta: metav1.ObjectMeta{Namespace: assumedPod.Namespace, Name: assumedPod.Name, UID: assumedPod.UID},
Target: v1.ObjectReference{

View File

@@ -707,8 +707,7 @@ func TestSchedulerWithVolumeBinding(t *testing.T) {
},
expectAssumeCalled: true,
expectPodBind: &v1.Binding{ObjectMeta: metav1.ObjectMeta{Name: "foo", UID: types.UID("foo")}, Target: v1.ObjectReference{Kind: "Node", Name: "machine1"}},
eventReason: "Scheduled",
eventReason: "Scheduled",
},
{
name: "bound/invalid pv affinity",
@@ -739,28 +738,15 @@ func TestSchedulerWithVolumeBinding(t *testing.T) {
expectError: makePredicateError("1 node(s) didn't find available persistent volumes to bind, 1 node(s) had volume node affinity conflict"),
},
{
name: "unbound/found matches",
name: "unbound/found matches/bind succeeds",
volumeBinderConfig: &persistentvolume.FakeVolumeBinderConfig{
FindUnboundSatsified: true,
FindBoundSatsified: true,
AssumeBindingRequired: true,
},
expectAssumeCalled: true,
expectBindCalled: true,
eventReason: "FailedScheduling",
expectError: fmt.Errorf("Volume binding started, waiting for completion"),
},
{
name: "unbound/found matches/already-bound",
volumeBinderConfig: &persistentvolume.FakeVolumeBinderConfig{
FindUnboundSatsified: true,
FindBoundSatsified: true,
AssumeBindingRequired: false,
},
expectAssumeCalled: true,
expectBindCalled: false,
eventReason: "FailedScheduling",
expectError: fmt.Errorf("Volume binding started, waiting for completion"),
expectPodBind: &v1.Binding{ObjectMeta: metav1.ObjectMeta{Name: "foo", UID: types.UID("foo")}, Target: v1.ObjectReference{Kind: "Node", Name: "machine1"}},
eventReason: "Scheduled",
},
{
name: "predicate error",
@@ -784,10 +770,9 @@ func TestSchedulerWithVolumeBinding(t *testing.T) {
{
name: "bind error",
volumeBinderConfig: &persistentvolume.FakeVolumeBinderConfig{
FindUnboundSatsified: true,
FindBoundSatsified: true,
AssumeBindingRequired: true,
BindErr: bindErr,
},
expectAssumeCalled: true,
expectBindCalled: true,
@@ -814,8 +799,6 @@ func TestSchedulerWithVolumeBinding(t *testing.T) {
close(eventChan)
})
go fakeVolumeBinder.Run(s.bindVolumesWorker, stop)
s.scheduleOne()
// Wait for pod to succeed or fail scheduling

View File

@@ -8,11 +8,9 @@ go_library(
deps = [
"//pkg/controller/volume/persistentvolume:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
"//staging/src/k8s.io/client-go/informers/core/v1:go_default_library",
"//staging/src/k8s.io/client-go/informers/storage/v1:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes:go_default_library",
"//staging/src/k8s.io/client-go/util/workqueue:go_default_library",
],
)

View File

@@ -20,19 +20,15 @@ import (
"time"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/wait"
coreinformers "k8s.io/client-go/informers/core/v1"
storageinformers "k8s.io/client-go/informers/storage/v1"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/util/workqueue"
"k8s.io/kubernetes/pkg/controller/volume/persistentvolume"
)
// VolumeBinder sets up the volume binding library and manages
// the volume binding operations with a queue.
// VolumeBinder sets up the volume binding library
type VolumeBinder struct {
Binder persistentvolume.SchedulerVolumeBinder
BindQueue *workqueue.Type
}
// NewVolumeBinder sets up the volume binding library and binding queue
@@ -43,27 +39,18 @@ func NewVolumeBinder(
storageClassInformer storageinformers.StorageClassInformer) *VolumeBinder {
return &VolumeBinder{
Binder: persistentvolume.NewVolumeBinder(client, pvcInformer, pvInformer, storageClassInformer),
BindQueue: workqueue.NewNamed("podsToBind"),
// TODO: what is a good bind timeout value?
Binder: persistentvolume.NewVolumeBinder(client, pvcInformer, pvInformer, storageClassInformer, 10*time.Minute),
}
}
// NewFakeVolumeBinder sets up a fake volume binder and binding queue
func NewFakeVolumeBinder(config *persistentvolume.FakeVolumeBinderConfig) *VolumeBinder {
return &VolumeBinder{
Binder: persistentvolume.NewFakeVolumeBinder(config),
BindQueue: workqueue.NewNamed("podsToBind"),
}
}
// Run starts a goroutine to handle the binding queue with the given function.
func (b *VolumeBinder) Run(bindWorkFunc func(), stopCh <-chan struct{}) {
go wait.Until(bindWorkFunc, time.Second, stopCh)
<-stopCh
b.BindQueue.ShutDown()
}
// DeletePodBindings will delete the cached volume bindings for the given pod.
func (b *VolumeBinder) DeletePodBindings(pod *v1.Pod) {
cache := b.Binder.GetBindingsCache()