implement PodActivator to activate when preemption fails

commit c322294883 (parent 8f2243fe74)
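For orientation: the core of this change is a new PodActivator abstraction in the scheduler framework, plus a deferred call in the asynchronous preemption path that re-activates the preemptor whenever a preemption API call fails, so the pod does not get stuck in the unschedulable pool. The new interface, copied from the hunks below for convenience:

// PodActivator abstracts operations in the scheduling queue.
type PodActivator interface {
	// Activate moves the given pods to activeQ iff they're in unschedulablePods or backoffQ.
	Activate(logger klog.Logger, pods map[string]*v1.Pod)
}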
@@ -770,6 +770,8 @@ type Framework interface {
 
 	// SetPodNominator sets the PodNominator
 	SetPodNominator(nominator PodNominator)
+	// SetPodActivator sets the PodActivator
+	SetPodActivator(activator PodActivator)
 
 	// Close calls Close method of each plugin.
 	Close() error
@@ -783,6 +785,8 @@ type Handle interface {
 	PodNominator
 	// PluginsRunner abstracts operations to run some plugins.
 	PluginsRunner
+	// PodActivator abstracts operations in the scheduling queue.
+	PodActivator
 	// SnapshotSharedLister returns listers from the latest NodeInfo Snapshot. The snapshot
 	// is taken at the beginning of a scheduling cycle and remains unchanged until
 	// a pod finishes "Permit" point.
@@ -896,6 +900,12 @@ func (ni *NominatingInfo) Mode() NominatingMode {
 	return ni.NominatingMode
 }
 
+// PodActivator abstracts operations in the scheduling queue.
+type PodActivator interface {
+	// Activate moves the given pods to activeQ iff they're in unschedulablePods or backoffQ.
+	Activate(logger klog.Logger, pods map[string]*v1.Pod)
+}
+
 // PodNominator abstracts operations to maintain nominated Pods.
 type PodNominator interface {
 	// AddNominatedPod adds the given pod to the nominator or
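Because Handle now embeds PodActivator, any plugin that holds a framework.Handle can re-activate a pod it is tracking. A minimal sketch of such a call site (hypothetical plugin code, not part of this commit; the package and helper name are illustrative):

package example

import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/scheduler/framework"
)

// reactivate asks the scheduling queue, via the framework handle, to move the
// pod back to activeQ. Inside the queue this is a no-op unless the pod is
// currently in unschedulablePods or backoffQ.
func reactivate(logger klog.Logger, h framework.Handle, pod *v1.Pod) {
	h.Activate(logger, map[string]*v1.Pod{pod.Name: pod})
}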
@@ -492,6 +492,13 @@ func (ev *Evaluator) prepareCandidateAsync(c Candidate, pod *v1.Pod, pluginName
 		result := metrics.GoroutineResultSuccess
 		defer metrics.PreemptionGoroutinesDuration.WithLabelValues(result).Observe(metrics.SinceInSeconds(startTime))
 		defer metrics.PreemptionGoroutinesExecutionTotal.WithLabelValues(result).Inc()
+		defer func() {
+			if result == metrics.GoroutineResultError {
+				// When API call isn't successful, the Pod may get stuck in the unschedulable pod pool in the worst case.
+				// So, we should move the Pod to the activeQ anyways.
+				ev.Handler.Activate(logger, map[string]*v1.Pod{pod.Name: pod})
+			}
+		}()
 		defer cancel()
 		logger.V(2).Info("Start the preemption asynchronously", "preemptor", klog.KObj(pod), "node", c.Name(), "numVictims", len(c.Victims().Pods))
 
@@ -506,13 +513,6 @@ func (ev *Evaluator) prepareCandidateAsync(c Candidate, pod *v1.Pod, pluginName
 			// We do not return as this error is not critical.
 		}
 
-		// We can evict all victims in parallel, but the last one.
-		// We have to remove the pod from the preempting map before the last one is evicted
-		// because, otherwise, the pod removal might be notified to the scheduling queue before
-		// we remove this pod from the preempting map,
-		// and the pod could end up stucking at the unschedulable pod pool
-		// by all the pod removal events being ignored.
-
 		if len(c.Victims().Pods) == 0 {
 			ev.mu.Lock()
 			delete(ev.preempting, pod.UID)
@@ -521,9 +521,15 @@ func (ev *Evaluator) prepareCandidateAsync(c Candidate, pod *v1.Pod, pluginName
 			return
 		}
 
+		// We can evict all victims in parallel, but the last one.
+		// We have to remove the pod from the preempting map before the last one is evicted
+		// because, otherwise, the pod removal might be notified to the scheduling queue before
+		// we remove this pod from the preempting map,
+		// and the pod could end up stucking at the unschedulable pod pool
+		// by all the pod removal events being ignored.
 		ev.Handler.Parallelizer().Until(ctx, len(c.Victims().Pods)-1, preemptPod, ev.PluginName)
 		if err := errCh.ReceiveError(); err != nil {
-			logger.Error(err, "Error occurred during preemption")
+			logger.Error(err, "Error occurred during async preemption")
 			result = metrics.GoroutineResultError
 		}
 
@@ -532,7 +538,7 @@ func (ev *Evaluator) prepareCandidateAsync(c Candidate, pod *v1.Pod, pluginName
 		ev.mu.Unlock()
 
 		if err := ev.PreemptPod(ctx, c, pod, c.Victims().Pods[len(c.Victims().Pods)-1], pluginName); err != nil {
-			logger.Error(err, "Error occurred during preemption")
+			logger.Error(err, "Error occurred during async preemption")
 			result = metrics.GoroutineResultError
 		}
 
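Note the design choice in the first hunk above: the activation check is wrapped in defer func() { ... }() rather than being deferred directly, because arguments to a plain deferred call are evaluated when the defer statement executes, while a deferred closure reads result at return time, after the eviction calls may have set it to GoroutineResultError. A standalone sketch of the difference (illustrative only, not code from this commit):

package main

import "fmt"

func main() {
	result := "success"

	// Arguments are evaluated here, so this always prints "success".
	defer fmt.Println("plain defer saw:", result)

	// The closure reads result when it runs at return, so it prints "error".
	defer func() { fmt.Println("closure defer saw:", result) }()

	result = "error"
}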
@@ -20,6 +20,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"reflect"
 	"sort"
 	"testing"
 	"time"
@@ -37,6 +38,7 @@ import (
 	"k8s.io/client-go/kubernetes/scheme"
 	clienttesting "k8s.io/client-go/testing"
 	"k8s.io/client-go/tools/events"
+	"k8s.io/klog/v2"
 	"k8s.io/klog/v2/ktesting"
 	extenderv1 "k8s.io/kube-scheduler/extender/v1"
 	internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache"
@@ -86,6 +88,16 @@ func (pl *FakePostFilterPlugin) OrderedScoreFuncs(ctx context.Context, nodesToVi
 	return nil
 }
 
+type fakePodActivator struct {
+	activatedPods map[string]*v1.Pod
+}
+
+func (f *fakePodActivator) Activate(logger klog.Logger, pods map[string]*v1.Pod) {
+	for name, pod := range pods {
+		f.activatedPods[name] = pod
+	}
+}
+
 type FakePreemptionScorePostFilterPlugin struct{}
 
 func (pl *FakePreemptionScorePostFilterPlugin) SelectVictimsOnNode(
@@ -437,6 +449,7 @@ func TestPrepareCandidate(t *testing.T) {
 		expectedStatus *framework.Status
 		// Only compared when async preemption is enabled.
 		expectedPreemptingMap sets.Set[types.UID]
+		expectedActivatedPods map[string]*v1.Pod
 	}{
 		{
 			name: "no victims",
@@ -527,6 +540,7 @@ func TestPrepareCandidate(t *testing.T) {
 			nodeNames: []string{node1Name},
 			expectedStatus: framework.AsStatus(errors.New("delete pod failed")),
 			expectedPreemptingMap: sets.New(types.UID("preemptor")),
+			expectedActivatedPods: map[string]*v1.Pod{preemptor.Name: preemptor},
 		},
 		{
 			name: "one victim, not-found victim error is ignored when deleting",
@@ -563,6 +577,7 @@ func TestPrepareCandidate(t *testing.T) {
 			nodeNames: []string{node1Name},
 			expectedStatus: framework.AsStatus(errors.New("patch pod status failed")),
 			expectedPreemptingMap: sets.New(types.UID("preemptor")),
+			expectedActivatedPods: map[string]*v1.Pod{preemptor.Name: preemptor},
 		},
 		{
 			name: "two victims without condition, one passes successfully and the second fails",
@@ -585,6 +600,7 @@ func TestPrepareCandidate(t *testing.T) {
 			expectedDeletedPods: []string{"victim2"},
 			expectedStatus: framework.AsStatus(errors.New("patch pod status failed")),
 			expectedPreemptingMap: sets.New(types.UID("preemptor")),
+			expectedActivatedPods: map[string]*v1.Pod{preemptor.Name: preemptor},
 		},
 	}
 
@@ -638,6 +654,7 @@ func TestPrepareCandidate(t *testing.T) {
 
 			informerFactory := informers.NewSharedInformerFactory(cs, 0)
 			eventBroadcaster := events.NewBroadcaster(&events.EventSinkImpl{Interface: cs.EventsV1()})
+			fakeActivator := &fakePodActivator{activatedPods: make(map[string]*v1.Pod)}
 			fwk, err := tf.NewFramework(
 				ctx,
 				registeredPlugins, "",
@@ -648,6 +665,7 @@ func TestPrepareCandidate(t *testing.T) {
 				frameworkruntime.WithSnapshotSharedLister(internalcache.NewSnapshot(tt.testPods, nodes)),
 				frameworkruntime.WithPodNominator(internalqueue.NewSchedulingQueue(nil, informerFactory)),
 				frameworkruntime.WithEventRecorder(eventBroadcaster.NewRecorder(scheme.Scheme, "test-scheduler")),
+				frameworkruntime.WithPodActivator(fakeActivator),
 			)
 			if err != nil {
 				t.Fatal(err)
@@ -671,8 +689,6 @@ func TestPrepareCandidate(t *testing.T) {
 				pe.mu.Unlock()
 				// make the requests complete
 				close(requestStopper)
-
-				return
 			} else {
 				close(requestStopper) // no need to stop requests
 				status := pe.prepareCandidate(ctx, tt.candidate, tt.preemptor, "test-plugin")
@@ -705,6 +721,18 @@ func TestPrepareCandidate(t *testing.T) {
 					lastErrMsg = fmt.Sprintf("expected patch error %v, got %v", tt.expectedPatchError, patchFailure)
 					return false, nil
 				}
+
+				if asyncPreemptionEnabled {
+					if tt.expectedActivatedPods != nil && !reflect.DeepEqual(tt.expectedActivatedPods, fakeActivator.activatedPods) {
+						lastErrMsg = fmt.Sprintf("expected activated pods %v, got %v", tt.expectedActivatedPods, fakeActivator.activatedPods)
+						return false, nil
+					}
+					if tt.expectedActivatedPods == nil && len(fakeActivator.activatedPods) != 0 {
+						lastErrMsg = fmt.Sprintf("expected no activated pods, got %v", fakeActivator.activatedPods)
+						return false, nil
+					}
+				}
+
 				return true, nil
 			}); err != nil {
 				t.Fatal(lastErrMsg)
@@ -84,6 +84,7 @@ type frameworkImpl struct {
 
 	extenders []framework.Extender
 	framework.PodNominator
+	framework.PodActivator
 
 	parallelizer parallelize.Parallelizer
 }
@@ -131,6 +132,7 @@ type frameworkOptions struct {
 	snapshotSharedLister framework.SharedLister
 	metricsRecorder *metrics.MetricAsyncRecorder
 	podNominator framework.PodNominator
+	podActivator framework.PodActivator
 	extenders []framework.Extender
 	captureProfile CaptureProfile
 	parallelizer parallelize.Parallelizer
@@ -200,6 +202,12 @@ func WithPodNominator(nominator framework.PodNominator) Option {
 	}
 }
 
+func WithPodActivator(activator framework.PodActivator) Option {
+	return func(o *frameworkOptions) {
+		o.podActivator = activator
+	}
+}
+
 // WithExtenders sets extenders for the scheduling frameworkImpl.
 func WithExtenders(extenders []framework.Extender) Option {
 	return func(o *frameworkOptions) {
@@ -279,6 +287,7 @@ func NewFramework(ctx context.Context, r Registry, profile *config.KubeScheduler
 		metricsRecorder: options.metricsRecorder,
 		extenders: options.extenders,
 		PodNominator: options.podNominator,
+		PodActivator: options.podActivator,
 		parallelizer: options.parallelizer,
 		logger: logger,
 	}
@@ -427,6 +436,10 @@ func (f *frameworkImpl) SetPodNominator(n framework.PodNominator) {
 	f.PodNominator = n
 }
 
+func (f *frameworkImpl) SetPodActivator(a framework.PodActivator) {
+	f.PodActivator = a
+}
+
 // Close closes each plugin, when they implement io.Closer interface.
 func (f *frameworkImpl) Close() error {
 	var errs []error
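frameworkImpl satisfies the new Activate method by embedding framework.PodActivator and delegating to whatever activator is injected, either up front via WithPodActivator or later via SetPodActivator. A standalone sketch of that embedding-and-delegation pattern (illustrative names only, not code from this commit):

package main

import "fmt"

// Activator stands in for framework.PodActivator in this sketch.
type Activator interface {
	Activate(names ...string)
}

// queue stands in for the scheduling queue, which implements Activate.
type queue struct{}

func (queue) Activate(names ...string) { fmt.Println("moved to activeQ:", names) }

// handle stands in for frameworkImpl: embedding the interface means the
// handle itself satisfies Activator and forwards calls to the injected value.
type handle struct {
	Activator
}

func main() {
	h := handle{Activator: queue{}}
	h.Activate("preemptor-pod")
}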
@@ -355,6 +355,7 @@ func New(ctx context.Context,
 
 	for _, fwk := range profiles {
 		fwk.SetPodNominator(podQueue)
+		fwk.SetPodActivator(podQueue)
 	}
 
 	schedulerCache := internalcache.New(ctx, durationToExpireAssumedPod)