Merge pull request #120253 from pohly/dra-scheduler-podschedulingcontext-updates

dra scheduler: refactor PodSchedulingContext updates
commit a64a3e16ec
Author: Kubernetes Prow Robot
Date:   2023-09-08 02:48:14 -07:00 (committed via GitHub)
2 changed files with 178 additions and 148 deletions
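
At its core, the refactor visible in the diff below replaces the cached-copy-plus-dirty-flag handling of the PodSchedulingContext with a small podSchedulingState type: intended changes (selected node, potential nodes) are recorded as optional pointer fields, and a single publish step creates or updates the API object only when something is actually pending. A minimal, self-contained sketch of that pattern follows; podSchedulingSpec, schedulingState and the map standing in for the API server are simplified, hypothetical stand-ins, not the actual plugin code:

package main

import (
    "fmt"
    "sort"
)

// podSchedulingSpec is a simplified stand-in for the PodSchedulingContext spec.
type podSchedulingSpec struct {
    SelectedNode   string
    PotentialNodes []string
}

// schedulingState mirrors the dirty-tracking pattern from the diff:
// nil pointer fields mean "nothing to change".
type schedulingState struct {
    current        *podSchedulingSpec // what the "API server" has, if anything
    selectedNode   *string
    potentialNodes *[]string
}

func (s *schedulingState) isDirty() bool {
    return s.selectedNode != nil || s.potentialNodes != nil
}

// publish applies the pending changes, creating or updating the stored spec,
// and clears the pending fields afterwards. When nothing is dirty it is a no-op.
func (s *schedulingState) publish(store map[string]*podSchedulingSpec, key string) {
    if !s.isDirty() {
        return // nothing changed, no API call needed
    }
    spec := &podSchedulingSpec{}
    if s.current != nil {
        copySpec := *s.current
        spec = &copySpec
    }
    if s.selectedNode != nil {
        spec.SelectedNode = *s.selectedNode
    }
    if s.potentialNodes != nil {
        spec.PotentialNodes = append([]string(nil), *s.potentialNodes...)
    }
    store[key] = spec
    s.current = spec
    s.selectedNode = nil
    s.potentialNodes = nil
}

func main() {
    store := map[string]*podSchedulingSpec{} // hypothetical stand-in for the API server
    state := &schedulingState{}

    nodes := []string{"node-b", "node-a"}
    sort.Strings(nodes)
    state.potentialNodes = &nodes // PreScore records potential nodes like this

    node := "node-a"
    state.selectedNode = &node // Reserve records the selected node like this

    state.publish(store, "default/my-pod")
    fmt.Printf("%+v\n", *store["default/my-pod"])

    state.publish(store, "default/my-pod") // no-op: nothing is dirty anymore
}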

File 1 of 2:

@@ -31,6 +31,7 @@ import (
     apierrors "k8s.io/apimachinery/pkg/api/errors"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/apimachinery/pkg/runtime"
+    "k8s.io/apimachinery/pkg/runtime/schema"
     "k8s.io/apimachinery/pkg/util/sets"
     "k8s.io/client-go/kubernetes"
     resourcev1alpha2listers "k8s.io/client-go/listers/resource/v1alpha2"
@@ -54,6 +55,9 @@ const (
 // framework.CycleState, in the later phases we don't need to call Write method
 // to update the value
 type stateData struct {
+    // preScored is true if PreScore was invoked.
+    preScored bool
+
     // A copy of all claims for the Pod (i.e. 1:1 match with
     // pod.Spec.ResourceClaims), initially with the status from the start
     // of the scheduling cycle. Each claim instance is read-only because it
@@ -72,17 +76,9 @@ type stateData struct {
     // protected by the mutex. Used by PostFilter.
     unavailableClaims sets.Int

-    // A pointer to the PodSchedulingContext object for the pod, if one exists.
-    // Gets set on demand.
-    //
-    // Conceptually, this object belongs into the scheduler framework
-    // where it might get shared by different plugins. But in practice,
-    // it is currently only used by dynamic provisioning and thus
-    // managed entirely here.
-    schedulingCtx *resourcev1alpha2.PodSchedulingContext
-
-    // podSchedulingDirty is true if the current copy was locally modified.
-    podSchedulingDirty bool
+    // podSchedulingState keeps track of the PodSchedulingContext
+    // (if one exists) and the changes made to it.
+    podSchedulingState podSchedulingState

     mutex sync.Mutex
@@ -123,91 +119,108 @@ func (d *stateData) updateClaimStatus(ctx context.Context, clientset kubernetes.
     return nil
 }

-// initializePodSchedulingContext can be called concurrently. It returns an existing PodSchedulingContext
-// object if there is one already, retrieves one if not, or as a last resort creates
-// one from scratch.
-func (d *stateData) initializePodSchedulingContexts(ctx context.Context, pod *v1.Pod, podSchedulingContextLister resourcev1alpha2listers.PodSchedulingContextLister) (*resourcev1alpha2.PodSchedulingContext, error) {
-    // TODO (#113701): check if this mutex locking can be avoided by calling initializePodSchedulingContext during PreFilter.
-    d.mutex.Lock()
-    defer d.mutex.Unlock()
-
-    if d.schedulingCtx != nil {
-        return d.schedulingCtx, nil
-    }
+type podSchedulingState struct {
+    // A pointer to the PodSchedulingContext object for the pod, if one exists
+    // in the API server.
+    //
+    // Conceptually, this object belongs into the scheduler framework
+    // where it might get shared by different plugins. But in practice,
+    // it is currently only used by dynamic provisioning and thus
+    // managed entirely here.
+    schedulingCtx *resourcev1alpha2.PodSchedulingContext
+
+    // selectedNode is set if (and only if) a node has been selected.
+    selectedNode *string
+
+    // potentialNodes is set if (and only if) the potential nodes field
+    // needs to be updated or set.
+    potentialNodes *[]string
+}
+
+func (p *podSchedulingState) isDirty() bool {
+    return p.selectedNode != nil ||
+        p.potentialNodes != nil
+}
+
+// init checks whether there is already a PodSchedulingContext object.
+// Must not be called concurrently,
+func (p *podSchedulingState) init(ctx context.Context, pod *v1.Pod, podSchedulingContextLister resourcev1alpha2listers.PodSchedulingContextLister) error {
     schedulingCtx, err := podSchedulingContextLister.PodSchedulingContexts(pod.Namespace).Get(pod.Name)
     switch {
     case apierrors.IsNotFound(err):
-        controller := true
-        schedulingCtx = &resourcev1alpha2.PodSchedulingContext{
-            ObjectMeta: metav1.ObjectMeta{
-                Name:      pod.Name,
-                Namespace: pod.Namespace,
-                OwnerReferences: []metav1.OwnerReference{
-                    {
-                        APIVersion: "v1",
-                        Kind:       "Pod",
-                        Name:       pod.Name,
-                        UID:        pod.UID,
-                        Controller: &controller,
-                    },
-                },
-            },
-        }
-        err = nil
+        return nil
     case err != nil:
-        return nil, err
+        return err
     default:
         // We have an object, but it might be obsolete.
         if !metav1.IsControlledBy(schedulingCtx, pod) {
-            return nil, fmt.Errorf("PodSchedulingContext object with UID %s is not owned by Pod %s/%s", schedulingCtx.UID, pod.Namespace, pod.Name)
+            return fmt.Errorf("PodSchedulingContext object with UID %s is not owned by Pod %s/%s", schedulingCtx.UID, pod.Namespace, pod.Name)
         }
     }
-    d.schedulingCtx = schedulingCtx
-    return schedulingCtx, err
+    p.schedulingCtx = schedulingCtx
+    return nil
 }

-// publishPodSchedulingContext creates or updates the PodSchedulingContext object.
-func (d *stateData) publishPodSchedulingContexts(ctx context.Context, clientset kubernetes.Interface, schedulingCtx *resourcev1alpha2.PodSchedulingContext) error {
-    d.mutex.Lock()
-    defer d.mutex.Unlock()
+// publish creates or updates the PodSchedulingContext object, if necessary.
+// Must not be called concurrently.
+func (p *podSchedulingState) publish(ctx context.Context, pod *v1.Pod, clientset kubernetes.Interface) error {
+    if !p.isDirty() {
+        return nil
+    }

     var err error
     logger := klog.FromContext(ctx)
-    msg := "Updating PodSchedulingContext"
-    if schedulingCtx.UID == "" {
-        msg = "Creating PodSchedulingContext"
-    }
-    if loggerV := logger.V(6); loggerV.Enabled() {
-        // At a high enough log level, dump the entire object.
-        loggerV.Info(msg, "podSchedulingCtxDump", klog.Format(schedulingCtx))
-    } else {
-        logger.V(5).Info(msg, "podSchedulingCtx", klog.KObj(schedulingCtx))
-    }
-    if schedulingCtx.UID == "" {
-        schedulingCtx, err = clientset.ResourceV1alpha2().PodSchedulingContexts(schedulingCtx.Namespace).Create(ctx, schedulingCtx, metav1.CreateOptions{})
+    if p.schedulingCtx != nil {
+        // Update it.
+        schedulingCtx := p.schedulingCtx.DeepCopy()
+        if p.selectedNode != nil {
+            schedulingCtx.Spec.SelectedNode = *p.selectedNode
+        }
+        if p.potentialNodes != nil {
+            schedulingCtx.Spec.PotentialNodes = *p.potentialNodes
+        }
+        if loggerV := logger.V(6); loggerV.Enabled() {
+            // At a high enough log level, dump the entire object.
+            loggerV.Info("Updating PodSchedulingContext", "podSchedulingCtx", klog.KObj(schedulingCtx), "podSchedulingCtxObject", klog.Format(schedulingCtx))
+        } else {
+            logger.V(5).Info("Updating PodSchedulingContext", "podSchedulingCtx", klog.KObj(schedulingCtx))
+        }
+        _, err = clientset.ResourceV1alpha2().PodSchedulingContexts(schedulingCtx.Namespace).Update(ctx, schedulingCtx, metav1.UpdateOptions{})
     } else {
-        // TODO (#113700): patch here to avoid racing with drivers which update the status.
-        schedulingCtx, err = clientset.ResourceV1alpha2().PodSchedulingContexts(schedulingCtx.Namespace).Update(ctx, schedulingCtx, metav1.UpdateOptions{})
+        // Create it.
+        schedulingCtx := &resourcev1alpha2.PodSchedulingContext{
+            ObjectMeta: metav1.ObjectMeta{
+                Name:            pod.Name,
+                Namespace:       pod.Namespace,
+                OwnerReferences: []metav1.OwnerReference{*metav1.NewControllerRef(pod, schema.GroupVersionKind{Version: "v1", Kind: "Pod"})},
+            },
+        }
+        if p.selectedNode != nil {
+            schedulingCtx.Spec.SelectedNode = *p.selectedNode
+        }
+        if p.potentialNodes != nil {
+            schedulingCtx.Spec.PotentialNodes = *p.potentialNodes
+        }
+        if loggerV := logger.V(6); loggerV.Enabled() {
+            // At a high enough log level, dump the entire object.
+            loggerV.Info("Creating PodSchedulingContext", "podSchedulingCtx", klog.KObj(schedulingCtx), "podSchedulingCtxObject", klog.Format(schedulingCtx))
+        } else {
+            logger.V(5).Info("Creating PodSchedulingContext", "podSchedulingCtx", klog.KObj(schedulingCtx))
+        }
+        _, err = clientset.ResourceV1alpha2().PodSchedulingContexts(schedulingCtx.Namespace).Create(ctx, schedulingCtx, metav1.CreateOptions{})
     }
     if err != nil {
         return err
     }
-    d.schedulingCtx = schedulingCtx
-    d.podSchedulingDirty = false
+    p.potentialNodes = nil
+    p.selectedNode = nil
     return nil
 }

-// storePodSchedulingContext replaces the pod schedulingCtx object in the state.
-func (d *stateData) storePodSchedulingContexts(schedulingCtx *resourcev1alpha2.PodSchedulingContext) {
-    d.mutex.Lock()
-    defer d.mutex.Unlock()
-
-    d.schedulingCtx = schedulingCtx
-    d.podSchedulingDirty = true
-}
-
 func statusForClaim(schedulingCtx *resourcev1alpha2.PodSchedulingContext, podClaimName string) *resourcev1alpha2.ResourceClaimSchedulingStatus {
+    if schedulingCtx == nil {
+        return nil
+    }
     for _, status := range schedulingCtx.Status.ResourceClaims {
         if status.Name == podClaimName {
             return &status
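
In the create path above, the hand-rolled OwnerReference is replaced by metav1.NewControllerRef, which also sets BlockOwnerDeletion; that is what the test-wrapper change in the second file accounts for. A small, self-contained illustration (the pod name, namespace and UID are made up for this example):

package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/runtime/schema"
)

func main() {
    // A throwaway pod; the metadata values are illustrative only.
    pod := &v1.Pod{
        ObjectMeta: metav1.ObjectMeta{
            Name:      "my-pod",
            Namespace: "default",
            UID:       "1234-5678",
        },
    }

    // NewControllerRef builds the owner reference from the pod and sets both
    // Controller and BlockOwnerDeletion to true.
    ref := metav1.NewControllerRef(pod, schema.GroupVersionKind{Version: "v1", Kind: "Pod"})
    fmt.Println(ref.APIVersion, ref.Kind, ref.Name, *ref.Controller, *ref.BlockOwnerDeletion)
    // Prints: v1 Pod my-pod true true
}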
@@ -564,6 +577,11 @@ func (pl *dynamicResources) PreFilter(ctx context.Context, state *framework.Cycl
         return nil, framework.NewStatus(framework.Skip)
     }

+    // Fetch s.podSchedulingState.schedulingCtx, it's going to be needed when checking claims.
+    if err := s.podSchedulingState.init(ctx, pod, pl.podSchedulingContextLister); err != nil {
+        return nil, statusError(logger, err)
+    }
+
     s.informationsForClaim = make([]informationForClaim, len(claims))
     for index, claim := range claims {
         if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeImmediate &&
@@ -614,11 +632,7 @@ func (pl *dynamicResources) PreFilter(ctx context.Context, state *framework.Cycl
                 s.informationsForClaim[index].availableOnNode = selector
             }

             // Now we need information from drivers.
-            schedulingCtx, err := s.initializePodSchedulingContexts(ctx, pod, pl.podSchedulingContextLister)
-            if err != nil {
-                return nil, statusError(logger, err)
-            }
-            s.informationsForClaim[index].status = statusForClaim(schedulingCtx, pod.Spec.ResourceClaims[index].Name)
+            s.informationsForClaim[index].status = statusForClaim(s.podSchedulingState.schedulingCtx, pod.Spec.ResourceClaims[index].Name)
         }
     }
@@ -772,64 +786,71 @@ func (pl *dynamicResources) PreScore(ctx context.Context, cs *framework.CycleSta
     if err != nil {
         return statusError(klog.FromContext(ctx), err)
     }
+    defer func() {
+        state.preScored = true
+    }()
     if len(state.claims) == 0 {
         return nil
     }

     logger := klog.FromContext(ctx)
-    schedulingCtx, err := state.initializePodSchedulingContexts(ctx, pod, pl.podSchedulingContextLister)
-    if err != nil {
-        return statusError(logger, err)
-    }
     pending := false
     for _, claim := range state.claims {
         if claim.Status.Allocation == nil {
             pending = true
         }
     }
-    if pending && !haveAllNodes(schedulingCtx.Spec.PotentialNodes, nodes) {
-        // Remember the potential nodes. The object will get created or
-        // updated in Reserve. This is both an optimization and
-        // covers the case that PreScore doesn't get called when there
-        // is only a single node.
-        logger.V(5).Info("remembering potential nodes", "pod", klog.KObj(pod), "potentialnodes", klog.KObjSlice(nodes))
-        schedulingCtx = schedulingCtx.DeepCopy()
-        numNodes := len(nodes)
-        if numNodes > resourcev1alpha2.PodSchedulingNodeListMaxSize {
-            numNodes = resourcev1alpha2.PodSchedulingNodeListMaxSize
-        }
-        schedulingCtx.Spec.PotentialNodes = make([]string, 0, numNodes)
-        if numNodes == len(nodes) {
-            // Copy all node names.
-            for _, node := range nodes {
-                schedulingCtx.Spec.PotentialNodes = append(schedulingCtx.Spec.PotentialNodes, node.Name)
-            }
-        } else {
-            // Select a random subset of the nodes to comply with
-            // the PotentialNodes length limit. Randomization is
-            // done for us by Go which iterates over map entries
-            // randomly.
-            nodeNames := map[string]struct{}{}
-            for _, node := range nodes {
-                nodeNames[node.Name] = struct{}{}
-            }
-            for nodeName := range nodeNames {
-                if len(schedulingCtx.Spec.PotentialNodes) >= resourcev1alpha2.PodSchedulingNodeListMaxSize {
-                    break
-                }
-                schedulingCtx.Spec.PotentialNodes = append(schedulingCtx.Spec.PotentialNodes, nodeName)
-            }
-        }
-        sort.Strings(schedulingCtx.Spec.PotentialNodes)
-        state.storePodSchedulingContexts(schedulingCtx)
+    if !pending {
+        logger.V(5).Info("no pending claims", "pod", klog.KObj(pod))
+        return nil
     }
-    logger.V(5).Info("all potential nodes already set", "pod", klog.KObj(pod), "potentialnodes", klog.KObjSlice(nodes))
+
+    if haveAllPotentialNodes(state.podSchedulingState.schedulingCtx, nodes) {
+        logger.V(5).Info("all potential nodes already set", "pod", klog.KObj(pod), "potentialnodes", klog.KObjSlice(nodes))
+        return nil
+    }
+
+    // Remember the potential nodes. The object will get created or
+    // updated in Reserve. This is both an optimization and
+    // covers the case that PreScore doesn't get called when there
+    // is only a single node.
+    logger.V(5).Info("remembering potential nodes", "pod", klog.KObj(pod), "potentialnodes", klog.KObjSlice(nodes))
+    numNodes := len(nodes)
+    if numNodes > resourcev1alpha2.PodSchedulingNodeListMaxSize {
+        numNodes = resourcev1alpha2.PodSchedulingNodeListMaxSize
+    }
+    potentialNodes := make([]string, 0, numNodes)
+    if numNodes == len(nodes) {
+        // Copy all node names.
+        for _, node := range nodes {
+            potentialNodes = append(potentialNodes, node.Name)
+        }
+    } else {
+        // Select a random subset of the nodes to comply with
+        // the PotentialNodes length limit. Randomization is
+        // done for us by Go which iterates over map entries
+        // randomly.
+        nodeNames := map[string]struct{}{}
+        for _, node := range nodes {
+            nodeNames[node.Name] = struct{}{}
+        }
+        for nodeName := range nodeNames {
+            if len(potentialNodes) >= resourcev1alpha2.PodSchedulingNodeListMaxSize {
+                break
+            }
+            potentialNodes = append(potentialNodes, nodeName)
+        }
+    }
+    sort.Strings(potentialNodes)
+    state.podSchedulingState.potentialNodes = &potentialNodes
     return nil
 }

-func haveAllNodes(nodeNames []string, nodes []*v1.Node) bool {
+func haveAllPotentialNodes(schedulingCtx *resourcev1alpha2.PodSchedulingContext, nodes []*v1.Node) bool {
+    if schedulingCtx == nil {
+        return false
+    }
     for _, node := range nodes {
-        if !haveNode(nodeNames, node.Name) {
+        if !haveNode(schedulingCtx.Spec.PotentialNodes, node.Name) {
             return false
         }
     }
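
The PreScore hunk above bounds the PotentialNodes list at resourcev1alpha2.PodSchedulingNodeListMaxSize by picking a random subset, relying on Go's randomized map iteration order, and then sorting the result. A standalone sketch of the same technique; boundedSubset and the limit of 3 are illustrative, not part of the Kubernetes code:

package main

import (
    "fmt"
    "sort"
)

// boundedSubset returns at most max of the given names. When truncation is
// needed, the subset is chosen randomly: inserting the names into a map and
// ranging over it uses Go's randomized map iteration order. The result is
// sorted so the output has a stable form.
func boundedSubset(names []string, max int) []string {
    out := make([]string, 0, max)
    if len(names) <= max {
        out = append(out, names...)
    } else {
        set := map[string]struct{}{}
        for _, name := range names {
            set[name] = struct{}{}
        }
        for name := range set {
            if len(out) >= max {
                break
            }
            out = append(out, name)
        }
    }
    sort.Strings(out)
    return out
}

func main() {
    nodes := []string{"node-a", "node-b", "node-c", "node-d", "node-e"}
    fmt.Println(boundedSubset(nodes, 3)) // e.g. [node-b node-d node-e]
}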
@@ -861,10 +882,6 @@ func (pl *dynamicResources) Reserve(ctx context.Context, cs *framework.CycleStat
     numDelayedAllocationPending := 0
     numClaimsWithStatusInfo := 0
     logger := klog.FromContext(ctx)
-    schedulingCtx, err := state.initializePodSchedulingContexts(ctx, pod, pl.podSchedulingContextLister)
-    if err != nil {
-        return statusError(logger, err)
-    }
     for index, claim := range state.claims {
         if claim.Status.Allocation != nil {
             // Allocated, but perhaps not reserved yet.
@@ -894,7 +911,7 @@ func (pl *dynamicResources) Reserve(ctx context.Context, cs *framework.CycleStat

             // Did the driver provide information that steered node
             // selection towards a node that it can support?
-            if statusForClaim(schedulingCtx, pod.Spec.ResourceClaims[index].Name) != nil {
+            if statusForClaim(state.podSchedulingState.schedulingCtx, pod.Spec.ResourceClaims[index].Name) != nil {
                 numClaimsWithStatusInfo++
             }
         }
@@ -905,16 +922,19 @@ func (pl *dynamicResources) Reserve(ctx context.Context, cs *framework.CycleStat
         return nil
     }

-    podSchedulingDirty := state.podSchedulingDirty
-    if len(schedulingCtx.Spec.PotentialNodes) == 0 {
-        // PreScore was not called, probably because there was
-        // only one candidate. We need to ask whether that
-        // node is suitable, otherwise the scheduler will pick
-        // it forever even when it cannot satisfy the claim.
-        schedulingCtx = schedulingCtx.DeepCopy()
-        schedulingCtx.Spec.PotentialNodes = []string{nodeName}
-        logger.V(5).Info("asking for information about single potential node", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName})
-        podSchedulingDirty = true
+    if !state.preScored {
+        // There was only one candidate that passed the Filters and
+        // therefore PreScore was not called.
+        //
+        // We need to ask whether that node is suitable, otherwise the
+        // scheduler will pick it forever even when it cannot satisfy
+        // the claim.
+        if state.podSchedulingState.schedulingCtx == nil ||
+            !containsNode(state.podSchedulingState.schedulingCtx.Spec.PotentialNodes, nodeName) {
+            potentialNodes := []string{nodeName}
+            state.podSchedulingState.potentialNodes = &potentialNodes
+            logger.V(5).Info("asking for information about single potential node", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName})
+        }
     }

     // When there is only one pending resource, we can go ahead with
@@ -922,26 +942,26 @@ func (pl *dynamicResources) Reserve(ctx context.Context, cs *framework.CycleStat
     // the driver yet. Otherwise we wait for information before blindly
     // making a decision that might have to be reversed later.
     if numDelayedAllocationPending == 1 || numClaimsWithStatusInfo == numDelayedAllocationPending {
-        schedulingCtx = schedulingCtx.DeepCopy()
         // TODO: can we increase the chance that the scheduler picks
         // the same node as before when allocation is on-going,
         // assuming that that node still fits the pod? Picking a
         // different node may lead to some claims being allocated for
         // one node and others for another, which then would have to be
         // resolved with deallocation.
-        schedulingCtx.Spec.SelectedNode = nodeName
-        logger.V(5).Info("start allocation", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName})
-        if err := state.publishPodSchedulingContexts(ctx, pl.clientset, schedulingCtx); err != nil {
-            return statusError(logger, err)
+        if state.podSchedulingState.schedulingCtx == nil ||
+            state.podSchedulingState.schedulingCtx.Spec.SelectedNode != nodeName {
+            state.podSchedulingState.selectedNode = &nodeName
+            logger.V(5).Info("start allocation", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName})
+            if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil {
+                return statusError(logger, err)
+            }
+            return statusUnschedulable(logger, "waiting for resource driver to allocate resource", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName})
         }
-        return statusUnschedulable(logger, "waiting for resource driver to allocate resource", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName})
     }

     // May have been modified earlier in PreScore or above.
-    if podSchedulingDirty {
-        if err := state.publishPodSchedulingContexts(ctx, pl.clientset, schedulingCtx); err != nil {
-            return statusError(logger, err)
-        }
+    if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil {
+        return statusError(logger, err)
     }

     // More than one pending claim and not enough information about all of them.
@@ -954,6 +974,15 @@ func (pl *dynamicResources) Reserve(ctx context.Context, cs *framework.CycleStat
     return statusUnschedulable(logger, "waiting for resource driver to provide information", "pod", klog.KObj(pod))
 }

+func containsNode(hay []string, needle string) bool {
+    for _, node := range hay {
+        if node == needle {
+            return true
+        }
+    }
+    return false
+}
+
 // Unreserve clears the ReservedFor field for all claims.
 // It's idempotent, and does nothing if no state found for the given pod.
 func (pl *dynamicResources) Unreserve(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) {

File 2 of 2:

@@ -967,11 +967,12 @@ func (wrapper *PodSchedulingWrapper) Namespace(s string) *PodSchedulingWrapper {
 func (wrapper *PodSchedulingWrapper) OwnerReference(name, uid string, gvk schema.GroupVersionKind) *PodSchedulingWrapper {
     wrapper.OwnerReferences = []metav1.OwnerReference{
         {
-            APIVersion: gvk.GroupVersion().String(),
-            Kind:       gvk.Kind,
-            Name:       name,
-            UID:        types.UID(uid),
-            Controller: pointer.Bool(true),
+            APIVersion:         gvk.GroupVersion().String(),
+            Kind:               gvk.Kind,
+            Name:               name,
+            UID:                types.UID(uid),
+            Controller:         pointer.Bool(true),
+            BlockOwnerDeletion: pointer.Bool(true),
         },
     }
     return wrapper