diff --git a/pkg/scheduler/eventhandlers.go b/pkg/scheduler/eventhandlers.go
index 3a22e2541f0..9d60a9b7d0f 100644
--- a/pkg/scheduler/eventhandlers.go
+++ b/pkg/scheduler/eventhandlers.go
@@ -458,6 +458,15 @@ func addAllEventHandlers(
 			}
 			handlers = append(handlers, handlerRegistration)
 		}
+	case framework.ResourceClass:
+		if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
+			if handlerRegistration, err = informerFactory.Resource().V1alpha2().ResourceClasses().Informer().AddEventHandler(
+				buildEvtResHandler(at, framework.ResourceClass, "ResourceClass"),
+			); err != nil {
+				return err
+			}
+			handlers = append(handlers, handlerRegistration)
+		}
 	case framework.StorageClass:
 		if at&framework.Add != 0 {
 			if handlerRegistration, err = informerFactory.Storage().V1().StorageClasses().Informer().AddEventHandler(
diff --git a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go
index 41e80b4ce24..6d32ff47cfe 100644
--- a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go
+++ b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go
@@ -272,6 +272,8 @@ func (pl *dynamicResources) EventsToRegister() []framework.ClusterEventWithHint
 		// A resource might depend on node labels for topology filtering.
 		// A new or updated node may make pods schedulable.
 		{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel}},
+		// A pod might be waiting for a class to get created or modified.
+		{Event: framework.ClusterEvent{Resource: framework.ResourceClass, ActionType: framework.Add | framework.Update}},
 	}
 	return events
 }
@@ -595,7 +597,13 @@ func (pl *dynamicResources) PreFilter(ctx context.Context, state *framework.Cycl
 	// about the specific pod.
 	class, err := pl.classLister.Get(claim.Spec.ResourceClassName)
 	if err != nil {
-		// If the class does not exist, then allocation cannot proceed.
+		// If the class cannot be retrieved, allocation cannot proceed.
+		if apierrors.IsNotFound(err) {
+			// Here we mark the pod as "unschedulable", so it'll sleep in
+			// the unschedulable queue until a ResourceClass event occurs.
+			return nil, statusUnschedulable(logger, fmt.Sprintf("resource class %s does not exist", claim.Spec.ResourceClassName))
+		}
+		// Other error, retry with backoff.
 		return nil, statusError(logger, fmt.Errorf("look up resource class: %v", err))
 	}
 	if class.SuitableNodes != nil {
diff --git a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go
index 2d80193d2a3..b284a7bfa8f 100644
--- a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go
+++ b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go
@@ -331,7 +331,7 @@ func TestPlugin(t *testing.T) {
 			claims: []*resourcev1alpha2.ResourceClaim{pendingDelayedClaim},
 			want: want{
 				prefilter: result{
-					status: framework.AsStatus(fmt.Errorf(`look up resource class: resourceclass.resource.k8s.io "%s" not found`, className)),
+					status: framework.NewStatus(framework.UnschedulableAndUnresolvable, fmt.Sprintf("resource class %s does not exist", className)),
 				},
 				postfilter: result{
 					status: framework.NewStatus(framework.Unschedulable, `no new claims to deallocate`),
diff --git a/pkg/scheduler/framework/types.go b/pkg/scheduler/framework/types.go
index 08b76183d98..7da7645afbd 100644
--- a/pkg/scheduler/framework/types.go
+++ b/pkg/scheduler/framework/types.go
@@ -71,6 +71,7 @@ const (
 	PersistentVolumeClaim GVK = "PersistentVolumeClaim"
 	PodSchedulingContext  GVK = "PodSchedulingContext"
 	ResourceClaim         GVK = "ResourceClaim"
+	ResourceClass         GVK = "ResourceClass"
 	StorageClass          GVK = "storage.k8s.io/StorageClass"
 	CSINode               GVK = "storage.k8s.io/CSINode"
 	CSIDriver             GVK = "storage.k8s.io/CSIDriver"
diff --git a/test/e2e/dra/dra.go b/test/e2e/dra/dra.go
index 72377a9717c..3f803c3db12 100644
--- a/test/e2e/dra/dra.go
+++ b/test/e2e/dra/dra.go
@@ -208,6 +208,68 @@ var _ = ginkgo.Describe("[sig-node] DRA [Feature:DynamicResourceAllocation]", fu
 			b.testPod(ctx, f.ClientSet, pod)
 		})
 
+		ginkgo.It("retries pod scheduling after creating resource class", func(ctx context.Context) {
+			parameters := b.parameters()
+			pod, template := b.podInline(resourcev1alpha2.AllocationModeWaitForFirstConsumer)
+			class, err := f.ClientSet.ResourceV1alpha2().ResourceClasses().Get(ctx, template.Spec.Spec.ResourceClassName, metav1.GetOptions{})
+			framework.ExpectNoError(err)
+			template.Spec.Spec.ResourceClassName += "-b"
+			b.create(ctx, parameters, template, pod)
+
+			// There's no way to be sure that the scheduler has checked the pod.
+			// But if we sleep for a short while, it's likely and if there are any
+			// bugs that prevent the scheduler from handling creation of the class,
+			// those bugs should show up as test flakes.
+			time.Sleep(time.Second)
+
+			class.UID = ""
+			class.ResourceVersion = ""
+			class.Name = template.Spec.Spec.ResourceClassName
+			b.create(ctx, class)
+
+			b.testPod(ctx, f.ClientSet, pod)
+		})
+
+		ginkgo.It("retries pod scheduling after updating resource class", func(ctx context.Context) {
+			parameters := b.parameters()
+			pod, template := b.podInline(resourcev1alpha2.AllocationModeWaitForFirstConsumer)
+
+			// First modify the class so that it matches no nodes.
+			class, err := f.ClientSet.ResourceV1alpha2().ResourceClasses().Get(ctx, template.Spec.Spec.ResourceClassName, metav1.GetOptions{})
+			framework.ExpectNoError(err)
+			class.SuitableNodes = &v1.NodeSelector{
+				NodeSelectorTerms: []v1.NodeSelectorTerm{
+					{
+						MatchExpressions: []v1.NodeSelectorRequirement{
+							{
+								Key:      "no-such-label",
+								Operator: v1.NodeSelectorOpIn,
+								Values:   []string{"no-such-value"},
+							},
+						},
+					},
+				},
+			}
+			class, err = f.ClientSet.ResourceV1alpha2().ResourceClasses().Update(ctx, class, metav1.UpdateOptions{})
+			framework.ExpectNoError(err)
+
+			// Now create the pod.
+			b.create(ctx, parameters, template, pod)
+
+			// There's no way to be sure that the scheduler has checked the pod.
+			// But if we sleep for a short while, it's likely and if there are any
+			// bugs that prevent the scheduler from handling updates of the class,
+			// those bugs should show up as test flakes.
+			time.Sleep(time.Second)
+
+			// Unblock the pod.
+			class.SuitableNodes = nil
+			_, err = f.ClientSet.ResourceV1alpha2().ResourceClasses().Update(ctx, class, metav1.UpdateOptions{})
+			framework.ExpectNoError(err)
+
+			b.testPod(ctx, f.ClientSet, pod)
+		})
+
 		ginkgo.It("runs a pod without a generated resource claim", func(ctx context.Context) {
 			pod, _ /* template */ := b.podInline(resourcev1alpha2.AllocationModeWaitForFirstConsumer)
 			created := b.create(ctx, pod)
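
Note on the pattern used in the hunks above: rejecting in PreFilter with an UnschedulableAndUnresolvable status parks the pod in the scheduler's unschedulable pod pool, and the new EventsToRegister entry for framework.ResourceClass is what lets a later ResourceClass Add/Update event move the pod back for another scheduling attempt. Below is a minimal sketch of that pairing, separate from the diff; the plugin name examplePlugin and the helper rejectMissingClass are hypothetical, and only the framework identifiers already shown in the hunks are assumed to exist. The method signature of the real PreFilter is not reproduced here.

	package example

	import (
		"fmt"

		"k8s.io/kubernetes/pkg/scheduler/framework"
	)

	// examplePlugin is a hypothetical plugin used only to illustrate how an
	// UnschedulableAndUnresolvable rejection pairs with EventsToRegister.
	type examplePlugin struct{}

	// EventsToRegister declares which cluster events may make a pod that this
	// plugin rejected schedulable again; the scheduler re-queues such pods
	// when one of these events fires.
	func (pl *examplePlugin) EventsToRegister() []framework.ClusterEventWithHint {
		return []framework.ClusterEventWithHint{
			// Re-evaluate parked pods when a ResourceClass is added or updated.
			{Event: framework.ClusterEvent{Resource: framework.ResourceClass, ActionType: framework.Add | framework.Update}},
		}
	}

	// rejectMissingClass builds the status used in the PreFilter hunk above:
	// UnschedulableAndUnresolvable keeps the pod waiting for a registered
	// event instead of retrying with backoff.
	func rejectMissingClass(className string) *framework.Status {
		return framework.NewStatus(framework.UnschedulableAndUnresolvable,
			fmt.Sprintf("resource class %s does not exist", className))
	}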