From 89440b1239bc4c7476ec7e6a75722f8ef55c9a40 Mon Sep 17 00:00:00 2001
From: Patrick Ohly
Date: Thu, 6 Mar 2025 19:50:29 +0100
Subject: [PATCH] DRA: integration tests for prioritized list

This adds dedicated integration tests for the prioritized list feature to the
general test/integration/dra: coverage of the API plus some minimal testing
with the scheduler. It also adds non-performance test cases for scheduler_perf
because that is a better place for running through the complete flow (for
example, it can reuse the infrastructure for setting up nodes).
---
 .../structured/allocator_test.go              |  2 +-
 test/integration/dra/dra_test.go              | 96 ++++++++++++++++++-
 .../dra/performance-config.yaml               | 60 ++++++++++++
 ...resourceclaimtemplate-first-available.yaml | 14 +++
 4 files changed, 167 insertions(+), 5 deletions(-)
 create mode 100644 test/integration/scheduler_perf/dra/templates/resourceclaimtemplate-first-available.yaml

diff --git a/staging/src/k8s.io/dynamic-resource-allocation/structured/allocator_test.go b/staging/src/k8s.io/dynamic-resource-allocation/structured/allocator_test.go
index 52b1f6dc8ed..992bf046906 100644
--- a/staging/src/k8s.io/dynamic-resource-allocation/structured/allocator_test.go
+++ b/staging/src/k8s.io/dynamic-resource-allocation/structured/allocator_test.go
@@ -1923,7 +1923,7 @@ func TestAllocator(t *testing.T) {
 			node: node(node1, region1),
 
 			expectResults: nil,
-			expectError:   gomega.MatchError(gomega.ContainSubstring("claim claim-0, request req-0: has subrequests, but the feature is disabled")),
+			expectError:   gomega.MatchError(gomega.ContainSubstring("claim claim-0, request req-0: has subrequests, but the DRAPrioritizedList feature is disabled")),
 		},
 		"prioritized-list-multi-request": {
 			prioritizedList: true,
diff --git a/test/integration/dra/dra_test.go b/test/integration/dra/dra_test.go
index 9b57883295e..eea35f5a620 100644
--- a/test/integration/dra/dra_test.go
+++ b/test/integration/dra/dra_test.go
@@ -17,13 +17,19 @@ limitations under the License.
 package dra
 
 import (
+	"context"
+	"errors"
 	"fmt"
 	"regexp"
 	"sort"
 	"strings"
 	"testing"
+	"time"
 
+	"github.com/onsi/gomega"
+	"github.com/onsi/gomega/gstruct"
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 
 	v1 "k8s.io/api/core/v1"
 	resourcealphaapi "k8s.io/api/resource/v1alpha3"
@@ -34,10 +40,15 @@ import (
 	utilfeature "k8s.io/apiserver/pkg/util/feature"
 	"k8s.io/component-base/featuregate"
 	featuregatetesting "k8s.io/component-base/featuregate/testing"
+	"k8s.io/klog/v2"
+	kubeschedulerconfigv1 "k8s.io/kube-scheduler/config/v1"
 	kubeapiservertesting "k8s.io/kubernetes/cmd/kube-apiserver/app/testing"
 	"k8s.io/kubernetes/pkg/features"
+	"k8s.io/kubernetes/pkg/scheduler/apis/config"
+	kubeschedulerscheme "k8s.io/kubernetes/pkg/scheduler/apis/config/scheme"
 	st "k8s.io/kubernetes/pkg/scheduler/testing"
 	"k8s.io/kubernetes/test/integration/framework"
+	"k8s.io/kubernetes/test/integration/util"
 	"k8s.io/kubernetes/test/utils/ktesting"
 	"k8s.io/utils/ptr"
 )
@@ -54,11 +65,21 @@
 		Container("my-container").
 		PodResourceClaims(v1.PodResourceClaim{Name: resourceName, ResourceClaimName: &claimName}).
 		Obj()
+	class = &resourceapi.DeviceClass{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: className,
+		},
+	}
 	claim = st.MakeResourceClaim().
 		Name(claimName).
 		Namespace(namespace).
 		Request(className).
 		Obj()
+	claimPrioritizedList = st.MakeResourceClaim().
+		Name(claimName).
+		Namespace(namespace).
+		RequestWithPrioritizedList(className).
+		Obj()
 )
 
 // createTestNamespace creates a namespace with a name that is derived from the
@@ -106,6 +127,7 @@ func TestDRA(t *testing.T) {
 			features: map[featuregate.Feature]bool{features.DynamicResourceAllocation: true},
 			f: func(tCtx ktesting.TContext) {
 				tCtx.Run("AdminAccess", func(tCtx ktesting.TContext) { testAdminAccess(tCtx, false) })
+				tCtx.Run("PrioritizedList", func(tCtx ktesting.TContext) { testPrioritizedList(tCtx, false) })
 				tCtx.Run("Pod", func(tCtx ktesting.TContext) { testPod(tCtx, true) })
 			},
 		},
@@ -119,11 +141,13 @@
 				// Additional DRA feature gates go here,
 				// in alphabetical order,
 				// as needed by tests for them.
-				features.DRAAdminAccess: true,
+				features.DRAAdminAccess:     true,
+				features.DRAPrioritizedList: true,
 			},
 			f: func(tCtx ktesting.TContext) {
 				tCtx.Run("AdminAccess", func(tCtx ktesting.TContext) { testAdminAccess(tCtx, true) })
 				tCtx.Run("Convert", testConvert)
+				tCtx.Run("PrioritizedList", func(tCtx ktesting.TContext) { testPrioritizedList(tCtx, true) })
 			},
 		},
 	} {
@@ -146,21 +170,43 @@ func TestDRA(t *testing.T) {
 			etcdOptions := framework.SharedEtcd()
 			apiServerOptions := kubeapiservertesting.NewDefaultTestServerOptions()
 			apiServerFlags := framework.DefaultTestServerFlags()
-			// Default kube-apiserver behavior, must be requested explicitly for test server.
-			runtimeConfigs := []string{"api/alpha=false", "api/beta=false"}
+			var runtimeConfigs []string
 			for key, value := range tc.apis {
 				runtimeConfigs = append(runtimeConfigs, fmt.Sprintf("%s=%t", key, value))
 			}
 			apiServerFlags = append(apiServerFlags, "--runtime-config="+strings.Join(runtimeConfigs, ","))
 			server := kubeapiservertesting.StartTestServerOrDie(t, apiServerOptions, apiServerFlags, etcdOptions)
 			tCtx.Cleanup(server.TearDownFn)
-
 			tCtx = ktesting.WithRESTConfig(tCtx, server.ClientConfig)
+
 			tc.f(tCtx)
 		})
 	}
 }
 
+func startScheduler(tCtx ktesting.TContext) {
+	// Run scheduler with default configuration.
+	tCtx.Log("Scheduler starting...")
+	schedulerCtx := klog.NewContext(tCtx, klog.LoggerWithName(tCtx.Logger(), "scheduler"))
+	schedulerCtx, cancel := context.WithCancelCause(schedulerCtx)
+	_, informerFactory := util.StartScheduler(schedulerCtx, tCtx.Client(), tCtx.RESTConfig(), newDefaultSchedulerComponentConfig(tCtx), nil)
+	// Stop clients of the apiserver before stopping the apiserver itself,
+	// otherwise it delays its shutdown.
+	tCtx.Cleanup(informerFactory.Shutdown)
+	tCtx.Cleanup(func() {
+		tCtx.Log("Stopping scheduler...")
+		cancel(errors.New("test is done"))
+	})
+}
+
+func newDefaultSchedulerComponentConfig(tCtx ktesting.TContext) *config.KubeSchedulerConfiguration {
+	gvk := kubeschedulerconfigv1.SchemeGroupVersion.WithKind("KubeSchedulerConfiguration")
+	cfg := config.KubeSchedulerConfiguration{}
+	_, _, err := kubeschedulerscheme.Codecs.UniversalDecoder().Decode(nil, &gvk, &cfg)
+	tCtx.ExpectNoError(err, "decode default scheduler configuration")
+	return &cfg
+}
+
 // testPod creates a pod with a resource claim reference and then checks
 // whether that field is or isn't getting dropped.
 func testPod(tCtx ktesting.TContext, draEnabled bool) {
@@ -220,3 +266,45 @@ func testAdminAccess(tCtx ktesting.TContext, adminAccessEnabled bool) {
 		}
 	}
 }
+
+func testPrioritizedList(tCtx ktesting.TContext, enabled bool) {
+	tCtx.Parallel()
+	_, err := tCtx.Client().ResourceV1beta1().DeviceClasses().Create(tCtx, class, metav1.CreateOptions{})
+	tCtx.ExpectNoError(err, "create class")
+	namespace := createTestNamespace(tCtx)
+	claim := claimPrioritizedList.DeepCopy()
+	claim.Namespace = namespace
+	claim, err = tCtx.Client().ResourceV1beta1().ResourceClaims(namespace).Create(tCtx, claim, metav1.CreateOptions{})
+
+	if !enabled {
+		require.Error(tCtx, err, "claim should have become invalid after dropping FirstAvailable")
+		return
+	}
+
+	require.NotEmpty(tCtx, claim.Spec.Devices.Requests[0].FirstAvailable, "should store FirstAvailable")
+	tCtx.Run("scheduler", func(tCtx ktesting.TContext) {
+		startScheduler(tCtx)
+
+		// The fake cluster configuration is not complete enough to actually schedule pods.
+		// That is covered over in test/integration/scheduler_perf.
+		// Here we only test that we get to the point where it notices that, without failing
+		// during PreFilter because of FirstAvailable.
+		pod := podWithClaimName.DeepCopy()
+		pod.Namespace = namespace
+		_, err := tCtx.Client().CoreV1().Pods(namespace).Create(tCtx, pod, metav1.CreateOptions{})
+		tCtx.ExpectNoError(err, "create pod")
+		schedulingAttempted := gomega.HaveField("Status.Conditions", gomega.ContainElement(
+			gstruct.MatchFields(gstruct.IgnoreExtras, gstruct.Fields{
+				"Type":    gomega.Equal(v1.PodScheduled),
+				"Status":  gomega.Equal(v1.ConditionFalse),
+				"Reason":  gomega.Equal("Unschedulable"),
+				"Message": gomega.Equal("no nodes available to schedule pods"),
+			}),
+		))
+		ktesting.Eventually(tCtx, func(tCtx ktesting.TContext) *v1.Pod {
+			pod, err := tCtx.Client().CoreV1().Pods(namespace).Get(tCtx, pod.Name, metav1.GetOptions{})
+			tCtx.ExpectNoError(err, "get pod")
+			return pod
+		}).WithTimeout(time.Minute).WithPolling(time.Second).Should(schedulingAttempted)
+	})
+}
diff --git a/test/integration/scheduler_perf/dra/performance-config.yaml b/test/integration/scheduler_perf/dra/performance-config.yaml
index 35bea6a2c00..77badbaa57f 100644
--- a/test/integration/scheduler_perf/dra/performance-config.yaml
+++ b/test/integration/scheduler_perf/dra/performance-config.yaml
@@ -294,6 +294,66 @@
       maxClaimsPerNode: 10
       duration: 10s
 
+# SteadyStateClusterResourceClaimTemplateFirstAvailable is a variant of SteadyStateClusterResourceClaimTemplate
+# with a claim template that uses "firstAvailable" subrequests, aka DRAPrioritizedList.
+- name: SteadyStateClusterResourceClaimTemplateFirstAvailable
+  featureGates:
+    DynamicResourceAllocation: true
+    DRAPrioritizedList: true
+  workloadTemplate:
+  - opcode: createNodes
+    countParam: $nodesWithoutDRA
+  - opcode: createNodes
+    nodeTemplatePath: templates/node-with-dra-test-driver.yaml
+    countParam: $nodesWithDRA
+  - opcode: createResourceDriver
+    driverName: test-driver.cdi.k8s.io
+    nodes: scheduler-perf-dra-*
+    maxClaimsPerNodeParam: $maxClaimsPerNode
+  - opcode: createAny
+    templatePath: templates/deviceclass.yaml
+  - opcode: createAny
+    templatePath: templates/resourceclaim.yaml
+    countParam: $initClaims
+    namespace: init
+  - opcode: allocResourceClaims
+    namespace: init
+  - opcode: createAny
+    templatePath: templates/resourceclaimtemplate-first-available.yaml
+    namespace: test
+  - opcode: createPods
+    namespace: test
+    count: 10
+    steadyState: true
+    durationParam: $duration
+    podTemplatePath: templates/pod-with-claim-template.yaml
+    collectMetrics: true
+  workloads:
+  - name: fast
+    featureGates:
+      SchedulerQueueingHints: false
+    labels: [integration-test, short]
+    params:
+      # This testcase runs through all code paths without
+      # taking too long overall.
+      nodesWithDRA: 1
+      nodesWithoutDRA: 1
+      initClaims: 0
+      maxClaimsPerNode: 10
+      duration: 2s
+  - name: fast_QueueingHintsEnabled
+    featureGates:
+      SchedulerQueueingHints: true
+    labels: [integration-test, short]
+    params:
+      # This testcase runs through all code paths without
+      # taking too long overall.
+      nodesWithDRA: 1
+      nodesWithoutDRA: 1
+      initClaims: 0
+      maxClaimsPerNode: 10
+      duration: 2s
+
 # SchedulingWithResourceClaimTemplate uses ResourceClaims
 # with deterministic names that are shared between pods.
 # There is a fixed ratio of 1:5 between claims and pods.
diff --git a/test/integration/scheduler_perf/dra/templates/resourceclaimtemplate-first-available.yaml b/test/integration/scheduler_perf/dra/templates/resourceclaimtemplate-first-available.yaml
new file mode 100644
index 00000000000..b343ee13df5
--- /dev/null
+++ b/test/integration/scheduler_perf/dra/templates/resourceclaimtemplate-first-available.yaml
@@ -0,0 +1,14 @@
+apiVersion: resource.k8s.io/v1alpha3
+kind: ResourceClaimTemplate
+metadata:
+  name: test-claim-template
+spec:
+  spec:
+    devices:
+      requests:
+      - name: req-0
+        firstAvailable:
+        - name: sub-0
+          deviceClassName: no-such-class
+        - name: sub-1
+          deviceClassName: test-class
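For reference, the template above exercises the prioritized list semantics: the allocator walks the firstAvailable subrequests in order and uses the first one it can satisfy, so sub-0 (which points at the non-existent "no-such-class") is skipped and sub-1 ("test-class") is what actually gets allocated. The Go sketch below is not part of the patch; it shows roughly the same claim spec expressed with the resource.k8s.io/v1beta1 types, and the explicit AllocationMode/Count values are assumptions (they are defaulted to ExactCount/1 when omitted).

```go
// Sketch only, not part of the patch: roughly the claim spec that the
// template above expands to, written with the resource.k8s.io/v1beta1 types.
// The allocator tries the subrequests in order, so "no-such-class" is
// expected to be skipped and "test-class" to be allocated.
package example

import (
	resourceapi "k8s.io/api/resource/v1beta1"
)

var firstAvailableClaimSpec = resourceapi.ResourceClaimSpec{
	Devices: resourceapi.DeviceClaim{
		Requests: []resourceapi.DeviceRequest{{
			Name: "req-0",
			FirstAvailable: []resourceapi.DeviceSubRequest{
				{
					// Cannot be satisfied: no such device class exists in the test cluster.
					Name:            "sub-0",
					DeviceClassName: "no-such-class",
					AllocationMode:  resourceapi.DeviceAllocationModeExactCount,
					Count:           1,
				},
				{
					// Fallback that the allocator is expected to pick.
					Name:            "sub-1",
					DeviceClassName: "test-class",
					AllocationMode:  resourceapi.DeviceAllocationModeExactCount,
					Count:           1,
				},
			},
		}},
	},
}
```

This is also roughly what claimPrioritizedList / RequestWithPrioritizedList builds in dra_test.go, which is why the API test only needs to check that FirstAvailable is stored when the feature gate is on and that the create fails when the field gets dropped with the gate off.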