DRA: integration tests for prioritized list

This adds dedicated integration tests for the feature to the general
test/integration/dra suite: coverage of the API plus some minimal testing with
the scheduler.

It also adds non-performance test cases for scheduler_perf because that is a
better place for running through the complete flow (for example, it can reuse
the existing infrastructure for setting up nodes).
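
For context, a request with a prioritized list carries several alternative
subrequests under "firstAvailable", and the allocator uses the first one it can
satisfy. The sketch below shows roughly what such a claim looks like when built
directly against the resource.k8s.io/v1beta1 Go API; only
Requests[0].FirstAvailable is taken from the tests in this commit, while the
DeviceSubRequest struct name and the device class names are illustrative
assumptions.

package main

import (
	"fmt"

	resourceapi "k8s.io/api/resource/v1beta1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// Sketch only: one request with two alternative subrequests. With
	// DRAPrioritizedList enabled the allocator tries "sub-0" first and falls back
	// to "sub-1"; with the gate disabled the field is dropped and the claim then
	// fails validation (see testPrioritizedList below).
	// Assumption: DeviceSubRequest and the class names are illustrative only.
	claim := &resourceapi.ResourceClaim{
		ObjectMeta: metav1.ObjectMeta{Name: "my-claim", Namespace: "default"},
		Spec: resourceapi.ResourceClaimSpec{
			Devices: resourceapi.DeviceClaim{
				Requests: []resourceapi.DeviceRequest{{
					Name: "req-0",
					FirstAvailable: []resourceapi.DeviceSubRequest{
						{Name: "sub-0", DeviceClassName: "premium-class"},
						{Name: "sub-1", DeviceClassName: "standard-class"},
					},
				}},
			},
		},
	}
	fmt.Println(claim.Spec.Devices.Requests[0].FirstAvailable[0].Name)
}

The tests below build the same shape through the st.MakeResourceClaim()
helper (RequestWithPrioritizedList) and, for scheduler_perf, through the
resourceclaimtemplate-first-available.yaml template at the end of the diff.
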
Patrick Ohly 2025-03-06 19:50:29 +01:00
parent dfb8ab6521
commit 89440b1239
4 changed files with 167 additions and 5 deletions


@@ -1923,7 +1923,7 @@ func TestAllocator(t *testing.T) {
node: node(node1, region1),
expectResults: nil,
expectError: gomega.MatchError(gomega.ContainSubstring("claim claim-0, request req-0: has subrequests, but the feature is disabled")),
expectError: gomega.MatchError(gomega.ContainSubstring("claim claim-0, request req-0: has subrequests, but the DRAPrioritizedList feature is disabled")),
},
"prioritized-list-multi-request": {
prioritizedList: true,


@@ -17,13 +17,19 @@ limitations under the License.
package dra
import (
"context"
"errors"
"fmt"
"regexp"
"sort"
"strings"
"testing"
"time"
"github.com/onsi/gomega"
"github.com/onsi/gomega/gstruct"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
v1 "k8s.io/api/core/v1"
resourcealphaapi "k8s.io/api/resource/v1alpha3"
@@ -34,10 +40,15 @@ import (
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/component-base/featuregate"
featuregatetesting "k8s.io/component-base/featuregate/testing"
"k8s.io/klog/v2"
kubeschedulerconfigv1 "k8s.io/kube-scheduler/config/v1"
kubeapiservertesting "k8s.io/kubernetes/cmd/kube-apiserver/app/testing"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
kubeschedulerscheme "k8s.io/kubernetes/pkg/scheduler/apis/config/scheme"
st "k8s.io/kubernetes/pkg/scheduler/testing"
"k8s.io/kubernetes/test/integration/framework"
"k8s.io/kubernetes/test/integration/util"
"k8s.io/kubernetes/test/utils/ktesting"
"k8s.io/utils/ptr"
)
@@ -54,11 +65,21 @@ var (
Container("my-container").
PodResourceClaims(v1.PodResourceClaim{Name: resourceName, ResourceClaimName: &claimName}).
Obj()
class = &resourceapi.DeviceClass{
ObjectMeta: metav1.ObjectMeta{
Name: className,
},
}
claim = st.MakeResourceClaim().
Name(claimName).
Namespace(namespace).
Request(className).
Obj()
claimPrioritizedList = st.MakeResourceClaim().
Name(claimName).
Namespace(namespace).
RequestWithPrioritizedList(className).
Obj()
)
// createTestNamespace creates a namespace with a name that is derived from the
@@ -106,6 +127,7 @@ func TestDRA(t *testing.T) {
features: map[featuregate.Feature]bool{features.DynamicResourceAllocation: true},
f: func(tCtx ktesting.TContext) {
tCtx.Run("AdminAccess", func(tCtx ktesting.TContext) { testAdminAccess(tCtx, false) })
tCtx.Run("PrioritizedList", func(tCtx ktesting.TContext) { testPrioritizedList(tCtx, false) })
tCtx.Run("Pod", func(tCtx ktesting.TContext) { testPod(tCtx, true) })
},
},
@@ -119,11 +141,13 @@ func TestDRA(t *testing.T) {
// Additional DRA feature gates go here,
// in alphabetical order,
// as needed by tests for them.
features.DRAAdminAccess: true,
features.DRAAdminAccess: true,
features.DRAPrioritizedList: true,
},
f: func(tCtx ktesting.TContext) {
tCtx.Run("AdminAccess", func(tCtx ktesting.TContext) { testAdminAccess(tCtx, true) })
tCtx.Run("Convert", testConvert)
tCtx.Run("PrioritizedList", func(tCtx ktesting.TContext) { testPrioritizedList(tCtx, true) })
},
},
} {
@@ -146,21 +170,43 @@ func TestDRA(t *testing.T) {
etcdOptions := framework.SharedEtcd()
apiServerOptions := kubeapiservertesting.NewDefaultTestServerOptions()
apiServerFlags := framework.DefaultTestServerFlags()
// Default kube-apiserver behavior, must be requested explicitly for test server.
runtimeConfigs := []string{"api/alpha=false", "api/beta=false"}
var runtimeConfigs []string
for key, value := range tc.apis {
runtimeConfigs = append(runtimeConfigs, fmt.Sprintf("%s=%t", key, value))
}
apiServerFlags = append(apiServerFlags, "--runtime-config="+strings.Join(runtimeConfigs, ","))
server := kubeapiservertesting.StartTestServerOrDie(t, apiServerOptions, apiServerFlags, etcdOptions)
tCtx.Cleanup(server.TearDownFn)
tCtx = ktesting.WithRESTConfig(tCtx, server.ClientConfig)
tc.f(tCtx)
})
}
}
func startScheduler(tCtx ktesting.TContext) {
// Run scheduler with default configuration.
tCtx.Log("Scheduler starting...")
schedulerCtx := klog.NewContext(tCtx, klog.LoggerWithName(tCtx.Logger(), "scheduler"))
schedulerCtx, cancel := context.WithCancelCause(schedulerCtx)
_, informerFactory := util.StartScheduler(schedulerCtx, tCtx.Client(), tCtx.RESTConfig(), newDefaultSchedulerComponentConfig(tCtx), nil)
// Stop clients of the apiserver before stopping the apiserver itself,
// otherwise it delays its shutdown.
tCtx.Cleanup(informerFactory.Shutdown)
tCtx.Cleanup(func() {
tCtx.Log("Stoping scheduler...")
cancel(errors.New("test is done"))
})
}
func newDefaultSchedulerComponentConfig(tCtx ktesting.TContext) *config.KubeSchedulerConfiguration {
gvk := kubeschedulerconfigv1.SchemeGroupVersion.WithKind("KubeSchedulerConfiguration")
cfg := config.KubeSchedulerConfiguration{}
_, _, err := kubeschedulerscheme.Codecs.UniversalDecoder().Decode(nil, &gvk, &cfg)
tCtx.ExpectNoError(err, "decode default scheduler configuration")
return &cfg
}
// testPod creates a pod with a resource claim reference and then checks
// whether that field is or isn't getting dropped.
func testPod(tCtx ktesting.TContext, draEnabled bool) {
@@ -220,3 +266,45 @@ func testAdminAccess(tCtx ktesting.TContext, adminAccessEnabled bool) {
}
}
}
func testPrioritizedList(tCtx ktesting.TContext, enabled bool) {
tCtx.Parallel()
_, err := tCtx.Client().ResourceV1beta1().DeviceClasses().Create(tCtx, class, metav1.CreateOptions{})
tCtx.ExpectNoError(err, "create class")
namespace := createTestNamespace(tCtx)
claim := claimPrioritizedList.DeepCopy()
claim.Namespace = namespace
claim, err = tCtx.Client().ResourceV1beta1().ResourceClaims(namespace).Create(tCtx, claim, metav1.CreateOptions{})
if !enabled {
require.Error(tCtx, err, "claim should have become invalid after dropping FirstAvailable")
return
}
require.NotEmpty(tCtx, claim.Spec.Devices.Requests[0].FirstAvailable, "should store FirstAvailable")
tCtx.Run("scheduler", func(tCtx ktesting.TContext) {
startScheduler(tCtx)
// The fake cluster configuration is not complete enough to actually schedule pods.
// That is covered in test/integration/scheduler_perf.
// Here we only test that the scheduler gets far enough to notice that, without
// failing during PreFilter because of FirstAvailable.
pod := podWithClaimName.DeepCopy()
pod.Namespace = namespace
_, err := tCtx.Client().CoreV1().Pods(namespace).Create(tCtx, pod, metav1.CreateOptions{})
tCtx.ExpectNoError(err, "create pod")
schedulingAttempted := gomega.HaveField("Status.Conditions", gomega.ContainElement(
gstruct.MatchFields(gstruct.IgnoreExtras, gstruct.Fields{
"Type": gomega.Equal(v1.PodScheduled),
"Status": gomega.Equal(v1.ConditionFalse),
"Reason": gomega.Equal("Unschedulable"),
"Message": gomega.Equal("no nodes available to schedule pods"),
}),
))
ktesting.Eventually(tCtx, func(tCtx ktesting.TContext) *v1.Pod {
pod, err := tCtx.Client().CoreV1().Pods(namespace).Get(tCtx, pod.Name, metav1.GetOptions{})
tCtx.ExpectNoError(err, "get pod")
return pod
}).WithTimeout(time.Minute).WithPolling(time.Second).Should(schedulingAttempted)
})
}


@@ -294,6 +294,66 @@
      maxClaimsPerNode: 10
      duration: 10s

# SteadyStateResourceClaimTemplateFirstAvailable is a variant of SteadyStateResourceClaimTemplate
# with a claim template that uses the "firstAvailable" subrequests, aka DRAPrioritizedList.
- name: SteadyStateClusterResourceClaimTemplateFirstAvailable
  featureGates:
    DynamicResourceAllocation: true
    DRAPrioritizedList: true
  workloadTemplate:
  - opcode: createNodes
    countParam: $nodesWithoutDRA
  - opcode: createNodes
    nodeTemplatePath: templates/node-with-dra-test-driver.yaml
    countParam: $nodesWithDRA
  - opcode: createResourceDriver
    driverName: test-driver.cdi.k8s.io
    nodes: scheduler-perf-dra-*
    maxClaimsPerNodeParam: $maxClaimsPerNode
  - opcode: createAny
    templatePath: templates/deviceclass.yaml
  - opcode: createAny
    templatePath: templates/resourceclaim.yaml
    countParam: $initClaims
    namespace: init
  - opcode: allocResourceClaims
    namespace: init
  - opcode: createAny
    templatePath: templates/resourceclaimtemplate-first-available.yaml
    namespace: test
  - opcode: createPods
    namespace: test
    count: 10
    steadyState: true
    durationParam: $duration
    podTemplatePath: templates/pod-with-claim-template.yaml
    collectMetrics: true
  workloads:
  - name: fast
    featureGates:
      SchedulerQueueingHints: false
    labels: [integration-test, short]
    params:
      # This testcase runs through all code paths without
      # taking too long overall.
      nodesWithDRA: 1
      nodesWithoutDRA: 1
      initClaims: 0
      maxClaimsPerNode: 10
      duration: 2s
  - name: fast_QueueingHintsEnabled
    featureGates:
      SchedulerQueueingHints: true
    labels: [integration-test, short]
    params:
      # This testcase runs through all code paths without
      # taking too long overall.
      nodesWithDRA: 1
      nodesWithoutDRA: 1
      initClaims: 0
      maxClaimsPerNode: 10
      duration: 2s

# SchedulingWithResourceClaimTemplate uses ResourceClaims
# with deterministic names that are shared between pods.
# There is a fixed ratio of 1:5 between claims and pods.


@@ -0,0 +1,14 @@
apiVersion: resource.k8s.io/v1alpha3
kind: ResourceClaimTemplate
metadata:
  name: test-claim-template
spec:
  spec:
    devices:
      requests:
      - name: req-0
        firstAvailable:
        - name: sub-0
          deviceClassName: no-such-class
        - name: sub-1
          deviceClassName: test-class