From 5e40afca060b9a26c68299494eeea3408806ffe3 Mon Sep 17 00:00:00 2001
From: Patrick Ohly
Date: Mon, 11 Dec 2023 20:41:55 +0100
Subject: [PATCH] dra testing: add tests for structured parameters

The test driver now supports ConfigMap parameters (as before) and the
named resources structured parameter model. The instances it publishes
for that model don't have any attributes.
---
 test/e2e/dra/deploy.go                        |  55 ++-
 test/e2e/dra/dra.go                           | 341 ++++++++++++++++--
 test/e2e/dra/test-driver/app/kubeletplugin.go |   5 +
 3 files changed, 367 insertions(+), 34 deletions(-)

diff --git a/test/e2e/dra/deploy.go b/test/e2e/dra/deploy.go
index 71378705ebb..8f5bacb854e 100644
--- a/test/e2e/dra/deploy.go
+++ b/test/e2e/dra/deploy.go
@@ -24,6 +24,7 @@ import (
 	"net"
 	"path"
 	"sort"
+	"strings"
 	"sync"
 	"time"
 
@@ -33,6 +34,7 @@ import (
 	appsv1 "k8s.io/api/apps/v1"
 	v1 "k8s.io/api/core/v1"
+	apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/selection"
@@ -125,6 +127,12 @@ type Driver struct {
 	Name  string
 	Nodes map[string]*app.ExamplePlugin
 
+	parameterMode         parameterMode
+	parameterAPIGroup     string
+	parameterAPIVersion   string
+	claimParameterAPIKind string
+	classParameterAPIKind string
+
 	NodeV1alpha2, NodeV1alpha3 bool
 
 	mutex sync.Mutex
@@ -132,6 +140,14 @@
 	callCounts map[MethodInstance]int64
 }
 
+type parameterMode string
+
+const (
+	parameterModeConfigMap  parameterMode = "configmap"  // ConfigMap parameters, control plane controller.
+	parameterModeStructured parameterMode = "structured" // No ConfigMaps, directly create and reference in-tree parameter objects.
+	parameterModeTranslated parameterMode = "translated" // Reference ConfigMaps in claim and class, generate in-tree parameter objects.
+)
+
 func (d *Driver) SetUp(nodes *Nodes, resources app.Resources) {
 	ginkgo.By(fmt.Sprintf("deploying driver on nodes %v", nodes.NodeNames))
 	d.Nodes = map[string]*app.ExamplePlugin{}
@@ -147,19 +163,40 @@ func (d *Driver) SetUp(nodes *Nodes, resources app.Resources) {
 	d.ctx = ctx
 	d.cleanup = append(d.cleanup, cancel)
 
-	// The controller is easy: we simply connect to the API server.
-	d.Controller = app.NewController(d.f.ClientSet, resources)
-	d.wg.Add(1)
-	go func() {
-		defer d.wg.Done()
-		d.Controller.Run(d.ctx, 5 /* workers */)
-	}()
+	switch d.parameterMode {
+	case "", parameterModeConfigMap:
+		// The controller is easy: we simply connect to the API server.
+		d.Controller = app.NewController(d.f.ClientSet, resources)
+		d.wg.Add(1)
+		go func() {
+			defer d.wg.Done()
+			d.Controller.Run(d.ctx, 5 /* workers */)
+		}()
+	}
 
 	manifests := []string{
 		// The code below matches the content of this manifest (ports,
 		// container names, etc.).
"test/e2e/testing-manifests/dra/dra-test-driver-proxy.yaml", } + if d.parameterMode == "" { + d.parameterMode = parameterModeConfigMap + } + switch d.parameterMode { + case parameterModeConfigMap, parameterModeTranslated: + d.parameterAPIGroup = "" + d.parameterAPIVersion = "v1" + d.claimParameterAPIKind = "ConfigMap" + d.classParameterAPIKind = "ConfigMap" + case parameterModeStructured: + d.parameterAPIGroup = "resource.k8s.io" + d.parameterAPIVersion = "v1alpha2" + d.claimParameterAPIKind = "ResourceClaimParameters" + d.classParameterAPIKind = "ResourceClassParameters" + default: + framework.Failf("unknown test driver parameter mode: %s", d.parameterMode) + } + instanceKey := "app.kubernetes.io/instance" rsName := "" draAddr := path.Join(framework.TestContext.KubeletRootDir, "plugins", d.Name+".sock") @@ -192,6 +229,10 @@ func (d *Driver) SetUp(nodes *Nodes, resources app.Resources) { item.Spec.Template.Spec.Volumes[2].HostPath.Path = path.Join(framework.TestContext.KubeletRootDir, "plugins_registry") item.Spec.Template.Spec.Containers[0].Args = append(item.Spec.Template.Spec.Containers[0].Args, "--endpoint=/plugins_registry/"+d.Name+"-reg.sock") item.Spec.Template.Spec.Containers[1].Args = append(item.Spec.Template.Spec.Containers[1].Args, "--endpoint=/dra/"+d.Name+".sock") + case *apiextensionsv1.CustomResourceDefinition: + item.Name = strings.ReplaceAll(item.Name, "dra.e2e.example.com", d.parameterAPIGroup) + item.Spec.Group = d.parameterAPIGroup + } return nil }, manifests...) diff --git a/test/e2e/dra/dra.go b/test/e2e/dra/dra.go index d9ce944a5b7..7e71eb90847 100644 --- a/test/e2e/dra/dra.go +++ b/test/e2e/dra/dra.go @@ -18,9 +18,11 @@ package dra import ( "context" + "encoding/json" "errors" "fmt" "strings" + "sync" "time" "github.com/onsi/ginkgo/v2" @@ -32,6 +34,7 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/kubernetes" "k8s.io/dynamic-resource-allocation/controller" "k8s.io/klog/v2" @@ -42,6 +45,7 @@ import ( e2epod "k8s.io/kubernetes/test/e2e/framework/pod" admissionapi "k8s.io/pod-security-admission/api" utilpointer "k8s.io/utils/pointer" + "k8s.io/utils/ptr" ) const ( @@ -142,8 +146,8 @@ var _ = framework.SIGDescribe("node")("DRA", feature.DynamicResourceAllocation, pod.Spec.NodeName = nodes.NodeNames[0] b.create(ctx, pod) - gomega.Consistently(func() error { - testPod, err := b.f.ClientSet.CoreV1().Pods(pod.Namespace).Get(context.TODO(), pod.Name, metav1.GetOptions{}) + gomega.Consistently(ctx, func(ctx context.Context) error { + testPod, err := b.f.ClientSet.CoreV1().Pods(pod.Namespace).Get(ctx, pod.Name, metav1.GetOptions{}) if err != nil { return fmt.Errorf("expected the test pod %s to exist: %w", pod.Name, err) } @@ -191,25 +195,188 @@ var _ = framework.SIGDescribe("node")("DRA", feature.DynamicResourceAllocation, }) }) - ginkgo.Context("driver", func() { + driverTest := func(parameterMode parameterMode) { nodes := NewNodes(f, 1, 1) - driver := NewDriver(f, nodes, networkResources) // All tests get their own driver instance. + maxAllocations := 1 + numPods := 10 + generateResources := func() app.Resources { + resources := perNode(maxAllocations, nodes)() + resources.Shareable = true + return resources + } + driver := NewDriver(f, nodes, generateResources) // All tests get their own driver instance. + driver.parameterMode = parameterMode b := newBuilder(f, driver) // We need the parameters name *before* creating it. 
 		b.parametersCounter = 1
 		b.classParametersName = b.parametersName()
 
+		expectedEnv := []string{"user_a", "b", "user_request_foo", "bar", "admin_x", "y"}
+		genParameters := func() []klog.KMetadata {
+			var objects []klog.KMetadata
+			switch parameterMode {
+			case parameterModeConfigMap:
+				objects = append(objects,
+					b.parameters("x", "y"),
+					b.parameters("a", "b", "request_foo", "bar"),
+				)
+			case parameterModeTranslated:
+				objects = append(objects,
+					b.parameters("x", "y"),
+					b.classParameters(b.parametersName(), "x", "y"),
+					b.parameters("a", "b", "request_foo", "bar"),
+					b.claimParameters(b.parametersName(), []string{"a", "b"}, []string{"request_foo", "bar"}),
+				)
+				// The parameters object is not the last one but the second-last.
+				b.parametersCounter--
+			case parameterModeStructured:
+				objects = append(objects,
+					b.classParameters("", "x", "y"),
+					b.claimParameters("", []string{"a", "b"}, []string{"request_foo", "bar"}),
+				)
+			}
+			return objects
+		}
+
 		ginkgo.It("supports claim and class parameters", func(ctx context.Context) {
-			classParameters := b.parameters("x", "y")
-			claimParameters := b.parameters()
+			objects := genParameters()
+
+			// TODO: replace with publishing NodeResourceSlice through kubelet
+			if parameterMode == parameterModeTranslated || parameterMode == parameterModeStructured {
+				objects = append(objects, b.nodeResourceSlice(nodes.NodeNames[0], maxAllocations))
+			}
+
 			pod, template := b.podInline(resourcev1alpha2.AllocationModeWaitForFirstConsumer)
+			objects = append(objects, pod, template)
 
-			b.create(ctx, classParameters, claimParameters, pod, template)
+			b.create(ctx, objects...)
 
-			b.testPod(ctx, f.ClientSet, pod, "user_a", "b", "admin_x", "y")
+			b.testPod(ctx, f.ClientSet, pod, expectedEnv...)
 		})
+
+		ginkgo.It("supports reusing resources", func(ctx context.Context) {
+			objects := genParameters()
+			pods := make([]*v1.Pod, numPods)
+			for i := 0; i < numPods; i++ {
+				pod, template := b.podInline(resourcev1alpha2.AllocationModeWaitForFirstConsumer)
+				pods[i] = pod
+				objects = append(objects, pod, template)
+			}
+
+			// TODO: replace with publishing NodeResourceSlice through kubelet
+			if parameterMode == parameterModeTranslated || parameterMode == parameterModeStructured {
+				objects = append(objects, b.nodeResourceSlice(nodes.NodeNames[0], maxAllocations))
+			}
+
+			b.create(ctx, objects...)
+
+			// We don't know the order. All that matters is that all of them get scheduled eventually.
+			var wg sync.WaitGroup
+			wg.Add(numPods)
+			for i := 0; i < numPods; i++ {
+				pod := pods[i]
+				go func() {
+					defer ginkgo.GinkgoRecover()
+					defer wg.Done()
+					b.testPod(ctx, f.ClientSet, pod, expectedEnv...)
+					err := f.ClientSet.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{})
+					framework.ExpectNoError(err, "delete pod")
+					framework.ExpectNoError(e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodStartSlow))
+				}()
+			}
+			wg.Wait()
+		})
+
+		ginkgo.It("supports sharing a claim concurrently", func(ctx context.Context) {
+			objects := genParameters()
+			objects = append(objects, b.externalClaim(resourcev1alpha2.AllocationModeWaitForFirstConsumer))
+
+			pods := make([]*v1.Pod, numPods)
+			for i := 0; i < numPods; i++ {
+				pod := b.podExternal()
+				pods[i] = pod
+				objects = append(objects, pod)
+			}
+
+			// TODO: replace with publishing NodeResourceSlice through kubelet
+			if parameterMode == parameterModeTranslated || parameterMode == parameterModeStructured {
+				objects = append(objects, b.nodeResourceSlice(nodes.NodeNames[0], maxAllocations))
+			}
+
+			b.create(ctx, objects...)
+
+			// We don't know the order. All that matters is that all of them get scheduled eventually.
+			f.Timeouts.PodStartSlow *= time.Duration(numPods)
+			var wg sync.WaitGroup
+			wg.Add(numPods)
+			for i := 0; i < numPods; i++ {
+				pod := pods[i]
+				go func() {
+					defer ginkgo.GinkgoRecover()
+					defer wg.Done()
+					b.testPod(ctx, f.ClientSet, pod, expectedEnv...)
+				}()
+			}
+			wg.Wait()
+		})
+
+		ginkgo.It("supports sharing a claim sequentially", func(ctx context.Context) {
+			objects := genParameters()
+
+			// Change from "shareable" to "not shareable", if possible.
+			switch parameterMode {
+			case parameterModeConfigMap:
+				ginkgo.Skip("cannot change the driver's controller behavior on-the-fly")
+			case parameterModeTranslated, parameterModeStructured:
+				objects[len(objects)-1].(*resourcev1alpha2.ResourceClaimParameters).Shareable = false
+			}
+
+			objects = append(objects, b.externalClaim(resourcev1alpha2.AllocationModeWaitForFirstConsumer))
+
+			pods := make([]*v1.Pod, numPods)
+			for i := 0; i < numPods; i++ {
+				pod := b.podExternal()
+				pods[i] = pod
+				objects = append(objects, pod)
+			}
+
+			// TODO: replace with publishing NodeResourceSlice through kubelet
+			if parameterMode == parameterModeTranslated || parameterMode == parameterModeStructured {
+				objects = append(objects, b.nodeResourceSlice(nodes.NodeNames[0], maxAllocations))
+			}
+
+			b.create(ctx, objects...)
+
+			// We don't know the order. All that matters is that all of them get scheduled eventually.
+			f.Timeouts.PodStartSlow *= time.Duration(numPods)
+			var wg sync.WaitGroup
+			wg.Add(numPods)
+			for i := 0; i < numPods; i++ {
+				pod := pods[i]
+				go func() {
+					defer ginkgo.GinkgoRecover()
+					defer wg.Done()
+					b.testPod(ctx, f.ClientSet, pod, expectedEnv...)
+					// We need to delete each running pod, otherwise the others cannot use the claim.
+					err := f.ClientSet.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{})
+					framework.ExpectNoError(err, "delete pod")
+					framework.ExpectNoError(e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodStartSlow))
+				}()
+			}
+			wg.Wait()
+		})
+	}
+
+	ginkgo.Context("driver", func() {
+		ginkgo.Context("with ConfigMap parameters", func() { driverTest(parameterModeConfigMap) })
+		ginkgo.Context("with translated parameters", func() { driverTest(parameterModeTranslated) })
+		ginkgo.Context("with structured parameters", func() { driverTest(parameterModeStructured) })
 	})
 
+	// TODO: move most of the test below into `testDriver` so that they get
+	// executed with different parameters.
+	// Not done yet because it'll be easier once publishing
+	// NodeResourceSlices works.
+
 	ginkgo.Context("cluster", func() {
 		nodes := NewNodes(f, 1, 1)
 		driver := NewDriver(f, nodes, networkResources)
@@ -942,12 +1109,14 @@ func (b *builder) class() *resourcev1alpha2.ResourceClass {
 		ObjectMeta: metav1.ObjectMeta{
 			Name: b.className(),
 		},
-		DriverName:    b.driver.Name,
-		SuitableNodes: b.nodeSelector(),
+		DriverName:           b.driver.Name,
+		SuitableNodes:        b.nodeSelector(),
+		StructuredParameters: ptr.To(b.driver.parameterMode != parameterModeConfigMap),
 	}
 	if b.classParametersName != "" {
 		class.ParametersRef = &resourcev1alpha2.ResourceClassParametersReference{
-			Kind:      "ConfigMap",
+			APIGroup:  b.driver.parameterAPIGroup,
+			Kind:      b.driver.classParameterAPIKind,
 			Name:      b.classParametersName,
 			Namespace: b.f.Namespace.Name,
 		}
@@ -988,8 +1157,9 @@ func (b *builder) externalClaim(allocationMode resourcev1alpha2.AllocationMode)
 		Spec: resourcev1alpha2.ResourceClaimSpec{
 			ResourceClassName: b.className(),
 			ParametersRef: &resourcev1alpha2.ResourceClaimParametersReference{
-				Kind: "ConfigMap",
-				Name: b.parametersName(),
+				APIGroup: b.driver.parameterAPIGroup,
+				Kind:     b.driver.claimParameterAPIKind,
+				Name:     b.parametersName(),
 			},
 			AllocationMode: allocationMode,
 		},
@@ -1005,20 +1175,15 @@ func (b *builder) parametersName() string {
 // parametersEnv returns the default env variables.
 func (b *builder) parametersEnv() map[string]string {
 	return map[string]string{
-		"a": "b",
+		"a":           "b",
+		"request_foo": "bar",
 	}
 }
 
 // parameters returns a config map with the default env variables.
 func (b *builder) parameters(kv ...string) *v1.ConfigMap {
+	data := b.parameterData(kv...)
 	b.parametersCounter++
-	data := map[string]string{}
-	for i := 0; i < len(kv); i += 2 {
-		data[kv[i]] = kv[i+1]
-	}
-	if len(data) == 0 {
-		data = b.parametersEnv()
-	}
 	return &v1.ConfigMap{
 		ObjectMeta: metav1.ObjectMeta{
 			Namespace: b.f.Namespace.Name,
@@ -1028,6 +1193,116 @@
 	}
 }
 
+func (b *builder) classParameters(generatedFrom string, kv ...string) *resourcev1alpha2.ResourceClassParameters {
+	raw := b.rawParameterData(kv...)
+	b.parametersCounter++
+	parameters := &resourcev1alpha2.ResourceClassParameters{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: b.f.Namespace.Name,
+			Name:      b.parametersName(),
+		},
+
+		VendorParameters: []resourcev1alpha2.VendorParameters{
+			{DriverName: b.driver.Name, Parameters: runtime.RawExtension{Raw: raw}},
+		},
+	}
+
+	if generatedFrom != "" {
+		parameters.GeneratedFrom = &resourcev1alpha2.ResourceClassParametersReference{
+			Kind:      "ConfigMap",
+			Namespace: b.f.Namespace.Name,
+			Name:      generatedFrom,
+		}
+	}
+
+	return parameters
+}
+
+func (b *builder) claimParameters(generatedFrom string, claimKV, requestKV []string) *resourcev1alpha2.ResourceClaimParameters {
+	b.parametersCounter++
+	parameters := &resourcev1alpha2.ResourceClaimParameters{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: b.f.Namespace.Name,
+			Name:      b.parametersName(),
+		},
+
+		Shareable: true,
+
+		// Without any request, nothing gets allocated and vendor
+		// parameters are also not passed down because they get
+		// attached to the allocation result.
+		// TODO: is that the semantics we want?
+		DriverRequests: []resourcev1alpha2.DriverRequests{
+			{
+				DriverName:       b.driver.Name,
+				VendorParameters: runtime.RawExtension{Raw: b.rawParameterData(claimKV...)},
+				Requests: []resourcev1alpha2.ResourceRequest{
+					{
+						VendorParameters: runtime.RawExtension{Raw: b.rawParameterData(requestKV...)},
+						ResourceRequestModel: resourcev1alpha2.ResourceRequestModel{
+							NamedResources: &resourcev1alpha2.NamedResourcesRequest{
+								Selector: "true",
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	if generatedFrom != "" {
+		parameters.GeneratedFrom = &resourcev1alpha2.ResourceClaimParametersReference{
+			Kind: "ConfigMap",
+			Name: generatedFrom,
+		}
+	}
+
+	return parameters
+}
+
+func (b *builder) parameterData(kv ...string) map[string]string {
+	data := map[string]string{}
+	for i := 0; i < len(kv); i += 2 {
+		data[kv[i]] = kv[i+1]
+	}
+	if len(data) == 0 {
+		data = b.parametersEnv()
+	}
+	return data
+}
+
+func (b *builder) rawParameterData(kv ...string) []byte {
+	data := b.parameterData(kv...)
+	raw, err := json.Marshal(data)
+	framework.ExpectNoError(err, "JSON encoding of parameter data")
+	return raw
+}
+
+func (b *builder) nodeResourceSlice(nodeName string, capacity int) *resourcev1alpha2.NodeResourceSlice {
+	slice := &resourcev1alpha2.NodeResourceSlice{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: b.driver.Name + "-" + nodeName,
+		},
+
+		NodeName:   nodeName,
+		DriverName: b.driver.Name,
+
+		NodeResourceModel: resourcev1alpha2.NodeResourceModel{
+			NamedResources: &resourcev1alpha2.NamedResourcesResources{},
+		},
+	}
+
+	for i := 0; i < capacity; i++ {
+		slice.NodeResourceModel.NamedResources.Instances = append(slice.NodeResourceModel.NamedResources.Instances,
+			resourcev1alpha2.NamedResourcesInstance{
+				Name: fmt.Sprintf("instance-%d", i),
+			},
+		)
+	}
+
+	return slice
+}
+
 // makePod returns a simple pod with no resource claims.
 // The pod prints its env and waits.
 func (b *builder) pod() *v1.Pod {
@@ -1078,8 +1353,9 @@ func (b *builder) podInline(allocationMode resourcev1alpha2.AllocationMode) (*v1
 			Spec: resourcev1alpha2.ResourceClaimSpec{
 				ResourceClassName: b.className(),
 				ParametersRef: &resourcev1alpha2.ResourceClaimParametersReference{
-					Kind: "ConfigMap",
-					Name: b.parametersName(),
+					APIGroup: b.driver.parameterAPIGroup,
+					Kind:     b.driver.claimParameterAPIKind,
+					Name:     b.parametersName(),
 				},
 				AllocationMode: allocationMode,
 			},
@@ -1134,14 +1410,28 @@ func (b *builder) create(ctx context.Context, objs ...klog.KMetadata) []klog.KMe
 		switch obj := obj.(type) {
 		case *resourcev1alpha2.ResourceClass:
 			createdObj, err = b.f.ClientSet.ResourceV1alpha2().ResourceClasses().Create(ctx, obj, metav1.CreateOptions{})
+			ginkgo.DeferCleanup(func(ctx context.Context) {
+				err := b.f.ClientSet.ResourceV1alpha2().ResourceClasses().Delete(ctx, createdObj.GetName(), metav1.DeleteOptions{})
+				framework.ExpectNoError(err, "delete resource class")
+			})
 		case *v1.Pod:
 			createdObj, err = b.f.ClientSet.CoreV1().Pods(b.f.Namespace.Name).Create(ctx, obj, metav1.CreateOptions{})
 		case *v1.ConfigMap:
-			_, err = b.f.ClientSet.CoreV1().ConfigMaps(b.f.Namespace.Name).Create(ctx, obj, metav1.CreateOptions{})
+			createdObj, err = b.f.ClientSet.CoreV1().ConfigMaps(b.f.Namespace.Name).Create(ctx, obj, metav1.CreateOptions{})
 		case *resourcev1alpha2.ResourceClaim:
 			createdObj, err = b.f.ClientSet.ResourceV1alpha2().ResourceClaims(b.f.Namespace.Name).Create(ctx, obj, metav1.CreateOptions{})
 		case *resourcev1alpha2.ResourceClaimTemplate:
 			createdObj, err = b.f.ClientSet.ResourceV1alpha2().ResourceClaimTemplates(b.f.Namespace.Name).Create(ctx, obj, metav1.CreateOptions{})
+		case *resourcev1alpha2.ResourceClassParameters:
+			createdObj, err = b.f.ClientSet.ResourceV1alpha2().ResourceClassParameters(b.f.Namespace.Name).Create(ctx, obj, metav1.CreateOptions{})
+		case *resourcev1alpha2.ResourceClaimParameters:
+			createdObj, err = b.f.ClientSet.ResourceV1alpha2().ResourceClaimParameters(b.f.Namespace.Name).Create(ctx, obj, metav1.CreateOptions{})
+		case *resourcev1alpha2.NodeResourceSlice:
+			createdObj, err = b.f.ClientSet.ResourceV1alpha2().NodeResourceSlices().Create(ctx, obj, metav1.CreateOptions{})
+			ginkgo.DeferCleanup(func(ctx context.Context) {
+				err := b.f.ClientSet.ResourceV1alpha2().NodeResourceSlices().Delete(ctx, createdObj.GetName(), metav1.DeleteOptions{})
+				framework.ExpectNoError(err, "delete node resource slice")
+			})
 		default:
 			framework.Fail(fmt.Sprintf("internal error, unsupported type %T", obj), 1)
 		}
@@ -1190,9 +1480,6 @@ func (b *builder) setUp() {
 }
 
 func (b *builder) tearDown(ctx context.Context) {
-	err := b.f.ClientSet.ResourceV1alpha2().ResourceClasses().Delete(ctx, b.className(), metav1.DeleteOptions{})
-	framework.ExpectNoError(err, "delete resource class")
-
 	// Before we allow the namespace and all objects in it to be deleted by
 	// the framework, we must ensure that test pods and the claims that
 	// they use are deleted. Otherwise the driver might get deleted first,
diff --git a/test/e2e/dra/test-driver/app/kubeletplugin.go b/test/e2e/dra/test-driver/app/kubeletplugin.go
index b51521a63b0..9a564f10e78 100644
--- a/test/e2e/dra/test-driver/app/kubeletplugin.go
+++ b/test/e2e/dra/test-driver/app/kubeletplugin.go
@@ -182,6 +182,11 @@ func (ex *ExamplePlugin) NodePrepareResource(ctx context.Context, req *drapbv1al
 		if err := extractParameters(handle.VendorClaimParameters, &p.EnvVars, "user"); err != nil {
 			return nil, err
 		}
+		for _, result := range handle.Results {
+			if err := extractParameters(result.VendorRequestParameters, &p.EnvVars, "user"); err != nil {
+				return nil, err
+			}
+		}
 	default:
 		// Huh?
 		return nil, fmt.Errorf("invalid length of NodePrepareResourceRequest.StructuredResourceHandle: %d", len(req.StructuredResourceHandle))
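
For reference, the following minimal sketch (not part of the patch) builds the same kind of NodeResourceSlice object as the new b.nodeResourceSlice helper, so the shape the tests publish for the named resources model can be inspected on its own. The driver and node names are made up, and it assumes the transient resource.k8s.io/v1alpha2 Go types that this patch targets (these types were renamed in later Kubernetes releases).

package main

import (
	"encoding/json"
	"fmt"

	resourcev1alpha2 "k8s.io/api/resource/v1alpha2"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// nodeResourceSlice mirrors the test helper added in dra.go: one named instance
// per allocatable resource, with no instance attributes.
func nodeResourceSlice(driverName, nodeName string, capacity int) *resourcev1alpha2.NodeResourceSlice {
	slice := &resourcev1alpha2.NodeResourceSlice{
		ObjectMeta: metav1.ObjectMeta{Name: driverName + "-" + nodeName},
		NodeName:   nodeName,
		DriverName: driverName,
		NodeResourceModel: resourcev1alpha2.NodeResourceModel{
			NamedResources: &resourcev1alpha2.NamedResourcesResources{},
		},
	}
	for i := 0; i < capacity; i++ {
		slice.NodeResourceModel.NamedResources.Instances = append(slice.NodeResourceModel.NamedResources.Instances,
			resourcev1alpha2.NamedResourcesInstance{Name: fmt.Sprintf("instance-%d", i)},
		)
	}
	return slice
}

func main() {
	// Print the object as JSON to see what gets published for the scheduler
	// until the kubelet publishes these slices itself.
	out, err := json.MarshalIndent(nodeResourceSlice("test-driver.example.com", "worker-0", 2), "", "  ")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out))
}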