diff --git a/test/e2e_node/dra_test.go b/test/e2e_node/dra_test.go
new file mode 100644
index 00000000000..2b283614fbe
--- /dev/null
+++ b/test/e2e_node/dra_test.go
@@ -0,0 +1,230 @@
+/*
+Copyright 2023 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+/*
+E2E Node test for DRA (Dynamic Resource Allocation)
+This test covers node-specific aspects of DRA
+The test can be run locally on Linux this way:
+  make test-e2e-node FOCUS='\[NodeFeature:DynamicResourceAllocation\]' SKIP='\[Flaky\]' PARALLELISM=1 \
+       TEST_ARGS='--feature-gates="DynamicResourceAllocation=true" --service-feature-gates="DynamicResourceAllocation=true" --runtime-config=api/all=true'
+*/
+
+package e2enode
+
+import (
+	"context"
+	"os"
+	"path"
+	"path/filepath"
+	"time"
+
+	"github.com/onsi/ginkgo/v2"
+	"github.com/onsi/gomega"
+
+	v1 "k8s.io/api/core/v1"
+	resourcev1alpha2 "k8s.io/api/resource/v1alpha2"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/klog/v2"
+	admissionapi "k8s.io/pod-security-admission/api"
+
+	"k8s.io/kubernetes/test/e2e/framework"
+	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
+
+	"k8s.io/dynamic-resource-allocation/kubeletplugin"
+	testdriver "k8s.io/kubernetes/test/e2e/dra/test-driver/app"
+)
+
+const (
+	driverName                = "test-driver.cdi.k8s.io"
+	cdiDir                    = "/var/run/cdi"
+	endpoint                  = "/var/lib/kubelet/plugins/test-driver/dra.sock"
+	pluginRegistrationPath    = "/var/lib/kubelet/plugins_registry"
+	draAddress                = "/var/lib/kubelet/plugins/test-driver/dra.sock"
+	pluginRegistrationTimeout = time.Second * 60 // how long to wait for a node plugin to be registered
+)
+
+var _ = ginkgo.Describe("[sig-node] DRA [Feature:DynamicResourceAllocation][NodeFeature:DynamicResourceAllocation]", func() {
+	f := framework.NewDefaultFramework("dra-node")
+	f.NamespacePodSecurityEnforceLevel = admissionapi.LevelBaseline
+
+	var kubeletPlugin *testdriver.ExamplePlugin
+
+	ginkgo.Context("Resource Kubelet Plugin [Serial]", func() {
+		ginkgo.BeforeEach(func(ctx context.Context) {
+			kubeletPlugin = newKubeletPlugin(getNodeName(ctx, f))
+		})
+
+		ginkgo.It("must register after Kubelet restart", func(ctx context.Context) {
+			oldCalls := kubeletPlugin.GetGRPCCalls()
+			getNewCalls := func() []testdriver.GRPCCall {
+				calls := kubeletPlugin.GetGRPCCalls()
+				return calls[len(oldCalls):]
+			}
+
+			ginkgo.By("restarting Kubelet")
+			restartKubelet(true)
+
+			ginkgo.By("wait for Kubelet plugin re-registration")
+			gomega.Eventually(getNewCalls).WithTimeout(pluginRegistrationTimeout).Should(testdriver.BeRegistered)
+		})
+
+		ginkgo.It("must register after plugin restart", func(ctx context.Context) {
+			ginkgo.By("restart Kubelet Plugin")
+			kubeletPlugin.Stop()
+			kubeletPlugin = newKubeletPlugin(getNodeName(ctx, f))
+
+			ginkgo.By("wait for Kubelet plugin re-registration")
+			gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(pluginRegistrationTimeout).Should(testdriver.BeRegistered)
+		})
+
+		ginkgo.It("must process pod created when kubelet is not running", func(ctx context.Context) {
+			// Stop Kubelet
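+			// stopKubelet kills the running kubelet and returns a function
+			// that starts it again, so the objects created in the meantime
+			// are only processed once startKubelet() is called below.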
+			startKubelet := stopKubelet()
+			pod := createTestObjects(ctx, f.ClientSet, getNodeName(ctx, f), f.Namespace.Name, "draclass", "external-claim", "drapod")
+			// The pod must be in the Pending phase while the kubelet is down
+			err := e2epod.WaitForPodCondition(ctx, f.ClientSet, f.Namespace.Name, pod.Name, "Pending", framework.PodStartShortTimeout, func(pod *v1.Pod) (bool, error) {
+				return pod.Status.Phase == v1.PodPending, nil
+			})
+			framework.ExpectNoError(err)
+			// Start Kubelet
+			startKubelet()
+			// The pod should run to completion
+			err = e2epod.WaitForPodSuccessInNamespaceTimeout(ctx, f.ClientSet, pod.Name, f.Namespace.Name, framework.PodStartShortTimeout)
+			framework.ExpectNoError(err)
+		})
+	})
+})
+
+// newKubeletPlugin starts the Kubelet plugin and waits until it is registered.
+func newKubeletPlugin(nodeName string) *testdriver.ExamplePlugin {
+	ginkgo.By("start Kubelet plugin")
+	logger := klog.LoggerWithValues(klog.LoggerWithName(klog.Background(), "kubelet plugin"), "node", nodeName)
+
+	// Ensure that directories exist, creating them if necessary. We want
+	// to know early if there is a setup problem that would prevent
+	// creating those directories.
+	err := os.MkdirAll(cdiDir, os.FileMode(0750))
+	framework.ExpectNoError(err, "create CDI directory")
+	err = os.MkdirAll(filepath.Dir(endpoint), 0750)
+	framework.ExpectNoError(err, "create socket directory")
+
+	plugin, err := testdriver.StartPlugin(
+		logger,
+		cdiDir,
+		driverName,
+		"",
+		testdriver.FileOperations{},
+		kubeletplugin.PluginSocketPath(endpoint),
+		kubeletplugin.RegistrarSocketPath(path.Join(pluginRegistrationPath, driverName+"-reg.sock")),
+		kubeletplugin.KubeletPluginSocketPath(draAddress),
+	)
+	framework.ExpectNoError(err)
+
+	gomega.Eventually(plugin.GetGRPCCalls).WithTimeout(pluginRegistrationTimeout).Should(testdriver.BeRegistered)
+
+	ginkgo.DeferCleanup(plugin.Stop)
+
+	return plugin
+}
+
+// createTestObjects creates the objects required by the test.
+// NOTE: since the scheduler and the controller manager are not running in the
+// node e2e suite, the objects must contain all the data needed for the API
+// server to accept them and for the pod to be placed on the node without
+// involving the scheduler or the DRA controller.
+func createTestObjects(ctx context.Context, clientSet kubernetes.Interface, nodename, namespace, className, claimName, podName string) *v1.Pod {
+	// ResourceClass
+	class := &resourcev1alpha2.ResourceClass{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: className,
+		},
+		DriverName: driverName,
+	}
+	_, err := clientSet.ResourceV1alpha2().ResourceClasses().Create(ctx, class, metav1.CreateOptions{})
+	framework.ExpectNoError(err)
+
+	ginkgo.DeferCleanup(clientSet.ResourceV1alpha2().ResourceClasses().Delete, className, metav1.DeleteOptions{})
+
+	// ResourceClaim
+	podClaimName := "resource-claim"
+	claim := &resourcev1alpha2.ResourceClaim{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: claimName,
+		},
+		Spec: resourcev1alpha2.ResourceClaimSpec{
+			ResourceClassName: className,
+		},
+	}
+	createdClaim, err := clientSet.ResourceV1alpha2().ResourceClaims(namespace).Create(ctx, claim, metav1.CreateOptions{})
+	framework.ExpectNoError(err)
+
+	ginkgo.DeferCleanup(clientSet.ResourceV1alpha2().ResourceClaims(namespace).Delete, claimName, metav1.DeleteOptions{})
+
+	// Pod
+	containerName := "testcontainer"
+	pod := &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: podName,
+		},
+		Spec: v1.PodSpec{
+			NodeName: nodename, // Assign the node as the scheduler is not running
+			ResourceClaims: []v1.PodResourceClaim{
+				{
+					Name: podClaimName,
+					Source: v1.ClaimSource{
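+						// Reference the claim created above by name rather
+						// than through a ResourceClaimTemplate.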
+						ResourceClaimName: &claimName,
+					},
+				},
+			},
+			Containers: []v1.Container{
+				{
+					Name:  containerName,
+					Image: e2epod.GetDefaultTestImage(),
+					Resources: v1.ResourceRequirements{
+						Claims: []v1.ResourceClaim{{Name: podClaimName}},
+					},
+					Command: []string{"/bin/sh", "-c", "env | grep DRA_PARAM1=PARAM1_VALUE"},
+				},
+			},
+			RestartPolicy: v1.RestartPolicyNever,
+		},
+	}
+	createdPod, err := clientSet.CoreV1().Pods(namespace).Create(ctx, pod, metav1.CreateOptions{})
+	framework.ExpectNoError(err)
+
+	ginkgo.DeferCleanup(clientSet.CoreV1().Pods(namespace).Delete, podName, metav1.DeleteOptions{})
+
+	// Update the claim status: set ReservedFor and AllocationResult
+	// NOTE: this is usually done by the DRA controller
+	createdClaim.Status = resourcev1alpha2.ResourceClaimStatus{
+		DriverName: driverName,
+		ReservedFor: []resourcev1alpha2.ResourceClaimConsumerReference{
+			{Resource: "pods", Name: podName, UID: createdPod.UID},
+		},
+		Allocation: &resourcev1alpha2.AllocationResult{
+			ResourceHandles: []resourcev1alpha2.ResourceHandle{
+				{
+					DriverName: driverName,
+					Data:       "{\"EnvVars\":{\"DRA_PARAM1\":\"PARAM1_VALUE\"},\"NodeName\":\"\"}",
+				},
+			},
+		},
+	}
+	_, err = clientSet.ResourceV1alpha2().ResourceClaims(namespace).UpdateStatus(ctx, createdClaim, metav1.UpdateOptions{})
+	framework.ExpectNoError(err)
+
+	return pod
+}
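
For context: the Data string stored in the ResourceHandle above is opaque to Kubernetes
and is interpreted only by the test driver, which turns the EnvVars entries into
environment variables for the container; the pod's command then greps for them. Below is
a minimal, self-contained sketch of that decoding step, assuming a struct that simply
mirrors the JSON used in the test (the type and field names are illustrative, not the
test driver's actual ones):

	package main

	import (
		"encoding/json"
		"fmt"
	)

	// handleData mirrors the JSON payload placed in the ResourceHandle above.
	// Illustrative only; the real types live in test/e2e/dra/test-driver/app.
	type handleData struct {
		EnvVars  map[string]string
		NodeName string
	}

	func main() {
		raw := `{"EnvVars":{"DRA_PARAM1":"PARAM1_VALUE"},"NodeName":""}`
		var h handleData
		if err := json.Unmarshal([]byte(raw), &h); err != nil {
			panic(err)
		}
		// A driver would publish these as CDI environment entries during
		// node-level claim preparation; here we just print them.
		for k, v := range h.EnvVars {
			fmt.Printf("%s=%s\n", k, v)
		}
	}

Running this prints DRA_PARAM1=PARAM1_VALUE, which is what the test container's command
(env | grep DRA_PARAM1=PARAM1_VALUE) asserts once the kubelet has prepared the claim.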