diff --git a/test/e2e/dra/test-driver/app/gomega.go b/test/e2e/dra/test-driver/app/gomega.go index 45f4ceff5b5..70bc1bde9ea 100644 --- a/test/e2e/dra/test-driver/app/gomega.go +++ b/test/e2e/dra/test-driver/app/gomega.go @@ -33,22 +33,42 @@ var BeRegistered = gcustom.MakeMatcher(func(actualCalls []GRPCCall) (bool, error return false, nil }).WithMessage("contain successful NotifyRegistrationStatus call") -// NodePrepareResouceCalled checks that NodePrepareResource API has been called -var NodePrepareResourceCalled = gcustom.MakeMatcher(func(actualCalls []GRPCCall) (bool, error) { +// NodePrepareResourcesSucceeded checks that NodePrepareResources API has been called and succeeded +var NodePrepareResourcesSucceeded = gcustom.MakeMatcher(func(actualCalls []GRPCCall) (bool, error) { for _, call := range actualCalls { - if strings.HasSuffix(call.FullMethod, "/NodePrepareResource") && call.Err == nil { + if strings.HasSuffix(call.FullMethod, "/NodePrepareResources") && call.Response != nil && call.Err == nil { return true, nil } } return false, nil -}).WithMessage("contain NodePrepareResource call") +}).WithMessage("contain successful NodePrepareResources call") -// NodePrepareResoucesCalled checks that NodePrepareResources API has been called -var NodePrepareResourcesCalled = gcustom.MakeMatcher(func(actualCalls []GRPCCall) (bool, error) { +// NodePrepareResourcesFailed checks that NodePrepareResources API has been called and returned an error +var NodePrepareResourcesFailed = gcustom.MakeMatcher(func(actualCalls []GRPCCall) (bool, error) { for _, call := range actualCalls { - if strings.HasSuffix(call.FullMethod, "/NodePrepareResources") && call.Err == nil { + if strings.HasSuffix(call.FullMethod, "/NodePrepareResources") && call.Err != nil { return true, nil } } return false, nil -}).WithMessage("contain NodePrepareResources call") +}).WithMessage("contain unsuccessful NodePrepareResources call") + +// NodeUnprepareResourcesSucceeded checks that NodeUnprepareResources API 
has been called and succeeded +var NodeUnprepareResourcesSucceeded = gcustom.MakeMatcher(func(actualCalls []GRPCCall) (bool, error) { + for _, call := range actualCalls { + if strings.HasSuffix(call.FullMethod, "/NodeUnprepareResources") && call.Response != nil && call.Err == nil { + return true, nil + } + } + return false, nil +}).WithMessage("contain successful NodeUnprepareResources call") + +// NodeUnprepareResourcesFailed checks that NodeUnprepareResources API has been called and returned an error +var NodeUnprepareResourcesFailed = gcustom.MakeMatcher(func(actualCalls []GRPCCall) (bool, error) { + for _, call := range actualCalls { + if strings.HasSuffix(call.FullMethod, "/NodeUnprepareResources") && call.Err != nil { + return true, nil + } + } + return false, nil +}).WithMessage("contain unsuccessful NodeUnprepareResources call") diff --git a/test/e2e/dra/test-driver/app/kubeletplugin.go b/test/e2e/dra/test-driver/app/kubeletplugin.go index ac1f2bfa69f..2656a45e10d 100644 --- a/test/e2e/dra/test-driver/app/kubeletplugin.go +++ b/test/e2e/dra/test-driver/app/kubeletplugin.go @@ -23,6 +23,7 @@ import ( "fmt" "os" "path/filepath" + "strings" "sync" "github.com/google/go-cmp/cmp" @@ -54,7 +55,14 @@ type ExamplePlugin struct { prepared map[ClaimID]any gRPCCalls []GRPCCall - block bool + blockPrepareResourcesMutex sync.Mutex + blockUnprepareResourcesMutex sync.Mutex + + prepareResourcesFailure error + failPrepareResourcesMutex sync.Mutex + + unprepareResourcesFailure error + failUnprepareResourcesMutex sync.Mutex } type GRPCCall struct { @@ -162,10 +170,60 @@ func (ex *ExamplePlugin) IsRegistered() bool { return status.PluginRegistered } -// Block sets a flag to block Node[Un]PrepareResources -// to emulate time consuming or stuck calls -func (ex *ExamplePlugin) Block() { - ex.block = true +// BlockNodePrepareResources locks blockPrepareResourcesMutex and returns unlocking function for it +func (ex *ExamplePlugin) BlockNodePrepareResources() func() { + 
ex.blockPrepareResourcesMutex.Lock() + return func() { + ex.blockPrepareResourcesMutex.Unlock() + } +} + +// BlockNodeUnprepareResources locks blockUnprepareResourcesMutex and returns unlocking function for it +func (ex *ExamplePlugin) BlockNodeUnprepareResources() func() { + ex.blockUnprepareResourcesMutex.Lock() + return func() { + ex.blockUnprepareResourcesMutex.Unlock() + } +} + +// SetNodePrepareResourcesFailureMode sets the failure mode for NodePrepareResources call +// and returns a function to unset the failure mode +func (ex *ExamplePlugin) SetNodePrepareResourcesFailureMode() func() { + ex.failPrepareResourcesMutex.Lock() + ex.prepareResourcesFailure = errors.New("simulated PrepareResources failure") + ex.failPrepareResourcesMutex.Unlock() + + return func() { + ex.failPrepareResourcesMutex.Lock() + ex.prepareResourcesFailure = nil + ex.failPrepareResourcesMutex.Unlock() + } +} + +func (ex *ExamplePlugin) getPrepareResourcesFailure() error { + ex.failPrepareResourcesMutex.Lock() + defer ex.failPrepareResourcesMutex.Unlock() + return ex.prepareResourcesFailure +} + +// SetNodeUnprepareResourcesFailureMode sets the failure mode for NodeUnprepareResources call +// and returns a function to unset the failure mode +func (ex *ExamplePlugin) SetNodeUnprepareResourcesFailureMode() func() { + ex.failUnprepareResourcesMutex.Lock() + ex.unprepareResourcesFailure = errors.New("simulated UnprepareResources failure") + ex.failUnprepareResourcesMutex.Unlock() + + return func() { + ex.failUnprepareResourcesMutex.Lock() + ex.unprepareResourcesFailure = nil + ex.failUnprepareResourcesMutex.Unlock() + } +} + +func (ex *ExamplePlugin) getUnprepareResourcesFailure() error { + ex.failUnprepareResourcesMutex.Lock() + defer ex.failUnprepareResourcesMutex.Unlock() + return ex.unprepareResourcesFailure } // NodePrepareResource ensures that the CDI file for the claim exists. 
It uses @@ -175,15 +233,10 @@ func (ex *ExamplePlugin) Block() { func (ex *ExamplePlugin) nodePrepareResource(ctx context.Context, claimName string, claimUID string, resourceHandle string, structuredResourceHandle []*resourceapi.StructuredResourceHandle) ([]string, error) { logger := klog.FromContext(ctx) - // Block to emulate plugin stuckness or slowness. - // By default the call will not be blocked as ex.block = false. - if ex.block { - <-ctx.Done() - return nil, ctx.Err() - } - ex.mutex.Lock() defer ex.mutex.Unlock() + ex.blockPrepareResourcesMutex.Lock() + defer ex.blockPrepareResourcesMutex.Unlock() deviceName := "claim-" + claimUID vendor := ex.driverName @@ -309,6 +362,11 @@ func (ex *ExamplePlugin) NodePrepareResources(ctx context.Context, req *drapbv1a resp := &drapbv1alpha3.NodePrepareResourcesResponse{ Claims: make(map[string]*drapbv1alpha3.NodePrepareResourceResponse), } + + if failure := ex.getPrepareResourcesFailure(); failure != nil { + return resp, failure + } + for _, claimReq := range req.Claims { cdiDevices, err := ex.nodePrepareResource(ctx, claimReq.Name, claimReq.Uid, claimReq.ResourceHandle, claimReq.StructuredResourceHandle) if err != nil { @@ -328,14 +386,10 @@ func (ex *ExamplePlugin) NodePrepareResources(ctx context.Context, req *drapbv1a // NodePrepareResource. It's idempotent, therefore it is not an error when that // file is already gone. func (ex *ExamplePlugin) nodeUnprepareResource(ctx context.Context, claimName string, claimUID string, resourceHandle string, structuredResourceHandle []*resourceapi.StructuredResourceHandle) error { - logger := klog.FromContext(ctx) + ex.blockUnprepareResourcesMutex.Lock() + defer ex.blockUnprepareResourcesMutex.Unlock() - // Block to emulate plugin stuckness or slowness. - // By default the call will not be blocked as ex.block = false. 
- if ex.block { - <-ctx.Done() - return ctx.Err() - } + logger := klog.FromContext(ctx) filePath := ex.getJSONFilePath(claimUID) if err := ex.fileOps.Remove(filePath); err != nil { @@ -381,6 +435,11 @@ func (ex *ExamplePlugin) NodeUnprepareResources(ctx context.Context, req *drapbv resp := &drapbv1alpha3.NodeUnprepareResourcesResponse{ Claims: make(map[string]*drapbv1alpha3.NodeUnprepareResourceResponse), } + + if failure := ex.getUnprepareResourcesFailure(); failure != nil { + return resp, failure + } + for _, claimReq := range req.Claims { err := ex.nodeUnprepareResource(ctx, claimReq.Name, claimReq.Uid, claimReq.ResourceHandle, claimReq.StructuredResourceHandle) if err != nil { @@ -487,3 +546,14 @@ func (ex *ExamplePlugin) GetGRPCCalls() []GRPCCall { calls = append(calls, ex.gRPCCalls...) return calls } + +// CountCalls counts GRPC calls with the given method suffix. +func (ex *ExamplePlugin) CountCalls(methodSuffix string) int { + count := 0 + for _, call := range ex.GetGRPCCalls() { + if strings.HasSuffix(call.FullMethod, methodSuffix) { + count += 1 + } + } + return count +} diff --git a/test/e2e_node/dra_test.go b/test/e2e_node/dra_test.go index ad2fab45d6d..ec53ac6fb07 100644 --- a/test/e2e_node/dra_test.go +++ b/test/e2e_node/dra_test.go @@ -26,6 +26,7 @@ package e2enode import ( "context" + "fmt" "os" "path" "path/filepath" @@ -39,7 +40,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "k8s.io/klog/v2" - dra "k8s.io/kubernetes/pkg/kubelet/cm/dra/plugin" + draplugin "k8s.io/kubernetes/pkg/kubelet/cm/dra/plugin" admissionapi "k8s.io/pod-security-admission/api" "k8s.io/kubernetes/test/e2e/feature" @@ -52,10 +53,11 @@ import ( const ( driverName = "test-driver.cdi.k8s.io" + kubeletPlugin1Name = "test-driver1.cdi.k8s.io" + kubeletPlugin2Name = "test-driver2.cdi.k8s.io" cdiDir = "/var/run/cdi" - endpoint = "/var/lib/kubelet/plugins/test-driver/dra.sock" + endpointTemplate = "/var/lib/kubelet/plugins/%s/dra.sock" 
pluginRegistrationPath = "/var/lib/kubelet/plugins_registry" - draAddress = "/var/lib/kubelet/plugins/test-driver/dra.sock" pluginRegistrationTimeout = time.Second * 60 // how long to wait for a node plugin to be registered podInPendingStateTimeout = time.Second * 60 // how long to wait for a pod to stay in pending state ) @@ -64,11 +66,11 @@ var _ = framework.SIGDescribe("node")("DRA", feature.DynamicResourceAllocation, f := framework.NewDefaultFramework("dra-node") f.NamespacePodSecurityLevel = admissionapi.LevelBaseline - var kubeletPlugin *testdriver.ExamplePlugin + var kubeletPlugin, kubeletPlugin1, kubeletPlugin2 *testdriver.ExamplePlugin f.Context("Resource Kubelet Plugin", f.WithSerial(), func() { ginkgo.BeforeEach(func(ctx context.Context) { - kubeletPlugin = newKubeletPlugin(ctx, getNodeName(ctx, f)) + kubeletPlugin = newKubeletPlugin(ctx, getNodeName(ctx, f), driverName) }) ginkgo.It("must register after Kubelet restart", func(ctx context.Context) { @@ -88,7 +90,7 @@ var _ = framework.SIGDescribe("node")("DRA", feature.DynamicResourceAllocation, ginkgo.It("must register after plugin restart", func(ctx context.Context) { ginkgo.By("restart Kubelet Plugin") kubeletPlugin.Stop() - kubeletPlugin = newKubeletPlugin(ctx, getNodeName(ctx, f)) + kubeletPlugin = newKubeletPlugin(ctx, getNodeName(ctx, f), driverName) ginkgo.By("wait for Kubelet plugin re-registration") gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(pluginRegistrationTimeout).Should(testdriver.BeRegistered) @@ -97,7 +99,7 @@ var _ = framework.SIGDescribe("node")("DRA", feature.DynamicResourceAllocation, ginkgo.It("must process pod created when kubelet is not running", func(ctx context.Context) { // Stop Kubelet startKubelet := stopKubelet() - pod := createTestObjects(ctx, f.ClientSet, getNodeName(ctx, f), f.Namespace.Name, "draclass", "external-claim", "drapod") + pod := createTestObjects(ctx, f.ClientSet, getNodeName(ctx, f), f.Namespace.Name, "draclass", "external-claim", "drapod", 
true, []string{driverName}) // Pod must be in pending state err := e2epod.WaitForPodCondition(ctx, f.ClientSet, f.Namespace.Name, pod.Name, "Pending", framework.PodStartShortTimeout, func(pod *v1.Pod) (bool, error) { return pod.Status.Phase == v1.PodPending, nil @@ -111,9 +113,9 @@ var _ = framework.SIGDescribe("node")("DRA", feature.DynamicResourceAllocation, }) ginkgo.It("must keep pod in pending state if NodePrepareResources times out", func(ctx context.Context) { - ginkgo.By("set delay for the NodePrepareResources call") - kubeletPlugin.Block() - pod := createTestObjects(ctx, f.ClientSet, getNodeName(ctx, f), f.Namespace.Name, "draclass", "external-claim", "drapod") + unblock := kubeletPlugin.BlockNodePrepareResources() + defer unblock() + pod := createTestObjects(ctx, f.ClientSet, getNodeName(ctx, f), f.Namespace.Name, "draclass", "external-claim", "drapod", true, []string{driverName}) ginkgo.By("wait for pod to be in Pending state") err := e2epod.WaitForPodCondition(ctx, f.ClientSet, f.Namespace.Name, pod.Name, "Pending", framework.PodStartShortTimeout, func(pod *v1.Pod) (bool, error) { @@ -121,22 +123,312 @@ var _ = framework.SIGDescribe("node")("DRA", feature.DynamicResourceAllocation, }) framework.ExpectNoError(err) - ginkgo.By("wait for NodePrepareResources call") - gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(dra.PluginClientTimeout * 2).Should(testdriver.NodePrepareResourcesCalled) - // TODO: Check condition or event when implemented // see https://github.com/kubernetes/kubernetes/issues/118468 for details ginkgo.By("check that pod is consistently in Pending state") gomega.Consistently(ctx, e2epod.Get(f.ClientSet, pod)).WithTimeout(podInPendingStateTimeout).Should(e2epod.BeInPhase(v1.PodPending), "Pod should be in Pending state as resource preparation time outed") }) + + ginkgo.It("must run pod if NodePrepareResources fails and then succeeds", func(ctx context.Context) { + unset := kubeletPlugin.SetNodePrepareResourcesFailureMode() + pod := 
createTestObjects(ctx, f.ClientSet, getNodeName(ctx, f), f.Namespace.Name, "draclass", "external-claim", "drapod", true, []string{driverName}) + + ginkgo.By("wait for pod to be in Pending state") + err := e2epod.WaitForPodCondition(ctx, f.ClientSet, f.Namespace.Name, pod.Name, "Pending", framework.PodStartShortTimeout, func(pod *v1.Pod) (bool, error) { + return pod.Status.Phase == v1.PodPending, nil + }) + framework.ExpectNoError(err) + + ginkgo.By("wait for NodePrepareResources call to fail") + gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodePrepareResourcesFailed) + + unset() + + ginkgo.By("wait for NodePrepareResources call to succeed") + gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodePrepareResourcesSucceeded) + + ginkgo.By("wait for pod to succeed") + err = e2epod.WaitForPodSuccessInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name) + framework.ExpectNoError(err) + }) + + ginkgo.It("must run pod if NodeUnprepareResources fails and then succeeds", func(ctx context.Context) { + unset := kubeletPlugin.SetNodeUnprepareResourcesFailureMode() + pod := createTestObjects(ctx, f.ClientSet, getNodeName(ctx, f), f.Namespace.Name, "draclass", "external-claim", "drapod", true, []string{driverName}) + + ginkgo.By("wait for NodePrepareResources call to succeed") + gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodePrepareResourcesSucceeded) + + ginkgo.By("wait for NodeUnprepareResources call to fail") + gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodeUnprepareResourcesFailed) + + unset() + + ginkgo.By("wait for NodeUnprepareResources call to succeed") + gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodeUnprepareResourcesSucceeded) + + 
ginkgo.By("wait for pod to succeed") + err := e2epod.WaitForPodSuccessInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name) + framework.ExpectNoError(err) + }) + + ginkgo.It("must retry NodePrepareResources after Kubelet restart", func(ctx context.Context) { + unset := kubeletPlugin.SetNodePrepareResourcesFailureMode() + pod := createTestObjects(ctx, f.ClientSet, getNodeName(ctx, f), f.Namespace.Name, "draclass", "external-claim", "drapod", true, []string{driverName}) + + ginkgo.By("wait for pod to be in Pending state") + err := e2epod.WaitForPodCondition(ctx, f.ClientSet, f.Namespace.Name, pod.Name, "Pending", framework.PodStartShortTimeout, func(pod *v1.Pod) (bool, error) { + return pod.Status.Phase == v1.PodPending, nil + }) + framework.ExpectNoError(err) + + ginkgo.By("wait for NodePrepareResources call to fail") + gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodePrepareResourcesFailed) + + ginkgo.By("stop Kubelet") + startKubelet := stopKubelet() + + unset() + + ginkgo.By("start Kubelet") + startKubelet() + + ginkgo.By("wait for NodePrepareResources call to succeed") + gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodePrepareResourcesSucceeded) + + ginkgo.By("wait for pod to succeed") + err = e2epod.WaitForPodSuccessInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name) + framework.ExpectNoError(err) + }) + + ginkgo.It("must retry NodeUnprepareResources after Kubelet restart", func(ctx context.Context) { + unset := kubeletPlugin.SetNodeUnprepareResourcesFailureMode() + pod := createTestObjects(ctx, f.ClientSet, getNodeName(ctx, f), f.Namespace.Name, "draclass", "external-claim", "drapod", true, []string{driverName}) + ginkgo.By("wait for NodePrepareResources call to succeed") + gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodePrepareResourcesSucceeded) + + 
ginkgo.By("wait for NodeUnprepareResources call to fail") + gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodeUnprepareResourcesFailed) + + ginkgo.By("stop Kubelet") + startKubelet := stopKubelet() + + unset() + + ginkgo.By("start Kubelet") + startKubelet() + + ginkgo.By("wait for NodeUnprepareResources call to succeed") + gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodeUnprepareResourcesSucceeded) + + ginkgo.By("wait for pod to succeed") + err := e2epod.WaitForPodSuccessInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name) + framework.ExpectNoError(err) + }) + + ginkgo.It("must call NodeUnprepareResources for deleted pod", func(ctx context.Context) { + unset := kubeletPlugin.SetNodeUnprepareResourcesFailureMode() + pod := createTestObjects(ctx, f.ClientSet, getNodeName(ctx, f), f.Namespace.Name, "draclass", "external-claim", "drapod", false, []string{driverName}) + + ginkgo.By("wait for NodePrepareResources call to succeed") + gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodePrepareResourcesSucceeded) + + ginkgo.By("wait for NodeUnprepareResources call to fail") + gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodeUnprepareResourcesFailed) + + ginkgo.By("delete pod") + e2epod.DeletePodOrFail(ctx, f.ClientSet, f.Namespace.Name, pod.Name) + + ginkgo.By("wait for NodeUnprepareResources call to fail") + gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodeUnprepareResourcesFailed) + + unset() + + ginkgo.By("wait for NodeUnprepareResources call to succeed") + gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodeUnprepareResourcesSucceeded) + }) + + ginkgo.It("must call 
NodeUnprepareResources for deleted pod after Kubelet restart", func(ctx context.Context) { + unset := kubeletPlugin.SetNodeUnprepareResourcesFailureMode() + pod := createTestObjects(ctx, f.ClientSet, getNodeName(ctx, f), f.Namespace.Name, "draclass", "external-claim", "drapod", false, []string{driverName}) + + ginkgo.By("wait for NodePrepareResources call to succeed") + gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodePrepareResourcesSucceeded) + + ginkgo.By("wait for NodeUnprepareResources call to fail") + gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodeUnprepareResourcesFailed) + + ginkgo.By("delete pod") + err := e2epod.DeletePodWithGracePeriod(ctx, f.ClientSet, pod, 0) + framework.ExpectNoError(err) + + ginkgo.By("wait for NodeUnprepareResources call to fail") + gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodeUnprepareResourcesFailed) + + ginkgo.By("restart Kubelet") + stopKubelet()() + + ginkgo.By("wait for NodeUnprepareResources call to fail") + gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodeUnprepareResourcesFailed) + + unset() + + ginkgo.By("wait for NodeUnprepareResources call to succeed") + gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodeUnprepareResourcesSucceeded) + }) + + ginkgo.It("must not call NodePrepareResources for deleted pod after Kubelet restart", func(ctx context.Context) { + unblock := kubeletPlugin.BlockNodePrepareResources() + pod := createTestObjects(ctx, f.ClientSet, getNodeName(ctx, f), f.Namespace.Name, "draclass", "external-claim", "drapod", false, []string{driverName}) + + ginkgo.By("wait for pod to be in Pending state") + err := e2epod.WaitForPodCondition(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 
"Pending", framework.PodStartShortTimeout, func(pod *v1.Pod) (bool, error) { + return pod.Status.Phase == v1.PodPending, nil + }) + framework.ExpectNoError(err) + + ginkgo.By("stop Kubelet") + startKubelet := stopKubelet() + + ginkgo.By("delete pod") + e2epod.DeletePodOrFail(ctx, f.ClientSet, f.Namespace.Name, pod.Name) + + unblock() + + ginkgo.By("start Kubelet") + startKubelet() + + calls := kubeletPlugin.CountCalls("/NodePrepareResources") + ginkgo.By("make sure NodePrepareResources is not called again") + gomega.Consistently(kubeletPlugin.CountCalls("/NodePrepareResources")).WithTimeout(draplugin.PluginClientTimeout).Should(gomega.Equal(calls)) + }) + }) + + f.Context("Two resource Kubelet Plugins", f.WithSerial(), func() { + ginkgo.BeforeEach(func(ctx context.Context) { + kubeletPlugin1 = newKubeletPlugin(ctx, getNodeName(ctx, f), kubeletPlugin1Name) + kubeletPlugin2 = newKubeletPlugin(ctx, getNodeName(ctx, f), kubeletPlugin2Name) + + ginkgo.By("wait for Kubelet plugin registration") + gomega.Eventually(kubeletPlugin1.GetGRPCCalls()).WithTimeout(pluginRegistrationTimeout).Should(testdriver.BeRegistered) + gomega.Eventually(kubeletPlugin2.GetGRPCCalls()).WithTimeout(pluginRegistrationTimeout).Should(testdriver.BeRegistered) + }) + + ginkgo.It("must prepare and unprepare resources", func(ctx context.Context) { + pod := createTestObjects(ctx, f.ClientSet, getNodeName(ctx, f), f.Namespace.Name, "draclass", "external-claim", "drapod", true, []string{kubeletPlugin1Name, kubeletPlugin2Name}) + + ginkgo.By("wait for pod to succeed") + err := e2epod.WaitForPodSuccessInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name) + framework.ExpectNoError(err) + + ginkgo.By("wait for NodePrepareResources calls to succeed") + gomega.Eventually(kubeletPlugin1.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodePrepareResourcesSucceeded) + gomega.Eventually(kubeletPlugin2.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 
2).Should(testdriver.NodePrepareResourcesSucceeded) + + ginkgo.By("wait for NodeUnprepareResources calls to succeed") + gomega.Eventually(kubeletPlugin1.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodeUnprepareResourcesSucceeded) + gomega.Eventually(kubeletPlugin2.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodeUnprepareResourcesSucceeded) + }) + + ginkgo.It("must run pod if NodePrepareResources fails for one plugin and then succeeds", func(ctx context.Context) { + unset := kubeletPlugin2.SetNodePrepareResourcesFailureMode() + pod := createTestObjects(ctx, f.ClientSet, getNodeName(ctx, f), f.Namespace.Name, "draclass", "external-claim", "drapod", true, []string{kubeletPlugin1Name, kubeletPlugin2Name}) + + ginkgo.By("wait for pod to be in Pending state") + err := e2epod.WaitForPodCondition(ctx, f.ClientSet, f.Namespace.Name, pod.Name, "Pending", framework.PodStartShortTimeout, func(pod *v1.Pod) (bool, error) { + return pod.Status.Phase == v1.PodPending, nil + }) + framework.ExpectNoError(err) + + ginkgo.By("wait for plugin2 NodePrepareResources call to fail") + gomega.Eventually(kubeletPlugin2.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodePrepareResourcesFailed) + + unset() + + ginkgo.By("wait for plugin2 NodePrepareResources call to succeed") + gomega.Eventually(kubeletPlugin2.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodePrepareResourcesSucceeded) + + ginkgo.By("wait for pod to succeed") + err = e2epod.WaitForPodSuccessInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name) + framework.ExpectNoError(err) + }) + + ginkgo.It("must run pod if NodeUnprepareResources fails for one plugin and then succeeds", func(ctx context.Context) { + unset := kubeletPlugin2.SetNodeUnprepareResourcesFailureMode() + pod := createTestObjects(ctx, f.ClientSet, getNodeName(ctx, f), f.Namespace.Name, "draclass", "external-claim", "drapod", true, 
[]string{kubeletPlugin1Name, kubeletPlugin2Name}) + + ginkgo.By("wait for plugin1 NodePrepareResources call to succeed") + gomega.Eventually(kubeletPlugin1.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodePrepareResourcesSucceeded) + + ginkgo.By("wait for plugin2 NodePrepareResources call to succeed") + gomega.Eventually(kubeletPlugin2.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodePrepareResourcesSucceeded) + + ginkgo.By("wait for plugin2 NodeUnprepareResources call to fail") + gomega.Eventually(kubeletPlugin2.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodeUnprepareResourcesFailed) + + unset() + + ginkgo.By("wait for plugin2 NodeUnprepareResources call to succeed") + gomega.Eventually(kubeletPlugin2.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodeUnprepareResourcesSucceeded) + + ginkgo.By("wait for pod to succeed") + err := e2epod.WaitForPodSuccessInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name) + framework.ExpectNoError(err) + }) + + ginkgo.It("must run pod if NodePrepareResources is in progress for one plugin when Kubelet restarts", func(ctx context.Context) { + unblock := kubeletPlugin.BlockNodePrepareResources() + pod := createTestObjects(ctx, f.ClientSet, getNodeName(ctx, f), f.Namespace.Name, "draclass", "external-claim", "drapod", true, []string{kubeletPlugin1Name, kubeletPlugin2Name}) + + ginkgo.By("wait for pod to be in Pending state") + err := e2epod.WaitForPodCondition(ctx, f.ClientSet, f.Namespace.Name, pod.Name, "Pending", framework.PodStartShortTimeout, func(pod *v1.Pod) (bool, error) { + return pod.Status.Phase == v1.PodPending, nil + }) + framework.ExpectNoError(err) + + ginkgo.By("restart Kubelet") + restartKubelet(true) + + unblock() + + ginkgo.By("wait for plugin2 NodePrepareResources call to succeed") + gomega.Eventually(kubeletPlugin2.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 
2).Should(testdriver.NodePrepareResourcesSucceeded) + + ginkgo.By("wait for pod to succeed") + err = e2epod.WaitForPodSuccessInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name) + framework.ExpectNoError(err) + }) + + ginkgo.It("must call NodeUnprepareResources again if it's in progress for one plugin when Kubelet restarts", func(ctx context.Context) { + unblock := kubeletPlugin2.BlockNodeUnprepareResources() + pod := createTestObjects(ctx, f.ClientSet, getNodeName(ctx, f), f.Namespace.Name, "draclass", "external-claim", "drapod", true, []string{kubeletPlugin1Name, kubeletPlugin2Name}) + + ginkgo.By("wait for plugin1 NodePrepareResources call to succeed") + gomega.Eventually(kubeletPlugin1.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodePrepareResourcesSucceeded) + + ginkgo.By("wait for plugin2 NodePrepareResources call to succeed") + gomega.Eventually(kubeletPlugin2.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodePrepareResourcesSucceeded) + + ginkgo.By("restart Kubelet") + restartKubelet(true) + + unblock() + + ginkgo.By("wait for plugin2 NodeUnprepareResources call to succeed") + gomega.Eventually(kubeletPlugin2.GetGRPCCalls).WithTimeout(draplugin.PluginClientTimeout * 2).Should(testdriver.NodeUnprepareResourcesSucceeded) + + ginkgo.By("wait for pod to succeed") + err := e2epod.WaitForPodSuccessInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name) + framework.ExpectNoError(err) + }) }) }) // Run Kubelet plugin and wait until it's registered -func newKubeletPlugin(ctx context.Context, nodeName string) *testdriver.ExamplePlugin { +func newKubeletPlugin(ctx context.Context, nodeName, pluginName string) *testdriver.ExamplePlugin { ginkgo.By("start Kubelet plugin") - logger := klog.LoggerWithValues(klog.LoggerWithName(klog.Background(), "kubelet plugin"), "node", nodeName) + logger := klog.LoggerWithValues(klog.LoggerWithName(klog.Background(), "kubelet plugin "+pluginName), "node", 
nodeName) ctx = klog.NewContext(ctx, logger) // Ensure that directories exist, creating them if necessary. We want @@ -144,18 +436,19 @@ func newKubeletPlugin(ctx context.Context, nodeName string) *testdriver.ExampleP // creating those directories. err := os.MkdirAll(cdiDir, os.FileMode(0750)) framework.ExpectNoError(err, "create CDI directory") + endpoint := fmt.Sprintf(endpointTemplate, pluginName) err = os.MkdirAll(filepath.Dir(endpoint), 0750) framework.ExpectNoError(err, "create socket directory") plugin, err := testdriver.StartPlugin( ctx, cdiDir, - driverName, + pluginName, "", testdriver.FileOperations{}, kubeletplugin.PluginSocketPath(endpoint), - kubeletplugin.RegistrarSocketPath(path.Join(pluginRegistrationPath, driverName+"-reg.sock")), - kubeletplugin.KubeletPluginSocketPath(draAddress), + kubeletplugin.RegistrarSocketPath(path.Join(pluginRegistrationPath, pluginName+"-reg.sock")), + kubeletplugin.KubeletPluginSocketPath(endpoint), ) framework.ExpectNoError(err) @@ -170,13 +463,13 @@ func newKubeletPlugin(ctx context.Context, nodeName string) *testdriver.ExampleP // NOTE: as scheduler and controller manager are not running by the Node e2e, // the objects must contain all required data to be processed correctly by the API server // and placed on the node without involving the scheduler and the DRA controller -func createTestObjects(ctx context.Context, clientSet kubernetes.Interface, nodename, namespace, className, claimName, podName string) *v1.Pod { +func createTestObjects(ctx context.Context, clientSet kubernetes.Interface, nodename, namespace, className, claimName, podName string, deferPodDeletion bool, pluginNames []string) *v1.Pod { // ResourceClass class := &resourcev1alpha2.ResourceClass{ ObjectMeta: metav1.ObjectMeta{ Name: className, }, - DriverName: driverName, + DriverName: "controller", } _, err := clientSet.ResourceV1alpha2().ResourceClasses().Create(ctx, class, metav1.CreateOptions{}) framework.ExpectNoError(err) @@ -231,22 +524,26 @@ 
func createTestObjects(ctx context.Context, clientSet kubernetes.Interface, node createdPod, err := clientSet.CoreV1().Pods(namespace).Create(ctx, pod, metav1.CreateOptions{}) framework.ExpectNoError(err) - ginkgo.DeferCleanup(clientSet.CoreV1().Pods(namespace).Delete, podName, metav1.DeleteOptions{}) + if deferPodDeletion { + ginkgo.DeferCleanup(clientSet.CoreV1().Pods(namespace).Delete, podName, metav1.DeleteOptions{}) + } // Update claim status: set ReservedFor and AllocationResult // NOTE: This is usually done by the DRA controller + resourceHandlers := make([]resourcev1alpha2.ResourceHandle, len(pluginNames)) + for i, pluginName := range pluginNames { + resourceHandlers[i] = resourcev1alpha2.ResourceHandle{ + DriverName: pluginName, + Data: "{\"EnvVars\":{\"DRA_PARAM1\":\"PARAM1_VALUE\"},\"NodeName\":\"\"}", + } + } createdClaim.Status = resourcev1alpha2.ResourceClaimStatus{ - DriverName: driverName, + DriverName: "controller", ReservedFor: []resourcev1alpha2.ResourceClaimConsumerReference{ {Resource: "pods", Name: podName, UID: createdPod.UID}, }, Allocation: &resourcev1alpha2.AllocationResult{ - ResourceHandles: []resourcev1alpha2.ResourceHandle{ - { - DriverName: driverName, - Data: "{\"EnvVars\":{\"DRA_PARAM1\":\"PARAM1_VALUE\"},\"NodeName\":\"\"}", - }, - }, + ResourceHandles: resourceHandlers, }, } _, err = clientSet.ResourceV1alpha2().ResourceClaims(namespace).UpdateStatus(ctx, createdClaim, metav1.UpdateOptions{})