diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go
index 8cb57aa8190..9f2552bd729 100644
--- a/pkg/kubelet/cm/devicemanager/manager.go
+++ b/pkg/kubelet/cm/devicemanager/manager.go
@@ -544,15 +544,29 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
 			return nil, fmt.Errorf("pod %q container %q changed request for resource %q from %d to %d", string(podUID), contName, resource, devices.Len(), required)
 		}
 	}
+
+	klog.V(3).InfoS("Need devices to allocate for pod", "deviceNumber", needed, "resourceName", resource, "podUID", string(podUID), "containerName", contName)
+	healthyDevices, hasRegistered := m.healthyDevices[resource]
+
+	// Check if resource registered with devicemanager
+	if !hasRegistered {
+		return nil, fmt.Errorf("can't allocate unregistered device %s", resource)
+	}
+
+	// Check if registered resource has healthy devices
+	if healthyDevices.Len() == 0 {
+		return nil, fmt.Errorf("can't allocate unhealthy devices %s", resource)
+	}
+
+	// Check if all the previously allocated devices are healthy
+	if !healthyDevices.IsSuperset(devices) {
+		return nil, fmt.Errorf("previously allocated devices are no longer healthy; can't allocate unhealthy devices %s", resource)
+	}
+
 	if needed == 0 {
 		// No change, no work.
 		return nil, nil
 	}
-	klog.V(3).InfoS("Need devices to allocate for pod", "deviceNumber", needed, "resourceName", resource, "podUID", string(podUID), "containerName", contName)
-	// Check if resource registered with devicemanager
-	if _, ok := m.healthyDevices[resource]; !ok {
-		return nil, fmt.Errorf("can't allocate unregistered device %s", resource)
-	}
 
 	// Declare the list of allocated devices.
 	// This will be populated and returned below.
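Note on the hunk above: the registration and health checks now run before the `needed == 0` early return, so a pod whose devices were already allocated is re-validated against the currently healthy devices at admission time. The following standalone Go sketch is not part of the patch; the helper name `checkAllocatable` and the example resource names are invented for illustration. It restates the three guards using the same `sets.String` API the manager relies on.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/sets"
)

// checkAllocatable is a hypothetical helper, not the manager's real code: it applies
// the same three guards the patch adds to devicesToAllocate, in the same order.
func checkAllocatable(healthyDevices map[string]sets.String, resource string, allocated sets.String) error {
	healthy, registered := healthyDevices[resource]
	// Guard 1: the resource must be registered with the device manager.
	if !registered {
		return fmt.Errorf("can't allocate unregistered device %s", resource)
	}
	// Guard 2: the registered resource must currently have healthy devices.
	if healthy.Len() == 0 {
		return fmt.Errorf("can't allocate unhealthy devices %s", resource)
	}
	// Guard 3: every previously allocated device must still be healthy.
	if !healthy.IsSuperset(allocated) {
		return fmt.Errorf("previously allocated devices are no longer healthy; can't allocate unhealthy devices %s", resource)
	}
	return nil
}

func main() {
	healthy := map[string]sets.String{
		"example.com/dev": sets.NewString("dev1", "dev2"),
	}
	// dev3 was allocated before a restart but is no longer reported healthy,
	// so admission fails even though no new devices are needed.
	fmt.Println(checkAllocatable(healthy, "example.com/dev", sets.NewString("dev1", "dev3")))
}

With the example data, the third guard fires; that is the case the new unit test and the e2e scenario below exercise.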
diff --git a/pkg/kubelet/cm/devicemanager/manager_test.go b/pkg/kubelet/cm/devicemanager/manager_test.go
index 6b7830c2316..d44bbdd2d1c 100644
--- a/pkg/kubelet/cm/devicemanager/manager_test.go
+++ b/pkg/kubelet/cm/devicemanager/manager_test.go
@@ -988,6 +988,102 @@ func TestPodContainerDeviceAllocation(t *testing.T) {
 
 }
 
+func TestPodContainerDeviceToAllocate(t *testing.T) {
+	resourceName1 := "domain1.com/resource1"
+	resourceName2 := "domain2.com/resource2"
+	resourceName3 := "domain2.com/resource3"
+	as := require.New(t)
+	tmpDir, err := os.MkdirTemp("", "checkpoint")
+	as.Nil(err)
+	defer os.RemoveAll(tmpDir)
+
+	testManager := &ManagerImpl{
+		endpoints:        make(map[string]endpointInfo),
+		healthyDevices:   make(map[string]sets.String),
+		unhealthyDevices: make(map[string]sets.String),
+		allocatedDevices: make(map[string]sets.String),
+		podDevices:       newPodDevices(),
+	}
+
+	testManager.podDevices.insert("pod1", "con1", resourceName1,
+		constructDevices([]string{"dev1", "dev2"}),
+		constructAllocResp(map[string]string{"/dev/r2dev1": "/dev/r2dev1", "/dev/r2dev2": "/dev/r2dev2"},
+			map[string]string{"/home/r2lib1": "/usr/r2lib1"},
+			map[string]string{"r2devices": "dev1 dev2"}))
+	testManager.podDevices.insert("pod2", "con2", resourceName2,
+		checkpoint.DevicesPerNUMA{nodeWithoutTopology: []string{"dev5"}},
+		constructAllocResp(map[string]string{"/dev/r1dev5": "/dev/r1dev5"},
+			map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{}))
+	testManager.podDevices.insert("pod3", "con3", resourceName3,
+		checkpoint.DevicesPerNUMA{nodeWithoutTopology: []string{"dev5"}},
+		constructAllocResp(map[string]string{"/dev/r1dev5": "/dev/r1dev5"},
+			map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{}))
+
+	// No healthy devices are registered for resourceName1, and the devices corresponding to
+	// resourceName2 are intentionally omitted to simulate that the resource
+	// hasn't been registered.
+	testManager.healthyDevices[resourceName1] = sets.NewString()
+	testManager.healthyDevices[resourceName3] = sets.NewString()
+	// dev5 is no longer in the list of healthy devices
+	testManager.healthyDevices[resourceName3].Insert("dev7")
+	testManager.healthyDevices[resourceName3].Insert("dev8")
+
+	testCases := []struct {
+		description              string
+		podUID                   string
+		contName                 string
+		resource                 string
+		required                 int
+		reusableDevices          sets.String
+		expectedAllocatedDevices sets.String
+		expErr                   error
+	}{
+		{
+			description:              "Admission error in case no healthy devices to allocate present",
+			podUID:                   "pod1",
+			contName:                 "con1",
+			resource:                 resourceName1,
+			required:                 2,
+			reusableDevices:          sets.NewString(),
+			expectedAllocatedDevices: nil,
+			expErr:                   fmt.Errorf("can't allocate unhealthy devices %s", resourceName1),
+		},
+		{
+			description:              "Admission error in case resource is not registered",
+			podUID:                   "pod2",
+			contName:                 "con2",
+			resource:                 resourceName2,
+			required:                 1,
+			reusableDevices:          sets.NewString(),
+			expectedAllocatedDevices: nil,
+			expErr:                   fmt.Errorf("can't allocate unregistered device %s", resourceName2),
+		},
+		{
+			description:              "Admission error in case devices previously allocated are no longer healthy",
+			podUID:                   "pod3",
+			contName:                 "con3",
+			resource:                 resourceName3,
+			required:                 1,
+			reusableDevices:          sets.NewString(),
+			expectedAllocatedDevices: nil,
+			expErr:                   fmt.Errorf("previously allocated devices are no longer healthy; can't allocate unhealthy devices %s", resourceName3),
+		},
+	}
+
+	for _, testCase := range testCases {
+		allocDevices, err := testManager.devicesToAllocate(testCase.podUID, testCase.contName, testCase.resource, testCase.required, testCase.reusableDevices)
+		if !reflect.DeepEqual(err, testCase.expErr) {
+			t.Errorf("DevicePluginManager error (%v). expected error: %v but got: %v",
+				testCase.description, testCase.expErr, err)
+		}
+		if !reflect.DeepEqual(allocDevices, testCase.expectedAllocatedDevices) {
+			t.Errorf("DevicePluginManager error (%v). expected allocated devices: %v but got: %v",
+				testCase.description, testCase.expectedAllocatedDevices, allocDevices)
+		}
+	}
+
+}
+
 func TestGetDeviceRunContainerOptions(t *testing.T) {
 	res1 := TestResource{
 		resourceName: "domain1.com/resource1",
diff --git a/test/e2e_node/device_manager_test.go b/test/e2e_node/device_manager_test.go
index 2892dae1175..68eeb85b2d2 100644
--- a/test/e2e_node/device_manager_test.go
+++ b/test/e2e_node/device_manager_test.go
@@ -18,6 +18,7 @@ package e2enode
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"os"
 	"path/filepath"
@@ -27,7 +28,11 @@ import (
 	"time"
 
 	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/uuid"
+	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
+	"k8s.io/klog/v2"
 	kubeletpodresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1"
 	"k8s.io/kubernetes/pkg/kubelet/apis/podresources"
 	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
@@ -39,9 +44,13 @@ import (
 	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
 	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
+	e2etestfiles "k8s.io/kubernetes/test/e2e/framework/testfiles"
+	testutils "k8s.io/kubernetes/test/utils"
 
 	"github.com/onsi/ginkgo/v2"
 	"github.com/onsi/gomega"
+	"github.com/onsi/gomega/gcustom"
+	"github.com/onsi/gomega/types"
 )
 
 const (
@@ -266,6 +275,239 @@ var _ = SIGDescribe("Device Manager [Serial] [Feature:DeviceManager][NodeFeatur
 		})
 	})
 
+
+	/*
+		This end-to-end test simulates a scenario where, after a kubelet restart or node
+		reboot, application pods requesting devices appear before the device plugin
+		pod exposing those devices as resources.
+
+		The happy path is where, after a node reboot or kubelet restart, the device plugin pod
+		appears before the application pod. This PR and this e2e test
+		aim to tackle the scenario where the device plugin either does not appear first
+		or doesn't get the chance to re-register itself.
+
+		Since there is no way of controlling the order in which the pods appear after
+		kubelet restart/node reboot, we can't guarantee that the application pod
+		recovers before the device plugin pod (the scenario we want to exercise here).
+		If the device plugin pod is recovered before the test pod, we can still
+		meaningfully reproduce the scenario by NOT sending the registration command.
+		To do so, the sample device plugin is enhanced. For implementation details, refer to:
+		`test/images/sample-device-plugin/sampledeviceplugin.go`. This enhancement
+		allows auto-registration of the plugin to be controlled with the help of an environment
+		variable: REGISTER_CONTROL_FILE. By default this environment variable is not present
+		and the device plugin auto-registers with kubelet. For this e2e test, we use a sample device
+		plugin spec with REGISTER_CONTROL_FILE=/var/lib/kubelet/device-plugins/sample/registration
+		to allow manual registration of the plugin, so that an application pod (requesting devices)
+		can successfully run on the node, followed by a kubelet restart after which the device plugin
+		doesn't re-register and the application pod fails with an admission error.
+
+		Breakdown of the steps implemented as part of this e2e test:
+		1. Create a file `registration` at path `/var/lib/kubelet/device-plugins/sample/`.
+		2. Create the sample device plugin with the environment variable
+		   `REGISTER_CONTROL_FILE=/var/lib/kubelet/device-plugins/sample/registration`, so that it
+		   waits for a client to delete the control file.
+		3. Trigger plugin registration by deleting the above-mentioned control file.
+		4. Create a test pod requesting devices exposed by the device plugin.
+		5. Stop kubelet.
+		6. Remove pods using CRI to ensure new pods are created after kubelet restart.
+		7. Restart kubelet.
+		8. Wait for the sample device plugin pod to be running. In this case,
+		   the registration is not triggered.
+		9. Ensure that resource capacity/allocatable exported by the device plugin is zero.
+		10. The test pod should fail with `UnexpectedAdmissionError`.
+		11. Delete the test pod.
+		12. Delete the sample device plugin pod.
+		13. Remove `/var/lib/kubelet/device-plugins/sample/` and its content, the directory created to control registration.
+	*/
+	ginkgo.Context("With sample device plugin", func() {
+		var deviceCount int = 2
+		var devicePluginPod *v1.Pod
+		var triggerPathFile, triggerPathDir string
+
+		// this test wants to reproduce what happened in https://github.com/kubernetes/kubernetes/issues/109595
+		ginkgo.BeforeEach(func(ctx context.Context) {
+			ginkgo.By("Wait for node to be ready")
+			gomega.Eventually(ctx, e2enode.TotalReady).
+				WithArguments(f.ClientSet).
+				WithTimeout(time.Minute).
+				Should(gomega.BeEquivalentTo(1))
+
+			ginkgo.By("Setting up the directory and file for controlling registration")
+			triggerPathDir = filepath.Join(devicePluginDir, "sample")
+			if _, err := os.Stat(triggerPathDir); errors.Is(err, os.ErrNotExist) {
+				err := os.Mkdir(triggerPathDir, os.ModePerm)
+				if err != nil {
+					klog.Errorf("Directory creation %s failed: %v ", triggerPathDir, err)
+					panic(err)
+				}
+				klog.InfoS("Directory created successfully")
+
+				triggerPathFile = filepath.Join(triggerPathDir, "registration")
+				if _, err := os.Stat(triggerPathFile); errors.Is(err, os.ErrNotExist) {
+					_, err = os.Create(triggerPathFile)
+					if err != nil {
+						klog.Errorf("File creation %s failed: %v ", triggerPathFile, err)
+						panic(err)
+					}
+				}
+			}
+
+			ginkgo.By("Scheduling a sample device plugin pod")
+			data, err := e2etestfiles.Read(SampleDevicePluginControlRegistrationDSYAML)
+			if err != nil {
+				framework.Fail(err.Error())
+			}
+			ds := readDaemonSetV1OrDie(data)
+
+			dp := &v1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: SampleDevicePluginName,
+				},
+				Spec: ds.Spec.Template.Spec,
+			}
+
+			devicePluginPod = e2epod.NewPodClient(f).CreateSync(ctx, dp)
+
+			go func() {
+				// Since auto-registration is disabled for the device plugin (as the REGISTER_CONTROL_FILE
+				// environment variable is specified), device plugin registration needs to be triggered
+				// manually.
+				// This is done by deleting the control file at the following path:
+				// `/var/lib/kubelet/device-plugins/sample/registration`.
+
+				defer ginkgo.GinkgoRecover()
+				framework.Logf("Deleting the control file: %q to trigger registration", triggerPathFile)
+				err := os.Remove(triggerPathFile)
+				framework.ExpectNoError(err)
+			}()
+
+			ginkgo.By("Waiting for devices to become available on the local node")
+
+			gomega.Eventually(ctx, isNodeReadyWithSampleResources).
+				WithArguments(f).
+				WithTimeout(5 * time.Minute).
+				Should(BeReady())
+
+			framework.Logf("Successfully created device plugin pod")
+
+			devsLen := int64(deviceCount) // shortcut
+			ginkgo.By("Waiting for the resource exported by the sample device plugin to become available on the local node")
+
+			gomega.Eventually(ctx, isNodeReadyWithAllocatableSampleResources).
+				WithArguments(f, devsLen).
+				WithTimeout(5 * time.Minute).
+				Should(HaveAllocatableDevices())
+		})
+
+		ginkgo.It("should deploy pod consuming devices first but fail with admission error after kubelet restart in case device plugin hasn't re-registered", func(ctx context.Context) {
+			var err error
+			podCMD := "while true; do sleep 1000; done;"
+
+			ginkgo.By(fmt.Sprintf("creating a pod requiring %d %q", deviceCount, SampleDeviceResourceName))
+
+			pod := makeBusyboxDeviceRequiringPod(SampleDeviceResourceName, podCMD)
+			testPod := e2epod.NewPodClient(f).CreateSync(ctx, pod)
+
+			ginkgo.By("making sure all the pods are ready")
+
+			err = e2epod.WaitForPodCondition(ctx, f.ClientSet, testPod.Namespace, testPod.Name, "Ready", 120*time.Second, testutils.PodRunningReady)
+			framework.ExpectNoError(err, "pod %s/%s did not start running", testPod.Namespace, testPod.Name)
+			framework.Logf("pod %s/%s running", testPod.Namespace, testPod.Name)
+
+			ginkgo.By("stopping the kubelet")
+			startKubelet := stopKubelet()
+
+			ginkgo.By("stopping all the local containers - using CRI")
+			rs, _, err := getCRIClient()
+			framework.ExpectNoError(err)
+			sandboxes, err := rs.ListPodSandbox(ctx, &runtimeapi.PodSandboxFilter{})
+			framework.ExpectNoError(err)
+			for _, sandbox := range sandboxes {
+				gomega.Expect(sandbox.Metadata).ToNot(gomega.BeNil())
+				ginkgo.By(fmt.Sprintf("deleting pod using CRI: %s/%s -> %s", sandbox.Metadata.Namespace, sandbox.Metadata.Name, sandbox.Id))
+
+				err := rs.RemovePodSandbox(ctx, sandbox.Id)
+				framework.ExpectNoError(err)
+			}
+
+			ginkgo.By("restarting the kubelet")
+			startKubelet()
+
+			ginkgo.By("waiting for the kubelet to be ready again")
+			// Wait for the Kubelet to be ready.
+
+			gomega.Eventually(ctx, e2enode.TotalReady).
+				WithArguments(f.ClientSet).
+				WithTimeout(2 * time.Minute).
+				Should(gomega.BeEquivalentTo(1))
+
+			ginkgo.By("making sure all the pods are ready after the recovery")
+
+			var devicePluginPodAfterRestart *v1.Pod
+
+			devicePluginPodAfterRestart, err = e2epod.NewPodClient(f).Get(ctx, devicePluginPod.Name, metav1.GetOptions{})
+			framework.ExpectNoError(err)
+
+			err = e2epod.WaitForPodCondition(ctx, f.ClientSet, devicePluginPodAfterRestart.Namespace, devicePluginPodAfterRestart.Name, "Ready", 120*time.Second, testutils.PodRunningReady)
+			framework.ExpectNoError(err, "pod %s/%s did not start running", devicePluginPodAfterRestart.Namespace, devicePluginPodAfterRestart.Name)
+			framework.Logf("pod %s/%s running", devicePluginPodAfterRestart.Namespace, devicePluginPodAfterRestart.Name)
+
+			ginkgo.By("Waiting for the resource capacity/allocatable exported by the sample device plugin to become zero")
+
+			// The device plugin pod has restarted but has not re-registered with kubelet (the
+			// REGISTER_CONTROL_FILE environment variable is set) and registration wasn't triggered
+			// manually (by deleting the control file at `/var/lib/kubelet/device-plugins/sample/registration`).
+			// Because of this, the capacity and allocatable corresponding
+			// to the resource exposed by the device plugin should be zero.
+
+			gomega.Eventually(ctx, isNodeReadyWithAllocatableSampleResources).
+				WithArguments(f, int64(0)).
+				WithTimeout(5 * time.Minute).
+				Should(HaveAllocatableDevices())
+
+			ginkgo.By("Checking that pod requesting devices failed to start because of admission error")
+
+			// NOTE: The device plugin won't re-register again and this is intentional.
+			// Because of this, the test pod (requesting a device) should fail with an admission error.
+
+			gomega.Eventually(ctx, getPod).
+				WithArguments(f, testPod.Name).
+				WithTimeout(time.Minute).
+				Should(HaveFailedWithAdmissionError(),
+					"the pod started successfully, but it was expected to fail with an admission error")
+
+			ginkgo.By("removing application pods")
+			e2epod.NewPodClient(f).DeleteSync(ctx, testPod.Name, metav1.DeleteOptions{}, 2*time.Minute)
+		})
+
+		ginkgo.AfterEach(func(ctx context.Context) {
+			ginkgo.By("Deleting the device plugin pod")
+			e2epod.NewPodClient(f).DeleteSync(ctx, devicePluginPod.Name, metav1.DeleteOptions{}, time.Minute)
+
+			ginkgo.By("Deleting the directory and file setup for controlling registration")
+			err := os.RemoveAll(triggerPathDir)
+			framework.ExpectNoError(err)
+
+			ginkgo.By("Deleting any Pods created by the test")
+			l, err := e2epod.NewPodClient(f).List(context.TODO(), metav1.ListOptions{})
+			framework.ExpectNoError(err)
+			for _, p := range l.Items {
+				if p.Namespace != f.Namespace.Name {
+					continue
+				}
+
+				framework.Logf("Deleting pod: %s", p.Name)
+				e2epod.NewPodClient(f).DeleteSync(ctx, p.Name, metav1.DeleteOptions{}, 2*time.Minute)
+			}
+
+			ginkgo.By("Waiting for devices to become unavailable on the local node")
+			gomega.Eventually(ctx, isNodeReadyWithoutSampleResources).
+				WithArguments(f).
+				WithTimeout(5 * time.Minute).
+				Should(BeReady())
+		})
+
+	})
+
 })
 
 func compareSRIOVResources(expected, got *sriovData) {
@@ -351,3 +593,153 @@ func stringifyContainerDevices(devs []*kubeletpodresourcesv1.ContainerDevices) s
 	sort.Strings(entries)
 	return strings.Join(entries, ", ")
 }
+
+func makeBusyboxDeviceRequiringPod(resourceName, cmd string) *v1.Pod {
+	podName := "device-manager-test-" + string(uuid.NewUUID())
+	rl := v1.ResourceList{
+		v1.ResourceName(resourceName): *resource.NewQuantity(2, resource.DecimalSI),
+	}
+	return &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: podName,
+		},
+		Spec: v1.PodSpec{
+			RestartPolicy: v1.RestartPolicyNever,
+			Containers: []v1.Container{{
+				Image: busyboxImage,
+				Name:  podName,
+				// Runs the specified command in the test pod.
+				Command: []string{"sh", "-c", cmd},
+				Resources: v1.ResourceRequirements{
+					Limits:   rl,
+					Requests: rl,
+				},
+			}},
+		},
+	}
+}
+
+// BeReady verifies that a node is ready and devices have registered.
+func BeReady() types.GomegaMatcher {
+	return gomega.And(
+		// This additional matcher checks for the final error condition.
+		gcustom.MakeMatcher(func(ready bool) (bool, error) {
+			if !ready {
+				return false, fmt.Errorf("Expected node to be ready=%t", ready)
+			}
+			return true, nil
+		}),
+		BeInReadyPhase(true),
+	)
+}
+
+// BeInReadyPhase matches if the node is ready, i.e. ready is true.
+func BeInReadyPhase(isReady bool) types.GomegaMatcher {
+	return gcustom.MakeMatcher(func(ready bool) (bool, error) {
+		return ready == isReady, nil
+	}).WithTemplate("Expected Node Ready {{.To}} be in {{format .Data}}\nGot instead:\n{{.FormattedActual}}").WithTemplateData(isReady)
+}
+
+func isNodeReadyWithSampleResources(ctx context.Context, f *framework.Framework) (bool, error) {
+	node, ready := getLocalTestNode(ctx, f)
+	if !ready {
+		return false, fmt.Errorf("Expected node to be ready=%t", ready)
+	}
+
+	if CountSampleDeviceCapacity(node) <= 0 {
+		return false, fmt.Errorf("Expected devices to be advertised")
+	}
+	return true, nil
+}
+
+// HaveAllocatableDevices verifies that a node has allocatable devices.
+func HaveAllocatableDevices() types.GomegaMatcher {
+	return gomega.And(
+		// This additional matcher checks for the final error condition.
+		gcustom.MakeMatcher(func(hasAllocatable bool) (bool, error) {
+			if !hasAllocatable {
+				return false, fmt.Errorf("Expected node to have allocatable devices=%t", hasAllocatable)
+			}
+			return true, nil
+		}),
+		hasAllocatable(true),
+	)
+}
+
+// hasAllocatable matches if the node has allocatable devices, i.e. hasAllocatable is true.
+func hasAllocatable(hasAllocatable bool) types.GomegaMatcher {
+	return gcustom.MakeMatcher(func(hasAllocatableDevices bool) (bool, error) {
+		return hasAllocatableDevices == hasAllocatable, nil
+	}).WithTemplate("Expected Node with allocatable {{.To}} be in {{format .Data}}\nGot instead:\n{{.FormattedActual}}").WithTemplateData(hasAllocatable)
+}
+
+func isNodeReadyWithAllocatableSampleResources(ctx context.Context, f *framework.Framework, devCount int64) (bool, error) {
+	node, ready := getLocalTestNode(ctx, f)
+	if !ready {
+		return false, fmt.Errorf("Expected node to be ready=%t", ready)
+	}
+
+	if CountSampleDeviceCapacity(node) != devCount {
+		return false, fmt.Errorf("Expected devices capacity to be: %d", devCount)
+	}
+
+	if CountSampleDeviceAllocatable(node) != devCount {
+		return false, fmt.Errorf("Expected devices allocatable to be: %d", devCount)
+	}
+	return true, nil
+}
+
+func isNodeReadyWithoutSampleResources(ctx context.Context, f *framework.Framework) (bool, error) {
+	node, ready := getLocalTestNode(ctx, f)
+	if !ready {
+		return false, fmt.Errorf("Expected node to be ready=%t", ready)
+	}
+
+	if CountSampleDeviceCapacity(node) > 0 {
+		return false, fmt.Errorf("Expected devices to be not present")
+	}
+	return true, nil
+}
+
+// HaveFailedWithAdmissionError verifies that a pod fails at admission.
+func HaveFailedWithAdmissionError() types.GomegaMatcher {
+	return gomega.And(
+		gcustom.MakeMatcher(func(hasFailed bool) (bool, error) {
+			if !hasFailed {
+				return false, fmt.Errorf("Expected pod to have failed=%t", hasFailed)
+			}
+			return true, nil
+		}),
+		hasFailed(true),
+	)
+}
+
+// hasFailed matches if the pod has failed.
+func hasFailed(hasFailed bool) types.GomegaMatcher {
+	return gcustom.MakeMatcher(func(hasPodFailed bool) (bool, error) {
+		return hasPodFailed == hasFailed, nil
+	}).WithTemplate("Expected Pod failed {{.To}} be in {{format .Data}}\nGot instead:\n{{.FormattedActual}}").WithTemplateData(hasFailed)
+}
+
+func getPod(ctx context.Context, f *framework.Framework, podName string) (bool, error) {
+	pod, err := e2epod.NewPodClient(f).Get(ctx, podName, metav1.GetOptions{})
+	if err != nil {
+		return false, fmt.Errorf("Expected to get pod %q, got error: %q", podName, err)
+	}
+
+	expectedStatusReason := "UnexpectedAdmissionError"
+	expectedStatusMessage := "Allocate failed due to can't allocate unhealthy devices"
+
+	// Check that the pod failed at admission with the expected reason and message.
+	if pod.Status.Phase != v1.PodFailed {
+		return false, fmt.Errorf("Expected pod to reach phase %q, got final phase %q instead.", v1.PodFailed, pod.Status.Phase)
+	}
+	if pod.Status.Reason != expectedStatusReason {
+		return false, fmt.Errorf("Expected pod status reason to be %q, got %q instead.", expectedStatusReason, pod.Status.Reason)
+	}
+	if !strings.Contains(pod.Status.Message, expectedStatusMessage) {
+		return false, fmt.Errorf("Expected pod status message to contain %q, got %q instead.", expectedStatusMessage, pod.Status.Message)
+	}
+	return true, nil
+}
diff --git a/test/e2e_node/image_list.go b/test/e2e_node/image_list.go
index 611590642bc..e6e7da29766 100644
--- a/test/e2e_node/image_list.go
+++ b/test/e2e_node/image_list.go
@@ -87,11 +87,16 @@ func updateImageAllowList(ctx context.Context) {
 	} else {
 		e2epod.ImagePrePullList.Insert(gpuDevicePluginImage)
 	}
-	if samplePluginImage, err := getSampleDevicePluginImage(); err != nil {
+	if samplePluginImage, err := getContainerImageFromE2ETestDaemonset(SampleDevicePluginDSYAML); err != nil {
 		klog.Errorln(err)
 	} else {
 		e2epod.ImagePrePullList.Insert(samplePluginImage)
 	}
+	if samplePluginImageCtrlReg, err := getContainerImageFromE2ETestDaemonset(SampleDevicePluginControlRegistrationDSYAML); err != nil {
+		klog.Errorln(err)
+	} else {
+		e2epod.ImagePrePullList.Insert(samplePluginImageCtrlReg)
+	}
 }
 
 func getNodeProblemDetectorImage() string {
@@ -222,19 +227,19 @@ func getGPUDevicePluginImage(ctx context.Context) (string, error) {
 	return ds.Spec.Template.Spec.Containers[0].Image, nil
 }
 
-func getSampleDevicePluginImage() (string, error) {
-	data, err := e2etestfiles.Read(SampleDevicePluginDSYAML)
+func getContainerImageFromE2ETestDaemonset(dsYamlPath string) (string, error) {
+	data, err := e2etestfiles.Read(dsYamlPath)
 	if err != nil {
-		return "", fmt.Errorf("failed to read the sample plugin yaml: %w", err)
+		return "", fmt.Errorf("failed to read the daemonset yaml: %w", err)
 	}
 	ds, err := e2emanifest.DaemonSetFromData(data)
 	if err != nil {
-		return "", fmt.Errorf("failed to parse daemon set for sample plugin: %w", err)
+		return "", fmt.Errorf("failed to parse daemonset yaml: %w", err)
 	}
 	if len(ds.Spec.Template.Spec.Containers) < 1 {
-		return "", fmt.Errorf("failed to parse the sample plugin image: cannot extract the container from YAML")
+		return "", fmt.Errorf("failed to parse the container image: cannot extract the container from YAML")
 	}
 	return ds.Spec.Template.Spec.Containers[0].Image, nil
 }
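The e2e helpers added above all follow the same pattern: a probe function returning (bool, error) is polled with gomega.Eventually and its boolean result is checked by a matcher built with gcustom.MakeMatcher. The short sketch below is not part of the patch; it only restates that pattern in isolation, and the names isConditionMet and BeSatisfied are invented here for illustration.

package example

import (
	"context"
	"fmt"

	"github.com/onsi/gomega/gcustom"
	"github.com/onsi/gomega/types"
)

// isConditionMet is a stand-in for probes such as isNodeReadyWithSampleResources:
// it returns (false, error) while the condition does not hold, so Eventually keeps polling.
func isConditionMet(ctx context.Context, want int, observe func(context.Context) int) (bool, error) {
	if got := observe(ctx); got != want {
		return false, fmt.Errorf("expected %d, observed %d", want, got)
	}
	return true, nil
}

// BeSatisfied mirrors matchers like BeInReadyPhase and hasAllocatable: it matches when
// the polled boolean equals the expected value and renders a templated failure message.
func BeSatisfied(expected bool) types.GomegaMatcher {
	return gcustom.MakeMatcher(func(actual bool) (bool, error) {
		return actual == expected, nil
	}).WithTemplate("Expected condition {{.To}} be {{format .Data}}\nGot instead:\n{{.FormattedActual}}").WithTemplateData(expected)
}

A probe written this way can then be wired up exactly as the test does, for example gomega.Eventually(ctx, isConditionMet).WithArguments(2, someObserver).WithTimeout(time.Minute).Should(BeSatisfied(true)).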