From 97f4bcb456eaeec4fb2f78d093281a1ac64a5832 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser <zkaiser@nvidia.com> Date: Mon, 14 Apr 2025 19:09:30 +0000 Subject: [PATCH 1/4] gpu: Remove CDI annotations for outer runtime After the outer runtime has processed the CDI annotation from the spec we can delete them since they were converted into Linux devices in the OCI spec. Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com> --- src/runtime/pkg/device/config/config.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go index d2f6d89288..42bcccff8a 100644 --- a/src/runtime/pkg/device/config/config.go +++ b/src/runtime/pkg/device/config/config.go @@ -697,7 +697,14 @@ func WithCDI(annotations map[string]string, cdiSpecDirs []string, spec *specs.Sp if _, err := registry.InjectDevices(spec, devsFromAnnotations...); err != nil { return nil, fmt.Errorf("CDI device injection failed: %w", err) } - + // Once we injected the device into the ociSpec we do not need to CDI + // device annotation from the outer runtime. The runtime will create the + // appropriate inner runtime CDI annotation dependent on the device. + for key := range spec.Annotations { + if strings.HasPrefix(key, cdi.AnnotationPrefix) { + delete(spec.Annotations, key) + } + } // One crucial thing to keep in mind is that CDI device injection // might add OCI Spec environment variables, hooks, and mounts as // well. Therefore it is important that none of the corresponding From 6713db8990b1fc845ddd94ac31f666ca4749b6e0 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser <zkaiser@nvidia.com> Date: Mon, 14 Apr 2025 19:10:46 +0000 Subject: [PATCH 2/4] gpu: Add CDI parsing for Sandbox as well Extend the CDI parsing for pod_sandbox as well, only single_container was covered properly. Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com> --- src/runtime/pkg/containerd-shim-v2/create.go | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/runtime/pkg/containerd-shim-v2/create.go b/src/runtime/pkg/containerd-shim-v2/create.go index c272c89d4e..ab5afd2a07 100644 --- a/src/runtime/pkg/containerd-shim-v2/create.go +++ b/src/runtime/pkg/containerd-shim-v2/create.go @@ -112,19 +112,16 @@ func create(ctx context.Context, s *service, r *taskAPI.CreateTaskRequest) (*con if s.sandbox != nil { return nil, fmt.Errorf("cannot create another sandbox in sandbox: %s", s.sandbox.ID()) } + // Here we deal with CDI devices that are cold-plugged (k8s) and + // for the single_container (nerdctl, podman, ...) use-case. // We can provide additional directories where to search for // CDI specs if needed. immutable OS's only have specific // directories where applications can write too. For instance /opt/cdi // // _, err = withCDI(ociSpec.Annotations, []string{"/opt/cdi"}, ociSpec) - // - // Only inject CDI devices if single_container we do not want - // CDI devices in the pod_sandbox - if containerType == vc.SingleContainer { - _, err = config.WithCDI(ociSpec.Annotations, []string{}, ociSpec) - if err != nil { - return nil, fmt.Errorf("adding CDI devices failed: %w", err) - } + _, err = config.WithCDI(ociSpec.Annotations, []string{}, ociSpec) + if err != nil { + return nil, fmt.Errorf("adding CDI devices failed: %w", err) } s.config = runtimeConfig From 486244b2923480c4a7ff7f0d9c83e6f5f0d9ff47 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser <zkaiser@nvidia.com> Date: Mon, 14 Apr 2025 19:15:47 +0000 Subject: [PATCH 3/4] gpu: Remove unneeded parsing of CDI devices The addition of CDI devices is now done for single_container and pod_sandbox and pod_container before the devmanager creates the deviceinfos no need for extra parsing. Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com> --- src/runtime/virtcontainers/sandbox.go | 72 --------------------------- 1 file changed, 72 deletions(-) diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index b8b2fb5404..d0b560a922 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -723,79 +723,7 @@ func (s *Sandbox) coldOrHotPlugVFIO(sandboxConfig *SandboxConfig) (bool, error) // for correct number of PCIe root ports. var vhostUserBlkDevices []config.DeviceInfo - //io.katacontainers.pkg.oci.container_type:pod_sandbox - for cnt, container := range sandboxConfig.Containers { - // Do not alter the original spec, we do not want to inject - // CDI devices into the sandbox container, were using the CDI - // devices as additional information to determine the number of - // PCIe root ports to reserve for the hypervisor. - // A single_container type will have the CDI devices injected - // only do this if we're a pod_sandbox type. - if container.Annotations["io.katacontainers.pkg.oci.container_type"] == "pod_sandbox" && container.CustomSpec != nil { - cdiSpec := container.CustomSpec - // We can provide additional directories where to search for - // CDI specs if needed. immutable OS's only have specific - // directories where applications can write too. For instance /opt/cdi - // - // _, err = withCDI(ociSpec.Annotations, []string{"/opt/cdi"}, ociSpec) - // - _, err := config.WithCDI(cdiSpec.Annotations, []string{}, cdiSpec) - if err != nil { - return coldPlugVFIO, fmt.Errorf("adding CDI devices failed: %w", err) - } - - for _, dev := range cdiSpec.Linux.Devices { - isVFIODevice := deviceManager.IsVFIODevice(dev.Path) - if hotPlugVFIO && isVFIODevice { - vfioDev := config.DeviceInfo{ - ColdPlug: true, - ContainerPath: dev.Path, - Port: sandboxConfig.HypervisorConfig.HotPlugVFIO, - DevType: dev.Type, - Major: dev.Major, - Minor: dev.Minor, - } - if dev.FileMode != nil { - vfioDev.FileMode = *dev.FileMode - } - if dev.UID != nil { - vfioDev.UID = *dev.UID - } - if dev.GID != nil { - vfioDev.GID = *dev.GID - } - - vfioDevices = append(vfioDevices, vfioDev) - continue - } - if coldPlugVFIO && isVFIODevice { - vfioDev := config.DeviceInfo{ - ColdPlug: true, - ContainerPath: dev.Path, - Port: sandboxConfig.HypervisorConfig.ColdPlugVFIO, - DevType: dev.Type, - Major: dev.Major, - Minor: dev.Minor, - } - if dev.FileMode != nil { - vfioDev.FileMode = *dev.FileMode - } - if dev.UID != nil { - vfioDev.UID = *dev.UID - } - if dev.GID != nil { - vfioDev.GID = *dev.GID - } - - vfioDevices = append(vfioDevices, vfioDev) - continue - } - } - } - // As stated before the single_container will have the CDI - // devices injected by the runtime. For the pod_container use-case - // see container.go how cold and hot-plug are handled. for dev, device := range container.DeviceInfos { if deviceManager.IsVhostUserBlk(device) { vhostUserBlkDevices = append(vhostUserBlkDevices, device) From 39464352911843c1ceac6eb17068b0907b171229 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser <zkaiser@nvidia.com> Date: Mon, 14 Apr 2025 20:49:27 +0000 Subject: [PATCH 4/4] gpu: Handle VFIO devices with DevicePlugin and CDI We can provide devices during cold-plug with CDI annotation on a Pod level and add per container device information wit the device plugin. Since the sandbox has already attached the VFIO device remove them from consideration and just apply the inner runtime CDI annotation. Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com> --- src/runtime/virtcontainers/container.go | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/runtime/virtcontainers/container.go b/src/runtime/virtcontainers/container.go index 536fbcff13..b736229f5e 100644 --- a/src/runtime/virtcontainers/container.go +++ b/src/runtime/virtcontainers/container.go @@ -842,6 +842,20 @@ func (c *Container) createDevices(contConfig *ContainerConfig) error { coldPlugDevices := []config.DeviceInfo{} for i, vfio := range deviceInfos { + // If device is already attached during sandbox creation, e.g. + // with an CDI annotation, skip it in the container creation and + // only create the proper CDI annotation for the kata-agent + for _, dev := range config.PCIeDevicesPerPort["root-port"] { + if dev.HostPath == vfio.ContainerPath { + c.Logger().Warnf("device %s already attached to the sandbox, skipping", vfio.ContainerPath) + } + } + for _, dev := range config.PCIeDevicesPerPort["switch-port"] { + if dev.HostPath == vfio.ContainerPath { + c.Logger().Warnf("device %s already attached to the sandbox, skipping", vfio.ContainerPath) + } + } + // Only considering VFIO updates for Port and ColdPlug or // HotPlug updates isVFIODevice := deviceManager.IsVFIODevice(vfio.ContainerPath)