From 4c93bb2d61a3ad3a1d69791180a82288fd39d465 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Wed, 13 Mar 2024 12:44:09 +0000 Subject: [PATCH] qemu: Add CDI device handling for any container type We need special handling for pod_sandbox, pod_container and single_container how and when to inject CDI devices Signed-off-by: Zvonko Kaiser --- src/runtime/pkg/containerd-shim-v2/create.go | 69 +------- src/runtime/pkg/device/config/config.go | 59 +++++++ src/runtime/pkg/katautils/config.go | 2 +- src/runtime/virtcontainers/container.go | 74 +++++--- src/runtime/virtcontainers/sandbox.go | 177 +++++++++++++------ 5 files changed, 234 insertions(+), 147 deletions(-) diff --git a/src/runtime/pkg/containerd-shim-v2/create.go b/src/runtime/pkg/containerd-shim-v2/create.go index 1c3bf81c1..d0c5634a9 100644 --- a/src/runtime/pkg/containerd-shim-v2/create.go +++ b/src/runtime/pkg/containerd-shim-v2/create.go @@ -19,11 +19,11 @@ import ( "strings" "syscall" - "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" taskAPI "github.com/containerd/containerd/api/runtime/task/v2" containerd_types "github.com/containerd/containerd/api/types" "github.com/containerd/containerd/mount" "github.com/containerd/typeurl/v2" + "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" "github.com/kata-containers/kata-containers/src/runtime/pkg/utils" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations" @@ -74,63 +74,6 @@ func copyLayersToMounts(rootFs *vc.RootFs, spec *specs.Spec) error { return nil } -// CDI (Container Device Interface), is a specification, for container- runtimes, -// to support third-party devices. -// It introduces an abstract notion of a device as a resource. Such devices are -// uniquely specified by a fully-qualified name that is constructed from a -// vendor ID, a device class, and a name that is unique per vendor ID-device -// class pair. -// -// vendor.com/class=unique_name -// -// The combination of vendor ID and device class (vendor.com/class in the -// above example) is referred to as the device kind. -// CDI concerns itself only with enabling containers to be device aware. -// Areas like resource management are explicitly left out of CDI (and are -// expected to be handled by the orchestrator). Because of this focus, the CDI -// specification is simple to implement and allows great flexibility for -// runtimes and orchestrators. -func withCDI(annotations map[string]string, cdiSpecDirs []string, spec *specs.Spec) (*specs.Spec, error) { - // Add devices from CDI annotations - _, devsFromAnnotations, err := cdi.ParseAnnotations(annotations) - if err != nil { - return nil, fmt.Errorf("failed to parse CDI device annotations: %w", err) - } - - if len(devsFromAnnotations) == 0 { - // No devices found, skip device injection - return spec, nil - } - - var registry cdi.Registry - if len(cdiSpecDirs) > 0 { - // We can override the directories where to search for CDI specs - // if needed, the default is /etc/cdi /var/run/cdi - registry = cdi.GetRegistry(cdi.WithSpecDirs(cdiSpecDirs...)) - } else { - registry = cdi.GetRegistry() - } - - if err = registry.Refresh(); err != nil { - // We don't consider registry refresh failure a fatal error. - // For instance, a dynamically generated invalid CDI Spec file for - // any particular vendor shouldn't prevent injection of devices of - // different vendors. CDI itself knows better and it will fail the - // injection if necessary. - return nil, fmt.Errorf("CDI registry refresh failed: %w", err) - } - - if _, err := registry.InjectDevices(spec, devsFromAnnotations...); err != nil { - return nil, fmt.Errorf("CDI device injection failed: %w", err) - } - - // One crucial thing to keep in mind is that CDI device injection - // might add OCI Spec environment variables, hooks, and mounts as - // well. Therefore it is important that none of the corresponding - // OCI Spec fields are reset up in the call stack once we return. - return spec, nil -} - func create(ctx context.Context, s *service, r *taskAPI.CreateTaskRequest) (*container, error) { rootFs := vc.RootFs{} if len(r.Rootfs) == 1 { @@ -175,9 +118,13 @@ func create(ctx context.Context, s *service, r *taskAPI.CreateTaskRequest) (*con // // _, err = withCDI(ociSpec.Annotations, []string{"/opt/cdi"}, ociSpec) // - _, err = withCDI(ociSpec.Annotations, []string{}, ociSpec) - if err != nil { - return nil, fmt.Errorf("adding CDI devices failed") + // Only inject CDI devices if single_container we do not want + // CDI devices in the pod_sandbox + if containerType == vc.SingleContainer { + _, err = config.WithCDI(ociSpec.Annotations, []string{}, ociSpec) + if err != nil { + return nil, fmt.Errorf("adding CDI devices failed") + } } s.config = runtimeConfig diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go index 70f7a6932..a17989653 100644 --- a/src/runtime/pkg/device/config/config.go +++ b/src/runtime/pkg/device/config/config.go @@ -13,8 +13,10 @@ import ( "strconv" "strings" + "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" "github.com/go-ini/ini" vcTypes "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" + "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" ) @@ -642,3 +644,60 @@ type DeviceState struct { // or hot plugged (false). ColdPlug bool } + +// CDI (Container Device Interface), is a specification, for container- runtimes, +// to support third-party devices. +// It introduces an abstract notion of a device as a resource. Such devices are +// uniquely specified by a fully-qualified name that is constructed from a +// vendor ID, a device class, and a name that is unique per vendor ID-device +// class pair. +// +// vendor.com/class=unique_name +// +// The combination of vendor ID and device class (vendor.com/class in the +// above example) is referred to as the device kind. +// CDI concerns itself only with enabling containers to be device aware. +// Areas like resource management are explicitly left out of CDI (and are +// expected to be handled by the orchestrator). Because of this focus, the CDI +// specification is simple to implement and allows great flexibility for +// runtimes and orchestrators. +func WithCDI(annotations map[string]string, cdiSpecDirs []string, spec *specs.Spec) (*specs.Spec, error) { + // Add devices from CDI annotations + _, devsFromAnnotations, err := cdi.ParseAnnotations(annotations) + if err != nil { + return nil, fmt.Errorf("failed to parse CDI device annotations: %w", err) + } + + if len(devsFromAnnotations) == 0 { + // No devices found, skip device injection + return spec, nil + } + + var registry cdi.Registry + if len(cdiSpecDirs) > 0 { + // We can override the directories where to search for CDI specs + // if needed, the default is /etc/cdi /var/run/cdi + registry = cdi.GetRegistry(cdi.WithSpecDirs(cdiSpecDirs...)) + } else { + registry = cdi.GetRegistry() + } + + if err = registry.Refresh(); err != nil { + // We don't consider registry refresh failure a fatal error. + // For instance, a dynamically generated invalid CDI Spec file for + // any particular vendor shouldn't prevent injection of devices of + // different vendors. CDI itself knows better and it will fail the + // injection if necessary. + return nil, fmt.Errorf("CDI registry refresh failed: %w", err) + } + + if _, err := registry.InjectDevices(spec, devsFromAnnotations...); err != nil { + return nil, fmt.Errorf("CDI device injection failed: %w", err) + } + + // One crucial thing to keep in mind is that CDI device injection + // might add OCI Spec environment variables, hooks, and mounts as + // well. Therefore it is important that none of the corresponding + // OCI Spec fields are reset up in the call stack once we return. + return spec, nil +} diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index b9e460ae5..b095306e9 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -57,7 +57,7 @@ const ( // the maximum amount of PCI bridges that can be cold plugged in a VM maxPCIBridges uint32 = 5 - // For mor info why these values, see: + // For more info on why these values were chosen, see: // https://github.com/kata-containers/kata-containers/blob/main/docs/design/kata-vra.md#hypervisor-resource-limits maxPCIeRootPorts uint32 = 16 maxPCIeSwitchPorts uint32 = 16 diff --git a/src/runtime/virtcontainers/container.go b/src/runtime/virtcontainers/container.go index 042971b70..0a5e702b8 100644 --- a/src/runtime/virtcontainers/container.go +++ b/src/runtime/virtcontainers/container.go @@ -837,9 +837,9 @@ func (c *Container) createDevices(contConfig *ContainerConfig) error { // Aggregate all the containner devices for hot-plug and use them to dedcue // the correct amount of ports to reserve for the hypervisor. hotPlugVFIO := (c.sandbox.config.HypervisorConfig.HotPlugVFIO != config.NoPort) - modeIsVFIO := (c.sandbox.config.VfioMode == config.VFIOModeVFIO) - updatedDeviceInfos := []config.DeviceInfo{} + hotPlugDevices := []config.DeviceInfo{} + coldPlugDevices := []config.DeviceInfo{} for i, vfio := range deviceInfos { // Only considering VFIO updates for Port and ColdPlug or @@ -848,28 +848,30 @@ func (c *Container) createDevices(contConfig *ContainerConfig) error { if hotPlugVFIO && isVFIODevice { deviceInfos[i].ColdPlug = false deviceInfos[i].Port = c.sandbox.config.HypervisorConfig.HotPlugVFIO + hotPlugDevices = append(hotPlugDevices, deviceInfos[i]) + continue } // Device is already cold-plugged at sandbox creation time // ignore it for the container creation if coldPlugVFIO && isVFIODevice { + coldPlugDevices = append(coldPlugDevices, deviceInfos[i]) continue } - - updatedDeviceInfos = append(updatedDeviceInfos, deviceInfos[i]) + hotPlugDevices = append(hotPlugDevices, deviceInfos[i]) } + // If modeVFIO is enabled we need 1st to attach the VFIO control group // device /dev/vfio/vfio an 2nd the actuall device(s) afterwards. // Sort the devices starting with device #1 being the VFIO control group // device and the next the actuall device(s) /dev/vfio/ - if modeIsVFIO { - deviceInfos = sortContainerVFIODevices(updatedDeviceInfos) - } + deviceInfos = sortContainerVFIODevices(hotPlugDevices) for _, info := range deviceInfos { dev, err := c.sandbox.devManager.NewDevice(info) if err != nil { return err } + storedDevices = append(storedDevices, ContainerDevice{ ID: dev.DeviceID(), ContainerPath: info.ContainerPath, @@ -879,6 +881,11 @@ func (c *Container) createDevices(contConfig *ContainerConfig) error { }) } c.devices = filterDevices(c, storedDevices) + + // If we're hot-plugging this will be a no-op because at this stage + // no devices are attached to the root-port or switch-port + c.annotateContainerWithVFIOMetadata(coldPlugDevices) + return nil } @@ -936,24 +943,24 @@ func sortContainerVFIODevices(devices []config.DeviceInfo) []config.DeviceInfo { return vfioDevices } +type DeviceRelation struct { + Bus string + Path string + Index int +} + // Depending on the HW we might need to inject metadata into the container // In this case for the NV GPU we need to provide the correct mapping from // VFIO- to GPU index inside of the VM when vfio_mode="guest-kernel", // otherwise we do not know which GPU is which. -func (c *Container) annotateContainerWithVFIOMetadata() { - - type relation struct { - Bus string - Path string - Index int - } +func (c *Container) annotateContainerWithVFIOMetadata(devices interface{}) { modeIsGK := (c.sandbox.config.VfioMode == config.VFIOModeGuestKernel) if modeIsGK { // Hot plug is done let's update meta information about the // hot plugged devices especially VFIO devices in modeIsGK - siblings := make([]relation, 0) + siblings := make([]DeviceRelation, 0) // In the sandbox we first create the root-ports and secondly // the switch-ports. The range over map is not deterministic // so lets first iterate over all root-port devices and then @@ -961,13 +968,13 @@ func (c *Container) annotateContainerWithVFIOMetadata() { for _, dev := range config.PCIeDevicesPerPort["root-port"] { // For the NV GPU we need special handling let's use only those if dev.VendorID == "0x10de" && strings.Contains(dev.Class, "0x030") { - siblings = append(siblings, relation{Bus: dev.Bus, Path: dev.HostPath}) + siblings = append(siblings, DeviceRelation{Bus: dev.Bus, Path: dev.HostPath}) } } for _, dev := range config.PCIeDevicesPerPort["switch-port"] { // For the NV GPU we need special handling let's use only those if dev.VendorID == "0x10de" && strings.Contains(dev.Class, "0x030") { - siblings = append(siblings, relation{Bus: dev.Bus, Path: dev.HostPath}) + siblings = append(siblings, DeviceRelation{Bus: dev.Bus, Path: dev.HostPath}) } } // We need to sort the VFIO devices by bus to get the correct @@ -979,20 +986,33 @@ func (c *Container) annotateContainerWithVFIOMetadata() { for i := range siblings { siblings[i].Index = i } + // Now that we have the index lets connect the /dev/vfio/ // to the correct index - for _, dev := range c.devices { - for _, bdf := range siblings { - if bdf.Path == dev.ContainerPath { - vfioNum := filepath.Base(dev.ContainerPath) - annoKey := fmt.Sprintf("cdi.k8s.io/vfio%s", vfioNum) - annoValue := fmt.Sprintf("nvidia.com/gpu=%d", bdf.Index) - c.config.CustomSpec.Annotations[annoKey] = annoValue - c.Logger().Infof("Annotated container with %s: %s", annoKey, annoValue) - } + if devices, ok := devices.([]ContainerDevice); ok { + for _, dev := range devices { + c.siblingAnnotation(dev.ContainerPath, siblings) } } + if devices, ok := devices.([]config.DeviceInfo); ok { + for _, dev := range devices { + c.siblingAnnotation(dev.ContainerPath, siblings) + } + + } + + } +} +func (c *Container) siblingAnnotation(devPath string, siblings []DeviceRelation) { + for _, sibling := range siblings { + if sibling.Path == devPath { + vfioNum := filepath.Base(devPath) + annoKey := fmt.Sprintf("cdi.k8s.io/vfio%s", vfioNum) + annoValue := fmt.Sprintf("nvidia.com/gpu=%d", sibling.Index) + c.config.CustomSpec.Annotations[annoKey] = annoValue + c.Logger().Infof("annotated container with %s: %s", annoKey, annoValue) + } } } @@ -1022,7 +1042,7 @@ func (c *Container) create(ctx context.Context) (err error) { return } - c.annotateContainerWithVFIOMetadata() + c.annotateContainerWithVFIOMetadata(c.devices) // Deduce additional system mount info that should be handled by the agent // inside the VM diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 4a9d9e698..b58daccaa 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -672,64 +672,6 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor return s, nil } -func (s *Sandbox) coldOrHotPlugVFIO(sandboxConfig *SandboxConfig) (bool, error) { - // If we have a confidential guest we need to cold-plug the PCIe VFIO devices - // until we have TDISP/IDE PCIe support. - coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != config.NoPort) - // Aggregate all the containner devices for hot-plug and use them to dedcue - // the correct amount of ports to reserve for the hypervisor. - hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort) - - modeIsGK := (sandboxConfig.VfioMode == config.VFIOModeGuestKernel) - // modeIsVFIO is needed at the container level not the sandbox level. - // modeIsVFIO := (sandboxConfig.VfioMode == config.VFIOModeVFIO) - - var vfioDevices []config.DeviceInfo - // vhost-user-block device is a PCIe device in Virt, keep track of it - // for correct number of PCIe root ports. - var vhostUserBlkDevices []config.DeviceInfo - - for cnt, containers := range sandboxConfig.Containers { - for dev, device := range containers.DeviceInfos { - - if deviceManager.IsVhostUserBlk(device) { - vhostUserBlkDevices = append(vhostUserBlkDevices, device) - continue - } - isVFIODevice := deviceManager.IsVFIODevice(device.ContainerPath) - if hotPlugVFIO && isVFIODevice { - device.ColdPlug = false - device.Port = sandboxConfig.HypervisorConfig.HotPlugVFIO - vfioDevices = append(vfioDevices, device) - sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.HotPlugVFIO - } - if coldPlugVFIO && isVFIODevice { - device.ColdPlug = true - device.Port = sandboxConfig.HypervisorConfig.ColdPlugVFIO - vfioDevices = append(vfioDevices, device) - // We need to remove the devices marked for cold-plug - // otherwise at the container level the kata-agent - // will try to hot-plug them. - if modeIsGK { - sandboxConfig.Containers[cnt].DeviceInfos[dev].ID = "remove-we-are-cold-plugging" - } - } - } - var filteredDevices []config.DeviceInfo - for _, device := range containers.DeviceInfos { - if device.ID != "remove-we-are-cold-plugging" { - filteredDevices = append(filteredDevices, device) - } - } - sandboxConfig.Containers[cnt].DeviceInfos = filteredDevices - } - - sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices - sandboxConfig.HypervisorConfig.VhostUserBlkDevices = vhostUserBlkDevices - - return coldPlugVFIO, nil -} - func setHypervisorConfigAnnotations(sandboxConfig *SandboxConfig) { if len(sandboxConfig.Containers) > 0 { // These values are required by remote hypervisor @@ -747,6 +689,125 @@ func setHypervisorConfigAnnotations(sandboxConfig *SandboxConfig) { } } +func (s *Sandbox) coldOrHotPlugVFIO(sandboxConfig *SandboxConfig) (bool, error) { + // If we have a confidential guest we need to cold-plug the PCIe VFIO devices + // until we have TDISP/IDE PCIe support. + coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != config.NoPort) + // Aggregate all the containner devices for hot-plug and use them to dedcue + // the correct amount of ports to reserve for the hypervisor. + hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort) + + //modeIsGK := (sandboxConfig.VfioMode == config.VFIOModeGuestKernel) + // modeIsVFIO is needed at the container level not the sandbox level. + // modeIsVFIO := (sandboxConfig.VfioMode == config.VFIOModeVFIO) + + var vfioDevices []config.DeviceInfo + // vhost-user-block device is a PCIe device in Virt, keep track of it + // for correct number of PCIe root ports. + var vhostUserBlkDevices []config.DeviceInfo + + //io.katacontainers.pkg.oci.container_type:pod_sandbox + + for cnt, container := range sandboxConfig.Containers { + // Do not alter the original spec, we do not want to inject + // CDI devices into the sandbox container, were using the CDI + // devices as additional information to determine the number of + // PCIe root ports to reserve for the hypervisor. + // A single_container type will have the CDI devices injected + // only do this if we're a pod_sandbox type. + if container.Annotations["io.katacontainers.pkg.oci.container_type"] == "pod_sandbox" && container.CustomSpec != nil { + cdiSpec := container.CustomSpec + // We can provide additional directories where to search for + // CDI specs if needed. immutable OS's only have specific + // directories where applications can write too. For instance /opt/cdi + // + // _, err = withCDI(ociSpec.Annotations, []string{"/opt/cdi"}, ociSpec) + // + _, err := config.WithCDI(cdiSpec.Annotations, []string{}, cdiSpec) + if err != nil { + return coldPlugVFIO, fmt.Errorf("adding CDI devices failed") + } + + for _, dev := range cdiSpec.Linux.Devices { + isVFIODevice := deviceManager.IsVFIODevice(dev.Path) + if hotPlugVFIO && isVFIODevice { + vfioDev := config.DeviceInfo{ + ColdPlug: true, + ContainerPath: dev.Path, + Port: sandboxConfig.HypervisorConfig.HotPlugVFIO, + DevType: dev.Type, + Major: dev.Major, + Minor: dev.Minor, + } + if dev.FileMode != nil { + vfioDev.FileMode = *dev.FileMode + } + if dev.UID != nil { + vfioDev.UID = *dev.UID + } + if dev.GID != nil { + vfioDev.GID = *dev.GID + } + + vfioDevices = append(vfioDevices, vfioDev) + continue + } + if coldPlugVFIO && isVFIODevice { + vfioDev := config.DeviceInfo{ + ColdPlug: true, + ContainerPath: dev.Path, + Port: sandboxConfig.HypervisorConfig.ColdPlugVFIO, + DevType: dev.Type, + Major: dev.Major, + Minor: dev.Minor, + } + if dev.FileMode != nil { + vfioDev.FileMode = *dev.FileMode + } + if dev.UID != nil { + vfioDev.UID = *dev.UID + } + if dev.GID != nil { + vfioDev.GID = *dev.GID + } + + vfioDevices = append(vfioDevices, vfioDev) + continue + } + } + } + // As stated before the single_container will have the CDI + // devices injected by the runtime. For the pod_container use-case + // see container.go how cold and hot-plug are handled. + for dev, device := range container.DeviceInfos { + if deviceManager.IsVhostUserBlk(device) { + vhostUserBlkDevices = append(vhostUserBlkDevices, device) + continue + } + isVFIODevice := deviceManager.IsVFIODevice(device.ContainerPath) + if hotPlugVFIO && isVFIODevice { + device.ColdPlug = false + device.Port = sandboxConfig.HypervisorConfig.HotPlugVFIO + vfioDevices = append(vfioDevices, device) + sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.HotPlugVFIO + continue + } + if coldPlugVFIO && isVFIODevice { + device.ColdPlug = true + device.Port = sandboxConfig.HypervisorConfig.ColdPlugVFIO + vfioDevices = append(vfioDevices, device) + sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.ColdPlugVFIO + continue + } + } + } + + sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices + sandboxConfig.HypervisorConfig.VhostUserBlkDevices = vhostUserBlkDevices + + return coldPlugVFIO, nil +} + func (s *Sandbox) createResourceController() error { var err error cgroupPath := ""