From e82fdee20f5d585e1dc2e99bdb7e1e72d40b208e Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Wed, 15 Jan 2025 20:33:42 +0000 Subject: [PATCH 1/3] runtime: Add proper IOMMUFD parsing With newer kernels we have a new backend for VFIO called IOMMUFD. This is a departure from VFIO IOMMU groups, since only one device is associated with an IOMMUFD entry. Signed-off-by: Zvonko Kaiser --- src/runtime/pkg/device/drivers/utils.go | 89 +++++++++++++++++++++++++ src/runtime/pkg/device/drivers/vfio.go | 16 ++++- src/runtime/virtcontainers/qemu.go | 22 ++++-- 3 files changed, 120 insertions(+), 7 deletions(-) diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go index a89ec9b7e0..8e99aad663 100644 --- a/src/runtime/pkg/device/drivers/utils.go +++ b/src/runtime/pkg/device/drivers/utils.go @@ -9,9 +9,12 @@ package drivers import ( "fmt" "os" + "path" "path/filepath" + "regexp" "strconv" "strings" + "syscall" "github.com/kata-containers/kata-containers/src/runtime/pkg/device/api" "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" @@ -157,6 +160,92 @@ func checkIgnorePCIClass(pciClass string, deviceBDF string, bitmask uint64) (boo return false, nil } +func getMajorMinorFromDevPath(devPath string) (uint32, uint32, error) { + fi, err := os.Stat(devPath) + if err != nil { + return 0, 0, err + } + + dev := fi.Sys().(*syscall.Stat_t) + return uint32(dev.Rdev >> 8), uint32(dev.Rdev & 0xff), nil +} + +func extractIndex(devicePath string) (string, error) { + + base := filepath.Base(devicePath) + + const prefix = "vfio" + if !strings.HasPrefix(base, prefix) { + return "0", fmt.Errorf("unexpected device name format: %s", base) + } + return strings.TrimPrefix(base, prefix), nil +} + +func getBdfFromVFIODev(major uint32, minor uint32) (string, error) { + devPath := fmt.Sprintf("/sys/dev/char/%d:%d", major, minor) + realPath, err := filepath.EvalSymlinks(devPath) + if err != nil { + return "", fmt.Errorf("Failed to resolve 
symlink for %s: %v", devPath, err) + } + + bdfRegex := regexp.MustCompile(`([0-9a-fA-F]{4}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}\.[0-9a-fA-F])`) + matches := bdfRegex.FindAllString(realPath, -1) + if len(matches) == 0 { + return "", fmt.Errorf("No BDF found in resolved path: %s", realPath) + } + return matches[len(matches)-1], nil +} + +// GetDeviceFromVFIODev returns the host device associated with the VFIO device +// There is only one device per VFIO device in the case of IOMMUFD +func GetDeviceFromVFIODev(device config.DeviceInfo) ([]*config.VFIODev, error) { + // The way we get the host BDF is by reading the symlink of the char + // device major:minor entries in /sys/dev/char/major:minor + // $ ls -l /dev/vfio/devices/vfio0 + // crw------- 1 root root 237, 0 Jan 15 16:53 /dev/vfio/devices/vfio0 + major, minor, err := getMajorMinorFromDevPath(device.HostPath) + if err != nil { + return nil, fmt.Errorf("Failed to get major:minor from %s: %v", device.HostPath, err) + } + // $ ls -l /sys/dev/char/237:0 + // /sys/dev/char/237:0 -> ../../devices/pci0000:64/0000:64:00.0/0000:65:00.0/vfio-dev/vfio0 + deviceBDF, err := getBdfFromVFIODev(major, minor) + if err != nil { + return nil, err + } + + deviceSysfsDev := path.Join(config.SysBusPciDevicesPath, deviceBDF) + vfioDeviceType, err := GetVFIODeviceType(deviceSysfsDev) + if err != nil { + return nil, err + } + + vendorID := getPCIDeviceProperty(deviceBDF, PCISysFsDevicesVendor) + deviceID := getPCIDeviceProperty(deviceBDF, PCISysFsDevicesDevice) + pciClass := getPCIDeviceProperty(deviceBDF, PCISysFsDevicesClass) + + id, err := extractIndex(device.HostPath) + if err != nil { + return nil, err + } + + vfio := config.VFIODev{ + ID: id, + Type: vfioDeviceType, + BDF: deviceBDF, + SysfsDev: deviceSysfsDev, + IsPCIe: IsPCIeDevice(deviceBDF), + Class: pciClass, + VendorID: vendorID, + DeviceID: deviceID, + Port: device.Port, + HostPath: device.HostPath, + } + vfioDevs := []*config.VFIODev{&vfio} + + return vfioDevs, nil +} + // 
GetAllVFIODevicesFromIOMMUGroup returns all the VFIO devices in the IOMMU group // We can reuse this function at various levels, sandbox, container. func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODev, error) { diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go index f315fddaf0..9e4df43f6d 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -28,6 +28,7 @@ const ( iommuGroupPath = "/sys/bus/pci/devices/%s/iommu_group" vfioDevPath = "/dev/vfio/%s" vfioAPSysfsDir = "/sys/devices/vfio_ap" + IommufdDevPath = "/dev/vfio/devices" ) // VFIODevice is a vfio device meant to be passed to the hypervisor @@ -64,9 +65,18 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece } }() - device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo) - if err != nil { - return err + // This works for IOMMUFD enabled kernels > 6.x + // In the case of IOMMUFD the device.HostPath will look like + // /dev/vfio/devices/vfio0 + // (1) Check if we have the new IOMMUFD or old container based VFIO + if strings.HasPrefix(device.DeviceInfo.HostPath, IommufdDevPath) { + device.VfioDevs, err = GetDeviceFromVFIODev(*device.DeviceInfo) + } else { + // Once we have + device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo) + if err != nil { + return err + } } for _, vfio := range device.VfioDevs { diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index ba86c3d63a..22f22ab8c9 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -792,11 +792,25 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig return fmt.Errorf("Cannot get host path for device: %v err: %v", dev, err) } - devicesPerIOMMUGroup, err := drivers.GetAllVFIODevicesFromIOMMUGroup(dev) - if err != nil { - return fmt.Errorf("Cannot get all VFIO devices from IOMMU 
group with device: %v err: %v", dev, err) + vfioDevices := []*config.VFIODev{} + // This works for IOMMUFD enabled kernels > 6.x + // In the case of IOMMUFD the device.HostPath will look like + // /dev/vfio/devices/vfio0 + // (1) Check if we have the new IOMMUFD or old container based VFIO + if strings.HasPrefix(dev.HostPath, drivers.IommufdDevPath) { + q.Logger().Infof("### IOMMUFD Path: %s", dev.HostPath) + vfioDevices, err = drivers.GetDeviceFromVFIODev(dev) + if err != nil { + return fmt.Errorf("Cannot get VFIO device from IOMMUFD with device: %v err: %v", dev, err) + } + } else { + vfioDevices, err = drivers.GetAllVFIODevicesFromIOMMUGroup(dev) + if err != nil { + return fmt.Errorf("Cannot get all VFIO devices from IOMMU group with device: %v err: %v", dev, err) + } } - for _, vfioDevice := range devicesPerIOMMUGroup { + + for _, vfioDevice := range vfioDevices { if drivers.IsPCIeDevice(vfioDevice.BDF) { numOfPluggablePorts = numOfPluggablePorts + 1 } From 9add63325842e2985c266a66fdb83b8f76c98973 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Wed, 15 Jan 2025 21:16:52 +0000 Subject: [PATCH 2/3] qemu: Add command line for IOMMUFD For each IOMMUFD device, create an object and assign it to the device. We need additional information, which is now populated correctly, to decide whether we run the old VFIO backend or the new one. 
Signed-off-by: Zvonko Kaiser --- src/runtime/pkg/device/drivers/utils.go | 2 +- src/runtime/pkg/device/drivers/vfio.go | 3 +++ src/runtime/pkg/govmm/qemu/qemu.go | 11 +++++++++++ src/runtime/virtcontainers/qemu.go | 2 +- src/runtime/virtcontainers/qemu_arch_base.go | 2 ++ 5 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go index 8e99aad663..0117610eb6 100644 --- a/src/runtime/pkg/device/drivers/utils.go +++ b/src/runtime/pkg/device/drivers/utils.go @@ -233,7 +233,7 @@ func GetDeviceFromVFIODev(device config.DeviceInfo) ([]*config.VFIODev, error) { ID: id, Type: vfioDeviceType, BDF: deviceBDF, - SysfsDev: deviceSysfsDev, + SysfsDev: device.HostPath, IsPCIe: IsPCIeDevice(deviceBDF), Class: pciClass, VendorID: vendorID, diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go index 9e4df43f6d..77f53a16a2 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -71,6 +71,9 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece // (1) Check if we have the new IOMMUFD or old container based VFIO if strings.HasPrefix(device.DeviceInfo.HostPath, IommufdDevPath) { device.VfioDevs, err = GetDeviceFromVFIODev(*device.DeviceInfo) + if err != nil { + return err + } } else { // Once we have device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo) diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go index e1070b7319..be3a842322 100644 --- a/src/runtime/pkg/govmm/qemu/qemu.go +++ b/src/runtime/pkg/govmm/qemu/qemu.go @@ -25,6 +25,8 @@ import ( "strconv" "strings" "syscall" + + "github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers" ) // Machine describes the machine type qemu will emulate. 
@@ -1869,6 +1871,9 @@ func (b PCIeSwitchDownstreamPortDevice) Valid() bool { // VFIODevice represents a qemu vfio device meant for direct access by guest OS. type VFIODevice struct { + // ID index of the vfio device in devfs or sysfs used for IOMMUFD + ID string + // Bus-Device-Function of device BDF string @@ -1946,6 +1951,12 @@ func (vfioDev VFIODevice) QemuParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("devno=%s", vfioDev.DevNo)) } + if strings.HasPrefix(vfioDev.SysfsDev, drivers.IommufdDevPath) { + qemuParams = append(qemuParams, "-object") + qemuParams = append(qemuParams, fmt.Sprintf("iommufd,id=iommufd%s", vfioDev.ID)) + deviceParams = append(deviceParams, fmt.Sprintf("iommufd=iommufd%s", vfioDev.ID)) + } + qemuParams = append(qemuParams, "-device") qemuParams = append(qemuParams, strings.Join(deviceParams, ",")) diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 22f22ab8c9..73b6df2e49 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -792,7 +792,7 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig return fmt.Errorf("Cannot get host path for device: %v err: %v", dev, err) } - vfioDevices := []*config.VFIODev{} + var vfioDevices []*config.VFIODev // This works for IOMMUFD enabled kernels > 6.x // In the case of IOMMUFD the device.HostPath will look like // /dev/vfio/devices/vfio0 diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go index fd92be7724..503d1b0608 100644 --- a/src/runtime/virtcontainers/qemu_arch_base.go +++ b/src/runtime/virtcontainers/qemu_arch_base.go @@ -708,10 +708,12 @@ func (q *qemuArchBase) appendVFIODevice(devices []govmmQemu.Device, vfioDev conf devices = append(devices, govmmQemu.VFIODevice{ + ID: vfioDev.ID, BDF: vfioDev.BDF, VendorID: vfioDev.VendorID, DeviceID: vfioDev.DeviceID, Bus: vfioDev.Bus, + SysfsDev: vfioDev.SysfsDev, }, ) 
From 7cca2c492574466f09b09ddbaefb513b27880ecc Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Thu, 20 Feb 2025 16:51:49 +0000 Subject: [PATCH 3/3] gpu: Use a dedicated VFIO group vs iommufd entry We do not want to abuse the sysfs entry; let's use a dedicated devfs entry instead. Signed-off-by: Zvonko Kaiser --- src/runtime/pkg/device/config/config.go | 3 +++ src/runtime/pkg/device/drivers/utils.go | 3 ++- src/runtime/pkg/govmm/qemu/qemu.go | 5 ++++- src/runtime/virtcontainers/qemu_arch_base.go | 1 + 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go index a179896537..d2f6d89288 100644 --- a/src/runtime/pkg/device/config/config.go +++ b/src/runtime/pkg/device/config/config.go @@ -396,6 +396,9 @@ type VFIODev struct { // sysfsdev of VFIO mediated device SysfsDev string + // DevfsDev is used to identify a VFIO Group device or IOMMUFD VFIO device + DevfsDev string + // VendorID specifies vendor id VendorID string diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go index 0117610eb6..cfb1d7c1ad 100644 --- a/src/runtime/pkg/device/drivers/utils.go +++ b/src/runtime/pkg/device/drivers/utils.go @@ -233,7 +233,8 @@ func GetDeviceFromVFIODev(device config.DeviceInfo) ([]*config.VFIODev, error) { ID: id, Type: vfioDeviceType, BDF: deviceBDF, - SysfsDev: device.HostPath, + SysfsDev: deviceSysfsDev, + DevfsDev: device.HostPath, IsPCIe: IsPCIeDevice(deviceBDF), Class: pciClass, VendorID: vendorID, diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go index be3a842322..fda8c72483 100644 --- a/src/runtime/pkg/govmm/qemu/qemu.go +++ b/src/runtime/pkg/govmm/qemu/qemu.go @@ -1897,6 +1897,9 @@ type VFIODevice struct { // SysfsDev specifies the sysfs matrix entry for the AP device SysfsDev string + + // DevfsDev is used to identify a VFIO Group device or IOMMUFD VFIO device + DevfsDev string } // VFIODeviceTransport is a map of 
the vfio device name that corresponds to @@ -1951,7 +1954,7 @@ func (vfioDev VFIODevice) QemuParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("devno=%s", vfioDev.DevNo)) } - if strings.HasPrefix(vfioDev.SysfsDev, drivers.IommufdDevPath) { + if strings.HasPrefix(vfioDev.DevfsDev, drivers.IommufdDevPath) { qemuParams = append(qemuParams, "-object") qemuParams = append(qemuParams, fmt.Sprintf("iommufd,id=iommufd%s", vfioDev.ID)) deviceParams = append(deviceParams, fmt.Sprintf("iommufd=iommufd%s", vfioDev.ID)) diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go index 503d1b0608..31c55209fa 100644 --- a/src/runtime/virtcontainers/qemu_arch_base.go +++ b/src/runtime/virtcontainers/qemu_arch_base.go @@ -714,6 +714,7 @@ func (q *qemuArchBase) appendVFIODevice(devices []govmmQemu.Device, vfioDev conf DeviceID: vfioDev.DeviceID, Bus: vfioDev.Bus, SysfsDev: vfioDev.SysfsDev, + DevfsDev: vfioDev.DevfsDev, }, )