diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go
index a17989653..d2f6d8928 100644
--- a/src/runtime/pkg/device/config/config.go
+++ b/src/runtime/pkg/device/config/config.go
@@ -396,6 +396,9 @@ type VFIODev struct {
 	// sysfsdev of VFIO mediated device
 	SysfsDev string
 
+	// DevfsDev is used to identify a VFIO Group device or IOMMUFD VFIO device
+	DevfsDev string
+
 	// VendorID specifies vendor id
 	VendorID string
 
diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go
index a89ec9b7e..cfb1d7c1a 100644
--- a/src/runtime/pkg/device/drivers/utils.go
+++ b/src/runtime/pkg/device/drivers/utils.go
@@ -9,9 +9,12 @@ package drivers
 import (
 	"fmt"
 	"os"
+	"path"
 	"path/filepath"
+	"regexp"
 	"strconv"
 	"strings"
+	"syscall"
 
 	"github.com/kata-containers/kata-containers/src/runtime/pkg/device/api"
 	"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
@@ -157,6 +160,126 @@ func checkIgnorePCIClass(pciClass string, deviceBDF string, bitmask uint64) (boo
 	return false, nil
 }
 
+// bdfRegex matches a PCI address in domain:bus:device.function notation,
+// e.g. 0000:65:00.0. It is compiled once at package scope instead of on
+// every lookup.
+var bdfRegex = regexp.MustCompile(`([0-9a-fA-F]{4}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}\.[0-9a-fA-F])`)
+
+// getMajorMinorFromDevPath returns the major and minor numbers of the device
+// node at devPath.
+func getMajorMinorFromDevPath(devPath string) (uint32, uint32, error) {
+	fi, err := os.Stat(devPath)
+	if err != nil {
+		return 0, 0, err
+	}
+
+	// Use the checked form of the type assertion so that an unexpected
+	// Sys() value is reported as an error instead of panicking.
+	st, ok := fi.Sys().(*syscall.Stat_t)
+	if !ok {
+		return 0, 0, fmt.Errorf("unexpected stat type for %s", devPath)
+	}
+
+	// dev_t does not store the two numbers contiguously: the major lives in
+	// bits 8-19 and 44-63, the minor in bits 0-7 and 20-43. A plain ">> 8"
+	// and "& 0xff" is only correct while the minor is below 256.
+	rdev := uint64(st.Rdev) //nolint:unconvert // Rdev width is architecture dependent
+	major := uint32((rdev&0x00000000000fff00)>>8) | uint32((rdev&0xfffff00000000000)>>32)
+	minor := uint32(rdev&0x00000000000000ff) | uint32((rdev&0x00000ffffff00000)>>12)
+
+	return major, minor, nil
+}
+
+// extractIndex returns the numeric suffix of a VFIO device node name, e.g.
+// "0" for /dev/vfio/devices/vfio0.
+func extractIndex(devicePath string) (string, error) {
+	const prefix = "vfio"
+
+	base := filepath.Base(devicePath)
+	if !strings.HasPrefix(base, prefix) {
+		return "", fmt.Errorf("unexpected device name format: %s", base)
+	}
+
+	// The index becomes part of the QEMU iommufd object id, so reject names
+	// such as "vfio" or "vfio-foo" instead of passing them along.
+	index := strings.TrimPrefix(base, prefix)
+	if _, err := strconv.ParseUint(index, 10, 32); err != nil {
+		return "", fmt.Errorf("unexpected device name format: %s", base)
+	}
+
+	return index, nil
+}
+
+// getBdfFromVFIODev resolves the /sys/dev/char/<major>:<minor> symlink of a
+// VFIO character device and returns the last PCI address contained in the
+// resolved path.
+func getBdfFromVFIODev(major uint32, minor uint32) (string, error) {
+	devPath := fmt.Sprintf("/sys/dev/char/%d:%d", major, minor)
+	realPath, err := filepath.EvalSymlinks(devPath)
+	if err != nil {
+		return "", fmt.Errorf("Failed to resolve symlink for %s: %w", devPath, err)
+	}
+
+	// The resolved path also names the bridges above the device; the device
+	// itself is the last address in it.
+	matches := bdfRegex.FindAllString(realPath, -1)
+	if len(matches) == 0 {
+		return "", fmt.Errorf("No BDF found in resolved path: %s", realPath)
+	}
+	return matches[len(matches)-1], nil
+}
+
+// GetDeviceFromVFIODev returns the host device associated with the VFIO device.
+// There is only one device per VFIO device in the case of IOMMUFD, so the
+// returned slice holds exactly one entry on success.
+func GetDeviceFromVFIODev(device config.DeviceInfo) ([]*config.VFIODev, error) {
+	// The way we get the host BDF is by reading the symlink of the char
+	// device major:minor entry in /sys/dev/char/major:minor
+	// $ ls -l /dev/vfio/devices/vfio0
+	// crw------- 1 root root 237, 0 Jan 15 16:53 /dev/vfio/devices/vfio0
+	major, minor, err := getMajorMinorFromDevPath(device.HostPath)
+	if err != nil {
+		return nil, fmt.Errorf("Failed to get major:minor from %s: %w", device.HostPath, err)
+	}
+	// $ ls -l /sys/dev/char/237:0
+	// /sys/dev/char/237:0 -> ../../devices/pci0000:64/0000:64:00.0/0000:65:00.0/vfio-dev/vfio0
+	deviceBDF, err := getBdfFromVFIODev(major, minor)
+	if err != nil {
+		return nil, err
+	}
+
+	deviceSysfsDev := path.Join(config.SysBusPciDevicesPath, deviceBDF)
+	vfioDeviceType, err := GetVFIODeviceType(deviceSysfsDev)
+	if err != nil {
+		return nil, err
+	}
+
+	vendorID := getPCIDeviceProperty(deviceBDF, PCISysFsDevicesVendor)
+	deviceID := getPCIDeviceProperty(deviceBDF, PCISysFsDevicesDevice)
+	pciClass := getPCIDeviceProperty(deviceBDF, PCISysFsDevicesClass)
+
+	id, err := extractIndex(device.HostPath)
+	if err != nil {
+		return nil, err
+	}
+
+	vfio := config.VFIODev{
+		ID:       id,
+		Type:     vfioDeviceType,
+		BDF:      deviceBDF,
+		SysfsDev: deviceSysfsDev,
+		DevfsDev: device.HostPath,
+		IsPCIe:   IsPCIeDevice(deviceBDF),
+		Class:    pciClass,
+		VendorID: vendorID,
+		DeviceID: deviceID,
+		Port:     device.Port,
+		HostPath: device.HostPath,
+	}
+
+	return []*config.VFIODev{&vfio}, nil
+}
+
 // GetAllVFIODevicesFromIOMMUGroup returns all the VFIO devices in the IOMMU group
 // We can reuse this function at various levels, sandbox, container.
 func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODev, error) {
diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go
index f315fddaf..77f53a16a 100644
--- a/src/runtime/pkg/device/drivers/vfio.go
+++ b/src/runtime/pkg/device/drivers/vfio.go
@@ -28,6 +28,9 @@ const (
 	iommuGroupPath = "/sys/bus/pci/devices/%s/iommu_group"
 	vfioDevPath    = "/dev/vfio/%s"
 	vfioAPSysfsDir = "/sys/devices/vfio_ap"
+	// IommufdDevPath is the directory holding the per-device VFIO character
+	// devices (e.g. /dev/vfio/devices/vfio0) used with IOMMUFD.
+	IommufdDevPath = "/dev/vfio/devices"
 )
 
 // VFIODevice is a vfio device meant to be passed to the hypervisor
@@ -64,9 +67,17 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece
 		}
 	}()
 
-	device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo)
+	// This works for IOMMUFD enabled kernels > 6.x
+	// In the case of IOMMUFD the device.HostPath will look like
+	// /dev/vfio/devices/vfio0
+	// (1) Check if we have the new IOMMUFD or old container based VFIO
+	if strings.HasPrefix(device.DeviceInfo.HostPath, IommufdDevPath) {
+		device.VfioDevs, err = GetDeviceFromVFIODev(*device.DeviceInfo)
+	} else {
+		device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo)
+	}
 	if err != nil {
 		return err
 	}
 
 	for _, vfio := range device.VfioDevs {
diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go
index aefa1ffdf..28b129921 100644
--- a/src/runtime/pkg/govmm/qemu/qemu.go
+++ b/src/runtime/pkg/govmm/qemu/qemu.go
@@ -25,6 +25,8 @@ import (
 	"strconv"
 	"strings"
 	"syscall"
+
+	"github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers"
 )
 
 // Machine describes the machine type qemu will emulate.
@@ -1859,6 +1861,9 @@ func (b PCIeSwitchDownstreamPortDevice) Valid() bool {
 
 // VFIODevice represents a qemu vfio device meant for direct access by guest OS.
 type VFIODevice struct {
+	// ID is the index of the vfio device in devfs or sysfs used for IOMMUFD
+	ID string
+
 	// Bus-Device-Function of device
 	BDF string
 
@@ -1882,6 +1887,9 @@ type VFIODevice struct {
 
 	// SysfsDev specifies the sysfs matrix entry for the AP device
 	SysfsDev string
+
+	// DevfsDev is used to identify a VFIO Group device or IOMMUFD VFIO device
+	DevfsDev string
 }
 
 // VFIODeviceTransport is a map of the vfio device name that corresponds to
@@ -1936,6 +1944,14 @@ func (vfioDev VFIODevice) QemuParams(config *Config) []string {
 		deviceParams = append(deviceParams, fmt.Sprintf("devno=%s", vfioDev.DevNo))
 	}
 
+	// An IOMMUFD backed device gets its own iommufd object, which the
+	// device then references by id.
+	if strings.HasPrefix(vfioDev.DevfsDev, drivers.IommufdDevPath) {
+		iommufdID := "iommufd" + vfioDev.ID
+		qemuParams = append(qemuParams, "-object", "iommufd,id="+iommufdID)
+		deviceParams = append(deviceParams, "iommufd="+iommufdID)
+	}
+
 	qemuParams = append(qemuParams, "-device")
 	qemuParams = append(qemuParams, strings.Join(deviceParams, ","))
 
diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go
index ba86c3d63..73b6df2e4 100644
--- a/src/runtime/virtcontainers/qemu.go
+++ b/src/runtime/virtcontainers/qemu.go
@@ -792,11 +792,25 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig
 			return fmt.Errorf("Cannot get host path for device: %v err: %v", dev, err)
 		}
 
-		devicesPerIOMMUGroup, err := drivers.GetAllVFIODevicesFromIOMMUGroup(dev)
-		if err != nil {
-			return fmt.Errorf("Cannot get all VFIO devices from IOMMU group with device: %v err: %v", dev, err)
+		var vfioDevices []*config.VFIODev
+		// This works for IOMMUFD enabled kernels > 6.x
+		// In the case of IOMMUFD the device.HostPath will look like
+		// /dev/vfio/devices/vfio0
+		// (1) Check if we have the new IOMMUFD or old container based VFIO
+		if strings.HasPrefix(dev.HostPath, drivers.IommufdDevPath) {
+			q.Logger().Debugf("IOMMUFD device path: %s", dev.HostPath)
+			vfioDevices, err = drivers.GetDeviceFromVFIODev(dev)
+			if err != nil {
+				return fmt.Errorf("Cannot get VFIO device from IOMMUFD with device: %v err: %v", dev, err)
+			}
+		} else {
+			vfioDevices, err = drivers.GetAllVFIODevicesFromIOMMUGroup(dev)
+			if err != nil {
+				return fmt.Errorf("Cannot get all VFIO devices from IOMMU group with device: %v err: %v", dev, err)
+			}
 		}
-		for _, vfioDevice := range devicesPerIOMMUGroup {
+
+		for _, vfioDevice := range vfioDevices {
 			if drivers.IsPCIeDevice(vfioDevice.BDF) {
 				numOfPluggablePorts = numOfPluggablePorts + 1
 			}
diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go
index fd92be772..31c55209f 100644
--- a/src/runtime/virtcontainers/qemu_arch_base.go
+++ b/src/runtime/virtcontainers/qemu_arch_base.go
@@ -708,9 +708,12 @@ func (q *qemuArchBase) appendVFIODevice(devices []govmmQemu.Device, vfioDev conf
 
 	devices = append(devices,
 		govmmQemu.VFIODevice{
+			ID:       vfioDev.ID,
 			BDF:      vfioDev.BDF,
 			VendorID: vfioDev.VendorID,
 			DeviceID: vfioDev.DeviceID,
 			Bus:      vfioDev.Bus,
+			SysfsDev: vfioDev.SysfsDev,
+			DevfsDev: vfioDev.DevfsDev,
 		},
 	)