diff --git a/virtcontainers/container.go b/virtcontainers/container.go
index 2a9f2269ee..2422499714 100644
--- a/virtcontainers/container.go
+++ b/virtcontainers/container.go
@@ -861,9 +861,38 @@ func (c *Container) create() (err error) {
 		}
 	}
 
-	// Attach devices
-	if err = c.attachDevices(); err != nil {
-		return
+	var (
+		machineType        = c.sandbox.config.HypervisorConfig.HypervisorMachineType
+		normalAttachedDevs []ContainerDevice // for q35: normally attached devices
+		delayAttachedDevs  []ContainerDevice // for q35: delay attached devices, for example, large bar space device
+	)
+	// Fix: https://github.com/kata-containers/runtime/issues/2460
+	if machineType == QemuQ35 {
+		// add Large Bar space device to delayAttachedDevs
+		for _, device := range c.devices {
+			var isLargeBarSpace bool
+			isLargeBarSpace, err = manager.IsVFIOLargeBarSpaceDevice(device.ContainerPath)
+			if err != nil {
+				return
+			}
+			if isLargeBarSpace {
+				delayAttachedDevs = append(delayAttachedDevs, device)
+			} else {
+				normalAttachedDevs = append(normalAttachedDevs, device)
+			}
+		}
+	} else {
+		normalAttachedDevs = c.devices
+	}
+
+	c.Logger().WithFields(logrus.Fields{
+		"machine_type": machineType,
+		"devices":      normalAttachedDevs,
+	}).Info("normal attach devices")
+	if len(normalAttachedDevs) > 0 {
+		if err = c.attachDevices(normalAttachedDevs); err != nil {
+			return
+		}
 	}
 
 	// Deduce additional system mount info that should be handled by the agent
@@ -876,6 +905,17 @@ func (c *Container) create() (err error) {
 	}
 	c.process = *process
 
+	// lazy attach device after createContainer for q35
+	if machineType == QemuQ35 && len(delayAttachedDevs) > 0 {
+		c.Logger().WithFields(logrus.Fields{
+			"machine_type": machineType,
+			"devices":      delayAttachedDevs,
+		}).Info("lazy attach devices")
+		if err = c.attachDevices(delayAttachedDevs); err != nil {
+			return
+		}
+	}
+
 	if !rootless.IsRootless() && !c.sandbox.config.SandboxCgroupOnly {
 		if err = c.cgroupsCreate(); err != nil {
 			return
@@ -1359,11 +1399,15 @@ func (c *Container) removeDrive() (err error) {
 	return nil
 }
 
-func (c *Container) attachDevices() error {
+func (c *Container) attachDevices(devices []ContainerDevice) error {
 	// there's no need to do rollback when error happens,
 	// because if attachDevices fails, container creation will fail too,
 	// and rollbackFailingContainerCreation could do all the rollbacks
-	for _, dev := range c.devices {
+
+	// since devices with large bar space require delayed attachment,
+	// the devices need to be split into two lists, normalAttachedDevs and delayAttachedDevs.
+	// so c.devices is not used here. See issue https://github.com/kata-containers/runtime/issues/2460.
+	for _, dev := range devices {
 		if err := c.sandbox.devManager.AttachDevice(dev.ID, c.sandbox); err != nil {
 			return err
 		}
diff --git a/virtcontainers/device/drivers/utils.go b/virtcontainers/device/drivers/utils.go
index f3b338c61a..ad046ca10b 100644
--- a/virtcontainers/device/drivers/utils.go
+++ b/virtcontainers/device/drivers/utils.go
@@ -106,3 +106,23 @@ func readPCIProperty(propertyPath string) (string, error) {
 	}
 	return strings.Split(string(buf), "\n")[0], nil
 }
+
+// GetVFIODeviceType returns the VFIO device type derived from the layout of
+// deviceFileName: three ":"-separated tokens yield config.VFIODeviceNormalType,
+// five "-"-separated tokens yield config.VFIODeviceMediatedType and anything
+// else yields config.VFIODeviceErrorType.
+func GetVFIODeviceType(deviceFileName string) config.VFIODeviceType {
+	// For example, 0000:04:00.0
+	tokens := strings.Split(deviceFileName, ":")
+	vfioDeviceType := config.VFIODeviceErrorType
+	if len(tokens) == 3 {
+		vfioDeviceType = config.VFIODeviceNormalType
+	} else {
+		// For example, 83b8f4f2-509f-382f-3c1e-e6bfe0fa1001
+		tokens = strings.Split(deviceFileName, "-")
+		if len(tokens) == 5 {
+			vfioDeviceType = config.VFIODeviceMediatedType
+		}
+	}
+	return vfioDeviceType
+}
diff --git a/virtcontainers/device/drivers/vfio.go b/virtcontainers/device/drivers/vfio.go
index 8762ade9ec..c8f8f75910 100644
--- a/virtcontainers/device/drivers/vfio.go
+++ b/virtcontainers/device/drivers/vfio.go
@@ -187,18 +187,8 @@ func (device *VFIODevice) Load(ds persistapi.DeviceState) {
 
 // It should implement GetAttachCount() and DeviceID() as api.Device implementation
 // here it shares function from *GenericDevice so we don't need duplicate codes
-
 func getVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceSysfsDev string, vfioDeviceType config.VFIODeviceType, err error) {
-	tokens := strings.Split(deviceFileName, ":")
-	vfioDeviceType = config.VFIODeviceErrorType
-	if len(tokens) == 3 {
-		vfioDeviceType = config.VFIODeviceNormalType
-	} else {
-		tokens = strings.Split(deviceFileName, "-")
-		if len(tokens) == 5 {
-			vfioDeviceType = config.VFIODeviceMediatedType
-		}
-	}
+	vfioDeviceType = GetVFIODeviceType(deviceFileName)
 
 	switch vfioDeviceType {
 	case config.VFIODeviceNormalType:
diff --git a/virtcontainers/device/manager/manager.go b/virtcontainers/device/manager/manager.go
index 78a2c281bd..db1e8ee48a 100644
--- a/virtcontainers/device/manager/manager.go
+++ b/virtcontainers/device/manager/manager.go
@@ -74,6 +74,8 @@ func NewDeviceManager(blockDriver string, devices []api.Device) api.DeviceManage
 		dm.blockDriver = VirtioSCSI
 	}
 
+	drivers.AllPCIeDevs = make(map[string]bool)
+
 	for _, dev := range devices {
 		dm.devices[dev.DeviceID()] = dev
 	}
diff --git a/virtcontainers/device/manager/utils.go b/virtcontainers/device/manager/utils.go
index e5ccb99715..2f26bc245d 100644
--- a/virtcontainers/device/manager/utils.go
+++ b/virtcontainers/device/manager/utils.go
@@ -7,10 +7,16 @@
 package manager
 
 import (
+	"fmt"
+	"io/ioutil"
 	"path/filepath"
+	"strconv"
 	"strings"
 
+	"github.com/sirupsen/logrus"
+
 	"github.com/kata-containers/runtime/virtcontainers/device/config"
+	"github.com/kata-containers/runtime/virtcontainers/device/drivers"
 )
 
 const (
@@ -35,3 +41,118 @@ func isVFIO(hostPath string) bool {
 func isBlock(devInfo config.DeviceInfo) bool {
 	return devInfo.DevType == "b"
 }
+
+// IsVFIOLargeBarSpaceDevice checks if the device is a large bar space device.
+// Paths that isVFIO rejects are reported as not large. Otherwise every entry
+// of the IOMMU group's "devices" directory is inspected, and the result is
+// true as soon as one normal VFIO device has a large BAR region.
+func IsVFIOLargeBarSpaceDevice(hostPath string) (bool, error) {
+	if !isVFIO(hostPath) {
+		return false, nil
+	}
+
+	iommuDevicesPath := filepath.Join(config.SysIOMMUPath, filepath.Base(hostPath), "devices")
+	deviceFiles, err := ioutil.ReadDir(iommuDevicesPath)
+	if err != nil {
+		return false, err
+	}
+
+	// Pass all devices in iommu group
+	for _, deviceFile := range deviceFiles {
+		vfioDeviceType := drivers.GetVFIODeviceType(deviceFile.Name())
+		var isLarge bool
+		switch vfioDeviceType {
+		case config.VFIODeviceNormalType:
+			sysfsResource := filepath.Join(iommuDevicesPath, deviceFile.Name(), "resource")
+			if isLarge, err = isLargeBarSpace(sysfsResource); err != nil {
+				return false, err
+			}
+			deviceLogger().WithFields(logrus.Fields{
+				"device-file":     deviceFile.Name(),
+				"device-type":     vfioDeviceType,
+				"resource":        sysfsResource,
+				"large-bar-space": isLarge,
+			}).Info("Detect large bar space device")
+			// Only a positive result ends the scan: returning here for a
+			// small device would leave the rest of the group unchecked.
+			if isLarge {
+				return true, nil
+			}
+		case config.VFIODeviceMediatedType:
+			// TODO: support VFIODeviceMediatedType
+			deviceLogger().WithFields(logrus.Fields{
+				"device-file": deviceFile.Name(),
+				"device-type": vfioDeviceType,
+			}).Warn("Detect large bar space device is not yet supported for VFIODeviceMediatedType")
+		default:
+			deviceLogger().WithFields(logrus.Fields{
+				"device-file": deviceFile.Name(),
+				"device-type": vfioDeviceType,
+			}).Warn("Incorrect token found when detecting large bar space devices")
+		}
+	}
+
+	return false, nil
+}
+
+// isLargeBarSpace reports whether the sysfs resource file at resourcePath
+// lists at least one region that is larger than 4G.
+func isLargeBarSpace(resourcePath string) (bool, error) {
+	buf, err := ioutil.ReadFile(resourcePath)
+	if err != nil {
+		return false, fmt.Errorf("failed to read sysfs resource: %v", err)
+	}
+
+	// The resource file contains host addresses of PCI resources:
+	// For example:
+	// $ cat /sys/bus/pci/devices/0000:04:00.0/resource
+	// 0x00000000c6000000 0x00000000c6ffffff 0x0000000000040200
+	// 0x0000383800000000 0x0000383bffffffff 0x000000000014220c
+	// Refer:
+	// resource format: https://github.com/torvalds/linux/blob/63623fd44972d1ed2bfb6e0fb631dfcf547fd1e7/drivers/pci/pci-sysfs.c#L145
+	// calculate size : https://github.com/pciutils/pciutils/blob/61ecc14a327de030336f1ff3fea9c7e7e55a90ca/lspci.c#L388
+	suffix := []string{"", "K", "M", "G", "T"}
+	for rIdx, line := range strings.Split(string(buf), "\n") {
+		cols := strings.Fields(line)
+		// The start and end columns are required to calculate the size. This
+		// also skips the empty string that Split yields after a final newline.
+		if len(cols) < 2 {
+			continue
+		}
+		start, startErr := strconv.ParseUint(cols[0], 0, 64)
+		end, endErr := strconv.ParseUint(cols[1], 0, 64)
+		// An unparsable or inverted range has no meaningful size; skipping it
+		// also keeps end - start + 1 from wrapping around.
+		if startErr != nil || endErr != nil || end < start {
+			deviceLogger().WithFields(logrus.Fields{
+				"resource": resourcePath,
+				"region":   rIdx,
+				"start":    cols[0],
+				"end":      cols[1],
+			}).Debug("Skip invalid region when checking large bar space device")
+			continue
+		}
+		size := end - start + 1
+		// Scale size to the largest unit in suffix. The index is capped at the
+		// last entry so that suffix[sIdx] below always stays in range.
+		sIdx := 0
+		for sIdx < len(suffix)-1 && size >= 1024 {
+			size /= 1024
+			sIdx++
+		}
+		deviceLogger().WithFields(logrus.Fields{
+			"resource": resourcePath,
+			"region":   rIdx,
+			"start":    cols[0],
+			"end":      cols[1],
+			"size":     size,
+			"suffix":   suffix[sIdx],
+		}).Debug("Check large bar space device")
+		// size is larger than 4G
+		if (sIdx == 3 && size > 4) || sIdx > 3 {
+			return true, nil
+		}
+	}
+
+	return false, nil
+}
diff --git a/virtcontainers/sandbox_test.go b/virtcontainers/sandbox_test.go
index 58d8874cdc..528c3dfc05 100644
--- a/virtcontainers/sandbox_test.go
+++ b/virtcontainers/sandbox_test.go
@@ -730,7 +730,7 @@ func TestSandboxAttachDevicesVFIO(t *testing.T) {
 
 	containers[c.id].sandbox = &sandbox
 
-	err = containers[c.id].attachDevices()
+	err = containers[c.id].attachDevices(c.devices)
 	assert.Nil(t, err, "Error while attaching devices %s", err)
 
 	err = containers[c.id].detachDevices()