diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go index dcb0e93b8f..c65e861233 100644 --- a/src/runtime/pkg/device/config/config.go +++ b/src/runtime/pkg/device/config/config.go @@ -216,6 +216,15 @@ func (p PCIePort) Valid() bool { return false } +type PCIePortMapping map[string]bool + +var ( + // Each of this structures keeps track of the devices attached to the + // different types of PCI ports. We can deduces the Bus number from it + // and eliminate duplicates being assigned. + PCIeDevices = map[PCIePort]PCIePortMapping{} +) + // DeviceInfo is an embedded type that contains device data common to all types of devices. type DeviceInfo struct { // DriverOptions is specific options for each device driver diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go index 5e5f2a10bd..a9875f7e75 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -31,13 +31,6 @@ const ( vfioAPSysfsDir = "/sys/devices/vfio_ap" ) -var ( - // AllPCIeDevs deduces the correct bus number. The BDF keeps track that - // we're not accounting for the very same device, if a user provides the - // devices multiple times. - AllPCIeDevs = map[string]bool{} -) - // VFIODevice is a vfio device meant to be passed to the hypervisor // to be used by the Virtual Machine. type VFIODevice struct { @@ -78,9 +71,11 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece } for _, vfio := range device.VfioDevs { if vfio.IsPCIe { - vfio.Rank = len(AllPCIeDevs) - AllPCIeDevs[vfio.BDF] = true - vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], vfio.Rank) + //vfio.Rank = len(AllPCIeDevs) + //AllPCIeDevs[vfio.BDF] = true + busIndex := len(config.PCIeDevices[vfio.Port]) + vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex) + config.PCIeDevices[vfio.Port][vfio.BDF] = true } } @@ -214,10 +209,10 @@ func GetVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceS switch vfioDeviceType { case config.VFIOPCIDeviceNormalType: // Get bdf of device eg. 0000:00:1c.0 - //deviceBDF = getBDF(deviceFileName) + // OLD IMPL: deviceBDF = getBDF(deviceFileName) // The old implementation did not consider the case where - // vfio devices are located on differente root busses. The - // kata-agent will handle the case now, here use the full PCI addr + // vfio devices are located on different root busses. The + // kata-agent will handle the case now, here, use the full PCI addr deviceBDF = deviceFileName // Get sysfs path used by cloud-hypervisor deviceSysfsDev = filepath.Join(config.SysBusPciDevicesPath, deviceFileName) diff --git a/src/runtime/pkg/device/manager/manager.go b/src/runtime/pkg/device/manager/manager.go index baf1209a75..735061d9ee 100644 --- a/src/runtime/pkg/device/manager/manager.go +++ b/src/runtime/pkg/device/manager/manager.go @@ -71,7 +71,11 @@ func NewDeviceManager(blockDriver string, vhostUserStoreEnabled bool, vhostUserS dm.blockDriver = config.VirtioSCSI } - drivers.AllPCIeDevs = make(map[string]bool) + config.PCIeDevices = make(map[config.PCIePort]config.PCIePortMapping) + + config.PCIeDevices[config.RootPort] = make(map[string]bool) + config.PCIeDevices[config.SwitchPort] = make(map[string]bool) + config.PCIeDevices[config.BridgePort] = make(map[string]bool) for _, dev := range devices { dm.devices[dev.DeviceID()] = dev @@ -118,7 +122,7 @@ func (dm *deviceManager) createDevice(devInfo config.DeviceInfo) (dev api.Device } if IsVFIO(devInfo.HostPath) { return drivers.NewVFIODevice(&devInfo), nil - } else if isVhostUserBlk(devInfo) { + } else if IsVhostUserBlk(devInfo) { if devInfo.DriverOptions == nil { devInfo.DriverOptions = make(map[string]string) } diff --git a/src/runtime/pkg/device/manager/utils.go b/src/runtime/pkg/device/manager/utils.go index e78205d0c7..a9e4ee8c6a 100644 --- a/src/runtime/pkg/device/manager/utils.go +++ b/src/runtime/pkg/device/manager/utils.go @@ -37,7 +37,7 @@ func isBlock(devInfo config.DeviceInfo) bool { } // isVhostUserBlk checks if the device is a VhostUserBlk device. -func isVhostUserBlk(devInfo config.DeviceInfo) bool { +func IsVhostUserBlk(devInfo config.DeviceInfo) bool { return devInfo.DevType == "b" && devInfo.Major == config.VhostUserBlkMajor } diff --git a/src/runtime/pkg/device/manager/utils_test.go b/src/runtime/pkg/device/manager/utils_test.go index b57992b3d0..6752719ddb 100644 --- a/src/runtime/pkg/device/manager/utils_test.go +++ b/src/runtime/pkg/device/manager/utils_test.go @@ -70,7 +70,7 @@ func TestIsVhostUserBlk(t *testing.T) { } for _, d := range data { - isVhostUserBlk := isVhostUserBlk( + isVhostUserBlk := IsVhostUserBlk( config.DeviceInfo{ DevType: d.devType, Major: d.major, diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index bb9c86b217..43a631c1df 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -505,9 +505,13 @@ type HypervisorConfig struct { // MemOffset specifies memory space for nvdimm device MemOffset uint64 - // RawDevics are used to get PCIe device info early before the sandbox + // VFIODevices are used to get PCIe device info early before the sandbox // is started to make better PCIe topology decisions VFIODevices []config.DeviceInfo + // VhostUserBlkDevices are handled differently in Q35 and Virt machine + // type. capture them early before the sandbox to make better PCIe topology + // decisions + VhostUserBlkDevices []config.DeviceInfo // HotplugVFIO is used to indicate if devices need to be hotplugged on the // root port or a switch diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 79dfb286e7..ad94923ffb 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -702,7 +702,7 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi } if machine.Type == QemuQ35 || machine.Type == QemuVirt { - if err := q.createPCIeTopology(&qemuConfig, hypervisorConfig); err != nil { + if err := q.createPCIeTopology(&qemuConfig, hypervisorConfig, machine.Type); err != nil { q.Logger().WithError(err).Errorf("Cannot create PCIe topology") return err } @@ -741,9 +741,10 @@ func (q *qemu) checkBpfEnabled() { // Max PCIe switch ports is 16 // There is only 64kB of IO memory each root,switch port will consume 4k hence // only 16 ports possible. -func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig *HypervisorConfig) error { +func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig *HypervisorConfig, machineType string) error { + // If no-port set just return no need to add PCIe Root Port or PCIe Switches - if hypervisorConfig.HotPlugVFIO == config.NoPort && hypervisorConfig.ColdPlugVFIO == config.NoPort { + if hypervisorConfig.HotPlugVFIO == config.NoPort && hypervisorConfig.ColdPlugVFIO == config.NoPort && machineType == QemuQ35 { return nil } @@ -767,7 +768,8 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg) } - // Get the number of hot(cold)-pluggable ports needed from the provided devices + // Get the number of hot(cold)-pluggable ports needed from the provided + // VFIO devices and VhostUserBlockDevices var numOfPluggablePorts uint32 = 0 for _, dev := range hypervisorConfig.VFIODevices { var err error @@ -785,22 +787,42 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig } } } + vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus) + vfioOnSwitchPort := (q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort) + + numOfVhostUserBlockDevices := len(hypervisorConfig.VhostUserBlkDevices) + // If number of PCIe root ports > 16 then bail out otherwise we may // use up all slots or IO memory on the root bus and vfio-XXX-pci devices // cannot be added which are crucial for Kata max slots on root bus is 32 // max slots on the complete pci(e) topology is 256 in QEMU - if numOfPluggablePorts > maxPCIeRootPort { - return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort) - } - if numOfPluggablePorts > maxPCIeSwitchPort { - return fmt.Errorf("Number of PCIe Switch Ports exceeed allowed max of %d", maxPCIeSwitchPort) - } - - if q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus { + if vfioOnRootPort { + // On Arm the vhost-user-block device is a PCIe device we need + // to account for it in the number of pluggable ports + if machineType == QemuVirt { + numOfPluggablePorts = numOfPluggablePorts + uint32(numOfVhostUserBlockDevices) + } + if numOfPluggablePorts > maxPCIeRootPort { + return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort) + } qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit) + return nil } - if q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort { + if vfioOnSwitchPort { + // On Arm the vhost-user-block device is a PCIe device we need + // to account for it in the number of pluggable ports + if machineType == QemuVirt { + numOfPluggableRootPorts := uint32(numOfVhostUserBlockDevices) + if numOfPluggableRootPorts > maxPCIeRootPort { + return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort) + } + qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggableRootPorts, memSize32bit, memSize64bit) + } + if numOfPluggablePorts > maxPCIeSwitchPort { + return fmt.Errorf("Number of PCIe Switch Ports exceeed allowed max of %d", maxPCIeSwitchPort) + } qemuConfig.Devices = q.arch.appendPCIeSwitchPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit) + return nil } return nil } @@ -1585,6 +1607,7 @@ func (q *qemu) hotplugAddBlockDevice(ctx context.Context, drive *config.BlockDri } func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.VhostUserDeviceAttrs, op Operation, devID string) (err error) { + err = q.qmpMonitorCh.qmp.ExecuteCharDevUnixSocketAdd(q.qmpMonitorCh.ctx, vAttr.DevID, vAttr.SocketPath, false, false, vAttr.ReconnectTime) if err != nil { return err @@ -1602,16 +1625,12 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V switch machineType { case QemuVirt: - if q.state.ColdPlugVFIO.String() != "true" { - return fmt.Errorf("TODO: Vhost-user-blk device is a PCIe device if machine type is virt. Need to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for virt") - } - //The addr of a dev is corresponding with device:function for PCIe in qemu which starting from 0 //Since the dev is the first and only one on this bus(root port), it should be 0. addr := "00" - bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(drivers.AllPCIeDevs)) - drivers.AllPCIeDevs[devID] = true + bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevices[config.RootPort])) + config.PCIeDevices[config.RootPort][devID] = true bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID) bridgeSlot, err := q.qomGetSlot(bridgeQomPath) diff --git a/src/runtime/virtcontainers/qemu_test.go b/src/runtime/virtcontainers/qemu_test.go index bfa3481455..418075e26b 100644 --- a/src/runtime/virtcontainers/qemu_test.go +++ b/src/runtime/virtcontainers/qemu_test.go @@ -111,9 +111,6 @@ func TestQemuCreateVM(t *testing.T) { config6 := newQemuConfig() config6.DisableGuestSeLinux = false - config7 := newQemuConfig() - config7.PCIeRootPort = 1 - config8 := newQemuConfig() config8.EnableVhostUserStore = true config8.HugePages = true @@ -161,7 +158,6 @@ func TestQemuCreateVM(t *testing.T) { {config3, false, true}, {config5, false, true}, {config6, false, false}, - {config7, false, true}, {config8, false, true}, {config9, true, false}, {config10, false, true}, diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index ed03c092d6..9949656a6d 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -621,9 +621,17 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort) var vfioDevices []config.DeviceInfo + // vhost-user-block device is a PCIe device in Virt, keep track of it + // for correct number of PCIe root ports. + var vhostUserBlkDevices []config.DeviceInfo for cnt, containers := range sandboxConfig.Containers { for dev, device := range containers.DeviceInfos { + + if deviceManager.IsVhostUserBlk(device) { + vhostUserBlkDevices = append(vhostUserBlkDevices, device) + continue + } isVFIO := deviceManager.IsVFIO(device.ContainerPath) if hotPlugVFIO && isVFIO { vfioDevices = append(vfioDevices, device) @@ -649,6 +657,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor } sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices + sandboxConfig.HypervisorConfig.VhostUserBlkDevices = vhostUserBlkDevices // store doesn't require hypervisor to be stored immediately if err = s.hypervisor.CreateVM(ctx, s.id, s.network, &sandboxConfig.HypervisorConfig); err != nil { @@ -1930,6 +1939,7 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy return err case config.VhostUserBlk: vhostUserBlkDevice, ok := device.(*drivers.VhostUserBlkDevice) + if !ok { return fmt.Errorf("device type mismatch, expect device type to be %s", devType) }