mirror of
https://github.com/kata-containers/kata-containers.git
synced 2025-05-04 22:47:29 +00:00
gpu: PCIe topology, consider vhost-user-block in Virt
In Virt, the vhost-user-block device is a PCIe device, so we need to make sure to consider it as well. We keep track of vhost-user-block devices and deduce the correct number of PCIe root ports. Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
This commit is contained in:
parent
b11246c3aa
commit
fbacc09646
@ -216,6 +216,15 @@ func (p PCIePort) Valid() bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type PCIePortMapping map[string]bool
|
||||||
|
|
||||||
|
var (
|
||||||
|
// Each of these structures keeps track of the devices attached to the
|
||||||
|
// different types of PCI ports. We can deduce the Bus number from it
|
||||||
|
// and eliminate duplicates being assigned.
|
||||||
|
PCIeDevices = map[PCIePort]PCIePortMapping{}
|
||||||
|
)
|
||||||
|
|
||||||
// DeviceInfo is an embedded type that contains device data common to all types of devices.
|
// DeviceInfo is an embedded type that contains device data common to all types of devices.
|
||||||
type DeviceInfo struct {
|
type DeviceInfo struct {
|
||||||
// DriverOptions is specific options for each device driver
|
// DriverOptions is specific options for each device driver
|
||||||
|
@ -31,13 +31,6 @@ const (
|
|||||||
vfioAPSysfsDir = "/sys/devices/vfio_ap"
|
vfioAPSysfsDir = "/sys/devices/vfio_ap"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
|
||||||
// AllPCIeDevs deduces the correct bus number. The BDF keeps track that
|
|
||||||
// we're not accounting for the very same device, if a user provides the
|
|
||||||
// devices multiple times.
|
|
||||||
AllPCIeDevs = map[string]bool{}
|
|
||||||
)
|
|
||||||
|
|
||||||
// VFIODevice is a vfio device meant to be passed to the hypervisor
|
// VFIODevice is a vfio device meant to be passed to the hypervisor
|
||||||
// to be used by the Virtual Machine.
|
// to be used by the Virtual Machine.
|
||||||
type VFIODevice struct {
|
type VFIODevice struct {
|
||||||
@ -78,9 +71,11 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece
|
|||||||
}
|
}
|
||||||
for _, vfio := range device.VfioDevs {
|
for _, vfio := range device.VfioDevs {
|
||||||
if vfio.IsPCIe {
|
if vfio.IsPCIe {
|
||||||
vfio.Rank = len(AllPCIeDevs)
|
//vfio.Rank = len(AllPCIeDevs)
|
||||||
AllPCIeDevs[vfio.BDF] = true
|
//AllPCIeDevs[vfio.BDF] = true
|
||||||
vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], vfio.Rank)
|
busIndex := len(config.PCIeDevices[vfio.Port])
|
||||||
|
vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex)
|
||||||
|
config.PCIeDevices[vfio.Port][vfio.BDF] = true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -214,10 +209,10 @@ func GetVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceS
|
|||||||
switch vfioDeviceType {
|
switch vfioDeviceType {
|
||||||
case config.VFIOPCIDeviceNormalType:
|
case config.VFIOPCIDeviceNormalType:
|
||||||
// Get bdf of device eg. 0000:00:1c.0
|
// Get bdf of device eg. 0000:00:1c.0
|
||||||
//deviceBDF = getBDF(deviceFileName)
|
// OLD IMPL: deviceBDF = getBDF(deviceFileName)
|
||||||
// The old implementation did not consider the case where
|
// The old implementation did not consider the case where
|
||||||
// vfio devices are located on differente root busses. The
|
// vfio devices are located on different root busses. The
|
||||||
// kata-agent will handle the case now, here use the full PCI addr
|
// kata-agent will handle the case now, here, use the full PCI addr
|
||||||
deviceBDF = deviceFileName
|
deviceBDF = deviceFileName
|
||||||
// Get sysfs path used by cloud-hypervisor
|
// Get sysfs path used by cloud-hypervisor
|
||||||
deviceSysfsDev = filepath.Join(config.SysBusPciDevicesPath, deviceFileName)
|
deviceSysfsDev = filepath.Join(config.SysBusPciDevicesPath, deviceFileName)
|
||||||
|
@ -71,7 +71,11 @@ func NewDeviceManager(blockDriver string, vhostUserStoreEnabled bool, vhostUserS
|
|||||||
dm.blockDriver = config.VirtioSCSI
|
dm.blockDriver = config.VirtioSCSI
|
||||||
}
|
}
|
||||||
|
|
||||||
drivers.AllPCIeDevs = make(map[string]bool)
|
config.PCIeDevices = make(map[config.PCIePort]config.PCIePortMapping)
|
||||||
|
|
||||||
|
config.PCIeDevices[config.RootPort] = make(map[string]bool)
|
||||||
|
config.PCIeDevices[config.SwitchPort] = make(map[string]bool)
|
||||||
|
config.PCIeDevices[config.BridgePort] = make(map[string]bool)
|
||||||
|
|
||||||
for _, dev := range devices {
|
for _, dev := range devices {
|
||||||
dm.devices[dev.DeviceID()] = dev
|
dm.devices[dev.DeviceID()] = dev
|
||||||
@ -118,7 +122,7 @@ func (dm *deviceManager) createDevice(devInfo config.DeviceInfo) (dev api.Device
|
|||||||
}
|
}
|
||||||
if IsVFIO(devInfo.HostPath) {
|
if IsVFIO(devInfo.HostPath) {
|
||||||
return drivers.NewVFIODevice(&devInfo), nil
|
return drivers.NewVFIODevice(&devInfo), nil
|
||||||
} else if isVhostUserBlk(devInfo) {
|
} else if IsVhostUserBlk(devInfo) {
|
||||||
if devInfo.DriverOptions == nil {
|
if devInfo.DriverOptions == nil {
|
||||||
devInfo.DriverOptions = make(map[string]string)
|
devInfo.DriverOptions = make(map[string]string)
|
||||||
}
|
}
|
||||||
|
@ -37,7 +37,7 @@ func isBlock(devInfo config.DeviceInfo) bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// isVhostUserBlk checks if the device is a VhostUserBlk device.
|
// IsVhostUserBlk checks if the device is a VhostUserBlk device.
|
||||||
func isVhostUserBlk(devInfo config.DeviceInfo) bool {
|
func IsVhostUserBlk(devInfo config.DeviceInfo) bool {
|
||||||
return devInfo.DevType == "b" && devInfo.Major == config.VhostUserBlkMajor
|
return devInfo.DevType == "b" && devInfo.Major == config.VhostUserBlkMajor
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -70,7 +70,7 @@ func TestIsVhostUserBlk(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for _, d := range data {
|
for _, d := range data {
|
||||||
isVhostUserBlk := isVhostUserBlk(
|
isVhostUserBlk := IsVhostUserBlk(
|
||||||
config.DeviceInfo{
|
config.DeviceInfo{
|
||||||
DevType: d.devType,
|
DevType: d.devType,
|
||||||
Major: d.major,
|
Major: d.major,
|
||||||
|
@ -505,9 +505,13 @@ type HypervisorConfig struct {
|
|||||||
// MemOffset specifies memory space for nvdimm device
|
// MemOffset specifies memory space for nvdimm device
|
||||||
MemOffset uint64
|
MemOffset uint64
|
||||||
|
|
||||||
// RawDevics are used to get PCIe device info early before the sandbox
|
// VFIODevices are used to get PCIe device info early before the sandbox
|
||||||
// is started to make better PCIe topology decisions
|
// is started to make better PCIe topology decisions
|
||||||
VFIODevices []config.DeviceInfo
|
VFIODevices []config.DeviceInfo
|
||||||
|
// VhostUserBlkDevices are handled differently in Q35 and Virt machine
|
||||||
|
// type. Capture them early before the sandbox to make better PCIe topology
|
||||||
|
// decisions
|
||||||
|
VhostUserBlkDevices []config.DeviceInfo
|
||||||
|
|
||||||
// HotplugVFIO is used to indicate if devices need to be hotplugged on the
|
// HotplugVFIO is used to indicate if devices need to be hotplugged on the
|
||||||
// root port or a switch
|
// root port or a switch
|
||||||
|
@ -702,7 +702,7 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi
|
|||||||
}
|
}
|
||||||
|
|
||||||
if machine.Type == QemuQ35 || machine.Type == QemuVirt {
|
if machine.Type == QemuQ35 || machine.Type == QemuVirt {
|
||||||
if err := q.createPCIeTopology(&qemuConfig, hypervisorConfig); err != nil {
|
if err := q.createPCIeTopology(&qemuConfig, hypervisorConfig, machine.Type); err != nil {
|
||||||
q.Logger().WithError(err).Errorf("Cannot create PCIe topology")
|
q.Logger().WithError(err).Errorf("Cannot create PCIe topology")
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@ -741,9 +741,10 @@ func (q *qemu) checkBpfEnabled() {
|
|||||||
// Max PCIe switch ports is 16
|
// Max PCIe switch ports is 16
|
||||||
// There is only 64kB of IO memory each root,switch port will consume 4k hence
|
// There is only 64kB of IO memory each root,switch port will consume 4k hence
|
||||||
// only 16 ports possible.
|
// only 16 ports possible.
|
||||||
func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig *HypervisorConfig) error {
|
func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig *HypervisorConfig, machineType string) error {
|
||||||
|
|
||||||
// If no-port set just return no need to add PCIe Root Port or PCIe Switches
|
// If no-port set just return no need to add PCIe Root Port or PCIe Switches
|
||||||
if hypervisorConfig.HotPlugVFIO == config.NoPort && hypervisorConfig.ColdPlugVFIO == config.NoPort {
|
if hypervisorConfig.HotPlugVFIO == config.NoPort && hypervisorConfig.ColdPlugVFIO == config.NoPort && machineType == QemuQ35 {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -767,7 +768,8 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig
|
|||||||
qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg)
|
qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get the number of hot(cold)-pluggable ports needed from the provided devices
|
// Get the number of hot(cold)-pluggable ports needed from the provided
|
||||||
|
// VFIO devices and VhostUserBlockDevices
|
||||||
var numOfPluggablePorts uint32 = 0
|
var numOfPluggablePorts uint32 = 0
|
||||||
for _, dev := range hypervisorConfig.VFIODevices {
|
for _, dev := range hypervisorConfig.VFIODevices {
|
||||||
var err error
|
var err error
|
||||||
@ -785,22 +787,42 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus)
|
||||||
|
vfioOnSwitchPort := (q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort)
|
||||||
|
|
||||||
|
numOfVhostUserBlockDevices := len(hypervisorConfig.VhostUserBlkDevices)
|
||||||
|
|
||||||
// If number of PCIe root ports > 16 then bail out otherwise we may
|
// If number of PCIe root ports > 16 then bail out otherwise we may
|
||||||
// use up all slots or IO memory on the root bus and vfio-XXX-pci devices
|
// use up all slots or IO memory on the root bus and vfio-XXX-pci devices
|
||||||
// cannot be added which are crucial for Kata max slots on root bus is 32
|
// cannot be added which are crucial for Kata max slots on root bus is 32
|
||||||
// max slots on the complete pci(e) topology is 256 in QEMU
|
// max slots on the complete pci(e) topology is 256 in QEMU
|
||||||
if numOfPluggablePorts > maxPCIeRootPort {
|
if vfioOnRootPort {
|
||||||
return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort)
|
// On Arm the vhost-user-block device is a PCIe device we need
|
||||||
}
|
// to account for it in the number of pluggable ports
|
||||||
if numOfPluggablePorts > maxPCIeSwitchPort {
|
if machineType == QemuVirt {
|
||||||
return fmt.Errorf("Number of PCIe Switch Ports exceeed allowed max of %d", maxPCIeSwitchPort)
|
numOfPluggablePorts = numOfPluggablePorts + uint32(numOfVhostUserBlockDevices)
|
||||||
}
|
}
|
||||||
|
if numOfPluggablePorts > maxPCIeRootPort {
|
||||||
if q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus {
|
return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort)
|
||||||
|
}
|
||||||
qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit)
|
qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit)
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
if q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort {
|
if vfioOnSwitchPort {
|
||||||
|
// On Arm the vhost-user-block device is a PCIe device we need
|
||||||
|
// to account for it in the number of pluggable ports
|
||||||
|
if machineType == QemuVirt {
|
||||||
|
numOfPluggableRootPorts := uint32(numOfVhostUserBlockDevices)
|
||||||
|
if numOfPluggableRootPorts > maxPCIeRootPort {
|
||||||
|
return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort)
|
||||||
|
}
|
||||||
|
qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggableRootPorts, memSize32bit, memSize64bit)
|
||||||
|
}
|
||||||
|
if numOfPluggablePorts > maxPCIeSwitchPort {
|
||||||
|
return fmt.Errorf("Number of PCIe Switch Ports exceeed allowed max of %d", maxPCIeSwitchPort)
|
||||||
|
}
|
||||||
qemuConfig.Devices = q.arch.appendPCIeSwitchPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit)
|
qemuConfig.Devices = q.arch.appendPCIeSwitchPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit)
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@ -1585,6 +1607,7 @@ func (q *qemu) hotplugAddBlockDevice(ctx context.Context, drive *config.BlockDri
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.VhostUserDeviceAttrs, op Operation, devID string) (err error) {
|
func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.VhostUserDeviceAttrs, op Operation, devID string) (err error) {
|
||||||
|
|
||||||
err = q.qmpMonitorCh.qmp.ExecuteCharDevUnixSocketAdd(q.qmpMonitorCh.ctx, vAttr.DevID, vAttr.SocketPath, false, false, vAttr.ReconnectTime)
|
err = q.qmpMonitorCh.qmp.ExecuteCharDevUnixSocketAdd(q.qmpMonitorCh.ctx, vAttr.DevID, vAttr.SocketPath, false, false, vAttr.ReconnectTime)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@ -1602,16 +1625,12 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V
|
|||||||
|
|
||||||
switch machineType {
|
switch machineType {
|
||||||
case QemuVirt:
|
case QemuVirt:
|
||||||
if q.state.ColdPlugVFIO.String() != "true" {
|
|
||||||
return fmt.Errorf("TODO: Vhost-user-blk device is a PCIe device if machine type is virt. Need to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for virt")
|
|
||||||
}
|
|
||||||
|
|
||||||
//The addr of a dev is corresponding with device:function for PCIe in qemu which starting from 0
|
//The addr of a dev is corresponding with device:function for PCIe in qemu which starting from 0
|
||||||
//Since the dev is the first and only one on this bus(root port), it should be 0.
|
//Since the dev is the first and only one on this bus(root port), it should be 0.
|
||||||
addr := "00"
|
addr := "00"
|
||||||
|
|
||||||
bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(drivers.AllPCIeDevs))
|
bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevices[config.RootPort]))
|
||||||
drivers.AllPCIeDevs[devID] = true
|
config.PCIeDevices[config.RootPort][devID] = true
|
||||||
|
|
||||||
bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID)
|
bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID)
|
||||||
bridgeSlot, err := q.qomGetSlot(bridgeQomPath)
|
bridgeSlot, err := q.qomGetSlot(bridgeQomPath)
|
||||||
|
@ -111,9 +111,6 @@ func TestQemuCreateVM(t *testing.T) {
|
|||||||
config6 := newQemuConfig()
|
config6 := newQemuConfig()
|
||||||
config6.DisableGuestSeLinux = false
|
config6.DisableGuestSeLinux = false
|
||||||
|
|
||||||
config7 := newQemuConfig()
|
|
||||||
config7.PCIeRootPort = 1
|
|
||||||
|
|
||||||
config8 := newQemuConfig()
|
config8 := newQemuConfig()
|
||||||
config8.EnableVhostUserStore = true
|
config8.EnableVhostUserStore = true
|
||||||
config8.HugePages = true
|
config8.HugePages = true
|
||||||
@ -161,7 +158,6 @@ func TestQemuCreateVM(t *testing.T) {
|
|||||||
{config3, false, true},
|
{config3, false, true},
|
||||||
{config5, false, true},
|
{config5, false, true},
|
||||||
{config6, false, false},
|
{config6, false, false},
|
||||||
{config7, false, true},
|
|
||||||
{config8, false, true},
|
{config8, false, true},
|
||||||
{config9, true, false},
|
{config9, true, false},
|
||||||
{config10, false, true},
|
{config10, false, true},
|
||||||
|
@ -621,9 +621,17 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
|
|||||||
hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort)
|
hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort)
|
||||||
|
|
||||||
var vfioDevices []config.DeviceInfo
|
var vfioDevices []config.DeviceInfo
|
||||||
|
// vhost-user-block device is a PCIe device in Virt, keep track of it
|
||||||
|
// for correct number of PCIe root ports.
|
||||||
|
var vhostUserBlkDevices []config.DeviceInfo
|
||||||
|
|
||||||
for cnt, containers := range sandboxConfig.Containers {
|
for cnt, containers := range sandboxConfig.Containers {
|
||||||
for dev, device := range containers.DeviceInfos {
|
for dev, device := range containers.DeviceInfos {
|
||||||
|
|
||||||
|
if deviceManager.IsVhostUserBlk(device) {
|
||||||
|
vhostUserBlkDevices = append(vhostUserBlkDevices, device)
|
||||||
|
continue
|
||||||
|
}
|
||||||
isVFIO := deviceManager.IsVFIO(device.ContainerPath)
|
isVFIO := deviceManager.IsVFIO(device.ContainerPath)
|
||||||
if hotPlugVFIO && isVFIO {
|
if hotPlugVFIO && isVFIO {
|
||||||
vfioDevices = append(vfioDevices, device)
|
vfioDevices = append(vfioDevices, device)
|
||||||
@ -649,6 +657,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
|
|||||||
|
|
||||||
}
|
}
|
||||||
sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices
|
sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices
|
||||||
|
sandboxConfig.HypervisorConfig.VhostUserBlkDevices = vhostUserBlkDevices
|
||||||
|
|
||||||
// store doesn't require hypervisor to be stored immediately
|
// store doesn't require hypervisor to be stored immediately
|
||||||
if err = s.hypervisor.CreateVM(ctx, s.id, s.network, &sandboxConfig.HypervisorConfig); err != nil {
|
if err = s.hypervisor.CreateVM(ctx, s.id, s.network, &sandboxConfig.HypervisorConfig); err != nil {
|
||||||
@ -1930,6 +1939,7 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy
|
|||||||
return err
|
return err
|
||||||
case config.VhostUserBlk:
|
case config.VhostUserBlk:
|
||||||
vhostUserBlkDevice, ok := device.(*drivers.VhostUserBlkDevice)
|
vhostUserBlkDevice, ok := device.(*drivers.VhostUserBlkDevice)
|
||||||
|
|
||||||
if !ok {
|
if !ok {
|
||||||
return fmt.Errorf("device type mismatch, expect device type to be %s", devType)
|
return fmt.Errorf("device type mismatch, expect device type to be %s", devType)
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user