qemu: Support PCIe device hotplug for q35

- add pcie-root-port device to qemu command line for q35
- hotplug a PCIe device into a PCIe Root Port

Fixes: #2432

Signed-off-by: Jimmy Xu <junming.xjm@antfin.com>
This commit is contained in:
Jimmy Xu 2020-02-05 14:35:14 +08:00
parent fa7d00ec25
commit bb41b7248a
22 changed files with 281 additions and 21 deletions

View File

@ -186,6 +186,7 @@ DEFENABLEDEBUG := false
DEFDISABLENESTINGCHECKS := false DEFDISABLENESTINGCHECKS := false
DEFMSIZE9P := 8192 DEFMSIZE9P := 8192
DEFHOTPLUGVFIOONROOTBUS := false DEFHOTPLUGVFIOONROOTBUS := false
DEFPCIEROOTPORT := 0
# Default cgroup model # Default cgroup model
DEFSANDBOXCGROUPONLY ?= false DEFSANDBOXCGROUPONLY ?= false
@ -444,6 +445,7 @@ USER_VARS += DEFENABLEDEBUG
USER_VARS += DEFDISABLENESTINGCHECKS USER_VARS += DEFDISABLENESTINGCHECKS
USER_VARS += DEFMSIZE9P USER_VARS += DEFMSIZE9P
USER_VARS += DEFHOTPLUGVFIOONROOTBUS USER_VARS += DEFHOTPLUGVFIOONROOTBUS
USER_VARS += DEFPCIEROOTPORT
USER_VARS += DEFENTROPYSOURCE USER_VARS += DEFENTROPYSOURCE
USER_VARS += DEFSANDBOXCGROUPONLY USER_VARS += DEFSANDBOXCGROUPONLY
USER_VARS += BUILDFLAGS USER_VARS += BUILDFLAGS
@ -619,7 +621,8 @@ $(GENERATED_FILES): %: %.in $(MAKEFILE_LIST) VERSION .git-commit
-e "s|@DEFENABLEDEBUG@|$(DEFENABLEDEBUG)|g" \ -e "s|@DEFENABLEDEBUG@|$(DEFENABLEDEBUG)|g" \
-e "s|@DEFDISABLENESTINGCHECKS@|$(DEFDISABLENESTINGCHECKS)|g" \ -e "s|@DEFDISABLENESTINGCHECKS@|$(DEFDISABLENESTINGCHECKS)|g" \
-e "s|@DEFMSIZE9P@|$(DEFMSIZE9P)|g" \ -e "s|@DEFMSIZE9P@|$(DEFMSIZE9P)|g" \
-e "s|@DEFHOTPLUGONROOTBUS@|$(DEFHOTPLUGVFIOONROOTBUS)|g" \ -e "s|@DEFHOTPLUGVFIOONROOTBUS@|$(DEFHOTPLUGVFIOONROOTBUS)|g" \
-e "s|@DEFPCIEROOTPORT@|$(DEFPCIEROOTPORT)|g" \
-e "s|@DEFENTROPYSOURCE@|$(DEFENTROPYSOURCE)|g" \ -e "s|@DEFENTROPYSOURCE@|$(DEFENTROPYSOURCE)|g" \
-e "s|@DEFSANDBOXCGROUPONLY@|$(DEFSANDBOXCGROUPONLY)|g" \ -e "s|@DEFSANDBOXCGROUPONLY@|$(DEFSANDBOXCGROUPONLY)|g" \
$< > $@ $< > $@

View File

@ -224,6 +224,13 @@ enable_iothreads = @DEFENABLEIOTHREADS@
# Default false # Default false
#hotplug_vfio_on_root_bus = true #hotplug_vfio_on_root_bus = true
# Before hot plugging a PCIe device, you need to add a pcie_root_port device.
# Use this parameter when using devices with a large PCI BAR, such as an Nvidia GPU.
# The value is the number of pcie_root_port devices to add.
# This value is only valid when hotplug_vfio_on_root_bus is true and machine_type is "q35".
# Default 0
#pcie_root_port = 2
# If vhost-net backend for virtio-net is not desired, set to true. Default is false, which trades off # If vhost-net backend for virtio-net is not desired, set to true. Default is false, which trades off
# security (vhost-net runs ring0) for network I/O performance. # security (vhost-net runs ring0) for network I/O performance.
#disable_vhost_net = true #disable_vhost_net = true

View File

@ -83,16 +83,18 @@ type RuntimeVersionInfo struct {
// HypervisorInfo stores hypervisor details // HypervisorInfo stores hypervisor details
type HypervisorInfo struct { type HypervisorInfo struct {
MachineType string MachineType string
Version string Version string
Path string Path string
BlockDeviceDriver string BlockDeviceDriver string
EntropySource string EntropySource string
Msize9p uint32 SharedFS string
MemorySlots uint32 Msize9p uint32
Debug bool MemorySlots uint32
UseVSock bool PCIeRootPort uint32
SharedFS string HotplugVFIOOnRootBus bool
Debug bool
UseVSock bool
} }
// ProxyInfo stores proxy details // ProxyInfo stores proxy details
@ -355,6 +357,9 @@ func getHypervisorInfo(config oci.RuntimeConfig) HypervisorInfo {
MemorySlots: config.HypervisorConfig.MemSlots, MemorySlots: config.HypervisorConfig.MemSlots,
EntropySource: config.HypervisorConfig.EntropySource, EntropySource: config.HypervisorConfig.EntropySource,
SharedFS: config.HypervisorConfig.SharedFS, SharedFS: config.HypervisorConfig.SharedFS,
HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus,
PCIeRootPort: config.HypervisorConfig.PCIeRootPort,
} }
} }

View File

@ -91,6 +91,7 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC
blockStorageDriver := "virtio-scsi" blockStorageDriver := "virtio-scsi"
enableIOThreads := true enableIOThreads := true
hotplugVFIOOnRootBus := true hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
disableNewNetNs := false disableNewNetNs := false
sharedFS := "virtio-9p" sharedFS := "virtio-9p"
@ -150,6 +151,7 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC
BlockDeviceDriver: blockStorageDriver, BlockDeviceDriver: blockStorageDriver,
EnableIOThreads: enableIOThreads, EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
DisableNewNetNs: disableNewNetNs, DisableNewNetNs: disableNewNetNs,
DefaultVCPUCount: hypConfig.NumVCPUs, DefaultVCPUCount: hypConfig.NumVCPUs,
DefaultMaxVCPUCount: hypConfig.DefaultMaxVCPUs, DefaultMaxVCPUCount: hypConfig.DefaultMaxVCPUs,
@ -329,6 +331,9 @@ func getExpectedHypervisor(config oci.RuntimeConfig) HypervisorInfo {
Debug: config.HypervisorConfig.Debug, Debug: config.HypervisorConfig.Debug,
EntropySource: config.HypervisorConfig.EntropySource, EntropySource: config.HypervisorConfig.EntropySource,
SharedFS: config.HypervisorConfig.SharedFS, SharedFS: config.HypervisorConfig.SharedFS,
HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus,
PCIeRootPort: config.HypervisorConfig.PCIeRootPort,
} }
} }

View File

@ -398,6 +398,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err
blockDeviceDriver := "virtio-scsi" blockDeviceDriver := "virtio-scsi"
enableIOThreads := true enableIOThreads := true
hotplugVFIOOnRootBus := true hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
disableNewNetNs := false disableNewNetNs := false
sharedFS := "virtio-9p" sharedFS := "virtio-9p"
@ -416,6 +417,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err
BlockDeviceDriver: blockDeviceDriver, BlockDeviceDriver: blockDeviceDriver,
EnableIOThreads: enableIOThreads, EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
DisableNewNetNs: disableNewNetNs, DisableNewNetNs: disableNewNetNs,
SharedFS: sharedFS, SharedFS: sharedFS,
} }

View File

@ -28,6 +28,7 @@ type RuntimeConfigOptions struct {
AgentTraceMode string AgentTraceMode string
AgentTraceType string AgentTraceType string
SharedFS string SharedFS string
PCIeRootPort uint32
DisableBlock bool DisableBlock bool
EnableIOThreads bool EnableIOThreads bool
HotplugVFIOOnRootBus bool HotplugVFIOOnRootBus bool
@ -59,6 +60,7 @@ func MakeRuntimeConfigFileData(config RuntimeConfigOptions) string {
disable_block_device_use = ` + strconv.FormatBool(config.DisableBlock) + ` disable_block_device_use = ` + strconv.FormatBool(config.DisableBlock) + `
enable_iothreads = ` + strconv.FormatBool(config.EnableIOThreads) + ` enable_iothreads = ` + strconv.FormatBool(config.EnableIOThreads) + `
hotplug_vfio_on_root_bus = ` + strconv.FormatBool(config.HotplugVFIOOnRootBus) + ` hotplug_vfio_on_root_bus = ` + strconv.FormatBool(config.HotplugVFIOOnRootBus) + `
pcie_root_port = ` + strconv.FormatUint(uint64(config.PCIeRootPort), 10) + `
msize_9p = ` + strconv.FormatUint(uint64(config.DefaultMsize9p), 10) + ` msize_9p = ` + strconv.FormatUint(uint64(config.DefaultMsize9p), 10) + `
enable_debug = ` + strconv.FormatBool(config.HypervisorDebug) + ` enable_debug = ` + strconv.FormatBool(config.HypervisorDebug) + `
guest_hook_path = "` + config.DefaultGuestHookPath + `" guest_hook_path = "` + config.DefaultGuestHookPath + `"

View File

@ -44,6 +44,7 @@ const defaultEnableDebug bool = false
const defaultDisableNestingChecks bool = false const defaultDisableNestingChecks bool = false
const defaultMsize9p uint32 = 8192 const defaultMsize9p uint32 = 8192
const defaultHotplugVFIOOnRootBus bool = false const defaultHotplugVFIOOnRootBus bool = false
const defaultPCIeRootPort = 0
const defaultEntropySource = "/dev/urandom" const defaultEntropySource = "/dev/urandom"
const defaultGuestHookPath string = "" const defaultGuestHookPath string = ""
const defaultVirtioFSCacheMode = "none" const defaultVirtioFSCacheMode = "none"

View File

@ -111,6 +111,7 @@ type hypervisor struct {
MemOffset uint32 `toml:"memory_offset"` MemOffset uint32 `toml:"memory_offset"`
DefaultBridges uint32 `toml:"default_bridges"` DefaultBridges uint32 `toml:"default_bridges"`
Msize9p uint32 `toml:"msize_9p"` Msize9p uint32 `toml:"msize_9p"`
PCIeRootPort uint32 `toml:"pcie_root_port"`
DisableBlockDeviceUse bool `toml:"disable_block_device_use"` DisableBlockDeviceUse bool `toml:"disable_block_device_use"`
MemPrealloc bool `toml:"enable_mem_prealloc"` MemPrealloc bool `toml:"enable_mem_prealloc"`
HugePages bool `toml:"enable_hugepages"` HugePages bool `toml:"enable_hugepages"`
@ -648,6 +649,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
UseVSock: useVSock, UseVSock: useVSock,
DisableImageNvdimm: h.DisableImageNvdimm, DisableImageNvdimm: h.DisableImageNvdimm,
HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus,
PCIeRootPort: h.PCIeRootPort,
DisableVhostNet: h.DisableVhostNet, DisableVhostNet: h.DisableVhostNet,
GuestHookPath: h.guestHookPath(), GuestHookPath: h.guestHookPath(),
}, nil }, nil
@ -796,6 +798,7 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
EnableIOThreads: h.EnableIOThreads, EnableIOThreads: h.EnableIOThreads,
Msize9p: h.msize9p(), Msize9p: h.msize9p(),
HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus,
PCIeRootPort: h.PCIeRootPort,
DisableVhostNet: true, DisableVhostNet: true,
UseVSock: true, UseVSock: true,
}, nil }, nil
@ -1073,6 +1076,7 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
EnableIOThreads: defaultEnableIOThreads, EnableIOThreads: defaultEnableIOThreads,
Msize9p: defaultMsize9p, Msize9p: defaultMsize9p,
HotplugVFIOOnRootBus: defaultHotplugVFIOOnRootBus, HotplugVFIOOnRootBus: defaultHotplugVFIOOnRootBus,
PCIeRootPort: defaultPCIeRootPort,
GuestHookPath: defaultGuestHookPath, GuestHookPath: defaultGuestHookPath,
VirtioFSCache: defaultVirtioFSCacheMode, VirtioFSCache: defaultVirtioFSCacheMode,
DisableImageNvdimm: defaultDisableImageNvdimm, DisableImageNvdimm: defaultDisableImageNvdimm,

View File

@ -82,6 +82,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
blockDeviceDriver := "virtio-scsi" blockDeviceDriver := "virtio-scsi"
enableIOThreads := true enableIOThreads := true
hotplugVFIOOnRootBus := true hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
disableNewNetNs := false disableNewNetNs := false
sharedFS := "virtio-9p" sharedFS := "virtio-9p"
@ -101,6 +102,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
BlockDeviceDriver: blockDeviceDriver, BlockDeviceDriver: blockDeviceDriver,
EnableIOThreads: enableIOThreads, EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
DisableNewNetNs: disableNewNetNs, DisableNewNetNs: disableNewNetNs,
DefaultVCPUCount: defaultVCPUCount, DefaultVCPUCount: defaultVCPUCount,
DefaultMaxVCPUCount: defaultMaxVCPUCount, DefaultMaxVCPUCount: defaultMaxVCPUCount,
@ -158,6 +160,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
Mlock: !defaultEnableSwap, Mlock: !defaultEnableSwap,
EnableIOThreads: enableIOThreads, EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
Msize9p: defaultMsize9p, Msize9p: defaultMsize9p,
MemSlots: defaultMemSlots, MemSlots: defaultMemSlots,
EntropySource: defaultEntropySource, EntropySource: defaultEntropySource,
@ -775,6 +778,7 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
disableBlock := true disableBlock := true
enableIOThreads := true enableIOThreads := true
hotplugVFIOOnRootBus := true hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
orgVHostVSockDevicePath := utils.VHostVSockDevicePath orgVHostVSockDevicePath := utils.VHostVSockDevicePath
defer func() { defer func() {
utils.VHostVSockDevicePath = orgVHostVSockDevicePath utils.VHostVSockDevicePath = orgVHostVSockDevicePath
@ -789,6 +793,7 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
DisableBlockDeviceUse: disableBlock, DisableBlockDeviceUse: disableBlock,
EnableIOThreads: enableIOThreads, EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
UseVSock: true, UseVSock: true,
} }
@ -846,6 +851,10 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
if config.HotplugVFIOOnRootBus != hotplugVFIOOnRootBus { if config.HotplugVFIOOnRootBus != hotplugVFIOOnRootBus {
t.Errorf("Expected value for HotplugVFIOOnRootBus %v, got %v", hotplugVFIOOnRootBus, config.HotplugVFIOOnRootBus) t.Errorf("Expected value for HotplugVFIOOnRootBus %v, got %v", hotplugVFIOOnRootBus, config.HotplugVFIOOnRootBus)
} }
if config.PCIeRootPort != pcieRootPort {
t.Errorf("Expected value for PCIeRootPort %v, got %v", pcieRootPort, config.PCIeRootPort)
}
} }
func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) { func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) {
@ -869,6 +878,7 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) {
disableBlock := true disableBlock := true
enableIOThreads := true enableIOThreads := true
hotplugVFIOOnRootBus := true hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
hypervisor := hypervisor{ hypervisor := hypervisor{
Path: hypervisorPath, Path: hypervisorPath,
@ -879,6 +889,7 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) {
DisableBlockDeviceUse: disableBlock, DisableBlockDeviceUse: disableBlock,
EnableIOThreads: enableIOThreads, EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
} }
_, err = newQemuHypervisorConfig(hypervisor) _, err = newQemuHypervisorConfig(hypervisor)

View File

@ -75,6 +75,12 @@ var SysDevPrefix = "/sys/dev"
// SysIOMMUPath is static string of /sys/kernel/iommu_groups // SysIOMMUPath is static string of /sys/kernel/iommu_groups
var SysIOMMUPath = "/sys/kernel/iommu_groups" var SysIOMMUPath = "/sys/kernel/iommu_groups"
// SysBusPciDevicesPath is static string of /sys/bus/pci/devices
var SysBusPciDevicesPath = "/sys/bus/pci/devices"
// SysBusPciSlotsPath is static string of /sys/bus/pci/slots
var SysBusPciSlotsPath = "/sys/bus/pci/slots"
// DeviceInfo is an embedded type that contains device data common to all types of devices. // DeviceInfo is an embedded type that contains device data common to all types of devices.
type DeviceInfo struct { type DeviceInfo struct {
// Hostpath is device path on host // Hostpath is device path on host
@ -165,12 +171,15 @@ const (
// VFIODev represents a VFIO drive used for hotplugging // VFIODev represents a VFIO drive used for hotplugging
type VFIODev struct { type VFIODev struct {
// ID is used to identify this drive in the hypervisor options. // IsPCIe specifies device is PCIe or PCI
ID string IsPCIe bool
// Type of VFIO device // Type of VFIO device
Type VFIODeviceType Type VFIODeviceType
// ID is used to identify this drive in the hypervisor options.
ID string
// BDF (Bus:Device.Function) of the PCI address // BDF (Bus:Device.Function) of the PCI address
BDF string BDF string
@ -182,6 +191,12 @@ type VFIODev struct {
// DeviceID specifies device id // DeviceID specifies device id
DeviceID string DeviceID string
// PCI Class Code
Class string
// Bus of VFIO PCIe device
Bus string
} }
// RNGDev represents a random number generator device // RNGDev represents a random number generator device

View File

@ -7,13 +7,102 @@
package drivers package drivers
import ( import (
"github.com/sirupsen/logrus" "fmt"
"io/ioutil"
"path/filepath"
"strings"
"github.com/kata-containers/runtime/virtcontainers/device/api" "github.com/kata-containers/runtime/virtcontainers/device/api"
"github.com/kata-containers/runtime/virtcontainers/device/config"
"github.com/sirupsen/logrus"
) )
const intMax uint = ^uint(0) const (
intMax = ^uint(0)
PCIDomain = "0000"
PCIeKeyword = "PCIe"
)
// PCISysFsType names a sub-directory of the PCI bus sysfs tree
// (/sys/bus/pci).
type PCISysFsType string

// Sub-directories of /sys/bus/pci. They are constants rather than variables
// so that no importer can reassign these fixed sysfs names at runtime.
const (
	// PCISysFsDevices is the "devices" directory: /sys/bus/pci/devices.
	PCISysFsDevices PCISysFsType = "devices"
	// PCISysFsSlots is the "slots" directory: /sys/bus/pci/slots.
	PCISysFsSlots PCISysFsType = "slots"
)
// PCISysFsProperty names an attribute file found under an entry of the PCI
// bus sysfs tree, e.g. /sys/bus/pci/devices/<bdf>/class.
type PCISysFsProperty string

// Attribute file names read from the PCI sysfs tree. They are constants
// rather than variables so that no importer can reassign these fixed sysfs
// names at runtime.
const (
	// PCISysFsDevicesClass is /sys/bus/pci/devices/xxx/class.
	PCISysFsDevicesClass PCISysFsProperty = "class"
	// PCISysFsSlotsAddress is /sys/bus/pci/slots/xxx/address.
	PCISysFsSlotsAddress PCISysFsProperty = "address"
	// PCISysFsSlotsMaxBusSpeed is /sys/bus/pci/slots/xxx/max_bus_speed.
	PCISysFsSlotsMaxBusSpeed PCISysFsProperty = "max_bus_speed"
)
func deviceLogger() *logrus.Entry { func deviceLogger() *logrus.Entry {
return api.DeviceLogger() return api.DeviceLogger()
} }
// isPCIeDevice reports whether the PCI device at bdf sits in a slot whose
// max_bus_speed attribute mentions PCIe.
//
// Every entry under /sys/bus/pci/slots is inspected: its "address" file holds
// the slot address without the function part (sample content "0000:04:00"),
// and its "max_bus_speed" file holds the link speed (sample content
// "8.0 GT/s PCIe").
//
// bdf has the form bus:slot.function and may optionally carry a leading
// domain. false is returned when the slots directory cannot be listed or no
// matching PCIe slot is found.
func isPCIeDevice(bdf string) bool {
	// Exactly one ':' means the domain is missing; prepend the default one.
	if strings.Count(bdf, ":") == 1 {
		bdf = PCIDomain + ":" + bdf
	}

	entries, err := ioutil.ReadDir(config.SysBusPciSlotsPath)
	if err != nil {
		deviceLogger().WithError(err).WithField("path", config.SysBusPciSlotsPath).Warn("failed to list pci slots")
		return false
	}

	// Slot addresses omit the function, so compare against domain:bus:slot only.
	slotAddr := strings.Split(bdf, ".")[0]
	for _, entry := range entries {
		if getPCISlotProperty(entry.Name(), PCISysFsSlotsAddress) != slotAddr {
			continue
		}
		speed := getPCISlotProperty(entry.Name(), PCISysFsSlotsMaxBusSpeed)
		if strings.Contains(speed, PCIeKeyword) {
			return true
		}
	}

	deviceLogger().WithField("dev-bdf", bdf).Debug("can not find slot for bdf of pci device")
	return false
}
// getPCIDeviceProperty returns the first line of
// /sys/bus/pci/devices/<bdf>/<property>.
//
// A bdf given without a domain is prefixed with the default PCI domain. When
// the attribute cannot be read, a warning is logged and the empty string is
// returned.
func getPCIDeviceProperty(bdf string, property PCISysFsProperty) string {
	// Exactly one ':' means the domain is missing; prepend the default one.
	if strings.Count(bdf, ":") == 1 {
		bdf = PCIDomain + ":" + bdf
	}

	sysfsPath := filepath.Join(config.SysBusPciDevicesPath, bdf, string(property))
	value, err := readPCIProperty(sysfsPath)
	if err == nil {
		return value
	}

	deviceLogger().WithError(err).WithField("path", sysfsPath).Warn("failed to read pci device property")
	return ""
}
// getPCISlotProperty returns the first line of
// /sys/bus/pci/slots/<slot>/<property>.
//
// When the attribute cannot be read, a warning is logged and the empty string
// is returned.
func getPCISlotProperty(slot string, property PCISysFsProperty) string {
	sysfsPath := filepath.Join(config.SysBusPciSlotsPath, slot, string(property))
	value, err := readPCIProperty(sysfsPath)
	if err == nil {
		return value
	}

	deviceLogger().WithError(err).WithField("path", sysfsPath).Warn("failed to read pci slot property")
	return ""
}
// readPCIProperty reads the file at propertyPath and returns its first line,
// without the trailing newline. If the file cannot be read, an empty string
// and an error describing the path and the cause are returned.
func readPCIProperty(propertyPath string) (string, error) {
	data, err := ioutil.ReadFile(propertyPath)
	if err != nil {
		return "", fmt.Errorf("failed to read pci sysfs %v, error:%v", propertyPath, err)
	}

	// Keep only what precedes the first newline, if there is one.
	content := string(data)
	if end := strings.IndexByte(content, '\n'); end >= 0 {
		content = content[:end]
	}
	return content, nil
}

View File

@ -27,6 +27,11 @@ const (
pciDriverBindPath = "/sys/bus/pci/drivers/%s/bind" pciDriverBindPath = "/sys/bus/pci/drivers/%s/bind"
vfioNewIDPath = "/sys/bus/pci/drivers/vfio-pci/new_id" vfioNewIDPath = "/sys/bus/pci/drivers/vfio-pci/new_id"
vfioRemoveIDPath = "/sys/bus/pci/drivers/vfio-pci/remove_id" vfioRemoveIDPath = "/sys/bus/pci/drivers/vfio-pci/remove_id"
pcieRootPortPrefix = "rp"
)
var (
// AllPCIeDevs records, keyed by BDF, the VFIO devices that Attach
// detected as PCIe. Its current length is used to build the name of the
// PCIe root port ("rp<N>") assigned to the next such device.
// NOTE(review): this is a plain package-level map; nothing visible here
// guards it against concurrent Attach calls or removes entries when a
// device is detached — confirm whether that is intended.
AllPCIeDevs = map[string]bool{}
)
// VFIODevice is a vfio device meant to be passed to the hypervisor // VFIODevice is a vfio device meant to be passed to the hypervisor
@ -83,8 +88,14 @@ func (device *VFIODevice) Attach(devReceiver api.DeviceReceiver) (retErr error)
Type: vfioDeviceType, Type: vfioDeviceType,
BDF: deviceBDF, BDF: deviceBDF,
SysfsDev: deviceSysfsDev, SysfsDev: deviceSysfsDev,
IsPCIe: isPCIeDevice(deviceBDF),
Class: getPCIDeviceProperty(deviceBDF, PCISysFsDevicesClass),
} }
device.VfioDevs = append(device.VfioDevs, vfio) device.VfioDevs = append(device.VfioDevs, vfio)
if vfio.IsPCIe {
vfio.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, len(AllPCIeDevs))
AllPCIeDevs[vfio.BDF] = true
}
} }
// hotplug a VFIO device is actually hotplugging a group of iommu devices // hotplug a VFIO device is actually hotplugging a group of iommu devices

View File

@ -369,6 +369,10 @@ type HypervisorConfig struct {
// root bus instead of a bridge. // root bus instead of a bridge.
HotplugVFIOOnRootBus bool HotplugVFIOOnRootBus bool
// PCIeRootPort is used to indicate the number of PCIe Root Port devices
// The PCIe Root Port device is used to hot-plug the PCIe device
PCIeRootPort uint32
// BootToBeTemplate used to indicate if the VM is created to be a template VM // BootToBeTemplate used to indicate if the VM is created to be a template VM
BootToBeTemplate bool BootToBeTemplate bool

View File

@ -248,6 +248,7 @@ func (s *Sandbox) dumpConfig(ss *persistapi.SandboxState) {
UseVSock: sconfig.HypervisorConfig.UseVSock, UseVSock: sconfig.HypervisorConfig.UseVSock,
DisableImageNvdimm: sconfig.HypervisorConfig.DisableImageNvdimm, DisableImageNvdimm: sconfig.HypervisorConfig.DisableImageNvdimm,
HotplugVFIOOnRootBus: sconfig.HypervisorConfig.HotplugVFIOOnRootBus, HotplugVFIOOnRootBus: sconfig.HypervisorConfig.HotplugVFIOOnRootBus,
PCIeRootPort: sconfig.HypervisorConfig.PCIeRootPort,
BootToBeTemplate: sconfig.HypervisorConfig.BootToBeTemplate, BootToBeTemplate: sconfig.HypervisorConfig.BootToBeTemplate,
BootFromTemplate: sconfig.HypervisorConfig.BootFromTemplate, BootFromTemplate: sconfig.HypervisorConfig.BootFromTemplate,
DisableVhostNet: sconfig.HypervisorConfig.DisableVhostNet, DisableVhostNet: sconfig.HypervisorConfig.DisableVhostNet,
@ -534,6 +535,7 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) {
UseVSock: hconf.UseVSock, UseVSock: hconf.UseVSock,
DisableImageNvdimm: hconf.DisableImageNvdimm, DisableImageNvdimm: hconf.DisableImageNvdimm,
HotplugVFIOOnRootBus: hconf.HotplugVFIOOnRootBus, HotplugVFIOOnRootBus: hconf.HotplugVFIOOnRootBus,
PCIeRootPort: hconf.PCIeRootPort,
BootToBeTemplate: hconf.BootToBeTemplate, BootToBeTemplate: hconf.BootToBeTemplate,
BootFromTemplate: hconf.BootFromTemplate, BootFromTemplate: hconf.BootFromTemplate,
DisableVhostNet: hconf.DisableVhostNet, DisableVhostNet: hconf.DisableVhostNet,

View File

@ -153,6 +153,10 @@ type HypervisorConfig struct {
// root bus instead of a bridge. // root bus instead of a bridge.
HotplugVFIOOnRootBus bool HotplugVFIOOnRootBus bool
// PCIeRootPort is used to indicate the number of PCIe Root Port devices
// The PCIe Root Port device is used to hot-plug the PCIe device
PCIeRootPort uint32
// BootToBeTemplate used to indicate if the VM is created to be a template VM // BootToBeTemplate used to indicate if the VM is created to be a template VM
BootToBeTemplate bool BootToBeTemplate bool

View File

@ -41,4 +41,5 @@ type HypervisorState struct {
HotpluggedMemory int HotpluggedMemory int
VirtiofsdPid int VirtiofsdPid int
HotplugVFIOOnRootBus bool HotplugVFIOOnRootBus bool
PCIeRootPort int
} }

View File

@ -97,6 +97,10 @@ const (
// root bus instead of a bridge. // root bus instead of a bridge.
HotplugVFIOOnRootBus = kataAnnotHypervisorPrefix + "hotplug_vfio_on_root_bus" HotplugVFIOOnRootBus = kataAnnotHypervisorPrefix + "hotplug_vfio_on_root_bus"
// PCIeRootPort is used to indicate the number of PCIe Root Port devices
// The PCIe Root Port device is used to hot-plug the PCIe device
PCIeRootPort = kataAnnotHypervisorPrefix + "pcie_root_port"
// EntropySource is a sandbox annotation to specify the path to a host source of // EntropySource is a sandbox annotation to specify the path to a host source of
// entropy (/dev/random, /dev/urandom or real hardware RNG device) // entropy (/dev/random, /dev/urandom or real hardware RNG device)
EntropySource = kataAnnotHypervisorPrefix + "entropy_source" EntropySource = kataAnnotHypervisorPrefix + "entropy_source"

View File

@ -447,6 +447,14 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig)
config.HypervisorConfig.HotplugVFIOOnRootBus = hotplugVFIOOnRootBus config.HypervisorConfig.HotplugVFIOOnRootBus = hotplugVFIOOnRootBus
} }
if value, ok := ocispec.Annotations[vcAnnotations.PCIeRootPort]; ok {
pcieRootPort, err := strconv.ParseUint(value, 10, 32)
if err != nil {
return fmt.Errorf("Error parsing annotation for pcie_root_port: %v, Please specify an integer greater than or equal to 0", err)
}
config.HypervisorConfig.PCIeRootPort = uint32(pcieRootPort)
}
if value, ok := ocispec.Annotations[vcAnnotations.EntropySource]; ok { if value, ok := ocispec.Annotations[vcAnnotations.EntropySource]; ok {
if value != "" { if value != "" {
config.HypervisorConfig.EntropySource = value config.HypervisorConfig.EntropySource = value

View File

@ -763,6 +763,7 @@ func TestAddHypervisorAnnotations(t *testing.T) {
ocispec.Annotations[vcAnnotations.UseVSock] = "true" ocispec.Annotations[vcAnnotations.UseVSock] = "true"
ocispec.Annotations[vcAnnotations.DisableImageNvdimm] = "true" ocispec.Annotations[vcAnnotations.DisableImageNvdimm] = "true"
ocispec.Annotations[vcAnnotations.HotplugVFIOOnRootBus] = "true" ocispec.Annotations[vcAnnotations.HotplugVFIOOnRootBus] = "true"
ocispec.Annotations[vcAnnotations.PCIeRootPort] = "2"
ocispec.Annotations[vcAnnotations.EntropySource] = "/dev/urandom" ocispec.Annotations[vcAnnotations.EntropySource] = "/dev/urandom"
addAnnotations(ocispec, &config) addAnnotations(ocispec, &config)
@ -793,6 +794,7 @@ func TestAddHypervisorAnnotations(t *testing.T) {
assert.Equal(config.HypervisorConfig.UseVSock, true) assert.Equal(config.HypervisorConfig.UseVSock, true)
assert.Equal(config.HypervisorConfig.DisableImageNvdimm, true) assert.Equal(config.HypervisorConfig.DisableImageNvdimm, true)
assert.Equal(config.HypervisorConfig.HotplugVFIOOnRootBus, true) assert.Equal(config.HypervisorConfig.HotplugVFIOOnRootBus, true)
assert.Equal(config.HypervisorConfig.PCIeRootPort, uint32(2))
assert.Equal(config.HypervisorConfig.EntropySource, "/dev/urandom") assert.Equal(config.HypervisorConfig.EntropySource, "/dev/urandom")
// In case an absurd large value is provided, the config value if not over-ridden // In case an absurd large value is provided, the config value if not over-ridden

View File

@ -71,6 +71,7 @@ type QemuState struct {
UUID string UUID string
HotplugVFIOOnRootBus bool HotplugVFIOOnRootBus bool
VirtiofsdPid int VirtiofsdPid int
PCIeRootPort int
} }
// qemu is an Hypervisor interface implementation for the Linux qemu hypervisor. // qemu is an Hypervisor interface implementation for the Linux qemu hypervisor.
@ -266,6 +267,7 @@ func (q *qemu) setup(id string, hypervisorConfig *HypervisorConfig) error {
q.state.UUID = uuid.Generate().String() q.state.UUID = uuid.Generate().String()
q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus
q.state.PCIeRootPort = int(q.config.PCIeRootPort)
// The path might already exist, but in case of VM templating, // The path might already exist, but in case of VM templating,
// we have to create it since the sandbox has not created it yet. // we have to create it since the sandbox has not created it yet.
@ -584,6 +586,13 @@ func (q *qemu) createSandbox(ctx context.Context, id string, networkNS NetworkNa
return err return err
} }
// Add PCIe Root Port devices to hypervisor
// The pcie.0 bus does not support hot-plug, but a PCIe device can be hot-plugged into a PCIe Root Port.
// For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt
if hypervisorConfig.PCIeRootPort > 0 {
qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, hypervisorConfig.PCIeRootPort)
}
q.qemuConfig = qemuConfig q.qemuConfig = qemuConfig
return nil return nil
@ -1149,17 +1158,39 @@ func (q *qemu) hotplugVFIODevice(device *config.VFIODev, op operation) (err erro
} }
devID := device.ID devID := device.ID
machinneType := q.hypervisorConfig().HypervisorMachineType
if op == addDevice { if op == addDevice {
buf, _ := json.Marshal(device)
q.Logger().WithFields(logrus.Fields{
"machine-type": machinneType,
"hotplug-vfio-on-root-bus": q.state.HotplugVFIOOnRootBus,
"pcie-root-port": q.state.PCIeRootPort,
"device-info": string(buf),
}).Info("Start hot-plug VFIO device")
// In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus // In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus
// for pc machine type instead of bridge. This is useful for devices that require // for pc machine type instead of bridge. This is useful for devices that require
// a large PCI BAR which is a currently a limitation with PCI bridges. // a large PCI BAR which is a currently a limitation with PCI bridges.
if q.state.HotplugVFIOOnRootBus { if q.state.HotplugVFIOOnRootBus {
// In case MachineType is q35, a PCIe device is hotplugged on a PCIe Root Port.
switch machinneType {
case QemuQ35:
if device.IsPCIe && q.state.PCIeRootPort <= 0 {
q.Logger().WithField("dev-id", device.ID).Warn("VFIO device is a PCIe device. It's recommended to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for q35")
device.Bus = ""
}
default:
device.Bus = ""
}
switch device.Type { switch device.Type {
case config.VFIODeviceNormalType: case config.VFIODeviceNormalType:
return q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, device.BDF, "", romFile) return q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, device.BDF, device.Bus, romFile)
case config.VFIODeviceMediatedType: case config.VFIODeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, device.SysfsDev, "", "", romFile) return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, device.SysfsDev, "", device.Bus, romFile)
default: default:
return fmt.Errorf("Incorrect VFIO device type found") return fmt.Errorf("Incorrect VFIO device type found")
} }
@ -1185,6 +1216,8 @@ func (q *qemu) hotplugVFIODevice(device *config.VFIODev, op operation) (err erro
return fmt.Errorf("Incorrect VFIO device type found") return fmt.Errorf("Incorrect VFIO device type found")
} }
} else { } else {
q.Logger().WithField("dev-id", devID).Info("Start hot-unplug VFIO device")
if !q.state.HotplugVFIOOnRootBus { if !q.state.HotplugVFIOOnRootBus {
if err := q.arch.removeDeviceFromBridge(devID); err != nil { if err := q.arch.removeDeviceFromBridge(devID); err != nil {
return err return err
@ -1848,6 +1881,39 @@ func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8, memoryOff
return memory return memory
} }
// genericAppendPCIeRootPort appends number pcie-root-port devices to devices
// and returns the resulting slice.
//
// Root ports are only generated for the q35 machine type; for any other
// machineType the slice is returned unchanged. Port i is given the ID
// pcieRootPortPrefix followed by i and uses i as its slot; every port shares
// the default bridge bus, chassis "0", address "0" and has multifunction
// disabled.
func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machineType string) []govmmQemu.Device {
	if machineType != QemuQ35 {
		return devices
	}

	for idx := uint32(0); idx < number; idx++ {
		// The decimal index doubles as the ID suffix and the slot number.
		slot := strconv.FormatUint(uint64(idx), 10)
		rootPort := govmmQemu.PCIeRootPortDevice{
			ID:            pcieRootPortPrefix + slot,
			Bus:           defaultBridgeBus,
			Chassis:       "0",
			Slot:          slot,
			Multifunction: false,
			Addr:          "0",
		}
		devices = append(devices, rootPort)
	}
	return devices
}
func (q *qemu) getThreadIDs() (vcpuThreadIDs, error) { func (q *qemu) getThreadIDs() (vcpuThreadIDs, error) {
span, _ := q.trace("getThreadIDs") span, _ := q.trace("getThreadIDs")
defer span.Finish() defer span.Finish()
@ -2013,6 +2079,7 @@ func (q *qemu) save() (s persistapi.HypervisorState) {
s.UUID = q.state.UUID s.UUID = q.state.UUID
s.HotpluggedMemory = q.state.HotpluggedMemory s.HotpluggedMemory = q.state.HotpluggedMemory
s.HotplugVFIOOnRootBus = q.state.HotplugVFIOOnRootBus s.HotplugVFIOOnRootBus = q.state.HotplugVFIOOnRootBus
s.PCIeRootPort = q.state.PCIeRootPort
for _, bridge := range q.arch.getBridges() { for _, bridge := range q.arch.getBridges() {
s.Bridges = append(s.Bridges, persistapi.Bridge{ s.Bridges = append(s.Bridges, persistapi.Bridge{
@ -2036,6 +2103,7 @@ func (q *qemu) load(s persistapi.HypervisorState) {
q.state.HotpluggedMemory = s.HotpluggedMemory q.state.HotpluggedMemory = s.HotpluggedMemory
q.state.HotplugVFIOOnRootBus = s.HotplugVFIOOnRootBus q.state.HotplugVFIOOnRootBus = s.HotplugVFIOOnRootBus
q.state.VirtiofsdPid = s.VirtiofsdPid q.state.VirtiofsdPid = s.VirtiofsdPid
q.state.PCIeRootPort = s.PCIeRootPort
for _, bridge := range s.Bridges { for _, bridge := range s.Bridges {
q.state.Bridges = append(q.state.Bridges, types.NewBridge(types.Type(bridge.Type), bridge.ID, bridge.DeviceAddr, bridge.Addr)) q.state.Bridges = append(q.state.Bridges, types.NewBridge(types.Type(bridge.Type), bridge.ID, bridge.DeviceAddr, bridge.Addr))

View File

@ -20,13 +20,15 @@ type qemuAmd64 struct {
vmFactory bool vmFactory bool
} }
const defaultQemuPath = "/usr/bin/qemu-system-x86_64" const (
defaultQemuPath = "/usr/bin/qemu-system-x86_64"
const defaultQemuMachineType = QemuPC defaultQemuMachineType = QemuPC
const defaultQemuMachineOptions = "accel=kvm,kernel_irqchip" defaultQemuMachineOptions = "accel=kvm,kernel_irqchip"
const qmpMigrationWaitTimeout = 5 * time.Second qmpMigrationWaitTimeout = 5 * time.Second
)
var qemuPaths = map[string]string{ var qemuPaths = map[string]string{
QemuPCLite: "/usr/bin/qemu-lite-system-x86_64", QemuPCLite: "/usr/bin/qemu-lite-system-x86_64",

View File

@ -127,6 +127,9 @@ type qemuArch interface {
// setIgnoreSharedMemoryMigrationCaps set bypass-shared-memory capability for migration // setIgnoreSharedMemoryMigrationCaps set bypass-shared-memory capability for migration
setIgnoreSharedMemoryMigrationCaps(context.Context, *govmmQemu.QMP) error setIgnoreSharedMemoryMigrationCaps(context.Context, *govmmQemu.QMP) error
// appendPCIeRootPortDevice appends a pcie-root-port device to pcie.0 bus
appendPCIeRootPortDevice(devices []govmmQemu.Device, number uint32) []govmmQemu.Device
} }
type qemuArchBase struct { type qemuArchBase struct {
@ -153,6 +156,7 @@ const (
defaultPCBridgeBus = "pci.0" defaultPCBridgeBus = "pci.0"
maxDevIDSize = 31 maxDevIDSize = 31
defaultMsize9p = 8192 defaultMsize9p = 8192
pcieRootPortPrefix = "rp"
) )
// This is the PCI start address assigned to the first bridge that // This is the PCI start address assigned to the first bridge that
@ -646,6 +650,7 @@ func (q *qemuArchBase) appendVFIODevice(devices []govmmQemu.Device, vfioDev conf
BDF: vfioDev.BDF, BDF: vfioDev.BDF,
VendorID: vfioDev.VendorID, VendorID: vfioDev.VendorID,
DeviceID: vfioDev.DeviceID, DeviceID: vfioDev.DeviceID,
Bus: vfioDev.Bus,
}, },
) )
@ -750,3 +755,8 @@ func (q *qemuArchBase) setBridges(bridges []types.Bridge) {
func (q *qemuArchBase) addBridge(b types.Bridge) { func (q *qemuArchBase) addBridge(b types.Bridge) {
q.Bridges = append(q.Bridges, b) q.Bridges = append(q.Bridges, b)
} }
// appendPCIeRootPortDevice appends number pcie-root-port devices to devices,
// delegating to genericAppendPCIeRootPort with this architecture's machine
// type, and returns the resulting slice.
func (q *qemuArchBase) appendPCIeRootPortDevice(devices []govmmQemu.Device, number uint32) []govmmQemu.Device {
	withRootPorts := genericAppendPCIeRootPort(devices, number, q.machineType)
	return withRootPorts
}