diff --git a/Makefile b/Makefile index 3d6c30979d..07859efee5 100644 --- a/Makefile +++ b/Makefile @@ -186,6 +186,7 @@ DEFENABLEDEBUG := false DEFDISABLENESTINGCHECKS := false DEFMSIZE9P := 8192 DEFHOTPLUGVFIOONROOTBUS := false +DEFPCIEROOTPORT := 0 # Default cgroup model DEFSANDBOXCGROUPONLY ?= false @@ -444,6 +445,7 @@ USER_VARS += DEFENABLEDEBUG USER_VARS += DEFDISABLENESTINGCHECKS USER_VARS += DEFMSIZE9P USER_VARS += DEFHOTPLUGVFIOONROOTBUS +USER_VARS += DEFPCIEROOTPORT USER_VARS += DEFENTROPYSOURCE USER_VARS += DEFSANDBOXCGROUPONLY USER_VARS += BUILDFLAGS @@ -619,7 +621,8 @@ $(GENERATED_FILES): %: %.in $(MAKEFILE_LIST) VERSION .git-commit -e "s|@DEFENABLEDEBUG@|$(DEFENABLEDEBUG)|g" \ -e "s|@DEFDISABLENESTINGCHECKS@|$(DEFDISABLENESTINGCHECKS)|g" \ -e "s|@DEFMSIZE9P@|$(DEFMSIZE9P)|g" \ - -e "s|@DEFHOTPLUGONROOTBUS@|$(DEFHOTPLUGVFIOONROOTBUS)|g" \ + -e "s|@DEFHOTPLUGVFIOONROOTBUS@|$(DEFHOTPLUGVFIOONROOTBUS)|g" \ + -e "s|@DEFPCIEROOTPORT@|$(DEFPCIEROOTPORT)|g" \ -e "s|@DEFENTROPYSOURCE@|$(DEFENTROPYSOURCE)|g" \ -e "s|@DEFSANDBOXCGROUPONLY@|$(DEFSANDBOXCGROUPONLY)|g" \ $< > $@ diff --git a/cli/config/configuration-qemu.toml.in b/cli/config/configuration-qemu.toml.in index d7ab5e80dc..20e387008c 100644 --- a/cli/config/configuration-qemu.toml.in +++ b/cli/config/configuration-qemu.toml.in @@ -224,6 +224,13 @@ enable_iothreads = @DEFENABLEIOTHREADS@ # Default false #hotplug_vfio_on_root_bus = true +# Before hot plugging a PCIe device, you need to add a pcie_root_port device. +# Use this parameter when using some large PCI bar devices, such as Nvidia GPU +# The value means the number of pcie_root_port +# This value is valid when hotplug_vfio_on_root_bus is true and machine_type is "q35" +# Default 0 +#pcie_root_port = 2 + # If vhost-net backend for virtio-net is not desired, set to true. Default is false, which trades off # security (vhost-net runs ring0) for network I/O performance. #disable_vhost_net = true diff --git a/cli/kata-env.go b/cli/kata-env.go index 90a73b3c41..c973961aae 100644 --- a/cli/kata-env.go +++ b/cli/kata-env.go @@ -83,16 +83,18 @@ type RuntimeVersionInfo struct { // HypervisorInfo stores hypervisor details type HypervisorInfo struct { - MachineType string - Version string - Path string - BlockDeviceDriver string - EntropySource string - Msize9p uint32 - MemorySlots uint32 - Debug bool - UseVSock bool - SharedFS string + MachineType string + Version string + Path string + BlockDeviceDriver string + EntropySource string + SharedFS string + Msize9p uint32 + MemorySlots uint32 + PCIeRootPort uint32 + HotplugVFIOOnRootBus bool + Debug bool + UseVSock bool } // ProxyInfo stores proxy details @@ -355,6 +357,9 @@ func getHypervisorInfo(config oci.RuntimeConfig) HypervisorInfo { MemorySlots: config.HypervisorConfig.MemSlots, EntropySource: config.HypervisorConfig.EntropySource, SharedFS: config.HypervisorConfig.SharedFS, + + HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus, + PCIeRootPort: config.HypervisorConfig.PCIeRootPort, } } diff --git a/cli/kata-env_test.go b/cli/kata-env_test.go index 1e1b3ee092..ed02de8ffd 100644 --- a/cli/kata-env_test.go +++ b/cli/kata-env_test.go @@ -91,6 +91,7 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC blockStorageDriver := "virtio-scsi" enableIOThreads := true hotplugVFIOOnRootBus := true + pcieRootPort := uint32(2) disableNewNetNs := false sharedFS := "virtio-9p" @@ -150,6 +151,7 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC BlockDeviceDriver: blockStorageDriver, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, + PCIeRootPort: pcieRootPort, DisableNewNetNs: disableNewNetNs, DefaultVCPUCount: hypConfig.NumVCPUs, DefaultMaxVCPUCount: hypConfig.DefaultMaxVCPUs, @@ -329,6 +331,9 @@ func getExpectedHypervisor(config oci.RuntimeConfig) HypervisorInfo { Debug: config.HypervisorConfig.Debug, EntropySource: config.HypervisorConfig.EntropySource, SharedFS: config.HypervisorConfig.SharedFS, + + HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus, + PCIeRootPort: config.HypervisorConfig.PCIeRootPort, } } diff --git a/containerd-shim-v2/create_test.go b/containerd-shim-v2/create_test.go index ea0e286d4d..bf7cd4321b 100644 --- a/containerd-shim-v2/create_test.go +++ b/containerd-shim-v2/create_test.go @@ -398,6 +398,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err blockDeviceDriver := "virtio-scsi" enableIOThreads := true hotplugVFIOOnRootBus := true + pcieRootPort := uint32(2) disableNewNetNs := false sharedFS := "virtio-9p" @@ -416,6 +417,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err BlockDeviceDriver: blockDeviceDriver, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, + PCIeRootPort: pcieRootPort, DisableNewNetNs: disableNewNetNs, SharedFS: sharedFS, } diff --git a/pkg/katatestutils/utils.go b/pkg/katatestutils/utils.go index 62f6ffc97e..bb71d616fb 100644 --- a/pkg/katatestutils/utils.go +++ b/pkg/katatestutils/utils.go @@ -28,6 +28,7 @@ type RuntimeConfigOptions struct { AgentTraceMode string AgentTraceType string SharedFS string + PCIeRootPort uint32 DisableBlock bool EnableIOThreads bool HotplugVFIOOnRootBus bool @@ -59,6 +60,7 @@ func MakeRuntimeConfigFileData(config RuntimeConfigOptions) string { disable_block_device_use = ` + strconv.FormatBool(config.DisableBlock) + ` enable_iothreads = ` + strconv.FormatBool(config.EnableIOThreads) + ` hotplug_vfio_on_root_bus = ` + strconv.FormatBool(config.HotplugVFIOOnRootBus) + ` + pcie_root_port = ` + strconv.FormatUint(uint64(config.PCIeRootPort), 10) + ` msize_9p = ` + strconv.FormatUint(uint64(config.DefaultMsize9p), 10) + ` enable_debug = ` + strconv.FormatBool(config.HypervisorDebug) + ` guest_hook_path = "` + config.DefaultGuestHookPath + `" diff --git a/pkg/katautils/config-settings.go b/pkg/katautils/config-settings.go index 9bae10f7ee..c46f36c15f 100644 --- a/pkg/katautils/config-settings.go +++ b/pkg/katautils/config-settings.go @@ -44,6 +44,7 @@ const defaultEnableDebug bool = false const defaultDisableNestingChecks bool = false const defaultMsize9p uint32 = 8192 const defaultHotplugVFIOOnRootBus bool = false +const defaultPCIeRootPort = 0 const defaultEntropySource = "/dev/urandom" const defaultGuestHookPath string = "" const defaultVirtioFSCacheMode = "none" diff --git a/pkg/katautils/config.go b/pkg/katautils/config.go index 6f7c14c66e..bf20a33206 100644 --- a/pkg/katautils/config.go +++ b/pkg/katautils/config.go @@ -111,6 +111,7 @@ type hypervisor struct { MemOffset uint32 `toml:"memory_offset"` DefaultBridges uint32 `toml:"default_bridges"` Msize9p uint32 `toml:"msize_9p"` + PCIeRootPort uint32 `toml:"pcie_root_port"` DisableBlockDeviceUse bool `toml:"disable_block_device_use"` MemPrealloc bool `toml:"enable_mem_prealloc"` HugePages bool `toml:"enable_hugepages"` @@ -648,6 +649,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { UseVSock: useVSock, DisableImageNvdimm: h.DisableImageNvdimm, HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, + PCIeRootPort: h.PCIeRootPort, DisableVhostNet: h.DisableVhostNet, GuestHookPath: h.guestHookPath(), }, nil @@ -796,6 +798,7 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { EnableIOThreads: h.EnableIOThreads, Msize9p: h.msize9p(), HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, + PCIeRootPort: h.PCIeRootPort, DisableVhostNet: true, UseVSock: true, }, nil @@ -1073,6 +1076,7 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig { EnableIOThreads: defaultEnableIOThreads, Msize9p: defaultMsize9p, HotplugVFIOOnRootBus: defaultHotplugVFIOOnRootBus, + PCIeRootPort: defaultPCIeRootPort, GuestHookPath: defaultGuestHookPath, VirtioFSCache: defaultVirtioFSCacheMode, DisableImageNvdimm: defaultDisableImageNvdimm, diff --git a/pkg/katautils/config_test.go b/pkg/katautils/config_test.go index 96f5059b9a..eb15e5f3e7 100644 --- a/pkg/katautils/config_test.go +++ b/pkg/katautils/config_test.go @@ -82,6 +82,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf blockDeviceDriver := "virtio-scsi" enableIOThreads := true hotplugVFIOOnRootBus := true + pcieRootPort := uint32(2) disableNewNetNs := false sharedFS := "virtio-9p" @@ -101,6 +102,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf BlockDeviceDriver: blockDeviceDriver, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, + PCIeRootPort: pcieRootPort, DisableNewNetNs: disableNewNetNs, DefaultVCPUCount: defaultVCPUCount, DefaultMaxVCPUCount: defaultMaxVCPUCount, @@ -158,6 +160,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf Mlock: !defaultEnableSwap, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, + PCIeRootPort: pcieRootPort, Msize9p: defaultMsize9p, MemSlots: defaultMemSlots, EntropySource: defaultEntropySource, @@ -775,6 +778,7 @@ func TestNewQemuHypervisorConfig(t *testing.T) { disableBlock := true enableIOThreads := true hotplugVFIOOnRootBus := true + pcieRootPort := uint32(2) orgVHostVSockDevicePath := utils.VHostVSockDevicePath defer func() { utils.VHostVSockDevicePath = orgVHostVSockDevicePath @@ -789,6 +793,7 @@ func TestNewQemuHypervisorConfig(t *testing.T) { DisableBlockDeviceUse: disableBlock, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, + PCIeRootPort: pcieRootPort, UseVSock: true, } @@ -846,6 +851,10 @@ func TestNewQemuHypervisorConfig(t *testing.T) { if config.HotplugVFIOOnRootBus != hotplugVFIOOnRootBus { t.Errorf("Expected value for HotplugVFIOOnRootBus %v, got %v", hotplugVFIOOnRootBus, config.HotplugVFIOOnRootBus) } + + if config.PCIeRootPort != pcieRootPort { + t.Errorf("Expected value for PCIeRootPort %v, got %v", pcieRootPort, config.PCIeRootPort) + } } func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) { @@ -869,6 +878,7 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) { disableBlock := true enableIOThreads := true hotplugVFIOOnRootBus := true + pcieRootPort := uint32(2) hypervisor := hypervisor{ Path: hypervisorPath, @@ -879,6 +889,7 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) { DisableBlockDeviceUse: disableBlock, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, + PCIeRootPort: pcieRootPort, } _, err = newQemuHypervisorConfig(hypervisor) diff --git a/virtcontainers/device/config/config.go b/virtcontainers/device/config/config.go index 9a30134576..3a884470a7 100644 --- a/virtcontainers/device/config/config.go +++ b/virtcontainers/device/config/config.go @@ -75,6 +75,12 @@ var SysDevPrefix = "/sys/dev" // SysIOMMUPath is static string of /sys/kernel/iommu_groups var SysIOMMUPath = "/sys/kernel/iommu_groups" +// SysBusPciDevicesPath is static string of /sys/bus/pci/devices +var SysBusPciDevicesPath = "/sys/bus/pci/devices" + +// SysBusPciSlotsPath is static string of /sys/bus/pci/slots +var SysBusPciSlotsPath = "/sys/bus/pci/slots" + // DeviceInfo is an embedded type that contains device data common to all types of devices. type DeviceInfo struct { // Hostpath is device path on host @@ -165,12 +171,15 @@ const ( // VFIODev represents a VFIO drive used for hotplugging type VFIODev struct { - // ID is used to identify this drive in the hypervisor options. - ID string + // IsPCIe specifies device is PCIe or PCI + IsPCIe bool // Type of VFIO device Type VFIODeviceType + // ID is used to identify this drive in the hypervisor options. + ID string + // BDF (Bus:Device.Function) of the PCI address BDF string @@ -182,6 +191,12 @@ type VFIODev struct { // DeviceID specifies device id DeviceID string + + // PCI Class Code + Class string + + // Bus of VFIO PCIe device + Bus string } // RNGDev represents a random number generator device diff --git a/virtcontainers/device/drivers/utils.go b/virtcontainers/device/drivers/utils.go index 33c18ae8ea..f3b338c61a 100644 --- a/virtcontainers/device/drivers/utils.go +++ b/virtcontainers/device/drivers/utils.go @@ -7,13 +7,102 @@ package drivers import ( - "github.com/sirupsen/logrus" + "fmt" + "io/ioutil" + "path/filepath" + "strings" "github.com/kata-containers/runtime/virtcontainers/device/api" + "github.com/kata-containers/runtime/virtcontainers/device/config" + "github.com/sirupsen/logrus" ) -const intMax uint = ^uint(0) +const ( + intMax = ^uint(0) + + PCIDomain = "0000" + PCIeKeyword = "PCIe" +) + +type PCISysFsType string + +var ( + PCISysFsDevices PCISysFsType = "devices" // /sys/bus/pci/devices + PCISysFsSlots PCISysFsType = "slots" // /sys/bus/pci/slots +) + +type PCISysFsProperty string + +var ( + PCISysFsDevicesClass PCISysFsProperty = "class" // /sys/bus/pci/devices/xxx/class + PCISysFsSlotsAddress PCISysFsProperty = "address" // /sys/bus/pci/slots/xxx/address + PCISysFsSlotsMaxBusSpeed PCISysFsProperty = "max_bus_speed" // /sys/bus/pci/slots/xxx/max_bus_speed +) func deviceLogger() *logrus.Entry { return api.DeviceLogger() } + +/* +Identify PCIe device by /sys/bus/pci/slots/xx/max_bus_speed, sample content "8.0 GT/s PCIe" +The /sys/bus/pci/slots/xx/address contains bdf, sample content "0000:04:00" +bdf format: bus:slot.function +*/ +func isPCIeDevice(bdf string) bool { + if len(strings.Split(bdf, ":")) == 2 { + bdf = PCIDomain + ":" + bdf + } + slots, err := ioutil.ReadDir(config.SysBusPciSlotsPath) + if err != nil { + deviceLogger().WithError(err).WithField("path", config.SysBusPciSlotsPath).Warn("failed to list pci slots") + return false + } + b := strings.Split(bdf, ".")[0] + for _, slot := range slots { + address := getPCISlotProperty(slot.Name(), PCISysFsSlotsAddress) + if b == address { + maxBusSpeed := getPCISlotProperty(slot.Name(), PCISysFsSlotsMaxBusSpeed) + if strings.Contains(maxBusSpeed, PCIeKeyword) { + return true + } + } + } + deviceLogger().WithField("dev-bdf", bdf).Debug("can not find slot for bdf of pci device") + return false +} + +// read from /sys/bus/pci/devices/xxx/property +func getPCIDeviceProperty(bdf string, property PCISysFsProperty) string { + if len(strings.Split(bdf, ":")) == 2 { + bdf = PCIDomain + ":" + bdf + } + propertyPath := filepath.Join(config.SysBusPciDevicesPath, bdf, string(property)) + rlt, err := readPCIProperty(propertyPath) + if err != nil { + deviceLogger().WithError(err).WithField("path", propertyPath).Warn("failed to read pci device property") + return "" + } + return rlt +} + +// read from /sys/bus/pci/slots/xxx/property +func getPCISlotProperty(slot string, property PCISysFsProperty) string { + propertyPath := filepath.Join(config.SysBusPciSlotsPath, slot, string(property)) + rlt, err := readPCIProperty(propertyPath) + if err != nil { + deviceLogger().WithError(err).WithField("path", propertyPath).Warn("failed to read pci slot property") + return "" + } + return rlt +} + +func readPCIProperty(propertyPath string) (string, error) { + var ( + buf []byte + err error + ) + if buf, err = ioutil.ReadFile(propertyPath); err != nil { + return "", fmt.Errorf("failed to read pci sysfs %v, error:%v", propertyPath, err) + } + return strings.Split(string(buf), "\n")[0], nil +} diff --git a/virtcontainers/device/drivers/vfio.go b/virtcontainers/device/drivers/vfio.go index ade39c18b7..8762ade9ec 100644 --- a/virtcontainers/device/drivers/vfio.go +++ b/virtcontainers/device/drivers/vfio.go @@ -27,6 +27,11 @@ const ( pciDriverBindPath = "/sys/bus/pci/drivers/%s/bind" vfioNewIDPath = "/sys/bus/pci/drivers/vfio-pci/new_id" vfioRemoveIDPath = "/sys/bus/pci/drivers/vfio-pci/remove_id" + pcieRootPortPrefix = "rp" +) + +var ( + AllPCIeDevs = map[string]bool{} ) // VFIODevice is a vfio device meant to be passed to the hypervisor @@ -83,8 +88,14 @@ func (device *VFIODevice) Attach(devReceiver api.DeviceReceiver) (retErr error) Type: vfioDeviceType, BDF: deviceBDF, SysfsDev: deviceSysfsDev, + IsPCIe: isPCIeDevice(deviceBDF), + Class: getPCIDeviceProperty(deviceBDF, PCISysFsDevicesClass), } device.VfioDevs = append(device.VfioDevs, vfio) + if vfio.IsPCIe { + vfio.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, len(AllPCIeDevs)) + AllPCIeDevs[vfio.BDF] = true + } } // hotplug a VFIO device is actually hotplugging a group of iommu devices diff --git a/virtcontainers/hypervisor.go b/virtcontainers/hypervisor.go index 20034d5aa8..c0b657eda8 100644 --- a/virtcontainers/hypervisor.go +++ b/virtcontainers/hypervisor.go @@ -369,6 +369,10 @@ type HypervisorConfig struct { // root bus instead of a bridge. HotplugVFIOOnRootBus bool + // PCIeRootPort is used to indicate the number of PCIe Root Port devices + // The PCIe Root Port device is used to hot-plug the PCIe device + PCIeRootPort uint32 + // BootToBeTemplate used to indicate if the VM is created to be a template VM BootToBeTemplate bool diff --git a/virtcontainers/persist.go b/virtcontainers/persist.go index 47a5c10383..ceff379600 100644 --- a/virtcontainers/persist.go +++ b/virtcontainers/persist.go @@ -248,6 +248,7 @@ func (s *Sandbox) dumpConfig(ss *persistapi.SandboxState) { UseVSock: sconfig.HypervisorConfig.UseVSock, DisableImageNvdimm: sconfig.HypervisorConfig.DisableImageNvdimm, HotplugVFIOOnRootBus: sconfig.HypervisorConfig.HotplugVFIOOnRootBus, + PCIeRootPort: sconfig.HypervisorConfig.PCIeRootPort, BootToBeTemplate: sconfig.HypervisorConfig.BootToBeTemplate, BootFromTemplate: sconfig.HypervisorConfig.BootFromTemplate, DisableVhostNet: sconfig.HypervisorConfig.DisableVhostNet, @@ -534,6 +535,7 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) { UseVSock: hconf.UseVSock, DisableImageNvdimm: hconf.DisableImageNvdimm, HotplugVFIOOnRootBus: hconf.HotplugVFIOOnRootBus, + PCIeRootPort: hconf.PCIeRootPort, BootToBeTemplate: hconf.BootToBeTemplate, BootFromTemplate: hconf.BootFromTemplate, DisableVhostNet: hconf.DisableVhostNet, diff --git a/virtcontainers/persist/api/config.go b/virtcontainers/persist/api/config.go index 41877491ad..7912587057 100644 --- a/virtcontainers/persist/api/config.go +++ b/virtcontainers/persist/api/config.go @@ -153,6 +153,10 @@ type HypervisorConfig struct { // root bus instead of a bridge. HotplugVFIOOnRootBus bool + // PCIeRootPort is used to indicate the number of PCIe Root Port devices + // The PCIe Root Port device is used to hot-plug the PCIe device + PCIeRootPort uint32 + // BootToBeTemplate used to indicate if the VM is created to be a template VM BootToBeTemplate bool diff --git a/virtcontainers/persist/api/hypervisor.go b/virtcontainers/persist/api/hypervisor.go index b64c32e0cb..6fec96ae1a 100644 --- a/virtcontainers/persist/api/hypervisor.go +++ b/virtcontainers/persist/api/hypervisor.go @@ -41,4 +41,5 @@ type HypervisorState struct { HotpluggedMemory int VirtiofsdPid int HotplugVFIOOnRootBus bool + PCIeRootPort int } diff --git a/virtcontainers/pkg/annotations/annotations.go b/virtcontainers/pkg/annotations/annotations.go index 3be3aa1866..14da6eb591 100644 --- a/virtcontainers/pkg/annotations/annotations.go +++ b/virtcontainers/pkg/annotations/annotations.go @@ -97,6 +97,10 @@ const ( // root bus instead of a bridge. HotplugVFIOOnRootBus = kataAnnotHypervisorPrefix + "hotplug_vfio_on_root_bus" + // PCIeRootPort is used to indicate the number of PCIe Root Port devices + // The PCIe Root Port device is used to hot-plug the PCIe device + PCIeRootPort = kataAnnotHypervisorPrefix + "pcie_root_port" + // EntropySource is a sandbox annotation to specify the path to a host source of // entropy (/dev/random, /dev/urandom or real hardware RNG device) EntropySource = kataAnnotHypervisorPrefix + "entropy_source" diff --git a/virtcontainers/pkg/oci/utils.go b/virtcontainers/pkg/oci/utils.go index 416b68ae83..9fa517ee4e 100644 --- a/virtcontainers/pkg/oci/utils.go +++ b/virtcontainers/pkg/oci/utils.go @@ -447,6 +447,14 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig) config.HypervisorConfig.HotplugVFIOOnRootBus = hotplugVFIOOnRootBus } + if value, ok := ocispec.Annotations[vcAnnotations.PCIeRootPort]; ok { + pcieRootPort, err := strconv.ParseUint(value, 10, 32) + if err != nil { + return fmt.Errorf("Error parsing annotation for pcie_root_port: %v, Please specify an integer greater than or equal to 0", err) + } + config.HypervisorConfig.PCIeRootPort = uint32(pcieRootPort) + } + if value, ok := ocispec.Annotations[vcAnnotations.EntropySource]; ok { if value != "" { config.HypervisorConfig.EntropySource = value diff --git a/virtcontainers/pkg/oci/utils_test.go b/virtcontainers/pkg/oci/utils_test.go index 792ed37cc4..d8815efa4c 100644 --- a/virtcontainers/pkg/oci/utils_test.go +++ b/virtcontainers/pkg/oci/utils_test.go @@ -763,6 +763,7 @@ func TestAddHypervisorAnnotations(t *testing.T) { ocispec.Annotations[vcAnnotations.UseVSock] = "true" ocispec.Annotations[vcAnnotations.DisableImageNvdimm] = "true" ocispec.Annotations[vcAnnotations.HotplugVFIOOnRootBus] = "true" + ocispec.Annotations[vcAnnotations.PCIeRootPort] = "2" ocispec.Annotations[vcAnnotations.EntropySource] = "/dev/urandom" addAnnotations(ocispec, &config) @@ -793,6 +794,7 @@ func TestAddHypervisorAnnotations(t *testing.T) { assert.Equal(config.HypervisorConfig.UseVSock, true) assert.Equal(config.HypervisorConfig.DisableImageNvdimm, true) assert.Equal(config.HypervisorConfig.HotplugVFIOOnRootBus, true) + assert.Equal(config.HypervisorConfig.PCIeRootPort, uint32(2)) assert.Equal(config.HypervisorConfig.EntropySource, "/dev/urandom") // In case an absurd large value is provided, the config value if not over-ridden diff --git a/virtcontainers/qemu.go b/virtcontainers/qemu.go index 66b7b71783..197ab203bb 100644 --- a/virtcontainers/qemu.go +++ b/virtcontainers/qemu.go @@ -71,6 +71,7 @@ type QemuState struct { UUID string HotplugVFIOOnRootBus bool VirtiofsdPid int + PCIeRootPort int } // qemu is an Hypervisor interface implementation for the Linux qemu hypervisor. @@ -266,6 +267,7 @@ func (q *qemu) setup(id string, hypervisorConfig *HypervisorConfig) error { q.state.UUID = uuid.Generate().String() q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus + q.state.PCIeRootPort = int(q.config.PCIeRootPort) // The path might already exist, but in case of VM templating, // we have to create it since the sandbox has not created it yet. @@ -584,6 +586,13 @@ func (q *qemu) createSandbox(ctx context.Context, id string, networkNS NetworkNa return err } + // Add PCIe Root Port devices to hypervisor + // The pcie.0 bus do not support hot-plug, but PCIe device can be hot-plugged into PCIe Root Port. + // For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt + if hypervisorConfig.PCIeRootPort > 0 { + qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, hypervisorConfig.PCIeRootPort) + } + q.qemuConfig = qemuConfig return nil @@ -1149,17 +1158,39 @@ func (q *qemu) hotplugVFIODevice(device *config.VFIODev, op operation) (err erro } devID := device.ID + machinneType := q.hypervisorConfig().HypervisorMachineType if op == addDevice { + + buf, _ := json.Marshal(device) + q.Logger().WithFields(logrus.Fields{ + "machine-type": machinneType, + "hotplug-vfio-on-root-bus": q.state.HotplugVFIOOnRootBus, + "pcie-root-port": q.state.PCIeRootPort, + "device-info": string(buf), + }).Info("Start hot-plug VFIO device") + // In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus // for pc machine type instead of bridge. This is useful for devices that require // a large PCI BAR which is a currently a limitation with PCI bridges. if q.state.HotplugVFIOOnRootBus { + + // In case MachineType is q35, a PCIe device is hotplugged on a PCIe Root Port. + switch machinneType { + case QemuQ35: + if device.IsPCIe && q.state.PCIeRootPort <= 0 { + q.Logger().WithField("dev-id", device.ID).Warn("VFIO device is a PCIe device. It's recommended to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for q35") + device.Bus = "" + } + default: + device.Bus = "" + } + switch device.Type { case config.VFIODeviceNormalType: - return q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, device.BDF, "", romFile) + return q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, device.BDF, device.Bus, romFile) case config.VFIODeviceMediatedType: - return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, device.SysfsDev, "", "", romFile) + return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, device.SysfsDev, "", device.Bus, romFile) default: return fmt.Errorf("Incorrect VFIO device type found") } @@ -1185,6 +1216,8 @@ func (q *qemu) hotplugVFIODevice(device *config.VFIODev, op operation) (err erro return fmt.Errorf("Incorrect VFIO device type found") } } else { + q.Logger().WithField("dev-id", devID).Info("Start hot-unplug VFIO device") + if !q.state.HotplugVFIOOnRootBus { if err := q.arch.removeDeviceFromBridge(devID); err != nil { return err @@ -1848,6 +1881,39 @@ func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8, memoryOff return memory } +// genericAppendPCIeRootPort appends to devices the given pcie-root-port +func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machineType string) []govmmQemu.Device { + var ( + bus string + chassis string + multiFunction bool + addr string + ) + switch machineType { + case QemuQ35: + bus = defaultBridgeBus + chassis = "0" + multiFunction = false + addr = "0" + default: + return devices + } + + for i := uint32(0); i < number; i++ { + devices = append(devices, + govmmQemu.PCIeRootPortDevice{ + ID: fmt.Sprintf("%s%d", pcieRootPortPrefix, i), + Bus: bus, + Chassis: chassis, + Slot: strconv.FormatUint(uint64(i), 10), + Multifunction: multiFunction, + Addr: addr, + }, + ) + } + return devices +} + func (q *qemu) getThreadIDs() (vcpuThreadIDs, error) { span, _ := q.trace("getThreadIDs") defer span.Finish() @@ -2013,6 +2079,7 @@ func (q *qemu) save() (s persistapi.HypervisorState) { s.UUID = q.state.UUID s.HotpluggedMemory = q.state.HotpluggedMemory s.HotplugVFIOOnRootBus = q.state.HotplugVFIOOnRootBus + s.PCIeRootPort = q.state.PCIeRootPort for _, bridge := range q.arch.getBridges() { s.Bridges = append(s.Bridges, persistapi.Bridge{ @@ -2036,6 +2103,7 @@ func (q *qemu) load(s persistapi.HypervisorState) { q.state.HotpluggedMemory = s.HotpluggedMemory q.state.HotplugVFIOOnRootBus = s.HotplugVFIOOnRootBus q.state.VirtiofsdPid = s.VirtiofsdPid + q.state.PCIeRootPort = s.PCIeRootPort for _, bridge := range s.Bridges { q.state.Bridges = append(q.state.Bridges, types.NewBridge(types.Type(bridge.Type), bridge.ID, bridge.DeviceAddr, bridge.Addr)) diff --git a/virtcontainers/qemu_amd64.go b/virtcontainers/qemu_amd64.go index 5068052c8a..69ceb4f519 100644 --- a/virtcontainers/qemu_amd64.go +++ b/virtcontainers/qemu_amd64.go @@ -20,13 +20,15 @@ type qemuAmd64 struct { vmFactory bool } -const defaultQemuPath = "/usr/bin/qemu-system-x86_64" +const ( + defaultQemuPath = "/usr/bin/qemu-system-x86_64" -const defaultQemuMachineType = QemuPC + defaultQemuMachineType = QemuPC -const defaultQemuMachineOptions = "accel=kvm,kernel_irqchip" + defaultQemuMachineOptions = "accel=kvm,kernel_irqchip" -const qmpMigrationWaitTimeout = 5 * time.Second + qmpMigrationWaitTimeout = 5 * time.Second +) var qemuPaths = map[string]string{ QemuPCLite: "/usr/bin/qemu-lite-system-x86_64", diff --git a/virtcontainers/qemu_arch_base.go b/virtcontainers/qemu_arch_base.go index 6ef639ac6a..9eff2c7405 100644 --- a/virtcontainers/qemu_arch_base.go +++ b/virtcontainers/qemu_arch_base.go @@ -127,6 +127,9 @@ type qemuArch interface { // setIgnoreSharedMemoryMigrationCaps set bypass-shared-memory capability for migration setIgnoreSharedMemoryMigrationCaps(context.Context, *govmmQemu.QMP) error + + // appendPCIeRootPortDevice appends a pcie-root-port device to pcie.0 bus + appendPCIeRootPortDevice(devices []govmmQemu.Device, number uint32) []govmmQemu.Device } type qemuArchBase struct { @@ -153,6 +156,7 @@ const ( defaultPCBridgeBus = "pci.0" maxDevIDSize = 31 defaultMsize9p = 8192 + pcieRootPortPrefix = "rp" ) // This is the PCI start address assigned to the first bridge that @@ -646,6 +650,7 @@ func (q *qemuArchBase) appendVFIODevice(devices []govmmQemu.Device, vfioDev conf BDF: vfioDev.BDF, VendorID: vfioDev.VendorID, DeviceID: vfioDev.DeviceID, + Bus: vfioDev.Bus, }, ) @@ -750,3 +755,8 @@ func (q *qemuArchBase) setBridges(bridges []types.Bridge) { func (q *qemuArchBase) addBridge(b types.Bridge) { q.Bridges = append(q.Bridges, b) } + +// appendPCIeRootPortDevice appends to devices the given pcie-root-port +func (q *qemuArchBase) appendPCIeRootPortDevice(devices []govmmQemu.Device, number uint32) []govmmQemu.Device { + return genericAppendPCIeRootPort(devices, number, q.machineType) +}