Merge pull request #6699 from zvonkok/cold-plug-vfio

gpu: cold plug VFIO devices
This commit is contained in:
Peng Tao 2023-05-05 10:04:29 +08:00 committed by GitHub
commit 65670e6b0a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 365 additions and 191 deletions

View File

@ -17,6 +17,7 @@ import (
"github.com/prometheus/procfs"
"github.com/urfave/cli"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils"
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
"github.com/kata-containers/kata-containers/src/runtime/pkg/utils"
@ -113,6 +114,7 @@ type HypervisorInfo struct {
Msize9p uint32
MemorySlots uint32
PCIeRootPort uint32
ColdPlugVFIO hv.PCIePort
HotplugVFIOOnRootBus bool
Debug bool
}
@ -305,17 +307,17 @@ func getHypervisorInfo(config oci.RuntimeConfig) (HypervisorInfo, error) {
}
return HypervisorInfo{
Debug: config.HypervisorConfig.Debug,
MachineType: config.HypervisorConfig.HypervisorMachineType,
Version: version,
Path: hypervisorPath,
BlockDeviceDriver: config.HypervisorConfig.BlockDeviceDriver,
Msize9p: config.HypervisorConfig.Msize9p,
MemorySlots: config.HypervisorConfig.MemSlots,
EntropySource: config.HypervisorConfig.EntropySource,
SharedFS: config.HypervisorConfig.SharedFS,
VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon,
Debug: config.HypervisorConfig.Debug,
MachineType: config.HypervisorConfig.HypervisorMachineType,
Version: version,
Path: hypervisorPath,
BlockDeviceDriver: config.HypervisorConfig.BlockDeviceDriver,
Msize9p: config.HypervisorConfig.Msize9p,
MemorySlots: config.HypervisorConfig.MemSlots,
EntropySource: config.HypervisorConfig.EntropySource,
SharedFS: config.HypervisorConfig.SharedFS,
VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon,
ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO,
HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus,
PCIeRootPort: config.HypervisorConfig.PCIeRootPort,
SocketPath: socketPath,

View File

@ -19,6 +19,7 @@ import (
"testing"
"github.com/BurntSushi/toml"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
vcUtils "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
specs "github.com/opencontainers/runtime-spec/specs-go"
@ -74,6 +75,7 @@ func createConfig(configPath string, fileData string) error {
}
func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeConfig, err error) {
var coldPlugVFIO hv.PCIePort
const logPath = "/log/path"
hypervisorPath := filepath.Join(prefixDir, "hypervisor")
kernelPath := filepath.Join(prefixDir, "kernel")
@ -86,6 +88,7 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC
enableIOThreads := true
hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
coldPlugVFIO = hv.NoPort
disableNewNetNs := false
sharedFS := "virtio-9p"
virtioFSdaemon := filepath.Join(prefixDir, "virtiofsd")
@ -129,6 +132,7 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC
BlockDeviceDriver: blockStorageDriver,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
ColdPlugVFIO: coldPlugVFIO,
PCIeRootPort: pcieRootPort,
DisableNewNetNs: disableNewNetNs,
DefaultVCPUCount: hypConfig.NumVCPUs,
@ -191,12 +195,13 @@ func genericGetExpectedHostDetails(tmpdir string, expectedVendor string, expecte
expectedSupportVSocks, _ := vcUtils.SupportsVsocks()
expectedHostDetails := HostInfo{
Kernel: expectedKernelVersion,
Architecture: expectedArch,
Distro: expectedDistro,
CPU: expectedCPU,
VMContainerCapable: expectedVMContainerCapable,
SupportVSocks: expectedSupportVSocks,
AvailableGuestProtections: vc.AvailableGuestProtections(),
Kernel: expectedKernelVersion,
Architecture: expectedArch,
Distro: expectedDistro,
CPU: expectedCPU,
VMContainerCapable: expectedVMContainerCapable,
SupportVSocks: expectedSupportVSocks,
}
testProcCPUInfo := filepath.Join(tmpdir, "cpuinfo")
@ -273,6 +278,7 @@ func getExpectedHypervisor(config oci.RuntimeConfig) HypervisorInfo {
HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus,
PCIeRootPort: config.HypervisorConfig.PCIeRootPort,
ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO,
}
if os.Geteuid() == 0 {

View File

@ -352,6 +352,11 @@ pflashes = []
# Default false
#hotplug_vfio_on_root_bus = true
# In a confidential compute environment hot-plugging can compromise
# security. Enable cold-plugging of VFIO devices to a root-port.
# The default setting is "no-port", which means disabled.
#cold_plug_vfio = "root-port"
# Before hot plugging a PCIe device, you need to add a pcie_root_port device.
# Use this parameter when using some large PCI bar devices, such as Nvidia GPU
# The value means the number of pcie_root_port

View File

@ -20,6 +20,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/stretchr/testify/assert"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
ktu "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
vcAnnotations "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations"
@ -308,6 +309,7 @@ func TestCreateContainerConfigFail(t *testing.T) {
}
func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err error) {
var coldPlugVFIO hv.PCIePort
if dir == "" {
return "", fmt.Errorf("BUG: need directory")
}
@ -332,6 +334,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err
disableNewNetNs := false
sharedFS := "virtio-9p"
virtioFSdaemon := path.Join(dir, "virtiofsd")
coldPlugVFIO = hv.RootPort
configFileOptions := ktu.RuntimeConfigOptions{
Hypervisor: "qemu",
@ -350,6 +353,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err
DisableNewNetNs: disableNewNetNs,
SharedFS: sharedFS,
VirtioFSDaemon: virtioFSdaemon,
ColdPlugVFIO: coldPlugVFIO,
}
runtimeConfigFileData := ktu.MakeRuntimeConfigFileData(configFileOptions)

View File

@ -10,10 +10,12 @@ import (
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/api"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
"github.com/sirupsen/logrus"
)
@ -133,3 +135,99 @@ func GetAPVFIODevices(sysfsdev string) ([]string, error) {
// Split by newlines, omitting final newline
return strings.Split(string(data[:len(data)-1]), "\n"), nil
}
// Ignore specific PCI devices, supply the pciClass and the bitmask to check
// against the device class, deviceBDF for meaningfull info message
func checkIgnorePCIClass(pciClass string, deviceBDF string, bitmask uint64) (bool, error) {
if pciClass == "" {
return false, nil
}
pciClassID, err := strconv.ParseUint(pciClass, 0, 32)
if err != nil {
return false, err
}
// ClassID is 16 bits, remove the two trailing zeros
pciClassID = pciClassID >> 8
if pciClassID&bitmask == bitmask {
deviceLogger().Infof("Ignoring PCI (Host) Bridge deviceBDF %v Class %x", deviceBDF, pciClassID)
return true, nil
}
return false, nil
}
// GetAllVFIODevicesFromIOMMUGroup returns all the VFIO devices in the IOMMU group
// We can reuse this function at various levels, sandbox, container.
// Only the VFIO module is allowed to do bus assignments, all other modules need to
// ignore it if used as helper function to get VFIO information.
func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignment bool) ([]*config.VFIODev, error) {
vfioDevs := []*config.VFIODev{}
vfioGroup := filepath.Base(device.HostPath)
iommuDevicesPath := filepath.Join(config.SysIOMMUPath, vfioGroup, "devices")
deviceFiles, err := os.ReadDir(iommuDevicesPath)
if err != nil {
return nil, err
}
// Pass all devices in iommu group
for i, deviceFile := range deviceFiles {
//Get bdf of device eg 0000:00:1c.0
deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(deviceFile.Name(), iommuDevicesPath)
if err != nil {
return nil, err
}
id := utils.MakeNameID("vfio", device.ID+strconv.Itoa(i), maxDevIDSize)
pciClass := getPCIDeviceProperty(deviceBDF, PCISysFsDevicesClass)
// We need to ignore Host or PCI Bridges that are in the same IOMMU group as the
// passed-through devices. One CANNOT pass-through a PCI bridge or Host bridge.
// Class 0x0604 is PCI bridge, 0x0600 is Host bridge
ignorePCIDevice, err := checkIgnorePCIClass(pciClass, deviceBDF, 0x0600)
if err != nil {
return nil, err
}
if ignorePCIDevice {
continue
}
var vfio config.VFIODev
switch vfioDeviceType {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
isPCIe := isPCIeDevice(deviceBDF)
// Do not directly assign to `vfio` -- need to access field still
vfioPCI := config.VFIOPCIDev{
ID: id,
Type: vfioDeviceType,
BDF: deviceBDF,
SysfsDev: deviceSysfsDev,
IsPCIe: isPCIe,
Class: pciClass,
}
if isPCIe && !ignoreBusAssignment {
vfioPCI.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, len(AllPCIeDevs))
AllPCIeDevs[deviceBDF] = true
}
vfio = vfioPCI
case config.VFIOAPDeviceMediatedType:
devices, err := GetAPVFIODevices(deviceSysfsDev)
if err != nil {
return nil, err
}
vfio = config.VFIOAPDev{
ID: id,
SysfsDev: deviceSysfsDev,
Type: config.VFIOAPDeviceMediatedType,
APDevices: devices,
}
default:
return nil, fmt.Errorf("Failed to append device: VFIO device type unrecognized")
}
vfioDevs = append(vfioDevs, &vfio)
}
return vfioDevs, nil
}

View File

@ -11,7 +11,6 @@ import (
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/sirupsen/logrus"
@ -54,25 +53,6 @@ func NewVFIODevice(devInfo *config.DeviceInfo) *VFIODevice {
}
}
// Ignore specific PCI devices, supply the pciClass and the bitmask to check
// against the device class, deviceBDF for meaningfull info message
func (device *VFIODevice) checkIgnorePCIClass(pciClass string, deviceBDF string, bitmask uint64) (bool, error) {
if pciClass == "" {
return false, nil
}
pciClassID, err := strconv.ParseUint(pciClass, 0, 32)
if err != nil {
return false, err
}
// ClassID is 16 bits, remove the two trailing zeros
pciClassID = pciClassID >> 8
if pciClassID&bitmask == bitmask {
deviceLogger().Infof("Ignoring PCI (Host) Bridge deviceBDF %v Class %x", deviceBDF, pciClassID)
return true, nil
}
return false, nil
}
// Attach is standard interface of api.Device, it's used to add device to some
// DeviceReceiver
func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceReceiver) (retErr error) {
@ -90,72 +70,11 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece
}
}()
vfioGroup := filepath.Base(device.DeviceInfo.HostPath)
iommuDevicesPath := filepath.Join(config.SysIOMMUPath, vfioGroup, "devices")
deviceFiles, err := os.ReadDir(iommuDevicesPath)
device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo, false)
if err != nil {
return err
}
// Pass all devices in iommu group
for i, deviceFile := range deviceFiles {
//Get bdf of device eg 0000:00:1c.0
deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(deviceFile.Name(), iommuDevicesPath)
if err != nil {
return err
}
id := utils.MakeNameID("vfio", device.DeviceInfo.ID+strconv.Itoa(i), maxDevIDSize)
pciClass := getPCIDeviceProperty(deviceBDF, PCISysFsDevicesClass)
// We need to ignore Host or PCI Bridges that are in the same IOMMU group as the
// passed-through devices. One CANNOT pass-through a PCI bridge or Host bridge.
// Class 0x0604 is PCI bridge, 0x0600 is Host bridge
ignorePCIDevice, err := device.checkIgnorePCIClass(pciClass, deviceBDF, 0x0600)
if err != nil {
return err
}
if ignorePCIDevice {
continue
}
var vfio config.VFIODev
switch vfioDeviceType {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
isPCIe := isPCIeDevice(deviceBDF)
// Do not directly assign to `vfio` -- need to access field still
vfioPCI := config.VFIOPCIDev{
ID: id,
Type: vfioDeviceType,
BDF: deviceBDF,
SysfsDev: deviceSysfsDev,
IsPCIe: isPCIe,
Class: pciClass,
}
if isPCIe {
vfioPCI.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, len(AllPCIeDevs))
AllPCIeDevs[deviceBDF] = true
}
vfio = vfioPCI
case config.VFIOAPDeviceMediatedType:
devices, err := GetAPVFIODevices(deviceSysfsDev)
if err != nil {
return err
}
vfio = config.VFIOAPDev{
ID: id,
SysfsDev: deviceSysfsDev,
Type: config.VFIOAPDeviceMediatedType,
APDevices: devices,
}
default:
return fmt.Errorf("Failed to append device: VFIO device type unrecognized")
}
device.VfioDevs = append(device.VfioDevs, &vfio)
}
coldPlug := device.DeviceInfo.ColdPlug
deviceLogger().WithField("cold-plug", coldPlug).Info("Attaching VFIO device")

View File

@ -116,7 +116,7 @@ func (dm *deviceManager) createDevice(devInfo config.DeviceInfo) (dev api.Device
if devInfo.ID, err = dm.newDeviceID(); err != nil {
return nil, err
}
if isVFIO(devInfo.HostPath) {
if IsVFIO(devInfo.HostPath) {
return drivers.NewVFIODevice(&devInfo), nil
} else if isVhostUserBlk(devInfo) {
if devInfo.DriverOptions == nil {

View File

@ -17,8 +17,8 @@ const (
vfioPath = "/dev/vfio/"
)
// isVFIO checks if the device provided is a vfio group.
func isVFIO(hostPath string) bool {
// IsVFIO checks if the device provided is a vfio group.
func IsVFIO(hostPath string) bool {
// Ignore /dev/vfio/vfio character device
if strings.HasPrefix(hostPath, filepath.Join(vfioPath, "vfio")) {
return false

View File

@ -31,7 +31,7 @@ func TestIsVFIO(t *testing.T) {
}
for _, d := range data {
isVFIO := isVFIO(d.path)
isVFIO := IsVFIO(d.path)
assert.Equal(t, d.expected, isVFIO)
}
}

View File

@ -5,6 +5,8 @@
package hypervisors
import "fmt"
// Bridge is a bridge where devices can be hot plugged
type Bridge struct {
// DeviceAddr contains information about devices plugged and its address in the bridge
@ -26,6 +28,34 @@ type CPUDevice struct {
ID string
}
// PCIePort distinguish only between root and switch port
type PCIePort string
const (
// RootPort attach VFIO devices to a root-port
RootPort PCIePort = "root-port"
// SwitchPort attach VFIO devices to a switch-port
SwitchPort = "switch-port"
// BridgePort is the default
BridgePort = "bridge-port"
// NoPort is for disabling VFIO hotplug/coldplug
NoPort = "no-port"
)
func (p PCIePort) String() string {
switch p {
case RootPort:
return "root-port"
case SwitchPort:
return "switch-port"
case BridgePort:
return "bridge-port"
case NoPort:
return "no-port"
}
return fmt.Sprintf("<unknown PCIePort: %s>", string(p))
}
type HypervisorState struct {
BlockIndexMap map[int]struct{}
@ -41,10 +71,10 @@ type HypervisorState struct {
// HotpluggedCPUs is the list of CPUs that were hot-added
HotpluggedVCPUs []CPUDevice
HotpluggedMemory int
VirtiofsDaemonPid int
Pid int
PCIeRootPort int
HotpluggedMemory int
VirtiofsDaemonPid int
Pid int
PCIeRootPort int
ColdPlugVFIO PCIePort
HotplugVFIOOnRootBus bool
}

View File

@ -14,6 +14,7 @@ import (
"strconv"
"testing"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/stretchr/testify/assert"
)
@ -224,6 +225,7 @@ type RuntimeConfigOptions struct {
JaegerPassword string
PFlash []string
PCIeRootPort uint32
ColdPlugVFIO hv.PCIePort
DefaultVCPUCount uint32
DefaultMaxVCPUCount uint32
DefaultMemSize uint32
@ -317,6 +319,7 @@ func MakeRuntimeConfigFileData(config RuntimeConfigOptions) string {
enable_iothreads = ` + strconv.FormatBool(config.EnableIOThreads) + `
hotplug_vfio_on_root_bus = ` + strconv.FormatBool(config.HotplugVFIOOnRootBus) + `
pcie_root_port = ` + strconv.FormatUint(uint64(config.PCIeRootPort), 10) + `
cold_plug_vfio = "` + config.ColdPlugVFIO.String() + `"
msize_9p = ` + strconv.FormatUint(uint64(config.DefaultMsize9p), 10) + `
enable_debug = ` + strconv.FormatBool(config.HypervisorDebug) + `
guest_hook_path = "` + config.DefaultGuestHookPath + `"

View File

@ -9,6 +9,10 @@
package katautils
import (
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
)
// name is the name of the runtime
var NAME = "@RUNTIME_NAME@"
@ -103,3 +107,5 @@ const defaultVMCacheEndpoint string = "/var/run/kata-containers/cache.sock"
// Default config file used by stateless systems.
var defaultRuntimeConfiguration = "@CONFIG_PATH@"
const defaultColdPlugVFIO = hv.NoPort

View File

@ -20,6 +20,7 @@ import (
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/govmm"
govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
@ -77,87 +78,88 @@ type factory struct {
}
type hypervisor struct {
Path string `toml:"path"`
JailerPath string `toml:"jailer_path"`
Kernel string `toml:"kernel"`
CtlPath string `toml:"ctlpath"`
Initrd string `toml:"initrd"`
Image string `toml:"image"`
RootfsType string `toml:"rootfs_type"`
Firmware string `toml:"firmware"`
FirmwareVolume string `toml:"firmware_volume"`
MachineAccelerators string `toml:"machine_accelerators"`
CPUFeatures string `toml:"cpu_features"`
KernelParams string `toml:"kernel_params"`
MachineType string `toml:"machine_type"`
BlockDeviceDriver string `toml:"block_device_driver"`
EntropySource string `toml:"entropy_source"`
SharedFS string `toml:"shared_fs"`
VirtioFSDaemon string `toml:"virtio_fs_daemon"`
VirtioFSCache string `toml:"virtio_fs_cache"`
VhostUserStorePath string `toml:"vhost_user_store_path"`
FileBackedMemRootDir string `toml:"file_mem_backend"`
GuestHookPath string `toml:"guest_hook_path"`
GuestMemoryDumpPath string `toml:"guest_memory_dump_path"`
SeccompSandbox string `toml:"seccompsandbox"`
BlockDeviceAIO string `toml:"block_device_aio"`
HypervisorPathList []string `toml:"valid_hypervisor_paths"`
JailerPathList []string `toml:"valid_jailer_paths"`
CtlPathList []string `toml:"valid_ctlpaths"`
VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"`
VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"`
PFlashList []string `toml:"pflashes"`
VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"`
FileBackedMemRootList []string `toml:"valid_file_mem_backends"`
EntropySourceList []string `toml:"valid_entropy_sources"`
EnableAnnotations []string `toml:"enable_annotations"`
RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"`
TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"`
MemOffset uint64 `toml:"memory_offset"`
DefaultMaxMemorySize uint64 `toml:"default_maxmemory"`
DiskRateLimiterBwMaxRate int64 `toml:"disk_rate_limiter_bw_max_rate"`
DiskRateLimiterBwOneTimeBurst int64 `toml:"disk_rate_limiter_bw_one_time_burst"`
DiskRateLimiterOpsMaxRate int64 `toml:"disk_rate_limiter_ops_max_rate"`
DiskRateLimiterOpsOneTimeBurst int64 `toml:"disk_rate_limiter_ops_one_time_burst"`
NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"`
NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"`
NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"`
NetRateLimiterOpsOneTimeBurst int64 `toml:"net_rate_limiter_ops_one_time_burst"`
VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"`
VirtioFSQueueSize uint32 `toml:"virtio_fs_queue_size"`
DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"`
MemorySize uint32 `toml:"default_memory"`
MemSlots uint32 `toml:"memory_slots"`
DefaultBridges uint32 `toml:"default_bridges"`
Msize9p uint32 `toml:"msize_9p"`
PCIeRootPort uint32 `toml:"pcie_root_port"`
NumVCPUs int32 `toml:"default_vcpus"`
BlockDeviceCacheSet bool `toml:"block_device_cache_set"`
BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"`
BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"`
EnableVhostUserStore bool `toml:"enable_vhost_user_store"`
VhostUserDeviceReconnect uint32 `toml:"vhost_user_reconnect_timeout_sec"`
DisableBlockDeviceUse bool `toml:"disable_block_device_use"`
MemPrealloc bool `toml:"enable_mem_prealloc"`
HugePages bool `toml:"enable_hugepages"`
VirtioMem bool `toml:"enable_virtio_mem"`
IOMMU bool `toml:"enable_iommu"`
IOMMUPlatform bool `toml:"enable_iommu_platform"`
Debug bool `toml:"enable_debug"`
DisableNestingChecks bool `toml:"disable_nesting_checks"`
EnableIOThreads bool `toml:"enable_iothreads"`
DisableImageNvdimm bool `toml:"disable_image_nvdimm"`
HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"`
DisableVhostNet bool `toml:"disable_vhost_net"`
GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"`
ConfidentialGuest bool `toml:"confidential_guest"`
SevSnpGuest bool `toml:"sev_snp_guest"`
GuestSwap bool `toml:"enable_guest_swap"`
Rootless bool `toml:"rootless"`
DisableSeccomp bool `toml:"disable_seccomp"`
DisableSeLinux bool `toml:"disable_selinux"`
DisableGuestSeLinux bool `toml:"disable_guest_selinux"`
LegacySerial bool `toml:"use_legacy_serial"`
Path string `toml:"path"`
JailerPath string `toml:"jailer_path"`
Kernel string `toml:"kernel"`
CtlPath string `toml:"ctlpath"`
Initrd string `toml:"initrd"`
Image string `toml:"image"`
RootfsType string `toml:"rootfs_type"`
Firmware string `toml:"firmware"`
FirmwareVolume string `toml:"firmware_volume"`
MachineAccelerators string `toml:"machine_accelerators"`
CPUFeatures string `toml:"cpu_features"`
KernelParams string `toml:"kernel_params"`
MachineType string `toml:"machine_type"`
BlockDeviceDriver string `toml:"block_device_driver"`
EntropySource string `toml:"entropy_source"`
SharedFS string `toml:"shared_fs"`
VirtioFSDaemon string `toml:"virtio_fs_daemon"`
VirtioFSCache string `toml:"virtio_fs_cache"`
VhostUserStorePath string `toml:"vhost_user_store_path"`
FileBackedMemRootDir string `toml:"file_mem_backend"`
GuestHookPath string `toml:"guest_hook_path"`
GuestMemoryDumpPath string `toml:"guest_memory_dump_path"`
SeccompSandbox string `toml:"seccompsandbox"`
BlockDeviceAIO string `toml:"block_device_aio"`
HypervisorPathList []string `toml:"valid_hypervisor_paths"`
JailerPathList []string `toml:"valid_jailer_paths"`
CtlPathList []string `toml:"valid_ctlpaths"`
VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"`
VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"`
PFlashList []string `toml:"pflashes"`
VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"`
FileBackedMemRootList []string `toml:"valid_file_mem_backends"`
EntropySourceList []string `toml:"valid_entropy_sources"`
EnableAnnotations []string `toml:"enable_annotations"`
RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"`
TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"`
MemOffset uint64 `toml:"memory_offset"`
DefaultMaxMemorySize uint64 `toml:"default_maxmemory"`
DiskRateLimiterBwMaxRate int64 `toml:"disk_rate_limiter_bw_max_rate"`
DiskRateLimiterBwOneTimeBurst int64 `toml:"disk_rate_limiter_bw_one_time_burst"`
DiskRateLimiterOpsMaxRate int64 `toml:"disk_rate_limiter_ops_max_rate"`
DiskRateLimiterOpsOneTimeBurst int64 `toml:"disk_rate_limiter_ops_one_time_burst"`
NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"`
NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"`
NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"`
NetRateLimiterOpsOneTimeBurst int64 `toml:"net_rate_limiter_ops_one_time_burst"`
VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"`
VirtioFSQueueSize uint32 `toml:"virtio_fs_queue_size"`
DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"`
MemorySize uint32 `toml:"default_memory"`
MemSlots uint32 `toml:"memory_slots"`
DefaultBridges uint32 `toml:"default_bridges"`
Msize9p uint32 `toml:"msize_9p"`
PCIeRootPort uint32 `toml:"pcie_root_port"`
NumVCPUs int32 `toml:"default_vcpus"`
BlockDeviceCacheSet bool `toml:"block_device_cache_set"`
BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"`
BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"`
EnableVhostUserStore bool `toml:"enable_vhost_user_store"`
VhostUserDeviceReconnect uint32 `toml:"vhost_user_reconnect_timeout_sec"`
DisableBlockDeviceUse bool `toml:"disable_block_device_use"`
MemPrealloc bool `toml:"enable_mem_prealloc"`
HugePages bool `toml:"enable_hugepages"`
VirtioMem bool `toml:"enable_virtio_mem"`
IOMMU bool `toml:"enable_iommu"`
IOMMUPlatform bool `toml:"enable_iommu_platform"`
Debug bool `toml:"enable_debug"`
DisableNestingChecks bool `toml:"disable_nesting_checks"`
EnableIOThreads bool `toml:"enable_iothreads"`
DisableImageNvdimm bool `toml:"disable_image_nvdimm"`
HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"`
ColdPlugVFIO hv.PCIePort `toml:"cold_plug_vfio"`
DisableVhostNet bool `toml:"disable_vhost_net"`
GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"`
ConfidentialGuest bool `toml:"confidential_guest"`
SevSnpGuest bool `toml:"sev_snp_guest"`
GuestSwap bool `toml:"enable_guest_swap"`
Rootless bool `toml:"rootless"`
DisableSeccomp bool `toml:"disable_seccomp"`
DisableSeLinux bool `toml:"disable_selinux"`
DisableGuestSeLinux bool `toml:"disable_guest_selinux"`
LegacySerial bool `toml:"use_legacy_serial"`
}
type runtime struct {
@ -285,6 +287,13 @@ func (h hypervisor) firmware() (string, error) {
return ResolvePath(p)
}
func (h hypervisor) coldPlugVFIO() hv.PCIePort {
if h.ColdPlugVFIO == "" {
return defaultColdPlugVFIO
}
return h.ColdPlugVFIO
}
func (h hypervisor) firmwareVolume() (string, error) {
p := h.FirmwareVolume
@ -854,6 +863,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
Msize9p: h.msize9p(),
DisableImageNvdimm: h.DisableImageNvdimm,
HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus,
ColdPlugVFIO: h.coldPlugVFIO(),
PCIeRootPort: h.PCIeRootPort,
DisableVhostNet: h.DisableVhostNet,
EnableVhostUserStore: h.EnableVhostUserStore,
@ -1048,6 +1058,7 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
EnableIOThreads: h.EnableIOThreads,
Msize9p: h.msize9p(),
HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus,
ColdPlugVFIO: h.coldPlugVFIO(),
PCIeRootPort: h.PCIeRootPort,
DisableVhostNet: true,
GuestHookPath: h.guestHookPath(),
@ -1278,6 +1289,7 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
EnableIOThreads: defaultEnableIOThreads,
Msize9p: defaultMsize9p,
HotplugVFIOOnRootBus: defaultHotplugVFIOOnRootBus,
ColdPlugVFIO: defaultColdPlugVFIO,
PCIeRootPort: defaultPCIeRootPort,
GuestHookPath: defaultGuestHookPath,
VhostUserStorePath: defaultVhostUserStorePath,
@ -1650,9 +1662,32 @@ func checkConfig(config oci.RuntimeConfig) error {
return err
}
coldPlugVFIO := config.HypervisorConfig.ColdPlugVFIO
machineType := config.HypervisorConfig.HypervisorMachineType
if err := checkPCIeConfig(coldPlugVFIO, machineType); err != nil {
return err
}
return nil
}
// checkPCIeConfig ensures the PCIe configuration is valid.
// Only allow one of the following settings for cold-plug:
// no-port, root-port, switch-port
func checkPCIeConfig(vfioPort hv.PCIePort, machineType string) error {
// Currently only QEMU q35 supports advanced PCIe topologies
// firecracker, dragonball do not have right now any PCIe support
if machineType != "q35" {
return nil
}
if vfioPort == hv.NoPort || vfioPort == hv.RootPort || vfioPort == hv.SwitchPort {
return nil
}
return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s",
vfioPort, hv.NoPort, hv.RootPort, hv.SwitchPort)
}
// checkNetNsConfig performs sanity checks on disable_new_netns config.
// Because it is an expert option and conflicts with some other common configs.
func checkNetNsConfig(config oci.RuntimeConfig) error {

View File

@ -19,6 +19,7 @@ import (
"testing"
"github.com/kata-containers/kata-containers/src/runtime/pkg/govmm"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
ktu "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils"
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
@ -70,7 +71,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
if hypervisor == "" {
return config, fmt.Errorf("BUG: need hypervisor")
}
var coldPlugVFIO hv.PCIePort
hypervisorPath := path.Join(dir, "hypervisor")
kernelPath := path.Join(dir, "kernel")
kernelParams := "foo=bar xyz"
@ -85,6 +86,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
enableIOThreads := true
hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
coldPlugVFIO = hv.RootPort
disableNewNetNs := false
sharedFS := "virtio-9p"
virtioFSdaemon := path.Join(dir, "virtiofsd")
@ -107,6 +109,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
ColdPlugVFIO: coldPlugVFIO,
DisableNewNetNs: disableNewNetNs,
DefaultVCPUCount: defaultVCPUCount,
DefaultMaxVCPUCount: defaultMaxVCPUCount,
@ -170,6 +173,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
ColdPlugVFIO: coldPlugVFIO,
Msize9p: defaultMsize9p,
MemSlots: defaultMemSlots,
EntropySource: defaultEntropySource,
@ -564,6 +568,7 @@ func TestMinimalRuntimeConfig(t *testing.T) {
VirtioFSCache: defaultVirtioFSCacheMode,
BlockDeviceAIO: defaultBlockDeviceAIO,
DisableGuestSeLinux: defaultDisableGuestSeLinux,
ColdPlugVFIO: defaultColdPlugVFIO,
}
expectedAgentConfig := vc.KataAgentConfig{
@ -597,7 +602,7 @@ func TestMinimalRuntimeConfig(t *testing.T) {
func TestNewQemuHypervisorConfig(t *testing.T) {
dir := t.TempDir()
var coldPlugVFIO hv.PCIePort
hypervisorPath := path.Join(dir, "hypervisor")
kernelPath := path.Join(dir, "kernel")
imagePath := path.Join(dir, "image")
@ -606,6 +611,7 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
enableIOThreads := true
hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
coldPlugVFIO = hv.RootPort
orgVHostVSockDevicePath := utils.VHostVSockDevicePath
blockDeviceAIO := "io_uring"
defer func() {
@ -625,6 +631,7 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
ColdPlugVFIO: coldPlugVFIO,
RxRateLimiterMaxRate: rxRateLimiterMaxRate,
TxRateLimiterMaxRate: txRateLimiterMaxRate,
SharedFS: "virtio-fs",

View File

@ -292,6 +292,10 @@ type HypervisorConfig struct {
// The PCIe Root Port device is used to hot-plug the PCIe device
PCIeRootPort uint32
// ColdPlugVFIO is used to indicate if devices need to be coldplugged on the
// root port, switch or no port
ColdPlugVFIO hv.PCIePort
// BootToBeTemplate used to indicate if the VM is created to be a template VM
BootToBeTemplate bool

View File

@ -509,6 +509,10 @@ type HypervisorConfig struct {
// The PCIe Root Port device is used to hot-plug the PCIe device
PCIeRootPort uint32
// ColdPlugVFIO is used to indicate if devices need to be coldplugged on the
// root port, switch or no port
ColdPlugVFIO hv.PCIePort
// NumVCPUs specifies default number of vCPUs for the VM.
NumVCPUs uint32

View File

@ -487,6 +487,7 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) {
DisableNestingChecks: hconf.DisableNestingChecks,
DisableImageNvdimm: hconf.DisableImageNvdimm,
HotplugVFIOOnRootBus: hconf.HotplugVFIOOnRootBus,
ColdPlugVFIO: hconf.ColdPlugVFIO,
PCIeRootPort: hconf.PCIeRootPort,
BootToBeTemplate: hconf.BootToBeTemplate,
BootFromTemplate: hconf.BootFromTemplate,

View File

@ -7,6 +7,7 @@
package persistapi
import (
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/opencontainers/runc/libcontainer/configs"
specs "github.com/opencontainers/runtime-spec/specs-go"
)
@ -198,6 +199,10 @@ type HypervisorConfig struct {
// root bus instead of a bridge.
HotplugVFIOOnRootBus bool
// ColdPlugVFIO is used to indicate if devices need to be coldplugged on the
// root port or a switch or no-port
ColdPlugVFIO hv.PCIePort
// BootToBeTemplate used to indicate if the VM is created to be a template VM
BootToBeTemplate bool

View File

@ -83,6 +83,7 @@ type QemuState struct {
VirtiofsDaemonPid int
PCIeRootPort int
HotplugVFIOOnRootBus bool
ColdPlugVFIO hv.PCIePort
}
// qemu is an Hypervisor interface implementation for the Linux qemu hypervisor.
@ -282,6 +283,7 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso
q.Logger().Debug("Creating UUID")
q.state.UUID = uuid.Generate().String()
q.state.ColdPlugVFIO = q.config.ColdPlugVFIO
q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus
q.state.PCIeRootPort = int(q.config.PCIeRootPort)
@ -708,6 +710,18 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi
qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, hypervisorConfig.PCIeRootPort, memSize32bit, memSize64bit)
}
// The default OVMF MMIO aperture is too small for some PCIe devices
// with huge BARs so we need to increase it.
// memSize64bit is in bytes, convert to MB, OVMF expects MB as a string
if strings.Contains(strings.ToLower(hypervisorConfig.FirmwarePath), "ovmf") {
pciMmio64Mb := fmt.Sprintf("%d", (memSize64bit / 1024 / 1024))
fwCfg := govmmQemu.FwCfg{
Name: "opt/ovmf/X-PciMmio64Mb",
Str: pciMmio64Mb,
}
qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg)
}
q.qemuConfig = qemuConfig
q.virtiofsDaemon, err = q.createVirtiofsDaemon(hypervisorConfig.SharedPath)

View File

@ -32,6 +32,7 @@ import (
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers"
deviceManager "github.com/kata-containers/kata-containers/src/runtime/pkg/device/manager"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
resCtrl "github.com/kata-containers/kata-containers/src/runtime/pkg/resourcecontrol"
exp "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/experimental"
@ -620,6 +621,25 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
return nil, err
}
// If we have a confidential guest we need to cold-plug the PCIe VFIO devices
// until we have TDISP/IDE PCIe support.
coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != hv.NoPort)
var devs []config.DeviceInfo
for cnt, containers := range sandboxConfig.Containers {
for dev, device := range containers.DeviceInfos {
if coldPlugVFIO && deviceManager.IsVFIO(device.ContainerPath) {
device.ColdPlug = true
devs = append(devs, device)
// We need to remove the devices marked for cold-plug
// otherwise at the container level the kata-agent
// will try to hot-plug them.
infos := sandboxConfig.Containers[cnt].DeviceInfos
infos = append(infos[:dev], infos[dev+1:]...)
sandboxConfig.Containers[cnt].DeviceInfos = infos
}
}
}
// store doesn't require hypervisor to be stored immediately
if err = s.hypervisor.CreateVM(ctx, s.id, s.network, &sandboxConfig.HypervisorConfig); err != nil {
return nil, err
@ -629,6 +649,17 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
return nil, err
}
if !coldPlugVFIO {
return s, nil
}
for _, dev := range devs {
_, err := s.AddDevice(ctx, dev)
if err != nil {
s.Logger().WithError(err).Debug("Cannot cold-plug add device")
return nil, err
}
}
return s, nil
}