diff --git a/src/runtime/cmd/kata-runtime/kata-env.go b/src/runtime/cmd/kata-runtime/kata-env.go index 3cdef00289..c715d6b2c6 100644 --- a/src/runtime/cmd/kata-runtime/kata-env.go +++ b/src/runtime/cmd/kata-runtime/kata-env.go @@ -123,6 +123,8 @@ type HypervisorInfo struct { MemorySlots uint32 HotPlugVFIO config.PCIePort ColdPlugVFIO config.PCIePort + PCIeRootPort uint32 + PCIeSwitchPort uint32 Debug bool SecurityInfo SecurityInfo } @@ -339,6 +341,8 @@ func getHypervisorInfo(config oci.RuntimeConfig) (HypervisorInfo, error) { VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon, HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO, ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO, + PCIeRootPort: config.HypervisorConfig.PCIeRootPort, + PCIeSwitchPort: config.HypervisorConfig.PCIeSwitchPort, SocketPath: socketPath, SecurityInfo: securityInfo, }, nil diff --git a/src/runtime/cmd/kata-runtime/kata-env_test.go b/src/runtime/cmd/kata-runtime/kata-env_test.go index 5c0070041e..48e3ad1800 100644 --- a/src/runtime/cmd/kata-runtime/kata-env_test.go +++ b/src/runtime/cmd/kata-runtime/kata-env_test.go @@ -89,6 +89,8 @@ func makeRuntimeConfig(prefixDir string) (configFile string, ociConfig oci.Runti enableIOThreads := true hotPlugVFIO = config.BridgePort coldPlugVFIO = config.NoPort + pcieRootPort := uint32(0) + pcieSwitchPort := uint32(0) disableNewNetNs := false sharedFS := "virtio-9p" virtioFSdaemon := filepath.Join(prefixDir, "virtiofsd") @@ -133,6 +135,8 @@ func makeRuntimeConfig(prefixDir string) (configFile string, ociConfig oci.Runti EnableIOThreads: enableIOThreads, HotPlugVFIO: hotPlugVFIO, ColdPlugVFIO: coldPlugVFIO, + PCIeRootPort: pcieRootPort, + PCIeSwitchPort: pcieSwitchPort, DisableNewNetNs: disableNewNetNs, DefaultVCPUCount: hypConfig.NumVCPUs(), DefaultMaxVCPUCount: hypConfig.DefaultMaxVCPUs, @@ -276,6 +280,8 @@ func getExpectedHypervisor(config oci.RuntimeConfig) HypervisorInfo { VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon, HotPlugVFIO: 
config.HypervisorConfig.HotPlugVFIO, ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO, + PCIeRootPort: config.HypervisorConfig.PCIeRootPort, + PCIeSwitchPort: config.HypervisorConfig.PCIeSwitchPort, } if os.Geteuid() == 0 { diff --git a/src/runtime/pkg/containerd-shim-v2/create_test.go b/src/runtime/pkg/containerd-shim-v2/create_test.go index 6702ffd4cc..67530cafdc 100644 --- a/src/runtime/pkg/containerd-shim-v2/create_test.go +++ b/src/runtime/pkg/containerd-shim-v2/create_test.go @@ -335,6 +335,8 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (runtimeConfig string, virtioFSdaemon := path.Join(dir, "virtiofsd") hotPlugVFIO = config.BridgePort coldPlugVFIO = config.NoPort + pcieRootPort := uint32(0) + pcieSwitchPort := uint32(0) configFileOptions := ktu.RuntimeConfigOptions{ Hypervisor: "qemu", @@ -353,6 +355,8 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (runtimeConfig string, VirtioFSDaemon: virtioFSdaemon, HotPlugVFIO: hotPlugVFIO, ColdPlugVFIO: coldPlugVFIO, + PCIeRootPort: pcieRootPort, + PCIeSwitchPort: pcieSwitchPort, } runtimeConfigFileData := ktu.MakeRuntimeConfigFileData(configFileOptions) diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go index 773eaaa2d5..70f7a69326 100644 --- a/src/runtime/pkg/device/config/config.go +++ b/src/runtime/pkg/device/config/config.go @@ -227,13 +227,11 @@ func (p PCIePort) Valid() bool { return false } -type PCIePortMapping map[string]bool - var ( // Each of this structures keeps track of the devices attached to the // different types of PCI ports. We can deduces the Bus number from it // and eliminate duplicates being assigned. - PCIeDevices = map[PCIePort]PCIePortMapping{} + PCIeDevicesPerPort = map[PCIePort][]VFIODev{} ) // DeviceInfo is an embedded type that contains device data common to all types of devices. 
@@ -420,11 +418,12 @@ type VFIODev struct { // APDevices are the Adjunct Processor devices assigned to the mdev APDevices []string - // Rank identifies a device in a IOMMU group - Rank int - // Port is the PCIe port type to which the device is attached Port PCIePort + + // HostPath is the path to the device on the host we need it as a reference + // to match a /dev/vfio/ device to a device in GK mode + HostPath string } // RNGDev represents a random number generator device diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go index 5f76bff48e..a89ec9b7e0 100644 --- a/src/runtime/pkg/device/drivers/utils.go +++ b/src/runtime/pkg/device/drivers/utils.go @@ -41,6 +41,8 @@ var ( PCISysFsDevicesClass PCISysFsProperty = "class" // /sys/bus/pci/devices/xxx/class PCISysFsSlotsAddress PCISysFsProperty = "address" // /sys/bus/pci/slots/xxx/address PCISysFsSlotsMaxBusSpeed PCISysFsProperty = "max_bus_speed" // /sys/bus/pci/slots/xxx/max_bus_speed + PCISysFsDevicesVendor PCISysFsProperty = "vendor" // /sys/bus/pci/devices/xxx/vendor + PCISysFsDevicesDevice PCISysFsProperty = "device" // /sys/bus/pci/devices/xxx/device ) func deviceLogger() *logrus.Entry { @@ -194,6 +196,10 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe if ignorePCIDevice { continue } + // Fetch the PCI Vendor ID and Device ID + vendorID := getPCIDeviceProperty(deviceBDF, PCISysFsDevicesVendor) + deviceID := getPCIDeviceProperty(deviceBDF, PCISysFsDevicesDevice) + // Do not directly assign to `vfio` -- need to access field still vfio = config.VFIODev{ ID: id, @@ -202,8 +208,10 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe SysfsDev: deviceSysfsDev, IsPCIe: IsPCIeDevice(deviceBDF), Class: pciClass, - Rank: -1, + VendorID: vendorID, + DeviceID: deviceID, Port: device.Port, + HostPath: device.HostPath, } case config.VFIOAPDeviceMediatedType: diff --git a/src/runtime/pkg/device/drivers/vfio.go 
b/src/runtime/pkg/device/drivers/vfio.go index feec9c4482..dedaea4429 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -78,9 +78,12 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece } if vfio.IsPCIe { - busIndex := len(config.PCIeDevices[vfio.Port]) + busIndex := len(config.PCIeDevicesPerPort[vfio.Port]) vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex) - config.PCIeDevices[vfio.Port][vfio.BDF] = true + // We need to keep track of the number of devices per port to deduce + // the correct bus number, additionally we can use the VFIO device + // info to act upon different Vendor IDs and Device IDs. + config.PCIeDevicesPerPort[vfio.Port] = append(config.PCIeDevicesPerPort[vfio.Port], *vfio) } } diff --git a/src/runtime/pkg/device/manager/manager.go b/src/runtime/pkg/device/manager/manager.go index ed3708dc9a..3e687cde37 100644 --- a/src/runtime/pkg/device/manager/manager.go +++ b/src/runtime/pkg/device/manager/manager.go @@ -71,11 +71,10 @@ func NewDeviceManager(blockDriver string, vhostUserStoreEnabled bool, vhostUserS dm.blockDriver = config.VirtioSCSI } - config.PCIeDevices = make(map[config.PCIePort]config.PCIePortMapping) - - config.PCIeDevices[config.RootPort] = make(map[string]bool) - config.PCIeDevices[config.SwitchPort] = make(map[string]bool) - config.PCIeDevices[config.BridgePort] = make(map[string]bool) + config.PCIeDevicesPerPort = make(map[config.PCIePort][]config.VFIODev) + config.PCIeDevicesPerPort[config.RootPort] = make([]config.VFIODev, 0) + config.PCIeDevicesPerPort[config.SwitchPort] = make([]config.VFIODev, 0) + config.PCIeDevicesPerPort[config.BridgePort] = make([]config.VFIODev, 0) for _, dev := range devices { dm.devices[dev.DeviceID()] = dev diff --git a/src/runtime/pkg/hypervisors/hypervisor_state.go b/src/runtime/pkg/hypervisors/hypervisor_state.go index 7384cca5e4..ffdd5354ba 100644 --- 
a/src/runtime/pkg/hypervisors/hypervisor_state.go +++ b/src/runtime/pkg/hypervisors/hypervisor_state.go @@ -47,4 +47,6 @@ type HypervisorState struct { Pid int HotPlugVFIO config.PCIePort ColdPlugVFIO config.PCIePort + PCIeRootPort uint32 + PCIeSwitchPort uint32 } diff --git a/src/runtime/pkg/katatestutils/utils.go b/src/runtime/pkg/katatestutils/utils.go index 041a2ec5ed..c855ae9fbe 100644 --- a/src/runtime/pkg/katatestutils/utils.go +++ b/src/runtime/pkg/katatestutils/utils.go @@ -226,6 +226,8 @@ type RuntimeConfigOptions struct { PFlash []string HotPlugVFIO config.PCIePort ColdPlugVFIO config.PCIePort + PCIeRootPort uint32 + PCIeSwitchPort uint32 DefaultVCPUCount uint32 DefaultMaxVCPUCount uint32 DefaultMemSize uint32 @@ -318,6 +320,8 @@ func MakeRuntimeConfigFileData(config RuntimeConfigOptions) string { enable_iothreads = ` + strconv.FormatBool(config.EnableIOThreads) + ` cold_plug_vfio = "` + config.ColdPlugVFIO.String() + `" hot_plug_vfio = "` + config.HotPlugVFIO.String() + `" + pcie_root_port = ` + strconv.FormatUint(uint64(config.PCIeRootPort), 10) + ` + pcie_switch_port = ` + strconv.FormatUint(uint64(config.PCIeSwitchPort), 10) + ` msize_9p = ` + strconv.FormatUint(uint64(config.DefaultMsize9p), 10) + ` enable_debug = ` + strconv.FormatBool(config.HypervisorDebug) + ` guest_hook_path = "` + config.DefaultGuestHookPath + `" diff --git a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in index ea7ad487fe..bd57d30875 100644 --- a/src/runtime/pkg/katautils/config-settings.go.in +++ b/src/runtime/pkg/katautils/config-settings.go.in @@ -112,5 +112,8 @@ var defaultRuntimeConfiguration = "@CONFIG_PATH@" const defaultHotPlugVFIO = config.NoPort const defaultColdPlugVFIO = config.NoPort +const defaultPCIeRootPort = 0 +const defaultPCIeSwitchPort = 0 + const defaultRemoteHypervisorSocket = "/run/peerpod/hypervisor.sock" const defaultRemoteHypervisorTimeout = 600 diff --git a/src/runtime/pkg/katautils/config.go 
b/src/runtime/pkg/katautils/config.go index 91d048e657..b9e460ae5b 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -57,6 +57,10 @@ const ( // the maximum amount of PCI bridges that can be cold plugged in a VM maxPCIBridges uint32 = 5 + // For more info on why these values were chosen, see: + // https://github.com/kata-containers/kata-containers/blob/main/docs/design/kata-vra.md#hypervisor-resource-limits + maxPCIeRootPorts uint32 = 16 + maxPCIeSwitchPorts uint32 = 16 errInvalidHypervisorPrefix = "configuration file contains invalid hypervisor section" ) @@ -150,6 +154,8 @@ type hypervisor struct { DisableImageNvdimm bool `toml:"disable_image_nvdimm"` HotPlugVFIO config.PCIePort `toml:"hot_plug_vfio"` ColdPlugVFIO config.PCIePort `toml:"cold_plug_vfio"` + PCIeRootPort uint32 `toml:"pcie_root_port"` + PCIeSwitchPort uint32 `toml:"pcie_switch_port"` DisableVhostNet bool `toml:"disable_vhost_net"` GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"` ConfidentialGuest bool `toml:"confidential_guest"` @@ -302,6 +308,20 @@ func (h hypervisor) hotPlugVFIO() config.PCIePort { return h.HotPlugVFIO } +func (h hypervisor) pcieRootPort() uint32 { + if h.PCIeRootPort > maxPCIeRootPorts { + return maxPCIeRootPorts + } + return h.PCIeRootPort +} + +func (h hypervisor) pcieSwitchPort() uint32 { + if h.PCIeSwitchPort > maxPCIeSwitchPorts { + return maxPCIeSwitchPorts + } + return h.PCIeSwitchPort +} + func (h hypervisor) firmwareVolume() (string, error) { p := h.FirmwareVolume @@ -936,6 +956,8 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { DisableImageNvdimm: h.DisableImageNvdimm, HotPlugVFIO: h.hotPlugVFIO(), ColdPlugVFIO: h.coldPlugVFIO(), + PCIeRootPort: h.pcieRootPort(), + PCIeSwitchPort: h.pcieSwitchPort(), DisableVhostNet: h.DisableVhostNet, EnableVhostUserStore: h.EnableVhostUserStore, VhostUserStorePath: h.vhostUserStorePath(), @@ -1131,6 +1153,8 @@ func newClhHypervisorConfig(h hypervisor) 
(vc.HypervisorConfig, error) { Msize9p: h.msize9p(), ColdPlugVFIO: h.coldPlugVFIO(), HotPlugVFIO: h.hotPlugVFIO(), + PCIeRootPort: h.pcieRootPort(), + PCIeSwitchPort: h.pcieSwitchPort(), DisableVhostNet: true, GuestHookPath: h.guestHookPath(), VirtioFSExtraArgs: h.VirtioFSExtraArgs, @@ -1484,6 +1508,8 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig { Msize9p: defaultMsize9p, ColdPlugVFIO: defaultColdPlugVFIO, HotPlugVFIO: defaultHotPlugVFIO, + PCIeRootPort: defaultPCIeRootPort, + PCIeSwitchPort: defaultPCIeSwitchPort, GuestHookPath: defaultGuestHookPath, VhostUserStorePath: defaultVhostUserStorePath, VhostUserDeviceReconnect: defaultVhostUserDeviceReconnect, diff --git a/src/runtime/pkg/katautils/config_test.go b/src/runtime/pkg/katautils/config_test.go index 640afb2578..215ec3686c 100644 --- a/src/runtime/pkg/katautils/config_test.go +++ b/src/runtime/pkg/katautils/config_test.go @@ -87,6 +87,8 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntime enableIOThreads := true hotPlugVFIO = config.NoPort coldPlugVFIO = config.BridgePort + pcieRootPort := uint32(0) + pcieSwitchPort := uint32(0) disableNewNetNs := false sharedFS := "virtio-9p" virtioFSdaemon := path.Join(dir, "virtiofsd") @@ -109,6 +111,8 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntime EnableIOThreads: enableIOThreads, HotPlugVFIO: hotPlugVFIO, ColdPlugVFIO: coldPlugVFIO, + PCIeRootPort: pcieRootPort, + PCIeSwitchPort: pcieSwitchPort, DisableNewNetNs: disableNewNetNs, DefaultVCPUCount: defaultVCPUCount, DefaultMaxVCPUCount: defaultMaxVCPUCount, @@ -172,6 +176,8 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntime EnableIOThreads: enableIOThreads, HotPlugVFIO: hotPlugVFIO, ColdPlugVFIO: coldPlugVFIO, + PCIeRootPort: pcieRootPort, + PCIeSwitchPort: pcieSwitchPort, Msize9p: defaultMsize9p, MemSlots: defaultMemSlots, EntropySource: defaultEntropySource, @@ -569,6 +575,8 @@ func TestMinimalRuntimeConfig(t 
*testing.T) { DisableGuestSeLinux: defaultDisableGuestSeLinux, HotPlugVFIO: defaultHotPlugVFIO, ColdPlugVFIO: defaultColdPlugVFIO, + PCIeRootPort: defaultPCIeRootPort, + PCIeSwitchPort: defaultPCIeSwitchPort, } expectedAgentConfig := vc.KataAgentConfig{ @@ -610,6 +618,8 @@ func TestNewQemuHypervisorConfig(t *testing.T) { disableBlock := true enableIOThreads := true coldPlugVFIO = config.BridgePort + pcieRootPort := uint32(0) + pcieSwitchPort := uint32(0) orgVHostVSockDevicePath := utils.VHostVSockDevicePath blockDeviceAIO := "io_uring" defer func() { @@ -628,6 +638,8 @@ func TestNewQemuHypervisorConfig(t *testing.T) { DisableBlockDeviceUse: disableBlock, EnableIOThreads: enableIOThreads, ColdPlugVFIO: coldPlugVFIO, + PCIeRootPort: pcieRootPort, + PCIeSwitchPort: pcieSwitchPort, RxRateLimiterMaxRate: rxRateLimiterMaxRate, TxRateLimiterMaxRate: txRateLimiterMaxRate, SharedFS: "virtio-fs", diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go index 696cd59fb1..e3cfd1b6c7 100644 --- a/src/runtime/pkg/oci/utils.go +++ b/src/runtime/pkg/oci/utils.go @@ -463,6 +463,14 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig, return err } + if err := addHypervisorPCIeRootPortOverrides(ocispec, config); err != nil { + return err + } + + if err := addHypervisorPCIeSwitchPortOverrides(ocispec, config); err != nil { + return err + } + if value, ok := ocispec.Annotations[vcAnnotations.MachineType]; ok { if value != "" { config.HypervisorConfig.HypervisorMachineType = value @@ -605,6 +613,29 @@ func addHypervisorHotColdPlugVfioOverrides(ocispec specs.Spec, sbConfig *vc.Sand return nil } +func addHypervisorPCIeRootPortOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig) error { + + if err := newAnnotationConfiguration(ocispec, vcAnnotations.PCIeRootPort).setUint(func(pcieRootPort uint64) { + if pcieRootPort > 0 { + sbConfig.HypervisorConfig.PCIeRootPort = uint32(pcieRootPort) + } + }); err != nil { + return err + } + return nil +} 
+ +func addHypervisorPCIeSwitchPortOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig) error { + if err := newAnnotationConfiguration(ocispec, vcAnnotations.PCIeSwitchPort).setUint(func(pcieSwitchPort uint64) { + if pcieSwitchPort > 0 { + sbConfig.HypervisorConfig.PCIeSwitchPort = uint32(pcieSwitchPort) + } + }); err != nil { + return err + } + return nil +} + func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig, runtime RuntimeConfig) error { if err := newAnnotationConfiguration(ocispec, vcAnnotations.DefaultMemory).setUintWithCheck(func(memorySz uint64) error { diff --git a/src/runtime/pkg/oci/utils_test.go b/src/runtime/pkg/oci/utils_test.go index e769e0cb96..d363b8b204 100644 --- a/src/runtime/pkg/oci/utils_test.go +++ b/src/runtime/pkg/oci/utils_test.go @@ -661,6 +661,8 @@ func TestAddHypervisorAnnotations(t *testing.T) { ocispec.Annotations[vcAnnotations.DisableImageNvdimm] = "true" ocispec.Annotations[vcAnnotations.ColdPlugVFIO] = config.BridgePort ocispec.Annotations[vcAnnotations.HotPlugVFIO] = config.NoPort + ocispec.Annotations[vcAnnotations.PCIeRootPort] = "1" + ocispec.Annotations[vcAnnotations.PCIeSwitchPort] = "1" ocispec.Annotations[vcAnnotations.IOMMUPlatform] = "true" ocispec.Annotations[vcAnnotations.SGXEPC] = "64Mi" ocispec.Annotations[vcAnnotations.UseLegacySerial] = "true" @@ -701,6 +703,8 @@ func TestAddHypervisorAnnotations(t *testing.T) { assert.Equal(sbConfig.HypervisorConfig.DisableImageNvdimm, true) assert.Equal(string(sbConfig.HypervisorConfig.ColdPlugVFIO), string(config.BridgePort)) assert.Equal(string(sbConfig.HypervisorConfig.HotPlugVFIO), string(config.NoPort)) + assert.Equal(sbConfig.HypervisorConfig.PCIeRootPort, uint32(1)) + assert.Equal(sbConfig.HypervisorConfig.PCIeSwitchPort, uint32(1)) assert.Equal(sbConfig.HypervisorConfig.IOMMUPlatform, true) assert.Equal(sbConfig.HypervisorConfig.SGXEPCSize, int64(67108864)) assert.Equal(sbConfig.HypervisorConfig.LegacySerial, true) diff --git 
a/src/runtime/virtcontainers/container.go b/src/runtime/virtcontainers/container.go index d52a63079a..042971b70f 100644 --- a/src/runtime/virtcontainers/container.go +++ b/src/runtime/virtcontainers/container.go @@ -11,6 +11,7 @@ import ( "io" "os" "path/filepath" + "sort" "strconv" "strings" "syscall" @@ -830,12 +831,45 @@ func (c *Container) createDevices(contConfig *ContainerConfig) error { } deviceInfos := append(virtualVolumesDeviceInfos, contConfig.DeviceInfos...) + // If we have a confidential guest we need to cold-plug the PCIe VFIO devices + // until we have TDISP/IDE PCIe support. + coldPlugVFIO := (c.sandbox.config.HypervisorConfig.ColdPlugVFIO != config.NoPort) + // Aggregate all the container devices for hot-plug and use them to deduce + // the correct amount of ports to reserve for the hypervisor. + hotPlugVFIO := (c.sandbox.config.HypervisorConfig.HotPlugVFIO != config.NoPort) + modeIsVFIO := (c.sandbox.config.VfioMode == config.VFIOModeVFIO) + + updatedDeviceInfos := []config.DeviceInfo{} + + for i, vfio := range deviceInfos { + // Only considering VFIO updates for Port and ColdPlug or + // HotPlug updates + isVFIODevice := deviceManager.IsVFIODevice(vfio.ContainerPath) + if hotPlugVFIO && isVFIODevice { + deviceInfos[i].ColdPlug = false + deviceInfos[i].Port = c.sandbox.config.HypervisorConfig.HotPlugVFIO + } + // Device is already cold-plugged at sandbox creation time + // ignore it for the container creation + if coldPlugVFIO && isVFIODevice { + continue + } + + updatedDeviceInfos = append(updatedDeviceInfos, deviceInfos[i]) + } + // If modeVFIO is enabled we need 1st to attach the VFIO control group + // device /dev/vfio/vfio and 2nd the actual device(s) afterwards. 
+ // Sort the devices starting with device #1 being the VFIO control group + // device and the next the actuall device(s) /dev/vfio/ + if modeIsVFIO { + deviceInfos = sortContainerVFIODevices(updatedDeviceInfos) + } + for _, info := range deviceInfos { dev, err := c.sandbox.devManager.NewDevice(info) if err != nil { return err } - storedDevices = append(storedDevices, ContainerDevice{ ID: dev.DeviceID(), ContainerPath: info.ContainerPath, @@ -889,12 +923,12 @@ func (c *Container) checkBlockDeviceSupport(ctx context.Context) bool { // Sort the devices starting with device #1 being the VFIO control group // device and the next the actuall device(s) e.g. /dev/vfio/ -func sortContainerVFIODevices(devices []ContainerDevice) []ContainerDevice { - var vfioDevices []ContainerDevice +func sortContainerVFIODevices(devices []config.DeviceInfo) []config.DeviceInfo { + var vfioDevices []config.DeviceInfo for _, device := range devices { if deviceManager.IsVFIOControlDevice(device.ContainerPath) { - vfioDevices = append([]ContainerDevice{device}, vfioDevices...) + vfioDevices = append([]config.DeviceInfo{device}, vfioDevices...) continue } vfioDevices = append(vfioDevices, device) @@ -902,6 +936,66 @@ func sortContainerVFIODevices(devices []ContainerDevice) []ContainerDevice { return vfioDevices } +// Depending on the HW we might need to inject metadata into the container +// In this case for the NV GPU we need to provide the correct mapping from +// VFIO- to GPU index inside of the VM when vfio_mode="guest-kernel", +// otherwise we do not know which GPU is which. 
+func (c *Container) annotateContainerWithVFIOMetadata() { + + type relation struct { + Bus string + Path string + Index int + } + + modeIsGK := (c.sandbox.config.VfioMode == config.VFIOModeGuestKernel) + + if modeIsGK { + // Hot plug is done let's update meta information about the + // hot plugged devices especially VFIO devices in modeIsGK + siblings := make([]relation, 0) + // In the sandbox we first create the root-ports and secondly + // the switch-ports. The range over map is not deterministic + // so lets first iterate over all root-port devices and then + // switch-port devices no special handling for bridge-port (PCI) + for _, dev := range config.PCIeDevicesPerPort["root-port"] { + // For the NV GPU we need special handling let's use only those + if dev.VendorID == "0x10de" && strings.Contains(dev.Class, "0x030") { + siblings = append(siblings, relation{Bus: dev.Bus, Path: dev.HostPath}) + } + } + for _, dev := range config.PCIeDevicesPerPort["switch-port"] { + // For the NV GPU we need special handling let's use only those + if dev.VendorID == "0x10de" && strings.Contains(dev.Class, "0x030") { + siblings = append(siblings, relation{Bus: dev.Bus, Path: dev.HostPath}) + } + } + // We need to sort the VFIO devices by bus to get the correct + // ordering root-port < switch-port + sort.Slice(siblings, func(i, j int) bool { + return siblings[i].Bus < siblings[j].Bus + }) + + for i := range siblings { + siblings[i].Index = i + } + // Now that we have the index lets connect the /dev/vfio/ + // to the correct index + for _, dev := range c.devices { + for _, bdf := range siblings { + if bdf.Path == dev.ContainerPath { + vfioNum := filepath.Base(dev.ContainerPath) + annoKey := fmt.Sprintf("cdi.k8s.io/vfio%s", vfioNum) + annoValue := fmt.Sprintf("nvidia.com/gpu=%d", bdf.Index) + c.config.CustomSpec.Annotations[annoKey] = annoValue + c.Logger().Infof("Annotated container with %s: %s", annoKey, annoValue) + } + } + } + + } +} + // create creates and starts a container 
inside a Sandbox. It has to be // called only when a new container, not known by the sandbox, has to be created. func (c *Container) create(ctx context.Context) (err error) { @@ -921,37 +1015,6 @@ func (c *Container) create(ctx context.Context) (err error) { } } - // If cold-plug we've attached the devices already, do not try to - // attach them a second time. - coldPlugVFIO := (c.sandbox.config.HypervisorConfig.ColdPlugVFIO != config.NoPort) - modeVFIO := (c.sandbox.config.VfioMode == config.VFIOModeVFIO) - - if coldPlugVFIO { - var cntDevices []ContainerDevice - for _, dev := range c.devices { - isVFIOControlDevice := deviceManager.IsVFIOControlDevice(dev.ContainerPath) - if isVFIOControlDevice && modeVFIO { - cntDevices = append(cntDevices, dev) - } - - if strings.HasPrefix(dev.ContainerPath, vfioPath) { - c.Logger().WithFields(logrus.Fields{ - "device": dev, - }).Info("Remvoing device since we're cold-plugging no Attach needed") - continue - } - cntDevices = append(cntDevices, dev) - } - c.devices = cntDevices - } - // If modeVFIO is enabled we need 1st to attach the VFIO control group - // device /dev/vfio/vfio an 2nd the actuall device(s) afterwards. 
- // Sort the devices starting with device #1 being the VFIO control group - // device and the next the actuall device(s) /dev/vfio/ - if modeVFIO { - c.devices = sortContainerVFIODevices(c.devices) - } - c.Logger().WithFields(logrus.Fields{ "devices": c.devices, }).Info("Attach devices") @@ -959,6 +1022,8 @@ func (c *Container) create(ctx context.Context) (err error) { return } + c.annotateContainerWithVFIOMetadata() + // Deduce additional system mount info that should be handled by the agent // inside the VM c.getSystemMountInfo() diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index 41ab4495eb..aa30823247 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -546,6 +546,12 @@ type HypervisorConfig struct { // root port, switch or no port ColdPlugVFIO config.PCIePort + // PCIeRootPort is the number of root-port to create for the VM + PCIeRootPort uint32 + + // PCIeSwitchPort is the number of switch-port to create for the VM + PCIeSwitchPort uint32 + // NumVCPUs specifies default number of vCPUs for the VM. 
NumVCPUsF float32 diff --git a/src/runtime/virtcontainers/persist.go b/src/runtime/virtcontainers/persist.go index 034c61d312..4301ffc70a 100644 --- a/src/runtime/virtcontainers/persist.go +++ b/src/runtime/virtcontainers/persist.go @@ -486,6 +486,8 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) { DisableImageNvdimm: hconf.DisableImageNvdimm, HotPlugVFIO: hconf.HotPlugVFIO, ColdPlugVFIO: hconf.ColdPlugVFIO, + PCIeRootPort: hconf.PCIeRootPort, + PCIeSwitchPort: hconf.PCIeSwitchPort, BootToBeTemplate: hconf.BootToBeTemplate, BootFromTemplate: hconf.BootFromTemplate, DisableVhostNet: hconf.DisableVhostNet, diff --git a/src/runtime/virtcontainers/persist/api/config.go b/src/runtime/virtcontainers/persist/api/config.go index 1e284f3897..133fb028fb 100644 --- a/src/runtime/virtcontainers/persist/api/config.go +++ b/src/runtime/virtcontainers/persist/api/config.go @@ -199,6 +199,12 @@ type HypervisorConfig struct { // root, bridge, switch or no-port ColdPlugVFIO config.PCIePort + // PCIeRootPort is the number of ports needed in the hypervisor + PCIeRootPort uint32 + + // PCIeSwitchPort is the number of ports needed in the hypervisor + PCIeSwitchPort uint32 + // BootToBeTemplate used to indicate if the VM is created to be a template VM BootToBeTemplate bool diff --git a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go index 2b9cfbdd56..9414d05397 100644 --- a/src/runtime/virtcontainers/pkg/annotations/annotations.go +++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go @@ -128,6 +128,12 @@ const ( // HotPlugVFIO is a sandbox annotation used to indicate if devices need to be hotplugged. HotPlugVFIO = kataAnnotHypervisorPrefix + "hot_plug_vfio" + // PCIeRootPort is the number of PCIe root ports to create for the VM. + PCIeRootPort = kataAnnotHypervisorPrefix + "pcie_root_port" + + // PCIeSwitchPort is the number of PCIe switch ports to create for the VM. 
+ PCIeSwitchPort = kataAnnotHypervisorPrefix + "pcie_switch_port" + // EntropySource is a sandbox annotation to specify the path to a host source of // entropy (/dev/random, /dev/urandom or real hardware RNG device) EntropySource = kataAnnotHypervisorPrefix + "entropy_source" diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index ecb1b2161e..0d48d48b8d 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -83,6 +83,8 @@ type QemuState struct { VirtiofsDaemonPid int HotplugVFIO config.PCIePort ColdPlugVFIO config.PCIePort + PCIeRootPort uint32 + PCIeSwitchPort uint32 } // qemu is an Hypervisor interface implementation for the Linux qemu hypervisor. @@ -283,7 +285,8 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso q.state.UUID = uuid.Generate().String() q.state.HotPlugVFIO = q.config.HotPlugVFIO q.state.ColdPlugVFIO = q.config.ColdPlugVFIO - q.state.HotPlugVFIO = q.config.HotPlugVFIO + q.state.PCIeRootPort = q.config.PCIeRootPort + q.state.PCIeSwitchPort = q.config.PCIeSwitchPort // The path might already exist, but in case of VM templating, // we have to create it since the sandbox has not created it yet. 
@@ -803,11 +806,19 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort) vfioOnSwitchPort := (q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort) + // If the devices are not advertised via CRI or cold-plugged we need to + // get the number of pluggable root/switch ports from the config + numPCIeRootPorts := hypervisorConfig.PCIeRootPort + numPCIeSwitchPorts := hypervisorConfig.PCIeSwitchPort + // If number of PCIe root ports > 16 then bail out otherwise we may // use up all slots or IO memory on the root bus and vfio-XXX-pci devices // cannot be added which are crucial for Kata max slots on root bus is 32 // max slots on the complete pci(e) topology is 256 in QEMU if vfioOnRootPort { + if numOfPluggablePorts < numPCIeRootPorts { + numOfPluggablePorts = numPCIeRootPorts + } if numOfPluggablePorts > maxPCIeRootPort { return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort) } @@ -815,6 +826,9 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig return nil } if vfioOnSwitchPort { + if numOfPluggablePorts < numPCIeSwitchPorts { + numOfPluggablePorts = numPCIeSwitchPorts + } if numOfPluggablePorts > maxPCIeSwitchPort { return fmt.Errorf("Number of PCIe Switch Ports exceeed allowed max of %d", maxPCIeSwitchPort) } @@ -924,8 +938,9 @@ func (q *qemu) setupVirtioMem(ctx context.Context) error { machineType := q.HypervisorConfig().HypervisorMachineType if machineType == QemuVirt { addr = "00" - bridgeID = fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevices[config.RootPort])) - config.PCIeDevices[config.RootPort]["virtiomem"] = true + bridgeID = fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevicesPerPort[config.RootPort])) + dev := config.VFIODev{ID: "virtiomem"} + config.PCIeDevicesPerPort[config.RootPort] = 
append(config.PCIeDevicesPerPort[config.RootPort], dev) } err = q.qmpMonitorCh.qmp.ExecMemdevAdd(q.qmpMonitorCh.ctx, memoryBack, "virtiomem", target, sizeMB, share, "virtio-mem-pci", "virtiomem0", addr, bridgeID) @@ -1640,8 +1655,9 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V //Since the dev is the first and only one on this bus(root port), it should be 0. addr := "00" - bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevices[config.RootPort])) - config.PCIeDevices[config.RootPort][devID] = true + bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevicesPerPort[config.RootPort])) + dev := config.VFIODev{ID: devID} + config.PCIeDevicesPerPort[config.RootPort] = append(config.PCIeDevicesPerPort[config.RootPort], dev) bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID) bridgeSlot, err := q.arch.qomGetSlot(bridgeQomPath, &q.qmpMonitorCh) @@ -1901,8 +1917,10 @@ func (q *qemu) hotplugNetDevice(ctx context.Context, endpoint Endpoint, op Opera // Hotplug net dev to pcie root port for QemuVirt if machineType == QemuVirt { addr := "00" - bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevices[config.RootPort])) - config.PCIeDevices[config.RootPort][devID] = true + bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevicesPerPort[config.RootPort])) + dev := config.VFIODev{ID: devID} + config.PCIeDevicesPerPort[config.RootPort] = append(config.PCIeDevicesPerPort[config.RootPort], dev) + return q.qmpMonitorCh.qmp.ExecuteNetPCIDeviceAdd(q.qmpMonitorCh.ctx, tap.Name, devID, endpoint.HardwareAddr(), addr, bridgeID, romFile, int(q.config.NumVCPUs()), defaultDisableModern) }