From c18ceae10925529ca6b4259e8fd10f5e233bad20 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Thu, 20 Apr 2023 10:56:29 +0000 Subject: [PATCH 01/12] gpu: Add new struct PCIePort For the hypervisor to distinguish between PCIe components, adding a new enum that can be used for hot-plug and cold-plug of PCIe devices Fixes: #6687 Signed-off-by: Zvonko Kaiser --- .../pkg/hypervisors/hypervisor_state.go | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/src/runtime/pkg/hypervisors/hypervisor_state.go b/src/runtime/pkg/hypervisors/hypervisor_state.go index b1f58e7fa6..6a9fd7af20 100644 --- a/src/runtime/pkg/hypervisors/hypervisor_state.go +++ b/src/runtime/pkg/hypervisors/hypervisor_state.go @@ -26,6 +26,20 @@ type CPUDevice struct { ID string } +// PCIePort distinguish only between root and switch port +type PCIePort string + +const ( + // RootPort attach VFIO devices to a root-port + RootPort PCIePort = "root-port" + // SwitchPort attach VFIO devices to a switch-port + SwitchPort = "switch-port" + // BridgePort is the default + BridgePort = "bridge-port" + // NoPort is for disabling VFIO hotplug/coldplug + NoPort = "no-port" +) + type HypervisorState struct { BlockIndexMap map[int]struct{} @@ -41,10 +55,10 @@ type HypervisorState struct { // HotpluggedCPUs is the list of CPUs that were hot-added HotpluggedVCPUs []CPUDevice - HotpluggedMemory int - VirtiofsDaemonPid int - Pid int - PCIeRootPort int - + HotpluggedMemory int + VirtiofsDaemonPid int + Pid int + PCIeRootPort int + ColdPlugVFIO PCIePort HotplugVFIOOnRootBus bool } From 377ebc2ad1f923c6ffc982f24d6f129b4cce2c03 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Thu, 20 Apr 2023 11:24:50 +0000 Subject: [PATCH 02/12] gpu: Add configuration option for cold-plug VFIO Users can set cold-plug="root-port" to cold plug a VFIO device in QEMU Signed-off-by: Zvonko Kaiser --- src/runtime/cmd/kata-runtime/kata-env.go | 2 + src/runtime/cmd/kata-runtime/kata-env_test.go | 3 + src/runtime/pkg/katatestutils/utils.go | 2 + .../pkg/katautils/config-settings.go.in | 2 + src/runtime/pkg/katautils/config.go | 164 +++++++++--------- src/runtime/virtcontainers/hypervisor.go | 4 + src/runtime/virtcontainers/kata_agent.go | 3 +- src/runtime/virtcontainers/persist.go | 1 + .../virtcontainers/persist/api/config.go | 5 + 9 files changed, 104 insertions(+), 82 deletions(-) diff --git a/src/runtime/cmd/kata-runtime/kata-env.go b/src/runtime/cmd/kata-runtime/kata-env.go index c129f8f434..c7e919d62e 100644 --- a/src/runtime/cmd/kata-runtime/kata-env.go +++ b/src/runtime/cmd/kata-runtime/kata-env.go @@ -17,6 +17,7 @@ import ( "github.com/prometheus/procfs" "github.com/urfave/cli" + hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils" "github.com/kata-containers/kata-containers/src/runtime/pkg/oci" "github.com/kata-containers/kata-containers/src/runtime/pkg/utils" @@ -113,6 +114,7 @@ type HypervisorInfo struct { Msize9p uint32 MemorySlots uint32 PCIeRootPort uint32 + ColdPlugVFIO hv.PCIePort HotplugVFIOOnRootBus bool Debug bool } diff --git a/src/runtime/cmd/kata-runtime/kata-env_test.go b/src/runtime/cmd/kata-runtime/kata-env_test.go index 321bc507b6..246889472d 100644 --- a/src/runtime/cmd/kata-runtime/kata-env_test.go +++ b/src/runtime/cmd/kata-runtime/kata-env_test.go @@ -24,6 +24,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/urfave/cli" + hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils" "github.com/kata-containers/kata-containers/src/runtime/pkg/oci" @@ -85,6 +86,7 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC blockStorageDriver := "virtio-scsi" enableIOThreads := true hotplugVFIOOnRootBus := true + coldPlugVFIO := hv.RootPort pcieRootPort := uint32(2) disableNewNetNs := false sharedFS := "virtio-9p" @@ -129,6 +131,7 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC BlockDeviceDriver: blockStorageDriver, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, + ColdPlugVFIO: coldPlugVFIO, PCIeRootPort: pcieRootPort, DisableNewNetNs: disableNewNetNs, DefaultVCPUCount: hypConfig.NumVCPUs, diff --git a/src/runtime/pkg/katatestutils/utils.go b/src/runtime/pkg/katatestutils/utils.go index 4e3a784a23..b973063e89 100644 --- a/src/runtime/pkg/katatestutils/utils.go +++ b/src/runtime/pkg/katatestutils/utils.go @@ -14,6 +14,7 @@ import ( "strconv" "testing" + hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" "github.com/opencontainers/runtime-spec/specs-go" "github.com/stretchr/testify/assert" ) @@ -224,6 +225,7 @@ type RuntimeConfigOptions struct { JaegerPassword string PFlash []string PCIeRootPort uint32 + ColdPlugVFIO hv.PCIePort DefaultVCPUCount uint32 DefaultMaxVCPUCount uint32 DefaultMemSize uint32 diff --git a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in index 7bfab6d9f4..14a2b0b585 100644 --- a/src/runtime/pkg/katautils/config-settings.go.in +++ b/src/runtime/pkg/katautils/config-settings.go.in @@ -103,3 +103,5 @@ const defaultVMCacheEndpoint string = "/var/run/kata-containers/cache.sock" // Default config file used by stateless systems. var defaultRuntimeConfiguration = "@CONFIG_PATH@" + +const defaultColdPlugVFIO = "no-port" diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 763e9a6f44..f419e0d610 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -20,6 +20,7 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm" govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu" + hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace" "github.com/kata-containers/kata-containers/src/runtime/pkg/oci" vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" @@ -77,87 +78,88 @@ type factory struct { } type hypervisor struct { - Path string `toml:"path"` - JailerPath string `toml:"jailer_path"` - Kernel string `toml:"kernel"` - CtlPath string `toml:"ctlpath"` - Initrd string `toml:"initrd"` - Image string `toml:"image"` - RootfsType string `toml:"rootfs_type"` - Firmware string `toml:"firmware"` - FirmwareVolume string `toml:"firmware_volume"` - MachineAccelerators string `toml:"machine_accelerators"` - CPUFeatures string `toml:"cpu_features"` - KernelParams string `toml:"kernel_params"` - MachineType string `toml:"machine_type"` - BlockDeviceDriver string `toml:"block_device_driver"` - EntropySource string `toml:"entropy_source"` - SharedFS string `toml:"shared_fs"` - VirtioFSDaemon string `toml:"virtio_fs_daemon"` - VirtioFSCache string `toml:"virtio_fs_cache"` - VhostUserStorePath string `toml:"vhost_user_store_path"` - FileBackedMemRootDir string `toml:"file_mem_backend"` - GuestHookPath string `toml:"guest_hook_path"` - GuestMemoryDumpPath string `toml:"guest_memory_dump_path"` - SeccompSandbox string `toml:"seccompsandbox"` - BlockDeviceAIO string `toml:"block_device_aio"` - HypervisorPathList []string `toml:"valid_hypervisor_paths"` - JailerPathList []string `toml:"valid_jailer_paths"` - CtlPathList []string `toml:"valid_ctlpaths"` - VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"` - VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"` - PFlashList []string `toml:"pflashes"` - VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"` - FileBackedMemRootList []string `toml:"valid_file_mem_backends"` - EntropySourceList []string `toml:"valid_entropy_sources"` - EnableAnnotations []string `toml:"enable_annotations"` - RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"` - TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"` - MemOffset uint64 `toml:"memory_offset"` - DefaultMaxMemorySize uint64 `toml:"default_maxmemory"` - DiskRateLimiterBwMaxRate int64 `toml:"disk_rate_limiter_bw_max_rate"` - DiskRateLimiterBwOneTimeBurst int64 `toml:"disk_rate_limiter_bw_one_time_burst"` - DiskRateLimiterOpsMaxRate int64 `toml:"disk_rate_limiter_ops_max_rate"` - DiskRateLimiterOpsOneTimeBurst int64 `toml:"disk_rate_limiter_ops_one_time_burst"` - NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"` - NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"` - NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"` - NetRateLimiterOpsOneTimeBurst int64 `toml:"net_rate_limiter_ops_one_time_burst"` - VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"` - VirtioFSQueueSize uint32 `toml:"virtio_fs_queue_size"` - DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"` - MemorySize uint32 `toml:"default_memory"` - MemSlots uint32 `toml:"memory_slots"` - DefaultBridges uint32 `toml:"default_bridges"` - Msize9p uint32 `toml:"msize_9p"` - PCIeRootPort uint32 `toml:"pcie_root_port"` - NumVCPUs int32 `toml:"default_vcpus"` - BlockDeviceCacheSet bool `toml:"block_device_cache_set"` - BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"` - BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"` - EnableVhostUserStore bool `toml:"enable_vhost_user_store"` - VhostUserDeviceReconnect uint32 `toml:"vhost_user_reconnect_timeout_sec"` - DisableBlockDeviceUse bool `toml:"disable_block_device_use"` - MemPrealloc bool `toml:"enable_mem_prealloc"` - HugePages bool `toml:"enable_hugepages"` - VirtioMem bool `toml:"enable_virtio_mem"` - IOMMU bool `toml:"enable_iommu"` - IOMMUPlatform bool `toml:"enable_iommu_platform"` - Debug bool `toml:"enable_debug"` - DisableNestingChecks bool `toml:"disable_nesting_checks"` - EnableIOThreads bool `toml:"enable_iothreads"` - DisableImageNvdimm bool `toml:"disable_image_nvdimm"` - HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"` - DisableVhostNet bool `toml:"disable_vhost_net"` - GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"` - ConfidentialGuest bool `toml:"confidential_guest"` - SevSnpGuest bool `toml:"sev_snp_guest"` - GuestSwap bool `toml:"enable_guest_swap"` - Rootless bool `toml:"rootless"` - DisableSeccomp bool `toml:"disable_seccomp"` - DisableSeLinux bool `toml:"disable_selinux"` - DisableGuestSeLinux bool `toml:"disable_guest_selinux"` - LegacySerial bool `toml:"use_legacy_serial"` + Path string `toml:"path"` + JailerPath string `toml:"jailer_path"` + Kernel string `toml:"kernel"` + CtlPath string `toml:"ctlpath"` + Initrd string `toml:"initrd"` + Image string `toml:"image"` + RootfsType string `toml:"rootfs_type"` + Firmware string `toml:"firmware"` + FirmwareVolume string `toml:"firmware_volume"` + MachineAccelerators string `toml:"machine_accelerators"` + CPUFeatures string `toml:"cpu_features"` + KernelParams string `toml:"kernel_params"` + MachineType string `toml:"machine_type"` + BlockDeviceDriver string `toml:"block_device_driver"` + EntropySource string `toml:"entropy_source"` + SharedFS string `toml:"shared_fs"` + VirtioFSDaemon string `toml:"virtio_fs_daemon"` + VirtioFSCache string `toml:"virtio_fs_cache"` + VhostUserStorePath string `toml:"vhost_user_store_path"` + FileBackedMemRootDir string `toml:"file_mem_backend"` + GuestHookPath string `toml:"guest_hook_path"` + GuestMemoryDumpPath string `toml:"guest_memory_dump_path"` + SeccompSandbox string `toml:"seccompsandbox"` + BlockDeviceAIO string `toml:"block_device_aio"` + HypervisorPathList []string `toml:"valid_hypervisor_paths"` + JailerPathList []string `toml:"valid_jailer_paths"` + CtlPathList []string `toml:"valid_ctlpaths"` + VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"` + VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"` + PFlashList []string `toml:"pflashes"` + VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"` + FileBackedMemRootList []string `toml:"valid_file_mem_backends"` + EntropySourceList []string `toml:"valid_entropy_sources"` + EnableAnnotations []string `toml:"enable_annotations"` + RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"` + TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"` + MemOffset uint64 `toml:"memory_offset"` + DefaultMaxMemorySize uint64 `toml:"default_maxmemory"` + DiskRateLimiterBwMaxRate int64 `toml:"disk_rate_limiter_bw_max_rate"` + DiskRateLimiterBwOneTimeBurst int64 `toml:"disk_rate_limiter_bw_one_time_burst"` + DiskRateLimiterOpsMaxRate int64 `toml:"disk_rate_limiter_ops_max_rate"` + DiskRateLimiterOpsOneTimeBurst int64 `toml:"disk_rate_limiter_ops_one_time_burst"` + NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"` + NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"` + NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"` + NetRateLimiterOpsOneTimeBurst int64 `toml:"net_rate_limiter_ops_one_time_burst"` + VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"` + VirtioFSQueueSize uint32 `toml:"virtio_fs_queue_size"` + DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"` + MemorySize uint32 `toml:"default_memory"` + MemSlots uint32 `toml:"memory_slots"` + DefaultBridges uint32 `toml:"default_bridges"` + Msize9p uint32 `toml:"msize_9p"` + PCIeRootPort uint32 `toml:"pcie_root_port"` + NumVCPUs int32 `toml:"default_vcpus"` + BlockDeviceCacheSet bool `toml:"block_device_cache_set"` + BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"` + BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"` + EnableVhostUserStore bool `toml:"enable_vhost_user_store"` + VhostUserDeviceReconnect uint32 `toml:"vhost_user_reconnect_timeout_sec"` + DisableBlockDeviceUse bool `toml:"disable_block_device_use"` + MemPrealloc bool `toml:"enable_mem_prealloc"` + HugePages bool `toml:"enable_hugepages"` + VirtioMem bool `toml:"enable_virtio_mem"` + IOMMU bool `toml:"enable_iommu"` + IOMMUPlatform bool `toml:"enable_iommu_platform"` + Debug bool `toml:"enable_debug"` + DisableNestingChecks bool `toml:"disable_nesting_checks"` + EnableIOThreads bool `toml:"enable_iothreads"` + DisableImageNvdimm bool `toml:"disable_image_nvdimm"` + HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"` + ColdPlugVFIO hv.PCIePort `toml:"cold_plug_vfio"` + DisableVhostNet bool `toml:"disable_vhost_net"` + GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"` + ConfidentialGuest bool `toml:"confidential_guest"` + SevSnpGuest bool `toml:"sev_snp_guest"` + GuestSwap bool `toml:"enable_guest_swap"` + Rootless bool `toml:"rootless"` + DisableSeccomp bool `toml:"disable_seccomp"` + DisableSeLinux bool `toml:"disable_selinux"` + DisableGuestSeLinux bool `toml:"disable_guest_selinux"` + LegacySerial bool `toml:"use_legacy_serial"` } type runtime struct { diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index dee5fec8fe..0a490ef577 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -509,6 +509,10 @@ type HypervisorConfig struct { // The PCIe Root Port device is used to hot-plug the PCIe device PCIeRootPort uint32 + // ColdPlugVFIO is used to indicate if devices need to be coldplugged on the + // root port, switch or no port + ColdPlugVFIO hv.PCIePort + // NumVCPUs specifies default number of vCPUs for the VM. NumVCPUs uint32 diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go index 9e5c8b34f4..5c22277d0a 100644 --- a/src/runtime/virtcontainers/kata_agent.go +++ b/src/runtime/virtcontainers/kata_agent.go @@ -1177,7 +1177,8 @@ func (k *kataAgent) appendDevices(deviceList []*grpc.Device, c *Container) []*gr case config.VhostUserBlk: kataDevice = k.appendVhostUserBlkDevice(dev, device, c) case config.DeviceVFIO: - kataDevice = k.appendVfioDevice(dev, device, c) + k.Logger().Infof("### ColdPlugging container is not adding any VFIO devices") + //kataDevice = k.appendVfioDevice(dev, device, c) } if kataDevice == nil { diff --git a/src/runtime/virtcontainers/persist.go b/src/runtime/virtcontainers/persist.go index 18c83e2515..cbba44e603 100644 --- a/src/runtime/virtcontainers/persist.go +++ b/src/runtime/virtcontainers/persist.go @@ -487,6 +487,7 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) { DisableNestingChecks: hconf.DisableNestingChecks, DisableImageNvdimm: hconf.DisableImageNvdimm, HotplugVFIOOnRootBus: hconf.HotplugVFIOOnRootBus, + ColdPlugVFIO: hconf.ColdPlugVFIO, PCIeRootPort: hconf.PCIeRootPort, BootToBeTemplate: hconf.BootToBeTemplate, BootFromTemplate: hconf.BootFromTemplate, diff --git a/src/runtime/virtcontainers/persist/api/config.go b/src/runtime/virtcontainers/persist/api/config.go index 5bef012194..71533d6519 100644 --- a/src/runtime/virtcontainers/persist/api/config.go +++ b/src/runtime/virtcontainers/persist/api/config.go @@ -7,6 +7,7 @@ package persistapi import ( + hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" "github.com/opencontainers/runc/libcontainer/configs" specs "github.com/opencontainers/runtime-spec/specs-go" ) @@ -198,6 +199,10 @@ type HypervisorConfig struct { // root bus instead of a bridge. HotplugVFIOOnRootBus bool + // ColdPlugVFIO is used to indicate if devices need to be coldlugged on the + // root port or a switch or no-port + ColdPlugVFIO hv.PCIePort + // BootToBeTemplate used to indicate if the VM is created to be a template VM BootToBeTemplate bool From 6107c32d70377f4bc83ed22aaab3ea4e035ada48 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Thu, 20 Apr 2023 11:34:17 +0000 Subject: [PATCH 03/12] gpu: Assign default value to cold-plug Make sure the configuration is propagated to the right structs and the default value is assigned. Signed-off-by: Zvonko Kaiser --- src/runtime/cmd/kata-runtime/kata-env_test.go | 3 --- src/runtime/pkg/katautils/config.go | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/runtime/cmd/kata-runtime/kata-env_test.go b/src/runtime/cmd/kata-runtime/kata-env_test.go index 246889472d..321bc507b6 100644 --- a/src/runtime/cmd/kata-runtime/kata-env_test.go +++ b/src/runtime/cmd/kata-runtime/kata-env_test.go @@ -24,7 +24,6 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/urfave/cli" - hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils" "github.com/kata-containers/kata-containers/src/runtime/pkg/oci" @@ -86,7 +85,6 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC blockStorageDriver := "virtio-scsi" enableIOThreads := true hotplugVFIOOnRootBus := true - coldPlugVFIO := hv.RootPort pcieRootPort := uint32(2) disableNewNetNs := false sharedFS := "virtio-9p" @@ -131,7 +129,6 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC BlockDeviceDriver: blockStorageDriver, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, - ColdPlugVFIO: coldPlugVFIO, PCIeRootPort: pcieRootPort, DisableNewNetNs: disableNewNetNs, DefaultVCPUCount: hypConfig.NumVCPUs, diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index f419e0d610..06b217a9a0 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -856,6 +856,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { Msize9p: h.msize9p(), DisableImageNvdimm: h.DisableImageNvdimm, HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, + ColdPlugVFIO: h.ColdPlugVFIO, PCIeRootPort: h.PCIeRootPort, DisableVhostNet: h.DisableVhostNet, EnableVhostUserStore: h.EnableVhostUserStore, @@ -1050,6 +1051,7 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { EnableIOThreads: h.EnableIOThreads, Msize9p: h.msize9p(), HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, + ColdPlugVFIO: h.ColdPlugVFIO, PCIeRootPort: h.PCIeRootPort, DisableVhostNet: true, GuestHookPath: h.guestHookPath(), @@ -1280,6 +1282,7 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig { EnableIOThreads: defaultEnableIOThreads, Msize9p: defaultMsize9p, HotplugVFIOOnRootBus: defaultHotplugVFIOOnRootBus, + ColdPlugVFIO: defaultColdPlugVFIO, PCIeRootPort: defaultPCIeRootPort, GuestHookPath: defaultGuestHookPath, VhostUserStorePath: defaultVhostUserStorePath, From e2b5e7f73bbf0f4eacfedbb0fa69b1fa791198ac Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Thu, 20 Apr 2023 11:41:44 +0000 Subject: [PATCH 04/12] gpu: Add Rawdevices to hypervisor RawDevics are used to get PCIe device info early before the sandbox is started to make better PCIe topology decisions Signed-off-by: Zvonko Kaiser --- src/runtime/virtcontainers/hypervisor.go | 4 ++++ src/runtime/virtcontainers/sandbox.go | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index 0a490ef577..bc33cfd672 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -509,6 +509,10 @@ type HypervisorConfig struct { // The PCIe Root Port device is used to hot-plug the PCIe device PCIeRootPort uint32 + // RawDevics are used to get PCIe device info early before the sandbox + // is started to make better PCIe topology decisions + RawDevices []config.DeviceInfo + // ColdPlugVFIO is used to indicate if devices need to be coldplugged on the // root port, switch or no port ColdPlugVFIO hv.PCIePort diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index e7c52fcc22..da1d52e9d5 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -619,6 +619,12 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor if err := validateHypervisorConfig(&sandboxConfig.HypervisorConfig); err != nil { return nil, err } + // Aggregate all the container devices and update the HV config + var devices []config.DeviceInfo + for _, ct := range sandboxConfig.Containers { + devices = append(devices, ct.DeviceInfos...) + } + sandboxConfig.HypervisorConfig.RawDevices = devices // store doesn't require hypervisor to be stored immediately if err = s.hypervisor.CreateVM(ctx, s.id, s.network, &sandboxConfig.HypervisorConfig); err != nil { From c8cf7ed3bc3f9b7f40dcb66fcffc97eea1db4a57 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Fri, 21 Apr 2023 08:56:47 +0000 Subject: [PATCH 05/12] gpu: Add ColdPlug of VFIO devices with devManager If we have a VFIO device and cold-plug is enabled we mark each device as ColdPlug=true and let the VFIO module do the attaching. Signed-off-by: Zvonko Kaiser --- src/runtime/pkg/device/manager/manager.go | 2 +- src/runtime/pkg/device/manager/utils.go | 4 +- src/runtime/pkg/device/manager/utils_test.go | 2 +- src/runtime/virtcontainers/hypervisor.go | 4 +- src/runtime/virtcontainers/kata_agent.go | 3 +- src/runtime/virtcontainers/qemu.go | 20 +++++++++- src/runtime/virtcontainers/sandbox.go | 40 +++++++++++++++++--- 7 files changed, 61 insertions(+), 14 deletions(-) diff --git a/src/runtime/pkg/device/manager/manager.go b/src/runtime/pkg/device/manager/manager.go index 34a51d3001..baf1209a75 100644 --- a/src/runtime/pkg/device/manager/manager.go +++ b/src/runtime/pkg/device/manager/manager.go @@ -116,7 +116,7 @@ func (dm *deviceManager) createDevice(devInfo config.DeviceInfo) (dev api.Device if devInfo.ID, err = dm.newDeviceID(); err != nil { return nil, err } - if isVFIO(devInfo.HostPath) { + if IsVFIO(devInfo.HostPath) { return drivers.NewVFIODevice(&devInfo), nil } else if isVhostUserBlk(devInfo) { if devInfo.DriverOptions == nil { diff --git a/src/runtime/pkg/device/manager/utils.go b/src/runtime/pkg/device/manager/utils.go index 17d14741c1..e78205d0c7 100644 --- a/src/runtime/pkg/device/manager/utils.go +++ b/src/runtime/pkg/device/manager/utils.go @@ -17,8 +17,8 @@ const ( vfioPath = "/dev/vfio/" ) -// isVFIO checks if the device provided is a vfio group. -func isVFIO(hostPath string) bool { +// IsVFIO checks if the device provided is a vfio group. +func IsVFIO(hostPath string) bool { // Ignore /dev/vfio/vfio character device if strings.HasPrefix(hostPath, filepath.Join(vfioPath, "vfio")) { return false diff --git a/src/runtime/pkg/device/manager/utils_test.go b/src/runtime/pkg/device/manager/utils_test.go index 273283823f..b57992b3d0 100644 --- a/src/runtime/pkg/device/manager/utils_test.go +++ b/src/runtime/pkg/device/manager/utils_test.go @@ -31,7 +31,7 @@ func TestIsVFIO(t *testing.T) { } for _, d := range data { - isVFIO := isVFIO(d.path) + isVFIO := IsVFIO(d.path) assert.Equal(t, d.expected, isVFIO) } } diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index bc33cfd672..f773e91d5b 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -509,9 +509,9 @@ type HypervisorConfig struct { // The PCIe Root Port device is used to hot-plug the PCIe device PCIeRootPort uint32 - // RawDevics are used to get PCIe device info early before the sandbox + // VFIODevics are used to get PCIe device info early before the sandbox // is started to make better PCIe topology decisions - RawDevices []config.DeviceInfo + VFIODevices []config.DeviceInfo // ColdPlugVFIO is used to indicate if devices need to be coldplugged on the // root port, switch or no port diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go index 5c22277d0a..9e5c8b34f4 100644 --- a/src/runtime/virtcontainers/kata_agent.go +++ b/src/runtime/virtcontainers/kata_agent.go @@ -1177,8 +1177,7 @@ func (k *kataAgent) appendDevices(deviceList []*grpc.Device, c *Container) []*gr case config.VhostUserBlk: kataDevice = k.appendVhostUserBlkDevice(dev, device, c) case config.DeviceVFIO: - k.Logger().Infof("### ColdPlugging container is not adding any VFIO devices") - //kataDevice = k.appendVfioDevice(dev, device, c) + kataDevice = k.appendVfioDevice(dev, device, c) } if kataDevice == nil { diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 3c8cad5204..d65c93553a 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -83,6 +83,7 @@ type QemuState struct { VirtiofsDaemonPid int PCIeRootPort int HotplugVFIOOnRootBus bool + ColdPlugVFIO hv.PCIePort } // qemu is an Hypervisor interface implementation for the Linux qemu hypervisor. @@ -282,6 +283,7 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso q.Logger().Debug("Creating UUID") q.state.UUID = uuid.Generate().String() + q.state.ColdPlugVFIO = q.config.ColdPlugVFIO q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus q.state.PCIeRootPort = int(q.config.PCIeRootPort) @@ -708,9 +710,25 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, hypervisorConfig.PCIeRootPort, memSize32bit, memSize64bit) } + q.virtiofsDaemon, err = q.createVirtiofsDaemon(hypervisorConfig.SharedPath) + + // If we have a VFIO device we need to update the firmware configuration + // if executed in a trusted execution environment. + if hypervisorConfig.ConfidentialGuest { + // At the sandbox level we alreaady checked that we have a + // VFIO device, pass-through of a PCIe device needs allocated + // mmemory in the firmware otherwise BARs cannot be mapped + if len(hypervisorConfig.VFIODevices) > 0 { + fwCfg := govmmQemu.FwCfg{ + Name: "opt/ovmf/X-PciMmio64Mb", + Str: "262144", + } + qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg) + } + } + q.qemuConfig = qemuConfig - q.virtiofsDaemon, err = q.createVirtiofsDaemon(hypervisorConfig.SharedPath) return err } diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index da1d52e9d5..35b3193944 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -32,6 +32,7 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" "github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers" deviceManager "github.com/kata-containers/kata-containers/src/runtime/pkg/device/manager" + hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace" resCtrl "github.com/kata-containers/kata-containers/src/runtime/pkg/resourcecontrol" exp "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/experimental" @@ -619,12 +620,30 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor if err := validateHypervisorConfig(&sandboxConfig.HypervisorConfig); err != nil { return nil, err } - // Aggregate all the container devices and update the HV config - var devices []config.DeviceInfo - for _, ct := range sandboxConfig.Containers { - devices = append(devices, ct.DeviceInfos...) + + // If we have a confidential guest we need to cold-plug the PCIe VFIO devices + // until we have TDISP/IDE PCIe support. + coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != hv.NoPort) + var devs []config.DeviceInfo + for cnt, containers := range sandboxConfig.Containers { + for dev, device := range containers.DeviceInfos { + if coldPlugVFIO && deviceManager.IsVFIO(device.ContainerPath) { + device.ColdPlug = true + devs = append(devs, device) + // We need to remove the devices marked for cold-plug + // otherwise at the container level the kata-agent + // will try to hot-plug them. + infos := sandboxConfig.Containers[cnt].DeviceInfos + infos = append(infos[:dev], infos[dev+1:]...) + sandboxConfig.Containers[cnt].DeviceInfos = infos + } + } } - sandboxConfig.HypervisorConfig.RawDevices = devices + // If we have a confidential guest, we need to add a specific + // firmware configuration to the hypervisor. We cannot do it here at + // the sandbox level we need to do that at the hypervisor level, capturing + // the devices here and processing in CreateVM(). + sandboxConfig.HypervisorConfig.VFIODevices = devs // store doesn't require hypervisor to be stored immediately if err = s.hypervisor.CreateVM(ctx, s.id, s.network, &sandboxConfig.HypervisorConfig); err != nil { @@ -635,6 +654,17 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor return nil, err } + if !coldPlugVFIO { + return s, nil + } + + for _, dev := range devs { + _, err := s.AddDevice(ctx, dev) + if err != nil { + s.Logger().WithError(err).Debug("Cannot cold-plug add device") + return nil, err + } + } return s, nil } From 131f056a124182dfd8e5f7dc8390080c29f9ebc7 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Fri, 21 Apr 2023 10:19:40 +0000 Subject: [PATCH 06/12] gpu: Extract VFIO Functions to drivers Some functions may be used in other modules then only in the VFIO module, extract them and make them available to other layers like sandbox. Signed-off-by: Zvonko Kaiser --- src/runtime/pkg/device/drivers/utils.go | 96 +++++++++++++++++++++++++ src/runtime/pkg/device/drivers/vfio.go | 23 +----- 2 files changed, 99 insertions(+), 20 deletions(-) diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go index 79e00adbd1..7c87e6a59b 100644 --- a/src/runtime/pkg/device/drivers/utils.go +++ b/src/runtime/pkg/device/drivers/utils.go @@ -10,10 +10,12 @@ import ( "fmt" "os" "path/filepath" + "strconv" "strings" "github.com/kata-containers/kata-containers/src/runtime/pkg/device/api" "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" "github.com/sirupsen/logrus" ) @@ -133,3 +135,97 @@ func GetAPVFIODevices(sysfsdev string) ([]string, error) { // Split by newlines, omitting final newline return strings.Split(string(data[:len(data)-1]), "\n"), nil } + +// Ignore specific PCI devices, supply the pciClass and the bitmask to check +// against the device class, deviceBDF for meaningfull info message +func checkIgnorePCIClass(pciClass string, deviceBDF string, bitmask uint64) (bool, error) { + if pciClass == "" { + return false, nil + } + pciClassID, err := strconv.ParseUint(pciClass, 0, 32) + if err != nil { + return false, err + } + // ClassID is 16 bits, remove the two trailing zeros + pciClassID = pciClassID >> 8 + if pciClassID&bitmask == bitmask { + deviceLogger().Infof("Ignoring PCI (Host) Bridge deviceBDF %v Class %x", deviceBDF, pciClassID) + return true, nil + } + return false, nil +} + +// GetAllVFIODevicesFromIOMMUGroup returns all the VFIO devices in the IOMMU group +// We can reuse this function at various leverls, sandbox, container. +func GetAllVFIODevicesFromIOMMUGroup(device *config.DeviceInfo) ([]*config.VFIODev, error) { + + vfioDevs := []*config.VFIODev{} + + vfioGroup := filepath.Base(device.HostPath) + iommuDevicesPath := filepath.Join(config.SysIOMMUPath, vfioGroup, "devices") + + deviceFiles, err := os.ReadDir(iommuDevicesPath) + if err != nil { + return nil, err + } + + // Pass all devices in iommu group + for i, deviceFile := range deviceFiles { + //Get bdf of device eg 0000:00:1c.0 + deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(deviceFile.Name(), iommuDevicesPath) + if err != nil { + return nil, err + } + id := utils.MakeNameID("vfio", device.ID+strconv.Itoa(i), maxDevIDSize) + + pciClass := getPCIDeviceProperty(deviceBDF, PCISysFsDevicesClass) + // We need to ignore Host or PCI Bridges that are in the same IOMMU group as the + // passed-through devices. One CANNOT pass-through a PCI bridge or Host bridge. + // Class 0x0604 is PCI bridge, 0x0600 is Host bridge + ignorePCIDevice, err := checkIgnorePCIClass(pciClass, deviceBDF, 0x0600) + if err != nil { + return nil, err + } + if ignorePCIDevice { + continue + } + + var vfio config.VFIODev + + switch vfioDeviceType { + case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType: + isPCIe := isPCIeDevice(deviceBDF) + // Do not directly assign to `vfio` -- need to access field still + vfioPCI := config.VFIOPCIDev{ + ID: id, + Type: vfioDeviceType, + BDF: deviceBDF, + SysfsDev: deviceSysfsDev, + IsPCIe: isPCIe, + Class: pciClass, + } + if isPCIe { + vfioPCI.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, len(AllPCIeDevs)) + AllPCIeDevs[deviceBDF] = true + } + vfio = vfioPCI + case config.VFIOAPDeviceMediatedType: + devices, err := GetAPVFIODevices(deviceSysfsDev) + if err != nil { + return nil, err + } + vfio = config.VFIOAPDev{ + ID: id, + SysfsDev: deviceSysfsDev, + Type: config.VFIOAPDeviceMediatedType, + APDevices: devices, + } + default: + return nil, fmt.Errorf("Failed to append device: VFIO device type unrecognized") + } + + vfioDevs = append(vfioDevs, &vfio) + } + + return vfioDevs, nil +} diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go index 1099f8f0b7..4d06240b4b 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -54,25 +54,6 @@ func NewVFIODevice(devInfo *config.DeviceInfo) *VFIODevice { } } -// Ignore specific PCI devices, supply the pciClass and the bitmask to check -// against the device class, deviceBDF for meaningfull info message -func (device *VFIODevice) checkIgnorePCIClass(pciClass string, deviceBDF string, bitmask uint64) (bool, error) { - if pciClass == "" { - return false, nil - } - pciClassID, err := strconv.ParseUint(pciClass, 0, 32) - if err != nil { - return false, err - } - // ClassID is 16 bits, remove the two trailing zeros - pciClassID = pciClassID >> 8 - if pciClassID&bitmask == bitmask { - deviceLogger().Infof("Ignoring PCI (Host) Bridge deviceBDF %v Class %x", deviceBDF, pciClassID) - return true, nil - } - return false, nil -} - // Attach is standard interface of api.Device, it's used to add device to some // DeviceReceiver func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceReceiver) (retErr error) { @@ -90,6 +71,8 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece } }() + device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(device.DeviceInfo) + vfioGroup := filepath.Base(device.DeviceInfo.HostPath) iommuDevicesPath := filepath.Join(config.SysIOMMUPath, vfioGroup, "devices") @@ -111,7 +94,7 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece // We need to ignore Host or PCI Bridges that are in the same IOMMU group as the // passed-through devices. One CANNOT pass-through a PCI bridge or Host bridge. // Class 0x0604 is PCI bridge, 0x0600 is Host bridge - ignorePCIDevice, err := device.checkIgnorePCIClass(pciClass, deviceBDF, 0x0600) + ignorePCIDevice, err := checkIgnorePCIClass(pciClass, deviceBDF, 0x0600) if err != nil { return err } From 2a830177cab9575863fbaf4ddadba25b5e07b982 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Fri, 21 Apr 2023 10:43:56 +0000 Subject: [PATCH 07/12] gpu: Add fwcfg helper function Added driver util function for easier handling of VFIO devices outside of the VFIO module. At the sandbox level we may need to set options depending if we have a VFIO/PCIe device, like the fwCfg for confiential guests. Signed-off-by: Zvonko Kaiser --- src/runtime/cmd/kata-runtime/kata-env.go | 22 ++++---- src/runtime/pkg/device/drivers/utils.go | 8 +-- src/runtime/pkg/device/drivers/vfio.go | 66 +----------------------- src/runtime/pkg/katatestutils/utils.go | 1 + src/runtime/virtcontainers/qemu.go | 44 +++++++++++++--- src/runtime/virtcontainers/sandbox.go | 2 +- 6 files changed, 56 insertions(+), 87 deletions(-) diff --git a/src/runtime/cmd/kata-runtime/kata-env.go b/src/runtime/cmd/kata-runtime/kata-env.go index c7e919d62e..f17480aba0 100644 --- a/src/runtime/cmd/kata-runtime/kata-env.go +++ b/src/runtime/cmd/kata-runtime/kata-env.go @@ -307,17 +307,17 @@ func getHypervisorInfo(config oci.RuntimeConfig) (HypervisorInfo, error) { } return HypervisorInfo{ - Debug: config.HypervisorConfig.Debug, - MachineType: config.HypervisorConfig.HypervisorMachineType, - Version: version, - Path: hypervisorPath, - BlockDeviceDriver: config.HypervisorConfig.BlockDeviceDriver, - Msize9p: config.HypervisorConfig.Msize9p, - MemorySlots: config.HypervisorConfig.MemSlots, - EntropySource: config.HypervisorConfig.EntropySource, - SharedFS: config.HypervisorConfig.SharedFS, - VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon, - + Debug: config.HypervisorConfig.Debug, + MachineType: config.HypervisorConfig.HypervisorMachineType, + Version: version, + Path: hypervisorPath, + BlockDeviceDriver: config.HypervisorConfig.BlockDeviceDriver, + Msize9p: config.HypervisorConfig.Msize9p, + MemorySlots: config.HypervisorConfig.MemSlots, + EntropySource: config.HypervisorConfig.EntropySource, + SharedFS: config.HypervisorConfig.SharedFS, + VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon, + ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO, HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus, PCIeRootPort: config.HypervisorConfig.PCIeRootPort, SocketPath: socketPath, diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go index 7c87e6a59b..bfffa31a2e 100644 --- a/src/runtime/pkg/device/drivers/utils.go +++ b/src/runtime/pkg/device/drivers/utils.go @@ -156,8 +156,10 @@ func checkIgnorePCIClass(pciClass string, deviceBDF string, bitmask uint64) (boo } // GetAllVFIODevicesFromIOMMUGroup returns all the VFIO devices in the IOMMU group -// We can reuse this function at various leverls, sandbox, container. -func GetAllVFIODevicesFromIOMMUGroup(device *config.DeviceInfo) ([]*config.VFIODev, error) { +// We can reuse this function at various levels, sandbox, container. +// Only the VFIO module is allowed to do bus assignments, all other modules need to +// ignore it if used as helper function to get VFIO information. +func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignment bool) ([]*config.VFIODev, error) { vfioDevs := []*config.VFIODev{} @@ -204,7 +206,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device *config.DeviceInfo) ([]*config.VFIOD IsPCIe: isPCIe, Class: pciClass, } - if isPCIe { + if isPCIe && !ignoreBusAssignment { vfioPCI.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, len(AllPCIeDevs)) AllPCIeDevs[deviceBDF] = true } diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go index 4d06240b4b..106220dcff 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -11,7 +11,6 @@ import ( "fmt" "os" "path/filepath" - "strconv" "strings" "github.com/sirupsen/logrus" @@ -71,74 +70,11 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece } }() - device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(device.DeviceInfo) - - vfioGroup := filepath.Base(device.DeviceInfo.HostPath) - iommuDevicesPath := filepath.Join(config.SysIOMMUPath, vfioGroup, "devices") - - deviceFiles, err := os.ReadDir(iommuDevicesPath) + device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo, false) if err != nil { return err } - // Pass all devices in iommu group - for i, deviceFile := range deviceFiles { - //Get bdf of device eg 0000:00:1c.0 - deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(deviceFile.Name(), iommuDevicesPath) - if err != nil { - return err - } - id := utils.MakeNameID("vfio", device.DeviceInfo.ID+strconv.Itoa(i), maxDevIDSize) - - pciClass := getPCIDeviceProperty(deviceBDF, PCISysFsDevicesClass) - // We need to ignore Host or PCI Bridges that are in the same IOMMU group as the - // passed-through devices. One CANNOT pass-through a PCI bridge or Host bridge. - // Class 0x0604 is PCI bridge, 0x0600 is Host bridge - ignorePCIDevice, err := checkIgnorePCIClass(pciClass, deviceBDF, 0x0600) - if err != nil { - return err - } - if ignorePCIDevice { - continue - } - - var vfio config.VFIODev - - switch vfioDeviceType { - case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType: - isPCIe := isPCIeDevice(deviceBDF) - // Do not directly assign to `vfio` -- need to access field still - vfioPCI := config.VFIOPCIDev{ - ID: id, - Type: vfioDeviceType, - BDF: deviceBDF, - SysfsDev: deviceSysfsDev, - IsPCIe: isPCIe, - Class: pciClass, - } - if isPCIe { - vfioPCI.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, len(AllPCIeDevs)) - AllPCIeDevs[deviceBDF] = true - } - vfio = vfioPCI - case config.VFIOAPDeviceMediatedType: - devices, err := GetAPVFIODevices(deviceSysfsDev) - if err != nil { - return err - } - vfio = config.VFIOAPDev{ - ID: id, - SysfsDev: deviceSysfsDev, - Type: config.VFIOAPDeviceMediatedType, - APDevices: devices, - } - default: - return fmt.Errorf("Failed to append device: VFIO device type unrecognized") - } - - device.VfioDevs = append(device.VfioDevs, &vfio) - } - coldPlug := device.DeviceInfo.ColdPlug deviceLogger().WithField("cold-plug", coldPlug).Info("Attaching VFIO device") diff --git a/src/runtime/pkg/katatestutils/utils.go b/src/runtime/pkg/katatestutils/utils.go index b973063e89..bec0ed70d4 100644 --- a/src/runtime/pkg/katatestutils/utils.go +++ b/src/runtime/pkg/katatestutils/utils.go @@ -319,6 +319,7 @@ func MakeRuntimeConfigFileData(config RuntimeConfigOptions) string { enable_iothreads = ` + strconv.FormatBool(config.EnableIOThreads) + ` hotplug_vfio_on_root_bus = ` + strconv.FormatBool(config.HotplugVFIOOnRootBus) + ` pcie_root_port = ` + strconv.FormatUint(uint64(config.PCIeRootPort), 10) + ` + cold_plug_vfio = "` + config.ColdPlugVFIO + `" msize_9p = ` + strconv.FormatUint(uint64(config.DefaultMsize9p), 10) + ` enable_debug = ` + strconv.FormatBool(config.HypervisorDebug) + ` guest_hook_path = "` + config.DefaultGuestHookPath + `" diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index d65c93553a..064aa72c76 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -718,20 +718,50 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi // At the sandbox level we alreaady checked that we have a // VFIO device, pass-through of a PCIe device needs allocated // mmemory in the firmware otherwise BARs cannot be mapped - if len(hypervisorConfig.VFIODevices) > 0 { - fwCfg := govmmQemu.FwCfg{ - Name: "opt/ovmf/X-PciMmio64Mb", - Str: "262144", - } - qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg) + // First check if we have a PCIe devices, otherwise ignore + err, fwCfg := q.appendFwCfgForConfidentialGuest(hypervisorConfig.VFIODevices) + if err != nil { + return err + } + if fwCfg != nil { + qemuConfig.FwCfg = append(qemuConfig.FwCfg, *fwCfg) } } - q.qemuConfig = qemuConfig return err } +// appendFwCfgForConfidentialGuest appends the firmware configuration for a +// VFIO and PCIe device, otherwise it will be ignored. +func (q *qemu) appendFwCfgForConfidentialGuest(vfioDevices []config.DeviceInfo) (error, *govmmQemu.FwCfg) { + var err error + for _, dev := range vfioDevices { + dev.HostPath, err = config.GetHostPath(dev, false, "") + if err != nil { + return err, nil + } + vfioDevs, err := drivers.GetAllVFIODevicesFromIOMMUGroup(dev, true) + if err != nil { + return err, nil + } + fwCfg := govmmQemu.FwCfg{} + for _, vfioDev := range vfioDevs { + switch (*vfioDev).GetType() { + case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType: + if (*vfioDev).(config.VFIOPCIDev).IsPCIe { + fwCfg = govmmQemu.FwCfg{ + Name: "opt/ovmf/X-PciMmio64Mb", + Str: "262144", + } + return nil, &fwCfg + } + } + } + } + return nil, nil +} + func (q *qemu) checkBpfEnabled() { if q.config.SeccompSandbox != "" { out, err := os.ReadFile("/proc/sys/net/core/bpf_jit_enable") diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 35b3193944..ff3f7fe74f 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -623,7 +623,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor // If we have a confidential guest we need to cold-plug the PCIe VFIO devices // until we have TDISP/IDE PCIe support. - coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != hv.NoPort) + coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO == hv.RootPort) var devs []config.DeviceInfo for cnt, containers := range sandboxConfig.Containers { for dev, device := range containers.DeviceInfos { From dded731db3447712ab29b31a75f8c0c5a6f81a93 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Tue, 25 Apr 2023 09:53:08 +0000 Subject: [PATCH 08/12] gpu: Add OVMF setting for MMIO aperture The default size of OVMFs aperture is too low to initialized PCIe devices with huge BARs Signed-off-by: Zvonko Kaiser --- src/runtime/virtcontainers/hypervisor.go | 4 -- src/runtime/virtcontainers/qemu.go | 53 +++++------------------- src/runtime/virtcontainers/sandbox.go | 5 --- 3 files changed, 10 insertions(+), 52 deletions(-) diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index f773e91d5b..0a490ef577 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -509,10 +509,6 @@ type HypervisorConfig struct { // The PCIe Root Port device is used to hot-plug the PCIe device PCIeRootPort uint32 - // VFIODevics are used to get PCIe device info early before the sandbox - // is started to make better PCIe topology decisions - VFIODevices []config.DeviceInfo - // ColdPlugVFIO is used to indicate if devices need to be coldplugged on the // root port, switch or no port ColdPlugVFIO hv.PCIePort diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 064aa72c76..49b0d6abde 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -712,56 +712,23 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi q.virtiofsDaemon, err = q.createVirtiofsDaemon(hypervisorConfig.SharedPath) - // If we have a VFIO device we need to update the firmware configuration - // if executed in a trusted execution environment. - if hypervisorConfig.ConfidentialGuest { - // At the sandbox level we alreaady checked that we have a - // VFIO device, pass-through of a PCIe device needs allocated - // mmemory in the firmware otherwise BARs cannot be mapped - // First check if we have a PCIe devices, otherwise ignore - err, fwCfg := q.appendFwCfgForConfidentialGuest(hypervisorConfig.VFIODevices) - if err != nil { - return err - } - if fwCfg != nil { - qemuConfig.FwCfg = append(qemuConfig.FwCfg, *fwCfg) + // The default OVMF MMIO aperture is too small for some PCIe devices + // with huge BARs so we need to increase it. + // memSize64bit is in bytes, convert to MB, OVMF expects MB as a string + if strings.Contains(strings.ToLower(hypervisorConfig.FirmwarePath), "ovmf") { + pciMmio64Mb := fmt.Sprintf("%d", (memSize64bit / 1024 / 1024)) + fwCfg := govmmQemu.FwCfg{ + Name: "opt/ovmf/X-PciMmio64Mb", + Str: pciMmio64Mb, } + qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg) } + q.qemuConfig = qemuConfig return err } -// appendFwCfgForConfidentialGuest appends the firmware configuration for a -// VFIO and PCIe device, otherwise it will be ignored. -func (q *qemu) appendFwCfgForConfidentialGuest(vfioDevices []config.DeviceInfo) (error, *govmmQemu.FwCfg) { - var err error - for _, dev := range vfioDevices { - dev.HostPath, err = config.GetHostPath(dev, false, "") - if err != nil { - return err, nil - } - vfioDevs, err := drivers.GetAllVFIODevicesFromIOMMUGroup(dev, true) - if err != nil { - return err, nil - } - fwCfg := govmmQemu.FwCfg{} - for _, vfioDev := range vfioDevs { - switch (*vfioDev).GetType() { - case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType: - if (*vfioDev).(config.VFIOPCIDev).IsPCIe { - fwCfg = govmmQemu.FwCfg{ - Name: "opt/ovmf/X-PciMmio64Mb", - Str: "262144", - } - return nil, &fwCfg - } - } - } - } - return nil, nil -} - func (q *qemu) checkBpfEnabled() { if q.config.SeccompSandbox != "" { out, err := os.ReadFile("/proc/sys/net/core/bpf_jit_enable") diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index ff3f7fe74f..644904f75d 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -639,11 +639,6 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor } } } - // If we have a confidential guest, we need to add a specific - // firmware configuration to the hypervisor. We cannot do it here at - // the sandbox level we need to do that at the hypervisor level, capturing - // the devices here and processing in CreateVM(). - sandboxConfig.HypervisorConfig.VFIODevices = devs // store doesn't require hypervisor to be stored immediately if err = s.hypervisor.CreateVM(ctx, s.id, s.network, &sandboxConfig.HypervisorConfig); err != nil { From 0fec2e6986f1e532f8e31f32b0ca90f8cef95322 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Thu, 27 Apr 2023 09:20:30 +0000 Subject: [PATCH 09/12] gpu: Add cold-plug test Cold plug setting is now correctly decoded in toml Signed-off-by: Zvonko Kaiser --- src/runtime/cmd/kata-runtime/kata-env_test.go | 18 ++++++++++++------ .../pkg/hypervisors/hypervisor_state.go | 14 ++++++++++++++ src/runtime/pkg/katatestutils/utils.go | 2 +- .../virtcontainers/persist/api/config.go | 2 +- src/runtime/virtcontainers/qemu.go | 3 +-- 5 files changed, 29 insertions(+), 10 deletions(-) diff --git a/src/runtime/cmd/kata-runtime/kata-env_test.go b/src/runtime/cmd/kata-runtime/kata-env_test.go index 321bc507b6..3760104d0c 100644 --- a/src/runtime/cmd/kata-runtime/kata-env_test.go +++ b/src/runtime/cmd/kata-runtime/kata-env_test.go @@ -19,6 +19,7 @@ import ( "testing" "github.com/BurntSushi/toml" + hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" vcUtils "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" specs "github.com/opencontainers/runtime-spec/specs-go" @@ -74,6 +75,7 @@ func createConfig(configPath string, fileData string) error { } func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeConfig, err error) { + var coldPlugVFIO hv.PCIePort const logPath = "/log/path" hypervisorPath := filepath.Join(prefixDir, "hypervisor") kernelPath := filepath.Join(prefixDir, "kernel") @@ -86,6 +88,7 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC enableIOThreads := true hotplugVFIOOnRootBus := true pcieRootPort := uint32(2) + coldPlugVFIO = hv.NoPort disableNewNetNs := false sharedFS := "virtio-9p" virtioFSdaemon := filepath.Join(prefixDir, "virtiofsd") @@ -129,6 +132,7 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC BlockDeviceDriver: blockStorageDriver, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, + ColdPlugVFIO: coldPlugVFIO, PCIeRootPort: pcieRootPort, DisableNewNetNs: disableNewNetNs, DefaultVCPUCount: hypConfig.NumVCPUs, @@ -191,12 +195,13 @@ func genericGetExpectedHostDetails(tmpdir string, expectedVendor string, expecte expectedSupportVSocks, _ := vcUtils.SupportsVsocks() expectedHostDetails := HostInfo{ - Kernel: expectedKernelVersion, - Architecture: expectedArch, - Distro: expectedDistro, - CPU: expectedCPU, - VMContainerCapable: expectedVMContainerCapable, - SupportVSocks: expectedSupportVSocks, + AvailableGuestProtections: vc.AvailableGuestProtections(), + Kernel: expectedKernelVersion, + Architecture: expectedArch, + Distro: expectedDistro, + CPU: expectedCPU, + VMContainerCapable: expectedVMContainerCapable, + SupportVSocks: expectedSupportVSocks, } testProcCPUInfo := filepath.Join(tmpdir, "cpuinfo") @@ -273,6 +278,7 @@ func getExpectedHypervisor(config oci.RuntimeConfig) HypervisorInfo { HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus, PCIeRootPort: config.HypervisorConfig.PCIeRootPort, + ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO, } if os.Geteuid() == 0 { diff --git a/src/runtime/pkg/hypervisors/hypervisor_state.go b/src/runtime/pkg/hypervisors/hypervisor_state.go index 6a9fd7af20..c241dd6756 100644 --- a/src/runtime/pkg/hypervisors/hypervisor_state.go +++ b/src/runtime/pkg/hypervisors/hypervisor_state.go @@ -5,6 +5,8 @@ package hypervisors +import "fmt" + // Bridge is a bridge where devices can be hot plugged type Bridge struct { // DeviceAddr contains information about devices plugged and its address in the bridge @@ -40,6 +42,18 @@ const ( NoPort = "no-port" ) +func (p PCIePort) String() string { + switch p { + case RootPort: + return "root-port" + case SwitchPort: + return "switch-port" + case NoPort: + return "no-port" + } + return fmt.Sprintf("unknown PCIePort: %s", string(p)) +} + type HypervisorState struct { BlockIndexMap map[int]struct{} diff --git a/src/runtime/pkg/katatestutils/utils.go b/src/runtime/pkg/katatestutils/utils.go index bec0ed70d4..4c8257a40f 100644 --- a/src/runtime/pkg/katatestutils/utils.go +++ b/src/runtime/pkg/katatestutils/utils.go @@ -319,7 +319,7 @@ func MakeRuntimeConfigFileData(config RuntimeConfigOptions) string { enable_iothreads = ` + strconv.FormatBool(config.EnableIOThreads) + ` hotplug_vfio_on_root_bus = ` + strconv.FormatBool(config.HotplugVFIOOnRootBus) + ` pcie_root_port = ` + strconv.FormatUint(uint64(config.PCIeRootPort), 10) + ` - cold_plug_vfio = "` + config.ColdPlugVFIO + `" + cold_plug_vfio = "` + config.ColdPlugVFIO.String() + `" msize_9p = ` + strconv.FormatUint(uint64(config.DefaultMsize9p), 10) + ` enable_debug = ` + strconv.FormatBool(config.HypervisorDebug) + ` guest_hook_path = "` + config.DefaultGuestHookPath + `" diff --git a/src/runtime/virtcontainers/persist/api/config.go b/src/runtime/virtcontainers/persist/api/config.go index 71533d6519..e4facc6b98 100644 --- a/src/runtime/virtcontainers/persist/api/config.go +++ b/src/runtime/virtcontainers/persist/api/config.go @@ -199,7 +199,7 @@ type HypervisorConfig struct { // root bus instead of a bridge. HotplugVFIOOnRootBus bool - // ColdPlugVFIO is used to indicate if devices need to be coldlugged on the + // ColdPlugVFIO is used to indicate if devices need to be coldplugged on the // root port or a switch or no-port ColdPlugVFIO hv.PCIePort diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 49b0d6abde..3a8c55cd38 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -710,8 +710,6 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, hypervisorConfig.PCIeRootPort, memSize32bit, memSize64bit) } - q.virtiofsDaemon, err = q.createVirtiofsDaemon(hypervisorConfig.SharedPath) - // The default OVMF MMIO aperture is too small for some PCIe devices // with huge BARs so we need to increase it. // memSize64bit is in bytes, convert to MB, OVMF expects MB as a string @@ -726,6 +724,7 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi q.qemuConfig = qemuConfig + q.virtiofsDaemon, err = q.createVirtiofsDaemon(hypervisorConfig.SharedPath) return err } From f7ad75cb12cfd3491b7dd8518607396235fa22c1 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Thu, 27 Apr 2023 09:35:05 +0000 Subject: [PATCH 10/12] gpu: Cold-plug extend the api.md Make the hypervisorconfig consistent in code and api.md Signed-off-by: Zvonko Kaiser --- src/runtime/virtcontainers/documentation/api/1.0/api.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/runtime/virtcontainers/documentation/api/1.0/api.md b/src/runtime/virtcontainers/documentation/api/1.0/api.md index 9255a96ca9..d3071a86f2 100644 --- a/src/runtime/virtcontainers/documentation/api/1.0/api.md +++ b/src/runtime/virtcontainers/documentation/api/1.0/api.md @@ -292,6 +292,10 @@ type HypervisorConfig struct { // The PCIe Root Port device is used to hot-plug the PCIe device PCIeRootPort uint32 + // ColdPlugVFIO is used to indicate if devices need to be coldplugged on the + // root port, switch or no port + ColdPlugVFIO hv.PCIePort + // BootToBeTemplate used to indicate if the VM is created to be a template VM BootToBeTemplate bool From 138ada049c7a4d70b6e8a02f90ed489b73d40491 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Thu, 27 Apr 2023 09:59:47 +0000 Subject: [PATCH 11/12] gpu: Cold Plug VFIO toml setting Added the cold_plug_vfio setting to the qemu-toml.in with some epxlanation Signed-off-by: Zvonko Kaiser --- src/runtime/config/configuration-qemu.toml.in | 5 +++++ src/runtime/pkg/katautils/config_test.go | 11 +++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in index 6446b0d0d7..f58f871418 100644 --- a/src/runtime/config/configuration-qemu.toml.in +++ b/src/runtime/config/configuration-qemu.toml.in @@ -352,6 +352,11 @@ pflashes = [] # Default false #hotplug_vfio_on_root_bus = true +# In a confidential compute environment hot-plugging can compromise +# security. Enable cold-plugging of VFIO devices to a root-port. +# The default setting is "no-port", which means disabled. +#cold_plug_vfio = "root-port" + # Before hot plugging a PCIe device, you need to add a pcie_root_port device. # Use this parameter when using some large PCI bar devices, such as Nvidia GPU # The value means the number of pcie_root_port diff --git a/src/runtime/pkg/katautils/config_test.go b/src/runtime/pkg/katautils/config_test.go index 80268f911e..171f011b8e 100644 --- a/src/runtime/pkg/katautils/config_test.go +++ b/src/runtime/pkg/katautils/config_test.go @@ -19,6 +19,7 @@ import ( "testing" "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm" + hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" ktu "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils" "github.com/kata-containers/kata-containers/src/runtime/pkg/oci" vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" @@ -70,7 +71,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf if hypervisor == "" { return config, fmt.Errorf("BUG: need hypervisor") } - + var coldPlugVFIO hv.PCIePort hypervisorPath := path.Join(dir, "hypervisor") kernelPath := path.Join(dir, "kernel") kernelParams := "foo=bar xyz" @@ -85,6 +86,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf enableIOThreads := true hotplugVFIOOnRootBus := true pcieRootPort := uint32(2) + coldPlugVFIO = hv.RootPort disableNewNetNs := false sharedFS := "virtio-9p" virtioFSdaemon := path.Join(dir, "virtiofsd") @@ -107,6 +109,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, PCIeRootPort: pcieRootPort, + ColdPlugVFIO: coldPlugVFIO, DisableNewNetNs: disableNewNetNs, DefaultVCPUCount: defaultVCPUCount, DefaultMaxVCPUCount: defaultMaxVCPUCount, @@ -170,6 +173,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, PCIeRootPort: pcieRootPort, + ColdPlugVFIO: coldPlugVFIO, Msize9p: defaultMsize9p, MemSlots: defaultMemSlots, EntropySource: defaultEntropySource, @@ -564,6 +568,7 @@ func TestMinimalRuntimeConfig(t *testing.T) { VirtioFSCache: defaultVirtioFSCacheMode, BlockDeviceAIO: defaultBlockDeviceAIO, DisableGuestSeLinux: defaultDisableGuestSeLinux, + ColdPlugVFIO: defaultColdPlugVFIO, } expectedAgentConfig := vc.KataAgentConfig{ @@ -597,7 +602,7 @@ func TestMinimalRuntimeConfig(t *testing.T) { func TestNewQemuHypervisorConfig(t *testing.T) { dir := t.TempDir() - + var coldPlugVFIO hv.PCIePort hypervisorPath := path.Join(dir, "hypervisor") kernelPath := path.Join(dir, "kernel") imagePath := path.Join(dir, "image") @@ -606,6 +611,7 @@ func TestNewQemuHypervisorConfig(t *testing.T) { enableIOThreads := true hotplugVFIOOnRootBus := true pcieRootPort := uint32(2) + coldPlugVFIO = hv.RootPort orgVHostVSockDevicePath := utils.VHostVSockDevicePath blockDeviceAIO := "io_uring" defer func() { @@ -625,6 +631,7 @@ func TestNewQemuHypervisorConfig(t *testing.T) { EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, PCIeRootPort: pcieRootPort, + ColdPlugVFIO: coldPlugVFIO, RxRateLimiterMaxRate: rxRateLimiterMaxRate, TxRateLimiterMaxRate: txRateLimiterMaxRate, SharedFS: "virtio-fs", From 13d7f39c71e45ae18d9c12cdf5dfdc15fcbbb349 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Tue, 2 May 2023 08:12:18 +0000 Subject: [PATCH 12/12] gpu: Check for VFIO port assignments Bailing out early if the port is wrong, allowed port settings are no-port, root-port, switch-port Signed-off-by: Zvonko Kaiser --- .../pkg/containerd-shim-v2/create_test.go | 4 +++ .../pkg/hypervisors/hypervisor_state.go | 4 ++- .../pkg/katautils/config-settings.go.in | 6 +++- src/runtime/pkg/katautils/config.go | 34 +++++++++++++++++-- src/runtime/virtcontainers/sandbox.go | 2 +- 5 files changed, 45 insertions(+), 5 deletions(-) diff --git a/src/runtime/pkg/containerd-shim-v2/create_test.go b/src/runtime/pkg/containerd-shim-v2/create_test.go index ccad5ceea1..e3e8e9369e 100644 --- a/src/runtime/pkg/containerd-shim-v2/create_test.go +++ b/src/runtime/pkg/containerd-shim-v2/create_test.go @@ -20,6 +20,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/stretchr/testify/assert" + hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" ktu "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils" vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" vcAnnotations "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations" @@ -308,6 +309,7 @@ func TestCreateContainerConfigFail(t *testing.T) { } func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err error) { + var coldPlugVFIO hv.PCIePort if dir == "" { return "", fmt.Errorf("BUG: need directory") } @@ -332,6 +334,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err disableNewNetNs := false sharedFS := "virtio-9p" virtioFSdaemon := path.Join(dir, "virtiofsd") + coldPlugVFIO = hv.RootPort configFileOptions := ktu.RuntimeConfigOptions{ Hypervisor: "qemu", @@ -350,6 +353,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err DisableNewNetNs: disableNewNetNs, SharedFS: sharedFS, VirtioFSDaemon: virtioFSdaemon, + ColdPlugVFIO: coldPlugVFIO, } runtimeConfigFileData := ktu.MakeRuntimeConfigFileData(configFileOptions) diff --git a/src/runtime/pkg/hypervisors/hypervisor_state.go b/src/runtime/pkg/hypervisors/hypervisor_state.go index c241dd6756..482b7e9e20 100644 --- a/src/runtime/pkg/hypervisors/hypervisor_state.go +++ b/src/runtime/pkg/hypervisors/hypervisor_state.go @@ -48,10 +48,12 @@ func (p PCIePort) String() string { return "root-port" case SwitchPort: return "switch-port" + case BridgePort: + return "bridge-port" case NoPort: return "no-port" } - return fmt.Sprintf("unknown PCIePort: %s", string(p)) + return fmt.Sprintf("", string(p)) } type HypervisorState struct { diff --git a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in index 14a2b0b585..139d548264 100644 --- a/src/runtime/pkg/katautils/config-settings.go.in +++ b/src/runtime/pkg/katautils/config-settings.go.in @@ -9,6 +9,10 @@ package katautils +import ( + hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" +) + // name is the name of the runtime var NAME = "@RUNTIME_NAME@" @@ -104,4 +108,4 @@ const defaultVMCacheEndpoint string = "/var/run/kata-containers/cache.sock" // Default config file used by stateless systems. var defaultRuntimeConfiguration = "@CONFIG_PATH@" -const defaultColdPlugVFIO = "no-port" +const defaultColdPlugVFIO = hv.NoPort diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 06b217a9a0..6b08f4afe1 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -287,6 +287,13 @@ func (h hypervisor) firmware() (string, error) { return ResolvePath(p) } +func (h hypervisor) coldPlugVFIO() hv.PCIePort { + if h.ColdPlugVFIO == "" { + return defaultColdPlugVFIO + } + return h.ColdPlugVFIO +} + func (h hypervisor) firmwareVolume() (string, error) { p := h.FirmwareVolume @@ -856,7 +863,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { Msize9p: h.msize9p(), DisableImageNvdimm: h.DisableImageNvdimm, HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, - ColdPlugVFIO: h.ColdPlugVFIO, + ColdPlugVFIO: h.coldPlugVFIO(), PCIeRootPort: h.PCIeRootPort, DisableVhostNet: h.DisableVhostNet, EnableVhostUserStore: h.EnableVhostUserStore, @@ -1051,7 +1058,7 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { EnableIOThreads: h.EnableIOThreads, Msize9p: h.msize9p(), HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, - ColdPlugVFIO: h.ColdPlugVFIO, + ColdPlugVFIO: h.coldPlugVFIO(), PCIeRootPort: h.PCIeRootPort, DisableVhostNet: true, GuestHookPath: h.guestHookPath(), @@ -1655,9 +1662,32 @@ func checkConfig(config oci.RuntimeConfig) error { return err } + coldPlugVFIO := config.HypervisorConfig.ColdPlugVFIO + machineType := config.HypervisorConfig.HypervisorMachineType + if err := checkPCIeConfig(coldPlugVFIO, machineType); err != nil { + return err + } + return nil } +// checkPCIeConfig ensures the PCIe configuration is valid. +// Only allow one of the following settings for cold-plug: +// no-port, root-port, switch-port +func checkPCIeConfig(vfioPort hv.PCIePort, machineType string) error { + // Currently only QEMU q35 supports advanced PCIe topologies + // firecracker, dragonball do not have right now any PCIe support + if machineType != "q35" { + return nil + } + if vfioPort == hv.NoPort || vfioPort == hv.RootPort || vfioPort == hv.SwitchPort { + return nil + } + + return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s", + vfioPort, hv.NoPort, hv.RootPort, hv.SwitchPort) +} + // checkNetNsConfig performs sanity checks on disable_new_netns config. // Because it is an expert option and conflicts with some other common configs. func checkNetNsConfig(config oci.RuntimeConfig) error { diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 644904f75d..0eb866bb03 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -623,7 +623,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor // If we have a confidential guest we need to cold-plug the PCIe VFIO devices // until we have TDISP/IDE PCIe support. - coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO == hv.RootPort) + coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != hv.NoPort) var devs []config.DeviceInfo for cnt, containers := range sandboxConfig.Containers { for dev, device := range containers.DeviceInfos {