From 8bbcb06af52294897a4d79099b74bcbaa1c9447c Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 5 Aug 2021 14:39:04 +1000 Subject: [PATCH] qemu: Disable SHPC hotplug Under certain circumstances[0] Kata will attempt to use SHPC hotplug for PCI devices on the guest. In fact we explicitly enable SHPC on our PCI to PCI bridges, regardless of the qemu default. SHPC was designed a long, long time ago for physical hotplugging and works very poorly for a virtual environment. In particular it has a mandatory 5s delay to allow a (real, human) operator to back out the operation if they press a button by mistake. This alone makes it unusable for a fast start up application like Kata. Worse, the agent forces a PCI rescan during startup. That will race with the SHPC hotplug operation causing the device to go into a bad state where config space can't be accessed from the guest at all. The only reason we've sort of gotten away with this is that our default guest kernel configuration triggers what's arguably a kernel bug effectively disabling SHPC. That makes the agent rescan the only reason we see the new device. Now that we require a qemu >=6.1, which includes ACPI PCI hotplug on the q35 machine, we can explicitly disable SHPC in all cases. It's nothing but trouble. fixes #2174 Signed-off-by: David Gibson --- .../Nvidia-GPU-passthrough-and-Kata.md | 3 +-- src/runtime/virtcontainers/qemu.go | 17 ++++++++++++++++- src/runtime/virtcontainers/qemu_amd64_test.go | 15 +++++++++------ .../virtcontainers/qemu_arch_base_test.go | 15 +++++++++------ src/runtime/virtcontainers/qemu_arm64_test.go | 15 +++++++++------ 5 files changed, 44 insertions(+), 21 deletions(-) diff --git a/docs/use-cases/Nvidia-GPU-passthrough-and-Kata.md b/docs/use-cases/Nvidia-GPU-passthrough-and-Kata.md index 1a3253fdc8..58b36c02d7 100644 --- a/docs/use-cases/Nvidia-GPU-passthrough-and-Kata.md +++ b/docs/use-cases/Nvidia-GPU-passthrough-and-Kata.md @@ -67,7 +67,7 @@ To use large BARs devices (for example, Nvidia Tesla P100), you need Kata versio The following configuration in the Kata `configuration.toml` file as shown below can work: -Hotplug for PCI devices by `shpchp` (Linux's SHPC PCI Hotplug driver): +Hotplug for PCI devices by `acpi_pcihp` (Linux's ACPI PCI Hotplug driver): ``` machine_type = "q35" @@ -91,7 +91,6 @@ The following kernel config options need to be enabled: ``` # Support PCI/PCIe device hotplug (Required for large BARs device) CONFIG_HOTPLUG_PCI_PCIE=y -CONFIG_HOTPLUG_PCI_SHPC=y # Support for loading modules (Required for load Nvidia drivers) CONFIG_MODULES=y diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index c2cee7a2b7..48252a0b5f 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -2126,8 +2126,23 @@ func genericAppendBridges(devices []govmmQemu.Device, bridges []types.Bridge, ma ID: b.ID, // Each bridge is required to be assigned a unique chassis id > 0 Chassis: idx + 1, - SHPC: true, + SHPC: false, Addr: strconv.FormatInt(int64(bridges[idx].Addr), 10), + // Certain guest BIOS versions think + // !SHPC means no hotplug, and won't + // reserve the IO and memory windows + // that will be needed for devices + // added underneath this bridge. This + // will only break for certain + // combinations of exact qemu, BIOS + // and guest kernel versions, but for + // consistency, just hint the usual + // default windows for a bridge (as + // the BIOS would use with SHPC) so + // that we can do ACPI hotplug. + IOReserve: "4k", + MemReserve: "1m", + Pref64Reserve: "1m", }, ) } diff --git a/src/runtime/virtcontainers/qemu_amd64_test.go b/src/runtime/virtcontainers/qemu_amd64_test.go index 106abf31c5..4b8d044819 100644 --- a/src/runtime/virtcontainers/qemu_amd64_test.go +++ b/src/runtime/virtcontainers/qemu_amd64_test.go @@ -187,12 +187,15 @@ func TestQemuAmd64AppendBridges(t *testing.T) { expectedOut := []govmmQemu.Device{ govmmQemu.BridgeDevice{ - Type: govmmQemu.PCIBridge, - Bus: defaultBridgeBus, - ID: bridges[0].ID, - Chassis: 1, - SHPC: true, - Addr: "2", + Type: govmmQemu.PCIBridge, + Bus: defaultBridgeBus, + ID: bridges[0].ID, + Chassis: 1, + SHPC: false, + Addr: "2", + IOReserve: "4k", + MemReserve: "1m", + Pref64Reserve: "1m", }, } diff --git a/src/runtime/virtcontainers/qemu_arch_base_test.go b/src/runtime/virtcontainers/qemu_arch_base_test.go index 1cd904864b..4b495f73d9 100644 --- a/src/runtime/virtcontainers/qemu_arch_base_test.go +++ b/src/runtime/virtcontainers/qemu_arch_base_test.go @@ -307,12 +307,15 @@ func TestQemuArchBaseAppendBridges(t *testing.T) { expectedOut := []govmmQemu.Device{ govmmQemu.BridgeDevice{ - Type: govmmQemu.PCIBridge, - Bus: defaultBridgeBus, - ID: bridges[0].ID, - Chassis: 1, - SHPC: true, - Addr: "2", + Type: govmmQemu.PCIBridge, + Bus: defaultBridgeBus, + ID: bridges[0].ID, + Chassis: 1, + SHPC: false, + Addr: "2", + IOReserve: "4k", + MemReserve: "1m", + Pref64Reserve: "1m", }, } diff --git a/src/runtime/virtcontainers/qemu_arm64_test.go b/src/runtime/virtcontainers/qemu_arm64_test.go index e421aa79b4..0d777af41b 100644 --- a/src/runtime/virtcontainers/qemu_arm64_test.go +++ b/src/runtime/virtcontainers/qemu_arm64_test.go @@ -78,12 +78,15 @@ func TestQemuArm64AppendBridges(t *testing.T) { expectedOut := []govmmQemu.Device{ govmmQemu.BridgeDevice{ - Type: govmmQemu.PCIBridge, - Bus: defaultBridgeBus, - ID: bridges[0].ID, - Chassis: 1, - SHPC: true, - Addr: "2", + Type: govmmQemu.PCIBridge, + Bus: defaultBridgeBus, + ID: bridges[0].ID, + Chassis: 1, + SHPC: false, + Addr: "2", + IOReserve: "4k", + MemReserve: "1m", + Pref64Reserve: "1m", }, }