From e248de46168b0763d1805e9d29b40d18197e54f7 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 15 Sep 2021 11:19:50 +1000 Subject: [PATCH 1/3] vendor: Update govmm Update to commit 1b60b536f3c7, in particular to get extensions to allow IO and memory window reservations to be set on PCI bridges. https://github.com/kata-containers/govmm/pull/201 Git log: de039da govmm/qemu: Let IO/memory reservations be specified for bridge devices Signed-off-by: David Gibson --- src/runtime/go.mod | 2 +- src/runtime/go.sum | 4 ++-- .../kata-containers/govmm/qemu/qemu.go | 19 +++++++++++++++++++ src/runtime/vendor/modules.txt | 2 +- 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/runtime/go.mod b/src/runtime/go.mod index d74e690245..3d3594149a 100644 --- a/src/runtime/go.mod +++ b/src/runtime/go.mod @@ -24,7 +24,7 @@ require ( github.com/gogo/protobuf v1.3.2 github.com/hashicorp/go-multierror v1.0.0 github.com/intel-go/cpuid v0.0.0-20210602155658-5747e5cec0d9 - github.com/kata-containers/govmm v0.0.0-20210831124834-2f8e417bb2c4 + github.com/kata-containers/govmm v0.0.0-20210909155007-1b60b536f3c7 github.com/mdlayher/vsock v0.0.0-20191108225356-d9c65923cb8f github.com/opencontainers/runc v1.0.1 github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 diff --git a/src/runtime/go.sum b/src/runtime/go.sum index 54d0dc1582..a2647dd72b 100644 --- a/src/runtime/go.sum +++ b/src/runtime/go.sum @@ -357,8 +357,8 @@ github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/X github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= -github.com/kata-containers/govmm v0.0.0-20210831124834-2f8e417bb2c4 h1:F+/U5Vfep00pjh5oZFrVyDnVoQg8Wu7ZtXGGkaN5Glg= -github.com/kata-containers/govmm v0.0.0-20210831124834-2f8e417bb2c4/go.mod h1:A6QaNB6N6PRQ9mTRpFtUxiF5T5CJpzLALjxBrUQPlFI= +github.com/kata-containers/govmm v0.0.0-20210909155007-1b60b536f3c7 h1:lrtaReMyoviyn/Gtd9iAmQ9qNSTaS3QC1NgQ+h5fliI= +github.com/kata-containers/govmm v0.0.0-20210909155007-1b60b536f3c7/go.mod h1:A6QaNB6N6PRQ9mTRpFtUxiF5T5CJpzLALjxBrUQPlFI= github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q= github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= diff --git a/src/runtime/vendor/github.com/kata-containers/govmm/qemu/qemu.go b/src/runtime/vendor/github.com/kata-containers/govmm/qemu/qemu.go index ee814a9c6b..e57a4b26a9 100644 --- a/src/runtime/vendor/github.com/kata-containers/govmm/qemu/qemu.go +++ b/src/runtime/vendor/github.com/kata-containers/govmm/qemu/qemu.go @@ -1802,6 +1802,15 @@ type BridgeDevice struct { // ROMFile specifies the ROM file being used for this device. ROMFile string + + // Address range reservations for devices behind the bridge + // NB: strings seem an odd choice, but if they were integers, + // they'd default to 0 by Go's rules in all the existing users + // who don't set them. 0 is a valid value for certain cases, + // but not you want by default. + IOReserve string + MemReserve string + Pref64Reserve string } // Valid returns true if the BridgeDevice structure is valid and complete. @@ -1852,6 +1861,16 @@ func (bridgeDev BridgeDevice) QemuParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", bridgeDev.ROMFile)) } + if bridgeDev.IOReserve != "" { + deviceParams = append(deviceParams, fmt.Sprintf("io-reserve=%s", bridgeDev.IOReserve)) + } + if bridgeDev.MemReserve != "" { + deviceParams = append(deviceParams, fmt.Sprintf("mem-reserve=%s", bridgeDev.MemReserve)) + } + if bridgeDev.Pref64Reserve != "" { + deviceParams = append(deviceParams, fmt.Sprintf("pref64-reserve=%s", bridgeDev.Pref64Reserve)) + } + qemuParams = append(qemuParams, "-device") qemuParams = append(qemuParams, strings.Join(deviceParams, ",")) diff --git a/src/runtime/vendor/modules.txt b/src/runtime/vendor/modules.txt index f41c35f96e..407f522bde 100644 --- a/src/runtime/vendor/modules.txt +++ b/src/runtime/vendor/modules.txt @@ -180,7 +180,7 @@ github.com/hashicorp/go-multierror # github.com/intel-go/cpuid v0.0.0-20210602155658-5747e5cec0d9 ## explicit github.com/intel-go/cpuid -# github.com/kata-containers/govmm v0.0.0-20210831124834-2f8e417bb2c4 +# github.com/kata-containers/govmm v0.0.0-20210909155007-1b60b536f3c7 ## explicit github.com/kata-containers/govmm/qemu # github.com/mailru/easyjson v0.7.0 From cc4983eeac5932eaccaac69c70ab793bd3628dfd Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 8 Sep 2021 13:35:44 +1000 Subject: [PATCH 2/3] runtime: Remove unused qemuArchBase.appendBridges definition qemuArchBase.appendBridges is never actually used, because the bare qemuArchBase type is itself never used (outside of unit tests). Instead *all* the subclasses of qemuArchBase override appendBridges() to call the very similar, but not identical genericAppendBridges. So, we can remove the qemuArchBase.appendBridges implementation. Furthermore, all those subclasses override appendBridges() in exactly the same way, and so we can remove *those* definitions and replace the base class qemuArchBase appendBridges() with that version, calling genericAppendBridges(). Signed-off-by: David Gibson --- src/runtime/virtcontainers/qemu_amd64.go | 5 ---- src/runtime/virtcontainers/qemu_arch_base.go | 27 +------------------- src/runtime/virtcontainers/qemu_arm64.go | 5 ---- src/runtime/virtcontainers/qemu_ppc64le.go | 5 ---- src/runtime/virtcontainers/qemu_s390x.go | 5 ---- 5 files changed, 1 insertion(+), 46 deletions(-) diff --git a/src/runtime/virtcontainers/qemu_amd64.go b/src/runtime/virtcontainers/qemu_amd64.go index eab95ff571..689575ab5d 100644 --- a/src/runtime/virtcontainers/qemu_amd64.go +++ b/src/runtime/virtcontainers/qemu_amd64.go @@ -193,11 +193,6 @@ func (q *qemuAmd64) appendImage(ctx context.Context, devices []govmmQemu.Device, return q.appendBlockImage(ctx, devices, path) } -// appendBridges appends to devices the given bridges -func (q *qemuAmd64) appendBridges(devices []govmmQemu.Device) []govmmQemu.Device { - return genericAppendBridges(devices, q.Bridges, q.qemuMachine.Type) -} - // enable protection func (q *qemuAmd64) enableProtection() error { var err error diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go index 98fdfb7661..f338d4e4e0 100644 --- a/src/runtime/virtcontainers/qemu_arch_base.go +++ b/src/runtime/virtcontainers/qemu_arch_base.go @@ -12,7 +12,6 @@ import ( "fmt" "os" "runtime" - "strconv" "strings" govmmQemu "github.com/kata-containers/govmm/qemu" @@ -456,31 +455,7 @@ func (q *qemuArchBase) appendSCSIController(_ context.Context, devices []govmmQe // appendBridges appends to devices the given bridges func (q *qemuArchBase) appendBridges(devices []govmmQemu.Device) []govmmQemu.Device { - for idx, b := range q.Bridges { - if b.Type == types.CCW { - continue - } - t := govmmQemu.PCIBridge - if b.Type == types.PCIE { - t = govmmQemu.PCIEBridge - } - - q.Bridges[idx].Addr = bridgePCIStartAddr + idx - - devices = append(devices, - govmmQemu.BridgeDevice{ - Type: t, - Bus: defaultBridgeBus, - ID: b.ID, - // Each bridge is required to be assigned a unique chassis id > 0 - Chassis: idx + 1, - SHPC: true, - Addr: strconv.FormatInt(int64(q.Bridges[idx].Addr), 10), - }, - ) - } - - return devices + return genericAppendBridges(devices, q.Bridges, q.qemuMachine.Type) } func generic9PVolume(volume types.Volume, nestedRun bool) govmmQemu.FSDevice { diff --git a/src/runtime/virtcontainers/qemu_arm64.go b/src/runtime/virtcontainers/qemu_arm64.go index be45a4e3a5..eced3b044d 100644 --- a/src/runtime/virtcontainers/qemu_arm64.go +++ b/src/runtime/virtcontainers/qemu_arm64.go @@ -89,11 +89,6 @@ func (q *qemuArm64) bridges(number uint32) { q.Bridges = genericBridges(number, q.qemuMachine.Type) } -// appendBridges appends to devices the given bridges -func (q *qemuArm64) appendBridges(devices []govmmQemu.Device) []govmmQemu.Device { - return genericAppendBridges(devices, q.Bridges, q.qemuMachine.Type) -} - func (q *qemuArm64) appendImage(ctx context.Context, devices []govmmQemu.Device, path string) ([]govmmQemu.Device, error) { if !q.disableNvdimm { return q.appendNvdimmImage(devices, path) diff --git a/src/runtime/virtcontainers/qemu_ppc64le.go b/src/runtime/virtcontainers/qemu_ppc64le.go index bff6feff16..51c9003ba2 100644 --- a/src/runtime/virtcontainers/qemu_ppc64le.go +++ b/src/runtime/virtcontainers/qemu_ppc64le.go @@ -123,11 +123,6 @@ func (q *qemuPPC64le) memoryTopology(memoryMb, hostMemoryMb uint64, slots uint8) return genericMemoryTopology(memoryMb, hostMemoryMb, slots, q.memoryOffset) } -// appendBridges appends to devices the given bridges -func (q *qemuPPC64le) appendBridges(devices []govmmQemu.Device) []govmmQemu.Device { - return genericAppendBridges(devices, q.Bridges, q.qemuMachine.Type) -} - func (q *qemuPPC64le) appendIOMMU(devices []govmmQemu.Device) ([]govmmQemu.Device, error) { return devices, fmt.Errorf("PPC64le does not support appending a vIOMMU") } diff --git a/src/runtime/virtcontainers/qemu_s390x.go b/src/runtime/virtcontainers/qemu_s390x.go index 829078cffa..b7611deccd 100644 --- a/src/runtime/virtcontainers/qemu_s390x.go +++ b/src/runtime/virtcontainers/qemu_s390x.go @@ -256,11 +256,6 @@ func (q *qemuS390x) append9PVolume(ctx context.Context, devices []govmmQemu.Devi return devices, nil } -// appendBridges appends to devices the given bridges -func (q *qemuS390x) appendBridges(devices []govmmQemu.Device) []govmmQemu.Device { - return genericAppendBridges(devices, q.Bridges, q.qemuMachine.Type) -} - func (q *qemuS390x) appendSCSIController(ctx context.Context, devices []govmmQemu.Device, enableIOThreads bool) ([]govmmQemu.Device, *govmmQemu.IOThread, error) { d, t := genericSCSIController(enableIOThreads, q.nestedRun) addr, b, err := q.addDeviceToBridge(ctx, d.ID, types.CCW) From 8bbcb06af52294897a4d79099b74bcbaa1c9447c Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 5 Aug 2021 14:39:04 +1000 Subject: [PATCH 3/3] qemu: Disable SHPC hotplug Under certain circumstances[0] Kata will attempt to use SHPC hotplug for PCI devices on the guest. In fact we explicitly enable SHPC on our PCI to PCI bridges, regardless of the qemu default. SHPC was designed a long, long time ago for physical hotplugging and works very poorly for a virtual environment. In particular it has a mandatory 5s delay to allow a (real, human) operator to back out the operation if they press a button by mistake. This alone makes it unusable for a fast start up application like Kata. Worse, the agent forces a PCI rescan during startup. That will race with the SHPC hotplug operation causing the device to go into a bad state where config space can't be accessed from the guest at all. The only reason we've sort of gotten away with this is that our default guest kernel configuration triggers what's arguably a kernel bug effectively disabling SHPC. That makes the agent rescan the only reason we see the new device. Now that we require a qemu >=6.1, which includes ACPI PCI hotplug on the q35 machine, we can explicitly disable SHPC in all cases. It's nothing but trouble. fixes #2174 Signed-off-by: David Gibson --- .../Nvidia-GPU-passthrough-and-Kata.md | 3 +-- src/runtime/virtcontainers/qemu.go | 17 ++++++++++++++++- src/runtime/virtcontainers/qemu_amd64_test.go | 15 +++++++++------ .../virtcontainers/qemu_arch_base_test.go | 15 +++++++++------ src/runtime/virtcontainers/qemu_arm64_test.go | 15 +++++++++------ 5 files changed, 44 insertions(+), 21 deletions(-) diff --git a/docs/use-cases/Nvidia-GPU-passthrough-and-Kata.md b/docs/use-cases/Nvidia-GPU-passthrough-and-Kata.md index 1a3253fdc8..58b36c02d7 100644 --- a/docs/use-cases/Nvidia-GPU-passthrough-and-Kata.md +++ b/docs/use-cases/Nvidia-GPU-passthrough-and-Kata.md @@ -67,7 +67,7 @@ To use large BARs devices (for example, Nvidia Tesla P100), you need Kata versio The following configuration in the Kata `configuration.toml` file as shown below can work: -Hotplug for PCI devices by `shpchp` (Linux's SHPC PCI Hotplug driver): +Hotplug for PCI devices by `acpi_pcihp` (Linux's ACPI PCI Hotplug driver): ``` machine_type = "q35" @@ -91,7 +91,6 @@ The following kernel config options need to be enabled: ``` # Support PCI/PCIe device hotplug (Required for large BARs device) CONFIG_HOTPLUG_PCI_PCIE=y -CONFIG_HOTPLUG_PCI_SHPC=y # Support for loading modules (Required for load Nvidia drivers) CONFIG_MODULES=y diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index c2cee7a2b7..48252a0b5f 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -2126,8 +2126,23 @@ func genericAppendBridges(devices []govmmQemu.Device, bridges []types.Bridge, ma ID: b.ID, // Each bridge is required to be assigned a unique chassis id > 0 Chassis: idx + 1, - SHPC: true, + SHPC: false, Addr: strconv.FormatInt(int64(bridges[idx].Addr), 10), + // Certain guest BIOS versions think + // !SHPC means no hotplug, and won't + // reserve the IO and memory windows + // that will be needed for devices + // added underneath this bridge. This + // will only break for certain + // combinations of exact qemu, BIOS + // and guest kernel versions, but for + // consistency, just hint the usual + // default windows for a bridge (as + // the BIOS would use with SHPC) so + // that we can do ACPI hotplug. + IOReserve: "4k", + MemReserve: "1m", + Pref64Reserve: "1m", }, ) } diff --git a/src/runtime/virtcontainers/qemu_amd64_test.go b/src/runtime/virtcontainers/qemu_amd64_test.go index 106abf31c5..4b8d044819 100644 --- a/src/runtime/virtcontainers/qemu_amd64_test.go +++ b/src/runtime/virtcontainers/qemu_amd64_test.go @@ -187,12 +187,15 @@ func TestQemuAmd64AppendBridges(t *testing.T) { expectedOut := []govmmQemu.Device{ govmmQemu.BridgeDevice{ - Type: govmmQemu.PCIBridge, - Bus: defaultBridgeBus, - ID: bridges[0].ID, - Chassis: 1, - SHPC: true, - Addr: "2", + Type: govmmQemu.PCIBridge, + Bus: defaultBridgeBus, + ID: bridges[0].ID, + Chassis: 1, + SHPC: false, + Addr: "2", + IOReserve: "4k", + MemReserve: "1m", + Pref64Reserve: "1m", }, } diff --git a/src/runtime/virtcontainers/qemu_arch_base_test.go b/src/runtime/virtcontainers/qemu_arch_base_test.go index 1cd904864b..4b495f73d9 100644 --- a/src/runtime/virtcontainers/qemu_arch_base_test.go +++ b/src/runtime/virtcontainers/qemu_arch_base_test.go @@ -307,12 +307,15 @@ func TestQemuArchBaseAppendBridges(t *testing.T) { expectedOut := []govmmQemu.Device{ govmmQemu.BridgeDevice{ - Type: govmmQemu.PCIBridge, - Bus: defaultBridgeBus, - ID: bridges[0].ID, - Chassis: 1, - SHPC: true, - Addr: "2", + Type: govmmQemu.PCIBridge, + Bus: defaultBridgeBus, + ID: bridges[0].ID, + Chassis: 1, + SHPC: false, + Addr: "2", + IOReserve: "4k", + MemReserve: "1m", + Pref64Reserve: "1m", }, } diff --git a/src/runtime/virtcontainers/qemu_arm64_test.go b/src/runtime/virtcontainers/qemu_arm64_test.go index e421aa79b4..0d777af41b 100644 --- a/src/runtime/virtcontainers/qemu_arm64_test.go +++ b/src/runtime/virtcontainers/qemu_arm64_test.go @@ -78,12 +78,15 @@ func TestQemuArm64AppendBridges(t *testing.T) { expectedOut := []govmmQemu.Device{ govmmQemu.BridgeDevice{ - Type: govmmQemu.PCIBridge, - Bus: defaultBridgeBus, - ID: bridges[0].ID, - Chassis: 1, - SHPC: true, - Addr: "2", + Type: govmmQemu.PCIBridge, + Bus: defaultBridgeBus, + ID: bridges[0].ID, + Chassis: 1, + SHPC: false, + Addr: "2", + IOReserve: "4k", + MemReserve: "1m", + Pref64Reserve: "1m", }, }