From 34273da98f11589bdfa46327d3418643b492275a Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 8 Oct 2021 16:57:48 +1100 Subject: [PATCH] runtime/device: Allow VFIO devices to be presented to guest as VFIO devices On a conventional (e.g. runc) container, passing in a VFIO group device, /dev/vfio/NN, will result in the same VFIO group device being available within the container. With Kata, however, the VFIO device will be bound to the guest kernel's driver (if it has one), possibly appearing as some other device (or a network interface) within the guest. This adds a new `vfio_mode` option to alter this. If set to "vfio" it will instruct the agent to remap VFIO devices to the VFIO driver within the guest as well, meaning they will appear as VFIO devices within the container. Unlike a runc container, the VFIO devices will have different names to the host, since the names correspond to the IOMMU groups of the guest and those can't be remapped with namespaces. For now we keep 'guest-kernel' as the value in the default configuration files, to maintain current Kata behaviour. In future we should change this to 'vfio' as the default. That will make Kata's default behaviour more closely resemble OCI specified behaviour. fixes #693 Signed-off-by: David Gibson --- src/runtime/config/configuration-clh.toml.in | 7 +++++++ src/runtime/config/configuration-qemu.toml.in | 7 +++++++ src/runtime/virtcontainers/device/config/config.go | 11 ++++++++++- src/runtime/virtcontainers/kata_agent.go | 13 +++++++++++-- 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src/runtime/config/configuration-clh.toml.in b/src/runtime/config/configuration-clh.toml.in index 076a113941..3e46d4961d 100644 --- a/src/runtime/config/configuration-clh.toml.in +++ b/src/runtime/config/configuration-clh.toml.in @@ -267,6 +267,13 @@ sandbox_bind_mounts=@DEFBINDMOUNTS@ # Determines how VFIO devices should be be presented to the container. 
# Options: # +# - vfio +# Matches behaviour of OCI runtimes (e.g. runc) as much as +# possible. VFIO devices will appear in the container as VFIO +# character devices under /dev/vfio. The exact names may differ +# from the host (they need to match the VM's IOMMU group numbers +# rather than the host's) +# # - guest-kernel # This is a Kata-specific behaviour that's useful in certain cases. # The VFIO device is managed by whatever driver in the VM kernel diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in index f2c7c4fa73..5bc8e9cc38 100644 --- a/src/runtime/config/configuration-qemu.toml.in +++ b/src/runtime/config/configuration-qemu.toml.in @@ -547,6 +547,13 @@ sandbox_bind_mounts=@DEFBINDMOUNTS@ # Determines how VFIO devices should be be presented to the container. # Options: # +# - vfio +# Matches behaviour of OCI runtimes (e.g. runc) as much as +# possible. VFIO devices will appear in the container as VFIO +# character devices under /dev/vfio. The exact names may differ +# from the host (they need to match the VM's IOMMU group numbers +# rather than the host's) +# # - guest-kernel # This is a Kata-specific behaviour that's useful in certain cases. # The VFIO device is managed by whatever driver in the VM kernel diff --git a/src/runtime/virtcontainers/device/config/config.go b/src/runtime/virtcontainers/device/config/config.go index 5e62d0c179..4f27d9358b 100644 --- a/src/runtime/virtcontainers/device/config/config.go +++ b/src/runtime/virtcontainers/device/config/config.go @@ -191,20 +191,29 @@ type BlockDrive struct { type VFIOModeType uint32 const ( + // VFIOModeVFIO specifies OCI compliant behaviour: VFIO + // devices specified to Kata appear as VFIO devices within the + // container + VFIOModeVFIO VFIOModeType = iota + // VFIOModeGuestKernel specifies Kata-specific behaviour // useful in certain cases: VFIO devices specified to Kata are // bound to whatever driver in the VM will take them. 
This // requires specialized containers expecting this behaviour to // locate and use the devices - VFIOModeGuestKernel = iota + VFIOModeGuestKernel ) const ( + vfioModeVfioStr = "vfio" vfioModeGuestKernelStr = "guest-kernel" ) func (m *VFIOModeType) VFIOSetMode(modeName string) error { switch modeName { + case vfioModeVfioStr: + *m = VFIOModeVFIO + return nil case vfioModeGuestKernelStr: *m = VFIOModeGuestKernel return nil diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go index c1a39220fb..d3c4fa48b8 100644 --- a/src/runtime/virtcontainers/kata_agent.go +++ b/src/runtime/virtcontainers/kata_agent.go @@ -92,6 +92,7 @@ var ( kataNvdimmDevType = "nvdimm" kataVirtioFSDevType = "virtio-fs" kataWatchableBindDevType = "watchable-bind" + kataVfioDevType = "vfio" // VFIO device to be used as VFIO in the container kataVfioGuestKernelDevType = "vfio-gk" // VFIO device for consumption by the guest kernel sharedDir9pOptions = []string{"trans=virtio,version=9p2000.L,cache=mmap", "nodev"} sharedDirVirtioFSOptions = []string{} @@ -1183,11 +1184,19 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c * // (see qomGetPciPath() for details). kataDevice := &grpc.Device{ ContainerPath: dev.ContainerPath, - Type: kataVfioGuestKernelDevType, + Type: kataVfioDevType, Id: groupNum, Options: make([]string, len(devList)), } + // We always pass the device information to the agent, since + // it needs that to wait for them to be ready. 
But depending + // on the vfio_mode, we need to use a different device type so + // the agent can handle it properly + if c.sandbox.config.VfioMode == config.VFIOModeGuestKernel { + kataDevice.Type = kataVfioGuestKernelDevType + } + for i, pciDev := range devList { kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", pciDev.BDF, pciDev.GuestPciPath) } @@ -1417,7 +1426,7 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co // We need to constrain the spec to make sure we're not // passing irrelevant information to the agent. - k.constrainGRPCSpec(grpcSpec, passSeccomp, true) + k.constrainGRPCSpec(grpcSpec, passSeccomp, sandbox.config.VfioMode == config.VFIOModeGuestKernel) req := &grpc.CreateContainerRequest{ ContainerId: c.id,