runtime/device: Allow VFIO devices to be presented to guest as VFIO devices

On a conventional (e.g. runc) container, passing in a VFIO group device,
/dev/vfio/NN, will result in the same VFIO group device being available
within the container.

With Kata, however, the VFIO device will be bound to the guest kernel's
driver (if it has one), possibly appearing as some other device (or a
network interface) within the guest.

This adds a new `vfio_mode` option to alter this.  If set to "vfio" it will
instruct the agent to remap VFIO devices to the VFIO driver within the
guest as well, meaning they will appear as VFIO devices within the
container.

Unlike a runc container, the VFIO devices will have different names to the
host, since the names correspond to the IOMMU groups of the guest and those
can't be remapped with namespaces.

For now we keep 'guest-kernel' as the value in the default configuration
files, to maintain current Kata behaviour.  In future we should change this
to 'vfio' as the default.  That will make Kata's default behaviour more
closely resemble OCI specified behaviour.

fixes #693

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
This commit is contained in:
David Gibson 2021-10-08 16:57:48 +11:00
parent 68696e051d
commit 34273da98f
4 changed files with 35 additions and 3 deletions

View File

@ -267,6 +267,13 @@ sandbox_bind_mounts=@DEFBINDMOUNTS@
# Determines how VFIO devices should be presented to the container.
# Options:
#
# - vfio
# Matches behaviour of OCI runtimes (e.g. runc) as much as
# possible. VFIO devices will appear in the container as VFIO
# character devices under /dev/vfio. The exact names may differ
# from the host (they need to match the VM's IOMMU group numbers
# rather than the host's)
#
# - guest-kernel
# This is a Kata-specific behaviour that's useful in certain cases.
# The VFIO device is managed by whatever driver in the VM kernel

View File

@ -547,6 +547,13 @@ sandbox_bind_mounts=@DEFBINDMOUNTS@
# Determines how VFIO devices should be presented to the container.
# Options:
#
# - vfio
# Matches behaviour of OCI runtimes (e.g. runc) as much as
# possible. VFIO devices will appear in the container as VFIO
# character devices under /dev/vfio. The exact names may differ
# from the host (they need to match the VM's IOMMU group numbers
# rather than the host's)
#
# - guest-kernel
# This is a Kata-specific behaviour that's useful in certain cases.
# The VFIO device is managed by whatever driver in the VM kernel

View File

@ -191,20 +191,29 @@ type BlockDrive struct {
type VFIOModeType uint32
const (
// VFIOModeVFIO specifies OCI compliant behaviour: VFIO
// devices specified to Kata appear as VFIO devices within the
// container
VFIOModeVFIO VFIOModeType = iota
// VFIOModeGuestKernel specifies Kata-specific behaviour
// useful in certain cases: VFIO devices specified to Kata are
// bound to whatever driver in the VM will take them. This
// requires specialized containers expecting this behaviour to
// locate and use the devices
VFIOModeGuestKernel = iota
VFIOModeGuestKernel
)
// String spellings of the VFIO modes. VFIOSetMode compares its modeName
// argument against these to pick the corresponding VFIOModeType value.
const (
vfioModeVfioStr = "vfio" // selects VFIOModeVFIO
vfioModeGuestKernelStr = "guest-kernel" // selects VFIOModeGuestKernel
)
func (m *VFIOModeType) VFIOSetMode(modeName string) error {
switch modeName {
case vfioModeVfioStr:
*m = VFIOModeVFIO
return nil
case vfioModeGuestKernelStr:
*m = VFIOModeGuestKernel
return nil

View File

@ -92,6 +92,7 @@ var (
kataNvdimmDevType = "nvdimm"
kataVirtioFSDevType = "virtio-fs"
kataWatchableBindDevType = "watchable-bind"
kataVfioDevType = "vfio" // VFIO device to be used as VFIO in the container
kataVfioGuestKernelDevType = "vfio-gk" // VFIO device for consumption by the guest kernel
sharedDir9pOptions = []string{"trans=virtio,version=9p2000.L,cache=mmap", "nodev"}
sharedDirVirtioFSOptions = []string{}
@ -1183,11 +1184,19 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c *
// (see qomGetPciPath() for details).
kataDevice := &grpc.Device{
ContainerPath: dev.ContainerPath,
Type: kataVfioGuestKernelDevType,
Type: kataVfioDevType,
Id: groupNum,
Options: make([]string, len(devList)),
}
// We always pass the device information to the agent, since
// it needs that to wait for them to be ready. But depending
// on the vfio_mode, we need to use a different device type so
// the agent can handle it properly
if c.sandbox.config.VfioMode == config.VFIOModeGuestKernel {
kataDevice.Type = kataVfioGuestKernelDevType
}
for i, pciDev := range devList {
kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", pciDev.BDF, pciDev.GuestPciPath)
}
@ -1417,7 +1426,7 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co
// We need to constrain the spec to make sure we're not
// passing irrelevant information to the agent.
k.constrainGRPCSpec(grpcSpec, passSeccomp, true)
k.constrainGRPCSpec(grpcSpec, passSeccomp, sandbox.config.VfioMode == config.VFIOModeGuestKernel)
req := &grpc.CreateContainerRequest{
ContainerId: c.id,