runtime: Introduce "vfio_mode" config variable and annotation

In order to support DPDK workloads, we need to change the way VFIO devices
will be handled in Kata containers.  However, the current method, although
it is not remotely OCI compliant has real uses.  Therefore, introduce a new
runtime configuration field "vfio_mode" to control how VFIO devices will be
presented to the container.

We also add a new sandbox annotation -
io.katacontainers.config.runtime.vfio_mode - to override this on a
per-sandbox basis.

For now, the only allowed value is "guest-kernel" which refers to the
current behaviour where VFIO devices added to the container will be bound
to whatever driver in the VM kernel claims them.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
This commit is contained in:
David Gibson 2021-10-08 16:37:28 +11:00
parent 730b9c433f
commit 57ab408576
9 changed files with 89 additions and 0 deletions

View File

@ -190,6 +190,7 @@ DEFVALIDVHOSTUSERSTOREPATHS := [\"$(DEFVHOSTUSERSTOREPATH)\"]
DEFFILEMEMBACKEND := ""
DEFVALIDFILEMEMBACKENDS := [\"$(DEFFILEMEMBACKEND)\"]
DEFMSIZE9P := 8192
DEFVFIOMODE := guest-kernel
# Default cgroup model
DEFSANDBOXCGROUPONLY ?= false
@ -459,6 +460,7 @@ USER_VARS += DEFENTROPYSOURCE
USER_VARS += DEFVALIDENTROPYSOURCES
USER_VARS += DEFSANDBOXCGROUPONLY
USER_VARS += DEFBINDMOUNTS
USER_VARS += DEFVFIOMODE
USER_VARS += FEATURE_SELINUX
USER_VARS += BUILDFLAGS

View File

@ -263,6 +263,20 @@ sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@
# These will not be exposed to the container workloads, and are only provided for potential guest services.
sandbox_bind_mounts=@DEFBINDMOUNTS@
# VFIO Mode
# Determines how VFIO devices should be be presented to the container.
# Options:
#
# - guest-kernel
# This is a Kata-specific behaviour that's useful in certain cases.
# The VFIO device is managed by whatever driver in the VM kernel
# claims it. This means it will appear as one or more device nodes
# or network interfaces depending on the nature of the device.
# Using this mode requires specially built workloads that know how
# to locate the relevant device interfaces within the VM.
#
vfio_mode="@DEFVFIOMODE@"
# Enabled experimental feature list, format: ["a", "b"].
# Experimental features are features not stable enough for production,
# they may break compatibility, and are prepared for a big version bump.

View File

@ -543,6 +543,20 @@ sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@
# These will not be exposed to the container workloads, and are only provided for potential guest services.
sandbox_bind_mounts=@DEFBINDMOUNTS@
# VFIO Mode
# Determines how VFIO devices should be be presented to the container.
# Options:
#
# - guest-kernel
# This is a Kata-specific behaviour that's useful in certain cases.
# The VFIO device is managed by whatever driver in the VM kernel
# claims it. This means it will appear as one or more device nodes
# or network interfaces depending on the nature of the device.
# Using this mode requires specially built workloads that know how
# to locate the relevant device interfaces within the VM.
#
vfio_mode="@DEFVFIOMODE@"
# Enabled experimental feature list, format: ["a", "b"].
# Experimental features are features not stable enough for production,
# they may break compatibility, and are prepared for a big version bump.

View File

@ -88,6 +88,7 @@ const defaultConfidentialGuest = false
const defaultGuestSwap = false
const defaultRootlessHypervisor = false
const defaultDisableSeccomp = false
const defaultVfioMode = "guest-kernel"
var defaultSGXEPCSize = int64(0)

View File

@ -143,6 +143,7 @@ type runtime struct {
JaegerEndpoint string `toml:"jaeger_endpoint"`
JaegerUser string `toml:"jaeger_user"`
JaegerPassword string `toml:"jaeger_password"`
VfioMode string `toml:"vfio_mode"`
SandboxBindMounts []string `toml:"sandbox_bind_mounts"`
Experimental []string `toml:"experimental"`
Debug bool `toml:"enable_debug"`
@ -1068,6 +1069,11 @@ func initConfig() (config oci.RuntimeConfig, err error) {
return oci.RuntimeConfig{}, err
}
err = config.VfioMode.VFIOSetMode(defaultVfioMode)
if err != nil {
return oci.RuntimeConfig{}, err
}
config = oci.RuntimeConfig{
HypervisorType: defaultHypervisor,
HypervisorConfig: GetDefaultHypervisorConfig(),
@ -1114,6 +1120,14 @@ func LoadConfiguration(configPath string, ignoreLogging bool) (resolvedConfigPat
}
}
if tomlConf.Runtime.VfioMode != "" {
err = config.VfioMode.VFIOSetMode(tomlConf.Runtime.VfioMode)
if err != nil {
return "", config, err
}
}
if !ignoreLogging {
err := handleSystemLog("", "")
if err != nil {

View File

@ -187,6 +187,31 @@ type BlockDrive struct {
Swap bool
}
// VFIOMode indicates e behaviour mode for handling devices in the VM
type VFIOModeType uint32
const (
// VFIOModeGuestKernel specifies Kata-specific behaviour
// useful in certain cases: VFIO devices specified to Kata are
// bound to whatever driver in the VM will take them. This
// requires specialized containers expecting this behaviour to
// locate and use the devices
VFIOModeGuestKernel = iota
)
const (
vfioModeGuestKernelStr = "guest-kernel"
)
func (m *VFIOModeType) VFIOSetMode(modeName string) error {
switch modeName {
case vfioModeGuestKernelStr:
*m = VFIOModeGuestKernel
return nil
}
return fmt.Errorf("Unknown VFIO mode %s", modeName)
}
// VFIODeviceType indicates VFIO device type
type VFIODeviceType uint32

View File

@ -250,6 +250,10 @@ const (
// DisableNewNetNs is a sandbox annotation that determines if create a netns for hypervisor process.
DisableNewNetNs = kataAnnotRuntimePrefix + "disable_new_netns"
// VfioMode is a sandbox annotation to specify how attached VFIO devices should be treated
// Overrides the runtime.vfio_mode parameter in the global configuration.toml
VfioMode = kataAnnotRuntimePrefix + "vfio_mode"
)
// Agent related annotations

View File

@ -116,6 +116,10 @@ type RuntimeConfig struct {
//the container network interface
InterNetworkModel vc.NetInterworkingModel
//Determines how VFIO devices should be presented to the
//container
VfioMode config.VFIOModeType
Debug bool
Trace bool
@ -826,6 +830,13 @@ func addRuntimeConfigOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig, r
sbConfig.NetworkConfig.InterworkingModel = runtimeConfig.InterNetworkModel
}
if value, ok := ocispec.Annotations[vcAnnotations.VfioMode]; ok {
if err := sbConfig.VfioMode.VFIOSetMode(value); err != nil {
return fmt.Errorf("Unknown VFIO mode \"%s\" in annotation %s",
value, vcAnnotations.VfioMode)
}
}
return nil
}
@ -893,6 +904,8 @@ func SandboxConfig(ocispec specs.Spec, runtime RuntimeConfig, bundlePath, cid, c
ShmSize: shmSize,
VfioMode: runtime.VfioMode,
SystemdCgroup: systemdCgroup,
SandboxCgroupOnly: runtime.SandboxCgroupOnly,

View File

@ -134,6 +134,8 @@ type SandboxConfig struct {
ShmSize uint64
VfioMode config.VFIOModeType
// SharePidNs sets all containers to share the same sandbox level pid namespace.
SharePidNs bool