configuration: add configuration for StratoVirt hypervisor.

Add configuration-stratovirt.toml.in to generate the StratoVirt configuration, and parser to deliver config to StratoVirt. Fixes: #7794 Signed-off-by: Liu Wenyuan <liuwenyuan9@huawei.com>
2025-09-18 15:28:10 +00:00 · 2023-08-23 17:49:56 +08:00
parent 561c85be54
commit 9542211e71
2 changed files with 498 additions and 0 deletions
--- a/src/runtime/config/configuration-stratovirt.toml.in
+++ b/src/runtime/config/configuration-stratovirt.toml.in
@@ -0,0 +1,394 @@
+# Copyright (c) 2023 Huawei Technologies Co.,Ltd.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+
+# XXX: WARNING: this file is auto-generated.
+# XXX:
+# XXX: Source file: "@CONFIG_STRATOVIRT_IN@"
+# XXX: Project:
+# XXX:   Name: @PROJECT_NAME@
+# XXX:   Type: @PROJECT_TYPE@
+
+[hypervisor.stratovirt]
+path = "@STRATOVIRTPATH@"
+kernel = "@KERNELPATH_STRATOVIRT@"
+#image = "@IMAGEPATH@"
+initrd = "@INITRDPATH@"
+machine_type = "@DEFMACHINETYPE_STRATOVIRT@"
+
+# rootfs filesystem type:
+#   - ext4 (default)
+#   - xfs
+#   - erofs
+rootfs_type = @DEFROOTFSTYPE@
+
+# List of valid annotation names for the hypervisor
+# Each member of the list is a regular expression, which is the base name
+# of the annotation, e.g. "path" for "io.katacontainers.config.hypervisor.path"
+enable_annotations = @DEFENABLEANNOTATIONS@
+
+# List of valid annotation values for the hypervisor
+# Each member of the list is a path pattern as described by glob(3).
+# The default if not set is empty (all annotations rejected.)
+# Your distribution recommends: @STRATOVIRTVALIDHYPERVISORPATHS@
+valid_hypervisor_paths = @STRATOVIRTVALIDHYPERVISORPATHS@
+
+# Optional space-separated list of options to pass to the guest kernel.
+# For example, use `kernel_params = "vsyscall=emulate"` if you are having
+# trouble running pre-2.15 glibc.
+#
+# WARNING: - any parameter specified here will take priority over the default
+# parameter value of the same name used to start the virtual machine.
+# Do not set values here unless you understand the impact of doing so as you
+# may stop the virtual machine from booting.
+# To see the list of default parameters, enable hypervisor debug, create a
+# container and look for 'default-kernel-parameters' log entries.
+kernel_params = "@KERNELPARAMS@"
+
+# Default number of vCPUs per SB/VM:
+# unspecified or 0                --> will be set to @DEFVCPUS@
+# < 0                             --> will be set to the actual number of physical cores
+# > 0 <= number of physical cores --> will be set to the specified number
+# > number of physical cores      --> will be set to the actual number of physical cores
+default_vcpus = 1
+
+# Default maximum number of vCPUs per SB/VM:
+# unspecified or == 0             --> will be set to the actual number of physical cores or to the maximum number
+#                                     of vCPUs supported by KVM if that number is exceeded
+# > 0 <= number of physical cores --> will be set to the specified number
+# > number of physical cores      --> will be set to the actual number of physical cores or to the maximum number
+#                                     of vCPUs supported by KVM if that number is exceeded
+# WARNING: Depending on the architecture, the maximum number of vCPUs supported by KVM is used when
+# the actual number of physical cores is greater than it.
+# WARNING: Be aware that this value impacts the virtual machine's memory footprint and the CPU
+# hotplug functionality. For example, `default_maxvcpus = 240` specifies that up to 240 vCPUs
+# can be added to a SB/VM, but the memory footprint will be big. Another example, with
+# `default_maxvcpus = 8` the memory footprint will be small, but 8 will be the maximum number of
+# vCPUs supported by the SB/VM. In general, we recommend that you do not edit this variable,
+# unless you know what you are doing.
+# NOTICE: on arm platform with gicv2 interrupt controller, set it to 8.
+default_maxvcpus = @DEFMAXVCPUS@
+
+# Bridges can be used to hot plug devices.
+# Limitations:
+# * Currently only pci bridges are supported
+# * Up to 30 devices per bridge can be hot plugged.
+# * Up to 5 PCI bridges can be cold plugged per VM.
+#   This limitation could be a bug in the kernel
+# Default number of bridges per SB/VM:
+# unspecified or 0   --> will be set to @DEFBRIDGES@
+# > 0 <= 5           --> will be set to the specified number
+# > 5                --> will be set to 5
+default_bridges = @DEFBRIDGES@
+
+# Default memory size in MiB for SB/VM.
+# If unspecified then it will be set to @DEFMEMSZ@ MiB.
+default_memory = @DEFMEMSZ@
+#
+# Default memory slots per SB/VM.
+# If unspecified then it will be set to @DEFMEMSLOTS@.
+# This determines how many times memory can be hot-added to the sandbox/VM.
+#memory_slots = @DEFMEMSLOTS@
+
+# Default maximum memory in MiB per SB / VM
+# unspecified or == 0           --> will be set to the actual amount of physical RAM
+# > 0 <= amount of physical RAM --> will be set to the specified number
+# > amount of physical RAM      --> will be set to the actual amount of physical RAM
+default_maxmemory = @DEFMAXMEMSZ@
+
+# The size in MiB that will be added to the max memory of the hypervisor.
+# It is the memory address space for the NVDIMM device.
+# If the block storage driver (block_device_driver) is set to "nvdimm",
+# memory_offset should be set to the size of the block device.
+# Default 0
+#memory_offset = 0
+
+# Disable block device from being used for a container's rootfs.
+# In case of a storage driver like devicemapper where a container's
+# root file system is backed by a block device, the block device is passed
+# directly to the hypervisor for performance reasons.
+# This flag prevents the block device from being passed to the hypervisor,
+# virtio-fs is used instead to pass the rootfs.
+disable_block_device_use = @DEFDISABLEBLOCK@
+
+# Shared file system type:
+#   - virtio-fs (default)
+#   - virtio-fs-nydus
+#   - none
+shared_fs = "@DEFSHAREDFS_STRATOVIRT_VIRTIOFS@"
+
+# Path to vhost-user-fs daemon.
+virtio_fs_daemon = "@DEFVIRTIOFSDAEMON@"
+
+# List of valid annotation values for the virtiofs daemon
+# The default if not set is empty (all annotations rejected.)
+valid_virtio_fs_daemon_paths = @DEFVALIDVIRTIOFSDAEMONPATHS@
+
+# Default size of DAX cache in MiB
+virtio_fs_cache_size = @DEFVIRTIOFSCACHESIZE@
+
+# Extra args for virtiofsd daemon
+#
+# Format example:
+#   ["--arg1=xxx", "--arg2=yyy"]
+# Examples:
+#   Set virtiofsd log level to debug : ["--log-level=debug"]
+#
+# see `virtiofsd -h` for possible options.
+virtio_fs_extra_args = @DEFVIRTIOFSEXTRAARGS@
+
+# Cache mode:
+#
+#  - never
+#    Metadata, data, and pathname lookup are not cached in guest. They are
+#    always fetched from host and any changes are immediately pushed to host.
+#
+#  - auto
+#    Metadata and pathname lookup cache expires after a configured amount of
+#    time (default is 1 second). Data is cached while the file is open (close
+#    to open consistency).
+#
+#  - always
+#    Metadata, data, and pathname lookup are cached in guest and never expire.
+virtio_fs_cache = "@DEFVIRTIOFSCACHE@"
+
+# Block storage driver to be used for the hypervisor in case the container
+# rootfs is backed by a block device. This is virtio-scsi, virtio-blk
+# or nvdimm.
+block_device_driver = "@DEFBLOCKSTORAGEDRIVER_STRATOVIRT@"
+
+# Specifies whether cache-related options will be set on block devices.
+# Default false
+#block_device_cache_set = true
+
+# Specifies cache-related options for block devices.
+# Denotes whether use of O_DIRECT (bypass the host page cache) is enabled.
+# Default false
+#block_device_cache_direct = true
+
+# Specifies cache-related options for block devices.
+# Denotes whether flush requests for the device are ignored.
+# Default false
+#block_device_cache_noflush = true
+
+# Enable huge pages for VM RAM, default false
+# Enabling this will result in the VM memory
+# being allocated using huge pages.
+# This is useful when you want to use vhost-user network
+# stacks within the container. This will automatically
+# result in memory pre allocation
+#enable_hugepages = true
+
+# Enable vIOMMU, default false
+# Enabling this will result in the VM having a vIOMMU device
+# This will also add the following options to the kernel's
+# command line: intel_iommu=on,iommu=pt
+#enable_iommu = true
+
+# This option changes the default hypervisor and kernel parameters
+# to enable debug output where available.
+#
+# Default false
+#enable_debug = true
+
+# Disable the customizations done in the runtime when it detects
+# that it is running on top of a VMM. This will result in the runtime
+# behaving as it would when running on bare metal.
+#
+#disable_nesting_checks = true
+
+#
+# Default entropy source.
+# The path to a host source of entropy (including a real hardware RNG)
+# /dev/urandom and /dev/random are two main options.
+# Be aware that /dev/random is a blocking source of entropy.  If the host
+# runs out of entropy, the VM's boot time will increase, leading to startup
+# timeouts.
+# The source of entropy /dev/urandom is non-blocking and provides a
+# generally acceptable source of entropy. It should work well for pretty much
+# all practical purposes.
+entropy_source = "@DEFENTROPYSOURCE@"
+
+# Path to OCI hook binaries in the *guest rootfs*.
+# This does not affect host-side hooks which must instead be added to
+# the OCI spec passed to the runtime.
+#
+# You can create a rootfs with hooks by customizing the osbuilder scripts:
+# https://github.com/kata-containers/kata-containers/tree/main/tools/osbuilder
+#
+# Hooks must be stored in a subdirectory of guest_hook_path according to their
+# hook type, i.e. "guest_hook_path/{prestart,poststart,poststop}".
+# The agent will scan these directories for executable files and add them, in
+# lexicographical order, to the lifecycle of the guest container.
+# Hooks are executed in the runtime namespace of the guest. See the official documentation:
+# https://github.com/opencontainers/runtime-spec/blob/v1.0.1/config.md#posix-platform-hooks
+# Warnings will be logged if any error is encountered while scanning for hooks,
+# but it will not abort container execution.
+#guest_hook_path = "/usr/share/oci/hooks"
+
+# disable applying SELinux on the VMM process (default false)
+disable_selinux = @DEFDISABLESELINUX@
+
+# disable applying SELinux on the container process
+# If set to false, the type `container_t` is applied to the container process by default.
+# Note: To enable guest SELinux, the guest rootfs must be CentOS that is created and built
+# with `SELINUX=yes`.
+# (default: true)
+disable_guest_selinux = @DEFDISABLEGUESTSELINUX@
+
+[factory]
+# VM templating support. Once enabled, new VMs are created from template
+# using vm cloning. They will share the same initial kernel, initramfs and
+# agent memory by mapping it readonly. It helps speeding up new container
+# creation and saves a lot of memory if there are many kata containers running
+# on the same host.
+#
+# When disabled, new VMs are created from scratch.
+#
+# Note: Requires "initrd=" to be set ("image=" is not supported).
+#
+# Default false
+#enable_template = true
+
+[agent.@PROJECT_TYPE@]
+# If enabled, make the agent display debug-level messages.
+# (default: disabled)
+#enable_debug = true
+
+# Enable agent tracing.
+#
+# If enabled, the agent will generate OpenTelemetry trace spans.
+#
+# Notes:
+#
+# - If the runtime also has tracing enabled, the agent spans will be
+#   associated with the appropriate runtime parent span.
+# - If enabled, the runtime will wait for the container to shutdown,
+#   increasing the container shutdown time slightly.
+#
+# (default: disabled)
+#enable_tracing = true
+
+# Comma separated list of kernel modules and their parameters.
+# These modules will be loaded in the guest kernel using modprobe(8).
+# The following example can be used to load two kernel modules with parameters
+#  - kernel_modules=["e1000e InterruptThrottleRate=3000,3000,3000 EEE=1", "i915 enable_ppgtt=0"]
+# The first word is considered as the module name and the rest as its parameters.
+# Container will not be started when:
+#  * A kernel module is specified and the modprobe command is not installed in the guest
+#    or it fails loading the module.
+#  * The module is not available in the guest or it doesn't meet the guest kernel
+#    requirements, like architecture and version.
+#
+kernel_modules = []
+
+# Enable debug console.
+
+# If enabled, users can connect to the guest OS running inside the hypervisor
+# through the "kata-runtime exec <sandbox-id>" command
+
+#debug_console_enabled = true
+
+# Agent connection dialing timeout value in seconds
+# (default: 45)
+dial_timeout = 45
+
+[runtime]
+# If enabled, the runtime will log additional debug messages to the
+# system log
+# (default: disabled)
+#enable_debug = true
+#
+# Internetworking model
+# Determines how the VM should be connected to
+# the container network interface
+# Options:
+#
+#   - macvtap
+#     Used when the Container network interface can be bridged using
+#     macvtap.
+#
+#   - none
+#     Used when customizing the network. Only creates a tap device. No veth pair.
+#
+#   - tcfilter
+#     Uses tc filter rules to redirect traffic from the network interface
+#     provided by plugin to a tap interface connected to the VM.
+#
+internetworking_model = "@DEFNETWORKMODEL_STRATOVIRT@"
+
+# disable guest seccomp
+# Determines whether container seccomp profiles are passed to the virtual
+# machine and applied by the kata agent. If set to true, seccomp is not applied
+# within the guest
+# (default: true)
+disable_guest_seccomp = @DEFDISABLEGUESTSECCOMP@
+
+# vCPUs pinning settings
+# if enabled, each vCPU thread will be scheduled to a fixed CPU
+# qualified condition: num(vCPU threads) == num(CPUs in sandbox's CPUSet)
+#enable_vcpus_pinning = false
+
+# Apply a custom SELinux security policy to the container process inside the VM.
+# This is used when you want to apply a type other than the default `container_t`,
+# so general users should not uncomment and apply it.
+# (format: "user:role:type")
+# Note: You cannot specify MCS policy with the label because the sensitivity levels and
+# categories are determined automatically by high-level container runtimes such as containerd.
+#guest_selinux_label = "@DEFGUESTSELINUXLABEL@"
+
+# If enabled, the runtime will create opentracing.io traces and spans.
+# (See https://www.jaegertracing.io/docs/getting-started).
+# (default: disabled)
+#enable_tracing = true
+
+# Set the full url to the Jaeger HTTP Thrift collector.
+# The default if not set will be "http://localhost:14268/api/traces"
+#jaeger_endpoint = ""
+
+# Sets the username to be used if basic auth is required for Jaeger.
+#jaeger_user = ""
+
+# Sets the password to be used if basic auth is required for Jaeger.
+#jaeger_password = ""
+
+# If enabled, the runtime will not create a network namespace for shim and hypervisor processes.
+# This option may have some potential impacts to your host. It should only be used when you know what you're doing.
+# `disable_new_netns` conflicts with `internetworking_model=tcfilter` and `internetworking_model=macvtap`. It works only
+# with `internetworking_model=none`. The tap device will be in the host network namespace and can connect to a bridge
+# (like OVS) directly.
+# (default: false)
+#disable_new_netns = true
+
+# if enabled, the runtime will add all the kata processes inside one dedicated cgroup.
+# The container cgroups in the host are not created, just one single cgroup per sandbox.
+# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox.
+# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation.
+# The sandbox cgroup is constrained if there is no container type annotation.
+# See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType
+sandbox_cgroup_only = @DEFSANDBOXCGROUPONLY@
+
+# If enabled, the runtime will attempt to determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In
+# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. This is generally helpful
+# when a hardware architecture or hypervisor solution is utilized which does not support CPU and/or memory hotplug.
+# Compatibility for determining appropriate sandbox (VM) size:
+# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O
+#   does not yet support sandbox sizing annotations.
+# - When running single containers using a tool like ctr, container sizing information will be available.
+static_sandbox_resource_mgmt = @DEFSTATICRESOURCEMGMT_STRATOVIRT@
+
+# If enabled, the runtime will not create Kubernetes emptyDir mounts on the guest filesystem. Instead, emptyDir mounts will
+# be created on the host and shared via virtio-fs. This is potentially slower, but allows sharing of files from host to guest.
+disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
+
+# Enabled experimental feature list, format: ["a", "b"].
+# Experimental features are features not stable enough for production,
+# they may break compatibility, and are prepared for a big version bump.
+# Supported experimental features:
+# (default: [])
+experimental = @DEFAULTEXPFEATURES@
+
+# If enabled, user can run pprof tools with shim v2 process through kata-monitor.
+# (default: false)
+#enable_pprof = true
--- a/src/runtime/pkg/katautils/config.go
+++ b/src/runtime/pkg/katautils/config.go
@@ -52,6 +52,7 @@ const (
 	qemuHypervisorTableType        = "qemu"
 	acrnHypervisorTableType        = "acrn"
 	dragonballHypervisorTableType  = "dragonball"
+	stratovirtHypervisorTableType  = "stratovirt"

 	// the maximum amount of PCI bridges that can be cold plugged in a VM
 	maxPCIBridges uint32 = 5
@@ -1141,6 +1142,106 @@ func newDragonballHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
 	}, nil
 }

+func newStratovirtHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { // builds the vc.HypervisorConfig used for the "stratovirt" hypervisor table from h
+	hypervisor, err := h.path()
+	if err != nil {
+		return vc.HypervisorConfig{}, err
+	}
+	// Every failing accessor below aborts with an empty config and that accessor's error.
+	kernel, err := h.kernel()
+	if err != nil {
+		return vc.HypervisorConfig{}, err
+	}
+	// initrd and image are both read so that their combination can be validated further down.
+	initrd, err := h.initrd()
+	if err != nil {
+		return vc.HypervisorConfig{}, err
+	}
+	// An empty string from either accessor is treated as "not configured" by the checks below.
+	image, err := h.image()
+	if err != nil {
+		return vc.HypervisorConfig{}, err
+	}
+	// Exactly one of image and initrd must be set: first reject the case where both are given...
+	if image != "" && initrd != "" {
+		return vc.HypervisorConfig{},
+			errors.New("having both an image and an initrd defined in the configuration file is not supported")
+	}
+	// ...then reject the case where neither is given.
+	if image == "" && initrd == "" {
+		return vc.HypervisorConfig{},
+			errors.New("image or initrd must be defined in the configuration file")
+	}
+	// The rootfs type is stored unchanged in RootfsType.
+	rootfsType, err := h.rootfsType()
+	if err != nil {
+		return vc.HypervisorConfig{}, err
+	}
+	// These two accessors return no error, so there is nothing to check here.
+	kernelParams := h.kernelParams()
+	machineType := h.machineType()
+	// The block device driver is stored unchanged in BlockDeviceDriver.
+	blockDriver, err := h.blockDeviceDriver()
+	if err != nil {
+		return vc.HypervisorConfig{}, err
+	}
+	// Give up when the vsock check reports false. NOTE(review): err is returned without a nil check, which assumes SupportsVsocks never yields (false, nil) - confirm.
+	if vSock, err := utils.SupportsVsocks(); !vSock {
+		return vc.HypervisorConfig{}, err
+	}
+	// The shared filesystem setting is validated by the two checks that follow.
+	sharedFS, err := h.sharedFS()
+	if err != nil {
+		return vc.HypervisorConfig{}, err
+	}
+	// Only config.VirtioFS, config.VirtioFSNydus and config.NoSharedFS are accepted.
+	if sharedFS != config.VirtioFS && sharedFS != config.VirtioFSNydus && sharedFS != config.NoSharedFS {
+		return vc.HypervisorConfig{},
+			fmt.Errorf("Stratovirt Hypervisor does not support %s shared filesystem option", sharedFS)
+	}
+	// Both virtio-fs variants require a non-empty daemon path in h.VirtioFSDaemon.
+	if (sharedFS == config.VirtioFS || sharedFS == config.VirtioFSNydus) && h.VirtioFSDaemon == "" {
+		return vc.HypervisorConfig{},
+			fmt.Errorf("cannot enable %s without daemon path in configuration file", sharedFS)
+	}
+	// Assemble the result from the validated values plus the remaining fields and accessors of h.
+	return vc.HypervisorConfig{
+		HypervisorPath:        hypervisor,
+		HypervisorPathList:    h.HypervisorPathList,
+		KernelPath:            kernel,
+		InitrdPath:            initrd,
+		ImagePath:             image,
+		RootfsType:            rootfsType,
+		KernelParams:          vc.DeserializeParams(strings.Fields(kernelParams)), // kernelParams is split on whitespace first
+		HypervisorMachineType: machineType,
+		NumVCPUsF:             h.defaultVCPUs(),
+		DefaultMaxVCPUs:       h.defaultMaxVCPUs(),
+		MemorySize:            h.defaultMemSz(),
+		MemSlots:              h.defaultMemSlots(),
+		MemOffset:             h.defaultMemOffset(),
+		DefaultMaxMemorySize:  h.defaultMaxMemSz(),
+		EntropySource:         h.GetEntropySource(),
+		DefaultBridges:        h.defaultBridges(),
+		DisableBlockDeviceUse: h.DisableBlockDeviceUse,
+		SharedFS:              sharedFS,
+		VirtioFSDaemon:        h.VirtioFSDaemon,
+		VirtioFSDaemonList:    h.VirtioFSDaemonList,
+		VirtioFSCacheSize:     h.VirtioFSCacheSize,
+		VirtioFSCache:         h.defaultVirtioFSCache(),
+		VirtioFSExtraArgs:     h.VirtioFSExtraArgs,
+		HugePages:             h.HugePages,
+		Debug:                 h.Debug,
+		DisableNestingChecks:  h.DisableNestingChecks,
+		BlockDeviceDriver:     blockDriver,
+		DisableVhostNet:       true, // hard-coded here, not read from h
+		GuestHookPath:         h.guestHookPath(),
+		EnableAnnotations:     h.EnableAnnotations,
+		DisableSeccomp:        h.DisableSeccomp,
+		DisableSeLinux:        h.DisableSeLinux,
+		DisableGuestSeLinux:   h.DisableGuestSeLinux,
+	}, nil
+}
+
 func newFactoryConfig(f factory) (oci.FactoryConfig, error) {
 	if f.TemplatePath == "" {
 		f.TemplatePath = defaultTemplatePath
@@ -1177,6 +1278,9 @@ func updateRuntimeConfigHypervisor(configPath string, tomlConf tomlConfig, confi
 		case dragonballHypervisorTableType:
 			config.HypervisorType = vc.DragonballHypervisor
 			hConfig, err = newDragonballHypervisorConfig(hypervisor)
+		case stratovirtHypervisorTableType:
+			config.HypervisorType = vc.StratovirtHypervisor
+			hConfig, err = newStratovirtHypervisorConfig(hypervisor)
 		default:
 			err = fmt.Errorf("%s: %+q", errInvalidHypervisorPrefix, k)
 		}