diff --git a/src/runtime/config/configuration-stratovirt.toml.in b/src/runtime/config/configuration-stratovirt.toml.in new file mode 100644 index 0000000000..c98cf6bd78 --- /dev/null +++ b/src/runtime/config/configuration-stratovirt.toml.in @@ -0,0 +1,394 @@ +# Copyright (c) 2023 Huawei Technologies Co.,Ltd. +# +# SPDX-License-Identifier: Apache-2.0 +# + +# XXX: WARNING: this file is auto-generated. +# XXX: +# XXX: Source file: "@CONFIG_STRATOVIRT_IN@" +# XXX: Project: +# XXX: Name: @PROJECT_NAME@ +# XXX: Type: @PROJECT_TYPE@ + +[hypervisor.stratovirt] +path = "@STRATOVIRTPATH@" +kernel = "@KERNELPATH_STRATOVIRT@" +#image = "@IMAGEPATH@" +initrd = "@INITRDPATH@" +machine_type = "@DEFMACHINETYPE_STRATOVIRT@" + +# rootfs filesystem type: +# - ext4 (default) +# - xfs +# - erofs +rootfs_type = @DEFROOTFSTYPE@ + +# List of valid annotation names for the hypervisor +# Each member of the list is a regular expression, which is the base name +# of the annotation, e.g. "path" for io.katacontainers.config.hypervisor.path" +enable_annotations = @DEFENABLEANNOTATIONS@ + +# List of valid annotations values for the hypervisor +# Each member of the list is a path pattern as described by glob(3). +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: @STRATOVIRTVALIDHYPERVISORPATHS@ +valid_hypervisor_paths = @STRATOVIRTVALIDHYPERVISORPATHS@ + +# Optional space-separated list of options to pass to the guest kernel. +# For example, use `kernel_params = "vsyscall=emulate"` if you are having +# trouble running pre-2.15 glibc. +# +# WARNING: - any parameter specified here will take priority over the default +# parameter value of the same name used to start the virtual machine. +# Do not set values here unless you understand the impact of doing so as you +# may stop the virtual machine from booting. +# To see the list of default parameters, enable hypervisor debug, create a +# container and look for 'default-kernel-parameters' log entries. +kernel_params = "@KERNELPARAMS@" + +# Default number of vCPUs per SB/VM: +# unspecified or 0 --> will be set to @DEFVCPUS@ +# < 0 --> will be set to the actual number of physical cores +# > 0 <= number of physical cores --> will be set to the specified number +# > number of physical cores --> will be set to the actual number of physical cores +default_vcpus = 1 + +# Default maximum number of vCPUs per SB/VM: +# unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number +# of vCPUs supported by KVM if that number is exceeded +# > 0 <= number of physical cores --> will be set to the specified number +# > number of physical cores --> will be set to the actual number of physical cores or to the maximum number +# of vCPUs supported by KVM if that number is exceeded +# WARNING: Depending of the architecture, the maximum number of vCPUs supported by KVM is used when +# the actual number of physical cores is greater than it. +# WARNING: Be aware that this value impacts the virtual machine's memory footprint and CPU +# the hotplug functionality. For example, `default_maxvcpus = 240` specifies that until 240 vCPUs +# can be added to a SB/VM, but the memory footprint will be big. Another example, with +# `default_maxvcpus = 8` the memory footprint will be small, but 8 will be the maximum number of +# vCPUs supported by the SB/VM. In general, we recommend that you do not edit this variable, +# unless you know what are you doing. +# NOTICE: on arm platform with gicv2 interrupt controller, set it to 8. +default_maxvcpus = @DEFMAXVCPUS@ + +# Bridges can be used to hot plug devices. +# Limitations: +# * Currently only pci bridges are supported +# * Until 30 devices per bridge can be hot plugged. +# * Until 5 PCI bridges can be cold plugged per VM. +# This limitation could be a bug in the kernel +# Default number of bridges per SB/VM: +# unspecified or 0 --> will be set to @DEFBRIDGES@ +# > 1 <= 5 --> will be set to the specified number +# > 5 --> will be set to 5 +default_bridges = @DEFBRIDGES@ + +# Default memory size in MiB for SB/VM. +# If unspecified then it will be set @DEFMEMSZ@ MiB. +default_memory = @DEFMEMSZ@ +# +# Default memory slots per SB/VM. +# If unspecified then it will be set @DEFMEMSLOTS@. +# This is will determine the times that memory will be hotadded to sandbox/VM. +#memory_slots = @DEFMEMSLOTS@ + +# Default maximum memory in MiB per SB / VM +# unspecified or == 0 --> will be set to the actual amount of physical RAM +# > 0 <= amount of physical RAM --> will be set to the specified number +# > amount of physical RAM --> will be set to the actual amount of physical RAM +default_maxmemory = @DEFMAXMEMSZ@ + +# The size in MiB will be plused to max memory of hypervisor. +# It is the memory address space for the NVDIMM devie. +# If set block storage driver (block_device_driver) to "nvdimm", +# should set memory_offset to the size of block device. +# Default 0 +#memory_offset = 0 + +# Disable block device from being used for a container's rootfs. +# In case of a storage driver like devicemapper where a container's +# root file system is backed by a block device, the block device is passed +# directly to the hypervisor for performance reasons. +# This flag prevents the block device from being passed to the hypervisor, +# virtio-fs is used instead to pass the rootfs. +disable_block_device_use = @DEFDISABLEBLOCK@ + +# Shared file system type: +# - virtio-fs (default) +# - virtio-fs-nydus +# - none +shared_fs = "@DEFSHAREDFS_STRATOVIRT_VIRTIOFS@" + +# Path to vhost-user-fs daemon. +virtio_fs_daemon = "@DEFVIRTIOFSDAEMON@" + +# List of valid annotations values for the virtiofs daemon +# The default if not set is empty (all annotations rejected.) +valid_virtio_fs_daemon_paths = @DEFVALIDVIRTIOFSDAEMONPATHS@ + +# Default size of DAX cache in MiB +virtio_fs_cache_size = @DEFVIRTIOFSCACHESIZE@ + +# Extra args for virtiofsd daemon +# +# Format example: +# ["--arg1=xxx", "--arg2=yyy"] +# Examples: +# Set virtiofsd log level to debug : ["--log-level=debug"] +# +# see `virtiofsd -h` for possible options. +virtio_fs_extra_args = @DEFVIRTIOFSEXTRAARGS@ + +# Cache mode: +# +# - never +# Metadata, data, and pathname lookup are not cached in guest. They are +# always fetched from host and any changes are immediately pushed to host. +# +# - auto +# Metadata and pathname lookup cache expires after a configured amount of +# time (default is 1 second). Data is cached while the file is open (close +# to open consistency). +# +# - always +# Metadata, data, and pathname lookup are cached in guest and never expire. +virtio_fs_cache = "@DEFVIRTIOFSCACHE@" + +# Block storage driver to be used for the hypervisor in case the container +# rootfs is backed by a block device. This is virtio-scsi, virtio-blk +# or nvdimm. +block_device_driver = "@DEFBLOCKSTORAGEDRIVER_STRATOVIRT@" + +# Specifies cache-related options will be set to block devices or not. +# Default false +#block_device_cache_set = true + +# Specifies cache-related options for block devices. +# Denotes whether use of O_DIRECT (bypass the host page cache) is enabled. +# Default false +#block_device_cache_direct = true + +# Specifies cache-related options for block devices. +# Denotes whether flush requests for the device are ignored. +# Default false +#block_device_cache_noflush = true + +# Enable huge pages for VM RAM, default false +# Enabling this will result in the VM memory +# being allocated using huge pages. +# This is useful when you want to use vhost-user network +# stacks within the container. This will automatically +# result in memory pre allocation +#enable_hugepages = true + +# Enable vIOMMU, default false +# Enabling this will result in the VM having a vIOMMU device +# This will also add the following options to the kernel's +# command line: intel_iommu=on,iommu=pt +#enable_iommu = true + +# This option changes the default hypervisor and kernel parameters +# to enable debug output where available. +# +# Default false +#enable_debug = true + +# Disable the customizations done in the runtime when it detects +# that it is running on top a VMM. This will result in the runtime +# behaving as it would when running on bare metal. +# +#disable_nesting_checks = true + +# +# Default entropy source. +# The path to a host source of entropy (including a real hardware RNG) +# /dev/urandom and /dev/random are two main options. +# Be aware that /dev/random is a blocking source of entropy. If the host +# runs out of entropy, the VMs boot time will increase leading to get startup +# timeouts. +# The source of entropy /dev/urandom is non-blocking and provides a +# generally acceptable source of entropy. It should work well for pretty much +# all practical purposes. +entropy_source = "@DEFENTROPYSOURCE@" + +# Path to OCI hook binaries in the *guest rootfs*. +# This does not affect host-side hooks which must instead be added to +# the OCI spec passed to the runtime. +# +# You can create a rootfs with hooks by customizing the osbuilder scripts: +# https://github.com/kata-containers/kata-containers/tree/main/tools/osbuilder +# +# Hooks must be stored in a subdirectory of guest_hook_path according to their +# hook type, i.e. "guest_hook_path/{prestart,poststart,poststop}". +# The agent will scan these directories for executable files and add them, in +# lexicographical order, to the lifecycle of the guest container. +# Hooks are executed in the runtime namespace of the guest. See the official documentation: +# https://github.com/opencontainers/runtime-spec/blob/v1.0.1/config.md#posix-platform-hooks +# Warnings will be logged if any error is encountered while scanning for hooks, +# but it will not abort container execution. +#guest_hook_path = "/usr/share/oci/hooks" + +# disable applying SELinux on the VMM process (default false) +disable_selinux = @DEFDISABLESELINUX@ + +# disable applying SELinux on the container process +# If set to false, the type `container_t` is applied to the container process by default. +# Note: To enable guest SELinux, the guest rootfs must be CentOS that is created and built +# with `SELINUX=yes`. +# (default: true) +disable_guest_selinux = @DEFDISABLEGUESTSELINUX@ + +[factory] +# VM templating support. Once enabled, new VMs are created from template +# using vm cloning. They will share the same initial kernel, initramfs and +# agent memory by mapping it readonly. It helps speeding up new container +# creation and saves a lot of memory if there are many kata containers running +# on the same host. +# +# When disabled, new VMs are created from scratch. +# +# Note: Requires "initrd=" to be set ("image=" is not supported). +# +# Default false +#enable_template = true + +[agent.@PROJECT_TYPE@] +# If enabled, make the agent display debug-level messages. +# (default: disabled) +#enable_debug = true + +# Enable agent tracing. +# +# If enabled, the agent will generate OpenTelemetry trace spans. +# +# Notes: +# +# - If the runtime also has tracing enabled, the agent spans will be +# associated with the appropriate runtime parent span. +# - If enabled, the runtime will wait for the container to shutdown, +# increasing the container shutdown time slightly. +# +# (default: disabled) +#enable_tracing = true + +# Comma separated list of kernel modules and their parameters. +# These modules will be loaded in the guest kernel using modprobe(8). +# The following example can be used to load two kernel modules with parameters +# - kernel_modules=["e1000e InterruptThrottleRate=3000,3000,3000 EEE=1", "i915 enable_ppgtt=0"] +# The first word is considered as the module name and the rest as its parameters. +# Container will not be started when: +# * A kernel module is specified and the modprobe command is not installed in the guest +# or it fails loading the module. +# * The module is not available in the guest or it doesn't met the guest kernel +# requirements, like architecture and version. +# +kernel_modules = [] + +# Enable debug console. + +# If enabled, user can connect guest OS running inside hypervisor +# through "kata-runtime exec " command + +#debug_console_enabled = true + +# Agent connection dialing timeout value in seconds +# (default: 45) +dial_timeout = 45 + +[runtime] +# If enabled, the runtime will log additional debug messages to the +# system log +# (default: disabled) +#enable_debug = true +# +# Internetworking model +# Determines how the VM should be connected to the +# the container network interface +# Options: +# +# - macvtap +# Used when the Container network interface can be bridged using +# macvtap. +# +# - none +# Used when customize network. Only creates a tap device. No veth pair. +# +# - tcfilter +# Uses tc filter rules to redirect traffic from the network interface +# provided by plugin to a tap interface connected to the VM. +# +internetworking_model = "@DEFNETWORKMODEL_STRATOVIRT@" + +# disable guest seccomp +# Determines whether container seccomp profiles are passed to the virtual +# machine and applied by the kata agent. If set to true, seccomp is not applied +# within the guest +# (default: true) +disable_guest_seccomp = @DEFDISABLEGUESTSECCOMP@ + +# vCPUs pinning settings +# if enabled, each vCPU thread will be scheduled to a fixed CPU +# qualified condition: num(vCPU threads) == num(CPUs in sandbox's CPUSet) +#enable_vcpus_pinning = false + +# Apply a custom SELinux security policy to the container process inside the VM. +# This is used when you want to apply a type other than the default `container_t`, +# so general users should not uncomment and apply it. +# (format: "user:role:type") +# Note: You cannot specify MCS policy with the label because the sensitivity levels and +# categories are determined automatically by high-level container runtimes such as containerd. +#guest_selinux_label = "@DEFGUESTSELINUXLABEL@" + +# If enabled, the runtime will create opentracing.io traces and spans. +# (See https://www.jaegertracing.io/docs/getting-started). +# (default: disabled) +#enable_tracing = true + +# Set the full url to the Jaeger HTTP Thrift collector. +# The default if not set will be "http://localhost:14268/api/traces" +#jaeger_endpoint = "" + +# Sets the username to be used if basic auth is required for Jaeger. +#jaeger_user = "" + +# Sets the password to be used if basic auth is required for Jaeger. +#jaeger_password = "" + +# If enabled, the runtime will not create a network namespace for shim and hypervisor processes. +# This option may have some potential impacts to your host. It should only be used when you know what you're doing. +# `disable_new_netns` conflicts with `internetworking_model=tcfilter` and `internetworking_model=macvtap`. It works only +# with `internetworking_model=none`. The tap device will be in the host network namespace and can connect to a bridge +# (like OVS) directly. +# (default: false) +#disable_new_netns = true + +# if enabled, the runtime will add all the kata processes inside one dedicated cgroup. +# The container cgroups in the host are not created, just one single cgroup per sandbox. +# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox. +# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation. +# The sandbox cgroup is constrained if there is no container type annotation. +# See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType +sandbox_cgroup_only = @DEFSANDBOXCGROUPONLY@ + +# If enabled, the runtime will attempt to determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In +# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. This is generally helpful +# when a hardware architecture or hypervisor solutions is utilized which does not support CPU and/or memory hotplug. +# Compatibility for determining appropriate sandbox (VM) size: +# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O +# does not yet support sandbox sizing annotations. +# - When running single containers using a tool like ctr, container sizing information will be available. +static_sandbox_resource_mgmt = @DEFSTATICRESOURCEMGMT_STRATOVIRT@ + +# If enabled, the runtime will not create Kubernetes emptyDir mounts on the guest filesystem. Instead, emptyDir mounts will +# be created on the host and shared via virtio-fs. This is potentially slower, but allows sharing of files from host to guest. +disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@ + +# Enabled experimental feature list, format: ["a", "b"]. +# Experimental features are features not stable enough for production, +# they may break compatibility, and are prepared for a big version bump. +# Supported experimental features: +# (default: []) +experimental = @DEFAULTEXPFEATURES@ + +# If enabled, user can run pprof tools with shim v2 process through kata-monitor. +# (default: false) +#enable_pprof = true diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 1afec55d50..7aa09b995f 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -52,6 +52,7 @@ const ( qemuHypervisorTableType = "qemu" acrnHypervisorTableType = "acrn" dragonballHypervisorTableType = "dragonball" + stratovirtHypervisorTableType = "stratovirt" // the maximum amount of PCI bridges that can be cold plugged in a VM maxPCIBridges uint32 = 5 @@ -1141,6 +1142,106 @@ func newDragonballHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { }, nil } +func newStratovirtHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { + hypervisor, err := h.path() + if err != nil { + return vc.HypervisorConfig{}, err + } + + kernel, err := h.kernel() + if err != nil { + return vc.HypervisorConfig{}, err + } + + initrd, err := h.initrd() + if err != nil { + return vc.HypervisorConfig{}, err + } + + image, err := h.image() + if err != nil { + return vc.HypervisorConfig{}, err + } + + if image != "" && initrd != "" { + return vc.HypervisorConfig{}, + errors.New("having both an image and an initrd defined in the configuration file is not supported") + } + + if image == "" && initrd == "" { + return vc.HypervisorConfig{}, + errors.New("image or initrd must be defined in the configuration file") + } + + rootfsType, err := h.rootfsType() + if err != nil { + return vc.HypervisorConfig{}, err + } + + kernelParams := h.kernelParams() + machineType := h.machineType() + + blockDriver, err := h.blockDeviceDriver() + if err != nil { + return vc.HypervisorConfig{}, err + } + + if vSock, err := utils.SupportsVsocks(); !vSock { + return vc.HypervisorConfig{}, err + } + + sharedFS, err := h.sharedFS() + if err != nil { + return vc.HypervisorConfig{}, err + } + + if sharedFS != config.VirtioFS && sharedFS != config.VirtioFSNydus && sharedFS != config.NoSharedFS { + return vc.HypervisorConfig{}, + fmt.Errorf("Stratovirt Hypervisor does not support %s shared filesystem option", sharedFS) + } + + if (sharedFS == config.VirtioFS || sharedFS == config.VirtioFSNydus) && h.VirtioFSDaemon == "" { + return vc.HypervisorConfig{}, + fmt.Errorf("cannot enable %s without daemon path in configuration file", sharedFS) + } + + return vc.HypervisorConfig{ + HypervisorPath: hypervisor, + HypervisorPathList: h.HypervisorPathList, + KernelPath: kernel, + InitrdPath: initrd, + ImagePath: image, + RootfsType: rootfsType, + KernelParams: vc.DeserializeParams(strings.Fields(kernelParams)), + HypervisorMachineType: machineType, + NumVCPUsF: h.defaultVCPUs(), + DefaultMaxVCPUs: h.defaultMaxVCPUs(), + MemorySize: h.defaultMemSz(), + MemSlots: h.defaultMemSlots(), + MemOffset: h.defaultMemOffset(), + DefaultMaxMemorySize: h.defaultMaxMemSz(), + EntropySource: h.GetEntropySource(), + DefaultBridges: h.defaultBridges(), + DisableBlockDeviceUse: h.DisableBlockDeviceUse, + SharedFS: sharedFS, + VirtioFSDaemon: h.VirtioFSDaemon, + VirtioFSDaemonList: h.VirtioFSDaemonList, + VirtioFSCacheSize: h.VirtioFSCacheSize, + VirtioFSCache: h.defaultVirtioFSCache(), + VirtioFSExtraArgs: h.VirtioFSExtraArgs, + HugePages: h.HugePages, + Debug: h.Debug, + DisableNestingChecks: h.DisableNestingChecks, + BlockDeviceDriver: blockDriver, + DisableVhostNet: true, + GuestHookPath: h.guestHookPath(), + EnableAnnotations: h.EnableAnnotations, + DisableSeccomp: h.DisableSeccomp, + DisableSeLinux: h.DisableSeLinux, + DisableGuestSeLinux: h.DisableGuestSeLinux, + }, nil +} + func newFactoryConfig(f factory) (oci.FactoryConfig, error) { if f.TemplatePath == "" { f.TemplatePath = defaultTemplatePath @@ -1177,6 +1278,9 @@ func updateRuntimeConfigHypervisor(configPath string, tomlConf tomlConfig, confi case dragonballHypervisorTableType: config.HypervisorType = vc.DragonballHypervisor hConfig, err = newDragonballHypervisorConfig(hypervisor) + case stratovirtHypervisorTableType: + config.HypervisorType = vc.StratovirtHypervisor + hConfig, err = newStratovirtHypervisorConfig(hypervisor) default: err = fmt.Errorf("%s: %+q", errInvalidHypervisorPrefix, k) }