diff --git a/src/runtime-rs/Makefile b/src/runtime-rs/Makefile index 7cf88ac042..c3ad4b9bac 100644 --- a/src/runtime-rs/Makefile +++ b/src/runtime-rs/Makefile @@ -77,7 +77,9 @@ CLHBINDIR := $(PREFIXDEPS)/bin QEMUBINDIR := $(PREFIXDEPS)/bin PROJECT_DIR = $(PROJECT_TAG) IMAGENAME = $(PROJECT_TAG).img +IMAGECONFIDENTIALNAME = $(PROJECT_TAG)-confidential.img INITRDNAME = $(PROJECT_TAG)-initrd.img +INITRDCONFIDENTIALNAME = $(PROJECT_TAG)-initrd-confidential.img TARGET = $(PROJECT_COMPONENT) SYSCONFDIR := /etc LOCALSTATEDIR := /var @@ -112,7 +114,9 @@ PKGDATADIR := $(PREFIXDEPS)/share/$(PROJECT_DIR) PKGRUNDIR := $(LOCALSTATEDIR)/run/$(PROJECT_DIR) KERNELDIR := $(PKGDATADIR) IMAGEPATH := $(PKGDATADIR)/$(IMAGENAME) +IMAGECONFIDENTIALPATH := $(PKGDATADIR)/$(IMAGECONFIDENTIALNAME) INITRDPATH := $(PKGDATADIR)/$(INITRDNAME) +INITRDCONFIDENTIALPATH := $(PKGDATADIR)/$(INITRDCONFIDENTIALNAME) ROOTFSTYPE_EXT4 := \"ext4\" ROOTFSTYPE_XFS := \"xfs\" @@ -297,6 +301,18 @@ ifneq (,$(QEMUCMD)) CONFIGS += $(CONFIG_QEMU_SE) + CONFIG_FILE_QEMU_COCO_DEV = configuration-qemu-runtime-rs-coco-dev.toml + CONFIG_QEMU_COCO_DEV = config/$(CONFIG_FILE_QEMU_COCO_DEV) + CONFIG_QEMU_COCO_DEV_IN = $(CONFIG_QEMU_COCO_DEV).in + + CONFIG_PATH_QEMU_COCO_DEV = $(abspath $(CONFDIR)/$(CONFIG_FILE_QEMU_COCO_DEV)) + CONFIG_PATHS += $(CONFIG_PATH_QEMU_COCO_DEV) + + SYSCONFIG_QEMU_COCO_DEV = $(abspath $(SYSCONFDIR)/$(CONFIG_FILE_QEMU_COCO_DEV)) + SYSCONFIG_PATHS += $(SYSCONFIG_QEMU_COCO_DEV) + + CONFIGS += $(CONFIG_QEMU_COCO_DEV) + KERNELTYPE_QEMU = uncompressed KERNEL_NAME_QEMU = $(call MAKE_KERNEL_NAME,$(KERNELTYPE_QEMU)) KERNELPATH_QEMU = $(KERNELDIR)/$(KERNEL_NAME_QEMU) @@ -304,6 +320,10 @@ ifneq (,$(QEMUCMD)) KERNEL_NAME_QEMU_SE = kata-containers-se.img KERNELPATH_QEMU_SE = $(KERNELDIR)/$(KERNEL_NAME_QEMU_SE) + KERNEL_TYPE_COCO = compressed + KERNEL_NAME_COCO = $(call MAKE_KERNEL_NAME_COCO,$(KERNELCONFIDENTIALTYPE)) + KERNELPATH_COCO = $(KERNELDIR)/$(KERNEL_NAME_COCO) + # overriding options DEFSTATICRESOURCEMGMT_QEMU := true @@ -320,6 +340,7 @@ endif DEFMAXVCPUS_QEMU := 0 DEFSHAREDFS_QEMU_VIRTIOFS := virtio-fs DEFSHAREDFS_QEMU_SEL_VIRTIOFS := none + DEFSHAREDFS_QEMU_COCO_DEV_VIRTIOFS := none DEFBLOCKDEVICEAIO_QEMU := io_uring DEFNETWORKMODEL_QEMU := tcfilter DEFDISABLEGUESTSELINUX := true @@ -386,6 +407,7 @@ USER_VARS += CONFIG_PATH USER_VARS += CONFIG_QEMU_IN USER_VARS += CONFIG_QEMU_SE_IN USER_VARS += CONFIG_REMOTE_IN +USER_VARS += CONFIG_QEMU_COCO_DEV_IN USER_VARS += DESTDIR USER_VARS += HYPERVISOR USER_VARS += USE_BUILDIN_DB @@ -411,8 +433,13 @@ USER_VARS += FCVALIDJAILERPATHS USER_VARS += DEFMAXMEMSZ_FC USER_VARS += SYSCONFIG USER_VARS += IMAGENAME +USER_VARS += IMAGECONFIDENTIALNAME USER_VARS += IMAGEPATH +USER_VARS += IMAGECONFIDENTIALPATH +USER_VARS += INITRDNAME +USER_VARS += INITRDCONFIDENTIALNAME USER_VARS += INITRDPATH +USER_VARS += INITRDCONFIDENTIALPATH USER_VARS += DEFROOTFSTYPE USER_VARS += VMROOTFSDRIVER_DB USER_VARS += VMROOTFSDRIVER_CLH @@ -424,6 +451,7 @@ USER_VARS += KERNELPATH_DB USER_VARS += KERNELPATH_QEMU USER_VARS += KERNELPATH_QEMU_SE USER_VARS += KERNELPATH_FC +USER_VARS += KERNELPATH_COCO USER_VARS += KERNELPATH USER_VARS += KERNELVIRTIOFSPATH USER_VARS += FIRMWAREPATH @@ -475,6 +503,7 @@ USER_VARS += DEFBLOCKSTORAGEDRIVER_FC USER_VARS += DEFSHAREDFS_CLH_VIRTIOFS USER_VARS += DEFSHAREDFS_QEMU_VIRTIOFS USER_VARS += DEFSHAREDFS_QEMU_SEL_VIRTIOFS +USER_VARS += DEFSHAREDFS_QEMU_COCO_DEV_VIRTIOFS USER_VARS += DEFVIRTIOFSDAEMON USER_VARS += DEFVALIDVIRTIOFSDAEMONPATHS USER_VARS += DEFVIRTIOFSCACHESIZE @@ -608,6 +637,10 @@ define MAKE_KERNEL_NAME $(if $(findstring uncompressed,$1),vmlinux.container,vmlinuz.container) endef +define MAKE_KERNEL_NAME_COCO +$(if $(findstring uncompressed,$1),vmlinux-confidential.container,vmlinuz-confidential.container) +endef + .DEFAULT_GOAL := default GENERATED_FILES += $(CONFIGS) diff --git a/src/runtime-rs/config/configuration-qemu-runtime-rs-coco-dev.toml.in b/src/runtime-rs/config/configuration-qemu-runtime-rs-coco-dev.toml.in new file mode 100644 index 0000000000..ce25840a81 --- /dev/null +++ b/src/runtime-rs/config/configuration-qemu-runtime-rs-coco-dev.toml.in @@ -0,0 +1,825 @@ +# Copyright (c) 2017-2019 Intel Corporation +# Copyright (c) 2021 Adobe Inc. +# Copyright (c) 2024-2025 IBM Corp. +# +# SPDX-License-Identifier: Apache-2.0 +# + +# XXX: WARNING: this file is auto-generated. +# XXX: +# XXX: Source file: "@CONFIG_QEMU_IN@" +# XXX: Project: +# XXX: Name: @PROJECT_NAME@ +# XXX: Type: @PROJECT_TYPE@ + +[hypervisor.qemu] +path = "@QEMUPATH@" +kernel = "@KERNELPATH_COCO@" +image = "@IMAGECONFIDENTIALPATH@" +# initrd = "@INITRDCONFIDENTIALPATH@" +machine_type = "@MACHINETYPE@" + +# rootfs filesystem type: +# - ext4 (default) +# - xfs +# - erofs +rootfs_type=@DEFROOTFSTYPE@ + +# Block storage driver to be used for the VM rootfs is backed +# by a block device. This is virtio-blk-pci, virtio-blk-mmio or nvdimm +vm_rootfs_driver = "@VMROOTFSDRIVER_QEMU@" + +# Enable confidential guest support. +# Toggling that setting may trigger different hardware features, ranging +# from memory encryption to both memory and CPU-state encryption and integrity. +# The Kata Containers runtime dynamically detects the available feature set and +# aims at enabling the largest possible one, returning an error if none is +# available, or none is supported by the hypervisor. +# +# Known limitations: +# * Does not work by design: +# - CPU Hotplug +# - Memory Hotplug +# - NVDIMM devices +# +# Default false +# confidential_guest = true + +# Choose AMD SEV-SNP confidential guests +# In case of using confidential guests on AMD hardware that supports both SEV +# and SEV-SNP, the following enables SEV-SNP guests. SEV guests are default. +# Default false +# sev_snp_guest = true + +# Enable running QEMU VMM as a non-root user. +# By default QEMU VMM run as root. When this is set to true, QEMU VMM process runs as +# a non-root random user. See documentation for the limitations of this mode. +# rootless = true + +# List of valid annotation names for the hypervisor +# Each member of the list is a regular expression, which is the base name +# of the annotation, e.g. "path" for io.katacontainers.config.hypervisor.path" +enable_annotations = @DEFENABLEANNOTATIONS@ + +# List of valid annotations values for the hypervisor +# Each member of the list is a path pattern as described by glob(3). +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: @QEMUVALIDHYPERVISORPATHS@ +valid_hypervisor_paths = @QEMUVALIDHYPERVISORPATHS@ + +# Optional space-separated list of options to pass to the guest kernel. +# For example, use `kernel_params = "vsyscall=emulate"` if you are having +# trouble running pre-2.15 glibc. +# +# WARNING: - any parameter specified here will take priority over the default +# parameter value of the same name used to start the virtual machine. +# Do not set values here unless you understand the impact of doing so as you +# may stop the virtual machine from booting. +# To see the list of default parameters, enable hypervisor debug, create a +# container and look for 'default-kernel-parameters' log entries. +kernel_params = "@KERNELPARAMS@" + +# Path to the firmware. +# If you want that qemu uses the default firmware leave this option empty +firmware = "@FIRMWAREPATH@" + +# Path to the firmware volume. +# firmware TDVF or OVMF can be split into FIRMWARE_VARS.fd (UEFI variables +# as configuration) and FIRMWARE_CODE.fd (UEFI program image). UEFI variables +# can be customized per each user while UEFI code is kept same. +firmware_volume = "@FIRMWAREVOLUMEPATH@" + +# Machine accelerators +# comma-separated list of machine accelerators to pass to the hypervisor. +# For example, `machine_accelerators = "nosmm,nosmbus,nosata,nopit,static-prt,nofw"` +machine_accelerators="@MACHINEACCELERATORS@" + +# Qemu seccomp sandbox feature +# comma-separated list of seccomp sandbox features to control the syscall access. +# For example, `seccompsandbox= "on,obsolete=deny,spawn=deny,resourcecontrol=deny"` +# Note: "elevateprivileges=deny" doesn't work with daemonize option, so it's removed from the seccomp sandbox +# Another note: enabling this feature may reduce performance, you may enable +# /proc/sys/net/core/bpf_jit_enable to reduce the impact. see https://man7.org/linux/man-pages/man8/bpfc.8.html +#seccompsandbox="@DEFSECCOMPSANDBOXPARAM@" + +# CPU features +# comma-separated list of cpu features to pass to the cpu +# For example, `cpu_features = "pmu=off,vmx=off" +cpu_features="@CPUFEATURES@" + +# Default number of vCPUs per SB/VM: +# unspecified or 0 --> will be set to @DEFVCPUS@ +# < 0 --> will be set to the actual number of physical cores +# > 0 <= number of physical cores --> will be set to the specified number +# > number of physical cores --> will be set to the actual number of physical cores +default_vcpus = @DEFVCPUS_QEMU@ + +# Default maximum number of vCPUs per SB/VM: +# unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number +# of vCPUs supported by KVM if that number is exceeded +# > 0 <= number of physical cores --> will be set to the specified number +# > number of physical cores --> will be set to the actual number of physical cores or to the maximum number +# of vCPUs supported by KVM if that number is exceeded +# WARNING: Depending of the architecture, the maximum number of vCPUs supported by KVM is used when +# the actual number of physical cores is greater than it. +# WARNING: Be aware that this value impacts the virtual machine's memory footprint and CPU +# the hotplug functionality. For example, `default_maxvcpus = 240` specifies that until 240 vCPUs +# can be added to a SB/VM, but the memory footprint will be big. Another example, with +# `default_maxvcpus = 8` the memory footprint will be small, but 8 will be the maximum number of +# vCPUs supported by the SB/VM. In general, we recommend that you do not edit this variable, +# unless you know what are you doing. +# NOTICE: on arm platform with gicv2 interrupt controller, set it to 8. +default_maxvcpus = @DEFMAXVCPUS_QEMU@ + +# Bridges can be used to hot plug devices. +# Limitations: +# * Currently only pci bridges are supported +# * Until 30 devices per bridge can be hot plugged. +# * Until 5 PCI bridges can be cold plugged per VM. +# This limitation could be a bug in qemu or in the kernel +# Default number of bridges per SB/VM: +# unspecified or 0 --> will be set to @DEFBRIDGES@ +# > 1 <= 5 --> will be set to the specified number +# > 5 --> will be set to 5 +default_bridges = @DEFBRIDGES@ + +# Reclaim guest freed memory. +# Enabling this will result in the VM balloon device having f_reporting=on set. +# Then the hypervisor will use it to reclaim guest freed memory. +# This is useful for reducing the amount of memory used by a VM. +# Enabling this feature may sometimes reduce the speed of memory access in +# the VM. +# +# Default false +#reclaim_guest_freed_memory = true + +# Default memory size in MiB for SB/VM. +# If unspecified then it will be set @DEFMEMSZ@ MiB. +default_memory = @DEFMEMSZ@ +# +# Default memory slots per SB/VM. +# If unspecified then it will be set @DEFMEMSLOTS@. +# This is will determine the times that memory will be hotadded to sandbox/VM. +#memory_slots = @DEFMEMSLOTS@ + +# Default maximum memory in MiB per SB / VM +# unspecified or == 0 --> will be set to the actual amount of physical RAM +# > 0 <= amount of physical RAM --> will be set to the specified number +# > amount of physical RAM --> will be set to the actual amount of physical RAM +default_maxmemory = @DEFMAXMEMSZ@ + +# The size in MiB will be plused to max memory of hypervisor. +# It is the memory address space for the NVDIMM devie. +# If set block storage driver (block_device_driver) to "nvdimm", +# should set memory_offset to the size of block device. +# Default 0 +#memory_offset = 0 + +# Specifies virtio-mem will be enabled or not. +# Please note that this option should be used with the command +# "echo 1 > /proc/sys/vm/overcommit_memory". +# Default false +#enable_virtio_mem = true + +# Disable block device from being used for a container's rootfs. +# In case of a storage driver like devicemapper where a container's +# root file system is backed by a block device, the block device is passed +# directly to the hypervisor for performance reasons. +# This flag prevents the block device from being passed to the hypervisor, +# virtio-fs is used instead to pass the rootfs. +disable_block_device_use = @DEFDISABLEBLOCK@ + +# Shared file system type: +# - virtio-fs (default) +# - virtio-9p +# - virtio-fs-nydus +# - none +shared_fs = "@DEFSHAREDFS_QEMU_COCO_DEV_VIRTIOFS@" + +# Path to vhost-user-fs daemon. +virtio_fs_daemon = "@DEFVIRTIOFSDAEMON@" + +# List of valid annotations values for the virtiofs daemon +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: @DEFVALIDVIRTIOFSDAEMONPATHS@ +valid_virtio_fs_daemon_paths = @DEFVALIDVIRTIOFSDAEMONPATHS@ + +# Default size of DAX cache in MiB +virtio_fs_cache_size = @DEFVIRTIOFSCACHESIZE@ + +# Default size of virtqueues +virtio_fs_queue_size = @DEFVIRTIOFSQUEUESIZE@ + +# Extra args for virtiofsd daemon +# +# Format example: +# ["--arg1=xxx", "--arg2=yyy"] +# Examples: +# Set virtiofsd log level to debug : ["--log-level=debug"] +# +# see `virtiofsd -h` for possible options. +virtio_fs_extra_args = @DEFVIRTIOFSEXTRAARGS@ + +# Cache mode: +# +# - never +# Metadata, data, and pathname lookup are not cached in guest. They are +# always fetched from host and any changes are immediately pushed to host. +# +# - auto +# Metadata and pathname lookup cache expires after a configured amount of +# time (default is 1 second). Data is cached while the file is open (close +# to open consistency). +# +# - always +# Metadata, data, and pathname lookup are cached in guest and never expire. +virtio_fs_cache = "@DEFVIRTIOFSCACHE@" + +# Block storage driver to be used for the hypervisor in case the container +# rootfs is backed by a block device. This is virtio-scsi, virtio-blk +# or nvdimm. +block_device_driver = "@DEFBLOCKSTORAGEDRIVER_QEMU@" + +# aio is the I/O mechanism used by qemu +# Options: +# +# - threads +# Pthread based disk I/O. +# +# - native +# Native Linux I/O. +# +# - io_uring +# Linux io_uring API. This provides the fastest I/O operations on Linux, requires kernel>5.1 and +# qemu >=5.0. +block_device_aio = "@DEFBLOCKDEVICEAIO_QEMU@" + +# Specifies cache-related options will be set to block devices or not. +# Default false +#block_device_cache_set = true + +# Specifies cache-related options for block devices. +# Denotes whether use of O_DIRECT (bypass the host page cache) is enabled. +# Default false +#block_device_cache_direct = true + +# Specifies cache-related options for block devices. +# Denotes whether flush requests for the device are ignored. +# Default false +#block_device_cache_noflush = true + +# Enable iothreads (data-plane) to be used. This causes IO to be +# handled in a separate IO thread. This is currently only implemented +# for SCSI. +# +enable_iothreads = @DEFENABLEIOTHREADS@ + +# Enable pre allocation of VM RAM, default false +# Enabling this will result in lower container density +# as all of the memory will be allocated and locked +# This is useful when you want to reserve all the memory +# upfront or in the cases where you want memory latencies +# to be very predictable +# Default false +#enable_mem_prealloc = true + +# Enable huge pages for VM RAM, default false +# Enabling this will result in the VM memory +# being allocated using huge pages. +# This is useful when you want to use vhost-user network +# stacks within the container. This will automatically +# result in memory pre allocation +#enable_hugepages = true + +# Enable vhost-user storage device, default false +# Enabling this will result in some Linux reserved block type +# major range 240-254 being chosen to represent vhost-user devices. +enable_vhost_user_store = @DEFENABLEVHOSTUSERSTORE@ + +# The base directory specifically used for vhost-user devices. +# Its sub-path "block" is used for block devices; "block/sockets" is +# where we expect vhost-user sockets to live; "block/devices" is where +# simulated block device nodes for vhost-user devices to live. +vhost_user_store_path = "@DEFVHOSTUSERSTOREPATH@" + +# Enable vIOMMU, default false +# Enabling this will result in the VM having a vIOMMU device +# This will also add the following options to the kernel's +# command line: intel_iommu=on,iommu=pt +#enable_iommu = true + +# Enable IOMMU_PLATFORM, default false +# Enabling this will result in the VM device having iommu_platform=on set +#enable_iommu_platform = true + +# List of valid annotations values for the vhost user store path +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: @DEFVALIDVHOSTUSERSTOREPATHS@ +valid_vhost_user_store_paths = @DEFVALIDVHOSTUSERSTOREPATHS@ + +# The timeout for reconnecting on non-server spdk sockets when the remote end goes away. +# qemu will delay this many seconds and then attempt to reconnect. +# Zero disables reconnecting, and the default is zero. +vhost_user_reconnect_timeout_sec = 0 + +# Enable file based guest memory support. The default is an empty string which +# will disable this feature. In the case of virtio-fs, this is enabled +# automatically and '/dev/shm' is used as the backing folder. +# This option will be ignored if VM templating is enabled. +#file_mem_backend = "@DEFFILEMEMBACKEND@" + +# List of valid annotations values for the file_mem_backend annotation +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: @DEFVALIDFILEMEMBACKENDS@ +valid_file_mem_backends = @DEFVALIDFILEMEMBACKENDS@ + +# -pflash can add image file to VM. The arguments of it should be in format +# of ["/path/to/flash0.img", "/path/to/flash1.img"] +pflashes = [] + +# This option changes the default hypervisor and kernel parameters +# to enable debug output where available. +# +# Default false +#enable_debug = true + +# This option allows to add an extra HMP or QMP socket when `enable_debug = true` +# +# WARNING: Anyone with access to the extra socket can take full control of +# Qemu. This is for debugging purpose only and must *NEVER* be used in +# production. +# +# Valid values are : +# - "hmp" +# - "qmp" +# - "qmp-pretty" (same as "qmp" with pretty json formatting) +# +# If set to the empty string "", no extra monitor socket is added. This is +# the default. +#extra_monitor_socket = "hmp" + +# Disable the customizations done in the runtime when it detects +# that it is running on top a VMM. This will result in the runtime +# behaving as it would when running on bare metal. +# +#disable_nesting_checks = true + +# This is the msize used for 9p shares. It is the number of bytes +# used for 9p packet payload. +#msize_9p = @DEFMSIZE9P@ + +# If false and nvdimm is supported, use nvdimm device to plug guest image. +# Otherwise virtio-block device is used. +# +# nvdimm is not supported when `confidential_guest = true`. +# +# Default is false +#disable_image_nvdimm = true + +# VFIO devices are hotplugged on a bridge by default. +# Enable hotplugging on root bus. This may be required for devices with +# a large PCI bar, as this is a current limitation with hotplugging on +# a bridge. +# Default false +#hotplug_vfio_on_root_bus = true + +# Enable hot-plugging of VFIO devices to a bridge-port, +# root-port or switch-port. +# The default setting is "no-port" +#hot_plug_vfio = "root-port" + +# In a confidential compute environment hot-plugging can compromise +# security. +# Enable cold-plugging of VFIO devices to a bridge-port, +# root-port or switch-port. +# The default setting is "no-port", which means disabled. +#cold_plug_vfio = "root-port" + +# Before hot plugging a PCIe device, you need to add a pcie_root_port device. +# Use this parameter when using some large PCI bar devices, such as Nvidia GPU +# The value means the number of pcie_root_port +# This value is valid when hotplug_vfio_on_root_bus is true and machine_type is "q35" +# Default 0 +#pcie_root_port = 2 + +# Before hot plugging a PCIe device onto a switch port, you need add a pcie_switch_port device fist. +# Use this parameter when using some large PCI bar devices, such as Nvidia GPU +# The value means how many devices attached onto pcie_switch_port will be created. +# This value is valid when hotplug_vfio_on_root_bus is true, and machine_type is "q35" +# Default 0 +#pcie_switch_port = 2 + +# If vhost-net backend for virtio-net is not desired, set to true. Default is false, which trades off +# security (vhost-net runs ring0) for network I/O performance. +#disable_vhost_net = true + +# +# Default entropy source. +# The path to a host source of entropy (including a real hardware RNG) +# /dev/urandom and /dev/random are two main options. +# Be aware that /dev/random is a blocking source of entropy. If the host +# runs out of entropy, the VMs boot time will increase leading to get startup +# timeouts. +# The source of entropy /dev/urandom is non-blocking and provides a +# generally acceptable source of entropy. It should work well for pretty much +# all practical purposes. +#entropy_source= "@DEFENTROPYSOURCE@" + +# List of valid annotations values for entropy_source +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: @DEFVALIDENTROPYSOURCES@ +valid_entropy_sources = @DEFVALIDENTROPYSOURCES@ + +# Path to OCI hook binaries in the *guest rootfs*. +# This does not affect host-side hooks which must instead be added to +# the OCI spec passed to the runtime. +# +# You can create a rootfs with hooks by customizing the osbuilder scripts: +# https://github.com/kata-containers/kata-containers/tree/main/tools/osbuilder +# +# Hooks must be stored in a subdirectory of guest_hook_path according to their +# hook type, i.e. "guest_hook_path/{prestart,poststart,poststop}". +# The agent will scan these directories for executable files and add them, in +# lexicographical order, to the lifecycle of the guest container. +# Hooks are executed in the runtime namespace of the guest. See the official documentation: +# https://github.com/opencontainers/runtime-spec/blob/v1.0.1/config.md#posix-platform-hooks +# Warnings will be logged if any error is encountered while scanning for hooks, +# but it will not abort container execution. +#guest_hook_path = "/usr/share/oci/hooks" + +# Enable connection to Quote Generation Service (QGS) +# The "tdx_quote_generation_service_socket_port" parameter configures how QEMU connects to the TDX Quote Generation Service (QGS). +# This connection is essential for Trusted Domain (TD) attestation, as QGS signs the TDREPORT sent by QEMU via the GetQuote hypercall. +# By default QGS runs on vsock port 4050, but can be modified by the host admin. For QEMU's tdx-guest object, this connection needs to +# be specified in a JSON format, for example: +# -object '{"qom-type":"tdx-guest","id":"tdx","quote-generation-socket":{"type":"vsock","cid":"2","port":"4050"}}' +# It's important to note that setting "tdx_quote_generation_service_socket_port" to 0 enables communication via Unix Domain Sockets (UDS). +# To activate UDS, the QGS service itself must be launched with the "-port=0" parameter and the UDS will always be located at /var/run/tdx-qgs/qgs.socket. +# -object '{"qom-type":"tdx-guest","id":"tdx","quote-generation-socket":{"type":"unix","path":"/var/run/tdx-qgs/qgs.socket"}}' +# tdx_quote_generation_service_socket_port = @QEMUTDXQUOTEGENERATIONSERVICESOCKETPORT@ + +# +# Use rx Rate Limiter to control network I/O inbound bandwidth(size in bits/sec for SB/VM). +# In Qemu, we use classful qdiscs HTB(Hierarchy Token Bucket) to discipline traffic. +# Default 0-sized value means unlimited rate. +#rx_rate_limiter_max_rate = 0 +# Use tx Rate Limiter to control network I/O outbound bandwidth(size in bits/sec for SB/VM). +# In Qemu, we use classful qdiscs HTB(Hierarchy Token Bucket) and ifb(Intermediate Functional Block) +# to discipline traffic. +# Default 0-sized value means unlimited rate. +#tx_rate_limiter_max_rate = 0 + +# Set where to save the guest memory dump file. +# If set, when GUEST_PANICKED event occurred, +# guest memeory will be dumped to host filesystem under guest_memory_dump_path, +# This directory will be created automatically if it does not exist. +# +# The dumped file(also called vmcore) can be processed with crash or gdb. +# +# WARNING: +# Dump guest’s memory can take very long depending on the amount of guest memory +# and use much disk space. +#guest_memory_dump_path="/var/crash/kata" + +# If enable paging. +# Basically, if you want to use "gdb" rather than "crash", +# or need the guest-virtual addresses in the ELF vmcore, +# then you should enable paging. +# +# See: https://www.qemu.org/docs/master/qemu-qmp-ref.html#Dump-guest-memory for details +#guest_memory_dump_paging=false + +# use legacy serial for guest console if available and implemented for architecture. Default false +#use_legacy_serial = true + +# disable applying SELinux on the VMM process (default false) +disable_selinux=@DEFDISABLESELINUX@ + +# disable applying SELinux on the container process +# If set to false, the type `container_t` is applied to the container process by default. +# Note: To enable guest SELinux, the guest rootfs must be CentOS that is created and built +# with `SELINUX=yes`. +# (default: true) +disable_guest_selinux=@DEFDISABLEGUESTSELINUX@ + + +[factory] +# VM templating support. Once enabled, new VMs are created from template +# using vm cloning. They will share the same initial kernel, initramfs and +# agent memory by mapping it readonly. It helps speeding up new container +# creation and saves a lot of memory if there are many kata containers running +# on the same host. +# +# When disabled, new VMs are created from scratch. +# +# Note: Requires "initrd=" to be set ("image=" is not supported). +# +# Default false +#enable_template = true + +# Specifies the path of template. +# +# Default "/run/vc/vm/template" +#template_path = "/run/vc/vm/template" + +# The number of caches of VMCache: +# unspecified or == 0 --> VMCache is disabled +# > 0 --> will be set to the specified number +# +# VMCache is a function that creates VMs as caches before using it. +# It helps speed up new container creation. +# The function consists of a server and some clients communicating +# through Unix socket. The protocol is gRPC in protocols/cache/cache.proto. +# The VMCache server will create some VMs and cache them by factory cache. +# It will convert the VM to gRPC format and transport it when gets +# requestion from clients. +# Factory grpccache is the VMCache client. It will request gRPC format +# VM and convert it back to a VM. If VMCache function is enabled, +# kata-runtime will request VM from factory grpccache when it creates +# a new sandbox. +# +# Default 0 +#vm_cache_number = 0 + +# Specify the address of the Unix socket that is used by VMCache. +# +# Default /var/run/kata-containers/cache.sock +#vm_cache_endpoint = "/var/run/kata-containers/cache.sock" + +[agent.@PROJECT_TYPE@] +# If enabled, make the agent display debug-level messages. +# (default: disabled) +#enable_debug = true + +# Enable agent tracing. +# +# If enabled, the agent will generate OpenTelemetry trace spans. +# +# Notes: +# +# - If the runtime also has tracing enabled, the agent spans will be +# associated with the appropriate runtime parent span. +# - If enabled, the runtime will wait for the container to shutdown, +# increasing the container shutdown time slightly. +# +# (default: disabled) +#enable_tracing = true + +# Comma separated list of kernel modules and their parameters. +# These modules will be loaded in the guest kernel using modprobe(8). +# The following example can be used to load two kernel modules with parameters +# - kernel_modules=["e1000e InterruptThrottleRate=3000,3000,3000 EEE=1", "i915 enable_ppgtt=0"] +# The first word is considered as the module name and the rest as its parameters. +# Container will not be started when: +# * A kernel module is specified and the modprobe command is not installed in the guest +# or it fails loading the module. +# * The module is not available in the guest or it doesn't met the guest kernel +# requirements, like architecture and version. +# +kernel_modules=[] + +# Enable debug console. + +# If enabled, user can connect guest OS running inside hypervisor +# through "kata-runtime exec " command + +#debug_console_enabled = true + +# Agent dial timeout in millisecond. +# (default: 10) +#dial_timeout_ms = 10 + +# Agent reconnect timeout in millisecond. +# Retry times = reconnect_timeout_ms / dial_timeout_ms (default: 300) +# If you find pod cannot connect to the agent when starting, please +# consider increasing this value to increase the retry times. +# You'd better not change the value of dial_timeout_ms, unless you have an +# idea of what you are doing. +# (default: 3000) +#reconnect_timeout_ms = 3000 + +[agent.@PROJECT_TYPE@.mem_agent] +# Control the mem-agent function enable or disable. +# Default to false +#mem_agent_enable = true + +# Control the mem-agent memcg function disable or enable +# Default to false +#memcg_disable = false + +# Control the mem-agent function swap enable or disable. +# Default to false +#memcg_swap = false + +# Control the mem-agent function swappiness max number. +# Default to 50 +#memcg_swappiness_max = 50 + +# Control the mem-agent memcg function wait period seconds +# Default to 600 +#memcg_period_secs = 600 + +# Control the mem-agent memcg wait period PSI percent limit. +# If the percentage of memory and IO PSI stall time within +# the memcg waiting period for a cgroup exceeds this value, +# then the aging and eviction for this cgroup will not be +# executed after this waiting period. +# Default to 1 +#memcg_period_psi_percent_limit = 1 + +# Control the mem-agent memcg eviction PSI percent limit. +# If the percentage of memory and IO PSI stall time for a cgroup +# exceeds this value during an eviction cycle, the eviction for +# this cgroup will immediately stop and will not resume until +# the next memcg waiting period. +# Default to 1 +#memcg_eviction_psi_percent_limit = 1 + +# Control the mem-agent memcg eviction run aging count min. +# A cgroup will only perform eviction when the number of aging cycles +# in memcg is greater than or equal to memcg_eviction_run_aging_count_min. +# Default to 3 +#memcg_eviction_run_aging_count_min = 3 + +# Control the mem-agent compact function disable or enable +# Default to false +#compact_disable = false + +# Control the mem-agent compaction function wait period seconds +# Default to 600 +#compact_period_secs = 600 + +# Control the mem-agent compaction function wait period PSI percent limit. +# If the percentage of memory and IO PSI stall time within +# the compaction waiting period exceeds this value, +# then the compaction will not be executed after this waiting period. +# Default to 1 +#compact_period_psi_percent_limit = 1 + +# Control the mem-agent compaction function compact PSI percent limit. +# During compaction, the percentage of memory and IO PSI stall time +# is checked every second. If this percentage exceeds +# compact_psi_percent_limit, the compaction process will stop. +# Default to 5 +#compact_psi_percent_limit = 5 + +# Control the maximum number of seconds for each compaction of mem-agent compact function. +# Default to 180 +#compact_sec_max = 180 + +# Control the mem-agent compaction function compact order. +# compact_order is use with compact_threshold. +# Default to 9 +#compact_order = 9 + +# Control the mem-agent compaction function compact threshold. +# compact_threshold is the pages number. +# When examining the /proc/pagetypeinfo, if there's an increase in the +# number of movable pages of orders smaller than the compact_order +# compared to the amount following the previous compaction, +# and this increase surpasses a certain threshold—specifically, +# more than 'compact_threshold' number of pages. +# Or the number of free pages has decreased by 'compact_threshold' +# since the previous compaction. +# then the system should initiate another round of memory compaction. +# Default to 1024 +#compact_threshold = 1024 + +# Control the mem-agent compaction function force compact times. +# After one compaction, if there has not been a compaction within +# the next compact_force_times times, a compaction will be forced +# regardless of the system's memory situation. +# If compact_force_times is set to 0, will do force compaction each time. +# If compact_force_times is set to 18446744073709551615, will never do force compaction. +# Default to 18446744073709551615 +#compact_force_times = 18446744073709551615 + +[runtime] +# If enabled, the runtime will log additional debug messages to the +# system log +# (default: disabled) +#enable_debug = true +# +# Internetworking model +# Determines how the VM should be connected to the +# the container network interface +# Options: +# +# - macvtap +# Used when the Container network interface can be bridged using +# macvtap. +# +# - none +# Used when customize network. Only creates a tap device. No veth pair. +# +# - tcfilter +# Uses tc filter rules to redirect traffic from the network interface +# provided by plugin to a tap interface connected to the VM. +# +internetworking_model="@DEFNETWORKMODEL_QEMU@" + +name="@RUNTIMENAME@" +hypervisor_name="@HYPERVISOR_QEMU@" +agent_name="@PROJECT_TYPE@" + +# disable guest seccomp +# Determines whether container seccomp profiles are passed to the virtual +# machine and applied by the kata agent. If set to true, seccomp is not applied +# within the guest +# (default: true) +disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@ + +# vCPUs pinning settings +# if enabled, each vCPU thread will be scheduled to a fixed CPU +# qualified condition: num(vCPU threads) == num(CPUs in sandbox's CPUSet) +# enable_vcpus_pinning = false + +# Apply a custom SELinux security policy to the container process inside the VM. +# This is used when you want to apply a type other than the default `container_t`, +# so general users should not uncomment and apply it. +# (format: "user:role:type") +# Note: You cannot specify MCS policy with the label because the sensitivity levels and +# categories are determined automatically by high-level container runtimes such as containerd. +#guest_selinux_label="@DEFGUESTSELINUXLABEL@" + +# If enabled, the runtime will create opentracing.io traces and spans. +# (See https://www.jaegertracing.io/docs/getting-started). +# (default: disabled) +#enable_tracing = true + +# Set the full url to the Jaeger HTTP Thrift collector. +# The default if not set will be "http://localhost:14268/api/traces" +#jaeger_endpoint = "" + +# Sets the username to be used if basic auth is required for Jaeger. +#jaeger_user = "" + +# Sets the password to be used if basic auth is required for Jaeger. +#jaeger_password = "" + +# If enabled, the runtime will not create a network namespace for shim and hypervisor processes. +# This option may have some potential impacts to your host. It should only be used when you know what you're doing. +# `disable_new_netns` conflicts with `internetworking_model=tcfilter` and `internetworking_model=macvtap`. It works only +# with `internetworking_model=none`. The tap device will be in the host network namespace and can connect to a bridge +# (like OVS) directly. +# (default: false) +#disable_new_netns = true + +# if enabled, the runtime will add all the kata processes inside one dedicated cgroup. +# The container cgroups in the host are not created, just one single cgroup per sandbox. +# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox. +# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation. +# The sandbox cgroup is constrained if there is no container type annotation. +# See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType +sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY_QEMU@ + +# If enabled, the runtime will attempt to determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In +# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. This is generally helpful +# when a hardware architecture or hypervisor solutions is utilized which does not support CPU and/or memory hotplug. +# Compatibility for determining appropriate sandbox (VM) size: +# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O +# does not yet support sandbox sizing annotations. +# - When running single containers using a tool like ctr, container sizing information will be available. +static_sandbox_resource_mgmt=@DEFSTATICRESOURCEMGMT_QEMU@ + +# If specified, sandbox_bind_mounts identifieds host paths to be mounted (ro) into the sandboxes shared path. +# This is only valid if filesystem sharing is utilized. The provided path(s) will be bindmounted into the shared fs directory. +# If defaults are utilized, these mounts should be available in the guest at `/run/kata-containers/shared/containers/sandbox-mounts` +# These will not be exposed to the container workloads, and are only provided for potential guest services. +sandbox_bind_mounts=@DEFBINDMOUNTS@ + +# VFIO Mode +# Determines how VFIO devices should be be presented to the container. +# Options: +# +# - vfio +# Matches behaviour of OCI runtimes (e.g. runc) as much as +# possible. VFIO devices will appear in the container as VFIO +# character devices under /dev/vfio. The exact names may differ +# from the host (they need to match the VM's IOMMU group numbers +# rather than the host's) +# +# - guest-kernel +# This is a Kata-specific behaviour that's useful in certain cases. +# The VFIO device is managed by whatever driver in the VM kernel +# claims it. This means it will appear as one or more device nodes +# or network interfaces depending on the nature of the device. +# Using this mode requires specially built workloads that know how +# to locate the relevant device interfaces within the VM. +# +vfio_mode="@DEFVFIOMODE@" + +# If enabled, the runtime will not create Kubernetes emptyDir mounts on the guest filesystem. Instead, emptyDir mounts will +# be created on the host and shared via virtio-fs. This is potentially slower, but allows sharing of files from host to guest. +disable_guest_empty_dir=@DEFDISABLEGUESTEMPTYDIR@ + +# Enabled experimental feature list, format: ["a", "b"]. +# Experimental features are features not stable enough for production, +# they may break compatibility, and are prepared for a big version bump. +# Supported experimental features: +# (default: []) +experimental=@DEFAULTEXPFEATURES@ + +# If enabled, user can run pprof tools with shim v2 process through kata-monitor. +# (default: false) +# enable_pprof = true