diff --git a/src/runtime-rs/Makefile b/src/runtime-rs/Makefile index b9497d846c..64cb4977b3 100644 --- a/src/runtime-rs/Makefile +++ b/src/runtime-rs/Makefile @@ -291,9 +291,24 @@ ifneq (,$(CLHCMD)) CONFIGS += $(CONFIG_CLH) + CONFIG_FILE_CLH_AZURE = configuration-clh-azure-runtime-rs.toml + CONFIG_CLH_AZURE = config/$(CONFIG_FILE_CLH_AZURE) + CONFIG_CLH_AZURE_IN = $(CONFIG_CLH_AZURE).in + + CONFIG_PATH_CLH_AZURE = $(abspath $(CONFDIR)/$(CONFIG_FILE_CLH_AZURE)) + CONFIG_PATHS += $(CONFIG_PATH_CLH_AZURE) + + SYSCONFIG_CLH_AZURE = $(abspath $(SYSCONFDIR)/$(CONFIG_FILE_CLH_AZURE)) + SYSCONFIG_PATHS += $(SYSCONFIG_CLH_AZURE) + + CONFIGS += $(CONFIG_CLH_AZURE) + # CLH-specific options (all should be suffixed by "_CLH") # currently, huge pages are required for virtiofsd support DEFNETWORKMODEL_CLH := tcfilter + IMAGEPATH_CLH_AZURE := $(PKGDATADIR)/kata-containers-mariner.img + KERNELPATH_CLH_AZURE := /usr/share/cloud-hypervisor/vmlinux.bin + DEFSTATICRESOURCEMGMT_CLH_AZURE := true KERNELTYPE_CLH = uncompressed KERNEL_NAME_CLH = $(call MAKE_KERNEL_NAME,$(KERNELTYPE_CLH)) KERNELPATH_CLH = $(KERNELDIR)/$(KERNEL_NAME_CLH) @@ -596,6 +611,7 @@ USER_VARS += QEMUTDXPATH USER_VARS += QEMUTDXVALIDHYPERVISORPATHS USER_VARS += FIRMWAREPATH_CLH USER_VARS += KERNELPATH_CLH +USER_VARS += KERNELPATH_CLH_AZURE USER_VARS += FCCMD USER_VARS += FCPATH USER_VARS += FCVALIDHYPERVISORPATHS @@ -608,6 +624,7 @@ USER_VARS += IMAGENAME USER_VARS += IMAGENAME_NV USER_VARS += IMAGECONFIDENTIALNAME USER_VARS += IMAGEPATH +USER_VARS += IMAGEPATH_CLH_AZURE USER_VARS += IMAGEPATH_NV USER_VARS += IMAGECONFIDENTIALPATH USER_VARS += INITRDNAME @@ -711,6 +728,7 @@ USER_VARS += DEFENABLEVCPUSPINNING_QEMU USER_VARS += DEFSTATICRESOURCEMGMT_DB USER_VARS += DEFSTATICRESOURCEMGMT_FC USER_VARS += DEFSTATICRESOURCEMGMT_CLH +USER_VARS += DEFSTATICRESOURCEMGMT_CLH_AZURE USER_VARS += DEFSTATICRESOURCEMGMT_QEMU USER_VARS += DEFSTATICRESOURCEMGMT_COCO USER_VARS += DEFDISABLEIMAGENVDIMM diff --git 
a/src/runtime-rs/config/configuration-clh-azure-runtime-rs.toml.in b/src/runtime-rs/config/configuration-clh-azure-runtime-rs.toml.in new file mode 100644 index 0000000000..8237012486 --- /dev/null +++ b/src/runtime-rs/config/configuration-clh-azure-runtime-rs.toml.in @@ -0,0 +1,545 @@ +# Copyright (c) 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +# XXX: WARNING: this file is auto-generated. +# XXX: +# XXX: Source file: "@CONFIG_CLH_AZURE_IN@" +# XXX: Project: +# XXX: Name: @PROJECT_NAME@ +# XXX: Type: @PROJECT_TYPE@ + +[hypervisor.clh] +path = "@CLHPATH@" +kernel = "@KERNELPATH_CLH_AZURE@" +image = "@IMAGEPATH_CLH_AZURE@" + +# rootfs filesystem type: +# - ext4 (default) +# - xfs +# - erofs +rootfs_type = @DEFROOTFSTYPE@ + +# Block storage driver to be used when the VM rootfs is backed +# by a block device. +# +# virtio-pmem is not supported with Cloud Hypervisor. +vm_rootfs_driver = "@VMROOTFSDRIVER_CLH@" + +# Path to the firmware. +# If you want Cloud Hypervisor to use a specific firmware, set its path below. +firmware = "@FIRMWAREPATH@" + +# List of valid annotation names for the hypervisor +# Each member of the list is a regular expression, which is the base name +# of the annotation, e.g. "path" for "io.katacontainers.config.hypervisor.path" +enable_annotations = @DEFENABLEANNOTATIONS@ + +# List of valid annotation values for the hypervisor +# Each member of the list is a path pattern as described by glob(3). +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: @CLHVALIDHYPERVISORPATHS@ +valid_hypervisor_paths = @CLHVALIDHYPERVISORPATHS@ + +# List of valid annotation values for ctlpath +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: +valid_ctlpaths = [] + +# Optional space-separated list of options to pass to the guest kernel. +# For example, use `kernel_params = "vsyscall=emulate"` if you are having +# trouble running pre-2.15 glibc. 
+# +# WARNING: - any parameter specified here will take priority over the default +# parameter value of the same name used to start the virtual machine. +# Do not set values here unless you understand the impact of doing so as you +# may stop the virtual machine from booting. +# To see the list of default parameters, enable hypervisor debug, create a +# container and look for 'default-kernel-parameters' log entries. +kernel_params = "@KERNELPARAMS@" + +# Default number of vCPUs per SB/VM: +# unspecified or 0 --> will be set to 1 +# < 0 --> will be set to the actual number of physical cores +# > 0 <= number of physical cores --> will be set to the specified number +# > number of physical cores --> will be set to the actual number of physical cores +default_vcpus = @DEFVCPUS@ + +# Default maximum number of vCPUs per SB/VM: +# unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number +# of vCPUs supported by KVM if that number is exceeded +# > 0 <= number of physical cores --> will be set to the specified number +# > number of physical cores --> will be set to the actual number of physical cores or to the maximum number +# of vCPUs supported by KVM if that number is exceeded +# WARNING: Depending on the architecture, the maximum number of vCPUs supported by KVM is used when +# the actual number of physical cores is greater than it. +# WARNING: Be aware that this value impacts the virtual machine's memory footprint and the +# CPU hotplug functionality. For example, `default_maxvcpus = 240` specifies that up to 240 vCPUs +# can be added to a SB/VM, but the memory footprint will be big. Another example, with +# `default_maxvcpus = 8` the memory footprint will be small, but 8 will be the maximum number of +# vCPUs supported by the SB/VM. In general, we recommend that you do not edit this variable, +# unless you know what you are doing. +default_maxvcpus = @DEFMAXVCPUS@ + +# Default memory size in MiB for SB/VM. 
+# If unspecified then it will be set to @DEFMEMSZ@ MiB. +default_memory = @DEFMEMSZ@ + +# Shared file system type: +# - virtio-fs +# - virtio-fs-nydus +# - none +shared_fs = "@DEFSHAREDFS_CLH_VIRTIOFS@" + +# Path to vhost-user-fs daemon. +virtio_fs_daemon = "@DEFVIRTIOFSDAEMON@" + +# Default size of DAX cache in MiB +virtio_fs_cache_size = @DEFVIRTIOFSCACHESIZE@ + +# Default size of virtqueues +virtio_fs_queue_size = @DEFVIRTIOFSQUEUESIZE@ + +# Extra args for virtiofsd daemon +# +# Format example: +# ["-o", "arg1=xxx,arg2", "-o", "hello world", "--arg3=yyy"] +# Examples: +# Set virtiofsd log level to debug : ["-o", "log_level=debug"] or ["-d"] +# +# see `virtiofsd -h` for possible options. +virtio_fs_extra_args = @DEFVIRTIOFSEXTRAARGS@ + +# Cache mode: +# +# - never +# Metadata, data, and pathname lookup are not cached in guest. They are +# always fetched from host and any changes are immediately pushed to host. +# +# - auto +# Metadata and pathname lookup cache expires after a configured amount of +# time (default is 1 second). Data is cached while the file is open (close +# to open consistency). +# +# - always +# Metadata, data, and pathname lookup are cached in guest and never expire. +virtio_fs_cache = "@DEFVIRTIOFSCACHE@" + +# Bridges can be used to hot plug devices. +# Limitations: +# * Currently only PCI bridges are supported +# * Up to 30 devices per bridge can be hot plugged. +# * Up to 5 PCI bridges can be cold plugged per VM. +# This limitation could be a bug in the kernel +# Default number of bridges per SB/VM: +# unspecified or 0 --> will be set to @DEFBRIDGES@ +# > 1 <= 5 --> will be set to the specified number +# > 5 --> will be set to 5 +default_bridges = @DEFBRIDGES@ + +# Reclaim guest freed memory. +# Enabling this will result in the VM balloon device having f_reporting=on set. +# Then the hypervisor will use it to reclaim guest freed memory. +# This is useful for reducing the amount of memory used by a VM. 
+# Enabling this feature may sometimes reduce the speed of memory access in +# the VM. +# +# Default false +reclaim_guest_freed_memory = false + +# Block device driver to be used by the hypervisor when a container's storage +# is backed by a block device or a file. This driver facilitates attaching the +# storage directly to the guest VM. +block_device_driver = "virtio-blk-pci" + +# Specifies cache-related options for block devices. +# Denotes whether use of O_DIRECT (bypass the host page cache) is enabled. +# Default false +block_device_cache_direct = false + +# Bandwidth rate limiter options +# +# disk_rate_limiter_bw_max_rate controls disk I/O bandwidth (size in bits/sec +# for SB/VM). +# The same value is used for inbound and outbound bandwidth. +# Default 0-sized value means unlimited rate. +disk_rate_limiter_bw_max_rate = 0 + +# disk_rate_limiter_bw_one_time_burst increases the initial max rate and this +# initial extra credit does *NOT* affect the overall limit and can be used for +# an *initial* burst of data. +# This is *optional* and only takes effect if disk_rate_limiter_bw_max_rate is +# set to a non-zero value. +disk_rate_limiter_bw_one_time_burst = 0 + +# Operation rate limiter options +# +# disk_rate_limiter_ops_max_rate controls the disk I/O operation rate (in ops/sec +# for SB/VM). +# The same value is used for inbound and outbound operations. +# Default 0-sized value means unlimited rate. +disk_rate_limiter_ops_max_rate = 0 + +# disk_rate_limiter_ops_one_time_burst increases the initial max rate and this +# initial extra credit does *NOT* affect the overall limit and can be used for +# an *initial* burst of data. +# This is *optional* and only takes effect if disk_rate_limiter_ops_max_rate is +# set to a non-zero value. +disk_rate_limiter_ops_one_time_burst = 0 + +# Virtio queue size. Size: byte. 
default 128 +queue_size = 128 + +# Block device multi-queue, default 1 +num_queues = 1 + +# network_queues configures the number of virtio-net queue pairs (RX/TX) exposed to the guest. +# Setting network_queues = N creates N RX queues and N TX queues (i.e., N queue pairs). +# More queues can improve network throughput and reduce per-queue contention by allowing packet processing to scale +# across multiple vCPUs/threads (subject to host/guest capabilities and backend configuration such as vhost-net). +# Increasing this value consumes more resources (e.g., virtqueue state, interrupts/MSI-X vectors, backend threads), +# so it should typically not exceed the number of vCPUs or the practical parallelism of the networking backend. +# Default: 1, Range: 1..=256 +network_queues = @DEFNETQUEUES@ + +# Enable pre allocation of VM RAM, default false +# Enabling this will result in lower container density +# as all of the memory will be allocated and locked +# This is useful when you want to reserve all the memory +# upfront or in the cases where you want memory latencies +# to be very predictable +# Default false +enable_mem_prealloc = false + +# Enable huge pages for VM RAM, default false +# Enabling this will result in the VM memory +# being allocated using huge pages. +# This is useful when you want to use vhost-user network +# stacks within the container. This will automatically +# result in memory pre allocation +enable_hugepages = false + +# Enable running clh VMM as a non-root user. +# By default clh VMM run as root. When this is set to true, clh VMM process runs as +# a non-root random user. See documentation for the limitations of this mode. +rootless = false + +# Disable the 'seccomp' feature from Cloud Hypervisor, firecracker or dragonball, default false +disable_seccomp = false + +# This option changes the default hypervisor and kernel parameters +# to enable debug output where available. 
+# +# Default false +enable_debug = false + +# Disable the customizations done in the runtime when it detects +# that it is running on top of a VMM. This will result in the runtime +# behaving as it would when running on bare metal. +# +disable_nesting_checks = false + +# Path to OCI hook binaries in the *guest rootfs*. +# This does not affect host-side hooks which must instead be added to +# the OCI spec passed to the runtime. +# +# You can create a rootfs with hooks by customizing the osbuilder scripts: +# https://github.com/kata-containers/kata-containers/tree/main/tools/osbuilder +# +# Hooks must be stored in a subdirectory of guest_hook_path according to their +# hook type, i.e. "guest_hook_path/{prestart,poststart,poststop}". +# The agent will scan these directories for executable files and add them, in +# lexicographical order, to the lifecycle of the guest container. +# Hooks are executed in the runtime namespace of the guest. See the official documentation: +# https://github.com/opencontainers/runtime-spec/blob/v1.0.1/config.md#posix-platform-hooks +# Warnings will be logged if any error is encountered while scanning for hooks, +# but it will not abort container execution. +# Recommended value when enabling: "/usr/share/oci/hooks" +guest_hook_path = "" + +# Enable swap in the guest. Default false. +# When enable_guest_swap is enabled, insert a raw file to the guest as the swap device. +enable_guest_swap = false + +# If enable_guest_swap is enabled, the swap device will be created in the guest +# at this path. Default "/run/kata-containers/swap". +guest_swap_path = "/run/kata-containers/swap" + +# The percentage of the total memory to be used as the swap device. +# Default 100. +guest_swap_size_percent = 100 + +# The threshold in seconds to create the swap device in the guest. +# Kata will wait guest_swap_create_threshold_secs seconds before creating the swap device. +# Default 60. 
+guest_swap_create_threshold_secs = 60 + +[agent.@PROJECT_TYPE@] +container_pipe_size = @PIPESIZE@ +# If enabled, make the agent display debug-level messages. +# (default: disabled) +enable_debug = false + +# Enable agent tracing. +# +# If enabled, the agent will generate OpenTelemetry trace spans. +# +# Notes: +# +# - If the runtime also has tracing enabled, the agent spans will be +# associated with the appropriate runtime parent span. +# - If enabled, the runtime will wait for the container to shut down, +# increasing the container shutdown time slightly. +# +# (default: disabled) +enable_tracing = false + +# Enable debug console. + +# If enabled, users can connect to the guest OS running inside the hypervisor +# through the "kata-runtime exec <sandbox-id>" command + +debug_console_enabled = false + +# Agent dial timeout in milliseconds. +# (default: 10) +dial_timeout_ms = 10 + +# Agent reconnect timeout in milliseconds. +# Retry times = reconnect_timeout_ms / dial_timeout_ms (default: 300) +# If you find pod cannot connect to the agent when starting, please +# consider increasing this value to increase the retry times. +# You'd better not change the value of dial_timeout_ms, unless you have an +# idea of what you are doing. +# (default: 3000) +reconnect_timeout_ms = 3000 + +# Create Container Request Timeout +# This timeout value is used to set the maximum duration for the agent to process a CreateContainerRequest. +# It's also used to ensure that workloads, especially those involving large image pulls within the guest, +# have sufficient time to complete. +# +# Effective Timeout Determination: +# The effective timeout for a CreateContainerRequest is determined by taking the minimum of the following two values: +# - create_container_timeout: The timeout value configured for creating containers (default: 30 seconds). 
+# - runtime-request-timeout: The timeout value specified in the Kubelet configuration described as the link below: +# (https://kubernetes.io/docs/reference/command-line-tools-reference/kubelet/#:~:text=runtime%2Drequest%2Dtimeout) +# Defaults to @DEFCREATECONTAINERTIMEOUT@ second(s) +create_container_timeout = @DEFCREATECONTAINERTIMEOUT@ + +[agent.@PROJECT_TYPE@.mem_agent] +# Control the mem-agent function enable or disable. +# Default to false +mem_agent_enable = false + +# Control the mem-agent memcg function disable or enable +# Default to false +memcg_disable = false + +# Control the mem-agent function swap enable or disable. +# Default to false +memcg_swap = false + +# Control the mem-agent function swappiness max number. +# Default to 50 +memcg_swappiness_max = 50 + +# Control the mem-agent memcg function wait period seconds +# Default to 600 +memcg_period_secs = 600 + +# Control the mem-agent memcg wait period PSI percent limit. +# If the percentage of memory and IO PSI stall time within +# the memcg waiting period for a cgroup exceeds this value, +# then the aging and eviction for this cgroup will not be +# executed after this waiting period. +# Default to 1 +memcg_period_psi_percent_limit = 1 + +# Control the mem-agent memcg eviction PSI percent limit. +# If the percentage of memory and IO PSI stall time for a cgroup +# exceeds this value during an eviction cycle, the eviction for +# this cgroup will immediately stop and will not resume until +# the next memcg waiting period. +# Default to 1 +memcg_eviction_psi_percent_limit = 1 + +# Control the mem-agent memcg eviction run aging count min. +# A cgroup will only perform eviction when the number of aging cycles +# in memcg is greater than or equal to memcg_eviction_run_aging_count_min. 
+# Default to 3 +memcg_eviction_run_aging_count_min = 3 + +# Control the mem-agent compact function disable or enable +# Default to false +compact_disable = false + +# Control the mem-agent compaction function wait period seconds +# Default to 600 +compact_period_secs = 600 + +# Control the mem-agent compaction function wait period PSI percent limit. +# If the percentage of memory and IO PSI stall time within +# the compaction waiting period exceeds this value, +# then the compaction will not be executed after this waiting period. +# Default to 1 +compact_period_psi_percent_limit = 1 + +# Control the mem-agent compaction function compact PSI percent limit. +# During compaction, the percentage of memory and IO PSI stall time +# is checked every second. If this percentage exceeds +# compact_psi_percent_limit, the compaction process will stop. +# Default to 5 +compact_psi_percent_limit = 5 + +# Control the maximum number of seconds for each compaction of mem-agent compact function. +# Default to 300 +compact_sec_max = 300 + +# Control the mem-agent compaction function compact order. +# compact_order is used with compact_threshold. +# Default to 9 +compact_order = 9 + +# Control the mem-agent compaction function compact threshold. +# compact_threshold is a number of pages. +# When examining /proc/pagetypeinfo, if there's an increase in the +# number of movable pages of orders smaller than the compact_order +# compared to the amount following the previous compaction, +# and this increase surpasses a certain threshold—specifically, +# more than 'compact_threshold' number of pages, +# or the number of free pages has decreased by 'compact_threshold' +# since the previous compaction, +# then the system should initiate another round of memory compaction. +# Default to 1024 +compact_threshold = 1024 + +# Control the mem-agent compaction function force compact times. 
+# After one compaction, if there has not been a compaction within +# the next compact_force_times times, a compaction will be forced +# regardless of the system's memory situation. +# If compact_force_times is set to 0, will do force compaction each time. +# If compact_force_times is set to 18446744073709551615, will never do force compaction. +# Default to 18446744073709551615 +# Note: Using a large but valid u64 value (within i64::MAX range) instead of u64::MAX to avoid TOML parser issues +# Using 9223372036854775807 (i64::MAX) which is effectively "never" for practical purposes +compact_force_times = 9223372036854775807 + +[runtime] +# If enabled, the runtime will log additional debug messages to the +# system log +# (default: disabled) +enable_debug = false + +# If enabled, it means that 1) if the runtime exits abnormally, +# the cleanup process will be skipped, and 2) the runtime will not exit +# even if the health check fails. +# This option is typically used to retain abnormal information for debugging. +# (default: false) +keep_abnormal = false + +# Internetworking model +# Determines how the VM should be connected to the +# container network interface +# Options: +# +# - bridged (Deprecated) +# Uses a linux bridge to interconnect the container interface to +# the VM. Works for most cases except macvlan and ipvlan. +# ***NOTE: This feature has been deprecated with plans to remove this +# feature in the future. Please use other network models listed below. +# +# +# - macvtap +# Used when the Container network interface can be bridged using +# macvtap. +# +# - none +# Used when customizing the network. Only creates a tap device. No veth pair. +# +# - tcfilter +# Uses tc filter rules to redirect traffic from the network interface +# provided by plugin to a tap interface connected to the VM. 
+# +internetworking_model = "@DEFNETWORKMODEL_CLH@" + +name = "@RUNTIMENAME@" +hypervisor_name = "@HYPERVISOR_NAME_CLH@" +agent_name = "@PROJECT_TYPE@" + +# disable guest seccomp +# Determines whether container seccomp profiles are passed to the virtual +# machine and applied by the kata agent. If set to true, seccomp is not applied +# within the guest +# (default: true) +disable_guest_seccomp = @DEFDISABLEGUESTSECCOMP@ + +# If enabled, the runtime will create opentracing.io traces and spans. +# (See https://www.jaegertracing.io/docs/getting-started). +# (default: disabled) +enable_tracing = false + +# Set the full url to the Jaeger HTTP Thrift collector. +# The default if not set will be "http://localhost:14268/api/traces" +jaeger_endpoint = "" + +# Sets the username to be used if basic auth is required for Jaeger. +jaeger_user = "" + +# Sets the password to be used if basic auth is required for Jaeger. +jaeger_password = "" + +# If enabled, the runtime will not create a network namespace for shim and hypervisor processes. +# This option may have some potential impacts to your host. It should only be used when you know what you're doing. +# `disable_new_netns` conflicts with `internetworking_model=bridged` and `internetworking_model=macvtap`. It works only +# with `internetworking_model=none`. The tap device will be in the host network namespace and can connect to a bridge +# (like OVS) directly. +# (default: false) +disable_new_netns = false + +# if enabled, the runtime will add all the kata processes inside one dedicated cgroup. +# The container cgroups in the host are not created, just one single cgroup per sandbox. +# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox. +# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation. +# The sandbox cgroup is constrained if there is no container type annotation. 
+# See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType +sandbox_cgroup_only = @DEFSANDBOXCGROUPONLY_CLH@ + +# Enabled experimental feature list, format: ["a", "b"]. +# Experimental features are features not stable enough for production, +# they may break compatibility, and are prepared for a big version bump. +# Supported experimental features: +# (default: []) +experimental = @DEFAULTEXPFEATURES@ + +# If enabled, user can run pprof tools with shim v2 process through kata-monitor. +# (default: false) +enable_pprof = false + +# If enabled, the runtime will attempt to determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In +# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. This is generally helpful +# when a hardware architecture or hypervisor solution is utilized which does not support CPU and/or memory hotplug. +# Compatibility for determining appropriate sandbox (VM) size: +# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O +# does not yet support sandbox sizing annotations. +# - When running single containers using a tool like ctr, container sizing information will be available. +static_sandbox_resource_mgmt = @DEFSTATICRESOURCEMGMT_CLH_AZURE@ + +# If specified, sandbox_bind_mounts identifies host paths to be mounted (ro, rw) into the sandbox's shared path. +# This is only valid if filesystem sharing is utilized. The provided path(s) will be bind mounted into the shared fs directory. +# If defaults are utilized, these mounts should be available in the guest at `/run/kata-containers/shared/containers/sandbox-mounts` +# These will not be exposed to the container workloads, and are only provided for potential guest services. +# Now it supports three kinds of bind mount format: +# - "/path/to", default readonly mode. 
+# - "/path/to:ro", readonly mode. +# - "/path/to:rw", readwrite mode. +sandbox_bind_mounts = @DEFBINDMOUNTS@ + +# Base directory of directly attachable network config. +# Network devices for VM-based containers are allowed to be placed in the +# host netns to eliminate as many hops as possible, which is what we +# called a "Directly Attachable Network". The config, set by special CNI +# plugins, is used to tell the Kata containers what devices are attached +# to the hypervisor. +# (default: /run/kata-containers/dans) +dan_conf = "@DEFDANCONF@" diff --git a/src/runtime/Makefile b/src/runtime/Makefile index 88ef8077ff..2cf4540550 100644 --- a/src/runtime/Makefile +++ b/src/runtime/Makefile @@ -539,9 +539,24 @@ ifneq (,$(CLHCMD)) CONFIGS += $(CONFIG_CLH) + CONFIG_FILE_CLH_AZURE = configuration-clh-azure.toml + CONFIG_CLH_AZURE = config/$(CONFIG_FILE_CLH_AZURE) + CONFIG_CLH_AZURE_IN = $(CONFIG_CLH_AZURE).in + + CONFIG_PATH_CLH_AZURE = $(abspath $(CONFDIR)/$(CONFIG_FILE_CLH_AZURE)) + CONFIG_PATHS += $(CONFIG_PATH_CLH_AZURE) + + SYSCONFIG_CLH_AZURE = $(abspath $(SYSCONFDIR)/$(CONFIG_FILE_CLH_AZURE)) + SYSCONFIG_PATHS += $(SYSCONFIG_CLH_AZURE) + + CONFIGS += $(CONFIG_CLH_AZURE) + # CLH-specific options (all should be suffixed by "_CLH") # currently, huge pages are required for virtiofsd support DEFNETWORKMODEL_CLH := tcfilter + IMAGEPATH_CLH_AZURE := $(PKGDATADIR)/kata-containers-mariner.img + KERNELPATH_CLH_AZURE := /usr/share/cloud-hypervisor/vmlinux.bin + DEFSTATICRESOURCEMGMT_CLH_AZURE := true KERNELTYPE_CLH = uncompressed KERNEL_NAME_CLH = $(call MAKE_KERNEL_NAME,$(KERNELTYPE_CLH)) KERNELPATH_CLH = $(KERNELDIR)/$(KERNEL_NAME_CLH) @@ -666,6 +681,7 @@ USER_VARS += SYSCONFIG USER_VARS += IMAGENAME USER_VARS += IMAGECONFIDENTIALNAME USER_VARS += IMAGEPATH +USER_VARS += IMAGEPATH_CLH_AZURE USER_VARS += IMAGECONFIDENTIALPATH USER_VARS += INITRDNAME USER_VARS += INITRDCONFIDENTIALNAME @@ -704,6 +720,7 @@ USER_VARS += KERNELCONFIDENTIALPATH USER_VARS += 
KERNELCONFIDENTIALPATH_CCA USER_VARS += KERNELSEPATH USER_VARS += KERNELPATH_CLH +USER_VARS += KERNELPATH_CLH_AZURE USER_VARS += KERNELPATH_FC USER_VARS += KERNELPATH_STRATOVIRT USER_VARS += KERNELVIRTIOFSPATH @@ -811,6 +828,7 @@ USER_VARS += DEFSANDBOXCGROUPONLY USER_VARS += DEFSTATICRESOURCEMGMT USER_VARS += DEFSTATICRESOURCEMGMT_QEMU USER_VARS += DEFSTATICRESOURCEMGMT_CLH +USER_VARS += DEFSTATICRESOURCEMGMT_CLH_AZURE USER_VARS += DEFSTATICRESOURCEMGMT_FC USER_VARS += DEFSTATICRESOURCEMGMT_STRATOVIRT USER_VARS += DEFSTATICRESOURCEMGMT_TEE diff --git a/src/runtime/config/configuration-clh-azure.toml.in b/src/runtime/config/configuration-clh-azure.toml.in new file mode 100644 index 0000000000..4a9c0962b2 --- /dev/null +++ b/src/runtime/config/configuration-clh-azure.toml.in @@ -0,0 +1,538 @@ +# Copyright (c) 2019 Ericsson Eurolab Deutschland GmbH +# Copyright (c) 2021 Adobe Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + +# XXX: WARNING: this file is auto-generated. +# XXX: +# XXX: Source file: "@CONFIG_CLH_AZURE_IN@" +# XXX: Project: +# XXX: Name: @PROJECT_NAME@ +# XXX: Type: @PROJECT_TYPE@ + +[hypervisor.clh] +path = "@CLHPATH@" +kernel = "@KERNELPATH_CLH_AZURE@" +image = "@IMAGEPATH_CLH_AZURE@" + +# rootfs filesystem type: +# - ext4 (default) +# - xfs +# - erofs +rootfs_type = @DEFROOTFSTYPE@ + +# Enable running clh VMM as a non-root user. +# By default clh VMM run as root. When this is set to true, clh VMM process runs as +# a non-root random user. See documentation for the limitations of this mode. +rootless = false + +# disable applying SELinux on the VMM process (default false) +disable_selinux = @DEFDISABLESELINUX@ + +# disable applying SELinux on the container process +# If set to false, the type `container_t` is applied to the container process by default. +# Note: To enable guest SELinux, the guest rootfs must be CentOS that is created and built +# with `SELINUX=yes`. 
+# (default: true) +disable_guest_selinux = @DEFDISABLEGUESTSELINUX@ + +# Path to the firmware. +# If you want Cloud Hypervisor to use a specific firmware, set its path below. +# This option is only used when confidential_guest is enabled. +# +# For more information about firmware that can be used with specific TEEs, +# please refer to: +# * Intel TDX: +# - td-shim: https://github.com/confidential-containers/td-shim +# +# firmware = "@FIRMWAREPATH@" + +# List of valid annotation names for the hypervisor +# Each member of the list is a regular expression, which is the base name +# of the annotation, e.g. "path" for "io.katacontainers.config.hypervisor.path" +enable_annotations = @DEFENABLEANNOTATIONS@ + +# List of valid annotation values for the hypervisor +# Each member of the list is a path pattern as described by glob(3). +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: @CLHVALIDHYPERVISORPATHS@ +valid_hypervisor_paths = @CLHVALIDHYPERVISORPATHS@ + +# Optional space-separated list of options to pass to the guest kernel. +# For example, use `kernel_params = "vsyscall=emulate"` if you are having +# trouble running pre-2.15 glibc. +# +# WARNING: - any parameter specified here will take priority over the default +# parameter value of the same name used to start the virtual machine. +# Do not set values here unless you understand the impact of doing so as you +# may stop the virtual machine from booting. +# To see the list of default parameters, enable hypervisor debug, create a +# container and look for 'default-kernel-parameters' log entries. 
+kernel_params = "@KERNELPARAMS@" + +# Default number of vCPUs per SB/VM: +# unspecified or 0 --> will be set to @DEFVCPUS@ +# < 0 --> will be set to the actual number of physical cores +# > 0 <= number of physical cores --> will be set to the specified number +# > number of physical cores --> will be set to the actual number of physical cores +default_vcpus = 1 + +# Default maximum number of vCPUs per SB/VM: +# unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number +# of vCPUs supported by KVM if that number is exceeded +# > 0 <= number of physical cores --> will be set to the specified number +# > number of physical cores --> will be set to the actual number of physical cores or to the maximum number +# of vCPUs supported by KVM if that number is exceeded +# WARNING: Depending on the architecture, the maximum number of vCPUs supported by KVM is used when +# the actual number of physical cores is greater than it. +# WARNING: Be aware that this value impacts the virtual machine's memory footprint and the +# CPU hotplug functionality. For example, `default_maxvcpus = 240` specifies that up to 240 vCPUs +# can be added to a SB/VM, but the memory footprint will be big. Another example, with +# `default_maxvcpus = 8` the memory footprint will be small, but 8 will be the maximum number of +# vCPUs supported by the SB/VM. In general, we recommend that you do not edit this variable, +# unless you know what you are doing. +default_maxvcpus = @DEFMAXVCPUS@ + +# Default memory size in MiB for SB/VM. +# If unspecified then it will be set to @DEFMEMSZ@ MiB. +default_memory = @DEFMEMSZ@ + +# Default memory slots per SB/VM. +# If unspecified then it will be set to @DEFMEMSLOTS@. +# This will determine the number of times that memory can be hot-added to the sandbox/VM. 
+memory_slots = @DEFMEMSLOTS@ + +# Default maximum memory in MiB per SB / VM +# unspecified or == 0 --> will be set to the actual amount of physical RAM +# > 0 <= amount of physical RAM --> will be set to the specified number +# > amount of physical RAM --> will be set to the actual amount of physical RAM +default_maxmemory = @DEFMAXMEMSZ@ + +# Disable hotplugging host block devices to guest VMs for container rootfs. +# In case of a storage driver like devicemapper where a container's +# root file system is backed by a block device, the block device is passed +# directly to the hypervisor for performance reasons. +# This flag prevents the block device from being passed to the hypervisor, +# virtio-fs is used instead to pass the rootfs. +# WARNING: +# Don't set this flag to false if you don't understand well the behavior of +# your container runtime and image snapshotter. Some snapshotters might use +# container image storage devices that are not meant to be hotplugged into a +# guest VM - e.g., because they contain files used by the host or by other +# guests. +disable_block_device_use = @DEFDISABLEBLOCK@ + +# Shared file system type: +# - virtio-fs (default) +# - virtio-fs-nydus +# - none +shared_fs = "@DEFSHAREDFS_CLH_VIRTIOFS@" + +# Path to vhost-user-fs daemon. +virtio_fs_daemon = "@DEFVIRTIOFSDAEMON@" + +# List of valid annotations values for the virtiofs daemon +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: @DEFVALIDVIRTIOFSDAEMONPATHS@ +valid_virtio_fs_daemon_paths = @DEFVALIDVIRTIOFSDAEMONPATHS@ + +# Default size of DAX cache in MiB +virtio_fs_cache_size = @DEFVIRTIOFSCACHESIZE@ + +# Default size of virtqueues +virtio_fs_queue_size = @DEFVIRTIOFSQUEUESIZE@ + +# Extra args for virtiofsd daemon +# +# Format example: +# ["--arg1=xxx", "--arg2=yyy"] +# Examples: +# Set virtiofsd log level to debug : ["--log-level=debug"] +# see `virtiofsd -h` for possible options. 
+virtio_fs_extra_args = @DEFVIRTIOFSEXTRAARGS@ + +# Cache mode: +# +# - never +# Metadata, data, and pathname lookup are not cached in guest. They are +# always fetched from host and any changes are immediately pushed to host. +# +# - metadata +# Metadata and pathname lookup are cached in guest and never expire. +# Data is never cached in guest. +# +# - auto +# Metadata and pathname lookup cache expires after a configured amount of +# time (default is 1 second). Data is cached while the file is open (close +# to open consistency). +# +# - always +# Metadata, data, and pathname lookup are cached in guest and never expire. +virtio_fs_cache = "@DEFVIRTIOFSCACHE@" + +# Block storage driver to be used for the hypervisor in case the container +# rootfs is backed by a block device. This is virtio-blk. +block_device_driver = "virtio-blk" + +# Specifies whether cache-related options will be set on block devices. +# Default false +block_device_cache_set = false + +# Specifies cache-related options for block devices. +# Denotes whether use of O_DIRECT (bypass the host page cache) is enabled. +# Default false +block_device_cache_direct = false + +# Reclaim guest freed memory. +# Enabling this will result in the VM balloon device having f_reporting=on set. +# Then the hypervisor will use it to reclaim guest freed memory. +# This is useful for reducing the amount of memory used by a VM. +# Enabling this feature may sometimes reduce the speed of memory access in +# the VM. +# +# Default false +reclaim_guest_freed_memory = false + +# Enable huge pages for VM RAM, default false +# Enabling this will result in the VM memory +# being allocated using huge pages. 
+enable_hugepages = false + +# Disable the 'seccomp' feature from Cloud Hypervisor, default false +disable_seccomp = false + +# Enable vIOMMU, default false +# Enabling this will result in the VM having a vIOMMU device +# This will also add the following options to the kernel's +# command line: iommu=pt +enable_iommu = false + +# This option changes the default hypervisor and kernel parameters +# to enable debug output where available. +# +# Default false +enable_debug = false + +# This option specifies the loglevel of the hypervisor +# +# Default 1 +hypervisor_loglevel = 1 + +# If false and nvdimm is supported, use nvdimm device to plug guest image. +# Otherwise virtio-block device is used. +# +# nvdimm is not supported with Cloud Hypervisor or when `confidential_guest = true`. +disable_image_nvdimm = @DEFDISABLEIMAGENVDIMM_CLH@ + +# Enable hot-plugging of VFIO devices to a root-port. +# The default setting is "no-port" +hot_plug_vfio = "no-port" + +# In a confidential compute environment hot-plugging can compromise +# security. +# Enable cold-plugging of VFIO devices to a root-port. +# The default setting is "no-port", which means disabled. +cold_plug_vfio = "no-port" + +# Path to OCI hook binaries in the *guest rootfs*. +# This does not affect host-side hooks which must instead be added to +# the OCI spec passed to the runtime. +# +# You can create a rootfs with hooks by customizing the osbuilder scripts: +# https://github.com/kata-containers/kata-containers/tree/main/tools/osbuilder +# +# Hooks must be stored in a subdirectory of guest_hook_path according to their +# hook type, i.e. "guest_hook_path/{prestart,poststart,poststop}". +# The agent will scan these directories for executable files and add them, in +# lexicographical order, to the lifecycle of the guest container. +# Hooks are executed in the runtime namespace of the guest. 
See the official documentation: +# https://github.com/opencontainers/runtime-spec/blob/v1.0.1/config.md#posix-platform-hooks +# Warnings will be logged if any error is encountered while scanning for hooks, +# but it will not abort container execution. +guest_hook_path = "" +# +# These options are related to the network rate limiter at the VMM level, and are +# based on the Cloud Hypervisor I/O throttling. Those are disabled by default +# and we strongly advise users to refer to the Cloud Hypervisor official +# documentation for a better understanding of its internals: +# https://github.com/cloud-hypervisor/cloud-hypervisor/blob/main/docs/io_throttling.md +# +# Bandwidth rate limiter options +# +# net_rate_limiter_bw_max_rate controls network I/O bandwidth (size in bits/sec +# for SB/VM). +# The same value is used for inbound and outbound bandwidth. +# Default 0-sized value means unlimited rate. +net_rate_limiter_bw_max_rate = 0 +# +# net_rate_limiter_bw_one_time_burst increases the initial max rate and this +# initial extra credit does *NOT* affect the overall limit and can be used for +# an *initial* burst of data. +# This is *optional* and only takes effect if net_rate_limiter_bw_max_rate is +# set to a non-zero value. +net_rate_limiter_bw_one_time_burst = 0 +# +# Operation rate limiter options +# +# net_rate_limiter_ops_max_rate controls network I/O operation rate (size in ops/sec +# for SB/VM). +# The same value is used for inbound and outbound operations. +# Default 0-sized value means unlimited rate. +net_rate_limiter_ops_max_rate = 0 +# +# net_rate_limiter_ops_one_time_burst increases the initial max rate and this +# initial extra credit does *NOT* affect the overall limit and can be used for +# an *initial* burst of operations. +# This is *optional* and only takes effect if net_rate_limiter_ops_max_rate is +# set to a non-zero value. 
+net_rate_limiter_ops_one_time_burst = 0 +# +# These options are related to the disk rate limiter at the VMM level, and are +# based on the Cloud Hypervisor I/O throttling. Those are disabled by default +# and we strongly advise users to refer to the Cloud Hypervisor official +# documentation for a better understanding of its internals: +# https://github.com/cloud-hypervisor/cloud-hypervisor/blob/main/docs/io_throttling.md +# +# Bandwidth rate limiter options +# +# disk_rate_limiter_bw_max_rate controls disk I/O bandwidth (size in bits/sec +# for SB/VM). +# The same value is used for inbound and outbound bandwidth. +# Default 0-sized value means unlimited rate. +disk_rate_limiter_bw_max_rate = 0 +# +# disk_rate_limiter_bw_one_time_burst increases the initial max rate and this +# initial extra credit does *NOT* affect the overall limit and can be used for +# an *initial* burst of data. +# This is *optional* and only takes effect if disk_rate_limiter_bw_max_rate is +# set to a non-zero value. +disk_rate_limiter_bw_one_time_burst = 0 +# +# Operation rate limiter options +# +# disk_rate_limiter_ops_max_rate controls disk I/O operation rate (size in ops/sec +# for SB/VM). +# The same value is used for inbound and outbound operations. +# Default 0-sized value means unlimited rate. +disk_rate_limiter_ops_max_rate = 0 +# +# disk_rate_limiter_ops_one_time_burst increases the initial max rate and this +# initial extra credit does *NOT* affect the overall limit and can be used for +# an *initial* burst of operations. +# This is *optional* and only takes effect if disk_rate_limiter_ops_max_rate is +# set to a non-zero value. +disk_rate_limiter_ops_one_time_burst = 0 + +[agent.@PROJECT_TYPE@] +# If enabled, make the agent display debug-level messages. +# (default: disabled) +enable_debug = false + +# Enable agent tracing. +# +# If enabled, the agent will generate OpenTelemetry trace spans. 
+# +# Notes: +# +# - If the runtime also has tracing enabled, the agent spans will be +# associated with the appropriate runtime parent span. +# - If enabled, the runtime will wait for the container to shut down, +# increasing the container shutdown time slightly. +# +# (default: disabled) +enable_tracing = false + +# Enable debug console. + +# If enabled, a user can connect to the guest OS running inside the hypervisor +# through the "kata-runtime exec <sandbox-id>" command + +debug_console_enabled = false + +# Agent connection dialing timeout value in seconds +# (default: 45) +dial_timeout = 45 + +# Confidential Data Hub API timeout value in seconds +# (default: 50) +cdh_api_timeout = 50 + +[runtime] +# If enabled, the runtime will log additional debug messages to the +# system log +# (default: disabled) +enable_debug = false +# +# Internetworking model +# Determines how the VM should be connected to +# the container network interface +# Options: +# +# - macvtap +# Used when the Container network interface can be bridged using +# macvtap. +# +# - none +# Used when customizing the network. Only creates a tap device. No veth pair. +# +# - tcfilter +# Uses tc filter rules to redirect traffic from the network interface +# provided by plugin to a tap interface connected to the VM. +# +internetworking_model = "@DEFNETWORKMODEL_CLH@" + +# disable guest seccomp +# Determines whether container seccomp profiles are passed to the virtual +# machine and applied by the kata agent. If set to true, seccomp is not applied +# within the guest +# (default: true) +disable_guest_seccomp = @DEFDISABLEGUESTSECCOMP@ + +# Apply a custom SELinux security policy to the container process inside the VM. +# This is used when you want to apply a type other than the default `container_t`, +# so general users should not uncomment and apply it. 
+# (format: "user:role:type") +# Note: You cannot specify MCS policy with the label because the sensitivity levels and +# categories are determined automatically by high-level container runtimes such as containerd. +# Example value when enabling: "system_u:system_r:container_t" +guest_selinux_label = "@DEFGUESTSELINUXLABEL@" + +# If enabled, the runtime will create opentracing.io traces and spans. +# (See https://www.jaegertracing.io/docs/getting-started). +# (default: disabled) +enable_tracing = false + +# Set the full url to the Jaeger HTTP Thrift collector. +# The default if not set will be "http://localhost:14268/api/traces" +jaeger_endpoint = "" + +# Sets the username to be used if basic auth is required for Jaeger. +jaeger_user = "" + +# Sets the password to be used if basic auth is required for Jaeger. +jaeger_password = "" + +# If enabled, the runtime will not create a network namespace for shim and hypervisor processes. +# This option may have some potential impacts to your host. It should only be used when you know what you're doing. +# `disable_new_netns` conflicts with `internetworking_model=tcfilter` and `internetworking_model=macvtap`. It works only +# with `internetworking_model=none`. The tap device will be in the host network namespace and can connect to a bridge +# (like OVS) directly. +# (default: false) +disable_new_netns = false + +# if enabled, the runtime will add all the kata processes inside one dedicated cgroup. +# The container cgroups in the host are not created, just one single cgroup per sandbox. +# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox. +# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation. +# The sandbox cgroup is constrained if there is no container type annotation. 
+# See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType +sandbox_cgroup_only = @DEFSANDBOXCGROUPONLY@ + +# If enabled, the runtime will attempt to determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In +# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. This is generally helpful +# when a hardware architecture or hypervisor solution is utilized which does not support CPU and/or memory hotplug. +# Compatibility for determining appropriate sandbox (VM) size: +# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O +# does not yet support sandbox sizing annotations. +# - When running single containers using a tool like ctr, container sizing information will be available. +static_sandbox_resource_mgmt = @DEFSTATICRESOURCEMGMT_CLH_AZURE@ + +# If specified, sandbox_bind_mounts identifies host paths to be mounted (ro) into the sandbox's shared path. +# This is only valid if filesystem sharing is utilized. The provided path(s) will be bindmounted into the shared fs directory. +# If defaults are utilized, these mounts should be available in the guest at `/run/kata-containers/shared/containers/sandbox-mounts`. +# These will not be exposed to the container workloads, and are only provided for potential guest services. +sandbox_bind_mounts = @DEFBINDMOUNTS@ + +# VFIO Mode +# Determines how VFIO devices should be presented to the container. +# Options: +# +# - vfio +# Matches behaviour of OCI runtimes (e.g. runc) as much as +# possible. VFIO devices will appear in the container as VFIO +# character devices under /dev/vfio. The exact names may differ +# from the host (they need to match the VM's IOMMU group numbers +# rather than the host's). +# +# - guest-kernel +# This is a Kata-specific behaviour that's useful in certain cases. 
+# The VFIO device is managed by whatever driver in the VM kernel +# claims it. This means it will appear as one or more device nodes +# or network interfaces depending on the nature of the device. +# Using this mode requires specially built workloads that know how +# to locate the relevant device interfaces within the VM. +# +vfio_mode = "@DEFVFIOMODE@" + +# If enabled, the runtime will not create Kubernetes emptyDir mounts on the guest filesystem. Instead, emptyDir mounts will +# be created on the host and shared via virtio-fs. This is potentially slower, but allows sharing of files from host to guest. +disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@ + +# Specifies how Kubernetes emptyDir volumes are handled. +# Options: +# +# - shared-fs (default) +# Shares the emptyDir folder with the guest using the method given +# by the `shared_fs` setting. +# +# - block-encrypted +# Plugs a block device to be encrypted in the guest. +# +emptydir_mode = "@DEFEMPTYDIRMODE@" + +# Enabled experimental feature list, format: ["a", "b"]. +# Experimental features are features not stable enough for production, +# they may break compatibility, and are prepared for a big version bump. +# Supported experimental features: +# (default: []) +experimental = @DEFAULTEXPFEATURES@ + +# If enabled, a user can run pprof tools with the shim v2 process through kata-monitor. +# (default: false) +enable_pprof = false + +# Indicates the CreateContainer request timeout needed for the workload(s) +# If using guest_pull, this includes the time to pull the image inside the guest +# Defaults to @DEFCREATECONTAINERTIMEOUT@ second(s) +# Note: The effective timeout is determined by the lesser of two values: runtime-request-timeout from kubelet config +# (https://kubernetes.io/docs/reference/command-line-tools-reference/kubelet/#:~:text=runtime%2Drequest%2Dtimeout) and create_container_timeout. 
+# In essence, the timeout used for guest pull=runtime-request-timeout no cold plug +# cold_plug_vfio != no_port AND pod_resource_api_sock = "" => need +# explicit CDI annotation for cold plug (applies mainly +# to non-k8s cases) +# cold_plug_vfio != no_port AND pod_resource_api_sock != "" => kubelet +# based cold plug. +pod_resource_api_sock = "@DEFPODRESOURCEAPISOCK@"