diff --git a/src/runtime-rs/Makefile b/src/runtime-rs/Makefile index abc96a5532..f492d1d334 100644 --- a/src/runtime-rs/Makefile +++ b/src/runtime-rs/Makefile @@ -76,6 +76,7 @@ PREFIXDEPS := $(PREFIX) LIBEXECDIR := $(PREFIXDEPS)/libexec SHAREDIR := $(PREFIX)/share DEFAULTSDIR := $(SHAREDIR)/defaults +CLHBINDIR := $(PREFIXDEPS)/bin PROJECT_DIR = $(PROJECT_TAG) IMAGENAME = $(PROJECT_TAG).img TARGET = $(PROJECT_COMPONENT) @@ -93,6 +94,9 @@ DEFAULT_HYPERVISOR ?= $(HYPERVISOR_DB) ##VAR HYPERVISOR= List of hypervisors this build system can generate configuration for. HYPERVISORS := $(HYPERVISOR_DB) $(HYPERVISOR_ACRN) $(HYPERVISOR_FC) $(HYPERVISOR_QEMU) $(HYPERVISOR_CLH) +CLHPATH := $(CLHBINDIR)/$(CLHCMD) +CLHVALIDHYPERVISORPATHS := [\"$(CLHPATH)\"] + DBVALIDHYPERVISORPATHS := [] PKGDATADIR := $(PREFIXDEPS)/share/$(PROJECT_DIR) KERNELDIR := $(PKGDATADIR) @@ -160,6 +164,7 @@ DEFMSIZE9P := 8192 DEFVFIOMODE := guest-kernel ##VAR DEFSANDBOXCGROUPONLY= Default cgroup model DEFSANDBOXCGROUPONLY ?= false +DEFSTATICRESOURCEMGMT ?= false DEFSTATICRESOURCEMGMT_DB ?= false DEFBINDMOUNTS := [] DEFDANCONF := /run/kata-containers/dans @@ -173,6 +178,7 @@ MONITOR_OUTPUT = $(CURDIR)/$(MONITOR) MONITOR_DIR = $(CLI_DIR)/kata-monitor SOURCES := $(shell find . 2>&1 | grep -E '.*\.(c|h|go)$$') VERSION := ${shell cat ./VERSION} +KERNELPARAMS := "" # List of configuration files to build and install CONFIGS = @@ -207,7 +213,7 @@ ifneq (,$(DBCMD)) DEFMAXVCPUS_DB := 1 DEFBLOCKSTORAGEDRIVER_DB := virtio-blk-mmio DEFNETWORKMODEL_DB := tcfilter - KERNELPARAMS = console=ttyS1 agent.log_vport=1025 + KERNELPARAMS_DB = console=ttyS1 agent.log_vport=1025 KERNELTYPE_DB = uncompressed KERNEL_NAME_DB = $(call MAKE_KERNEL_NAME_DB,$(KERNELTYPE_DB)) KERNELPATH_DB = $(KERNELDIR)/$(KERNEL_NAME_DB) @@ -217,6 +223,30 @@ ifneq (,$(DBCMD)) DBSHAREDFS := inline-virtio-fs endif +ifneq (,$(CLHCMD)) + KNOWN_HYPERVISORS += $(HYPERVISOR_CLH) + + CONFIG_FILE_CLH = configuration-cloud-hypervisor.toml + CONFIG_CLH = config/$(CONFIG_FILE_CLH) + CONFIG_CLH_IN = $(CONFIG_CLH).in + + CONFIG_PATH_CLH = $(abspath $(CONFDIR)/$(CONFIG_FILE_CLH)) + CONFIG_PATHS += $(CONFIG_PATH_CLH) + + SYSCONFIG_CLH = $(abspath $(SYSCONFDIR)/$(CONFIG_FILE_CLH)) + SYSCONFIG_PATHS += $(SYSCONFIG_CLH) + + CONFIGS += $(CONFIG_CLH) + + # CLH-specific options (all should be suffixed by "_CLH") + # currently, huge pages are required for virtiofsd support + DEFNETWORKMODEL_CLH := tcfilter + KERNELTYPE_CLH = uncompressed + KERNEL_NAME_CLH = $(call MAKE_KERNEL_NAME,$(KERNELTYPE_CLH)) + KERNELPATH_CLH = $(KERNELDIR)/$(KERNEL_NAME_CLH) + VMROOTFSDRIVER_CLH := virtio-pmem +endif + ifeq ($(DEFAULT_HYPERVISOR),$(HYPERVISOR_DB)) DEFAULT_HYPERVISOR_CONFIG = $(CONFIG_FILE_DB) endif @@ -233,11 +263,16 @@ USER_VARS += DBPATH USER_VARS += DBVALIDHYPERVISORPATHS USER_VARS += DBCTLPATH USER_VARS += DBVALIDCTLPATHS +USER_VARS += CLHPATH +USER_VARS += CLHVALIDHYPERVISORPATHS +USER_VARS += FIRMWAREPATH_CLH +USER_VARS += KERNELPATH_CLH USER_VARS += SYSCONFIG USER_VARS += IMAGENAME USER_VARS += IMAGEPATH USER_VARS += DEFROOTFSTYPE USER_VARS += VMROOTFSDRIVER_DB +USER_VARS += VMROOTFSDRIVER_CLH USER_VARS += MACHINETYPE USER_VARS += KERNELDIR USER_VARS += KERNELTYPE @@ -250,6 +285,7 @@ USER_VARS += MACHINEACCELERATORS USER_VARS += CPUFEATURES USER_VARS += DEFMACHINETYPE_CLH USER_VARS += KERNELPARAMS +USER_VARS += KERNELPARAMS_DB USER_VARS += LIBEXECDIR USER_VARS += LOCALSTATEDIR USER_VARS += PKGDATADIR @@ -273,6 +309,7 @@ USER_VARS += DEFMEMSZ USER_VARS += DEFMEMSLOTS USER_VARS += DEFBRIDGES USER_VARS += DEFNETWORKMODEL_DB +USER_VARS += DEFNETWORKMODEL_CLH USER_VARS += DEFDISABLEGUESTEMPTYDIR USER_VARS += DEFDISABLEGUESTSECCOMP USER_VARS += DEFDISABLESELINUX @@ -298,12 +335,14 @@ USER_VARS += DEFMSIZE9P USER_VARS += DEFENTROPYSOURCE USER_VARS += DEFVALIDENTROPYSOURCES USER_VARS += DEFSANDBOXCGROUPONLY +USER_VARS += DEFSTATICRESOURCEMGMT USER_VARS += DEFSTATICRESOURCEMGMT_DB USER_VARS += DEFBINDMOUNTS USER_VARS += DEFVFIOMODE USER_VARS += BUILDFLAGS USER_VARS += RUNTIMENAME USER_VARS += HYPERVISOR_DB +USER_VARS += HYPERVISOR_CLH USER_VARS += PIPESIZE USER_VARS += DBSHAREDFS USER_VARS += KATA_INSTALL_GROUP @@ -383,6 +422,12 @@ define MAKE_KERNEL_NAME_DB $(if $(findstring uncompressed,$1),vmlinux-dragonball-experimental.container,vmlinuz-dragonball-experimental.container) endef +# Returns the name of the kernel file to use based on the provided KERNELTYPE. +# # $1 : KERNELTYPE (compressed or uncompressed) +define MAKE_KERNEL_NAME +$(if $(findstring uncompressed,$1),vmlinux.container,vmlinuz.container) +endef + .DEFAULT_GOAL := default GENERATED_FILES += $(CONFIGS) diff --git a/src/runtime-rs/arch/x86_64-options.mk b/src/runtime-rs/arch/x86_64-options.mk index 0e837f0657..ea44aeb506 100644 --- a/src/runtime-rs/arch/x86_64-options.mk +++ b/src/runtime-rs/arch/x86_64-options.mk @@ -13,3 +13,6 @@ QEMUCMD := qemu-system-x86_64 # dragonball binary name DBCMD := dragonball + +# cloud-hypervisor binary name +CLHCMD := cloud-hypervisor diff --git a/src/runtime-rs/config/configuration-cloud-hypervisor.toml.in b/src/runtime-rs/config/configuration-cloud-hypervisor.toml.in new file mode 100644 index 0000000000..82243abae0 --- /dev/null +++ b/src/runtime-rs/config/configuration-cloud-hypervisor.toml.in @@ -0,0 +1,350 @@ +# Copyright (c) 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +# XXX: WARNING: this file is auto-generated. +# XXX: +# XXX: Source file: "@CONFIG_CLH_IN@" +# XXX: Project: +# XXX: Name: @PROJECT_NAME@ +# XXX: Type: @PROJECT_TYPE@ + +[hypervisor.cloud-hypervisor] +path = "@CLHPATH@" +kernel = "@KERNELPATH_CLH@" +image = "@IMAGEPATH@" + +# rootfs filesystem type: +# - ext4 (default) +# - xfs +# - erofs +rootfs_type=@DEFROOTFSTYPE@ + +# Block storage driver to be used for the VM rootfs is backed +# by a block device. +vm_rootfs_driver = "@VMROOTFSDRIVER_CLH@" + +# Enable confidential guest support. +# Toggling that setting may trigger different hardware features, ranging +# from memory encryption to both memory and CPU-state encryption and integrity. +# The Kata Containers runtime dynamically detects the available feature set and +# aims at enabling the largest possible one, returning an error if none is +# available, or none is supported by the hypervisor. +# +# Known limitations: +# * Does not work by design: +# - CPU Hotplug +# - Memory Hotplug +# - NVDIMM devices +# +# Supported TEEs: +# * Intel TDX +# +# Default false +# confidential_guest = true + +# Path to the firmware. +# If you want Cloud Hypervisor to use a specific firmware, set its path below. +# This is option is only used when confidential_guest is enabled. +# +# For more information about firmwared that can be used with specific TEEs, +# please, refer to: +# * Intel TDX: +# - td-shim: https://github.com/confidential-containers/td-shim +# +# firmware = "@FIRMWAREPATH@" + +# List of valid annotation names for the hypervisor +# Each member of the list is a regular expression, which is the base name +# of the annotation, e.g. "path" for io.katacontainers.config.hypervisor.path" +enable_annotations = @DEFENABLEANNOTATIONS@ + +# List of valid annotations values for the hypervisor +# Each member of the list is a path pattern as described by glob(3). +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: @CLHVALIDHYPERVISORPATHS@ +valid_hypervisor_paths = @CLHVALIDHYPERVISORPATHS@ + +# List of valid annotations values for ctlpath +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: +# valid_ctlpaths = + +# Optional space-separated list of options to pass to the guest kernel. +# For example, use `kernel_params = "vsyscall=emulate"` if you are having +# trouble running pre-2.15 glibc. +# +# WARNING: - any parameter specified here will take priority over the default +# parameter value of the same name used to start the virtual machine. +# Do not set values here unless you understand the impact of doing so as you +# may stop the virtual machine from booting. +# To see the list of default parameters, enable hypervisor debug, create a +# container and look for 'default-kernel-parameters' log entries. +kernel_params = "@KERNELPARAMS@" + +# Default number of vCPUs per SB/VM: +# unspecified or 0 --> will be set to 1 +# < 0 --> will be set to the actual number of physical cores +# > 0 <= number of physical cores --> will be set to the specified number +# > number of physical cores --> will be set to the actual number of physical cores +default_vcpus = @DEFVCPUS@ + +# Default maximum number of vCPUs per SB/VM: +# unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number +# of vCPUs supported by KVM if that number is exceeded +# > 0 <= number of physical cores --> will be set to the specified number +# > number of physical cores --> will be set to the actual number of physical cores or to the maximum number +# of vCPUs supported by KVM if that number is exceeded +# WARNING: Depending of the architecture, the maximum number of vCPUs supported by KVM is used when +# the actual number of physical cores is greater than it. +# WARNING: Be aware that this value impacts the virtual machine's memory footprint and CPU +# the hotplug functionality. For example, `default_maxvcpus = 240` specifies that until 240 vCPUs +# can be added to a SB/VM, but the memory footprint will be big. Another example, with +# `default_maxvcpus = 8` the memory footprint will be small, but 8 will be the maximum number of +# vCPUs supported by the SB/VM. In general, we recommend that you do not edit this variable, +# unless you know what are you doing. +default_maxvcpus = @DEFMAXVCPUS@ + +# Default memory size in MiB for SB/VM. +# If unspecified then it will be set @DEFMEMSZ@ MiB. +default_memory = @DEFMEMSZ@ + +# Shared file system type: +# - virtio-fs +# - virtio-fs-nydus +# - none +shared_fs = "@DEFSHAREDFS_CLH_VIRTIOFS@" + +# Path to vhost-user-fs daemon. +virtio_fs_daemon = "@DEFVIRTIOFSDAEMON@" + +# Default size of DAX cache in MiB +virtio_fs_cache_size = @DEFVIRTIOFSCACHESIZE@ + +# Extra args for virtiofsd daemon +# +# Format example: +# ["-o", "arg1=xxx,arg2", "-o", "hello world", "--arg3=yyy"] +# Examples: +# Set virtiofsd log level to debug : ["-o", "log_level=debug"] or ["-d"] +# +# see `virtiofsd -h` for possible options. +virtio_fs_extra_args = @DEFVIRTIOFSEXTRAARGS@ + +# Cache mode: +# +# - never +# Metadata, data, and pathname lookup are not cached in guest. They are +# always fetched from host and any changes are immediately pushed to host. +# +# - auto +# Metadata and pathname lookup cache expires after a configured amount of +# time (default is 1 second). Data is cached while the file is open (close +# to open consistency). +# +# - always +# Metadata, data, and pathname lookup are cached in guest and never expire. +virtio_fs_cache = "@DEFVIRTIOFSCACHE@" + +# Bridges can be used to hot plug devices. +# Limitations: +# * Currently only pci bridges are supported +# * Until 30 devices per bridge can be hot plugged. +# * Until 5 PCI bridges can be cold plugged per VM. +# This limitation could be a bug in the kernel +# Default number of bridges per SB/VM: +# unspecified or 0 --> will be set to @DEFBRIDGES@ +# > 1 <= 5 --> will be set to the specified number +# > 5 --> will be set to 5 +default_bridges = @DEFBRIDGES@ + +# Block storage driver to be used for the hypervisor in case the container +# rootfs is backed by a block device. +block_device_driver = "virtio-blk-pci" + +# Enable huge pages for VM RAM, default false +# Enabling this will result in the VM memory +# being allocated using huge pages. +# This is useful when you want to use vhost-user network +# stacks within the container. This will automatically +# result in memory pre allocation +#enable_hugepages = true + +# This option changes the default hypervisor and kernel parameters +# to enable debug output where available. +# +# Default false +#enable_debug = true + +# Disable the customizations done in the runtime when it detects +# that it is running on top a VMM. This will result in the runtime +# behaving as it would when running on bare metal. +# +#disable_nesting_checks = true + +# Path to OCI hook binaries in the *guest rootfs*. +# This does not affect host-side hooks which must instead be added to +# the OCI spec passed to the runtime. +# +# You can create a rootfs with hooks by customizing the osbuilder scripts: +# https://github.com/kata-containers/kata-containers/tree/main/tools/osbuilder +# +# Hooks must be stored in a subdirectory of guest_hook_path according to their +# hook type, i.e. "guest_hook_path/{prestart,poststart,poststop}". +# The agent will scan these directories for executable files and add them, in +# lexicographical order, to the lifecycle of the guest container. +# Hooks are executed in the runtime namespace of the guest. See the official documentation: +# https://github.com/opencontainers/runtime-spec/blob/v1.0.1/config.md#posix-platform-hooks +# Warnings will be logged if any error is encountered while scanning for hooks, +# but it will not abort container execution. +#guest_hook_path = "/usr/share/oci/hooks" + +[agent.@PROJECT_TYPE@] +container_pipe_size=@PIPESIZE@ +# If enabled, make the agent display debug-level messages. +# (default: disabled) +#enable_debug = true + +# Enable agent tracing. +# +# If enabled, the agent will generate OpenTelemetry trace spans. +# +# Notes: +# +# - If the runtime also has tracing enabled, the agent spans will be +# associated with the appropriate runtime parent span. +# - If enabled, the runtime will wait for the container to shutdown, +# increasing the container shutdown time slightly. +# +# (default: disabled) +#enable_tracing = true + +# Enable debug console. + +# If enabled, user can connect guest OS running inside hypervisor +# through "kata-runtime exec " command + +#debug_console_enabled = true + +# Agent connection dialing timeout value in seconds +# (default: 45) +dial_timeout = 45 + +[runtime] +# If enabled, the runtime will log additional debug messages to the +# system log +# (default: disabled) +#enable_debug = true + +# If enabled, enabled, it means that 1) if the runtime exits abnormally, +# the cleanup process will be skipped, and 2) the runtime will not exit +# even if the health check fails. +# This option is typically used to retain abnormal information for debugging. +# (default: false) +#keep_abnormal = true + +# Internetworking model +# Determines how the VM should be connected to the +# the container network interface +# Options: +# +# - bridged (Deprecated) +# Uses a linux bridge to interconnect the container interface to +# the VM. Works for most cases except macvlan and ipvlan. +# ***NOTE: This feature has been deprecated with plans to remove this +# feature in the future. Please use other network models listed below. +# +# +# - macvtap +# Used when the Container network interface can be bridged using +# macvtap. +# +# - none +# Used when customize network. Only creates a tap device. No veth pair. +# +# - tcfilter +# Uses tc filter rules to redirect traffic from the network interface +# provided by plugin to a tap interface connected to the VM. +# +internetworking_model="@DEFNETWORKMODEL_CLH@" + +name="@RUNTIMENAME@" +hypervisor_name="@HYPERVISOR_CLH@" +agent_name="@PROJECT_TYPE@" + +# disable guest seccomp +# Determines whether container seccomp profiles are passed to the virtual +# machine and applied by the kata agent. If set to true, seccomp is not applied +# within the guest +# (default: true) +disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@ + +# If enabled, the runtime will create opentracing.io traces and spans. +# (See https://www.jaegertracing.io/docs/getting-started). +# (default: disabled) +#enable_tracing = true + +# Set the full url to the Jaeger HTTP Thrift collector. +# The default if not set will be "http://localhost:14268/api/traces" +#jaeger_endpoint = "" + +# Sets the username to be used if basic auth is required for Jaeger. +#jaeger_user = "" + +# Sets the password to be used if basic auth is required for Jaeger. +#jaeger_password = "" + +# If enabled, the runtime will not create a network namespace for shim and hypervisor processes. +# This option may have some potential impacts to your host. It should only be used when you know what you're doing. +# `disable_new_netns` conflicts with `internetworking_model=bridged` and `internetworking_model=macvtap`. It works only +# with `internetworking_model=none`. The tap device will be in the host network namespace and can connect to a bridge +# (like OVS) directly. +# (default: false) +#disable_new_netns = true + +# if enabled, the runtime will add all the kata processes inside one dedicated cgroup. +# The container cgroups in the host are not created, just one single cgroup per sandbox. +# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox. +# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation. +# The sandbox cgroup is constrained if there is no container type annotation. +# See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType +sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@ + +# Enabled experimental feature list, format: ["a", "b"]. +# Experimental features are features not stable enough for production, +# they may break compatibility, and are prepared for a big version bump. +# Supported experimental features: +# (default: []) +experimental=@DEFAULTEXPFEATURES@ + +# If enabled, user can run pprof tools with shim v2 process through kata-monitor. +# (default: false) +# enable_pprof = true + +# If enabled, the runtime will attempt to determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In +# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. This is generally helpful +# when a hardware architecture or hypervisor solutions is utilized which does not support CPU and/or memory hotplug. +# Compatibility for determining appropriate sandbox (VM) size: +# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O +# does not yet support sandbox sizing annotations. +# - When running single containers using a tool like ctr, container sizing information will be available. +static_sandbox_resource_mgmt=@DEFSTATICRESOURCEMGMT@ + +# If specified, sandbox_bind_mounts identifieds host paths to be mounted(ro, rw) into the sandboxes shared path. +# This is only valid if filesystem sharing is utilized. The provided path(s) will be bindmounted into the shared fs directory. +# If defaults are utilized, these mounts should be available in the guest at `/run/kata-containers/shared/containers/sandbox-mounts` +# These will not be exposed to the container workloads, and are only provided for potential guest services. +# Now it supports three kinds of bind mount format: +# - "/path/to", default readonly mode. +# - "/path/to:ro", readonly mode. +# - "/path/to:rw", readwrite mode. +sandbox_bind_mounts=@DEFBINDMOUNTS@ + +# Base directory of directly attachable network config. +# Network devices for VM-based containers are allowed to be placed in the +# host netns to eliminate as many hops as possible, which is what we +# called a "Directly Attachable Network". The config, set by special CNI +# plugins, is used to tell the Kata containers what devices are attached +# to the hypervisor. +# (default: /run/kata-containers/dans) +dan_conf = "@DEFDANCONF@" diff --git a/src/runtime-rs/config/configuration-dragonball.toml.in b/src/runtime-rs/config/configuration-dragonball.toml.in index f4b6bcfdbd..9e5bf51621 100644 --- a/src/runtime-rs/config/configuration-dragonball.toml.in +++ b/src/runtime-rs/config/configuration-dragonball.toml.in @@ -54,7 +54,7 @@ valid_hypervisor_paths = @DBVALIDHYPERVISORPATHS@ # may stop the virtual machine from booting. # To see the list of default parameters, enable hypervisor debug, create a # container and look for 'default-kernel-parameters' log entries. -kernel_params = "@KERNELPARAMS@" +kernel_params = "@KERNELPARAMS_DB@" # Path to the firmware. # If you want that DB uses the default firmware leave this option empty