diff --git a/Makefile b/Makefile index 7c7d77e0d1..a7a9306239 100644 --- a/Makefile +++ b/Makefile @@ -119,15 +119,18 @@ HYPERVISOR_FC = firecracker JAILER_FC = jailer HYPERVISOR_NEMU = nemu HYPERVISOR_QEMU = qemu +HYPERVISOR_QEMU_VIRTIOFS = qemu-virtiofs # Determines which hypervisor is specified in $(CONFIG_FILE). DEFAULT_HYPERVISOR = $(HYPERVISOR_QEMU) # List of hypervisors this build system can generate configuration for. -HYPERVISORS := $(HYPERVISOR_ACRN) $(HYPERVISOR_FC) $(HYPERVISOR_QEMU) $(HYPERVISOR_NEMU) +HYPERVISORS := $(HYPERVISOR_ACRN) $(HYPERVISOR_FC) $(HYPERVISOR_QEMU) $(HYPERVISOR_QEMU_VIRTIOFS) $(HYPERVISOR_NEMU) QEMUPATH := $(QEMUBINDIR)/$(QEMUCMD) +QEMUVIRTIOFSPATH := $(QEMUBINDIR)/$(QEMUVIRTIOFSCMD) + NEMUPATH := $(NEMUBINDIR)/$(NEMUCMD) FCPATH = $(FCBINDIR)/$(FCCMD) @@ -237,6 +240,28 @@ ifneq (,$(QEMUCMD)) KERNELPATH = $(KERNELDIR)/$(KERNELNAME) endif +ifneq (,$(QEMUVIRTIOFSCMD)) + KNOWN_HYPERVISORS += $(HYPERVISOR_QEMU_VIRTIOFS) + + CONFIG_FILE_QEMU_VIRTIOFS = configuration-qemu-virtiofs.toml + CONFIG_QEMU_VIRTIOFS = $(CLI_DIR)/config/$(CONFIG_FILE_QEMU_VIRTIOFS) + CONFIG_QEMU_VIRTIOFS_IN = $(CONFIG_QEMU_VIRTIOFS).in + + CONFIG_PATH_QEMU_VIRTIOFS = $(abspath $(CONFDIR)/$(CONFIG_FILE_QEMU_VIRTIOFS)) + CONFIG_PATHS += $(CONFIG_PATH_QEMU_VIRTIOFS) + + SYSCONFIG_QEMU_VIRTIOFS = $(abspath $(SYSCONFDIR)/$(CONFIG_FILE_QEMU_VIRTIOFS)) + SYSCONFIG_PATHS += $(SYSCONFIG_QEMU_VIRTIOFS) + + CONFIGS += $(CONFIG_QEMU_VIRTIOFS) + + # qemu-specific options (all should be suffixed by "_QEMU") + DEFBLOCKSTORAGEDRIVER_QEMU_VIRTIOFS := virtio-fs + DEFNETWORKMODEL_QEMU := tcfilter + KERNELNAME = $(call MAKE_KERNEL_NAME,$(KERNELTYPE)) + KERNELPATH = $(KERNELDIR)/$(KERNELNAME) +endif + ifneq (,$(NEMUCMD)) KNOWN_HYPERVISORS += $(HYPERVISOR_NEMU) @@ -325,6 +350,10 @@ ifeq ($(DEFAULT_HYPERVISOR),$(HYPERVISOR_QEMU)) DEFAULT_HYPERVISOR_CONFIG = $(CONFIG_FILE_QEMU) endif +ifeq ($(DEFAULT_HYPERVISOR),$(HYPERVISOR_QEMU_VIRTIOFS)) + DEFAULT_HYPERVISOR_CONFIG = $(CONFIG_FILE_QEMU) +endif + ifeq ($(DEFAULT_HYPERVISOR),$(HYPERVISOR_FC)) DEFAULT_HYPERVISOR_CONFIG = $(CONFIG_FILE_FC) endif @@ -394,6 +423,8 @@ USER_VARS += NETMONPATH USER_VARS += QEMUBINDIR USER_VARS += QEMUCMD USER_VARS += QEMUPATH +USER_VARS += QEMUVIRTIOFSCMD +USER_VARS += QEMUVIRTIOFSPATH USER_VARS += SHAREDIR USER_VARS += SHIMPATH USER_VARS += SYSCONFDIR @@ -412,6 +443,7 @@ USER_VARS += DEFDISABLEBLOCK USER_VARS += DEFBLOCKSTORAGEDRIVER_ACRN USER_VARS += DEFBLOCKSTORAGEDRIVER_FC USER_VARS += DEFBLOCKSTORAGEDRIVER_QEMU +USER_VARS += DEFBLOCKSTORAGEDRIVER_QEMU_VIRTIOFS USER_VARS += DEFBLOCKSTORAGEDRIVER_NEMU USER_VARS += DEFSHAREDFS USER_VARS += DEFSHAREDFS_NEMU @@ -551,6 +583,7 @@ $(GENERATED_FILES): %: %.in $(MAKEFILE_LIST) VERSION .git-commit -e "s|@PROJECT_TAG@|$(PROJECT_TAG)|g" \ -e "s|@PROJECT_TYPE@|$(PROJECT_TYPE)|g" \ -e "s|@QEMUPATH@|$(QEMUPATH)|g" \ + -e "s|@QEMUVIRTIOFSPATH@|$(QEMUVIRTIOFSPATH)|g" \ -e "s|@RUNTIME_NAME@|$(TARGET)|g" \ -e "s|@MACHINETYPE@|$(MACHINETYPE)|g" \ -e "s|@SHIMPATH@|$(SHIMPATH)|g" \ @@ -569,6 +602,7 @@ $(GENERATED_FILES): %: %.in $(MAKEFILE_LIST) VERSION .git-commit -e "s|@DEFBLOCKSTORAGEDRIVER_ACRN@|$(DEFBLOCKSTORAGEDRIVER_ACRN)|g" \ -e "s|@DEFBLOCKSTORAGEDRIVER_FC@|$(DEFBLOCKSTORAGEDRIVER_FC)|g" \ -e "s|@DEFBLOCKSTORAGEDRIVER_QEMU@|$(DEFBLOCKSTORAGEDRIVER_QEMU)|g" \ + -e "s|@DEFBLOCKSTORAGEDRIVER_QEMU_VIRTIOFS@|$(DEFBLOCKSTORAGEDRIVER_QEMU_VIRTIOFS)|g" \ -e "s|@DEFBLOCKSTORAGEDRIVER_NEMU@|$(DEFBLOCKSTORAGEDRIVER_NEMU)|g" \ -e "s|@DEFSHAREDFS@|$(DEFSHAREDFS)|g" \ -e "s|@DEFSHAREDFS_NEMU@|$(DEFSHAREDFS_NEMU)|g" \ @@ -731,6 +765,9 @@ endif ifneq (,$(findstring $(HYPERVISOR_QEMU),$(KNOWN_HYPERVISORS))) @printf "\t$(HYPERVISOR_QEMU) hypervisor path (QEMUPATH) : %s\n" $(abspath $(QEMUPATH)) endif +ifneq (,$(findstring $(HYPERVISOR_QEMU_VIRTIOFS),$(KNOWN_HYPERVISORS))) + @printf "\t$(HYPERVISOR_QEMU_VIRTIOFS) hypervisor path (QEMUVIRTIOFSPATH) : %s\n" $(abspath $(QEMUVIRTIOFSPATH)) +endif ifneq (,$(findstring $(HYPERVISOR_NEMU),$(KNOWN_HYPERVISORS))) @printf "\t$(HYPERVISOR_NEMU) hypervisor path (NEMUPATH) : %s\n" $(abspath $(NEMUPATH)) endif diff --git a/arch/amd64-options.mk b/arch/amd64-options.mk index fd49e379f8..d4b076ba08 100644 --- a/arch/amd64-options.mk +++ b/arch/amd64-options.mk @@ -11,6 +11,9 @@ MACHINEACCELERATORS := QEMUCMD := qemu-system-x86_64 +# Qemu experimental with virtiofs +QEMUVIRTIOFSCMD := qemu-virtiofs-system-x86_64 + # Firecracker binary name FCCMD := firecracker # Firecracker's jailer binary name diff --git a/cli/config/configuration-qemu-virtiofs.toml.in b/cli/config/configuration-qemu-virtiofs.toml.in new file mode 100644 index 0000000000..8f5a618ca9 --- /dev/null +++ b/cli/config/configuration-qemu-virtiofs.toml.in @@ -0,0 +1,438 @@ +# Copyright (c) 2017-2019 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +# XXX: WARNING: this file is auto-generated. +# XXX: +# XXX: Source file: "@CONFIG_QEMU_VIRTIOFS_IN@" +# XXX: Project: +# XXX: Name: @PROJECT_NAME@ +# XXX: Type: @PROJECT_TYPE@ + +[hypervisor.qemu] +path = "@QEMUVIRTIOFSPATH@" +kernel = "@KERNELPATH@" +initrd = "@INITRDPATH@" +image = "@IMAGEPATH@" +machine_type = "@MACHINETYPE@" + +# Optional space-separated list of options to pass to the guest kernel. +# For example, use `kernel_params = "vsyscall=emulate"` if you are having +# trouble running pre-2.15 glibc. +# +# WARNING: - any parameter specified here will take priority over the default +# parameter value of the same name used to start the virtual machine. +# Do not set values here unless you understand the impact of doing so as you +# may stop the virtual machine from booting. +# To see the list of default parameters, enable hypervisor debug, create a +# container and look for 'default-kernel-parameters' log entries. +kernel_params = "@KERNELPARAMS@" + +# Path to the firmware. +# If you want that qemu uses the default firmware leave this option empty +firmware = "@FIRMWAREPATH@" + +# Machine accelerators +# comma-separated list of machine accelerators to pass to the hypervisor. +# For example, `machine_accelerators = "nosmm,nosmbus,nosata,nopit,static-prt,nofw"` +machine_accelerators="@MACHINEACCELERATORS@" + +# Default number of vCPUs per SB/VM: +# unspecified or 0 --> will be set to @DEFVCPUS@ +# < 0 --> will be set to the actual number of physical cores +# > 0 <= number of physical cores --> will be set to the specified number +# > number of physical cores --> will be set to the actual number of physical cores +default_vcpus = 1 + +# Default maximum number of vCPUs per SB/VM: +# unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number +# of vCPUs supported by KVM if that number is exceeded +# > 0 <= number of physical cores --> will be set to the specified number +# > number of physical cores --> will be set to the actual number of physical cores or to the maximum number +# of vCPUs supported by KVM if that number is exceeded +# WARNING: Depending of the architecture, the maximum number of vCPUs supported by KVM is used when +# the actual number of physical cores is greater than it. +# WARNING: Be aware that this value impacts the virtual machine's memory footprint and CPU +# the hotplug functionality. For example, `default_maxvcpus = 240` specifies that until 240 vCPUs +# can be added to a SB/VM, but the memory footprint will be big. Another example, with +# `default_maxvcpus = 8` the memory footprint will be small, but 8 will be the maximum number of +# vCPUs supported by the SB/VM. In general, we recommend that you do not edit this variable, +# unless you know what are you doing. +default_maxvcpus = @DEFMAXVCPUS@ + +# Bridges can be used to hot plug devices. +# Limitations: +# * Currently only pci bridges are supported +# * Until 30 devices per bridge can be hot plugged. +# * Until 5 PCI bridges can be cold plugged per VM. +# This limitation could be a bug in qemu or in the kernel +# Default number of bridges per SB/VM: +# unspecified or 0 --> will be set to @DEFBRIDGES@ +# > 1 <= 5 --> will be set to the specified number +# > 5 --> will be set to 5 +default_bridges = @DEFBRIDGES@ + +# Default memory size in MiB for SB/VM. +# If unspecified then it will be set @DEFMEMSZ@ MiB. +default_memory = @DEFMEMSZ@ +# +# Default memory slots per SB/VM. +# If unspecified then it will be set @DEFMEMSLOTS@. +# This is will determine the times that memory will be hotadded to sandbox/VM. +#memory_slots = @DEFMEMSLOTS@ + +# The size in MiB will be plused to max memory of hypervisor. +# It is the memory address space for the NVDIMM devie. +# If set block storage driver (block_device_driver) to "nvdimm", +# should set memory_offset to the size of block device. +# Default 0 +#memory_offset = 0 + +# Disable block device from being used for a container's rootfs. +# In case of a storage driver like devicemapper where a container's +# root file system is backed by a block device, the block device is passed +# directly to the hypervisor for performance reasons. +# This flag prevents the block device from being passed to the hypervisor, +# 9pfs is used instead to pass the rootfs. +disable_block_device_use = @DEFDISABLEBLOCK@ + +# Shared file system type: +# - virtio-9p (default) +# - virtio-fs +shared_fs = "@DEFSHAREDFS_NEMU@" + +# Path to vhost-user-fs daemon. +virtio_fs_daemon = "@DEFVIRTIOFSDAEMON@" + +# Default size of DAX cache in MiB +virtio_fs_cache_size = @DEFVIRTIOFSCACHESIZE@ + +# Extra args for virtiofsd daemon +# +# Format example: +# ["-o", "arg1=xxx,arg2", "-o", "hello world", "--arg3=yyy"] +# +# see `virtiofsd -h` for possible options. +virtio_fs_extra_args = @DEFVIRTIOFSEXTRAARGS@ + +# Cache mode: +# +# - none +# Metadata, data, and pathname lookup are not cached in guest. They are +# always fetched from host and any changes are immediately pushed to host. +# +# - auto +# Metadata and pathname lookup cache expires after a configured amount of +# time (default is 1 second). Data is cached while the file is open (close +# to open consistency). +# +# - always +# Metadata, data, and pathname lookup are cached in guest and never expire. +virtio_fs_cache = "@DEFVIRTIOFSCACHE@" + +# Block storage driver to be used for the hypervisor in case the container +# rootfs is backed by a block device. This is virtio-scsi, virtio-blk +# or nvdimm. +block_device_driver = "@DEFBLOCKSTORAGEDRIVER_QEMU@" + +# Specifies cache-related options will be set to block devices or not. +# Default false +#block_device_cache_set = true + +# Specifies cache-related options for block devices. +# Denotes whether use of O_DIRECT (bypass the host page cache) is enabled. +# Default false +#block_device_cache_direct = true + +# Specifies cache-related options for block devices. +# Denotes whether flush requests for the device are ignored. +# Default false +#block_device_cache_noflush = true + +# Enable iothreads (data-plane) to be used. This causes IO to be +# handled in a separate IO thread. This is currently only implemented +# for SCSI. +# +enable_iothreads = @DEFENABLEIOTHREADS@ + +# Enable pre allocation of VM RAM, default false +# Enabling this will result in lower container density +# as all of the memory will be allocated and locked +# This is useful when you want to reserve all the memory +# upfront or in the cases where you want memory latencies +# to be very predictable +# Default false +#enable_mem_prealloc = true + +# Enable huge pages for VM RAM, default false +# Enabling this will result in the VM memory +# being allocated using huge pages. +# This is useful when you want to use vhost-user network +# stacks within the container. This will automatically +# result in memory pre allocation +#enable_hugepages = true + +# Enable file based guest memory support. The default is an empty string which +# will disable this feature. In the case of virtio-fs, this is enabled +# automatically and '/dev/shm' is used as the backing folder. +# This option will be ignored if VM templating is enabled. +#file_mem_backend = "" + +# Enable swap of vm memory. Default false. +# The behaviour is undefined if mem_prealloc is also set to true +#enable_swap = true + +# This option changes the default hypervisor and kernel parameters +# to enable debug output where available. This extra output is added +# to the proxy logs, but only when proxy debug is also enabled. +# +# Default false +#enable_debug = true + +# Disable the customizations done in the runtime when it detects +# that it is running on top a VMM. This will result in the runtime +# behaving as it would when running on bare metal. +# +#disable_nesting_checks = true + +# This is the msize used for 9p shares. It is the number of bytes +# used for 9p packet payload. +#msize_9p = @DEFMSIZE9P@ + +# If true and vsocks are supported, use vsocks to communicate directly +# with the agent and no proxy is started, otherwise use unix +# sockets and start a proxy to communicate with the agent. +# Default false +#use_vsock = true + +# VFIO devices are hotplugged on a bridge by default. +# Enable hotplugging on root bus. This may be required for devices with +# a large PCI bar, as this is a current limitation with hotplugging on +# a bridge. This value is valid for "pc" machine type. +# Default false +#hotplug_vfio_on_root_bus = true + +# If host doesn't support vhost_net, set to true. Thus we won't create vhost fds for nics. +# Default false +#disable_vhost_net = true +# +# Default entropy source. +# The path to a host source of entropy (including a real hardware RNG) +# /dev/urandom and /dev/random are two main options. +# Be aware that /dev/random is a blocking source of entropy. If the host +# runs out of entropy, the VMs boot time will increase leading to get startup +# timeouts. +# The source of entropy /dev/urandom is non-blocking and provides a +# generally acceptable source of entropy. It should work well for pretty much +# all practical purposes. +#entropy_source= "@DEFENTROPYSOURCE@" + +# Path to OCI hook binaries in the *guest rootfs*. +# This does not affect host-side hooks which must instead be added to +# the OCI spec passed to the runtime. +# +# You can create a rootfs with hooks by customizing the osbuilder scripts: +# https://github.com/kata-containers/osbuilder +# +# Hooks must be stored in a subdirectory of guest_hook_path according to their +# hook type, i.e. "guest_hook_path/{prestart,postart,poststop}". +# The agent will scan these directories for executable files and add them, in +# lexicographical order, to the lifecycle of the guest container. +# Hooks are executed in the runtime namespace of the guest. See the official documentation: +# https://github.com/opencontainers/runtime-spec/blob/v1.0.1/config.md#posix-platform-hooks +# Warnings will be logged if any error is encountered will scanning for hooks, +# but it will not abort container execution. +#guest_hook_path = "/usr/share/oci/hooks" + +[factory] +# VM templating support. Once enabled, new VMs are created from template +# using vm cloning. They will share the same initial kernel, initramfs and +# agent memory by mapping it readonly. It helps speeding up new container +# creation and saves a lot of memory if there are many kata containers running +# on the same host. +# +# When disabled, new VMs are created from scratch. +# +# Note: Requires "initrd=" to be set ("image=" is not supported). +# +# Default false +#enable_template = true + +# Specifies the path of template. +# +# Default "/run/vc/vm/template" +#template_path = "/run/vc/vm/template" + +# The number of caches of VMCache: +# unspecified or == 0 --> VMCache is disabled +# > 0 --> will be set to the specified number +# +# VMCache is a function that creates VMs as caches before using it. +# It helps speed up new container creation. +# The function consists of a server and some clients communicating +# through Unix socket. The protocol is gRPC in protocols/cache/cache.proto. +# The VMCache server will create some VMs and cache them by factory cache. +# It will convert the VM to gRPC format and transport it when gets +# requestion from clients. +# Factory grpccache is the VMCache client. It will request gRPC format +# VM and convert it back to a VM. If VMCache function is enabled, +# kata-runtime will request VM from factory grpccache when it creates +# a new sandbox. +# +# Default 0 +#vm_cache_number = 0 + +# Specify the address of the Unix socket that is used by VMCache. +# +# Default /var/run/kata-containers/cache.sock +#vm_cache_endpoint = "/var/run/kata-containers/cache.sock" + +[proxy.@PROJECT_TYPE@] +path = "@PROXYPATH@" + +# If enabled, proxy messages will be sent to the system log +# (default: disabled) +#enable_debug = true + +[shim.@PROJECT_TYPE@] +path = "@SHIMPATH@" + +# If enabled, shim messages will be sent to the system log +# (default: disabled) +#enable_debug = true + +# If enabled, the shim will create opentracing.io traces and spans. +# (See https://www.jaegertracing.io/docs/getting-started). +# +# Note: By default, the shim runs in a separate network namespace. Therefore, +# to allow it to send trace details to the Jaeger agent running on the host, +# it is necessary to set 'disable_new_netns=true' so that it runs in the host +# network namespace. +# +# (default: disabled) +#enable_tracing = true + +[agent.@PROJECT_TYPE@] +# If enabled, make the agent display debug-level messages. +# (default: disabled) +#enable_debug = true + +# Enable agent tracing. +# +# If enabled, the default trace mode is "dynamic" and the +# default trace type is "isolated". The trace mode and type are set +# explicity with the `trace_type=` and `trace_mode=` options. +# +# Notes: +# +# - Tracing is ONLY enabled when `enable_tracing` is set: explicitly +# setting `trace_mode=` and/or `trace_type=` without setting `enable_tracing` +# will NOT activate agent tracing. +# +# - See https://github.com/kata-containers/agent/blob/master/TRACING.md for +# full details. +# +# (default: disabled) +#enable_tracing = true +# +#trace_mode = "dynamic" +#trace_type = "isolated" + +# Comma separated list of kernel modules and their parameters. +# These modules will be loaded in the guest kernel using modprobe(8). +# The following example can be used to load two kernel modules with parameters +# - kernel_modules=["e1000e InterruptThrottleRate=3000,3000,3000 EEE=1", "i915 enable_ppgtt=0"] +# The first word is considered as the module name and the rest as its parameters. +# Container will not be started when: +# * A kernel module is specified and the modprobe command is not installed in the guest +# or it fails loading the module. +# * The module is not available in the guest or it doesn't met the guest kernel +# requirements, like architecture and version. +# +kernel_modules=[] + + +[netmon] +# If enabled, the network monitoring process gets started when the +# sandbox is created. This allows for the detection of some additional +# network being added to the existing network namespace, after the +# sandbox has been created. +# (default: disabled) +#enable_netmon = true + +# Specify the path to the netmon binary. +path = "@NETMONPATH@" + +# If enabled, netmon messages will be sent to the system log +# (default: disabled) +#enable_debug = true + +[runtime] +# If enabled, the runtime will log additional debug messages to the +# system log +# (default: disabled) +#enable_debug = true +# +# Internetworking model +# Determines how the VM should be connected to the +# the container network interface +# Options: +# +# - bridged (Deprecated) +# Uses a linux bridge to interconnect the container interface to +# the VM. Works for most cases except macvlan and ipvlan. +# ***NOTE: This feature has been deprecated with plans to remove this +# feature in the future. Please use other network models listed below. +# +# - macvtap +# Used when the Container network interface can be bridged using +# macvtap. +# +# - none +# Used when customize network. Only creates a tap device. No veth pair. +# +# - tcfilter +# Uses tc filter rules to redirect traffic from the network interface +# provided by plugin to a tap interface connected to the VM. +# +internetworking_model="@DEFNETWORKMODEL_QEMU@" + +# disable guest seccomp +# Determines whether container seccomp profiles are passed to the virtual +# machine and applied by the kata agent. If set to true, seccomp is not applied +# within the guest +# (default: true) +disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@ + +# If enabled, the runtime will create opentracing.io traces and spans. +# (See https://www.jaegertracing.io/docs/getting-started). +# (default: disabled) +#enable_tracing = true + +# If enabled, the runtime will not create a network namespace for shim and hypervisor processes. +# This option may have some potential impacts to your host. It should only be used when you know what you're doing. +# `disable_new_netns` conflicts with `enable_netmon` +# `disable_new_netns` conflicts with `internetworking_model=bridged` and `internetworking_model=macvtap`. It works only +# with `internetworking_model=none`. The tap device will be in the host network namespace and can connect to a bridge +# (like OVS) directly. +# If you are using docker, `disable_new_netns` only works with `docker run --net=none` +# (default: false) +#disable_new_netns = true + +# if enabled, the runtime will add all the kata processes inside one dedicated cgroup. +# The container cgroups in the host are not created, just one single cgroup per sandbox. +# The sandbox cgroup is not constrained by the runtime +# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox. +# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation. +# See: https://godoc.org/github.com/kata-containers/runtime/virtcontainers#ContainerType +sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@ + +# Enabled experimental feature list, format: ["a", "b"]. +# Experimental features are features not stable enough for production, +# They may break compatibility, and are prepared for a big version bump. +# Supported experimental features: +# 1. "newstore": new persist storage driver which breaks backward compatibility, +# expected to move out of experimental in 2.0.0. +# (default: []) +experimental=@DEFAULTEXPFEATURES@