Merge pull request #8070 from nubificus/feat_add-fc-runtime-rs

runtime-rs: firecracker hypervisor backend
Commit db75b5f3c4 by Anastassios Nanos, 2024-07-03 22:29:30 +03:00 (committed by GitHub)
22 changed files with 1687 additions and 14 deletions
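At its core, the new backend drives the Firecracker VMM over its REST API on a Unix domain socket, using hyper and hyperlocal (see the request helpers added in the fc_api module below). As a rough standalone sketch of one such call, not taken from this PR, with the socket path, kernel path, and boot arguments being illustrative assumptions:

// Cargo dependencies assumed: hyper 0.14 ("client"), hyperlocal 0.8, serde_json, tokio ("full").
use hyper::{Body, Client, Method, Request};
use hyperlocal::{UnixClientExt, Uri};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Illustrative socket path; the runtime derives the real one from the sandbox run dir.
    let sock = "/run/kata/123/run/fc.sock";
    let client = Client::unix();
    // Configure the boot source, mirroring the PUT /boot-source call made by the backend.
    let body = serde_json::json!({
        "kernel_image_path": "/opt/kata/share/kata-containers/vmlinux.container",
        "boot_args": "console=ttyS0 pci=off"
    })
    .to_string();
    let url: hyper::Uri = Uri::new(sock, "/boot-source").into();
    let req = Request::builder()
        .method(Method::PUT)
        .uri(url)
        .header("Accept", "application/json")
        .header("Content-Type", "application/json")
        .body(Body::from(body))?;
    let resp = client.request(req).await?;
    println!("firecracker replied: {}", resp.status());
    Ok(())
}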


@ -88,3 +88,13 @@ pub const DEFAULT_CH_PCI_BRIDGES: u32 = 2;
pub const MAX_CH_PCI_BRIDGES: u32 = 5;
pub const MAX_CH_VCPUS: u32 = 256;
pub const MIN_CH_MEMORY_SIZE_MB: u32 = 64;
// Default configuration for firecracker
pub const DEFAULT_FIRECRACKER_ENTROPY_SOURCE: &str = "/dev/urandom";
pub const DEFAULT_FIRECRACKER_MEMORY_SIZE_MB: u32 = 128;
pub const DEFAULT_FIRECRACKER_MEMORY_SLOTS: u32 = 128;
pub const DEFAULT_FIRECRACKER_VCPUS: u32 = 1;
pub const DEFAULT_FIRECRACKER_GUEST_KERNEL_IMAGE: &str = "vmlinux";
pub const DEFAULT_FIRECRACKER_GUEST_KERNEL_PARAMS: &str = "";
pub const MAX_FIRECRACKER_VCPUS: u32 = 32;
pub const MIN_FIRECRACKER_MEMORY_SIZE_MB: u32 = 128;


@ -0,0 +1,116 @@
// Copyright (c) 2019-2021 Alibaba Cloud
// Copyright (c) 2022-2023 Nubificus LTD
//
// SPDX-License-Identifier: Apache-2.0
//
use std::io::Result;
use std::path::Path;
use std::sync::Arc;
use super::{default, register_hypervisor_plugin};
use crate::config::default::MAX_FIRECRACKER_VCPUS;
use crate::config::default::MIN_FIRECRACKER_MEMORY_SIZE_MB;
use crate::config::{ConfigPlugin, TomlConfig};
use crate::{eother, validate_path};
/// Hypervisor name for firecracker, used to index `TomlConfig::hypervisor`.
pub const HYPERVISOR_NAME_FIRECRACKER: &str = "firecracker";
/// Configuration information for firecracker.
#[derive(Default, Debug)]
pub struct FirecrackerConfig {}
impl FirecrackerConfig {
/// Create a new instance of `FirecrackerConfig`.
pub fn new() -> Self {
FirecrackerConfig {}
}
/// Register the firecracker plugin.
pub fn register(self) {
let plugin = Arc::new(self);
register_hypervisor_plugin(HYPERVISOR_NAME_FIRECRACKER, plugin);
}
}
impl ConfigPlugin for FirecrackerConfig {
fn get_max_cpus(&self) -> u32 {
MAX_FIRECRACKER_VCPUS
}
fn get_min_memory(&self) -> u32 {
MIN_FIRECRACKER_MEMORY_SIZE_MB
}
fn name(&self) -> &str {
HYPERVISOR_NAME_FIRECRACKER
}
/// Adjust the configuration information after loading it from the configuration file.
fn adjust_config(&self, conf: &mut TomlConfig) -> Result<()> {
if let Some(firecracker) = conf.hypervisor.get_mut(HYPERVISOR_NAME_FIRECRACKER) {
if firecracker.boot_info.kernel.is_empty() {
firecracker.boot_info.kernel =
default::DEFAULT_FIRECRACKER_GUEST_KERNEL_IMAGE.to_string();
}
if firecracker.boot_info.kernel_params.is_empty() {
firecracker.boot_info.kernel_params =
default::DEFAULT_FIRECRACKER_GUEST_KERNEL_PARAMS.to_string();
}
if firecracker.machine_info.entropy_source.is_empty() {
firecracker.machine_info.entropy_source =
default::DEFAULT_FIRECRACKER_ENTROPY_SOURCE.to_string();
}
if firecracker.memory_info.default_memory == 0 {
firecracker.memory_info.default_memory =
default::DEFAULT_FIRECRACKER_MEMORY_SIZE_MB;
}
}
Ok(())
}
/// Validate the configuration information.
fn validate(&self, conf: &TomlConfig) -> Result<()> {
if let Some(firecracker) = conf.hypervisor.get(HYPERVISOR_NAME_FIRECRACKER) {
if firecracker.path.is_empty() {
return Err(eother!("Firecracker path is empty"));
}
validate_path!(
firecracker.path,
"FIRECRACKER binary path `{}` is invalid: {}"
)?;
if firecracker.boot_info.kernel.is_empty() {
return Err(eother!("Guest kernel image for firecracker is empty"));
}
if firecracker.boot_info.image.is_empty() {
return Err(eother!(
"Both guest boot image and initrd for firecracker are empty"
));
}
if (firecracker.cpu_info.default_vcpus > 0
&& firecracker.cpu_info.default_vcpus as u32 > default::MAX_FIRECRACKER_VCPUS)
|| firecracker.cpu_info.default_maxvcpus > default::MAX_FIRECRACKER_VCPUS
{
return Err(eother!(
"Firecracker hypervisor can not support {} vCPUs",
firecracker.cpu_info.default_maxvcpus
));
}
if firecracker.memory_info.default_memory < MIN_FIRECRACKER_MEMORY_SIZE_MB {
return Err(eother!(
"Firecracker hypervisor has minimal memory limitation {}",
MIN_FIRECRACKER_MEMORY_SIZE_MB
));
}
}
Ok(())
}
}
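As a brief usage sketch, the plugin above is registered the same way the runtime does later in this PR; once registered, loading a TomlConfig runs adjust_config() and validate() for any [hypervisor.firecracker] section:

use std::sync::Arc;

use kata_types::config::hypervisor::register_hypervisor_plugin;
use kata_types::config::FirecrackerConfig;

fn register_firecracker_plugin() {
    // Make the "firecracker" section known to the TomlConfig machinery so that
    // the defaults and validation defined above are applied on load.
    let firecracker_config = Arc::new(FirecrackerConfig::new());
    register_hypervisor_plugin("firecracker", firecracker_config);
}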


@ -59,6 +59,9 @@ pub const VIRTIO_SCSI: &str = "virtio-scsi";
/// Virtual PMEM device driver.
pub const VIRTIO_PMEM: &str = "virtio-pmem";
mod firecracker;
pub use self::firecracker::{FirecrackerConfig, HYPERVISOR_NAME_FIRECRACKER};
const VIRTIO_9P: &str = "virtio-9p";
const VIRTIO_FS: &str = "virtio-fs";
const VIRTIO_FS_INLINE: &str = "inline-virtio-fs";
@ -530,6 +533,7 @@ impl TopologyConfigInfo {
HYPERVISOR_NAME_QEMU,
HYPERVISOR_NAME_CH,
HYPERVISOR_NAME_DRAGONBALL,
HYPERVISOR_NAME_FIRECRACKER,
];
let hypervisor_name = toml_config.runtime.hypervisor_name.as_str();
if !hypervisor_names.contains(&hypervisor_name) {


@ -25,8 +25,8 @@ pub mod hypervisor;
pub use self::agent::Agent;
use self::default::DEFAULT_AGENT_DBG_CONSOLE_PORT;
pub use self::hypervisor::{
BootInfo, CloudHypervisorConfig, DragonballConfig, Hypervisor, QemuConfig,
HYPERVISOR_NAME_DRAGONBALL, HYPERVISOR_NAME_QEMU,
BootInfo, CloudHypervisorConfig, DragonballConfig, FirecrackerConfig, Hypervisor, QemuConfig,
HYPERVISOR_NAME_DRAGONBALL, HYPERVISOR_NAME_FIRECRACKER, HYPERVISOR_NAME_QEMU,
};
mod runtime;


@ -37,6 +37,9 @@ fn get_uds_with_sid(short_id: &str, path: &str) -> Result<String> {
return Ok(format!("unix://{}", p.display()));
}
let _ = fs::create_dir_all(kata_run_path.join(short_id))
.context(format!("failed to create directory {:?}", kata_run_path.join(short_id)));
let target_ids: Vec<String> = fs::read_dir(&kata_run_path)?
.filter_map(|e| {
let x = e.ok()?.file_name().to_string_lossy().into_owned();


@ -1635,6 +1635,8 @@ dependencies = [
"dragonball",
"futures 0.3.28",
"go-flag",
"hyper",
"hyperlocal",
"hypervisor",
"kata-sys-util",
"kata-types",


@ -109,6 +109,12 @@ ROOTFSTYPE_XFS := \"xfs\"
ROOTFSTYPE_EROFS := \"erofs\"
DEFROOTFSTYPE := $(ROOTFSTYPE_EXT4)
FCBINDIR := $(PREFIXDEPS)/bin
FCPATH = $(FCBINDIR)/$(FCCMD)
FCVALIDHYPERVISORPATHS := [\"$(FCPATH)\"]
FCJAILERPATH = $(FCBINDIR)/$(FCJAILERCMD)
FCVALIDJAILERPATHS = [\"$(FCJAILERPATH)\"]
PKGLIBEXECDIR := $(LIBEXECDIR)/$(PROJECT_DIR)
FIRMWAREPATH :=
FIRMWAREVOLUMEPATH :=
@ -164,8 +170,11 @@ DEFMSIZE9P := 8192
DEFVFIOMODE := guest-kernel
##VAR DEFSANDBOXCGROUPONLY=<bool> Default cgroup model
DEFSANDBOXCGROUPONLY ?= false
DEFSANDBOXCGROUPONLY_DB ?= true
DEFSANDBOXCGROUPONLY_FC ?= true
DEFSTATICRESOURCEMGMT ?= false
DEFSTATICRESOURCEMGMT_DB ?= false
DEFSTATICRESOURCEMGMT_FC ?= true
DEFBINDMOUNTS := []
DEFDANCONF := /run/kata-containers/dans
SED = sed
@ -216,7 +225,7 @@ ifneq (,$(DBCMD))
KERNELTYPE_DB = uncompressed
KERNEL_NAME_DB = $(call MAKE_KERNEL_NAME_DB,$(KERNELTYPE_DB))
KERNELPATH_DB = $(KERNELDIR)/$(KERNEL_NAME_DB)
DEFSANDBOXCGROUPONLY = true
DEFSANDBOXCGROUPONLY_DB = true
RUNTIMENAME := virt_container
PIPESIZE := 1
DBSHAREDFS := inline-virtio-fs
@ -244,6 +253,9 @@ ifneq (,$(CLHCMD))
KERNEL_NAME_CLH = $(call MAKE_KERNEL_NAME,$(KERNELTYPE_CLH))
KERNELPATH_CLH = $(KERNELDIR)/$(KERNEL_NAME_CLH)
VMROOTFSDRIVER_CLH := virtio-pmem
DEFSTATICRESOURCEMGMT = true
DEFSANDBOXCGROUPONLY = true
endif
ifneq (,$(QEMUCMD))
@ -288,6 +300,28 @@ endif
DEFSECCOMPSANDBOXPARAM := on,obsolete=deny,spawn=deny,resourcecontrol=deny
DEFGUESTSELINUXLABEL := system_u:system_r:container_t
endif
ifneq (,$(FCCMD))
KNOWN_HYPERVISORS += $(HYPERVISOR_FC)
CONFIG_FILE_FC = configuration-rs-fc.toml
CONFIG_FC = config/$(CONFIG_FILE_FC)
CONFIG_FC_IN = $(CONFIG_FC).in
CONFIG_PATH_FC = $(abspath $(CONFDIR)/$(CONFIG_FILE_FC))
CONFIG_PATHS += $(CONFIG_PATH_FC)
SYSCONFIG_FC = $(abspath $(SYSCONFDIR)/$(CONFIG_FILE_FC))
SYSCONFIG_PATHS += $(SYSCONFIG_FC)
CONFIGS += $(CONFIG_FC)
# firecracker-specific options (all should be suffixed by "_FC")
DEFBLOCKSTORAGEDRIVER_FC := virtio-blk-mmio
DEFMAXMEMSZ_FC := 2048
DEFNETWORKMODEL_FC := tcfilter
KERNELPARAMS = console=ttyS0 agent.log_vport=1025
KERNELTYPE_FC = uncompressed
KERNEL_NAME_FC = $(call MAKE_KERNEL_NAME_FC,$(KERNELTYPE_FC))
KERNELPATH_FC = $(KERNELDIR)/$(KERNEL_NAME_FC)
DEFSANDBOXCGROUPONLY_FC = true
RUNTIMENAME := virt_container
DEFSTATICRESOURCEMGMT_FC ?= true
endif
ifeq ($(DEFAULT_HYPERVISOR),$(HYPERVISOR_DB))
DEFAULT_HYPERVISOR_CONFIG = $(CONFIG_FILE_DB)
@ -296,16 +330,21 @@ endif
ifeq ($(DEFAULT_HYPERVISOR),$(HYPERVISOR_QEMU))
DEFAULT_HYPERVISOR_CONFIG = $(CONFIG_FILE_QEMU)
endif
ifeq ($(DEFAULT_HYPERVISOR),$(HYPERVISOR_FC))
DEFAULT_HYPERVISOR_CONFIG = $(CONFIG_FILE_FC)
endif
# list of variables the user may wish to override
USER_VARS += ARCH
USER_VARS += BINDIR
USER_VARS += CONFIG_DB_IN
USER_VARS += CONFIG_FC_IN
USER_VARS += CONFIG_PATH
USER_VARS += CONFIG_QEMU_IN
USER_VARS += DESTDIR
USER_VARS += DEFAULT_HYPERVISOR
USER_VARS += DBCMD
USER_VARS += DBCTLCMD
USER_VARS += FCCTLCMD
USER_VARS += DBPATH
USER_VARS += DBVALIDHYPERVISORPATHS
USER_VARS += DBCTLPATH
@ -316,6 +355,13 @@ USER_VARS += QEMUPATH
USER_VARS += QEMUVALIDHYPERVISORPATHS
USER_VARS += FIRMWAREPATH_CLH
USER_VARS += KERNELPATH_CLH
USER_VARS += FCCMD
USER_VARS += FCPATH
USER_VARS += FCVALIDHYPERVISORPATHS
USER_VARS += FCJAILERPATH
USER_VARS += FCVALIDJAILERPATHS
USER_VARS += DEFMAXMEMSZ_FC
USER_VARS += SYSCONFIG
USER_VARS += IMAGENAME
USER_VARS += IMAGEPATH
@ -329,6 +375,8 @@ USER_VARS += KERNELDIR
USER_VARS += KERNELTYPE
USER_VARS += KERNELPATH_DB
USER_VARS += KERNELPATH_QEMU
USER_VARS += KERNELPATH_FC
USER_VARS += KERNELPATH
USER_VARS += KERNELVIRTIOFSPATH
USER_VARS += FIRMWAREPATH
USER_VARS += FIRMWAREVOLUMEPATH
@ -365,6 +413,7 @@ USER_VARS += DEFBRIDGES
USER_VARS += DEFNETWORKMODEL_DB
USER_VARS += DEFNETWORKMODEL_CLH
USER_VARS += DEFNETWORKMODEL_QEMU
USER_VARS += DEFNETWORKMODEL_FC
USER_VARS += DEFDISABLEGUESTEMPTYDIR
USER_VARS += DEFDISABLEGUESTSECCOMP
USER_VARS += DEFDISABLESELINUX
@ -374,6 +423,7 @@ USER_VARS += DEFDISABLEBLOCK
USER_VARS += DEFBLOCKSTORAGEDRIVER_DB
USER_VARS += DEFBLOCKSTORAGEDRIVER_QEMU
USER_VARS += DEFBLOCKDEVICEAIO_QEMU
USER_VARS += DEFBLOCKSTORAGEDRIVER_FC
USER_VARS += DEFSHAREDFS_CLH_VIRTIOFS
USER_VARS += DEFSHAREDFS_QEMU_VIRTIOFS
USER_VARS += DEFVIRTIOFSDAEMON
@ -396,8 +446,11 @@ USER_VARS += DEFENTROPYSOURCE
USER_VARS += DEFVALIDENTROPYSOURCES
USER_VARS += DEFSANDBOXCGROUPONLY
USER_VARS += DEFSANDBOXCGROUPONLY_QEMU
USER_VARS += DEFSANDBOXCGROUPONLY_DB
USER_VARS += DEFSANDBOXCGROUPONLY_FC
USER_VARS += DEFSTATICRESOURCEMGMT
USER_VARS += DEFSTATICRESOURCEMGMT_DB
USER_VARS += DEFSTATICRESOURCEMGMT_FC
USER_VARS += DEFBINDMOUNTS
USER_VARS += DEFVFIOMODE
USER_VARS += BUILDFLAGS
@ -405,6 +458,7 @@ USER_VARS += RUNTIMENAME
USER_VARS += HYPERVISOR_DB
USER_VARS += HYPERVISOR_CLH
USER_VARS += HYPERVISOR_QEMU
USER_VARS += HYPERVISOR_FC
USER_VARS += PIPESIZE
USER_VARS += DBSHAREDFS
USER_VARS += KATA_INSTALL_GROUP
@ -442,6 +496,7 @@ RUNTIME_VERSION=$(VERSION)
GENERATED_VARS = \
VERSION \
CONFIG_DB_IN \
CONFIG_FC_IN \
$(USER_VARS)
@ -483,6 +538,9 @@ endef
define MAKE_KERNEL_NAME_DB
$(if $(findstring uncompressed,$1),vmlinux-dragonball-experimental.container,vmlinuz-dragonball-experimental.container)
endef
define MAKE_KERNEL_NAME_FC
$(if $(findstring uncompressed,$1),vmlinux.container,vmlinuz.container)
endef
# Returns the name of the kernel file to use based on the provided KERNELTYPE.
# # $1 : KERNELTYPE (compressed or uncompressed)


@ -13,3 +13,5 @@ QEMUCMD := qemu-system-aarch64
# dragonball binary name
DBCMD := dragonball
FCCMD := firecracker
FCJAILERCMD := jailer


@ -16,3 +16,7 @@ DBCMD := dragonball
# cloud-hypervisor binary name
CLHCMD := cloud-hypervisor
# firecracker binary (vmm and jailer)
FCCMD := firecracker
FCJAILERCMD := jailer


@ -341,7 +341,7 @@ disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@
# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation.
# The sandbox cgroup is constrained if there is no container type annotation.
# See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType
sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@
sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY_DB@
# Enabled experimental feature list, format: ["a", "b"].
# Experimental features are features not stable enough for production,


@ -0,0 +1,373 @@
# Copyright (c) 2017-2023 Intel Corporation
# Copyright (c) Adobe Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# XXX: WARNING: this file is auto-generated.
# XXX:
# XXX: Source file: "@CONFIG_FC_IN@"
# XXX: Project:
# XXX: Name: @PROJECT_NAME@
# XXX: Type: @PROJECT_TYPE@
[hypervisor.firecracker]
path = "@FCPATH@"
kernel = "@KERNELPATH_FC@"
image = "@IMAGEPATH@"
rootfs_type=@DEFROOTFSTYPE@
# List of valid annotation names for the hypervisor
# Each member of the list is a regular expression, which is the base name
# of the annotation, e.g. "path" for io.katacontainers.config.hypervisor.path"
enable_annotations = @DEFENABLEANNOTATIONS@
# List of valid annotations values for the hypervisor
# Each member of the list is a path pattern as described by glob(3).
# The default if not set is empty (all annotations rejected.)
# Your distribution recommends: @FCVALIDHYPERVISORPATHS@
valid_hypervisor_paths = @FCVALIDHYPERVISORPATHS@
# Path for the jailer specific to firecracker
# If the jailer path is not set, kata will launch firecracker
# without a jail. If the jailer path is set, firecracker will be
# launched in a jailed environment created by the jailer.
#jailer_path = "@FCJAILERPATH@"
# List of valid jailer path values for the hypervisor
# Each member of the list can be a regular expression
# The default if not set is empty (all annotations rejected.)
# Your distribution recommends: @FCVALIDJAILERPATHS@
valid_jailer_paths = @FCVALIDJAILERPATHS@
# Optional space-separated list of options to pass to the guest kernel.
# For example, use `kernel_params = "vsyscall=emulate"` if you are having
# trouble running pre-2.15 glibc.
#
# WARNING: - any parameter specified here will take priority over the default
# parameter value of the same name used to start the virtual machine.
# Do not set values here unless you understand the impact of doing so as you
# may stop the virtual machine from booting.
# To see the list of default parameters, enable hypervisor debug, create a
# container and look for 'default-kernel-parameters' log entries.
kernel_params = "@KERNELPARAMS@"
# Default number of vCPUs per SB/VM:
# unspecified or 0 --> will be set to @DEFVCPUS@
# < 0 --> will be set to the actual number of physical cores
# > 0 <= number of physical cores --> will be set to the specified number
# > number of physical cores --> will be set to the actual number of physical cores
default_vcpus = 1
# Default maximum number of vCPUs per SB/VM:
# unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number
# of vCPUs supported by KVM if that number is exceeded
# > 0 <= number of physical cores --> will be set to the specified number
# > number of physical cores --> will be set to the actual number of physical cores or to the maximum number
# of vCPUs supported by KVM if that number is exceeded
# WARNING: Depending on the architecture, the maximum number of vCPUs supported by KVM is used when
# the actual number of physical cores is greater than it.
# WARNING: Be aware that this value impacts the virtual machine's memory footprint and CPU
# hotplug functionality. For example, `default_maxvcpus = 240` specifies that up to 240 vCPUs
# can be added to a SB/VM, but the memory footprint will be big. Another example, with
# `default_maxvcpus = 8` the memory footprint will be small, but 8 will be the maximum number of
# vCPUs supported by the SB/VM. In general, we recommend that you do not edit this variable,
# unless you know what you are doing.
# NOTICE: on arm platform with gicv2 interrupt controller, set it to 8.
default_maxvcpus = @DEFMAXVCPUS@
# Bridges can be used to hot plug devices.
# Limitations:
# * Currently only pci bridges are supported
# * Up to 30 devices per bridge can be hot plugged.
# * Up to 5 PCI bridges can be cold plugged per VM.
# This limitation could be a bug in the kernel
# Default number of bridges per SB/VM:
# unspecified or 0 --> will be set to @DEFBRIDGES@
# > 1 <= 5 --> will be set to the specified number
# > 5 --> will be set to 5
default_bridges = @DEFBRIDGES@
# Default memory size in MiB for SB/VM.
# If unspecified then it will be set to @DEFMEMSZ@ MiB.
default_memory = @DEFMEMSZ@
#
# Default memory slots per SB/VM.
# If unspecified then it will be set to @DEFMEMSLOTS@.
# This determines how many times memory can be hot-added to the sandbox/VM.
memory_slots = @DEFMEMSLOTS@
# The size in MiB will be added to the maximum memory of the hypervisor.
# It is the memory address space for the NVDIMM device.
# If the block storage driver (block_device_driver) is set to "nvdimm",
# memory_offset should be set to the size of the block device.
# Default 0
#memory_offset = 0
# Default maximum memory in MiB per SB / VM
# unspecified or == 0 --> will be set to the actual amount of physical RAM
# > 0 <= amount of physical RAM --> will be set to the specified number
# > amount of physical RAM --> will be set to the actual amount of physical RAM
default_maxmemory = @DEFMAXMEMSZ_FC@
# Block storage driver to be used for the hypervisor in case the container
# rootfs is backed by a block device. This is virtio-scsi, virtio-blk
# or nvdimm.
block_device_driver = "@DEFBLOCKSTORAGEDRIVER_FC@"
# Specifies whether cache-related options will be set for block devices.
# Default false
#block_device_cache_set = true
# Specifies cache-related options for block devices.
# Denotes whether use of O_DIRECT (bypass the host page cache) is enabled.
# Default false
#block_device_cache_direct = true
# Specifies cache-related options for block devices.
# Denotes whether flush requests for the device are ignored.
# Default false
#block_device_cache_noflush = true
# Enable pre allocation of VM RAM, default false
# Enabling this will result in lower container density
# as all of the memory will be allocated and locked
# This is useful when you want to reserve all the memory
# upfront or in the cases where you want memory latencies
# to be very predictable
# Default false
#enable_mem_prealloc = true
# Enable huge pages for VM RAM, default false
# Enabling this will result in the VM memory
# being allocated using huge pages.
# This is useful when you want to use vhost-user network
# stacks within the container. This will automatically
# result in memory pre allocation
#enable_hugepages = true
# Enable vIOMMU, default false
# Enabling this will result in the VM having a vIOMMU device
# This will also add the following options to the kernel's
# command line: intel_iommu=on,iommu=pt
#enable_iommu = true
# This option changes the default hypervisor and kernel parameters
# to enable debug output where available.
#
# Default false
#enable_debug = true
# Disable the customizations done in the runtime when it detects
# that it is running on top of a VMM. This will result in the runtime
# behaving as it would when running on bare metal.
#
#disable_nesting_checks = true
# This is the msize used for 9p shares. It is the number of bytes
# used for 9p packet payload.
#msize_9p = @DEFMSIZE9P@
# VFIO devices are hotplugged on a bridge by default.
# Enable hotplugging on root bus. This may be required for devices with
# a large PCI bar, as this is a current limitation with hotplugging on
# a bridge.
# Default false
#hotplug_vfio_on_root_bus = true
#
# Default entropy source.
# The path to a host source of entropy (including a real hardware RNG)
# /dev/urandom and /dev/random are two main options.
# Be aware that /dev/random is a blocking source of entropy. If the host
# runs out of entropy, the VM's boot time will increase, possibly leading to startup
# timeouts.
# The source of entropy /dev/urandom is non-blocking and provides a
# generally acceptable source of entropy. It should work well for pretty much
# all practical purposes.
#entropy_source= "@DEFENTROPYSOURCE@"
# List of valid annotations values for entropy_source
# The default if not set is empty (all annotations rejected.)
# Your distribution recommends: @DEFVALIDENTROPYSOURCES@
valid_entropy_sources = @DEFVALIDENTROPYSOURCES@
# Path to OCI hook binaries in the *guest rootfs*.
# This does not affect host-side hooks which must instead be added to
# the OCI spec passed to the runtime.
#
# You can create a rootfs with hooks by customizing the osbuilder scripts:
# https://github.com/kata-containers/kata-containers/tree/main/tools/osbuilder
#
# Hooks must be stored in a subdirectory of guest_hook_path according to their
# hook type, i.e. "guest_hook_path/{prestart,poststart,poststop}".
# The agent will scan these directories for executable files and add them, in
# lexicographical order, to the lifecycle of the guest container.
# Hooks are executed in the runtime namespace of the guest. See the official documentation:
# https://github.com/opencontainers/runtime-spec/blob/v1.0.1/config.md#posix-platform-hooks
# Warnings will be logged if any error is encountered while scanning for hooks,
# but it will not abort container execution.
#guest_hook_path = "/usr/share/oci/hooks"
#
# Use the rx rate limiter to control network I/O inbound bandwidth (size in bits/sec for SB/VM).
# Firecracker provides a built-in rate limiter, which is based on the TBF (Token Bucket Filter)
# queueing discipline.
# Default 0-sized value means unlimited rate.
#rx_rate_limiter_max_rate = 0
# Use the tx rate limiter to control network I/O outbound bandwidth (size in bits/sec for SB/VM).
# Firecracker provides a built-in rate limiter, which is based on the TBF (Token Bucket Filter)
# queueing discipline.
# Default 0-sized value means unlimited rate.
#tx_rate_limiter_max_rate = 0
# disable applying SELinux on the VMM process (default false)
disable_selinux=@DEFDISABLESELINUX@
[factory]
# VM templating support. Once enabled, new VMs are created from template
# using vm cloning. They will share the same initial kernel, initramfs and
# agent memory by mapping it readonly. It helps speeding up new container
# creation and saves a lot of memory if there are many kata containers running
# on the same host.
#
# When disabled, new VMs are created from scratch.
#
# Note: Requires "initrd=" to be set ("image=" is not supported).
#
# Default false
#enable_template = true
[agent.@PROJECT_TYPE@]
# If enabled, make the agent display debug-level messages.
# (default: disabled)
#enable_debug = true
# Enable agent tracing.
#
# If enabled, the agent will generate OpenTelemetry trace spans.
#
# Notes:
#
# - If the runtime also has tracing enabled, the agent spans will be
# associated with the appropriate runtime parent span.
# - If enabled, the runtime will wait for the container to shutdown,
# increasing the container shutdown time slightly.
#
# (default: disabled)
#enable_tracing = true
# Comma separated list of kernel modules and their parameters.
# These modules will be loaded in the guest kernel using modprobe(8).
# The following example can be used to load two kernel modules with parameters
# - kernel_modules=["e1000e InterruptThrottleRate=3000,3000,3000 EEE=1", "i915 enable_ppgtt=0"]
# The first word is considered as the module name and the rest as its parameters.
# Container will not be started when:
# * A kernel module is specified and the modprobe command is not installed in the guest
# or it fails loading the module.
# * The module is not available in the guest or it doesn't meet the guest kernel
# requirements, like architecture and version.
#
kernel_modules=[]
# Enable debug console.
# If enabled, the user can connect to the guest OS running inside the hypervisor
# through the "kata-runtime exec <sandbox-id>" command
#debug_console_enabled = true
# Agent connection dialing timeout value in seconds
# (default: 45)
dial_timeout = 45
[runtime]
# If enabled, the runtime will log additional debug messages to the
# system log
# (default: disabled)
#enable_debug = true
#
# Internetworking model
# Determines how the VM should be connected to
# the container network interface
# Options:
#
# - macvtap
# Used when the Container network interface can be bridged using
# macvtap.
#
# - none
# Used with a customized network. Only creates a tap device. No veth pair.
#
# - tcfilter
# Uses tc filter rules to redirect traffic from the network interface
# provided by the plugin to a tap interface connected to the VM.
#
internetworking_model="@DEFNETWORKMODEL_FC@"
name="@RUNTIMENAME@"
hypervisor_name="@HYPERVISOR_FC@"
agent_name="@PROJECT_TYPE@"
# disable guest seccomp
# Determines whether container seccomp profiles are passed to the virtual
# machine and applied by the kata agent. If set to true, seccomp is not applied
# within the guest
# (default: true)
disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@
# If enabled, the runtime will create opentracing.io traces and spans.
# (See https://www.jaegertracing.io/docs/getting-started).
# (default: disabled)
#enable_tracing = true
# Set the full url to the Jaeger HTTP Thrift collector.
# The default if not set will be "http://localhost:14268/api/traces"
#jaeger_endpoint = ""
# Sets the username to be used if basic auth is required for Jaeger.
#jaeger_user = ""
# Sets the password to be used if basic auth is required for Jaeger.
#jaeger_password = ""
# If enabled, the runtime will not create a network namespace for shim and hypervisor processes.
# This option may have some potential impacts to your host. It should only be used when you know what you're doing.
# `disable_new_netns` conflicts with `internetworking_model=tcfilter` and `internetworking_model=macvtap`. It works only
# with `internetworking_model=none`. The tap device will be in the host network namespace and can connect to a bridge
# (like OVS) directly.
# (default: false)
#disable_new_netns = true
# if enabled, the runtime will add all the kata processes inside one dedicated cgroup.
# The container cgroups in the host are not created, just one single cgroup per sandbox.
# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox.
# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation.
# The sandbox cgroup is constrained if there is no container type annotation.
# See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType
sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY_FC@
# If enabled, the runtime will attempt to determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In
# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. This is generally helpful
# when a hardware architecture or hypervisor solution is used that does not support CPU and/or memory hotplug.
# Compatibility for determining appropriate sandbox (VM) size:
# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O
# does not yet support sandbox sizing annotations.
# - When running single containers using a tool like ctr, container sizing information will be available.
static_sandbox_resource_mgmt=@DEFSTATICRESOURCEMGMT_FC@
# If enabled, the runtime will not create Kubernetes emptyDir mounts on the guest filesystem. Instead, emptyDir mounts will
# be created on the host and shared via virtio-fs. This is potentially slower, but allows sharing of files from host to guest.
disable_guest_empty_dir=@DEFDISABLEGUESTEMPTYDIR@
# Enabled experimental feature list, format: ["a", "b"].
# Experimental features are features not stable enough for production,
# they may break compatibility, and are prepared for a big version bump.
# Supported experimental features:
# (default: [])
experimental=@DEFAULTEXPFEATURES@
# If enabled, user can run pprof tools with shim v2 process through kata-monitor.
# (default: false)
# enable_pprof = true


@ -49,6 +49,9 @@ qapi-qmp = "0.14.0"
[target.'cfg(not(target_arch = "s390x"))'.dependencies]
dragonball = { path = "../../../dragonball", features = ["atomic-guest-memory", "virtio-vsock", "hotplug", "virtio-blk", "virtio-net", "virtio-fs", "vhost-net", "dbs-upcall", "virtio-mem", "virtio-balloon", "vhost-user-net", "host-device"] }
dbs-utils = { path = "../../../dragonball/src/dbs_utils" }
hyperlocal = "0.8.0"
hyper = {version = "0.14.18", features = ["client"]}
[features]
default = []


@ -0,0 +1,324 @@
// Copyright (c) 2019-2022 Alibaba Cloud
// Copyright (c) 2023 Nubificus Ltd
//
// SPDX-License-Identifier: Apache-2.0
use crate::{
firecracker::{
inner_hypervisor::{FC_AGENT_SOCKET_NAME, ROOT},
sl, FcInner,
},
kernel_param::KernelParams,
NetworkConfig, Param,
};
use anyhow::{anyhow, Context, Result};
use dbs_utils::net::MacAddr;
use hyper::{Body, Method, Request, Response};
use hyperlocal::Uri;
use kata_sys_util::mount;
use nix::mount::MsFlags;
use serde_json::json;
use tokio::{fs, fs::File};
const REQUEST_RETRY: u32 = 500;
const FC_KERNEL: &str = "vmlinux";
const FC_ROOT_FS: &str = "rootfs";
const DRIVE_PREFIX: &str = "drive";
const DISK_POOL_SIZE: u32 = 6;
impl FcInner {
pub(crate) fn get_resource(&self, src: &str, dst: &str) -> Result<String> {
if self.jailed {
self.jail_resource(src, dst)
} else {
Ok(src.to_string())
}
}
fn jail_resource(&self, src: &str, dst: &str) -> Result<String> {
if src.is_empty() || dst.is_empty() {
return Err(anyhow!("invalid param src {} dst {}", src, dst));
}
let jailed_location = [self.vm_path.as_str(), ROOT, dst].join("/");
mount::bind_mount_unchecked(src, jailed_location.as_str(), false, MsFlags::MS_SLAVE)
.context("bind_mount ERROR")?;
let mut abs_path = String::from("/");
abs_path.push_str(dst);
Ok(abs_path)
}
// Remounting jailer root to ensure it has exec permissions, since firecracker binary will
// execute from there
pub(crate) async fn remount_jailer_with_exec(&self) -> Result<()> {
let localpath = [self.vm_path.clone(), ROOT.to_string()].join("/");
let _ = fs::create_dir_all(&localpath)
.await
.context(format!("failed to create directory {:?}", &localpath));
mount::bind_mount_unchecked(&localpath, &localpath, false, MsFlags::MS_SHARED)
.context("bind mount jailer root")?;
mount::bind_remount(&localpath, false).context("rebind mount jailer root")?;
Ok(())
}
pub(crate) async fn prepare_hvsock(&mut self) -> Result<()> {
let rel_uds_path = match self.jailed {
false => [self.vm_path.as_str(), FC_AGENT_SOCKET_NAME].join("/"),
true => FC_AGENT_SOCKET_NAME.to_string(),
};
let body_vsock: String = json!({
"guest_cid": 3,
"uds_path": rel_uds_path,
"vsock_id": ROOT,
})
.to_string();
self.request_with_retry(Method::PUT, "/vsock", body_vsock)
.await?;
Ok(())
}
pub(crate) async fn prepare_vmm_resources(&mut self) -> Result<()> {
let mut kernel_params = KernelParams::new(self.config.debug_info.enable_debug);
kernel_params.push(Param::new("pci", "off"));
kernel_params.push(Param::new("iommu", "off"));
let rootfs_driver = self.config.blockdev_info.block_device_driver.clone();
kernel_params.append(&mut KernelParams::new_rootfs_kernel_params(
&rootfs_driver,
&self.config.boot_info.rootfs_type,
)?);
kernel_params.append(&mut KernelParams::from_string(
&self.config.boot_info.kernel_params,
));
let mut parameters = String::new();
for param in &kernel_params.to_string() {
parameters.push_str(&param.to_string());
}
let kernel = self
.get_resource(&self.config.boot_info.kernel, FC_KERNEL)
.context("get resource KERNEL")?;
let rootfs = self
.get_resource(&self.config.boot_info.image, FC_ROOT_FS)
.context("get resource ROOTFS")?;
let body_kernel: String = json!({
"kernel_image_path": kernel,
"boot_args": parameters,
})
.to_string();
let body_rootfs: String = json!({
"drive_id": "rootfs",
"path_on_host": rootfs,
"is_root_device": false,
"is_read_only": true
})
.to_string();
info!(sl(), "Before first request");
self.request_with_retry(Method::PUT, "/boot-source", body_kernel)
.await?;
self.request_with_retry(Method::PUT, "/drives/rootfs", body_rootfs)
.await?;
let abs_path = [&self.vm_path, ROOT].join("/");
let rel_path = "/".to_string();
let _ = fs::create_dir_all(&abs_path)
.await
.context(format!("failed to create directory {:?}", &abs_path));
// We create some placeholder drives to be used for patching block devices while the vmm is
// running, as firecracker does not support device hotplug.
for i in 1..DISK_POOL_SIZE {
let full_path_name = format!("{}/drive{}", abs_path, i);
let _ = File::create(&full_path_name)
.await
.context(format!("failed to create file {:?}", &full_path_name));
let path_on_host = match self.jailed {
false => abs_path.clone(),
true => rel_path.clone(),
};
let body: String = json!({
"drive_id": format!("drive{}",i),
"path_on_host": format!("{}/drive{}", path_on_host, i),
"is_root_device": false,
"is_read_only": false
})
.to_string();
self.request_with_retry(Method::PUT, &format!("/drives/drive{}", i), body)
.await?;
}
Ok(())
}
pub(crate) async fn patch_container_rootfs(
&mut self,
drive_id: &str,
drive_path: &str,
) -> Result<()> {
let new_drive_id = &[DRIVE_PREFIX, drive_id].concat();
let new_drive_path = self
.get_resource(drive_path, new_drive_id)
.context("get resource CONTAINER ROOTFS")?;
let body: String = json!({
"drive_id": format!("drive{drive_id}"),
"path_on_host": new_drive_path
})
.to_string();
self.request_with_retry(
Method::PATCH,
&["/drives/", &format!("drive{drive_id}")].concat(),
body,
)
.await?;
Ok(())
}
pub(crate) async fn add_net_device(
&mut self,
config: &NetworkConfig,
device_id: String,
) -> Result<()> {
let g_mac = match &config.guest_mac {
Some(mac) => MacAddr::from_bytes(&mac.0).ok(),
None => None,
};
let body: String = json!({
"iface_id": &device_id,
"guest_mac": g_mac,
"host_dev_name": &config.host_dev_name
})
.to_string();
self.request_with_retry(
Method::PUT,
&["/network-interfaces/", &device_id].concat(),
body,
)
.await?;
Ok(())
}
pub(crate) async fn request_with_retry(
&self,
method: Method,
uri: &str,
data: String,
) -> Result<()> {
let url: hyper::Uri = Uri::new(&self.asock_path, uri).into();
self.send_request_with_retry(method, url, data).await
}
pub(crate) async fn send_request_with_retry(
&self,
method: Method,
uri: hyper::Uri,
data: String,
) -> Result<()> {
debug!(sl(), "METHOD: {:?}", method.clone());
debug!(sl(), "URI: {:?}", uri.clone());
debug!(sl(), "DATA: {:?}", data.clone());
for _count in 0..REQUEST_RETRY {
let req = Request::builder()
.method(method.clone())
.uri(uri.clone())
.header("Accept", "application/json")
.header("Content-Type", "application/json")
.body(Body::from(data.clone()))?;
match self.send_request(req).await {
Ok(resp) => {
debug!(sl(), "Request sent, resp: {:?}", resp);
return Ok(());
}
Err(resp) => {
debug!(sl(), "Request sent with error, resp: {:?}", resp);
std::thread::sleep(std::time::Duration::from_millis(10));
continue;
}
}
}
Err(anyhow::anyhow!(
"After {} attempts, it still doesn't work.",
REQUEST_RETRY
))
}
pub(crate) async fn send_request(&self, req: Request<Body>) -> Result<Response<Body>> {
let resp = self.client.request(req).await?;
let status = resp.status();
debug!(sl(), "Request RESPONSE {:?} {:?}", &status, resp);
if status.is_success() {
return Ok(resp);
}
let body = hyper::body::to_bytes(resp.into_body()).await?;
if body.is_empty() {
debug!(sl(), "Request FAILED WITH STATUS: {:?}", status);
} else {
let body = String::from_utf8_lossy(&body).into_owned();
debug!(
sl(),
"Request FAILED WITH STATUS: {:?} and BODY: {:?}", status, body
);
}
Err(anyhow::anyhow!("Request failed with status {:?}", status))
}
pub(crate) fn cleanup_resource(&self) {
if self.jailed {
self.umount_jail_resource(FC_KERNEL).ok();
self.umount_jail_resource(FC_ROOT_FS).ok();
for i in 1..DISK_POOL_SIZE {
self.umount_jail_resource(&[DRIVE_PREFIX, &i.to_string()].concat())
.ok();
}
self.umount_jail_resource("").ok();
}
std::fs::remove_dir_all(self.vm_path.as_str())
.map_err(|err| {
error!(
sl(),
"failed to remove dir all for {} with error: {:?}", &self.vm_path, &err
);
err
})
.ok();
}
pub(crate) fn umount_jail_resource(&self, jailed_path: &str) -> Result<()> {
let path = match jailed_path {
// Handle final case to umount the bind-mounted `/run/kata/firecracker/{id}/root` dir
"" => [self.vm_path.clone(), ROOT.to_string()].join("/"),
// Handle generic case to umount the bind-mounted
// `/run/kata/firecracker/{id}/root/asset` file/dir
_ => [
self.vm_path.clone(),
ROOT.to_string(),
jailed_path.to_string(),
]
.join("/"),
};
nix::mount::umount2(path.as_str(), nix::mount::MntFlags::MNT_DETACH)
.with_context(|| format!("umount path {}", &path))
}
}
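For reference, the placeholder-drive scheme used above (PUT an empty drive up front, then PATCH it once the container rootfs is known, because Firecracker has no device hotplug) reduces to two JSON payloads; a hedged sketch with illustrative drive IDs and paths:

use serde_json::json;

fn main() {
    // At prepare time: an empty placeholder drive, sent as PUT /drives/drive1.
    let placeholder = json!({
        "drive_id": "drive1",
        "path_on_host": "/run/kata/SANDBOX_ID/root/drive1",
        "is_root_device": false,
        "is_read_only": false
    });
    // When the container rootfs appears: repoint the same drive, sent as PATCH /drives/drive1.
    let patch = json!({
        "drive_id": "drive1",
        "path_on_host": "/dev/vg0/container-rootfs"
    });
    println!("{placeholder}\n{patch}");
}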


@ -0,0 +1,208 @@
// Copyright (c) 2019-2022 Alibaba Cloud
// Copyright (c) 2023 Nubificus Ltd
//
// SPDX-License-Identifier: Apache-2.0
use crate::firecracker::{inner_hypervisor::FC_API_SOCKET_NAME, sl};
use crate::HypervisorState;
use crate::MemoryConfig;
use crate::HYPERVISOR_FIRECRACKER;
use crate::{device::DeviceType, VmmState};
use anyhow::{Context, Result};
use async_trait::async_trait;
use hyper::Client;
use hyperlocal::{UnixClientExt, UnixConnector};
use kata_types::{
capabilities::{Capabilities, CapabilityBits},
config::hypervisor::Hypervisor as HypervisorConfig,
};
use nix::sched::{setns, CloneFlags};
use persist::sandbox_persist::Persist;
use std::os::unix::io::AsRawFd;
use tokio::process::Command;
unsafe impl Send for FcInner {}
unsafe impl Sync for FcInner {}
#[derive(Debug)]
pub struct FcInner {
pub(crate) id: String,
pub(crate) asock_path: String,
pub(crate) state: VmmState,
pub(crate) config: HypervisorConfig,
pub(crate) pid: Option<u32>,
pub(crate) vm_path: String,
pub(crate) netns: Option<String>,
pub(crate) client: Client<UnixConnector>,
pub(crate) jailer_root: String,
pub(crate) jailed: bool,
pub(crate) run_dir: String,
pub(crate) pending_devices: Vec<DeviceType>,
pub(crate) capabilities: Capabilities,
}
impl FcInner {
pub fn new() -> FcInner {
let mut capabilities = Capabilities::new();
capabilities.set(CapabilityBits::BlockDeviceSupport);
FcInner {
id: String::default(),
asock_path: String::default(),
state: VmmState::NotReady,
config: Default::default(),
pid: None,
netns: None,
vm_path: String::default(),
client: Client::unix(),
jailer_root: String::default(),
jailed: false,
run_dir: String::default(),
pending_devices: vec![],
capabilities,
}
}
pub(crate) async fn prepare_vmm(&mut self, netns: Option<String>) -> Result<()> {
let mut cmd: Command;
self.netns = netns.clone();
match self.jailed {
true => {
debug!(sl(), "Running Jailed");
cmd = Command::new(&self.config.jailer_path);
let api_socket = ["/run/", FC_API_SOCKET_NAME].join("/");
let args = [
"--id",
&self.id,
"--gid",
"0",
"--uid",
"0",
"--exec-file",
&self.config.path,
"--chroot-base-dir",
&self.jailer_root,
"--",
"--api-sock",
&api_socket,
];
cmd.args(args);
}
false => {
debug!(sl(), "Running non-Jailed");
cmd = Command::new(&self.config.path);
cmd.args(["--api-sock", &self.asock_path]);
}
}
debug!(sl(), "Exec: {:?}", cmd);
// Make sure we're in the correct Network Namespace
unsafe {
let _pre = cmd.pre_exec(move || {
if let Some(netns_path) = &netns {
debug!(sl(), "set netns for vmm master {:?}", &netns_path);
let netns_fd = std::fs::File::open(netns_path);
let _ = setns(netns_fd?.as_raw_fd(), CloneFlags::CLONE_NEWNET)
.context("set netns failed");
}
Ok(())
});
}
let mut child = cmd.spawn()?;
match child.id() {
Some(id) => {
let cur_tid = nix::unistd::gettid().as_raw() as u32;
info!(
sl(),
"VMM spawned successfully: PID: {:?}, current TID: {:?}", id, cur_tid
);
self.pid = Some(id);
}
None => {
let exit_status = child.wait().await?;
error!(sl(), "Process exited, status: {:?}", exit_status);
}
};
Ok(())
}
pub(crate) fn hypervisor_config(&self) -> HypervisorConfig {
debug!(sl(), "[Firecracker]: Hypervisor config");
self.config.clone()
}
pub(crate) fn set_hypervisor_config(&mut self, config: HypervisorConfig) {
debug!(sl(), "[Firecracker]: Set Hypervisor config");
self.config = config;
}
pub(crate) fn resize_memory(&mut self, new_mem_mb: u32) -> Result<(u32, MemoryConfig)> {
warn!(
sl(),
"memory size unchanged, requested: {:?} Not implemented", new_mem_mb
);
Ok((
0,
MemoryConfig {
..Default::default()
},
))
}
pub(crate) fn set_capabilities(&mut self, flag: CapabilityBits) {
self.capabilities.add(flag);
}
pub(crate) fn set_guest_memory_block_size(&mut self, size: u32) {
warn!(
sl(),
"guest memory block size unchanged, requested: {:?}, Not implemented", size
);
}
pub(crate) fn guest_memory_block_size_mb(&self) -> u32 {
warn!(sl(), "guest memory block size Not implemented");
0
}
}
#[async_trait]
impl Persist for FcInner {
type State = HypervisorState;
type ConstructorArgs = ();
async fn save(&self) -> Result<Self::State> {
Ok(HypervisorState {
hypervisor_type: HYPERVISOR_FIRECRACKER.to_string(),
id: self.id.clone(),
vm_path: self.vm_path.clone(),
config: self.hypervisor_config(),
jailed: self.jailed,
jailer_root: self.jailer_root.clone(),
run_dir: self.run_dir.clone(),
netns: self.netns.clone(),
..Default::default()
})
}
async fn restore(
_hypervisor_args: Self::ConstructorArgs,
hypervisor_state: Self::State,
) -> Result<Self> {
Ok(FcInner {
id: hypervisor_state.id,
asock_path: String::default(),
state: VmmState::NotReady,
vm_path: hypervisor_state.vm_path,
config: hypervisor_state.config,
netns: hypervisor_state.netns,
pid: None,
jailed: hypervisor_state.jailed,
jailer_root: hypervisor_state.jailer_root,
client: Client::unix(),
pending_devices: vec![],
run_dir: hypervisor_state.run_dir,
capabilities: Capabilities::new(),
})
}
}


@ -0,0 +1,102 @@
// Copyright (c) 2019-2022 Alibaba Cloud
// Copyright (c) 2019-2022 Ant Group
// Copyright (c) 2023 Nubificus Ltd
//
// SPDX-License-Identifier: Apache-2.0
use super::FcInner;
use crate::firecracker::{
inner_hypervisor::{FC_AGENT_SOCKET_NAME, ROOT},
sl,
};
use crate::VmmState;
use crate::{device::DeviceType, HybridVsockConfig, VsockConfig};
use anyhow::{anyhow, Context, Result};
use serde_json::json;
impl FcInner {
pub(crate) async fn add_device(&mut self, device: DeviceType) -> Result<()> {
if self.state == VmmState::NotReady {
info!(sl(), "VMM not ready, queueing device {}", device);
self.pending_devices.insert(0, device);
return Ok(());
}
debug!(sl(), "Add Device {} ", &device);
match device {
DeviceType::Block(block) => self
.hotplug_block_device(block.config.path_on_host.as_str(), block.config.index)
.await
.context("add block device"),
DeviceType::Network(network) => self
.add_net_device(&network.config, network.device_id)
.await
.context("add net device"),
DeviceType::HybridVsock(hvsock) => {
self.add_hvsock(&hvsock.config).await.context("add vsock")
}
DeviceType::Vsock(vsock) => self.add_vsock(&vsock.config).await.context("add vsock"),
_ => Err(anyhow!("unhandled device: {:?}", device)),
}
}
// Since Firecracker does not support filesystem sharing, container rootfs block devices
// are patched onto the placeholder drives that were inserted before the VMM started.
pub(crate) async fn hotplug_block_device(&mut self, path: &str, id: u64) -> Result<()> {
if id > 0 {
self.patch_container_rootfs(&id.to_string(), path).await?;
}
Ok(())
}
pub(crate) async fn remove_device(&mut self, device: DeviceType) -> Result<()> {
info!(sl(), "Remove Device {} ", device);
Ok(())
}
pub(crate) async fn update_device(&mut self, device: DeviceType) -> Result<()> {
info!(sl(), "update device {:?}", &device);
Ok(())
}
pub(crate) async fn add_hvsock(&mut self, config: &HybridVsockConfig) -> Result<()> {
let rel_uds_path = match self.jailed {
false => [self.vm_path.as_str(), FC_AGENT_SOCKET_NAME].join("/"),
true => FC_AGENT_SOCKET_NAME.to_string(),
};
let body_vsock: String = json!({
"vsock_id": String::from(ROOT),
"guest_cid": config.guest_cid,
"uds_path": rel_uds_path,
})
.to_string();
info!(sl(), "HybridVsock configure: {:?}", &body_vsock);
self.request_with_retry(hyper::Method::PUT, "/vsock", body_vsock)
.await?;
Ok(())
}
pub(crate) async fn add_vsock(&mut self, config: &VsockConfig) -> Result<()> {
let rel_uds_path = match self.jailed {
false => [self.vm_path.as_str(), FC_AGENT_SOCKET_NAME].join("/"),
true => FC_AGENT_SOCKET_NAME.to_string(),
};
let body_vsock: String = json!({
"vsock_id": String::from(ROOT),
"guest_cid": config.guest_cid,
"uds_path": rel_uds_path,
})
.to_string();
info!(sl(), "HybridVsock configure: {:?}", &body_vsock);
self.request_with_retry(hyper::Method::PUT, "/vsock", body_vsock)
.await?;
Ok(())
}
}


@ -0,0 +1,192 @@
// Copyright (c) 2019-2022 Alibaba Cloud
// Copyright (c) 2023 Nubificus Ltd
//
// SPDX-License-Identifier: Apache-2.0
use crate::firecracker::{sl, FcInner};
use crate::{VcpuThreadIds, VmmState, HYPERVISOR_FIRECRACKER};
use anyhow::{anyhow, Context, Result};
use kata_types::capabilities::Capabilities;
use kata_types::config::KATA_PATH;
use std::collections::HashSet;
use std::iter::FromIterator;
use tokio::fs;
pub const FC_API_SOCKET_NAME: &str = "fc.sock";
pub const FC_AGENT_SOCKET_NAME: &str = "kata.hvsock";
pub const ROOT: &str = "root";
const HYBRID_VSOCK_SCHEME: &str = "hvsock";
impl FcInner {
pub(crate) async fn prepare_vm(&mut self, id: &str, _netns: Option<String>) -> Result<()> {
debug!(sl(), "Preparing Firecracker");
self.id = id.to_string();
if !self.config.jailer_path.is_empty() {
debug!(sl(), "Running jailed");
self.jailed = true;
self.jailer_root = KATA_PATH.to_string();
debug!(sl(), "jailer_root: {:?}", self.jailer_root);
self.vm_path = [
self.jailer_root.clone(),
HYPERVISOR_FIRECRACKER.to_string(),
id.to_string(),
]
.join("/");
debug!(sl(), "VM Path: {:?}", self.vm_path);
self.run_dir = [self.vm_path.clone(), "root".to_string(), "run".to_string()].join("/");
debug!(sl(), "Rundir: {:?}", self.run_dir);
let _ = self.remount_jailer_with_exec().await;
} else {
self.vm_path = [KATA_PATH.to_string(), id.to_string()].join("/");
debug!(sl(), "VM Path: {:?}", self.vm_path);
self.run_dir = [self.vm_path.clone(), "run".to_string()].join("/");
debug!(sl(), "Rundir: {:?}", self.run_dir);
}
// We construct the FC API socket path based on the run_dir variable (jailed or
// non-jailed).
self.asock_path = [self.run_dir.as_str(), FC_API_SOCKET_NAME].join("/");
debug!(sl(), "Socket Path: {:?}", self.asock_path);
let _ = fs::create_dir_all(self.run_dir.as_str())
.await
.context(format!("failed to create directory {:?}", self.vm_path));
self.netns = _netns.clone();
self.prepare_vmm(self.netns.clone()).await?;
self.state = VmmState::VmmServerReady;
self.prepare_vmm_resources().await?;
self.prepare_hvsock().await?;
Ok(())
}
pub(crate) async fn start_vm(&mut self, _timeout: i32) -> Result<()> {
debug!(sl(), "Starting sandbox");
let body: String = serde_json::json!({
"action_type": "InstanceStart"
})
.to_string();
self.request_with_retry(hyper::Method::PUT, "/actions", body)
.await?;
self.state = VmmState::VmRunning;
Ok(())
}
pub(crate) async fn stop_vm(&mut self) -> Result<()> {
debug!(sl(), "Stopping sandbox");
if self.state != VmmState::VmRunning {
debug!(sl(), "VM not running!");
} else if let Some(pid_to_kill) = &self.pid {
let pid = ::nix::unistd::Pid::from_raw(*pid_to_kill as i32);
if let Err(err) = ::nix::sys::signal::kill(pid, nix::sys::signal::SIGKILL) {
if err != ::nix::Error::ESRCH {
debug!(sl(), "Failed to kill VMM with pid {} {:?}", pid, err);
}
}
}
Ok(())
}
pub(crate) fn pause_vm(&self) -> Result<()> {
warn!(sl(), "Pause VM: Not implemented");
Ok(())
}
pub(crate) async fn save_vm(&self) -> Result<()> {
warn!(sl(), "Save VM: Not implemented");
Ok(())
}
pub(crate) fn resume_vm(&self) -> Result<()> {
warn!(sl(), "Resume VM: Not implemented");
Ok(())
}
pub(crate) async fn get_agent_socket(&self) -> Result<String> {
debug!(sl(), "Get kata-agent socket");
let vsock_path = match self.jailed {
false => [self.vm_path.as_str(), FC_AGENT_SOCKET_NAME].join("/"),
true => [self.vm_path.as_str(), ROOT, FC_AGENT_SOCKET_NAME].join("/"),
};
Ok(format!("{}://{}", HYBRID_VSOCK_SCHEME, vsock_path))
}
pub(crate) async fn disconnect(&mut self) {
warn!(sl(), "Disconnect: Not implemented");
}
pub(crate) async fn get_thread_ids(&self) -> Result<VcpuThreadIds> {
debug!(sl(), "Get Thread IDs");
Ok(VcpuThreadIds::default())
}
pub(crate) async fn get_pids(&self) -> Result<Vec<u32>> {
debug!(sl(), "Get PIDs");
let mut pids = HashSet::new();
// the VMM process is the only pid tracked for firecracker
pids.insert(self.pid.unwrap());
debug!(sl(), "PIDs: {:?}", pids);
Ok(Vec::from_iter(pids.into_iter()))
}
pub(crate) async fn get_vmm_master_tid(&self) -> Result<u32> {
debug!(sl(), "Get VMM master TID");
if let Some(pid) = self.pid {
Ok(pid)
} else {
Err(anyhow!("could not get vmm master tid"))
}
}
pub(crate) async fn get_ns_path(&self) -> Result<String> {
debug!(sl(), "Get NS path");
if let Some(pid) = self.pid {
let ns_path = format!("/proc/{}/ns", pid);
Ok(ns_path)
} else {
Err(anyhow!("could not get ns path"))
}
}
pub(crate) async fn cleanup(&self) -> Result<()> {
debug!(sl(), "Cleanup");
self.cleanup_resource();
std::fs::remove_dir_all(self.vm_path.as_str())
.map_err(|err| {
error!(
sl(),
"failed to remove dir all for {} with error: {:?}", &self.vm_path, &err
);
err
})
.ok();
Ok(())
}
pub(crate) async fn resize_vcpu(&self, old_vcpu: u32, new_vcpu: u32) -> Result<(u32, u32)> {
warn!(sl(), "Resize vCPU: Not implemented");
Ok((old_vcpu, new_vcpu))
}
pub(crate) async fn check(&self) -> Result<()> {
warn!(sl(), "Check: Not implemented");
Ok(())
}
pub(crate) async fn get_jailer_root(&self) -> Result<String> {
debug!(sl(), "Get Jailer Root");
Ok(self.jailer_root.clone())
}
pub(crate) async fn capabilities(&self) -> Result<Capabilities> {
debug!(sl(), "Capabilities");
Ok(self.capabilities.clone())
}
pub(crate) async fn get_hypervisor_metrics(&self) -> Result<String> {
warn!(sl(), "Get Hypervisor Metrics: Not implemented");
todo!()
}
}


@ -0,0 +1,215 @@
// Copyright (c) 2019-2022 Alibaba Cloud
// Copyright (c) 2023 Nubificus Ltd
//
// SPDX-License-Identifier: Apache-2.0
mod fc_api;
mod inner;
mod inner_device;
mod inner_hypervisor;
use super::HypervisorState;
use crate::MemoryConfig;
use crate::{device::DeviceType, Hypervisor, HypervisorConfig, VcpuThreadIds};
use anyhow::Context;
use anyhow::Result;
use async_trait::async_trait;
use inner::FcInner;
use kata_types::capabilities::Capabilities;
use kata_types::capabilities::CapabilityBits;
use persist::sandbox_persist::Persist;
use std::sync::Arc;
use tokio::sync::RwLock;
#[derive(Debug)]
pub struct Firecracker {
inner: Arc<RwLock<FcInner>>,
}
// Convenience function to set the scope.
pub fn sl() -> slog::Logger {
slog_scope::logger().new(o!("subsystem" => "firecracker"))
}
impl Default for Firecracker {
fn default() -> Self {
Self::new()
}
}
impl Firecracker {
pub fn new() -> Self {
Self {
inner: Arc::new(RwLock::new(FcInner::new())),
}
}
pub async fn set_hypervisor_config(&mut self, config: HypervisorConfig) {
let mut inner = self.inner.write().await;
inner.set_hypervisor_config(config)
}
}
#[async_trait]
impl Hypervisor for Firecracker {
async fn prepare_vm(&self, id: &str, netns: Option<String>) -> Result<()> {
let mut inner = self.inner.write().await;
inner.prepare_vm(id, netns).await
}
async fn start_vm(&self, timeout: i32) -> Result<()> {
let mut inner = self.inner.write().await;
inner.start_vm(timeout).await
}
async fn stop_vm(&self) -> Result<()> {
let mut inner = self.inner.write().await;
inner.stop_vm().await
}
async fn pause_vm(&self) -> Result<()> {
let inner = self.inner.read().await;
inner.pause_vm()
}
async fn resume_vm(&self) -> Result<()> {
let inner = self.inner.read().await;
inner.resume_vm()
}
async fn save_vm(&self) -> Result<()> {
let inner = self.inner.read().await;
inner.save_vm().await
}
async fn add_device(&self, device: DeviceType) -> Result<DeviceType> {
let mut inner = self.inner.write().await;
match inner.add_device(device.clone()).await {
Ok(_) => Ok(device),
Err(err) => Err(err),
}
}
async fn remove_device(&self, device: DeviceType) -> Result<()> {
let mut inner = self.inner.write().await;
inner.remove_device(device).await
}
async fn update_device(&self, device: DeviceType) -> Result<()> {
let mut inner = self.inner.write().await;
inner.update_device(device).await
}
async fn get_agent_socket(&self) -> Result<String> {
let inner = self.inner.read().await;
inner.get_agent_socket().await
}
async fn disconnect(&self) {
let mut inner = self.inner.write().await;
inner.disconnect().await
}
async fn hypervisor_config(&self) -> HypervisorConfig {
let inner = self.inner.read().await;
inner.hypervisor_config()
}
async fn get_thread_ids(&self) -> Result<VcpuThreadIds> {
let inner = self.inner.read().await;
inner.get_thread_ids().await
}
async fn cleanup(&self) -> Result<()> {
let inner = self.inner.read().await;
inner.cleanup().await
}
async fn resize_vcpu(&self, old_vcpu: u32, new_vcpu: u32) -> Result<(u32, u32)> {
let inner = self.inner.read().await;
inner.resize_vcpu(old_vcpu, new_vcpu).await
}
async fn get_pids(&self) -> Result<Vec<u32>> {
let inner = self.inner.read().await;
inner.get_pids().await
}
async fn get_vmm_master_tid(&self) -> Result<u32> {
let inner = self.inner.read().await;
inner.get_vmm_master_tid().await
}
async fn get_ns_path(&self) -> Result<String> {
let inner = self.inner.read().await;
inner.get_ns_path().await
}
async fn check(&self) -> Result<()> {
let inner = self.inner.read().await;
inner.check().await
}
async fn get_jailer_root(&self) -> Result<String> {
let inner = self.inner.read().await;
inner.get_jailer_root().await
}
async fn save_state(&self) -> Result<HypervisorState> {
self.save().await
}
async fn capabilities(&self) -> Result<Capabilities> {
let inner = self.inner.read().await;
inner.capabilities().await
}
async fn get_hypervisor_metrics(&self) -> Result<String> {
let inner = self.inner.read().await;
inner.get_hypervisor_metrics().await
}
async fn set_capabilities(&self, flag: CapabilityBits) {
let mut inner = self.inner.write().await;
inner.set_capabilities(flag)
}
async fn set_guest_memory_block_size(&self, size: u32) {
let mut inner = self.inner.write().await;
inner.set_guest_memory_block_size(size);
}
async fn guest_memory_block_size(&self) -> u32 {
let inner = self.inner.read().await;
inner.guest_memory_block_size_mb()
}
async fn resize_memory(&self, new_mem_mb: u32) -> Result<(u32, MemoryConfig)> {
let mut inner = self.inner.write().await;
inner.resize_memory(new_mem_mb)
}
async fn get_passfd_listener_addr(&self) -> Result<(String, u32)> {
Err(anyhow::anyhow!("Not yet supported"))
}
}
#[async_trait]
impl Persist for Firecracker {
type State = HypervisorState;
type ConstructorArgs = ();
/// Save a state of the component.
async fn save(&self) -> Result<Self::State> {
let inner = self.inner.read().await;
inner.save().await.context("save hypervisor state")
}
/// Restore a component from a specified state.
async fn restore(
hypervisor_args: Self::ConstructorArgs,
hypervisor_state: Self::State,
) -> Result<Self> {
let inner = FcInner::restore(hypervisor_args, hypervisor_state).await?;
Ok(Self {
inner: Arc::new(RwLock::new(inner)),
})
}
}


@ -118,6 +118,11 @@ impl KernelParams {
self.params.append(&mut params.params);
}
#[cfg(not(target_arch = "s390x"))]
pub(crate) fn push(&mut self, new_param: Param) {
self.params.push(new_param);
}
pub(crate) fn from_string(params_string: &str) -> Self {
let mut params = vec![];


@ -15,6 +15,8 @@ pub use device::driver::*;
use device::DeviceType;
#[cfg(not(target_arch = "s390x"))]
pub mod dragonball;
#[cfg(not(target_arch = "s390x"))]
pub mod firecracker;
mod kernel_param;
pub mod qemu;
pub use kernel_param::Param;
@ -61,6 +63,7 @@ const HUGE_SHMEM: &str = "hugeshmem";
pub const HYPERVISOR_DRAGONBALL: &str = "dragonball";
pub const HYPERVISOR_QEMU: &str = "qemu";
pub const HYPERVISOR_FIRECRACKER: &str = "firecracker";
pub const DEFAULT_HYBRID_VSOCK_NAME: &str = "kata.hvsock";
pub const JAILER_ROOT: &str = "root";


@ -14,9 +14,14 @@ pub const PERSIST_FILE: &str = "state.json";
use kata_sys_util::validate::verify_id;
use safe_path::scoped_join;
pub fn to_disk<T: serde::Serialize>(value: &T, sid: &str) -> Result<()> {
pub fn to_disk<T: serde::Serialize>(value: &T, sid: &str, jailer_path: &str) -> Result<()> {
verify_id(sid).context("failed to verify sid")?;
let mut path = scoped_join(KATA_PATH, sid)?;
// FIXME: handle jailed case
let mut path = match jailer_path {
"" => scoped_join(KATA_PATH, sid)?,
_ => scoped_join(jailer_path, "root")?,
};
//let mut path = scoped_join(KATA_PATH, sid)?;
if path.exists() {
path.push(PERSIST_FILE);
let f = File::create(path)
@ -62,10 +67,10 @@ mod tests {
key: 1,
};
// invalid sid
assert!(to_disk(&data, "..3").is_err());
assert!(to_disk(&data, "../../../3").is_err());
assert!(to_disk(&data, "a/b/c").is_err());
assert!(to_disk(&data, ".#cdscd.").is_err());
assert!(to_disk(&data, "..3", "").is_err());
assert!(to_disk(&data, "../../../3", "").is_err());
assert!(to_disk(&data, "a/b/c", "").is_err());
assert!(to_disk(&data, ".#cdscd.", "").is_err());
let sid = "aadede";
let sandbox_dir = [KATA_PATH, sid].join("/");
@ -74,7 +79,7 @@ mod tests {
.create(&sandbox_dir)
.is_ok()
{
assert!(to_disk(&data, sid).is_ok());
assert!(to_disk(&data, sid, "").is_ok());
if let Ok(result) = from_disk::<Kata>(sid) {
assert_eq!(result.name, data.name);
assert_eq!(result.key, data.key);


@ -23,9 +23,13 @@ use common::{message::Message, RuntimeHandler, RuntimeInstance};
use hypervisor::Hypervisor;
#[cfg(not(target_arch = "s390x"))]
use hypervisor::{dragonball::Dragonball, HYPERVISOR_DRAGONBALL};
#[cfg(not(target_arch = "s390x"))]
use hypervisor::{firecracker::Firecracker, HYPERVISOR_FIRECRACKER};
use hypervisor::{qemu::Qemu, HYPERVISOR_QEMU};
#[cfg(not(target_arch = "s390x"))]
use kata_types::config::DragonballConfig;
#[cfg(not(target_arch = "s390x"))]
use kata_types::config::FirecrackerConfig;
use kata_types::config::{hypervisor::register_hypervisor_plugin, QemuConfig, TomlConfig};
#[cfg(all(feature = "cloud-hypervisor", not(target_arch = "s390x")))]
@ -55,6 +59,9 @@ impl RuntimeHandler for VirtContainer {
{
let dragonball_config = Arc::new(DragonballConfig::new());
register_hypervisor_plugin("dragonball", dragonball_config);
let firecracker_config = Arc::new(FirecrackerConfig::new());
register_hypervisor_plugin("firecracker", firecracker_config);
}
let qemu_config = Arc::new(QemuConfig::new());
@ -160,6 +167,14 @@ async fn new_hypervisor(toml_config: &TomlConfig) -> Result<Arc<dyn Hypervisor>>
.await;
Ok(Arc::new(hypervisor))
}
#[cfg(not(target_arch = "s390x"))]
HYPERVISOR_FIRECRACKER => {
let mut hypervisor = Firecracker::new();
hypervisor
.set_hypervisor_config(hypervisor_config.clone())
.await;
Ok(Arc::new(hypervisor))
}
#[cfg(all(feature = "cloud-hypervisor", not(target_arch = "s390x")))]
HYPERVISOR_NAME_CH => {


@ -18,12 +18,14 @@ use common::{Sandbox, SandboxNetworkEnv};
use containerd_shim_protos::events::task::TaskOOM;
use hypervisor::VsockConfig;
#[cfg(not(target_arch = "s390x"))]
use hypervisor::{dragonball::Dragonball, HYPERVISOR_DRAGONBALL};
use hypervisor::{dragonball::Dragonball, HYPERVISOR_DRAGONBALL, HYPERVISOR_FIRECRACKER};
use hypervisor::{qemu::Qemu, HYPERVISOR_QEMU};
use hypervisor::{utils::get_hvsock_path, HybridVsockConfig, DEFAULT_GUEST_VSOCK_CID};
use hypervisor::{BlockConfig, Hypervisor};
use kata_sys_util::hooks::HookStates;
use kata_types::capabilities::CapabilityBits;
#[cfg(not(target_arch = "s390x"))]
use kata_types::config::hypervisor::HYPERVISOR_NAME_CH;
use kata_types::config::TomlConfig;
use persist::{self, sandbox_persist::Persist};
use resource::manager::ManagerArgs;
@ -570,12 +572,39 @@ impl Persist for VirtSandbox {
/// Save a state of Sandbox
async fn save(&self) -> Result<Self::State> {
let hypervisor_state = self.hypervisor.save_state().await?;
let sandbox_state = crate::sandbox_persist::SandboxState {
sandbox_type: VIRTCONTAINER.to_string(),
resource: Some(self.resource_manager.save().await?),
hypervisor: Some(self.hypervisor.save_state().await?),
hypervisor: match hypervisor_state.hypervisor_type.as_str() {
// TODO support other hypervisors
#[cfg(not(target_arch = "s390x"))]
HYPERVISOR_DRAGONBALL => Ok(Some(hypervisor_state)),
#[cfg(not(target_arch = "s390x"))]
HYPERVISOR_NAME_CH => Ok(Some(hypervisor_state)),
#[cfg(not(target_arch = "s390x"))]
HYPERVISOR_FIRECRACKER => Ok(Some(hypervisor_state)),
HYPERVISOR_QEMU => Ok(Some(hypervisor_state)),
_ => Err(anyhow!(
"Unsupported hypervisor {}",
hypervisor_state.hypervisor_type
)),
}?,
};
persist::to_disk(&sandbox_state, &self.sid)?;
// FIXME: properly handle jailed case
// eg: Determine if we are running jailed:
// let h = sandbox_state.hypervisor.clone().unwrap_or_default();
// Figure out the jailed path:
// jailed_path = h.<>
// and somehow store the sandbox state into the jail:
// persist::to_disk(&sandbox_state, &self.sid, jailed_path)?;
// Issue is, how to handle restore.
let h = sandbox_state.hypervisor.as_ref().unwrap();
let vmpath = match h.jailed {
true => h.vm_path.clone(),
false => "".to_string(),
};
persist::to_disk(&sandbox_state, &self.sid, vmpath.as_str())?;
Ok(sandbox_state)
}
/// Restore Sandbox