From 2d19f3fbd7bf41433c5d98d3bf43be660b5320c5 Mon Sep 17 00:00:00 2001 From: George Pyrros Date: Tue, 26 Sep 2023 19:16:20 +0000 Subject: [PATCH] runtime-rs: firecracker hypervisor backend Add a basic runtime-rs `Hypervisor` trait implementation for AWS Firecracker - Add basic hypervisor operations (setup / start / stop / add_device) - Implement AWS Firecracker API on a separate file `fc_api.rs` - Add support for running jailed (include all sandbox-related content) - Add initial device support (limited as hotplug is not supported) - Add separate config for runtime-rs (FC) Notes: - devmapper is the only snapshotter supported - to account for no sharefs support, we copy files in the sandbox (as in the GO runtime) - nerdctl spawn is broken (TODO: #7703) Fixes: #5268 Signed-off-by: George Pyrros Signed-off-by: Anastassios Nanos Signed-off-by: Charalampos Mainas Signed-off-by: George Ntoutsos --- src/libs/kata-types/src/config/default.rs | 10 + .../src/config/hypervisor/firecracker.rs | 116 ++++++ .../kata-types/src/config/hypervisor/mod.rs | 4 + src/libs/kata-types/src/config/mod.rs | 4 +- src/libs/shim-interface/src/lib.rs | 3 + src/runtime-rs/Cargo.lock | 2 + src/runtime-rs/Makefile | 60 ++- src/runtime-rs/arch/aarch64-options.mk | 2 + src/runtime-rs/arch/x86_64-options.mk | 4 + .../config/configuration-dragonball.toml.in | 2 +- .../config/configuration-rs-fc.toml.in | 373 ++++++++++++++++++ src/runtime-rs/crates/hypervisor/Cargo.toml | 3 + .../hypervisor/src/firecracker/fc_api.rs | 324 +++++++++++++++ .../hypervisor/src/firecracker/inner.rs | 208 ++++++++++ .../src/firecracker/inner_device.rs | 102 +++++ .../src/firecracker/inner_hypervisor.rs | 192 +++++++++ .../crates/hypervisor/src/firecracker/mod.rs | 215 ++++++++++ .../crates/hypervisor/src/kernel_param.rs | 5 + src/runtime-rs/crates/hypervisor/src/lib.rs | 3 + src/runtime-rs/crates/persist/src/lib.rs | 19 +- .../crates/runtimes/virt_container/src/lib.rs | 15 + .../runtimes/virt_container/src/sandbox.rs | 35 +- 
22 files changed, 1687 insertions(+), 14 deletions(-) create mode 100644 src/libs/kata-types/src/config/hypervisor/firecracker.rs create mode 100644 src/runtime-rs/config/configuration-rs-fc.toml.in create mode 100644 src/runtime-rs/crates/hypervisor/src/firecracker/fc_api.rs create mode 100644 src/runtime-rs/crates/hypervisor/src/firecracker/inner.rs create mode 100644 src/runtime-rs/crates/hypervisor/src/firecracker/inner_device.rs create mode 100644 src/runtime-rs/crates/hypervisor/src/firecracker/inner_hypervisor.rs create mode 100644 src/runtime-rs/crates/hypervisor/src/firecracker/mod.rs diff --git a/src/libs/kata-types/src/config/default.rs b/src/libs/kata-types/src/config/default.rs index d269ea1abf..5dc47dd3ca 100644 --- a/src/libs/kata-types/src/config/default.rs +++ b/src/libs/kata-types/src/config/default.rs @@ -88,3 +88,13 @@ pub const DEFAULT_CH_PCI_BRIDGES: u32 = 2; pub const MAX_CH_PCI_BRIDGES: u32 = 5; pub const MAX_CH_VCPUS: u32 = 256; pub const MIN_CH_MEMORY_SIZE_MB: u32 = 64; + +//Default configuration for firecracker +pub const DEFAULT_FIRECRACKER_ENTROPY_SOURCE: &str = "/dev/urandom"; +pub const DEFAULT_FIRECRACKER_MEMORY_SIZE_MB: u32 = 128; +pub const DEFAULT_FIRECRACKER_MEMORY_SLOTS: u32 = 128; +pub const DEFAULT_FIRECRACKER_VCPUS: u32 = 1; +pub const DEFAULT_FIRECRACKER_GUEST_KERNEL_IMAGE: &str = "vmlinux"; +pub const DEFAULT_FIRECRACKER_GUEST_KERNEL_PARAMS: &str = ""; +pub const MAX_FIRECRACKER_VCPUS: u32 = 32; +pub const MIN_FIRECRACKER_MEMORY_SIZE_MB: u32 = 128; diff --git a/src/libs/kata-types/src/config/hypervisor/firecracker.rs b/src/libs/kata-types/src/config/hypervisor/firecracker.rs new file mode 100644 index 0000000000..3c348c89c5 --- /dev/null +++ b/src/libs/kata-types/src/config/hypervisor/firecracker.rs @@ -0,0 +1,116 @@ +// Copyright (c) 2019-2021 Alibaba Cloud +// Copyright (c) 2022-2023 Nubificus LTD +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::io::Result; +use std::path::Path; +use std::sync::Arc; + +use 
super::{default, register_hypervisor_plugin}; + +use crate::config::default::MAX_FIRECRACKER_VCPUS; +use crate::config::default::MIN_FIRECRACKER_MEMORY_SIZE_MB; + +use crate::config::{ConfigPlugin, TomlConfig}; +use crate::{eother, validate_path}; + +/// Hypervisor name for firecracker, used to index `TomlConfig::hypervisor`. +pub const HYPERVISOR_NAME_FIRECRACKER: &str = "firecracker"; + +/// Configuration information for firecracker. +#[derive(Default, Debug)] +pub struct FirecrackerConfig {} + +impl FirecrackerConfig { + /// Create a new instance of `FirecrackerConfig`. + pub fn new() -> Self { + FirecrackerConfig {} + } + + /// Register the firecracker plugin. + pub fn register(self) { + let plugin = Arc::new(self); + register_hypervisor_plugin(HYPERVISOR_NAME_FIRECRACKER, plugin); + } +} + +impl ConfigPlugin for FirecrackerConfig { + fn get_max_cpus(&self) -> u32 { + MAX_FIRECRACKER_VCPUS + } + + fn get_min_memory(&self) -> u32 { + MIN_FIRECRACKER_MEMORY_SIZE_MB + } + + fn name(&self) -> &str { + HYPERVISOR_NAME_FIRECRACKER + } + + /// Adjust the configuration information after loading from configuration file. + fn adjust_config(&self, conf: &mut TomlConfig) -> Result<()> { + if let Some(firecracker) = conf.hypervisor.get_mut(HYPERVISOR_NAME_FIRECRACKER) { + if firecracker.boot_info.kernel.is_empty() { + firecracker.boot_info.kernel = + default::DEFAULT_FIRECRACKER_GUEST_KERNEL_IMAGE.to_string(); + } + if firecracker.boot_info.kernel_params.is_empty() { + firecracker.boot_info.kernel_params = + default::DEFAULT_FIRECRACKER_GUEST_KERNEL_PARAMS.to_string(); + } + if firecracker.machine_info.entropy_source.is_empty() { + firecracker.machine_info.entropy_source = + default::DEFAULT_FIRECRACKER_ENTROPY_SOURCE.to_string(); + } + + if firecracker.memory_info.default_memory == 0 { + firecracker.memory_info.default_memory = + default::DEFAULT_FIRECRACKER_MEMORY_SIZE_MB; + } + } + + Ok(()) + } + + /// Validate the configuration information. 
+ fn validate(&self, conf: &TomlConfig) -> Result<()> { + if let Some(firecracker) = conf.hypervisor.get(HYPERVISOR_NAME_FIRECRACKER) { + if firecracker.path.is_empty() { + return Err(eother!("Firecracker path is empty")); + } + validate_path!( + firecracker.path, + "FIRECRACKER binary path `{}` is invalid: {}" + )?; + if firecracker.boot_info.kernel.is_empty() { + return Err(eother!("Guest kernel image for firecracker is empty")); + } + if firecracker.boot_info.image.is_empty() { + return Err(eother!( + "Both guest boot image and initrd for firecracker are empty" + )); + } + + if (firecracker.cpu_info.default_vcpus > 0 + && firecracker.cpu_info.default_vcpus as u32 > default::MAX_FIRECRACKER_VCPUS) + || firecracker.cpu_info.default_maxvcpus > default::MAX_FIRECRACKER_VCPUS + { + return Err(eother!( + "Firecracker hypervisor can not support {} vCPUs", + firecracker.cpu_info.default_maxvcpus + )); + } + + if firecracker.memory_info.default_memory < MIN_FIRECRACKER_MEMORY_SIZE_MB { + return Err(eother!( + "Firecracker hypervisor has minimal memory limitation {}", + MIN_FIRECRACKER_MEMORY_SIZE_MB + )); + } + } + + Ok(()) + } +} diff --git a/src/libs/kata-types/src/config/hypervisor/mod.rs b/src/libs/kata-types/src/config/hypervisor/mod.rs index f73969d850..f17cc87239 100644 --- a/src/libs/kata-types/src/config/hypervisor/mod.rs +++ b/src/libs/kata-types/src/config/hypervisor/mod.rs @@ -59,6 +59,9 @@ pub const VIRTIO_SCSI: &str = "virtio-scsi"; /// Virtual PMEM device driver. 
pub const VIRTIO_PMEM: &str = "virtio-pmem"; +mod firecracker; +pub use self::firecracker::{FirecrackerConfig, HYPERVISOR_NAME_FIRECRACKER}; + const VIRTIO_9P: &str = "virtio-9p"; const VIRTIO_FS: &str = "virtio-fs"; const VIRTIO_FS_INLINE: &str = "inline-virtio-fs"; @@ -530,6 +533,7 @@ impl TopologyConfigInfo { HYPERVISOR_NAME_QEMU, HYPERVISOR_NAME_CH, HYPERVISOR_NAME_DRAGONBALL, + HYPERVISOR_NAME_FIRECRACKER, ]; let hypervisor_name = toml_config.runtime.hypervisor_name.as_str(); if !hypervisor_names.contains(&hypervisor_name) { diff --git a/src/libs/kata-types/src/config/mod.rs b/src/libs/kata-types/src/config/mod.rs index beb93e697c..a5c32fefe9 100644 --- a/src/libs/kata-types/src/config/mod.rs +++ b/src/libs/kata-types/src/config/mod.rs @@ -25,8 +25,8 @@ pub mod hypervisor; pub use self::agent::Agent; use self::default::DEFAULT_AGENT_DBG_CONSOLE_PORT; pub use self::hypervisor::{ - BootInfo, CloudHypervisorConfig, DragonballConfig, Hypervisor, QemuConfig, - HYPERVISOR_NAME_DRAGONBALL, HYPERVISOR_NAME_QEMU, + BootInfo, CloudHypervisorConfig, DragonballConfig, FirecrackerConfig, Hypervisor, QemuConfig, + HYPERVISOR_NAME_DRAGONBALL, HYPERVISOR_NAME_FIRECRACKER, HYPERVISOR_NAME_QEMU, }; mod runtime; diff --git a/src/libs/shim-interface/src/lib.rs b/src/libs/shim-interface/src/lib.rs index 706734d73e..8a401c1aec 100644 --- a/src/libs/shim-interface/src/lib.rs +++ b/src/libs/shim-interface/src/lib.rs @@ -37,6 +37,9 @@ fn get_uds_with_sid(short_id: &str, path: &str) -> Result { return Ok(format!("unix://{}", p.display())); } + let _ = fs::create_dir_all(kata_run_path.join(short_id)) + .context(format!("failed to create directory {:?}", kata_run_path.join(short_id))); + let target_ids: Vec = fs::read_dir(&kata_run_path)? 
.filter_map(|e| { let x = e.ok()?.file_name().to_string_lossy().into_owned(); diff --git a/src/runtime-rs/Cargo.lock b/src/runtime-rs/Cargo.lock index 129a3a5d2c..98e668320b 100644 --- a/src/runtime-rs/Cargo.lock +++ b/src/runtime-rs/Cargo.lock @@ -1635,6 +1635,8 @@ dependencies = [ "dragonball", "futures 0.3.28", "go-flag", + "hyper", + "hyperlocal", "hypervisor", "kata-sys-util", "kata-types", diff --git a/src/runtime-rs/Makefile b/src/runtime-rs/Makefile index 0451000e2c..d395fc3692 100644 --- a/src/runtime-rs/Makefile +++ b/src/runtime-rs/Makefile @@ -109,6 +109,12 @@ ROOTFSTYPE_XFS := \"xfs\" ROOTFSTYPE_EROFS := \"erofs\" DEFROOTFSTYPE := $(ROOTFSTYPE_EXT4) +FCBINDIR := $(PREFIXDEPS)/bin +FCPATH = $(FCBINDIR)/$(FCCMD) +FCVALIDHYPERVISORPATHS := [\"$(FCPATH)\"] +FCJAILERPATH = $(FCBINDIR)/$(FCJAILERCMD) +FCVALIDJAILERPATHS = [\"$(FCJAILERPATH)\"] + PKGLIBEXECDIR := $(LIBEXECDIR)/$(PROJECT_DIR) FIRMWAREPATH := FIRMWAREVOLUMEPATH := @@ -164,8 +170,11 @@ DEFMSIZE9P := 8192 DEFVFIOMODE := guest-kernel ##VAR DEFSANDBOXCGROUPONLY= Default cgroup model DEFSANDBOXCGROUPONLY ?= false +DEFSANDBOXCGROUPONLY_DB ?= true +DEFSANDBOXCGROUPONLY_FC ?= true DEFSTATICRESOURCEMGMT ?= false DEFSTATICRESOURCEMGMT_DB ?= false +DEFSTATICRESOURCEMGMT_FC ?= true DEFBINDMOUNTS := [] DEFDANCONF := /run/kata-containers/dans SED = sed @@ -216,7 +225,7 @@ ifneq (,$(DBCMD)) KERNELTYPE_DB = uncompressed KERNEL_NAME_DB = $(call MAKE_KERNEL_NAME_DB,$(KERNELTYPE_DB)) KERNELPATH_DB = $(KERNELDIR)/$(KERNEL_NAME_DB) - DEFSANDBOXCGROUPONLY = true + DEFSANDBOXCGROUPONLY_DB = true RUNTIMENAME := virt_container PIPESIZE := 1 DBSHAREDFS := inline-virtio-fs @@ -244,6 +253,9 @@ ifneq (,$(CLHCMD)) KERNEL_NAME_CLH = $(call MAKE_KERNEL_NAME,$(KERNELTYPE_CLH)) KERNELPATH_CLH = $(KERNELDIR)/$(KERNEL_NAME_CLH) VMROOTFSDRIVER_CLH := virtio-pmem + + DEFSTATICRESOURCEMGMT = true + DEFSANDBOXCGROUPONLY = true endif ifneq (,$(QEMUCMD)) @@ -288,6 +300,28 @@ endif DEFSECCOMPSANDBOXPARAM := 
on,obsolete=deny,spawn=deny,resourcecontrol=deny DEFGUESTSELINUXLABEL := system_u:system_r:container_t endif +ifneq (,$(FCCMD)) + KNOWN_HYPERVISORS += $(HYPERVISOR_FC) + CONFIG_FILE_FC = configuration-rs-fc.toml + CONFIG_FC = config/$(CONFIG_FILE_FC) + CONFIG_FC_IN = $(CONFIG_FC).in + CONFIG_PATH_FC = $(abspath $(CONFDIR)/$(CONFIG_FILE_FC)) + CONFIG_PATHS += $(CONFIG_PATH_FC) + SYSCONFIG_FC = $(abspath $(SYSCONFDIR)/$(CONFIG_FILE_FC)) + SYSCONFIG_PATHS += $(SYSCONFIG_FC) + CONFIGS += $(CONFIG_FC) + # firecracker-specific options (all should be suffixed by "_FC") + DEFBLOCKSTORAGEDRIVER_FC := virtio-blk-mmio + DEFMAXMEMSZ_FC := 2048 + DEFNETWORKMODEL_FC := tcfilter + KERNELPARAMS = console=ttyS0 agent.log_vport=1025 + KERNELTYPE_FC = uncompressed + KERNEL_NAME_FC = $(call MAKE_KERNEL_NAME_FC,$(KERNELTYPE_FC)) + KERNELPATH_FC = $(KERNELDIR)/$(KERNEL_NAME_FC) + DEFSANDBOXCGROUPONLY_FC = true + RUNTIMENAME := virt_container + DEFSTATICRESOURCEMGMT_FC ?= true +endif ifeq ($(DEFAULT_HYPERVISOR),$(HYPERVISOR_DB)) DEFAULT_HYPERVISOR_CONFIG = $(CONFIG_FILE_DB) @@ -296,16 +330,21 @@ endif ifeq ($(DEFAULT_HYPERVISOR),$(HYPERVISOR_QEMU)) DEFAULT_HYPERVISOR_CONFIG = $(CONFIG_FILE_QEMU) endif +ifeq ($(DEFAULT_HYPERVISOR),$(HYPERVISOR_FC)) + DEFAULT_HYPERVISOR_CONFIG = $(CONFIG_FILE_FC) +endif # list of variables the user may wish to override USER_VARS += ARCH USER_VARS += BINDIR USER_VARS += CONFIG_DB_IN +USER_VARS += CONFIG_FC_IN USER_VARS += CONFIG_PATH USER_VARS += CONFIG_QEMU_IN USER_VARS += DESTDIR USER_VARS += DEFAULT_HYPERVISOR USER_VARS += DBCMD USER_VARS += DBCTLCMD +USER_VARS += FCCTLCMD USER_VARS += DBPATH USER_VARS += DBVALIDHYPERVISORPATHS USER_VARS += DBCTLPATH @@ -316,6 +355,13 @@ USER_VARS += QEMUPATH USER_VARS += QEMUVALIDHYPERVISORPATHS USER_VARS += FIRMWAREPATH_CLH USER_VARS += KERNELPATH_CLH +USER_VARS += FCCMD +USER_VARS += FCPATH +USER_VARS += FCVALIDHYPERVISORPATHS +USER_VARS += FCJAILERPATH +USER_VARS += FCVALIDJAILERPATHS +USER_VARS += FCVALIDJAILERPATHS 
+USER_VARS += DEFMAXMEMSZ_FC USER_VARS += SYSCONFIG USER_VARS += IMAGENAME USER_VARS += IMAGEPATH @@ -329,6 +375,8 @@ USER_VARS += KERNELDIR USER_VARS += KERNELTYPE USER_VARS += KERNELPATH_DB USER_VARS += KERNELPATH_QEMU +USER_VARS += KERNELPATH_FC +USER_VARS += KERNELPATH USER_VARS += KERNELVIRTIOFSPATH USER_VARS += FIRMWAREPATH USER_VARS += FIRMWAREVOLUMEPATH @@ -365,6 +413,7 @@ USER_VARS += DEFBRIDGES USER_VARS += DEFNETWORKMODEL_DB USER_VARS += DEFNETWORKMODEL_CLH USER_VARS += DEFNETWORKMODEL_QEMU +USER_VARS += DEFNETWORKMODEL_FC USER_VARS += DEFDISABLEGUESTEMPTYDIR USER_VARS += DEFDISABLEGUESTSECCOMP USER_VARS += DEFDISABLESELINUX @@ -374,6 +423,7 @@ USER_VARS += DEFDISABLEBLOCK USER_VARS += DEFBLOCKSTORAGEDRIVER_DB USER_VARS += DEFBLOCKSTORAGEDRIVER_QEMU USER_VARS += DEFBLOCKDEVICEAIO_QEMU +USER_VARS += DEFBLOCKSTORAGEDRIVER_FC USER_VARS += DEFSHAREDFS_CLH_VIRTIOFS USER_VARS += DEFSHAREDFS_QEMU_VIRTIOFS USER_VARS += DEFVIRTIOFSDAEMON @@ -396,8 +446,11 @@ USER_VARS += DEFENTROPYSOURCE USER_VARS += DEFVALIDENTROPYSOURCES USER_VARS += DEFSANDBOXCGROUPONLY USER_VARS += DEFSANDBOXCGROUPONLY_QEMU +USER_VARS += DEFSANDBOXCGROUPONLY_DB +USER_VARS += DEFSANDBOXCGROUPONLY_FC USER_VARS += DEFSTATICRESOURCEMGMT USER_VARS += DEFSTATICRESOURCEMGMT_DB +USER_VARS += DEFSTATICRESOURCEMGMT_FC USER_VARS += DEFBINDMOUNTS USER_VARS += DEFVFIOMODE USER_VARS += BUILDFLAGS @@ -405,6 +458,7 @@ USER_VARS += RUNTIMENAME USER_VARS += HYPERVISOR_DB USER_VARS += HYPERVISOR_CLH USER_VARS += HYPERVISOR_QEMU +USER_VARS += HYPERVISOR_FC USER_VARS += PIPESIZE USER_VARS += DBSHAREDFS USER_VARS += KATA_INSTALL_GROUP @@ -442,6 +496,7 @@ RUNTIME_VERSION=$(VERSION) GENERATED_VARS = \ VERSION \ CONFIG_DB_IN \ + CONFIG_FC_IN \ $(USER_VARS) @@ -483,6 +538,9 @@ endef define MAKE_KERNEL_NAME_DB $(if $(findstring uncompressed,$1),vmlinux-dragonball-experimental.container,vmlinuz-dragonball-experimental.container) endef +define MAKE_KERNEL_NAME_FC +$(if $(findstring 
uncompressed,$1),vmlinux.container,vmlinuz.container) +endef # Returns the name of the kernel file to use based on the provided KERNELTYPE. # # $1 : KERNELTYPE (compressed or uncompressed) diff --git a/src/runtime-rs/arch/aarch64-options.mk b/src/runtime-rs/arch/aarch64-options.mk index 2e9e5759b7..2a4e97befe 100644 --- a/src/runtime-rs/arch/aarch64-options.mk +++ b/src/runtime-rs/arch/aarch64-options.mk @@ -13,3 +13,5 @@ QEMUCMD := qemu-system-aarch64 # dragonball binary name DBCMD := dragonball +FCCMD := firecracker +FCJAILERCMD := jailer diff --git a/src/runtime-rs/arch/x86_64-options.mk b/src/runtime-rs/arch/x86_64-options.mk index ea44aeb506..3f905d829e 100644 --- a/src/runtime-rs/arch/x86_64-options.mk +++ b/src/runtime-rs/arch/x86_64-options.mk @@ -16,3 +16,7 @@ DBCMD := dragonball # cloud-hypervisor binary name CLHCMD := cloud-hypervisor + +# firecracker binary (vmm and jailer) +FCCMD := firecracker +FCJAILERCMD := jailer diff --git a/src/runtime-rs/config/configuration-dragonball.toml.in b/src/runtime-rs/config/configuration-dragonball.toml.in index 2998301055..6ed69e6e14 100644 --- a/src/runtime-rs/config/configuration-dragonball.toml.in +++ b/src/runtime-rs/config/configuration-dragonball.toml.in @@ -341,7 +341,7 @@ disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@ # The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation. # The sandbox cgroup is constrained if there is no container type annotation. # See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType -sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@ +sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY_DB@ # Enabled experimental feature list, format: ["a", "b"]. 
# Experimental features are features not stable enough for production, diff --git a/src/runtime-rs/config/configuration-rs-fc.toml.in b/src/runtime-rs/config/configuration-rs-fc.toml.in new file mode 100644 index 0000000000..786b1b2a92 --- /dev/null +++ b/src/runtime-rs/config/configuration-rs-fc.toml.in @@ -0,0 +1,373 @@ +# Copyright (c) 2017-2023 Intel Corporation +# Copyright (c) Adobe Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + +# XXX: WARNING: this file is auto-generated. +# XXX: +# XXX: Source file: "@CONFIG_FC_IN@" +# XXX: Project: +# XXX: Name: @PROJECT_NAME@ +# XXX: Type: @PROJECT_TYPE@ + +[hypervisor.firecracker] +path = "@FCPATH@" +kernel = "@KERNELPATH_FC@" +image = "@IMAGEPATH@" + +rootfs_type=@DEFROOTFSTYPE@ +# List of valid annotation names for the hypervisor +# Each member of the list is a regular expression, which is the base name +# of the annotation, e.g. "path" for "io.katacontainers.config.hypervisor.path" +enable_annotations = @DEFENABLEANNOTATIONS@ + +# List of valid annotations values for the hypervisor +# Each member of the list is a path pattern as described by glob(3). +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: @FCVALIDHYPERVISORPATHS@ +valid_hypervisor_paths = @FCVALIDHYPERVISORPATHS@ + +# Path for the jailer specific to firecracker +# If the jailer path is not set kata will launch firecracker +# without a jail. If the jailer is set firecracker will be +# launched in a jailed environment created by the jailer +#jailer_path = "@FCJAILERPATH@" + +# List of valid jailer path values for the hypervisor +# Each member of the list can be a regular expression +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: @FCVALIDJAILERPATHS@ +valid_jailer_paths = @FCVALIDJAILERPATHS@ + + +# Optional space-separated list of options to pass to the guest kernel. 
+# For example, use `kernel_params = "vsyscall=emulate"` if you are having +# trouble running pre-2.15 glibc. +# +# WARNING: - any parameter specified here will take priority over the default +# parameter value of the same name used to start the virtual machine. +# Do not set values here unless you understand the impact of doing so as you +# may stop the virtual machine from booting. +# To see the list of default parameters, enable hypervisor debug, create a +# container and look for 'default-kernel-parameters' log entries. +kernel_params = "@KERNELPARAMS@" + +# Default number of vCPUs per SB/VM: +# unspecified or 0 --> will be set to @DEFVCPUS@ +# < 0 --> will be set to the actual number of physical cores +# > 0 <= number of physical cores --> will be set to the specified number +# > number of physical cores --> will be set to the actual number of physical cores +default_vcpus = 1 + +# Default maximum number of vCPUs per SB/VM: +# unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number +# of vCPUs supported by KVM if that number is exceeded +# > 0 <= number of physical cores --> will be set to the specified number +# > number of physical cores --> will be set to the actual number of physical cores or to the maximum number +# of vCPUs supported by KVM if that number is exceeded +# WARNING: Depending on the architecture, the maximum number of vCPUs supported by KVM is used when +# the actual number of physical cores is greater than it. +# WARNING: Be aware that this value impacts the virtual machine's memory footprint and the CPU +# hotplug functionality. For example, `default_maxvcpus = 240` specifies that up to 240 vCPUs +# can be added to a SB/VM, but the memory footprint will be big. Another example, with +# `default_maxvcpus = 8` the memory footprint will be small, but 8 will be the maximum number of +# vCPUs supported by the SB/VM. 
In general, we recommend that you do not edit this variable, +# unless you know what you are doing. +# NOTICE: on arm platform with gicv2 interrupt controller, set it to 8. +default_maxvcpus = @DEFMAXVCPUS@ + +# Bridges can be used to hot plug devices. +# Limitations: +# * Currently only pci bridges are supported +# * Up to 30 devices per bridge can be hot plugged. +# * Up to 5 PCI bridges can be cold plugged per VM. +# This limitation could be a bug in the kernel +# Default number of bridges per SB/VM: +# unspecified or 0 --> will be set to @DEFBRIDGES@ +# > 1 <= 5 --> will be set to the specified number +# > 5 --> will be set to 5 +default_bridges = @DEFBRIDGES@ + +# Default memory size in MiB for SB/VM. +# If unspecified then it will be set to @DEFMEMSZ@ MiB. +default_memory = @DEFMEMSZ@ + +# +# Default memory slots per SB/VM. +# If unspecified then it will be set to @DEFMEMSLOTS@. +# This will determine the times that memory will be hotadded to sandbox/VM. +memory_slots = @DEFMEMSLOTS@ + +# The size in MiB will be added to max memory of hypervisor. +# It is the memory address space for the NVDIMM device. +# If set block storage driver (block_device_driver) to "nvdimm", +# should set memory_offset to the size of block device. +# Default 0 +#memory_offset = 0 + +# Default maximum memory in MiB per SB / VM +# unspecified or == 0 --> will be set to the actual amount of physical RAM +# > 0 <= amount of physical RAM --> will be set to the specified number +# > amount of physical RAM --> will be set to the actual amount of physical RAM +default_maxmemory = @DEFMAXMEMSZ_FC@ + +# Block storage driver to be used for the hypervisor in case the container +# rootfs is backed by a block device. This is virtio-scsi, virtio-blk +# or nvdimm. +block_device_driver = "@DEFBLOCKSTORAGEDRIVER_FC@" + +# Specifies cache-related options will be set to block devices or not. +# Default false +#block_device_cache_set = true + +# Specifies cache-related options for block devices. 
+# Denotes whether use of O_DIRECT (bypass the host page cache) is enabled. +# Default false +#block_device_cache_direct = true + +# Specifies cache-related options for block devices. +# Denotes whether flush requests for the device are ignored. +# Default false +#block_device_cache_noflush = true + +# Enable pre allocation of VM RAM, default false +# Enabling this will result in lower container density +# as all of the memory will be allocated and locked +# This is useful when you want to reserve all the memory +# upfront or in the cases where you want memory latencies +# to be very predictable +# Default false +#enable_mem_prealloc = true + +# Enable huge pages for VM RAM, default false +# Enabling this will result in the VM memory +# being allocated using huge pages. +# This is useful when you want to use vhost-user network +# stacks within the container. This will automatically +# result in memory pre allocation +#enable_hugepages = true + +# Enable vIOMMU, default false +# Enabling this will result in the VM having a vIOMMU device +# This will also add the following options to the kernel's +# command line: intel_iommu=on,iommu=pt +#enable_iommu = true + +# This option changes the default hypervisor and kernel parameters +# to enable debug output where available. +# +# Default false +#enable_debug = true + +# Disable the customizations done in the runtime when it detects +# that it is running on top of a VMM. This will result in the runtime +# behaving as it would when running on bare metal. +# +#disable_nesting_checks = true + +# This is the msize used for 9p shares. It is the number of bytes +# used for 9p packet payload. +#msize_9p = @DEFMSIZE9P@ + +# VFIO devices are hotplugged on a bridge by default. +# Enable hotplugging on root bus. This may be required for devices with +# a large PCI bar, as this is a current limitation with hotplugging on +# a bridge. +# Default false +#hotplug_vfio_on_root_bus = true + +# +# Default entropy source. 
+# The path to a host source of entropy (including a real hardware RNG) +# /dev/urandom and /dev/random are two main options. +# Be aware that /dev/random is a blocking source of entropy. If the host +# runs out of entropy, the VM's boot time will increase, leading to startup +# timeouts. +# The source of entropy /dev/urandom is non-blocking and provides a +# generally acceptable source of entropy. It should work well for pretty much +# all practical purposes. +#entropy_source= "@DEFENTROPYSOURCE@" + +# List of valid annotations values for entropy_source +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: @DEFVALIDENTROPYSOURCES@ +valid_entropy_sources = @DEFVALIDENTROPYSOURCES@ + +# Path to OCI hook binaries in the *guest rootfs*. +# This does not affect host-side hooks which must instead be added to +# the OCI spec passed to the runtime. +# +# You can create a rootfs with hooks by customizing the osbuilder scripts: +# https://github.com/kata-containers/kata-containers/tree/main/tools/osbuilder +# +# Hooks must be stored in a subdirectory of guest_hook_path according to their +# hook type, i.e. "guest_hook_path/{prestart,poststart,poststop}". +# The agent will scan these directories for executable files and add them, in +# lexicographical order, to the lifecycle of the guest container. +# Hooks are executed in the runtime namespace of the guest. See the official documentation: +# https://github.com/opencontainers/runtime-spec/blob/v1.0.1/config.md#posix-platform-hooks +# Warnings will be logged if any error is encountered while scanning for hooks, +# but it will not abort container execution. +#guest_hook_path = "/usr/share/oci/hooks" +# +# Use rx Rate Limiter to control network I/O inbound bandwidth(size in bits/sec for SB/VM). +# In Firecracker, it provides a built-in rate limiter, which is based on TBF(Token Bucket Filter) +# queueing discipline. +# Default 0-sized value means unlimited rate. 
+#rx_rate_limiter_max_rate = 0 +# Use tx Rate Limiter to control network I/O outbound bandwidth(size in bits/sec for SB/VM). +# In Firecracker, it provides a built-in rate limiter, which is based on TBF(Token Bucket Filter) +# queueing discipline. +# Default 0-sized value means unlimited rate. +#tx_rate_limiter_max_rate = 0 + +# disable applying SELinux on the VMM process (default false) +disable_selinux=@DEFDISABLESELINUX@ + +[factory] +# VM templating support. Once enabled, new VMs are created from template +# using vm cloning. They will share the same initial kernel, initramfs and +# agent memory by mapping it readonly. It helps speeding up new container +# creation and saves a lot of memory if there are many kata containers running +# on the same host. +# +# When disabled, new VMs are created from scratch. +# +# Note: Requires "initrd=" to be set ("image=" is not supported). +# +# Default false +#enable_template = true + +[agent.@PROJECT_TYPE@] +# If enabled, make the agent display debug-level messages. +# (default: disabled) +#enable_debug = true + +# Enable agent tracing. +# +# If enabled, the agent will generate OpenTelemetry trace spans. +# +# Notes: +# +# - If the runtime also has tracing enabled, the agent spans will be +# associated with the appropriate runtime parent span. +# - If enabled, the runtime will wait for the container to shutdown, +# increasing the container shutdown time slightly. +# +# (default: disabled) +#enable_tracing = true + +# Comma separated list of kernel modules and their parameters. +# These modules will be loaded in the guest kernel using modprobe(8). +# The following example can be used to load two kernel modules with parameters +# - kernel_modules=["e1000e InterruptThrottleRate=3000,3000,3000 EEE=1", "i915 enable_ppgtt=0"] +# The first word is considered as the module name and the rest as its parameters. 
+# Container will not be started when: +# * A kernel module is specified and the modprobe command is not installed in the guest +# or it fails loading the module. +# * The module is not available in the guest or it doesn't meet the guest kernel +# requirements, like architecture and version. +# +kernel_modules=[] + +# Enable debug console. + +# If enabled, user can connect guest OS running inside hypervisor +# through "kata-runtime exec <sandbox-id>" command + +#debug_console_enabled = true + +# Agent connection dialing timeout value in seconds +# (default: 45) +dial_timeout = 45 + +[runtime] +# If enabled, the runtime will log additional debug messages to the +# system log +# (default: disabled) +#enable_debug = true +# +# Internetworking model +# Determines how the VM should be connected to +# the container network interface +# Options: +# +# - macvtap +# Used when the Container network interface can be bridged using +# macvtap. +# +# - none +# Used when customizing the network. Only creates a tap device. No veth pair. +# +# - tcfilter +# Uses tc filter rules to redirect traffic from the network interface +# provided by plugin to a tap interface connected to the VM. +# +internetworking_model="@DEFNETWORKMODEL_FC@" + +name="@RUNTIMENAME@" +hypervisor_name="@HYPERVISOR_FC@" +agent_name="@PROJECT_TYPE@" + +# disable guest seccomp +# Determines whether container seccomp profiles are passed to the virtual +# machine and applied by the kata agent. If set to true, seccomp is not applied +# within the guest +# (default: true) +disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@ + +# If enabled, the runtime will create opentracing.io traces and spans. +# (See https://www.jaegertracing.io/docs/getting-started). +# (default: disabled) +#enable_tracing = true + +# Set the full url to the Jaeger HTTP Thrift collector. +# The default if not set will be "http://localhost:14268/api/traces" +#jaeger_endpoint = "" + +# Sets the username to be used if basic auth is required for Jaeger. 
+#jaeger_user = "" + +# Sets the password to be used if basic auth is required for Jaeger. +#jaeger_password = "" + +# If enabled, the runtime will not create a network namespace for shim and hypervisor processes. +# This option may have some potential impacts to your host. It should only be used when you know what you're doing. +# `disable_new_netns` conflicts with `internetworking_model=tcfilter` and `internetworking_model=macvtap`. It works only +# with `internetworking_model=none`. The tap device will be in the host network namespace and can connect to a bridge +# (like OVS) directly. +# (default: false) +#disable_new_netns = true + +# if enabled, the runtime will add all the kata processes inside one dedicated cgroup. +# The container cgroups in the host are not created, just one single cgroup per sandbox. +# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox. +# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation. +# The sandbox cgroup is constrained if there is no container type annotation. +# See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType +sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY_FC@ + +# If enabled, the runtime will attempt to determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In +# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. This is generally helpful +# when a hardware architecture or hypervisor solutions is utilized which does not support CPU and/or memory hotplug. +# Compatibility for determining appropriate sandbox (VM) size: +# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O +# does not yet support sandbox sizing annotations. +# - When running single containers using a tool like ctr, container sizing information will be available. 
+static_sandbox_resource_mgmt=@DEFSTATICRESOURCEMGMT_FC@ + +# If enabled, the runtime will not create Kubernetes emptyDir mounts on the guest filesystem. Instead, emptyDir mounts will +# be created on the host and shared via virtio-fs. This is potentially slower, but allows sharing of files from host to guest. +disable_guest_empty_dir=@DEFDISABLEGUESTEMPTYDIR@ + +# Enabled experimental feature list, format: ["a", "b"]. +# Experimental features are features not stable enough for production, +# they may break compatibility, and are prepared for a big version bump. +# Supported experimental features: +# (default: []) +experimental=@DEFAULTEXPFEATURES@ + +# If enabled, user can run pprof tools with shim v2 process through kata-monitor. +# (default: false) +# enable_pprof = true diff --git a/src/runtime-rs/crates/hypervisor/Cargo.toml b/src/runtime-rs/crates/hypervisor/Cargo.toml index 1b26596486..c0e122b235 100644 --- a/src/runtime-rs/crates/hypervisor/Cargo.toml +++ b/src/runtime-rs/crates/hypervisor/Cargo.toml @@ -49,6 +49,9 @@ qapi-qmp = "0.14.0" [target.'cfg(not(target_arch = "s390x"))'.dependencies] dragonball = { path = "../../../dragonball", features = ["atomic-guest-memory", "virtio-vsock", "hotplug", "virtio-blk", "virtio-net", "virtio-fs", "vhost-net", "dbs-upcall", "virtio-mem", "virtio-balloon", "vhost-user-net", "host-device"] } +dbs-utils = { path = "../../../dragonball/src/dbs_utils" } +hyperlocal = "0.8.0" +hyper = {version = "0.14.18", features = ["client"]} [features] default = [] diff --git a/src/runtime-rs/crates/hypervisor/src/firecracker/fc_api.rs b/src/runtime-rs/crates/hypervisor/src/firecracker/fc_api.rs new file mode 100644 index 0000000000..c1f4a7fe35 --- /dev/null +++ b/src/runtime-rs/crates/hypervisor/src/firecracker/fc_api.rs @@ -0,0 +1,324 @@ +//Copyright (c) 2019-2022 Alibaba Cloud +//Copyright (c) 2023 Nubificus Ltd +// +//SPDX-License-Identifier: Apache-2.0 + +use crate::{ + firecracker::{ + inner_hypervisor::{FC_AGENT_SOCKET_NAME, 
ROOT}, + sl, FcInner, + }, + kernel_param::KernelParams, + NetworkConfig, Param, +}; +use anyhow::{anyhow, Context, Result}; +use dbs_utils::net::MacAddr; +use hyper::{Body, Method, Request, Response}; +use hyperlocal::Uri; +use kata_sys_util::mount; +use nix::mount::MsFlags; +use serde_json::json; +use tokio::{fs, fs::File}; + +const REQUEST_RETRY: u32 = 500; +const FC_KERNEL: &str = "vmlinux"; +const FC_ROOT_FS: &str = "rootfs"; +const DRIVE_PREFIX: &str = "drive"; +const DISK_POOL_SIZE: u32 = 6; + +impl FcInner { + pub(crate) fn get_resource(&self, src: &str, dst: &str) -> Result { + if self.jailed { + self.jail_resource(src, dst) + } else { + Ok(src.to_string()) + } + } + + fn jail_resource(&self, src: &str, dst: &str) -> Result { + if src.is_empty() || dst.is_empty() { + return Err(anyhow!("invalid param src {} dst {}", src, dst)); + } + + let jailed_location = [self.vm_path.as_str(), ROOT, dst].join("/"); + mount::bind_mount_unchecked(src, jailed_location.as_str(), false, MsFlags::MS_SLAVE) + .context("bind_mount ERROR")?; + + let mut abs_path = String::from("/"); + abs_path.push_str(dst); + Ok(abs_path) + } + + // Remounting jailer root to ensure it has exec permissions, since firecracker binary will + // execute from there + pub(crate) async fn remount_jailer_with_exec(&self) -> Result<()> { + let localpath = [self.vm_path.clone(), ROOT.to_string()].join("/"); + let _ = fs::create_dir_all(&localpath) + .await + .context(format!("failed to create directory {:?}", &localpath)); + mount::bind_mount_unchecked(&localpath, &localpath, false, MsFlags::MS_SHARED) + .context("bind mount jailer root")?; + + mount::bind_remount(&localpath, false).context("rebind mount jailer root")?; + Ok(()) + } + + pub(crate) async fn prepare_hvsock(&mut self) -> Result<()> { + let rel_uds_path = match self.jailed { + false => [self.vm_path.as_str(), FC_AGENT_SOCKET_NAME].join("/"), + true => FC_AGENT_SOCKET_NAME.to_string(), + }; + + let body_vsock: String = json!({ + "guest_cid": 
3, + "uds_path": rel_uds_path, + "vsock_id": ROOT, + }) + .to_string(); + + self.request_with_retry(Method::PUT, "/vsock", body_vsock) + .await?; + Ok(()) + } + + pub(crate) async fn prepare_vmm_resources(&mut self) -> Result<()> { + let mut kernel_params = KernelParams::new(self.config.debug_info.enable_debug); + kernel_params.push(Param::new("pci", "off")); + kernel_params.push(Param::new("iommu", "off")); + let rootfs_driver = self.config.blockdev_info.block_device_driver.clone(); + + kernel_params.append(&mut KernelParams::new_rootfs_kernel_params( + &rootfs_driver, + &self.config.boot_info.rootfs_type, + )?); + kernel_params.append(&mut KernelParams::from_string( + &self.config.boot_info.kernel_params, + )); + let mut parameters = String::new().to_owned(); + + for param in &kernel_params.to_string() { + parameters.push_str(¶m.to_string()); + } + + let kernel = self + .get_resource(&self.config.boot_info.kernel, FC_KERNEL) + .context("get resource KERNEL")?; + let rootfs = self + .get_resource(&self.config.boot_info.image, FC_ROOT_FS) + .context("get resource ROOTFS")?; + + let body_kernel: String = json!({ + "kernel_image_path": kernel, + "boot_args": parameters, + }) + .to_string(); + + let body_rootfs: String = json!({ + "drive_id": "rootfs", + "path_on_host": rootfs, + "is_root_device": false, + "is_read_only": true + }) + .to_string(); + + info!(sl(), "Before first request"); + self.request_with_retry(Method::PUT, "/boot-source", body_kernel) + .await?; + self.request_with_retry(Method::PUT, "/drives/rootfs", body_rootfs) + .await?; + + let abs_path = [&self.vm_path, ROOT].join("/"); + + let rel_path = "/".to_string(); + let _ = fs::create_dir_all(&abs_path) + .await + .context(format!("failed to create directory {:?}", &abs_path)); + + // We create some placeholder drives to be used for patching block devices while the vmm is + // running, as firecracker does not support device hotplug. 
+ for i in 1..DISK_POOL_SIZE { + let full_path_name = format!("{}/drive{}", abs_path, i); + + let _ = File::create(&full_path_name) + .await + .context(format!("failed to create file {:?}", &full_path_name)); + + let path_on_host = match self.jailed { + false => abs_path.clone(), + true => rel_path.clone(), + }; + let body: String = json!({ + "drive_id": format!("drive{}",i), + "path_on_host": format!("{}/drive{}", path_on_host, i), + "is_root_device": false, + "is_read_only": false + }) + .to_string(); + + self.request_with_retry(Method::PUT, &format!("/drives/drive{}", i), body) + .await?; + } + + Ok(()) + } + pub(crate) async fn patch_container_rootfs( + &mut self, + drive_id: &str, + drive_path: &str, + ) -> Result<()> { + let new_drive_id = &[DRIVE_PREFIX, drive_id].concat(); + let new_drive_path = self + .get_resource(drive_path, new_drive_id) + .context("get resource CONTAINER ROOTFS")?; + let body: String = json!({ + "drive_id": format!("drive{drive_id}"), + "path_on_host": new_drive_path + }) + .to_string(); + self.request_with_retry( + Method::PATCH, + &["/drives/", &format!("drive{drive_id}")].concat(), + body, + ) + .await?; + Ok(()) + } + + pub(crate) async fn add_net_device( + &mut self, + config: &NetworkConfig, + device_id: String, + ) -> Result<()> { + let g_mac = match &config.guest_mac { + Some(mac) => MacAddr::from_bytes(&mac.0).ok(), + None => None, + }; + let body: String = json!({ + "iface_id": &device_id, + "guest_mac": g_mac, + "host_dev_name": &config.host_dev_name + + }) + .to_string(); + self.request_with_retry( + Method::PUT, + &["/network-interfaces/", &device_id].concat(), + body, + ) + .await?; + Ok(()) + } + + pub(crate) async fn request_with_retry( + &self, + method: Method, + uri: &str, + data: String, + ) -> Result<()> { + let url: hyper::Uri = Uri::new(&self.asock_path, uri).into(); + self.send_request_with_retry(method, url, data).await + } + + pub(crate) async fn send_request_with_retry( + &self, + method: Method, + uri: 
hyper::Uri, + data: String, + ) -> Result<()> { + debug!(sl(), "METHOD: {:?}", method.clone()); + debug!(sl(), "URI: {:?}", uri.clone()); + debug!(sl(), "DATA: {:?}", data.clone()); + for _count in 0..REQUEST_RETRY { + let req = Request::builder() + .method(method.clone()) + .uri(uri.clone()) + .header("Accept", "application/json") + .header("Content-Type", "application/json") + .body(Body::from(data.clone()))?; + + match self.send_request(req).await { + Ok(resp) => { + debug!(sl(), "Request sent, resp: {:?}", resp); + return Ok(()); + } + Err(resp) => { + debug!(sl(), "Request sent with error, resp: {:?}", resp); + std::thread::sleep(std::time::Duration::from_millis(10)); + continue; + } + } + } + Err(anyhow::anyhow!( + "After {} attempts, it still doesn't work.", + REQUEST_RETRY + )) + } + + pub(crate) async fn send_request(&self, req: Request) -> Result> { + let resp = self.client.request(req).await?; + + let status = resp.status(); + debug!(sl(), "Request RESPONSE {:?} {:?}", &status, resp); + if status.is_success() { + return Ok(resp); + } else { + let body = hyper::body::to_bytes(resp.into_body()).await?; + if body.is_empty() { + debug!(sl(), "Request FAILED WITH STATUS: {:?}", status); + None + } else { + let body = String::from_utf8_lossy(&body).into_owned(); + debug!( + sl(), + "Request FAILED WITH STATUS: {:?} and BODY: {:?}", status, body + ); + Some(body) + }; + } + + Err(anyhow::anyhow!( + "After {} attempts, it + still doesn't work.", + REQUEST_RETRY + )) + } + pub(crate) fn cleanup_resource(&self) { + if self.jailed { + self.umount_jail_resource(FC_KERNEL).ok(); + self.umount_jail_resource(FC_ROOT_FS).ok(); + + for i in 1..DISK_POOL_SIZE { + self.umount_jail_resource(&[DRIVE_PREFIX, &i.to_string()].concat()) + .ok(); + } + + self.umount_jail_resource("").ok(); + } + std::fs::remove_dir_all(self.vm_path.as_str()) + .map_err(|err| { + error!( + sl(), + "failed to remove dir all for {} with error: {:?}", &self.vm_path, &err + ); + err + }) + .ok(); + } 
+ + pub(crate) fn umount_jail_resource(&self, jailed_path: &str) -> Result<()> { + let path = match jailed_path { + // Handle final case to umount the bind-mounted `/run/kata/firecracker/{id}/root` dir + "" => [self.vm_path.clone(), ROOT.to_string()].join("/"), + // Handle generic case to umount the bind-mounted + // `/run/kata/firecracker/{id}/root/asset` file/dir + _ => [ + self.vm_path.clone(), + ROOT.to_string(), + jailed_path.to_string(), + ] + .join("/"), + }; + nix::mount::umount2(path.as_str(), nix::mount::MntFlags::MNT_DETACH) + .with_context(|| format!("umount path {}", &path)) + } +} diff --git a/src/runtime-rs/crates/hypervisor/src/firecracker/inner.rs b/src/runtime-rs/crates/hypervisor/src/firecracker/inner.rs new file mode 100644 index 0000000000..58697aafe4 --- /dev/null +++ b/src/runtime-rs/crates/hypervisor/src/firecracker/inner.rs @@ -0,0 +1,208 @@ +//Copyright (c) 2019-2022 Alibaba Cloud +//Copyright (c) 2023 Nubificus Ltd +// +//SPDX-License-Identifier: Apache-2.0 + +use crate::firecracker::{inner_hypervisor::FC_API_SOCKET_NAME, sl}; +use crate::HypervisorState; +use crate::MemoryConfig; +use crate::HYPERVISOR_FIRECRACKER; +use crate::{device::DeviceType, VmmState}; +use anyhow::{Context, Result}; +use async_trait::async_trait; +use hyper::Client; +use hyperlocal::{UnixClientExt, UnixConnector}; +use kata_types::{ + capabilities::{Capabilities, CapabilityBits}, + config::hypervisor::Hypervisor as HypervisorConfig, +}; +use nix::sched::{setns, CloneFlags}; +use persist::sandbox_persist::Persist; +use std::os::unix::io::AsRawFd; +use tokio::process::Command; + +unsafe impl Send for FcInner {} +unsafe impl Sync for FcInner {} + +#[derive(Debug)] +pub struct FcInner { + pub(crate) id: String, + pub(crate) asock_path: String, + pub(crate) state: VmmState, + pub(crate) config: HypervisorConfig, + pub(crate) pid: Option, + pub(crate) vm_path: String, + pub(crate) netns: Option, + pub(crate) client: Client, + pub(crate) jailer_root: String, + pub(crate) 
jailed: bool, + pub(crate) run_dir: String, + pub(crate) pending_devices: Vec, + pub(crate) capabilities: Capabilities, +} + +impl FcInner { + pub fn new() -> FcInner { + let mut capabilities = Capabilities::new(); + capabilities.set(CapabilityBits::BlockDeviceSupport); + FcInner { + id: String::default(), + asock_path: String::default(), + state: VmmState::NotReady, + config: Default::default(), + pid: None, + netns: None, + vm_path: String::default(), + client: Client::unix(), + jailer_root: String::default(), + jailed: false, + run_dir: String::default(), + pending_devices: vec![], + capabilities, + } + } + + pub(crate) async fn prepare_vmm(&mut self, netns: Option) -> Result<()> { + let mut cmd: Command; + self.netns = netns.clone(); + match self.jailed { + true => { + debug!(sl(), "Running Jailed"); + cmd = Command::new(&self.config.jailer_path); + let api_socket = ["/run/", FC_API_SOCKET_NAME].join("/"); + let args = [ + "--id", + &self.id, + "--gid", + "0", + "--uid", + "0", + "--exec-file", + &self.config.path, + "--chroot-base-dir", + &self.jailer_root, + "--", + "--api-sock", + &api_socket, + ]; + cmd.args(args); + } + false => { + debug!(sl(), "Running non-Jailed"); + cmd = Command::new(&self.config.path); + cmd.args(["--api-sock", &self.asock_path]); + } + } + debug!(sl(), "Exec: {:?}", cmd); + + // Make sure we're in the correct Network Namespace + unsafe { + let _pre = cmd.pre_exec(move || { + if let Some(netns_path) = &netns { + debug!(sl(), "set netns for vmm master {:?}", &netns_path); + let netns_fd = std::fs::File::open(netns_path); + let _ = setns(netns_fd?.as_raw_fd(), CloneFlags::CLONE_NEWNET) + .context("set netns failed"); + } + Ok(()) + }); + } + + let mut child = cmd.spawn()?; + + match child.id() { + Some(id) => { + let cur_tid = nix::unistd::gettid().as_raw() as u32; + info!( + sl(), + "VMM spawned successfully: PID: {:?}, current TID: {:?}", id, cur_tid + ); + self.pid = Some(id); + } + None => { + let exit_status = child.wait().await?; 
+ error!(sl(), "Process exited, status: {:?}", exit_status); + } + }; + Ok(()) + } + + pub(crate) fn hypervisor_config(&self) -> HypervisorConfig { + debug!(sl(), "[Firecracker]: Hypervisor config"); + self.config.clone() + } + + pub(crate) fn set_hypervisor_config(&mut self, config: HypervisorConfig) { + debug!(sl(), "[Firecracker]: Set Hypervisor config"); + self.config = config; + } + + pub(crate) fn resize_memory(&mut self, new_mem_mb: u32) -> Result<(u32, MemoryConfig)> { + warn!( + sl(), + "memory size unchanged, requested: {:?} Not implemented", new_mem_mb + ); + Ok(( + 0, + MemoryConfig { + ..Default::default() + }, + )) + } + + pub(crate) fn set_capabilities(&mut self, flag: CapabilityBits) { + self.capabilities.add(flag); + } + + pub(crate) fn set_guest_memory_block_size(&mut self, size: u32) { + warn!( + sl(), + "guest memory block size unchanged, requested: {:?}, Not implemented", size + ); + } + + pub(crate) fn guest_memory_block_size_mb(&self) -> u32 { + warn!(sl(), "guest memory block size Not implemented"); + 0 + } +} + +#[async_trait] +impl Persist for FcInner { + type State = HypervisorState; + type ConstructorArgs = (); + + async fn save(&self) -> Result { + Ok(HypervisorState { + hypervisor_type: HYPERVISOR_FIRECRACKER.to_string(), + id: self.id.clone(), + vm_path: self.vm_path.clone(), + config: self.hypervisor_config(), + jailed: self.jailed, + jailer_root: self.jailer_root.clone(), + run_dir: self.run_dir.clone(), + netns: self.netns.clone(), + ..Default::default() + }) + } + async fn restore( + _hypervisor_args: Self::ConstructorArgs, + hypervisor_state: Self::State, + ) -> Result { + Ok(FcInner { + id: hypervisor_state.id, + asock_path: String::default(), + state: VmmState::NotReady, + vm_path: hypervisor_state.vm_path, + config: hypervisor_state.config, + netns: hypervisor_state.netns, + pid: None, + jailed: hypervisor_state.jailed, + jailer_root: hypervisor_state.jailer_root, + client: Client::unix(), + pending_devices: vec![], + run_dir: 
hypervisor_state.run_dir, + capabilities: Capabilities::new(), + }) + } +} diff --git a/src/runtime-rs/crates/hypervisor/src/firecracker/inner_device.rs b/src/runtime-rs/crates/hypervisor/src/firecracker/inner_device.rs new file mode 100644 index 0000000000..6e46ccee48 --- /dev/null +++ b/src/runtime-rs/crates/hypervisor/src/firecracker/inner_device.rs @@ -0,0 +1,102 @@ +//Copyright (c) 2019-2022 Alibaba Cloud +//Copyright (c) 2019-2022 Ant Group +//Copyright (c) 2023 Nubificus Ltd +// +//SPDX-License-Identifier: Apache-2.0 + +use super::FcInner; +use crate::firecracker::{ + inner_hypervisor::{FC_AGENT_SOCKET_NAME, ROOT}, + sl, +}; +use crate::VmmState; +use crate::{device::DeviceType, HybridVsockConfig, VsockConfig}; +use anyhow::{anyhow, Context, Result}; +use serde_json::json; + +impl FcInner { + pub(crate) async fn add_device(&mut self, device: DeviceType) -> Result<()> { + if self.state == VmmState::NotReady { + info!(sl(), "VMM not ready, queueing device {}", device); + + self.pending_devices.insert(0, device); + + return Ok(()); + } + + debug!(sl(), "Add Device {} ", &device); + + match device { + DeviceType::Block(block) => self + .hotplug_block_device(block.config.path_on_host.as_str(), block.config.index) + .await + .context("add block device"), + DeviceType::Network(network) => self + .add_net_device(&network.config, network.device_id) + .await + .context("add net device"), + DeviceType::HybridVsock(hvsock) => { + self.add_hvsock(&hvsock.config).await.context("add vsock") + } + DeviceType::Vsock(vsock) => self.add_vsock(&vsock.config).await.context("add vsock"), + _ => Err(anyhow!("unhandled device: {:?}", device)), + } + } + + // Since Firecracker doesn't support sharefs, we patch block devices on pre-start inserted + // dummy drives + pub(crate) async fn hotplug_block_device(&mut self, path: &str, id: u64) -> Result<()> { + if id > 0 { + self.patch_container_rootfs(&id.to_string(), path).await?; + } + Ok(()) + } + + pub(crate) async fn 
remove_device(&mut self, device: DeviceType) -> Result<()> { + info!(sl(), "Remove Device {} ", device); + Ok(()) + } + + pub(crate) async fn update_device(&mut self, device: DeviceType) -> Result<()> { + info!(sl(), "update device {:?}", &device); + Ok(()) + } + + pub(crate) async fn add_hvsock(&mut self, config: &HybridVsockConfig) -> Result<()> { + let rel_uds_path = match self.jailed { + false => [self.vm_path.as_str(), FC_AGENT_SOCKET_NAME].join("/"), + true => FC_AGENT_SOCKET_NAME.to_string(), + }; + let body_vsock: String = json!({ + "vsock_id": String::from(ROOT), + "guest_cid": config.guest_cid, + "uds_path": rel_uds_path, + }) + .to_string(); + + info!(sl(), "HybridVsock configure: {:?}", &body_vsock); + + self.request_with_retry(hyper::Method::PUT, "/vsock", body_vsock) + .await?; + Ok(()) + } + + pub(crate) async fn add_vsock(&mut self, config: &VsockConfig) -> Result<()> { + let rel_uds_path = match self.jailed { + false => [self.vm_path.as_str(), FC_AGENT_SOCKET_NAME].join("/"), + true => FC_AGENT_SOCKET_NAME.to_string(), + }; + let body_vsock: String = json!({ + "vsock_id": String::from(ROOT), + "guest_cid": config.guest_cid, + "uds_path": rel_uds_path, + }) + .to_string(); + + info!(sl(), "HybridVsock configure: {:?}", &body_vsock); + + self.request_with_retry(hyper::Method::PUT, "/vsock", body_vsock) + .await?; + Ok(()) + } +} diff --git a/src/runtime-rs/crates/hypervisor/src/firecracker/inner_hypervisor.rs b/src/runtime-rs/crates/hypervisor/src/firecracker/inner_hypervisor.rs new file mode 100644 index 0000000000..d176ec43d1 --- /dev/null +++ b/src/runtime-rs/crates/hypervisor/src/firecracker/inner_hypervisor.rs @@ -0,0 +1,192 @@ +//Copyright (c) 2019-2022 Alibaba Cloud +//Copyright (c) 2023 Nubificus Ltd +// +//SPDX-License-Identifier: Apache-2.0 + +use crate::firecracker::{sl, FcInner}; +use crate::{VcpuThreadIds, VmmState, HYPERVISOR_FIRECRACKER}; +use anyhow::{anyhow, Context, Result}; +use kata_types::capabilities::Capabilities; +use 
kata_types::config::KATA_PATH; +use std::collections::HashSet; +use std::iter::FromIterator; +use tokio::fs; + +pub const FC_API_SOCKET_NAME: &str = "fc.sock"; +pub const FC_AGENT_SOCKET_NAME: &str = "kata.hvsock"; +pub const ROOT: &str = "root"; + +const HYBRID_VSOCK_SCHEME: &str = "hvsock"; + +impl FcInner { + pub(crate) async fn prepare_vm(&mut self, id: &str, _netns: Option) -> Result<()> { + debug!(sl(), "Preparing Firecracker"); + + self.id = id.to_string(); + + if !self.config.jailer_path.is_empty() { + debug!(sl(), "Running jailed"); + self.jailed = true; + self.jailer_root = KATA_PATH.to_string(); + debug!(sl(), "jailer_root: {:?}", self.jailer_root); + self.vm_path = [ + self.jailer_root.clone(), + HYPERVISOR_FIRECRACKER.to_string(), + id.to_string(), + ] + .join("/"); + debug!(sl(), "VM Path: {:?}", self.vm_path); + self.run_dir = [self.vm_path.clone(), "root".to_string(), "run".to_string()].join("/"); + debug!(sl(), "Rundir: {:?}", self.run_dir); + let _ = self.remount_jailer_with_exec().await; + } else { + self.vm_path = [KATA_PATH.to_string(), id.to_string()].join("/"); + debug!(sl(), "VM Path: {:?}", self.vm_path); + self.run_dir = [self.vm_path.clone(), "run".to_string()].join("/"); + debug!(sl(), "Rundir: {:?}", self.run_dir); + } + // We construct the FC API socket path based on the run_dir variable (jailed or + // non-jailed). 
+ self.asock_path = [self.run_dir.as_str(), "fc.sock"].join("/"); + debug!(sl(), "Socket Path: {:?}", self.asock_path); + + let _ = fs::create_dir_all(self.run_dir.as_str()) + .await + .context(format!("failed to create directory {:?}", self.vm_path)); + + self.netns = _netns.clone(); + self.prepare_vmm(self.netns.clone()).await?; + self.state = VmmState::VmmServerReady; + self.prepare_vmm_resources().await?; + self.prepare_hvsock().await?; + Ok(()) + } + + pub(crate) async fn start_vm(&mut self, _timeout: i32) -> Result<()> { + debug!(sl(), "Starting sandbox"); + let body: String = serde_json::json!({ + "action_type": "InstanceStart" + }) + .to_string(); + self.request_with_retry(hyper::Method::PUT, "/actions", body) + .await?; + self.state = VmmState::VmRunning; + Ok(()) + } + + pub(crate) async fn stop_vm(&mut self) -> Result<()> { + debug!(sl(), "Stopping sandbox"); + if self.state != VmmState::VmRunning { + debug!(sl(), "VM not running!"); + } else if let Some(pid_to_kill) = &self.pid { + let pid = ::nix::unistd::Pid::from_raw(*pid_to_kill as i32); + if let Err(err) = ::nix::sys::signal::kill(pid, nix::sys::signal::SIGKILL) { + if err != ::nix::Error::ESRCH { + debug!(sl(), "Failed to kill VMM with pid {} {:?}", pid, err); + } + } + } + Ok(()) + } + + pub(crate) fn pause_vm(&self) -> Result<()> { + warn!(sl(), "Pause VM: Not implemented"); + Ok(()) + } + + pub(crate) async fn save_vm(&self) -> Result<()> { + warn!(sl(), "Save VM: Not implemented"); + Ok(()) + } + pub(crate) fn resume_vm(&self) -> Result<()> { + warn!(sl(), "Resume VM: Not implemented"); + Ok(()) + } + + pub(crate) async fn get_agent_socket(&self) -> Result { + debug!(sl(), "Get kata-agent socket"); + let vsock_path = match self.jailed { + false => [self.vm_path.as_str(), FC_AGENT_SOCKET_NAME].join("/"), + true => [self.vm_path.as_str(), ROOT, FC_AGENT_SOCKET_NAME].join("/"), + }; + Ok(format!("{}://{}", HYBRID_VSOCK_SCHEME, vsock_path)) + } + + pub(crate) async fn disconnect(&mut self) { + 
warn!(sl(), "Disconnect: Not implemented"); + } + pub(crate) async fn get_thread_ids(&self) -> Result { + debug!(sl(), "Get Thread IDs"); + Ok(VcpuThreadIds::default()) + } + + pub(crate) async fn get_pids(&self) -> Result> { + debug!(sl(), "Get PIDs"); + let mut pids = HashSet::new(); + // get shim thread ids + pids.insert(self.pid.unwrap()); + + debug!(sl(), "PIDs: {:?}", pids); + Ok(Vec::from_iter(pids.into_iter())) + } + + pub(crate) async fn get_vmm_master_tid(&self) -> Result { + debug!(sl(), "Get VMM master TID"); + if let Some(pid) = self.pid { + Ok(pid) + } else { + Err(anyhow!("could not get vmm master tid")) + } + } + pub(crate) async fn get_ns_path(&self) -> Result { + debug!(sl(), "Get NS path"); + if let Some(pid) = self.pid { + let ns_path = format!("/proc/{}/ns", pid); + Ok(ns_path) + } else { + Err(anyhow!("could not get ns path")) + } + } + + pub(crate) async fn cleanup(&self) -> Result<()> { + debug!(sl(), "Cleanup"); + self.cleanup_resource(); + + std::fs::remove_dir_all(self.vm_path.as_str()) + .map_err(|err| { + error!( + sl(), + "failed to remove dir all for {} with error: {:?}", &self.vm_path, &err + ); + err + }) + .ok(); + + Ok(()) + } + + pub(crate) async fn resize_vcpu(&self, old_vcpu: u32, new_vcpu: u32) -> Result<(u32, u32)> { + warn!(sl(), "Resize vCPU: Not implemented"); + Ok((old_vcpu, new_vcpu)) + } + + pub(crate) async fn check(&self) -> Result<()> { + warn!(sl(), "Check: Not implemented"); + Ok(()) + } + + pub(crate) async fn get_jailer_root(&self) -> Result { + debug!(sl(), "Get Jailer Root"); + Ok(self.jailer_root.clone()) + } + + pub(crate) async fn capabilities(&self) -> Result { + debug!(sl(), "Capabilities"); + Ok(self.capabilities.clone()) + } + + pub(crate) async fn get_hypervisor_metrics(&self) -> Result { + warn!(sl(), "Get Hypervisor Metrics: Not implemented"); + todo!() + } +} diff --git a/src/runtime-rs/crates/hypervisor/src/firecracker/mod.rs b/src/runtime-rs/crates/hypervisor/src/firecracker/mod.rs new file mode 
100644 index 0000000000..a7ba3db54d --- /dev/null +++ b/src/runtime-rs/crates/hypervisor/src/firecracker/mod.rs @@ -0,0 +1,215 @@ +//Copyright (c) 2019-2022 Alibaba Cloud +//Copyright (c) 2023 Nubificus Ltd +// +//SPDX-License-Identifier: Apache-2.0 + +mod fc_api; +mod inner; +mod inner_device; +mod inner_hypervisor; + +use super::HypervisorState; +use crate::MemoryConfig; +use crate::{device::DeviceType, Hypervisor, HypervisorConfig, VcpuThreadIds}; +use anyhow::Context; +use anyhow::Result; +use async_trait::async_trait; +use inner::FcInner; +use kata_types::capabilities::Capabilities; +use kata_types::capabilities::CapabilityBits; +use persist::sandbox_persist::Persist; +use std::sync::Arc; +use tokio::sync::RwLock; + +#[derive(Debug)] +pub struct Firecracker { + inner: Arc>, +} + +// Convenience function to set the scope. +pub fn sl() -> slog::Logger { + slog_scope::logger().new(o!("subsystem" => "firecracker")) +} + +impl Default for Firecracker { + fn default() -> Self { + Self::new() + } +} + +impl Firecracker { + pub fn new() -> Self { + Self { + inner: Arc::new(RwLock::new(FcInner::new())), + } + } + + pub async fn set_hypervisor_config(&mut self, config: HypervisorConfig) { + let mut inner = self.inner.write().await; + inner.set_hypervisor_config(config) + } +} + +#[async_trait] +impl Hypervisor for Firecracker { + async fn prepare_vm(&self, id: &str, netns: Option) -> Result<()> { + let mut inner = self.inner.write().await; + inner.prepare_vm(id, netns).await + } + + async fn start_vm(&self, timeout: i32) -> Result<()> { + let mut inner = self.inner.write().await; + inner.start_vm(timeout).await + } + + async fn stop_vm(&self) -> Result<()> { + let mut inner = self.inner.write().await; + inner.stop_vm().await + } + + async fn pause_vm(&self) -> Result<()> { + let inner = self.inner.read().await; + inner.pause_vm() + } + + async fn resume_vm(&self) -> Result<()> { + let inner = self.inner.read().await; + inner.resume_vm() + } + + async fn save_vm(&self) 
-> Result<()> { + let inner = self.inner.read().await; + inner.save_vm().await + } + + async fn add_device(&self, device: DeviceType) -> Result { + let mut inner = self.inner.write().await; + match inner.add_device(device.clone()).await { + Ok(_) => Ok(device), + Err(err) => Err(err), + } + } + + async fn remove_device(&self, device: DeviceType) -> Result<()> { + let mut inner = self.inner.write().await; + inner.remove_device(device).await + } + + async fn update_device(&self, device: DeviceType) -> Result<()> { + let mut inner = self.inner.write().await; + inner.update_device(device).await + } + + async fn get_agent_socket(&self) -> Result { + let inner = self.inner.read().await; + inner.get_agent_socket().await + } + + async fn disconnect(&self) { + let mut inner = self.inner.write().await; + inner.disconnect().await + } + + async fn hypervisor_config(&self) -> HypervisorConfig { + let inner = self.inner.read().await; + inner.hypervisor_config() + } + + async fn get_thread_ids(&self) -> Result { + let inner = self.inner.read().await; + inner.get_thread_ids().await + } + + async fn cleanup(&self) -> Result<()> { + let inner = self.inner.read().await; + inner.cleanup().await + } + + async fn resize_vcpu(&self, old_vcpu: u32, new_vcpu: u32) -> Result<(u32, u32)> { + let inner = self.inner.read().await; + inner.resize_vcpu(old_vcpu, new_vcpu).await + } + + async fn get_pids(&self) -> Result> { + let inner = self.inner.read().await; + inner.get_pids().await + } + + async fn get_vmm_master_tid(&self) -> Result { + let inner = self.inner.read().await; + inner.get_vmm_master_tid().await + } + + async fn get_ns_path(&self) -> Result { + let inner = self.inner.read().await; + inner.get_ns_path().await + } + + async fn check(&self) -> Result<()> { + let inner = self.inner.read().await; + inner.check().await + } + + async fn get_jailer_root(&self) -> Result { + let inner = self.inner.read().await; + inner.get_jailer_root().await + } + + async fn save_state(&self) -> Result { 
+ self.save().await + } + + async fn capabilities(&self) -> Result { + let inner = self.inner.read().await; + inner.capabilities().await + } + + async fn get_hypervisor_metrics(&self) -> Result { + let inner = self.inner.read().await; + inner.get_hypervisor_metrics().await + } + + async fn set_capabilities(&self, flag: CapabilityBits) { + let mut inner = self.inner.write().await; + inner.set_capabilities(flag) + } + + async fn set_guest_memory_block_size(&self, size: u32) { + let mut inner = self.inner.write().await; + inner.set_guest_memory_block_size(size); + } + + async fn guest_memory_block_size(&self) -> u32 { + let inner = self.inner.read().await; + inner.guest_memory_block_size_mb() + } + + async fn resize_memory(&self, new_mem_mb: u32) -> Result<(u32, MemoryConfig)> { + let mut inner = self.inner.write().await; + inner.resize_memory(new_mem_mb) + } + + async fn get_passfd_listener_addr(&self) -> Result<(String, u32)> { + Err(anyhow::anyhow!("Not yet supported")) + } +} +#[async_trait] +impl Persist for Firecracker { + type State = HypervisorState; + type ConstructorArgs = (); + /// Save a state of the component. + async fn save(&self) -> Result { + let inner = self.inner.read().await; + inner.save().await.context("save hypervisor state") + } + /// Restore a component from a specified state. 
+ async fn restore( + hypervisor_args: Self::ConstructorArgs, + hypervisor_state: Self::State, + ) -> Result { + let inner = FcInner::restore(hypervisor_args, hypervisor_state).await?; + Ok(Self { + inner: Arc::new(RwLock::new(inner)), + }) + } +} diff --git a/src/runtime-rs/crates/hypervisor/src/kernel_param.rs b/src/runtime-rs/crates/hypervisor/src/kernel_param.rs index c6a93034ba..5f97a8067d 100644 --- a/src/runtime-rs/crates/hypervisor/src/kernel_param.rs +++ b/src/runtime-rs/crates/hypervisor/src/kernel_param.rs @@ -118,6 +118,11 @@ impl KernelParams { self.params.append(&mut params.params); } + #[cfg(not(target_arch = "s390x"))] + pub(crate) fn push(&mut self, new_param: Param) { + self.params.push(new_param); + } + pub(crate) fn from_string(params_string: &str) -> Self { let mut params = vec![]; diff --git a/src/runtime-rs/crates/hypervisor/src/lib.rs b/src/runtime-rs/crates/hypervisor/src/lib.rs index f7aec752d6..6f0c74f74c 100644 --- a/src/runtime-rs/crates/hypervisor/src/lib.rs +++ b/src/runtime-rs/crates/hypervisor/src/lib.rs @@ -15,6 +15,8 @@ pub use device::driver::*; use device::DeviceType; #[cfg(not(target_arch = "s390x"))] pub mod dragonball; +#[cfg(not(target_arch = "s390x"))] +pub mod firecracker; mod kernel_param; pub mod qemu; pub use kernel_param::Param; @@ -61,6 +63,7 @@ const HUGE_SHMEM: &str = "hugeshmem"; pub const HYPERVISOR_DRAGONBALL: &str = "dragonball"; pub const HYPERVISOR_QEMU: &str = "qemu"; +pub const HYPERVISOR_FIRECRACKER: &str = "firecracker"; pub const DEFAULT_HYBRID_VSOCK_NAME: &str = "kata.hvsock"; pub const JAILER_ROOT: &str = "root"; diff --git a/src/runtime-rs/crates/persist/src/lib.rs b/src/runtime-rs/crates/persist/src/lib.rs index 0c6510a0dc..5637739588 100644 --- a/src/runtime-rs/crates/persist/src/lib.rs +++ b/src/runtime-rs/crates/persist/src/lib.rs @@ -14,9 +14,14 @@ pub const PERSIST_FILE: &str = "state.json"; use kata_sys_util::validate::verify_id; use safe_path::scoped_join; -pub fn to_disk(value: &T, sid: &str) 
-> Result<()> { +pub fn to_disk(value: &T, sid: &str, jailer_path: &str) -> Result<()> { verify_id(sid).context("failed to verify sid")?; - let mut path = scoped_join(KATA_PATH, sid)?; + // An empty `jailer_path` stores the state under `KATA_PATH` joined with `sid`; otherwise under `jailer_path` joined with "root". + // FIXME: handle the jailed case on the read side as well (`from_disk` is only given `sid`). + let mut path = match jailer_path { + "" => scoped_join(KATA_PATH, sid)?, + _ => scoped_join(jailer_path, "root")?, + }; if path.exists() { path.push(PERSIST_FILE); let f = File::create(path) @@ -62,10 +67,10 @@ mod tests { key: 1, }; // invalid sid - assert!(to_disk(&data, "..3").is_err()); - assert!(to_disk(&data, "../../../3").is_err()); - assert!(to_disk(&data, "a/b/c").is_err()); - assert!(to_disk(&data, ".#cdscd.").is_err()); + assert!(to_disk(&data, "..3", "").is_err()); + assert!(to_disk(&data, "../../../3", "").is_err()); + assert!(to_disk(&data, "a/b/c", "").is_err()); + assert!(to_disk(&data, ".#cdscd.", "").is_err()); let sid = "aadede"; let sandbox_dir = [KATA_PATH, sid].join("/"); @@ -74,7 +79,7 @@ mod tests { .create(&sandbox_dir) .is_ok() { - assert!(to_disk(&data, sid).is_ok()); + assert!(to_disk(&data, sid, "").is_ok()); if let Ok(result) = from_disk::(sid) { assert_eq!(result.name, data.name); assert_eq!(result.key, data.key); diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/lib.rs b/src/runtime-rs/crates/runtimes/virt_container/src/lib.rs index 374da59eba..582e12c81c 100644 --- a/src/runtime-rs/crates/runtimes/virt_container/src/lib.rs +++ b/src/runtime-rs/crates/runtimes/virt_container/src/lib.rs @@ -23,9 +23,13 @@ use common::{message::Message, RuntimeHandler, RuntimeInstance}; use hypervisor::Hypervisor; #[cfg(not(target_arch = "s390x"))] use hypervisor::{dragonball::Dragonball, HYPERVISOR_DRAGONBALL}; +#[cfg(not(target_arch = "s390x"))] +use hypervisor::{firecracker::Firecracker, HYPERVISOR_FIRECRACKER}; use hypervisor::{qemu::Qemu, HYPERVISOR_QEMU}; #[cfg(not(target_arch = "s390x"))] use kata_types::config::DragonballConfig; +#[cfg(not(target_arch = "s390x"))] +use
kata_types::config::FirecrackerConfig; use kata_types::config::{hypervisor::register_hypervisor_plugin, QemuConfig, TomlConfig}; #[cfg(all(feature = "cloud-hypervisor", not(target_arch = "s390x")))] @@ -55,6 +59,9 @@ impl RuntimeHandler for VirtContainer { { let dragonball_config = Arc::new(DragonballConfig::new()); register_hypervisor_plugin("dragonball", dragonball_config); + + let firecracker_config = Arc::new(FirecrackerConfig::new()); + register_hypervisor_plugin("firecracker", firecracker_config); } let qemu_config = Arc::new(QemuConfig::new()); @@ -160,6 +167,14 @@ async fn new_hypervisor(toml_config: &TomlConfig) -> Result> .await; Ok(Arc::new(hypervisor)) } + #[cfg(not(target_arch = "s390x"))] + HYPERVISOR_FIRECRACKER => { + let mut hypervisor = Firecracker::new(); + hypervisor + .set_hypervisor_config(hypervisor_config.clone()) + .await; + Ok(Arc::new(hypervisor)) + } #[cfg(all(feature = "cloud-hypervisor", not(target_arch = "s390x")))] HYPERVISOR_NAME_CH => { diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs index 6f572083a6..cb6ac6ab4b 100644 --- a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs +++ b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs @@ -18,12 +18,14 @@ use common::{Sandbox, SandboxNetworkEnv}; use containerd_shim_protos::events::task::TaskOOM; use hypervisor::VsockConfig; #[cfg(not(target_arch = "s390x"))] -use hypervisor::{dragonball::Dragonball, HYPERVISOR_DRAGONBALL}; +use hypervisor::{dragonball::Dragonball, HYPERVISOR_DRAGONBALL, HYPERVISOR_FIRECRACKER}; use hypervisor::{qemu::Qemu, HYPERVISOR_QEMU}; use hypervisor::{utils::get_hvsock_path, HybridVsockConfig, DEFAULT_GUEST_VSOCK_CID}; use hypervisor::{BlockConfig, Hypervisor}; use kata_sys_util::hooks::HookStates; use kata_types::capabilities::CapabilityBits; +#[cfg(not(target_arch = "s390x"))] +use kata_types::config::hypervisor::HYPERVISOR_NAME_CH; use 
kata_types::config::TomlConfig;
 use persist::{self, sandbox_persist::Persist};
 use resource::manager::ManagerArgs;
@@ -570,12 +572,39 @@ impl Persist for VirtSandbox {
 
     /// Save a state of Sandbox
+    ///
+    /// Snapshots the hypervisor state and the resource-manager state, rejects
+    /// hypervisor types that are not listed in the match below, writes the
+    /// combined state to disk through `persist::to_disk`, and returns it to
+    /// the caller.
     async fn save(&self) -> Result {
+        let hypervisor_state = self.hypervisor.save_state().await?;
         let sandbox_state = crate::sandbox_persist::SandboxState {
             sandbox_type: VIRTCONTAINER.to_string(),
             resource: Some(self.resource_manager.save().await?),
-            hypervisor: Some(self.hypervisor.save_state().await?),
+            hypervisor: match hypervisor_state.hypervisor_type.as_str() {
+                // TODO support other hypervisors
+                #[cfg(not(target_arch = "s390x"))]
+                HYPERVISOR_DRAGONBALL => Ok(Some(hypervisor_state)),
+                #[cfg(not(target_arch = "s390x"))]
+                HYPERVISOR_NAME_CH => Ok(Some(hypervisor_state)),
+                #[cfg(not(target_arch = "s390x"))]
+                HYPERVISOR_FIRECRACKER => Ok(Some(hypervisor_state)),
+                HYPERVISOR_QEMU => Ok(Some(hypervisor_state)),
+                _ => Err(anyhow!(
+                    "Unsupported hypervisor {}",
+                    hypervisor_state.hypervisor_type
+                )),
+            }?,
         };
-        persist::to_disk(&sandbox_state, &self.sid)?;
+        // When the saved hypervisor state is marked `jailed`, its `vm_path` is
+        // handed to `persist::to_disk` as the jailer path; otherwise it is empty.
+        // FIXME: properly handle the jailed case -- it is still open how restore
+        // should locate state that was stored inside the jail.
+        let jailer_path = match sandbox_state.hypervisor.as_ref() {
+            Some(h) if h.jailed => h.vm_path.as_str(),
+            _ => "",
+        };
+        persist::to_disk(&sandbox_state, &self.sid, jailer_path)?;
         Ok(sandbox_state)
     }
     /// Restore Sandbox