diff --git a/.github/workflows/kata-deploy-push.yaml b/.github/workflows/kata-deploy-push.yaml index bdd0aeb2bd..c7d7e8cb4e 100644 --- a/.github/workflows/kata-deploy-push.yaml +++ b/.github/workflows/kata-deploy-push.yaml @@ -18,6 +18,7 @@ jobs: matrix: asset: - kernel + - kernel-dragonball-experimental - shim-v2 - qemu - cloud-hypervisor diff --git a/.github/workflows/kata-deploy-test.yaml b/.github/workflows/kata-deploy-test.yaml index 7f6839b40f..5a924c7390 100644 --- a/.github/workflows/kata-deploy-test.yaml +++ b/.github/workflows/kata-deploy-test.yaml @@ -50,6 +50,7 @@ jobs: - cloud-hypervisor - firecracker - kernel + - kernel-dragonball-experimental - nydus - qemu - rootfs-image diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index b5f585937e..bbb95c53da 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -13,6 +13,7 @@ jobs: - cloud-hypervisor - firecracker - kernel + - kernel-dragonball-experimental - nydus - qemu - rootfs-image diff --git a/docs/Developer-Guide.md b/docs/Developer-Guide.md index 1af800d5b9..5d0fdf6c52 100644 --- a/docs/Developer-Guide.md +++ b/docs/Developer-Guide.md @@ -232,10 +232,6 @@ $ rustup target add "${ARCH}-unknown-linux-${LIBC}" To build the agent: -```bash -$ make -C kata-containers/src/agent -``` - The agent is built with seccomp capability by default. If you want to build the agent without the seccomp capability, you need to run `make` with `SECCOMP=no` as follows. @@ -243,6 +239,31 @@ If you want to build the agent without the seccomp capability, you need to run ` $ make -C kata-containers/src/agent SECCOMP=no ``` +For building the agent with seccomp support using `musl`, set the environment +variables for the [`libseccomp` crate](https://github.com/libseccomp-rs/libseccomp-rs). + +```bash +$ export LIBSECCOMP_LINK_TYPE=static +$ export LIBSECCOMP_LIB_PATH="the path of the directory containing libseccomp.a" +$ make -C kata-containers/src/agent +``` + +If the compilation fails when the agent tries to link the `libseccomp` library statically +against `musl`, you will need to build `libseccomp` manually with `-U_FORTIFY_SOURCE`. +You can use [our script](https://github.com/kata-containers/kata-containers/blob/main/ci/install_libseccomp.sh) +to install `libseccomp` for the agent. + +```bash +$ mkdir -p ${seccomp_install_path} ${gperf_install_path} +$ kata-containers/ci/install_libseccomp.sh ${seccomp_install_path} ${gperf_install_path} +$ export LIBSECCOMP_LIB_PATH="${seccomp_install_path}/lib" +``` + +On `ppc64le` and `s390x`, `glibc` is used. You will need to install the `libseccomp` library +provided by your distribution. + +> e.g. `libseccomp-dev` for Ubuntu, or `libseccomp-devel` for CentOS + > **Note:** > > - If you enable seccomp in the main configuration file but build the agent without seccomp capability, diff --git a/docs/use-cases/using-Intel-QAT-and-kata.md b/docs/use-cases/using-Intel-QAT-and-kata.md index d029de3672..e83569fbed 100644 --- a/docs/use-cases/using-Intel-QAT-and-kata.md +++ b/docs/use-cases/using-Intel-QAT-and-kata.md @@ -49,7 +49,7 @@ the latest driver. $ export QAT_DRIVER_VER=qat1.7.l.4.14.0-00031.tar.gz $ export QAT_DRIVER_URL=https://downloadmirror.intel.com/30178/eng/${QAT_DRIVER_VER} $ export QAT_CONF_LOCATION=~/QAT_conf -$ export QAT_DOCKERFILE=https://raw.githubusercontent.com/intel/intel-device-plugins-for-kubernetes/master/demo/openssl-qat-engine/Dockerfile +$ export QAT_DOCKERFILE=https://raw.githubusercontent.com/intel/intel-device-plugins-for-kubernetes/main/demo/openssl-qat-engine/Dockerfile $ export QAT_SRC=~/src/QAT $ export GOPATH=~/src/go $ export KATA_KERNEL_LOCATION=~/kata diff --git a/src/dragonball/README.md b/src/dragonball/README.md index c9d7e5119c..3fde0782e0 100644 --- a/src/dragonball/README.md +++ b/src/dragonball/README.md @@ -19,6 +19,7 @@ and configuration process. Device: [Device Document](docs/device.md) vCPU: [vCPU Document](docs/vcpu.md) API: [API Document](docs/api.md) +`Upcall`: [`Upcall` Document](docs/upcall.md) Currently, the documents are still actively adding. You could see the [official documentation](docs/) page for more details. diff --git a/src/dragonball/docs/images/upcall-architecture.svg b/src/dragonball/docs/images/upcall-architecture.svg new file mode 100644 index 0000000000..a74c37f455 --- /dev/null +++ b/src/dragonball/docs/images/upcall-architecture.svg @@ -0,0 +1,177 @@ + + + + + + + + + + + + + + + + + Canvas 1 + + + Layer 1 + + + + + + Guest User + + + + + Guest Kernel + + + + + + + + Hypervisor + + + + + + + + socket + + + + + + Device + Manager + Service + + + + + bind + + + + + listen + + + + + accept + + + + + new kthread + + + + + + virtio-vsocket + + + + + + virtio-vsocket backend + + + + + + + + + + + + + + + + + + Port + + + + + + Device + Manager + Backend + + + + + + + + + Service B + + + + + + Backend B + + + + + + + + …… + + + + + …… + + + + + Upcall Server + + + + + + + + Service handler + + + + + + Conn + + + + + + Conn + + + + + + + + + + + diff --git a/src/dragonball/docs/upcall.md b/src/dragonball/docs/upcall.md new file mode 100644 index 0000000000..292b33d270 --- /dev/null +++ b/src/dragonball/docs/upcall.md @@ -0,0 +1,30 @@ +# `Upcall` + +## What is `Upcall`? + +`Upcall` is a direct communication tool between VMM and guest developed upon `vsock`. The server side of the `upcall` is a driver in guest kernel (kernel patches are needed for this feature) and it'll start to serve the requests after the kernel starts. And the client side is in Dragonball VMM , it'll be a thread that communicates with `vsock` through `uds`. + +We want to keep the lightweight of the VM through the implementation of the `upcall`. + +![architecture overview](images/upcall-architecture.svg) +## What can `upcall` do? + +We define specific operations in the device manager service (one of the services in `upcall` we developed) to perform device hotplug / hot-unplug including vCPU hotplug, `virtio-mmio` hotplug, and memory hotplug. We have accomplished device hotplug / hot-unplug directly through `upcall` in order to avoid the virtualization of ACPI to minimize virtual machines overhead. And there could be many other uses if other services are implemented. + +## How to enable `upcall`? + +`Upcall` needs a server in the guest kernel which will be several kernel patches for the `upcall` server itself and different services registered in the `upcall` server. It's currently tested on upstream Linux kernel 5.10. + +To make it easy for users to use, we have open-source the `upcall` guest patches in [Dragonball experimental guest patches](../../../tools/packaging/kernel/patches/5.10.x/dragonball-experimental) and develop `upcall` support in [Kata guest kernel building script](../../../tools/packaging/kernel/build-kernel.sh). + +You could use following command to download the upstream kernel (currently Dragonball uses 5.10.25) and put the `upcall` patches and other Kata patches into kernel code. + +`sh build-kernel.sh -e -t dragonball -f setup` + +`-e` here means experimental, mainly because `upcall` patches are not in upstream Linux kernel. +`-t dragonball` is for specifying hypervisor type +`-f` is for generating `.config` file + +After this command, the kernel code with `upcall` and related `.config` file are all set up in the directory `kata-linux-dragonball-experimental-5.10.25-[config version]`. You can either manually compile the kernel with `make` command or following [Document for build-kernel.sh](../../../tools/packaging/kernel/README.md) to build and use this guest kernel. + +Also, a client-side is also needed in VMM. Dragonball has already open-source the way to implement `upcall` client and Dragonball compiled with `dbs-upcall` feature will enable Dragonball client side. \ No newline at end of file diff --git a/src/dragonball/src/address_space_manager.rs b/src/dragonball/src/address_space_manager.rs index 0e9ac91d73..6efdb027ad 100644 --- a/src/dragonball/src/address_space_manager.rs +++ b/src/dragonball/src/address_space_manager.rs @@ -250,6 +250,11 @@ impl AddressSpaceMgr { self.address_space.as_ref() } + /// Get the guest memory. + pub fn vm_memory(&self) -> Option<::T> { + self.get_vm_as().map(|m| m.memory()) + } + /// Create the address space for a virtual machine. /// /// This method is designed to be called when starting up a virtual machine instead of at diff --git a/src/dragonball/src/api/v1/vmm_action.rs b/src/dragonball/src/api/v1/vmm_action.rs index 45591e7ed1..a2dd496fda 100644 --- a/src/dragonball/src/api/v1/vmm_action.rs +++ b/src/dragonball/src/api/v1/vmm_action.rs @@ -656,10 +656,10 @@ mod tests { let (to_vmm, from_api) = channel(); let (to_api, from_vmm) = channel(); - let vmm = Arc::new(Mutex::new(create_vmm_instance())); + let epoll_mgr = EpollManager::default(); + let vmm = Arc::new(Mutex::new(create_vmm_instance(epoll_mgr.clone()))); let mut vservice = VmmService::new(from_api, to_api); - let epoll_mgr = EpollManager::default(); let mut event_mgr = EventManager::new(&vmm, epoll_mgr).unwrap(); let mut v = vmm.lock().unwrap(); @@ -681,9 +681,9 @@ mod tests { let (_to_vmm, from_api) = channel(); let (to_api, _from_vmm) = channel(); - let vmm = Arc::new(Mutex::new(create_vmm_instance())); - let mut vservice = VmmService::new(from_api, to_api); let epoll_mgr = EpollManager::default(); + let vmm = Arc::new(Mutex::new(create_vmm_instance(epoll_mgr.clone()))); + let mut vservice = VmmService::new(from_api, to_api); let mut event_mgr = EventManager::new(&vmm, epoll_mgr).unwrap(); let mut v = vmm.lock().unwrap(); @@ -695,9 +695,9 @@ mod tests { fn test_vmm_action_disconnected() { let (to_vmm, from_api) = channel(); let (to_api, _from_vmm) = channel(); - let vmm = Arc::new(Mutex::new(create_vmm_instance())); - let mut vservice = VmmService::new(from_api, to_api); let epoll_mgr = EpollManager::default(); + let vmm = Arc::new(Mutex::new(create_vmm_instance(epoll_mgr.clone()))); + let mut vservice = VmmService::new(from_api, to_api); let mut event_mgr = EventManager::new(&vmm, epoll_mgr).unwrap(); let mut v = vmm.lock().unwrap(); diff --git a/src/dragonball/src/device_manager/mod.rs b/src/dragonball/src/device_manager/mod.rs index 06f6ea6ab2..56ee3617ca 100644 --- a/src/dragonball/src/device_manager/mod.rs +++ b/src/dragonball/src/device_manager/mod.rs @@ -1065,7 +1065,9 @@ mod tests { use crate::vm::VmConfigInfo; let epoll_manager = EpollManager::default(); - let vmm = Arc::new(Mutex::new(crate::vmm::tests::create_vmm_instance())); + let vmm = Arc::new(Mutex::new(crate::vmm::tests::create_vmm_instance( + epoll_manager.clone(), + ))); let event_mgr = crate::event_manager::EventManager::new(&vmm, epoll_manager).unwrap(); let mut vm = crate::vm::tests::create_vm_instance(); let vm_config = VmConfigInfo { diff --git a/src/dragonball/src/vm/mod.rs b/src/dragonball/src/vm/mod.rs index d573080ae8..8ad959c791 100644 --- a/src/dragonball/src/vm/mod.rs +++ b/src/dragonball/src/vm/mod.rs @@ -825,7 +825,14 @@ impl Vm { #[cfg(test)] pub mod tests { + use kvm_ioctls::VcpuExit; + use linux_loader::cmdline::Cmdline; + use test_utils::skip_if_not_root; + use vm_memory::GuestMemory; + use vmm_sys_util::tempfile::TempFile; + use super::*; + use crate::test_utils::tests::create_vm_for_test; impl Vm { pub fn set_instance_state(&mut self, mstate: InstanceState) { @@ -841,4 +848,204 @@ pub mod tests { let epoll_manager = EpollManager::default(); Vm::new(None, instance_info, epoll_manager).unwrap() } + + #[test] + fn test_create_vm_instance() { + skip_if_not_root!(); + let vm = create_vm_instance(); + assert!(vm.check_health().is_err()); + assert!(vm.kernel_config.is_none()); + assert!(vm.get_reset_eventfd().is_none()); + assert!(!vm.is_vm_initialized()); + assert!(!vm.is_vm_running()); + assert!(vm.reset_console().is_ok()); + } + + #[test] + fn test_vm_init_guest_memory() { + skip_if_not_root!(); + let vm_config = VmConfigInfo { + vcpu_count: 1, + max_vcpu_count: 3, + cpu_pm: "off".to_string(), + mem_type: "shmem".to_string(), + mem_file_path: "".to_string(), + mem_size_mib: 16, + serial_path: None, + cpu_topology: CpuTopology { + threads_per_core: 1, + cores_per_die: 1, + dies_per_socket: 1, + sockets: 1, + }, + vpmu_feature: 0, + }; + + let mut vm = create_vm_instance(); + vm.set_vm_config(vm_config); + assert!(vm.init_guest_memory().is_ok()); + let vm_memory = vm.address_space.vm_memory().unwrap(); + + assert_eq!(vm_memory.num_regions(), 1); + assert_eq!(vm_memory.last_addr(), GuestAddress(0xffffff)); + + // Reconfigure an already configured vm will be ignored and just return OK. + let vm_config = VmConfigInfo { + vcpu_count: 1, + max_vcpu_count: 3, + cpu_pm: "off".to_string(), + mem_type: "shmem".to_string(), + mem_file_path: "".to_string(), + mem_size_mib: 16, + serial_path: None, + cpu_topology: CpuTopology { + threads_per_core: 1, + cores_per_die: 1, + dies_per_socket: 1, + sockets: 1, + }, + vpmu_feature: 0, + }; + vm.set_vm_config(vm_config); + assert!(vm.init_guest_memory().is_ok()); + let vm_memory = vm.address_space.vm_memory().unwrap(); + assert_eq!(vm_memory.num_regions(), 1); + assert_eq!(vm_memory.last_addr(), GuestAddress(0xffffff)); + + let obj_addr = GuestAddress(0xf0); + vm_memory.write_obj(67u8, obj_addr).unwrap(); + let read_val: u8 = vm_memory.read_obj(obj_addr).unwrap(); + assert_eq!(read_val, 67u8); + } + + #[test] + fn test_vm_create_devices() { + skip_if_not_root!(); + let epoll_mgr = EpollManager::default(); + let vmm = Arc::new(Mutex::new(crate::vmm::tests::create_vmm_instance( + epoll_mgr.clone(), + ))); + + let mut guard = vmm.lock().unwrap(); + let vm = guard.get_vm_mut().unwrap(); + + let vm_config = VmConfigInfo { + vcpu_count: 1, + max_vcpu_count: 3, + cpu_pm: "off".to_string(), + mem_type: "shmem".to_string(), + mem_file_path: "".to_string(), + mem_size_mib: 16, + serial_path: None, + cpu_topology: CpuTopology { + threads_per_core: 1, + cores_per_die: 1, + dies_per_socket: 1, + sockets: 1, + }, + vpmu_feature: 0, + }; + + vm.set_vm_config(vm_config); + assert!(vm.init_guest_memory().is_ok()); + assert!(vm.setup_interrupt_controller().is_ok()); + + let vm_memory = vm.address_space.vm_memory().unwrap(); + assert_eq!(vm_memory.num_regions(), 1); + assert_eq!(vm_memory.last_addr(), GuestAddress(0xffffff)); + + let kernel_file = TempFile::new().unwrap(); + let cmd_line = Cmdline::new(64); + + vm.set_kernel_config(KernelConfigInfo::new( + kernel_file.into_file(), + None, + cmd_line, + )); + + vm.init_devices(epoll_mgr).unwrap(); + } + + #[test] + fn test_vm_delete_devices() { + skip_if_not_root!(); + let mut vm = create_vm_for_test(); + let epoll_mgr = EpollManager::default(); + + vm.setup_interrupt_controller().unwrap(); + vm.init_devices(epoll_mgr).unwrap(); + assert!(vm.remove_devices().is_ok()); + } + + #[test] + fn test_run_code() { + skip_if_not_root!(); + + use std::io::{self, Write}; + // This example is based on https://lwn.net/Articles/658511/ + let code = [ + 0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */ + 0x00, 0xd8, /* add %bl, %al */ + 0x04, b'0', /* add $'0', %al */ + 0xee, /* out %al, (%dx) */ + 0xb0, b'\n', /* mov $'\n', %al */ + 0xee, /* out %al, (%dx) */ + 0xf4, /* hlt */ + ]; + let load_addr = GuestAddress(0x1000); + let instance_info = Arc::new(RwLock::new(InstanceInfo::default())); + let epoll_manager = EpollManager::default(); + let mut vm = Vm::new(None, instance_info, epoll_manager).unwrap(); + + let vcpu_count = 1; + let vm_config = VmConfigInfo { + vcpu_count, + max_vcpu_count: 1, + cpu_pm: "off".to_string(), + mem_type: "shmem".to_string(), + mem_file_path: "".to_string(), + mem_size_mib: 10, + serial_path: None, + cpu_topology: CpuTopology { + threads_per_core: 1, + cores_per_die: 1, + dies_per_socket: 1, + sockets: 1, + }, + vpmu_feature: 0, + }; + + vm.set_vm_config(vm_config); + vm.init_guest_memory().unwrap(); + + let vm_memory = vm.address_space.vm_memory().unwrap(); + vm_memory.write_obj(code, load_addr).unwrap(); + + let vcpu_fd = vm.vm_fd().create_vcpu(0).unwrap(); + let mut vcpu_sregs = vcpu_fd.get_sregs().unwrap(); + assert_ne!(vcpu_sregs.cs.base, 0); + assert_ne!(vcpu_sregs.cs.selector, 0); + vcpu_sregs.cs.base = 0; + vcpu_sregs.cs.selector = 0; + vcpu_fd.set_sregs(&vcpu_sregs).unwrap(); + + let mut vcpu_regs = vcpu_fd.get_regs().unwrap(); + + vcpu_regs.rip = 0x1000; + vcpu_regs.rax = 2; + vcpu_regs.rbx = 3; + vcpu_regs.rflags = 2; + vcpu_fd.set_regs(&vcpu_regs).unwrap(); + + match vcpu_fd.run().expect("run failed") { + VcpuExit::IoOut(0x3f8, data) => { + assert_eq!(data.len(), 1); + io::stdout().write_all(data).unwrap(); + } + VcpuExit::Hlt => { + io::stdout().write_all(b"KVM_EXIT_HLT\n").unwrap(); + } + r => panic!("unexpected exit reason: {:?}", r), + } + } } diff --git a/src/dragonball/src/vmm.rs b/src/dragonball/src/vmm.rs index b15e66fef1..72c799e111 100644 --- a/src/dragonball/src/vmm.rs +++ b/src/dragonball/src/vmm.rs @@ -200,11 +200,10 @@ pub(crate) mod tests { use super::*; - pub fn create_vmm_instance() -> Vmm { + pub fn create_vmm_instance(epoll_manager: EpollManager) -> Vmm { let info = Arc::new(RwLock::new(InstanceInfo::default())); let event_fd = EventFd::new(libc::EFD_NONBLOCK).unwrap(); let seccomp_filter: BpfProgram = Vec::new(); - let epoll_manager = EpollManager::default(); Vmm::new_with_epoll_manager( info, @@ -221,6 +220,6 @@ pub(crate) mod tests { fn test_create_vmm_instance() { skip_if_not_root!(); - create_vmm_instance(); + create_vmm_instance(EpollManager::default()); } } diff --git a/src/libs/kata-types/src/config/hypervisor/mod.rs b/src/libs/kata-types/src/config/hypervisor/mod.rs index aba5c751b6..afd3a42ea5 100644 --- a/src/libs/kata-types/src/config/hypervisor/mod.rs +++ b/src/libs/kata-types/src/config/hypervisor/mod.rs @@ -32,7 +32,7 @@ use regex::RegexSet; use super::{default, ConfigOps, ConfigPlugin, TomlConfig}; use crate::annotations::KATA_ANNO_CFG_HYPERVISOR_PREFIX; -use crate::{eother, resolve_path, validate_path}; +use crate::{eother, resolve_path, sl, validate_path}; mod dragonball; pub use self::dragonball::{DragonballConfig, HYPERVISOR_NAME_DRAGONBALL}; @@ -779,6 +779,10 @@ pub struct SharedFsInfo { #[serde(default)] pub virtio_fs_cache_size: u32, + /// Default size of virtqueues + #[serde(default)] + pub virtio_fs_queue_size: u32, + /// Enable virtio-fs DAX window if true. #[serde(default)] pub virtio_fs_is_dax: bool, @@ -846,6 +850,10 @@ impl SharedFsInfo { if self.virtio_fs_cache.is_empty() { self.virtio_fs_cache = default::DEFAULT_VIRTIO_FS_CACHE_MODE.to_string(); } + if self.virtio_fs_cache == *"none" { + warn!(sl!(), "virtio-fs cache mode `none` is deprecated since Kata Containers 2.5.0 and will be removed in the future release, please use `never` instead. For more details please refer to https://github.com/kata-containers/kata-containers/issues/4234."); + self.virtio_fs_cache = default::DEFAULT_VIRTIO_FS_CACHE_MODE.to_string(); + } if self.virtio_fs_is_dax && self.virtio_fs_cache_size == 0 { self.virtio_fs_cache_size = default::DEFAULT_VIRTIO_FS_DAX_SIZE_MB; } diff --git a/src/runtime-rs/Makefile b/src/runtime-rs/Makefile index ea97517389..ffe7a934bc 100644 --- a/src/runtime-rs/Makefile +++ b/src/runtime-rs/Makefile @@ -119,6 +119,7 @@ DEFVALIDVIRTIOFSDAEMONPATHS := [\"$(DEFVIRTIOFSDAEMON)\"] # if value is 0, DAX is not enabled DEFVIRTIOFSCACHESIZE ?= 0 DEFVIRTIOFSCACHE ?= auto +DEFVIRTIOFSQUEUESIZE ?= 1024 # Format example: # [\"-o\", \"arg1=xxx,arg2\", \"-o\", \"hello world\", \"--arg3=yyy\"] # @@ -256,6 +257,7 @@ USER_VARS += DEFVIRTIOFSDAEMON USER_VARS += DEFVALIDVIRTIOFSDAEMONPATHS USER_VARS += DEFVIRTIOFSCACHESIZE USER_VARS += DEFVIRTIOFSCACHE +USER_VARS += DEFVIRTIOFSQUEUESIZE USER_VARS += DEFVIRTIOFSEXTRAARGS USER_VARS += DEFENABLEANNOTATIONS USER_VARS += DEFENABLEIOTHREADS diff --git a/src/runtime-rs/config/configuration-dragonball.toml.in b/src/runtime-rs/config/configuration-dragonball.toml.in index ca1db2681a..fa39d23c82 100644 --- a/src/runtime-rs/config/configuration-dragonball.toml.in +++ b/src/runtime-rs/config/configuration-dragonball.toml.in @@ -136,6 +136,34 @@ block_device_driver = "@DEFBLOCKSTORAGEDRIVER_DB@" # of shim, does not need an external virtiofsd process. shared_fs = "@DBSHAREDFS@" +# Default size of DAX cache in MiB +virtio_fs_cache_size = @DEFVIRTIOFSCACHESIZE@ + +# Extra args for virtiofsd daemon +# +# Format example: +# ["-o", "arg1=xxx,arg2", "-o", "hello world", "--arg3=yyy"] +# Examples: +# Set virtiofsd log level to debug : ["-o", "log_level=debug"] or ["-d"] +# +# see `virtiofsd -h` for possible options. +virtio_fs_extra_args = @DEFVIRTIOFSEXTRAARGS@ + +# Cache mode: +# +# - never +# Metadata, data, and pathname lookup are not cached in guest. They are +# always fetched from host and any changes are immediately pushed to host. +# +# - auto +# Metadata and pathname lookup cache expires after a configured amount of +# time (default is 1 second). Data is cached while the file is open (close +# to open consistency). +# +# - always +# Metadata, data, and pathname lookup are cached in guest and never expire. +virtio_fs_cache = "@DEFVIRTIOFSCACHE@" + # Enable huge pages for VM RAM, default false # Enabling this will result in the VM memory # being allocated using huge pages. diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in index 11ceaacbef..a80f42e887 100644 --- a/src/runtime/config/configuration-qemu.toml.in +++ b/src/runtime/config/configuration-qemu.toml.in @@ -336,7 +336,7 @@ valid_file_mem_backends = @DEFVALIDFILEMEMBACKENDS@ pflashes = [] # This option changes the default hypervisor and kernel parameters -# to enable debug output where available. +# to enable debug output where available. And Debug also enable the hmp socket. # # Default false #enable_debug = true diff --git a/src/runtime/pkg/govmm/qemu/examples_test.go b/src/runtime/pkg/govmm/qemu/examples_test.go index 03e52b87ac..6f6727effb 100644 --- a/src/runtime/pkg/govmm/qemu/examples_test.go +++ b/src/runtime/pkg/govmm/qemu/examples_test.go @@ -27,13 +27,16 @@ func Example() { // resources params = append(params, "-m", "370", "-smp", "cpus=2") - // LaunchCustomQemu should return as soon as the instance has launched as we - // are using the --daemonize flag. It will set up a unix domain socket - // called /tmp/qmp-socket that we can use to manage the instance. - _, err := qemu.LaunchCustomQemu(context.Background(), "", params, nil, nil, nil) + // LaunchCustomQemu should return immediately. We must then wait + // the returned process to terminate as we are using the --daemonize + // flag. + // It will set up a unix domain socket called /tmp/qmp-socket that we + // can use to manage the instance. + proc, _, err := qemu.LaunchCustomQemu(context.Background(), "", params, nil, nil, nil) if err != nil { panic(err) } + proc.Wait() // This channel will be closed when the instance dies. disconnectedCh := make(chan struct{}) diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go index 7a7fb4608e..309de5de14 100644 --- a/src/runtime/pkg/govmm/qemu/qemu.go +++ b/src/runtime/pkg/govmm/qemu/qemu.go @@ -14,9 +14,9 @@ package qemu import ( - "bytes" "context" "fmt" + "io" "log" "os" "os/exec" @@ -2362,10 +2362,17 @@ const ( ) // QMPSocket represents a qemu QMP socket configuration. +// nolint: govet type QMPSocket struct { // Type is the socket type (e.g. "unix"). Type QMPSocketType + // Human Monitor Interface (HMP) (true for HMP, false for QMP, default false) + IsHmp bool + + // QMP listener file descriptor to be passed to qemu + FD *os.File + // Name is the socket name. Name string @@ -2378,7 +2385,8 @@ type QMPSocket struct { // Valid returns true if the QMPSocket structure is valid and complete. func (qmp QMPSocket) Valid() bool { - if qmp.Type == "" || qmp.Name == "" { + // Exactly one of Name of FD must be set. + if qmp.Type == "" || (qmp.Name == "") == (qmp.FD == nil) { return false } @@ -2646,9 +2654,6 @@ type Config struct { // PidFile is the -pidfile parameter PidFile string - // LogFile is the -D parameter - LogFile string - qemuParams []string } @@ -2718,7 +2723,13 @@ func (config *Config) appendQMPSockets() { continue } - qmpParams := append([]string{}, fmt.Sprintf("%s:%s", q.Type, q.Name)) + var qmpParams []string + if q.FD != nil { + qemuFDs := config.appendFDs([]*os.File{q.FD}) + qmpParams = append([]string{}, fmt.Sprintf("%s:fd=%d", q.Type, qemuFDs[0])) + } else { + qmpParams = append([]string{}, fmt.Sprintf("%s:path=%s", q.Type, q.Name)) + } if q.Server { qmpParams = append(qmpParams, "server=on") if q.NoWait { @@ -2726,7 +2737,12 @@ func (config *Config) appendQMPSockets() { } } - config.qemuParams = append(config.qemuParams, "-qmp") + if q.IsHmp { + config.qemuParams = append(config.qemuParams, "-monitor") + } else { + config.qemuParams = append(config.qemuParams, "-qmp") + } + config.qemuParams = append(config.qemuParams, strings.Join(qmpParams, ",")) } } @@ -2975,13 +2991,6 @@ func (config *Config) appendPidFile() { } } -func (config *Config) appendLogFile() { - if config.LogFile != "" { - config.qemuParams = append(config.qemuParams, "-D") - config.qemuParams = append(config.qemuParams, config.LogFile) - } -} - func (config *Config) appendFwCfg(logger QMPLog) { if logger == nil { logger = qmpNullLogger{} @@ -3001,12 +3010,8 @@ func (config *Config) appendFwCfg(logger QMPLog) { // // The Config parameter contains a set of qemu parameters and settings. // -// This function writes its log output via logger parameter. -// -// The function will block until the launched qemu process exits. "", nil -// will be returned if the launch succeeds. Otherwise a string containing -// the contents of stderr + a Go error object will be returned. -func LaunchQemu(config Config, logger QMPLog) (string, error) { +// See LaunchCustomQemu for more information. +func LaunchQemu(config Config, logger QMPLog) (*exec.Cmd, io.ReadCloser, error) { config.appendName() config.appendUUID() config.appendMachine() @@ -3024,12 +3029,11 @@ func LaunchQemu(config Config, logger QMPLog) (string, error) { config.appendIOThreads() config.appendIncoming() config.appendPidFile() - config.appendLogFile() config.appendFwCfg(logger) config.appendSeccompSandbox() if err := config.appendCPUs(); err != nil { - return "", err + return nil, nil, err } ctx := config.Ctx @@ -3060,17 +3064,16 @@ func LaunchQemu(config Config, logger QMPLog) (string, error) { // // This function writes its log output via logger parameter. // -// The function will block until the launched qemu process exits. "", nil -// will be returned if the launch succeeds. Otherwise a string containing -// the contents of stderr + a Go error object will be returned. +// The function returns cmd, reader, nil where cmd is a Go exec.Cmd object +// representing the QEMU process and reader a Go io.ReadCloser object +// connected to QEMU's stderr, if launched successfully. Otherwise +// nil, nil, err where err is a Go error object is returned. func LaunchCustomQemu(ctx context.Context, path string, params []string, fds []*os.File, - attr *syscall.SysProcAttr, logger QMPLog) (string, error) { + attr *syscall.SysProcAttr, logger QMPLog) (*exec.Cmd, io.ReadCloser, error) { if logger == nil { logger = qmpNullLogger{} } - errStr := "" - if path == "" { path = "qemu-system-x86_64" } @@ -3084,15 +3087,17 @@ func LaunchCustomQemu(ctx context.Context, path string, params []string, fds []* cmd.SysProcAttr = attr - var stderr bytes.Buffer - cmd.Stderr = &stderr + reader, err := cmd.StderrPipe() + if err != nil { + logger.Errorf("Unable to connect stderr to a pipe") + return nil, nil, err + } logger.Infof("launching %s with: %v", path, params) - err := cmd.Run() + err = cmd.Start() if err != nil { logger.Errorf("Unable to launch %s: %v", path, err) - errStr = stderr.String() - logger.Errorf("%s", errStr) + return nil, nil, err } - return errStr, err + return cmd, reader, nil } diff --git a/src/runtime/pkg/govmm/qemu/qemu_test.go b/src/runtime/pkg/govmm/qemu/qemu_test.go index 01ca765c50..34ca776dfc 100644 --- a/src/runtime/pkg/govmm/qemu/qemu_test.go +++ b/src/runtime/pkg/govmm/qemu/qemu_test.go @@ -698,8 +698,8 @@ func TestFailToAppendCPUs(t *testing.T) { } } -var qmpSingleSocketServerString = "-qmp unix:cc-qmp,server=on,wait=off" -var qmpSingleSocketString = "-qmp unix:cc-qmp" +var qmpSingleSocketServerString = "-qmp unix:path=cc-qmp,server=on,wait=off" +var qmpSingleSocketString = "-qmp unix:path=cc-qmp" func TestAppendSingleQMPSocketServer(t *testing.T) { qmp := QMPSocket{ @@ -722,7 +722,27 @@ func TestAppendSingleQMPSocket(t *testing.T) { testAppend(qmp, qmpSingleSocketString, t) } -var qmpSocketServerString = "-qmp unix:cc-qmp-1,server=on,wait=off -qmp unix:cc-qmp-2,server=on,wait=off" +var qmpSocketServerFdString = "-qmp unix:fd=3,server=on,wait=off" + +func TestAppendQMPSocketServerFd(t *testing.T) { + foo, _ := os.CreateTemp(os.TempDir(), "govmm-qemu-test") + + defer func() { + _ = foo.Close() + _ = os.Remove(foo.Name()) + }() + + qmp := QMPSocket{ + Type: "unix", + FD: foo, + Server: true, + NoWait: true, + } + + testAppend(qmp, qmpSocketServerFdString, t) +} + +var qmpSocketServerString = "-qmp unix:path=cc-qmp-1,server=on,wait=off -qmp unix:path=cc-qmp-2,server=on,wait=off" func TestAppendQMPSocketServer(t *testing.T) { qmp := []QMPSocket{ @@ -744,8 +764,7 @@ func TestAppendQMPSocketServer(t *testing.T) { } var pidfile = "/run/vc/vm/iamsandboxid/pidfile" -var logfile = "/run/vc/vm/iamsandboxid/logfile" -var qemuString = "-name cc-qemu -cpu host -uuid " + agentUUID + " -pidfile " + pidfile + " -D " + logfile +var qemuString = "-name cc-qemu -cpu host -uuid " + agentUUID + " -pidfile " + pidfile func TestAppendStrings(t *testing.T) { config := Config{ @@ -754,14 +773,12 @@ func TestAppendStrings(t *testing.T) { UUID: agentUUID, CPUModel: "host", PidFile: pidfile, - LogFile: logfile, } config.appendName() config.appendCPUModel() config.appendUUID() config.appendPidFile() - config.appendLogFile() result := strings.Join(config.qemuParams, " ") if result != qemuString { diff --git a/src/runtime/pkg/govmm/qemu/qmp.go b/src/runtime/pkg/govmm/qemu/qmp.go index 7b2c0a92b2..a5fd2f6f68 100644 --- a/src/runtime/pkg/govmm/qemu/qmp.go +++ b/src/runtime/pkg/govmm/qemu/qmp.go @@ -718,6 +718,16 @@ func QMPStart(ctx context.Context, socket string, cfg QMPConfig, disconnectedCh return nil, nil, err } + return QMPStartWithConn(ctx, conn, cfg, disconnectedCh) +} + +// Same as QMPStart but with a pre-established connection +func QMPStartWithConn(ctx context.Context, conn net.Conn, cfg QMPConfig, disconnectedCh chan struct{}) (*QMP, *QMPVersion, error) { + if conn == nil { + close(disconnectedCh) + return nil, nil, fmt.Errorf("invalid connection") + } + connectedCh := make(chan *QMPVersion) q := startQMPLoop(conn, cfg, connectedCh, disconnectedCh) diff --git a/src/runtime/pkg/govmm/qemu/qmp_test.go b/src/runtime/pkg/govmm/qemu/qmp_test.go index 3f82fa54f9..c6bee11e7d 100644 --- a/src/runtime/pkg/govmm/qemu/qmp_test.go +++ b/src/runtime/pkg/govmm/qemu/qmp_test.go @@ -273,6 +273,22 @@ func TestQMPStartBadPath(t *testing.T) { <-disconnectedCh } +// Checks that a call to QMPStartWithConn with a nil connection exits gracefully. +// +// We call QMPStartWithConn with a nil connection. +// +// An error should be returned and the disconnected channel should be closed. +func TestQMPStartWithConnNil(t *testing.T) { + cfg := QMPConfig{Logger: qmpTestLogger{}} + disconnectedCh := make(chan struct{}) + q, _, err := QMPStartWithConn(context.Background(), nil, cfg, disconnectedCh) + if err == nil { + t.Errorf("Expected error") + q.Shutdown() + } + <-disconnectedCh +} + // Checks that the qmp_capabilities command is correctly sent. // // We start a QMPLoop, send the qmp_capabilities command and stop the diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index f35b60a0bb..0ce2b8815b 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -13,10 +13,14 @@ import ( "encoding/hex" "encoding/json" "fmt" + "io" "math" + "net" "os" + "os/exec" "os/user" "path/filepath" + "regexp" "strconv" "strings" "sync" @@ -106,6 +110,7 @@ type qemu struct { const ( consoleSocket = "console.sock" qmpSocket = "qmp.sock" + hmpSocket = "hmp.sock" vhostFSSocket = "vhost-fs.sock" nydusdAPISock = "nydusd-api.sock" @@ -313,6 +318,10 @@ func (q *qemu) qmpSocketPath(id string) (string, error) { return utils.BuildSocketPath(q.config.VMStorePath, id, qmpSocket) } +func (q *qemu) hmpSocketPath(id string) (string, error) { + return utils.BuildSocketPath(q.config.VMStorePath, id, hmpSocket) +} + func (q *qemu) getQemuMachine() (govmmQemu.Machine, error) { machine := q.arch.machine() @@ -354,14 +363,30 @@ func (q *qemu) createQmpSocket() ([]govmmQemu.QMPSocket, error) { path: monitorSockPath, } - return []govmmQemu.QMPSocket{ - { + var sockets []govmmQemu.QMPSocket + + sockets = append(sockets, govmmQemu.QMPSocket{ + Type: "unix", + Server: true, + NoWait: true, + }) + + if q.HypervisorConfig().Debug { + humanMonitorSockPath, err := q.hmpSocketPath(q.id) + if err != nil { + return nil, err + } + + sockets = append(sockets, govmmQemu.QMPSocket{ Type: "unix", - Name: q.qmpMonitorCh.path, + IsHmp: true, + Name: humanMonitorSockPath, Server: true, NoWait: true, - }, - }, nil + }) + } + + return sockets, nil } func (q *qemu) buildDevices(ctx context.Context, initrdPath string) ([]govmmQemu.Device, *govmmQemu.IOThread, error) { @@ -520,7 +545,7 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi NoDefaults: true, NoGraphic: true, NoReboot: true, - Daemonize: true, + Daemonize: false, MemPrealloc: q.config.MemPrealloc, HugePages: q.config.HugePages, IOMMUPlatform: q.config.IOMMUPlatform, @@ -880,6 +905,77 @@ func (q *qemu) AttestVM(ctx context.Context) error { return nil } +// setupEarlyQmpConnection creates a listener socket to be passed to QEMU +// as a QMP listening endpoint. An initial connection is established, to +// be used as the QMP client socket. This allows to detect an early failure +// of QEMU instead of looping on connect until some timeout expires. +func (q *qemu) setupEarlyQmpConnection() (net.Conn, error) { + monitorSockPath := q.qmpMonitorCh.path + + qmpListener, err := net.Listen("unix", monitorSockPath) + if err != nil { + q.Logger().WithError(err).Errorf("Unable to listen on unix socket address (%s)", monitorSockPath) + return nil, err + } + + // A duplicate fd of this socket will be passed to QEMU. We must + // close the original one when we're done. + defer qmpListener.Close() + + if rootless.IsRootless() { + err = syscall.Chown(monitorSockPath, int(q.config.Uid), int(q.config.Gid)) + if err != nil { + q.Logger().WithError(err).Errorf("Unable to make unix socket (%s) rootless", monitorSockPath) + return nil, err + } + } + + VMFd, err := qmpListener.(*net.UnixListener).File() + if err != nil { + return nil, err + } + defer func() { + if err != nil { + VMFd.Close() + } + }() + + // This socket will be used to establish the initial QMP connection + dialer := net.Dialer{Cancel: q.qmpMonitorCh.ctx.Done()} + conn, err := dialer.Dial("unix", monitorSockPath) + if err != nil { + q.Logger().WithError(err).Errorf("Unable to connect to unix socket (%s)", monitorSockPath) + return nil, err + } + + // We need to keep the socket file around to be able to re-connect + qmpListener.(*net.UnixListener).SetUnlinkOnClose(false) + + // Pass the duplicated fd of the listener socket to QEMU + q.qemuConfig.QMPSockets[0].FD = VMFd + q.fds = append(q.fds, q.qemuConfig.QMPSockets[0].FD) + + return conn, nil +} + +func (q *qemu) LogAndWait(qemuCmd *exec.Cmd, reader io.ReadCloser) { + pid := qemuCmd.Process.Pid + q.Logger().Infof("Start logging QEMU (qemuPid=%d)", pid) + scanner := bufio.NewScanner(reader) + warnRE := regexp.MustCompile("(^[^:]+: )warning: ") + for scanner.Scan() { + text := scanner.Text() + if warnRE.MatchString(text) { + text = warnRE.ReplaceAllString(text, "$1") + q.Logger().WithField("qemuPid", pid).Warning(text) + } else { + q.Logger().WithField("qemuPid", pid).Error(text) + } + } + q.Logger().Infof("Stop logging QEMU (qemuPid=%d)", pid) + qemuCmd.Wait() +} + // StartVM will start the Sandbox's VM. func (q *qemu) StartVM(ctx context.Context, timeout int) error { span, ctx := katatrace.Trace(ctx, q.Logger(), "StartVM", qemuTracingTags, map[string]string{"sandbox_id": q.id}) @@ -912,10 +1008,6 @@ func (q *qemu) StartVM(ctx context.Context, timeout int) error { return err } q.Logger().WithField("vm path", vmPath).Info("created vm path") - // append logfile only on debug - if q.config.Debug { - q.qemuConfig.LogFile = filepath.Join(vmPath, "qemu.log") - } defer func() { if err != nil { @@ -925,6 +1017,12 @@ func (q *qemu) StartVM(ctx context.Context, timeout int) error { } }() + var qmpConn net.Conn + qmpConn, err = q.setupEarlyQmpConnection() + if err != nil { + return err + } + // This needs to be done as late as possible, just before launching // virtiofsd are executed by kata-runtime after this call, run with // the SELinux label. If these processes require privileged, we do @@ -950,20 +1048,23 @@ func (q *qemu) StartVM(ctx context.Context, timeout int) error { } - var strErr string - strErr, err = govmmQemu.LaunchQemu(q.qemuConfig, newQMPLogger()) + qemuCmd, reader, err := govmmQemu.LaunchQemu(q.qemuConfig, newQMPLogger()) if err != nil { - if q.config.Debug && q.qemuConfig.LogFile != "" { - b, err := os.ReadFile(q.qemuConfig.LogFile) - if err == nil { - strErr += string(b) - } - } - q.Logger().WithError(err).Errorf("failed to launch qemu: %s", strErr) - return fmt.Errorf("failed to launch qemu: %s, error messages from qemu log: %s", err, strErr) + q.Logger().WithError(err).Error("failed to launch qemu") + return fmt.Errorf("failed to launch qemu: %s", err) + } + if q.qemuConfig.Knobs.Daemonize { + // LaunchQemu returns a handle on the upper QEMU process. + // Wait for it to exit to assume that the QEMU daemon was + // actually started. + qemuCmd.Wait() + } else { + // Log QEMU errors and ensure the QEMU process is reaped after + // termination. + go q.LogAndWait(qemuCmd, reader) } - err = q.waitVM(ctx, timeout) + err = q.waitVM(ctx, qmpConn, timeout) if err != nil { return err } @@ -1001,7 +1102,7 @@ func (q *qemu) bootFromTemplate() error { } // waitVM will wait for the Sandbox's VM to be up and running. -func (q *qemu) waitVM(ctx context.Context, timeout int) error { +func (q *qemu) waitVM(ctx context.Context, qmpConn net.Conn, timeout int) error { span, _ := katatrace.Trace(ctx, q.Logger(), "waitVM", qemuTracingTags, map[string]string{"sandbox_id": q.id}) defer span.End() @@ -1021,7 +1122,7 @@ func (q *qemu) waitVM(ctx context.Context, timeout int) error { timeStart := time.Now() for { disconnectCh = make(chan struct{}) - qmp, ver, err = govmmQemu.QMPStart(q.qmpMonitorCh.ctx, q.qmpMonitorCh.path, cfg, disconnectCh) + qmp, ver, err = govmmQemu.QMPStartWithConn(q.qmpMonitorCh.ctx, qmpConn, cfg, disconnectCh) if err == nil { break } @@ -1071,19 +1172,6 @@ func (q *qemu) StopVM(ctx context.Context, waitOnly bool) (err error) { } }() - if q.config.Debug && q.qemuConfig.LogFile != "" { - f, err := os.OpenFile(q.qemuConfig.LogFile, os.O_RDONLY, 0) - if err == nil { - scanner := bufio.NewScanner(f) - for scanner.Scan() { - q.Logger().WithField("file", q.qemuConfig.LogFile).Debug(scanner.Text()) - } - if err := scanner.Err(); err != nil { - q.Logger().WithError(err).Debug("read qemu log failed") - } - } - } - if err := q.qmpSetup(); err != nil { return err } diff --git a/tools/packaging/kata-deploy/local-build/Makefile b/tools/packaging/kata-deploy/local-build/Makefile index 48e9af19be..47dfd17107 100644 --- a/tools/packaging/kata-deploy/local-build/Makefile +++ b/tools/packaging/kata-deploy/local-build/Makefile @@ -37,6 +37,7 @@ all-parallel: $(MK_DIR)/dockerbuild/install_yq.sh all: serial-targets \ firecracker-tarball \ kernel-tarball \ + kernel-dragonball-experimental-tarball \ nydus-tarball \ qemu-tarball \ shim-v2-tarball \ @@ -60,6 +61,9 @@ firecracker-tarball: kernel-tarball: ${MAKE} $@-build +kernel-dragonball-experimental-tarball: + ${MAKE} $@-build + kernel-experimental-tarball: ${MAKE} $@-build diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh index e2f538112b..f81f6b1aa1 100755 --- a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh +++ b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh @@ -82,6 +82,7 @@ options: cloud-hypervisor firecracker kernel + kernel-dragonball-experimental kernel-experimental nydus qemu @@ -509,6 +510,13 @@ install_kernel() { DESTDIR="${destdir}" PREFIX="${prefix}" "${kernel_builder}" -f -v "${kernel_version}" } +#Install dragonball experimental kernel asset +install_dragonball_experimental_kernel() { + info "build dragonball experimental kernel" + export kernel_version="$(yq r $versions_yaml assets.dragonball-kernel-experimental.version)" + info "kernel version ${kernel_version}" + DESTDIR="${destdir}" PREFIX="${prefix}" "${kernel_builder}" -e -t dragonball -v ${kernel_version} +} #Install experimental kernel asset install_experimental_kernel() { @@ -653,6 +661,8 @@ handle_build() { nydus) install_nydus ;; + kernel-dragonball-experimental) install_dragonball_experimental_kernel;; + kernel-experimental) install_experimental_kernel;; qemu) install_qemu ;; diff --git a/versions.yaml b/versions.yaml index cad5eaf228..98b3e34bc4 100644 --- a/versions.yaml +++ b/versions.yaml @@ -196,7 +196,7 @@ externals: cni-plugins: description: "CNI network plugins" url: "https://github.com/containernetworking/plugins" - version: "v1.1.1" + version: "v1.2.0" conmon: description: "An OCI container runtime monitor"