diff --git a/src/dragonball/Cargo.toml b/src/dragonball/Cargo.toml index 1368c362f0..3cc17de1dd 100644 --- a/src/dragonball/Cargo.toml +++ b/src/dragonball/Cargo.toml @@ -14,18 +14,22 @@ arc-swap = "1.5.0" bytes = "1.1.0" dbs-address-space = "0.1.0" dbs-allocator = "0.1.0" +dbs-arch = "0.1.0" dbs-boot = "0.2.0" dbs-device = "0.1.0" dbs-interrupt = { version = "0.1.0", features = ["kvm-irq"] } dbs-legacy-devices = "0.1.0" +dbs-upcall = { version = "0.1.0", optional = true } dbs-utils = "0.1.0" dbs-virtio-devices = { version = "0.1.0", optional = true, features = ["virtio-mmio"] } kvm-bindings = "0.5.0" kvm-ioctls = "0.11.0" +lazy_static = "1.2" libc = "0.2.39" linux-loader = "0.4.0" log = "0.4.14" nix = "0.23.1" +seccompiler = "0.2.0" serde = "1.0.27" serde_derive = "1.0.27" serde_json = "1.0.9" @@ -41,13 +45,15 @@ slog-term = "2.9.0" slog-async = "2.7.0" [features] +acpi = [] atomic-guest-memory = [] +hotplug = ["virtio-vsock"] virtio-vsock = ["dbs-virtio-devices/virtio-vsock", "virtio-queue"] [patch.'crates-io'] -dbs-device = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "84eee5737cc7d85f9921c94a93e6b9dc4ae24a39" } -dbs-interrupt = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "84eee5737cc7d85f9921c94a93e6b9dc4ae24a39" } -dbs-legacy-devices = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "84eee5737cc7d85f9921c94a93e6b9dc4ae24a39" } -dbs-utils = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "84eee5737cc7d85f9921c94a93e6b9dc4ae24a39" } -dbs-virtio-devices = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "84eee5737cc7d85f9921c94a93e6b9dc4ae24a39" } -dbs-upcall = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "84eee5737cc7d85f9921c94a93e6b9dc4ae24a39" } +dbs-device = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "127621db934af5ffba558e44b77afa00cdf62af6" } +dbs-interrupt = { git = 
"https://github.com/openanolis/dragonball-sandbox.git", rev = "127621db934af5ffba558e44b77afa00cdf62af6" } +dbs-legacy-devices = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "127621db934af5ffba558e44b77afa00cdf62af6" } +dbs-upcall = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "127621db934af5ffba558e44b77afa00cdf62af6" } +dbs-utils = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "127621db934af5ffba558e44b77afa00cdf62af6" } +dbs-virtio-devices = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "127621db934af5ffba558e44b77afa00cdf62af6" } diff --git a/src/dragonball/README.md b/src/dragonball/README.md index 0e3bcb45a0..c9d7e5119c 100644 --- a/src/dragonball/README.md +++ b/src/dragonball/README.md @@ -17,7 +17,10 @@ and configuration process. # Documentation Device: [Device Document](docs/device.md) +vCPU: [vCPU Document](docs/vcpu.md) +API: [API Document](docs/api.md) +Currently, the documents are still being actively added. You can see the [official documentation](docs/) page for more details. # Supported Architectures diff --git a/src/dragonball/docs/api.md b/src/dragonball/docs/api.md new file mode 100644 index 0000000000..cd2bc2db8e --- /dev/null +++ b/src/dragonball/docs/api.md @@ -0,0 +1,7 @@ +# API + +We provide plenty of APIs for the Kata runtime to interact with the `Dragonball` virtual machine manager. +This document provides an introduction to each of them. 
+ +TODO: Details will be added in the Part III PR for `Dragonball` + diff --git a/src/dragonball/docs/device.md b/src/dragonball/docs/device.md index 8f3fdbe6ed..ab2e078e7b 100644 --- a/src/dragonball/docs/device.md +++ b/src/dragonball/docs/device.md @@ -14,4 +14,7 @@ Currently we have following device manager: ## Device supported `VIRTIO-VSOCK` +`i8042` +`COM1` +`COM2` diff --git a/src/dragonball/docs/vcpu.md b/src/dragonball/docs/vcpu.md new file mode 100644 index 0000000000..e2be8037b6 --- /dev/null +++ b/src/dragonball/docs/vcpu.md @@ -0,0 +1,42 @@ +# vCPU + +## vCPU Manager +The vCPU manager manages all vCPU related actions; we will dive into some of the important structure members in this doc. + +For now, aarch64 vCPU support is still under development; we'll introduce it when we merge `runtime-rs` to the master branch. (issue: #4445) + +### vCPU config +`VcpuConfig` is used to configure guest overall CPU info. + +`boot_vcpu_count` is used to define the initial vCPU number. + +`max_vcpu_count` is used to define the maximum vCPU number and it's used as the upper boundary for the CPU hotplug feature. + +`thread_per_core`, `cores_per_die`, `dies_per_socket` and `socket` are used to define CPU topology. + +`vpmu_feature` is used to define the `vPMU` feature level. +If `vPMU` feature is `Disabled`, it means `vPMU` feature is off (by default). +If `vPMU` feature is `LimitedlyEnabled`, it means minimal `vPMU` counters are supported (cycles and instructions). +If `vPMU` feature is `FullyEnabled`, it means all `vPMU` counters are supported. + +## vCPU State + +There are four states for the vCPU state machine: `running`, `paused`, `waiting_exit`, `exited`. There is a state machine to maintain the task flow. + +When the vCPU is created, it'll turn to `paused` state. After vCPU resource is ready at VMM, it'll send a `Resume` event to the vCPU thread, and then vCPU state will change to `running`. 
+ +During the `running` state, VMM will catch vCPU exit and execute different logic according to the exit reason. + +If the VMM catch some exit reasons that it cannot handle, the state will change to `waiting_exit` and VMM will stop the virtual machine. +When the state switches to `waiting_exit`, an exit event will be sent to vCPU `exit_evt`, event manager will detect the change in `exit_evt` and set VMM `exit_evt_flag` as 1. A thread serving for VMM event loop will check `exit_evt_flag` and if the flag is 1, it'll stop the VMM. + +When the VMM is stopped / destroyed, the state will change to `exited`. + +## vCPU Hot plug +Since `Dragonball Sandbox` doesn't support virtualization of ACPI system, we use [`upcall`](https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-upcall) to establish a direct communication channel between `Dragonball` and Guest in order to trigger vCPU hotplug. + +To use `upcall`, kernel patches are needed, you can get the patches from [`upcall`](https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-upcall) page, and we'll provide a ready-to-use guest kernel binary for you to try. + +vCPU hot plug / hot unplug range is [1, `max_vcpu_count`]. Operations not in this range will be invalid. + + diff --git a/src/dragonball/src/api/mod.rs b/src/dragonball/src/api/mod.rs new file mode 100644 index 0000000000..75ca6af690 --- /dev/null +++ b/src/dragonball/src/api/mod.rs @@ -0,0 +1,6 @@ +// Copyright (C) 2019-2022 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! API related data structures to configure the vmm. + +pub mod v1; diff --git a/src/dragonball/src/api/v1/instance_info.rs b/src/dragonball/src/api/v1/instance_info.rs new file mode 100644 index 0000000000..d457b6124b --- /dev/null +++ b/src/dragonball/src/api/v1/instance_info.rs @@ -0,0 +1,84 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// +// SPDX-License-Identifier: Apache-2.0 + +use serde_derive::{Deserialize, Serialize}; + +/// The microvm state. +/// +/// When Dragonball starts, the instance state is Uninitialized. Once start_microvm method is +/// called, the state goes from Uninitialized to Starting. The state is changed to Running until +/// the start_microvm method ends. Halting and Halted are currently unsupported. +#[derive(Copy, Clone, Debug, Deserialize, PartialEq, Serialize)] +pub enum InstanceState { + /// Microvm is not initialized. + Uninitialized, + /// Microvm is starting. + Starting, + /// Microvm is running. + Running, + /// Microvm is Paused. + Paused, + /// Microvm received a halt instruction. + Halting, + /// Microvm is halted. + Halted, + /// Microvm exit instead of process exit. + Exited(i32), +} + +/// The state of async actions +#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)] +pub enum AsyncState { + /// Uninitialized + Uninitialized, + /// Success + Success, + /// Failure + Failure, +} + +/// The strongly typed that contains general information about the microVM. +#[derive(Debug, Deserialize, Serialize)] +pub struct InstanceInfo { + /// The ID of the microVM. + pub id: String, + /// The state of the microVM. + pub state: InstanceState, + /// The version of the VMM that runs the microVM. + pub vmm_version: String, + /// The pid of the current VMM process. + pub pid: u32, + /// The state of async actions. 
+ pub async_state: AsyncState, + /// List of tids of vcpu threads (vcpu index, tid) + pub tids: Vec<(u8, u32)>, +} + +impl InstanceInfo { + /// create instance info object with given id, version, and platform type + pub fn new(id: String, vmm_version: String) -> Self { + InstanceInfo { + id, + state: InstanceState::Uninitialized, + vmm_version, + pid: std::process::id(), + async_state: AsyncState::Uninitialized, + tids: Vec::new(), + } + } +} + +impl Default for InstanceInfo { + fn default() -> Self { + InstanceInfo { + id: String::from(""), + state: InstanceState::Uninitialized, + vmm_version: env!("CARGO_PKG_VERSION").to_string(), + pid: std::process::id(), + async_state: AsyncState::Uninitialized, + tids: Vec::new(), + } + } +} diff --git a/src/dragonball/src/api/v1/mod.rs b/src/dragonball/src/api/v1/mod.rs new file mode 100644 index 0000000000..f25fb84364 --- /dev/null +++ b/src/dragonball/src/api/v1/mod.rs @@ -0,0 +1,7 @@ +// Copyright (C) 2019-2022 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! API Version 1 related data structures to configure the vmm. + +mod instance_info; +pub use self::instance_info::{InstanceInfo, InstanceState}; diff --git a/src/dragonball/src/device_manager/mod.rs b/src/dragonball/src/device_manager/mod.rs index 5691690ea8..2ad26e0d01 100644 --- a/src/dragonball/src/device_manager/mod.rs +++ b/src/dragonball/src/device_manager/mod.rs @@ -29,6 +29,12 @@ use dbs_virtio_devices::{ VirtioDevice, }; +#[cfg(all(feature = "hotplug", feature = "dbs-upcall"))] +use dbs_upcall::{ + DevMgrRequest, DevMgrService, MmioDevRequest, UpcallClient, UpcallClientError, + UpcallClientRequest, UpcallClientResponse, +}; + use crate::address_space_manager::GuestAddressSpaceImpl; use crate::error::StartMicrovmError; use crate::resource_manager::ResourceManager; @@ -83,6 +89,11 @@ pub enum DeviceMgrError { /// Error from Virtio subsystem. 
#[error(transparent)] Virtio(virtio::Error), + + #[cfg(all(feature = "hotplug", feature = "dbs-upcall"))] + /// Failed to hotplug the device. + #[error("failed to hotplug virtual device")] + HotplugDevice(#[source] UpcallClientError), } /// Specialized version of `std::result::Result` for device manager operations. @@ -188,6 +199,8 @@ pub struct DeviceOpContext { logger: slog::Logger, is_hotplug: bool, + #[cfg(all(feature = "hotplug", feature = "dbs-upcall"))] + upcall_client: Option>>, #[cfg(feature = "dbs-virtio-devices")] virtio_devices: Vec>, } @@ -220,6 +233,8 @@ impl DeviceOpContext { address_space, logger, is_hotplug, + #[cfg(all(feature = "hotplug", feature = "dbs-upcall"))] + upcall_client: None, #[cfg(feature = "dbs-virtio-devices")] virtio_devices: Vec::new(), } @@ -236,35 +251,122 @@ impl DeviceOpContext { &self.logger } + #[allow(unused_variables)] fn generate_kernel_boot_args(&mut self, kernel_config: &mut KernelConfigInfo) -> Result<()> { - if !self.is_hotplug { + if self.is_hotplug { return Err(DeviceMgrError::InvalidOperation); } #[cfg(feature = "dbs-virtio-devices")] - let cmdline = kernel_config.kernel_cmdline_mut(); + { + let cmdline = kernel_config.kernel_cmdline_mut(); - #[cfg(feature = "dbs-virtio-devices")] - for device in self.virtio_devices.iter() { - let (mmio_base, mmio_size, irq) = DeviceManager::get_virtio_device_info(device)?; + for device in self.virtio_devices.iter() { + let (mmio_base, mmio_size, irq) = DeviceManager::get_virtio_device_info(device)?; - // as per doc, [virtio_mmio.]device=@: needs to be appended - // to kernel commandline for virtio mmio devices to get recognized - // the size parameter has to be transformed to KiB, so dividing hexadecimal value in - // bytes to 1024; further, the '{}' formatting rust construct will automatically - // transform it to decimal - cmdline - .insert( - "virtio_mmio.device", - &format!("{}K@0x{:08x}:{}", mmio_size / 1024, mmio_base, irq), - ) - .map_err(DeviceMgrError::Cmdline)?; + // as 
per doc, [virtio_mmio.]device=@: needs to be appended + // to kernel commandline for virtio mmio devices to get recognized + // the size parameter has to be transformed to KiB, so dividing hexadecimal value in + // bytes to 1024; further, the '{}' formatting rust construct will automatically + // transform it to decimal + cmdline + .insert( + "virtio_mmio.device", + &format!("{}K@0x{:08x}:{}", mmio_size / 1024, mmio_base, irq), + ) + .map_err(DeviceMgrError::Cmdline)?; + } } Ok(()) } } +#[cfg(not(feature = "hotplug"))] +impl DeviceOpContext { + pub(crate) fn insert_hotplug_mmio_device( + &self, + _dev: &Arc, + _callback: Option<()>, + ) -> Result<()> { + Err(DeviceMgrError::InvalidOperation) + } + + pub(crate) fn remove_hotplug_mmio_device( + &self, + _dev: &Arc, + _callback: Option<()>, + ) -> Result<()> { + Err(DeviceMgrError::InvalidOperation) + } +} + +#[cfg(all(feature = "hotplug", feature = "dbs-upcall"))] +impl DeviceOpContext { + fn call_hotplug_device( + &self, + req: DevMgrRequest, + callback: Option>, + ) -> Result<()> { + if let Some(upcall_client) = self.upcall_client.as_ref() { + if let Some(cb) = callback { + upcall_client + .send_request(UpcallClientRequest::DevMgr(req), cb) + .map_err(DeviceMgrError::HotplugDevice)?; + } else { + upcall_client + .send_request_without_result(UpcallClientRequest::DevMgr(req)) + .map_err(DeviceMgrError::HotplugDevice)?; + } + Ok(()) + } else { + Err(DeviceMgrError::InvalidOperation) + } + } + + pub(crate) fn insert_hotplug_mmio_device( + &self, + dev: &Arc, + callback: Option>, + ) -> Result<()> { + if !self.is_hotplug { + return Err(DeviceMgrError::InvalidOperation); + } + + let (mmio_base, mmio_size, mmio_irq) = DeviceManager::get_virtio_device_info(dev)?; + let req = DevMgrRequest::AddMmioDev(MmioDevRequest { + mmio_base, + mmio_size, + mmio_irq, + }); + + self.call_hotplug_device(req, callback) + } + + pub(crate) fn remove_hotplug_mmio_device( + &self, + dev: &Arc, + callback: Option>, + ) -> Result<()> { + if 
!self.is_hotplug { + return Err(DeviceMgrError::InvalidOperation); + } + let (mmio_base, mmio_size, mmio_irq) = DeviceManager::get_virtio_device_info(dev)?; + let req = DevMgrRequest::DelMmioDev(MmioDevRequest { + mmio_base, + mmio_size, + mmio_irq, + }); + + self.call_hotplug_device(req, callback) + } +} + +#[cfg(all(feature = "hotplug", feature = "acpi"))] +impl DeviceOpContext { + // TODO: We will implement this when we develop ACPI virtualization +} + /// Device manager for virtual machines, which manages all device for a virtual machine. pub struct DeviceManager { io_manager: Arc>, @@ -351,7 +453,7 @@ impl DeviceManager { self.set_guest_kernel_log_stream(dmesg_fifo) .map_err(|_| StartMicrovmError::EventFd)?; - slog::info!(self.logger, "init console path: {:?}", com1_sock_path); + info!(self.logger, "init console path: {:?}", com1_sock_path); if let Some(path) = com1_sock_path { if let Some(legacy_manager) = self.legacy_manager.as_ref() { let com1 = legacy_manager.get_com1_serial(); @@ -387,19 +489,6 @@ impl DeviceManager { Ok(()) } - /// Restore legacy devices - pub fn restore_legacy_devices( - &mut self, - dmesg_fifo: Option>, - com1_sock_path: Option, - ) -> std::result::Result<(), StartMicrovmError> { - self.set_guest_kernel_log_stream(dmesg_fifo) - .map_err(|_| StartMicrovmError::EventFd)?; - slog::info!(self.logger, "restore console path: {:?}", com1_sock_path); - // TODO: restore console - Ok(()) - } - /// Reset the console into canonical mode. pub fn reset_console(&self) -> Result<()> { self.con_manager.reset_console() diff --git a/src/dragonball/src/error.rs b/src/dragonball/src/error.rs index 5a497abb6b..2858103a48 100644 --- a/src/dragonball/src/error.rs +++ b/src/dragonball/src/error.rs @@ -14,6 +14,37 @@ use dbs_virtio_devices::Error as VirtIoError; use crate::device_manager; +/// Shorthand result type for internal VMM commands. +pub type Result = std::result::Result; + +/// Errors associated with the VMM internal logic. 
+/// +/// These errors cannot be generated by direct user input, but can result from bad configuration +/// of the host (for example if Dragonball doesn't have permissions to open the KVM fd). +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// Failure occurs in issuing KVM ioctls and errors will be returned from kvm_ioctls lib. + #[error("failure in issuing KVM ioctl command")] + Kvm(#[source] kvm_ioctls::Error), + + /// The host kernel reports an unsupported KVM API version. + #[error("unsupported KVM version {0}")] + KvmApiVersion(i32), + + /// Cannot initialize the KVM context due to missing capabilities. + #[error("missing KVM capability")] + KvmCap(kvm_ioctls::Cap), + + #[cfg(target_arch = "x86_64")] + #[error("failed to configure MSRs")] + /// Cannot configure MSRs + GuestMSRs(dbs_arch::msr::Error), + + /// MSR inner error + #[error("MSR inner error")] + Msr(vmm_sys_util::fam::Error), +} + /// Errors associated with starting the instance. #[derive(Debug, thiserror::Error)] pub enum StartMicrovmError { diff --git a/src/dragonball/src/io_manager.rs b/src/dragonball/src/io_manager.rs new file mode 100644 index 0000000000..410703bc7a --- /dev/null +++ b/src/dragonball/src/io_manager.rs @@ -0,0 +1,60 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::sync::Arc; + +use arc_swap::{ArcSwap, Cache}; +use dbs_device::device_manager::Error; +use dbs_device::device_manager::IoManager; + +/// A specialized version of [`std::result::Result`] for IO manager related operations. +pub type Result = std::result::Result; + +/// Wrapper over IoManager to support device hotplug with [`ArcSwap`] and [`Cache`]. +#[derive(Clone)] +pub struct IoManagerCached(pub(crate) Cache>, Arc>); + +impl IoManagerCached { + /// Create a new instance of [`IoManagerCached`]. 
+ pub fn new(io_manager: Arc>) -> Self { + IoManagerCached(Cache::new(io_manager)) + } + + #[cfg(target_arch = "x86_64")] + #[inline] + /// Read data from IO ports. + pub fn pio_read(&mut self, addr: u16, data: &mut [u8]) -> Result<()> { + self.0.load().pio_read(addr, data) + } + + #[cfg(target_arch = "x86_64")] + #[inline] + /// Write data to IO ports. + pub fn pio_write(&mut self, addr: u16, data: &[u8]) -> Result<()> { + self.0.load().pio_write(addr, data) + } + + #[inline] + /// Read data to MMIO address. + pub fn mmio_read(&mut self, addr: u64, data: &mut [u8]) -> Result<()> { + self.0.load().mmio_read(addr, data) + } + + #[inline] + /// Write data to MMIO address. + pub fn mmio_write(&mut self, addr: u64, data: &[u8]) -> Result<()> { + self.0.load().mmio_write(addr, data) + } + + #[inline] + /// Revalidate the inner cache + pub fn revalidate_cache(&mut self) { + let _ = self.0.load(); + } + + #[inline] + /// Get immutable reference to underlying [`IoManager`]. + pub fn load(&mut self) -> &IoManager { + self.0.load() + } +} diff --git a/src/dragonball/src/kvm_context.rs b/src/dragonball/src/kvm_context.rs new file mode 100644 index 0000000000..f160b264b8 --- /dev/null +++ b/src/dragonball/src/kvm_context.rs @@ -0,0 +1,251 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. +#![allow(dead_code)] +use kvm_bindings::KVM_API_VERSION; +use kvm_ioctls::{Cap, Kvm, VmFd}; +use std::os::unix::io::{FromRawFd, RawFd}; + +use crate::error::{Error, Result}; + +/// Describes a KVM context that gets attached to the micro VM instance. +/// It gives access to the functionality of the KVM wrapper as long as every required +/// KVM capability is present on the host. 
+pub struct KvmContext { + kvm: Kvm, + max_memslots: usize, + #[cfg(target_arch = "x86_64")] + supported_msrs: kvm_bindings::MsrList, +} + +impl KvmContext { + /// Create a new KVM context object, using the provided `kvm_fd` if one is presented. + pub fn new(kvm_fd: Option) -> Result { + let kvm = if let Some(fd) = kvm_fd { + // Safe because we expect kvm_fd to contain a valid fd number when is_some() == true. + unsafe { Kvm::from_raw_fd(fd) } + } else { + Kvm::new().map_err(Error::Kvm)? + }; + + if kvm.get_api_version() != KVM_API_VERSION as i32 { + return Err(Error::KvmApiVersion(kvm.get_api_version())); + } + + Self::check_cap(&kvm, Cap::Irqchip)?; + Self::check_cap(&kvm, Cap::Irqfd)?; + Self::check_cap(&kvm, Cap::Ioeventfd)?; + Self::check_cap(&kvm, Cap::UserMemory)?; + #[cfg(target_arch = "x86_64")] + Self::check_cap(&kvm, Cap::SetTssAddr)?; + + #[cfg(target_arch = "x86_64")] + let supported_msrs = dbs_arch::msr::supported_guest_msrs(&kvm).map_err(Error::GuestMSRs)?; + let max_memslots = kvm.get_nr_memslots(); + + Ok(KvmContext { + kvm, + max_memslots, + #[cfg(target_arch = "x86_64")] + supported_msrs, + }) + } + + /// Get underlying KVM object to access kvm-ioctls interfaces. + pub fn kvm(&self) -> &Kvm { + &self.kvm + } + + /// Get the maximum number of memory slots reported by this KVM context. + pub fn max_memslots(&self) -> usize { + self.max_memslots + } + + /// Create a virtual machine object. 
+ pub fn create_vm(&self) -> Result { + self.kvm.create_vm().map_err(Error::Kvm) + } + + /// Get the max vcpu count supported by kvm + pub fn get_max_vcpus(&self) -> usize { + self.kvm.get_max_vcpus() + } + + fn check_cap(kvm: &Kvm, cap: Cap) -> std::result::Result<(), Error> { + if !kvm.check_extension(cap) { + return Err(Error::KvmCap(cap)); + } + Ok(()) + } +} + +#[cfg(target_arch = "x86_64")] +mod x86_64 { + use super::*; + use dbs_arch::msr::*; + use kvm_bindings::{kvm_msr_entry, CpuId, MsrList, Msrs}; + use std::collections::HashSet; + + impl KvmContext { + /// Get information about supported CPUID of x86 processor. + pub fn supported_cpuid( + &self, + max_entries_count: usize, + ) -> std::result::Result { + self.kvm.get_supported_cpuid(max_entries_count) + } + + /// Get information about supported MSRs of x86 processor. + pub fn supported_msrs( + &self, + _max_entries_count: usize, + ) -> std::result::Result { + Ok(self.supported_msrs.clone()) + } + + // It's very sensible to manipulate MSRs, so please be careful to change code below. + fn build_msrs_list(kvm: &Kvm) -> Result { + let mut mset: HashSet = HashSet::new(); + let supported_msr_list = kvm.get_msr_index_list().map_err(super::Error::Kvm)?; + for msr in supported_msr_list.as_slice() { + mset.insert(*msr); + } + + let mut msrs = vec![ + MSR_IA32_APICBASE, + MSR_IA32_SYSENTER_CS, + MSR_IA32_SYSENTER_ESP, + MSR_IA32_SYSENTER_EIP, + MSR_IA32_CR_PAT, + ]; + + let filters_list = vec![ + MSR_STAR, + MSR_VM_HSAVE_PA, + MSR_TSC_AUX, + MSR_IA32_TSC_ADJUST, + MSR_IA32_TSCDEADLINE, + MSR_IA32_MISC_ENABLE, + MSR_IA32_BNDCFGS, + MSR_IA32_SPEC_CTRL, + ]; + for msr in filters_list { + if mset.contains(&msr) { + msrs.push(msr); + } + } + + // TODO: several msrs are optional. + + // TODO: Since our guests don't support nested-vmx, LMCE nor SGX for now. 
+ // msrs.push(MSR_IA32_FEATURE_CONTROL); + + msrs.push(MSR_CSTAR); + msrs.push(MSR_KERNEL_GS_BASE); + msrs.push(MSR_SYSCALL_MASK); + msrs.push(MSR_LSTAR); + msrs.push(MSR_IA32_TSC); + + msrs.push(MSR_KVM_SYSTEM_TIME_NEW); + msrs.push(MSR_KVM_WALL_CLOCK_NEW); + + // FIXME: check if it's supported. + msrs.push(MSR_KVM_ASYNC_PF_EN); + msrs.push(MSR_KVM_PV_EOI_EN); + msrs.push(MSR_KVM_STEAL_TIME); + + msrs.push(MSR_CORE_PERF_FIXED_CTR_CTRL); + msrs.push(MSR_CORE_PERF_GLOBAL_CTRL); + msrs.push(MSR_CORE_PERF_GLOBAL_STATUS); + msrs.push(MSR_CORE_PERF_GLOBAL_OVF_CTRL); + + const MAX_FIXED_COUNTERS: u32 = 3; + for i in 0..MAX_FIXED_COUNTERS { + msrs.push(MSR_CORE_PERF_FIXED_CTR0 + i); + } + + // FIXME: skip MCE for now. + + let mtrr_msrs = vec![ + MSR_MTRRdefType, + MSR_MTRRfix64K_00000, + MSR_MTRRfix16K_80000, + MSR_MTRRfix16K_A0000, + MSR_MTRRfix4K_C0000, + MSR_MTRRfix4K_C8000, + MSR_MTRRfix4K_D0000, + MSR_MTRRfix4K_D8000, + MSR_MTRRfix4K_E0000, + MSR_MTRRfix4K_E8000, + MSR_MTRRfix4K_F0000, + MSR_MTRRfix4K_F8000, + ]; + for mtrr in mtrr_msrs { + msrs.push(mtrr); + } + + const MSR_MTRRCAP_VCNT: u32 = 8; + for i in 0..MSR_MTRRCAP_VCNT { + msrs.push(0x200 + 2 * i); + msrs.push(0x200 + 2 * i + 1); + } + + let msrs: Vec = msrs + .iter() + .map(|reg| kvm_msr_entry { + index: *reg, + reserved: 0, + data: 0, + }) + .collect(); + + Msrs::from_entries(&msrs).map_err(super::Error::Msr) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use kvm_ioctls::Kvm; + use std::fs::File; + use std::os::unix::fs::MetadataExt; + use std::os::unix::io::{AsRawFd, FromRawFd}; + + #[test] + fn test_create_kvm_context() { + let c = KvmContext::new(None).unwrap(); + + assert!(c.max_memslots >= 32); + + let kvm = Kvm::new().unwrap(); + let f = unsafe { File::from_raw_fd(kvm.as_raw_fd()) }; + let m1 = f.metadata().unwrap(); + let m2 = File::open("/dev/kvm").unwrap().metadata().unwrap(); + + assert_eq!(m1.dev(), m2.dev()); + assert_eq!(m1.ino(), m2.ino()); + } + + #[cfg(target_arch = "x86_64")] + 
#[test] + fn test_get_supported_cpu_id() { + let c = KvmContext::new(None).unwrap(); + + let _ = c + .supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES) + .expect("failed to get supported CPUID"); + assert!(c.supported_cpuid(0).is_err()); + } + + #[test] + fn test_create_vm() { + let c = KvmContext::new(None).unwrap(); + + let _ = c.create_vm().unwrap(); + } +} diff --git a/src/dragonball/src/lib.rs b/src/dragonball/src/lib.rs index cf528d067e..6bf0b9298a 100644 --- a/src/dragonball/src/lib.rs +++ b/src/dragonball/src/lib.rs @@ -1,4 +1,5 @@ // Copyright (C) 2018-2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 //! Dragonball is a light-weight virtual machine manager(VMM) based on Linux Kernel-based Virtual @@ -10,13 +11,45 @@ /// Address space manager for virtual machines. pub mod address_space_manager; +/// API to handle vmm requests. +pub mod api; /// Structs to maintain configuration information. pub mod config_manager; /// Device manager for virtual machines. pub mod device_manager; /// Errors related to Virtual machine manager. pub mod error; +/// KVM operation context for virtual machines. +pub mod kvm_context; +/// Metrics system. +pub mod metric; /// Resource manager for virtual machines. pub mod resource_manager; +/// Signal handler for virtual machines. +pub mod signal_handler; +/// Virtual CPU manager for virtual machines. +pub mod vcpu; /// Virtual machine manager for virtual machines. pub mod vm; + +mod io_manager; +pub use self::io_manager::IoManagerCached; + +/// Success exit code. +pub const EXIT_CODE_OK: u8 = 0; +/// Generic error exit code. +pub const EXIT_CODE_GENERIC_ERROR: u8 = 1; +/// Generic exit code for an error considered not possible to occur if the program logic is sound. +pub const EXIT_CODE_UNEXPECTED_ERROR: u8 = 2; +/// Dragonball was shut down after intercepting a restricted system call. 
+pub const EXIT_CODE_BAD_SYSCALL: u8 = 148; +/// Dragonball was shut down after intercepting `SIGBUS`. +pub const EXIT_CODE_SIGBUS: u8 = 149; +/// Dragonball was shut down after intercepting `SIGSEGV`. +pub const EXIT_CODE_SIGSEGV: u8 = 150; +/// Invalid json passed to the Dragonball process for configuring microvm. +pub const EXIT_CODE_INVALID_JSON: u8 = 151; +/// Bad configuration for microvm's resources, when using a single json. +pub const EXIT_CODE_BAD_CONFIGURATION: u8 = 152; +/// Command line arguments parsing error. +pub const EXIT_CODE_ARG_PARSING: u8 = 153; diff --git a/src/dragonball/src/metric.rs b/src/dragonball/src/metric.rs new file mode 100644 index 0000000000..716e9e0440 --- /dev/null +++ b/src/dragonball/src/metric.rs @@ -0,0 +1,58 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use dbs_utils::metric::SharedIncMetric; +use lazy_static::lazy_static; +use serde::Serialize; + +pub use dbs_utils::metric::IncMetric; + +lazy_static! { + /// Static instance used for handling metrics. + pub static ref METRICS: DragonballMetrics = DragonballMetrics::default(); +} + +/// Metrics specific to VCPUs' mode of functioning. +#[derive(Default, Serialize)] +pub struct VcpuMetrics { + /// Number of KVM exits for handling input IO. + pub exit_io_in: SharedIncMetric, + /// Number of KVM exits for handling output IO. + pub exit_io_out: SharedIncMetric, + /// Number of KVM exits for handling MMIO reads. + pub exit_mmio_read: SharedIncMetric, + /// Number of KVM exits for handling MMIO writes. + pub exit_mmio_write: SharedIncMetric, + /// Number of errors during this VCPU's run. + pub failures: SharedIncMetric, + /// Failures in configuring the CPUID. + pub filter_cpuid: SharedIncMetric, +} + +/// Metrics for the seccomp filtering. 
+#[derive(Default, Serialize)] +pub struct SeccompMetrics { + /// Number of errors inside the seccomp filtering. + pub num_faults: SharedIncMetric, +} + +/// Metrics related to signals. +#[derive(Default, Serialize)] +pub struct SignalMetrics { + /// Number of times that SIGBUS was handled. + pub sigbus: SharedIncMetric, + /// Number of times that SIGSEGV was handled. + pub sigsegv: SharedIncMetric, +} + +/// Structure storing all metrics while enforcing serialization support on them. +#[derive(Default, Serialize)] +pub struct DragonballMetrics { + /// Metrics related to a vcpu's functioning. + pub vcpu: VcpuMetrics, + /// Metrics related to seccomp filtering. + pub seccomp: SeccompMetrics, + /// Metrics related to signals. + pub signals: SignalMetrics, +} diff --git a/src/dragonball/src/signal_handler.rs b/src/dragonball/src/signal_handler.rs new file mode 100644 index 0000000000..23e9ff3976 --- /dev/null +++ b/src/dragonball/src/signal_handler.rs @@ -0,0 +1,219 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use libc::{_exit, c_int, c_void, siginfo_t, SIGBUS, SIGSEGV, SIGSYS}; +use log::error; +use vmm_sys_util::signal::register_signal_handler; + +use crate::metric::{IncMetric, METRICS}; + +// The offset of `si_syscall` (offending syscall identifier) within the siginfo structure +// expressed as an `(u)int*`. +// Offset `6` for an `i32` field means that the needed information is located at `6 * sizeof(i32)`. +// See /usr/include/linux/signal.h for the C struct definition. +// See https://github.com/rust-lang/libc/issues/716 for why the offset is different in Rust. 
+const SI_OFF_SYSCALL: isize = 6; + +const SYS_SECCOMP_CODE: i32 = 1; + +extern "C" { + fn __libc_current_sigrtmin() -> c_int; + fn __libc_current_sigrtmax() -> c_int; +} + +/// Gets current sigrtmin +pub fn sigrtmin() -> c_int { + unsafe { __libc_current_sigrtmin() } +} + +/// Gets current sigrtmax +pub fn sigrtmax() -> c_int { + unsafe { __libc_current_sigrtmax() } +} + +/// Signal handler for `SIGSYS`. +/// +/// Increments the `seccomp.num_faults` metric, logs an error message and terminates the process +/// with a specific exit code. +extern "C" fn sigsys_handler(num: c_int, info: *mut siginfo_t, _unused: *mut c_void) { + // Safe because we're just reading some fields from a supposedly valid argument. + let si_signo = unsafe { (*info).si_signo }; + let si_code = unsafe { (*info).si_code }; + + // Sanity check. The condition should never be true. + if num != si_signo || num != SIGSYS || si_code != SYS_SECCOMP_CODE as i32 { + // Safe because we're terminating the process anyway. + unsafe { _exit(i32::from(super::EXIT_CODE_UNEXPECTED_ERROR)) }; + } + + // Other signals which might do async unsafe things incompatible with the rest of this + // function are blocked due to the sa_mask used when registering the signal handler. + let syscall = unsafe { *(info as *const i32).offset(SI_OFF_SYSCALL) as usize }; + // SIGSYS is triggered when bad syscalls are detected. num_faults is only added when SIGSYS is detected + // so it actually only collects the count for bad syscalls. + METRICS.seccomp.num_faults.inc(); + error!( + "Shutting down VM after intercepting a bad syscall ({}).", + syscall + ); + + // Safe because we're terminating the process anyway. We don't actually do anything when + // running unit tests. + #[cfg(not(test))] + unsafe { + _exit(i32::from(super::EXIT_CODE_BAD_SYSCALL)) + }; +} + +/// Signal handler for `SIGBUS` and `SIGSEGV`. +/// +/// Logs an error message and terminates the process with a specific exit code. 
+extern "C" fn sigbus_sigsegv_handler(num: c_int, info: *mut siginfo_t, _unused: *mut c_void) { + // Safe because we're just reading some fields from a supposedly valid argument. + let si_signo = unsafe { (*info).si_signo }; + let si_code = unsafe { (*info).si_code }; + + // Sanity check. The condition should never be true. + if num != si_signo || (num != SIGBUS && num != SIGSEGV) { + // Safe because we're terminating the process anyway. + unsafe { _exit(i32::from(super::EXIT_CODE_UNEXPECTED_ERROR)) }; + } + + // Other signals which might do async unsafe things incompatible with the rest of this + // function are blocked due to the sa_mask used when registering the signal handler. + match si_signo { + SIGBUS => METRICS.signals.sigbus.inc(), + SIGSEGV => METRICS.signals.sigsegv.inc(), + _ => (), + } + + error!( + "Shutting down VM after intercepting signal {}, code {}.", + si_signo, si_code + ); + + // Safe because we're terminating the process anyway. We don't actually do anything when + // running unit tests. + #[cfg(not(test))] + unsafe { + _exit(i32::from(match si_signo { + SIGBUS => super::EXIT_CODE_SIGBUS, + SIGSEGV => super::EXIT_CODE_SIGSEGV, + _ => super::EXIT_CODE_UNEXPECTED_ERROR, + })) + }; +} + +/// Registers all the required signal handlers. +/// +/// Custom handlers are installed for: `SIGBUS`, `SIGSEGV`, `SIGSYS`. +pub fn register_signal_handlers() -> vmm_sys_util::errno::Result<()> { + // Call to unsafe register_signal_handler which is considered unsafe because it will + // register a signal handler which will be called in the current thread and will interrupt + // whatever work is done on the current thread, so we have to keep in mind that the registered + // signal handler must only do async-signal-safe operations. 
+ register_signal_handler(SIGSYS, sigsys_handler)?; + register_signal_handler(SIGBUS, sigbus_sigsegv_handler)?; + register_signal_handler(SIGSEGV, sigbus_sigsegv_handler)?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + use libc::{cpu_set_t, syscall}; + use std::convert::TryInto; + use std::{mem, process, thread}; + + use seccompiler::{apply_filter, BpfProgram, SeccompAction, SeccompFilter}; + + // This function is used when running unit tests, so all the unsafes are safe. + fn cpu_count() -> usize { + let mut cpuset: cpu_set_t = unsafe { mem::zeroed() }; + unsafe { + libc::CPU_ZERO(&mut cpuset); + } + let ret = unsafe { + libc::sched_getaffinity( + 0, + mem::size_of::(), + &mut cpuset as *mut cpu_set_t, + ) + }; + assert_eq!(ret, 0); + + let mut num = 0; + for i in 0..libc::CPU_SETSIZE as usize { + if unsafe { libc::CPU_ISSET(i, &cpuset) } { + num += 1; + } + } + num + } + + #[test] + fn test_signal_handler() { + let child = thread::spawn(move || { + assert!(register_signal_handlers().is_ok()); + + let filter = SeccompFilter::new( + vec![ + (libc::SYS_brk, vec![]), + (libc::SYS_exit, vec![]), + (libc::SYS_futex, vec![]), + (libc::SYS_getpid, vec![]), + (libc::SYS_munmap, vec![]), + (libc::SYS_kill, vec![]), + (libc::SYS_rt_sigprocmask, vec![]), + (libc::SYS_rt_sigreturn, vec![]), + (libc::SYS_sched_getaffinity, vec![]), + (libc::SYS_set_tid_address, vec![]), + (libc::SYS_sigaltstack, vec![]), + (libc::SYS_write, vec![]), + ] + .into_iter() + .collect(), + SeccompAction::Trap, + SeccompAction::Allow, + std::env::consts::ARCH.try_into().unwrap(), + ) + .unwrap(); + + assert!(apply_filter(&TryInto::::try_into(filter).unwrap()).is_ok()); + assert_eq!(METRICS.seccomp.num_faults.count(), 0); + + // Call the blacklisted `SYS_mkdirat`. + unsafe { syscall(libc::SYS_mkdirat, "/foo/bar\0") }; + + // Call SIGBUS signal handler. 
+ assert_eq!(METRICS.signals.sigbus.count(), 0); + unsafe { + syscall(libc::SYS_kill, process::id(), SIGBUS); + } + + // Call SIGSEGV signal handler. + assert_eq!(METRICS.signals.sigsegv.count(), 0); + unsafe { + syscall(libc::SYS_kill, process::id(), SIGSEGV); + } + }); + assert!(child.join().is_ok()); + + // Sanity check. + assert!(cpu_count() > 0); + // Kcov somehow messes with our handler getting the SIGSYS signal when a bad syscall + // is caught, so the following assertion no longer holds. Ideally, we'd have a surefire + // way of either preventing this behaviour, or detecting for certain whether this test is + // run by kcov or not. The best we could do so far is to look at the perceived number of + // available CPUs. Kcov seems to make a single CPU available to the process running the + // tests, so we use this as an heuristic to decide if we check the assertion. + if cpu_count() > 1 { + // The signal handler should let the program continue during unit tests. + assert!(METRICS.seccomp.num_faults.count() >= 1); + } + assert!(METRICS.signals.sigbus.count() >= 1); + assert!(METRICS.signals.sigsegv.count() >= 1); + } +} diff --git a/src/dragonball/src/vcpu/aarch64.rs b/src/dragonball/src/vcpu/aarch64.rs new file mode 100644 index 0000000000..8e78efdc10 --- /dev/null +++ b/src/dragonball/src/vcpu/aarch64.rs @@ -0,0 +1,94 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. 
+ +use std::sync::mpsc::{channel, Sender}; +use std::sync::Arc; + +use crate::IoManagerCached; +use dbs_utils::time::TimestampUs; +use kvm_ioctls::{VcpuFd, VmFd}; +use vm_memory::GuestAddress; +use vmm_sys_util::eventfd::EventFd; + +use crate::address_space_manager::GuestAddressSpaceImpl; +use crate::vcpu::vcpu_impl::{Result, Vcpu, VcpuStateEvent}; +use crate::vcpu::VcpuConfig; + +#[allow(unused)] +impl Vcpu { + /// Constructs a new VCPU for `vm`. + /// + /// # Arguments + /// + /// * `id` - Represents the CPU number between [0, max vcpus). + /// * `vcpu_fd` - The kvm `VcpuFd` for the vcpu. + /// * `io_mgr` - The io-manager used to access port-io and mmio devices. + /// * `exit_evt` - An `EventFd` that will be written into when this vcpu + /// exits. + /// * `vcpu_state_event` - The eventfd which can notify vmm state of some + /// vcpu should change. + /// * `vcpu_state_sender` - The channel to send state change message from + /// vcpu thread to vmm thread. + /// * `create_ts` - A timestamp used by the vcpu to calculate its lifetime. + /// * `support_immediate_exit` - whether kvm supports the immediate_exit flag. + pub fn new_aarch64( + id: u8, + vcpu_fd: Arc, + io_mgr: IoManagerCached, + exit_evt: EventFd, + vcpu_state_event: EventFd, + vcpu_state_sender: Sender, + create_ts: TimestampUs, + support_immediate_exit: bool, + ) -> Result { + let (event_sender, event_receiver) = channel(); + let (response_sender, response_receiver) = channel(); + + Ok(Vcpu { + fd: vcpu_fd, + id, + io_mgr, + create_ts, + event_receiver, + event_sender: Some(event_sender), + response_receiver: Some(response_receiver), + response_sender, + vcpu_state_event, + vcpu_state_sender, + support_immediate_exit, + mpidr: 0, + exit_evt, + }) + } + + /// Configures an aarch64 specific vcpu. + /// + /// # Arguments + /// + /// * `vcpu_config` - vCPU config for this vCPU status + /// * `vm_fd` - The kvm `VmFd` for this microvm. + /// * `vm_as` - The guest memory address space used by this microvm. 
+ /// * `kernel_load_addr` - Offset from `guest_mem` at which the kernel is loaded. + /// * `_pgtable_addr` - pgtable address for ap vcpu (not used in aarch64) + pub fn configure( + &mut self, + _vcpu_config: &VcpuConfig, + vm_fd: &VmFd, + vm_as: &GuestAddressSpaceImpl, + kernel_load_addr: Option, + _pgtable_addr: Option, + ) -> Result<()> { + // TODO: add arm vcpu configure() function. issue: #4445 + Ok(()) + } + + /// Gets the MPIDR register value. + pub fn get_mpidr(&self) -> u64 { + self.mpidr + } +} diff --git a/src/dragonball/src/vcpu/mod.rs b/src/dragonball/src/vcpu/mod.rs new file mode 100644 index 0000000000..d1075e734d --- /dev/null +++ b/src/dragonball/src/vcpu/mod.rs @@ -0,0 +1,32 @@ +// Copyright (C) 2022 Alibaba Cloud Computing. All rights reserved. +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +mod sm; +pub mod vcpu_impl; +pub mod vcpu_manager; + +#[cfg(target_arch = "x86_64")] +use dbs_arch::cpuid::VpmuFeatureLevel; + +/// vcpu config collection +pub struct VcpuConfig { + /// initial vcpu count + pub boot_vcpu_count: u8, + /// max vcpu count for hotplug + pub max_vcpu_count: u8, + /// threads per core for cpu topology information + pub threads_per_core: u8, + /// cores per die for cpu topology information + pub cores_per_die: u8, + /// dies per socket for cpu topology information + pub dies_per_socket: u8, + /// socket number for cpu topology information + pub sockets: u8, + /// if vpmu feature is Disabled, it means vpmu feature is off (by default) + /// if vpmu feature is LimitedlyEnabled, it means minimal vpmu counters are supported (cycles and instructions) + /// if vpmu feature is FullyEnabled, it means all vpmu counters are supported + #[cfg(target_arch = "x86_64")] + pub vpmu_feature: VpmuFeatureLevel, +} diff --git a/src/dragonball/src/vcpu/sm.rs b/src/dragonball/src/vcpu/sm.rs new file mode 100644 index 0000000000..2a51d64083 --- /dev/null +++ 
b/src/dragonball/src/vcpu/sm.rs @@ -0,0 +1,149 @@ +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::ops::Deref; + +/// Simple abstraction of a state machine. +/// +/// `StateMachine` is a wrapper over `T` that also encodes state information for `T`. +/// +/// Each state for `T` is represented by a `StateFn` which is a function that acts as +/// the state handler for that particular state of `T`. +/// +/// `StateFn` returns exactly one other `StateMachine` thus each state gets clearly +/// defined transitions to other states. +pub struct StateMachine { + function: StateFn, + end_state: bool, +} + +/// Type representing a state handler of a `StateMachine` machine. Each state handler +/// is a function from `T` that handles a specific state of `T`. +type StateFn = fn(&mut T) -> StateMachine; + +impl StateMachine { + /// Creates a new state wrapper. + /// + /// # Arguments + /// + /// `function` - the state handler for this state. + /// `end_state` - whether this state is final. + pub fn new(function: StateFn, end_state: bool) -> StateMachine { + StateMachine { + function, + end_state, + } + } + + /// Creates a new state wrapper that has further possible transitions. + /// + /// # Arguments + /// + /// `function` - the state handler for this state. + pub fn next(function: StateFn) -> StateMachine { + StateMachine::new(function, false) + } + + /// Creates a new state wrapper that has no further transitions. The state machine + /// will finish after running this handler. + /// + /// # Arguments + /// + /// `function` - the state handler for this last state. + pub fn finish(function: StateFn) -> StateMachine { + StateMachine::new(function, true) + } + + /// Runs a state machine for `T` starting from the provided state. + /// + /// # Arguments + /// + /// `machine` - a mutable reference to the object running through the various states. 
+ /// `starting_state_fn` - a `fn(&mut T) -> StateMachine` that should be the handler for + /// the initial state. + pub fn run(machine: &mut T, starting_state_fn: StateFn) { + // Start off in the `starting_state` state. + let mut sf = StateMachine::new(starting_state_fn, false); + // While current state is not a final/end state, keep churning. + while !sf.end_state { + // Run the current state handler, and get the next one. + sf = sf(machine); + } + } +} + +// Implement Deref of `StateMachine` so that we can directly call its underlying state handler. +impl Deref for StateMachine { + type Target = StateFn; + fn deref(&self) -> &Self::Target { + &self.function + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // DummyMachine with states `s1`, `s2` and `s3`. + struct DummyMachine { + private_data_s1: bool, + private_data_s2: bool, + private_data_s3: bool, + } + + impl DummyMachine { + fn new() -> Self { + DummyMachine { + private_data_s1: false, + private_data_s2: false, + private_data_s3: false, + } + } + + // DummyMachine functions here. + + // Simple state-machine: start->s1->s2->s3->done. + fn run(&mut self) { + // Verify the machine has not run yet. + assert!(!self.private_data_s1); + assert!(!self.private_data_s2); + assert!(!self.private_data_s3); + + // Run the state-machine. + StateMachine::run(self, Self::s1); + + // Verify the machine went through all states. + assert!(self.private_data_s1); + assert!(self.private_data_s2); + assert!(self.private_data_s3); + } + + fn s1(&mut self) -> StateMachine { + // Verify private data mutates along with the states. + assert!(!self.private_data_s1); + self.private_data_s1 = true; + StateMachine::next(Self::s2) + } + + fn s2(&mut self) -> StateMachine { + // Verify private data mutates along with the states. + assert!(!self.private_data_s2); + self.private_data_s2 = true; + StateMachine::next(Self::s3) + } + + fn s3(&mut self) -> StateMachine { + // Verify private data mutates along with the states. 
+ assert!(!self.private_data_s3); + self.private_data_s3 = true; + // The machine ends here, adding `s1` as next state to validate this. + StateMachine::finish(Self::s1) + } + } + + #[test] + fn test_sm() { + let mut machine = DummyMachine::new(); + machine.run(); + } +} diff --git a/src/dragonball/src/vcpu/vcpu_impl.rs b/src/dragonball/src/vcpu/vcpu_impl.rs new file mode 100644 index 0000000000..7c39ca2805 --- /dev/null +++ b/src/dragonball/src/vcpu/vcpu_impl.rs @@ -0,0 +1,955 @@ +// Copyright (C) 2019-2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +//! The implementation for per vcpu + +use std::cell::Cell; +use std::result; +use std::sync::atomic::{fence, Ordering}; +use std::sync::mpsc::{Receiver, Sender, TryRecvError}; +use std::sync::{Arc, Barrier}; +use std::thread; + +use dbs_utils::time::TimestampUs; +use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN}; +use kvm_ioctls::{VcpuExit, VcpuFd}; +use libc::{c_int, c_void, siginfo_t}; +use log::{error, info}; +use seccompiler::{apply_filter, BpfProgram, Error as SecError}; +use vmm_sys_util::eventfd::EventFd; +use vmm_sys_util::signal::{register_signal_handler, Killable}; + +use super::sm::StateMachine; +use crate::metric::{IncMetric, METRICS}; +use crate::signal_handler::sigrtmin; +use crate::IoManagerCached; + +#[cfg(target_arch = "x86_64")] +#[path = "x86_64.rs"] +mod x86_64; + +#[cfg(target_arch = "aarch64")] +#[path = "aarch64.rs"] +mod aarch64; + +#[cfg(target_arch = "x86_64")] +const MAGIC_IOPORT_BASE: u16 = 0xdbdb; +#[cfg(target_arch = "x86_64")] +const MAGIC_IOPORT_DEBUG_INFO: u16 = MAGIC_IOPORT_BASE; + +/// Signal number (SIGRTMIN) used to kick Vcpus. 
+pub const VCPU_RTSIG_OFFSET: i32 = 0; + +#[cfg(target_arch = "x86_64")] +/// Errors associated with the wrappers over KVM ioctls. +#[derive(Debug, thiserror::Error)] +pub enum VcpuError { + /// Failed to signal Vcpu. + #[error("cannot signal the vCPU thread")] + SignalVcpu(#[source] vmm_sys_util::errno::Error), + + /// Cannot open the vCPU file descriptor. + #[error("cannot open the vCPU file descriptor")] + VcpuFd(#[source] kvm_ioctls::Error), + + /// Cannot spawn a new vCPU thread. + #[error("cannot spawn vCPU thread")] + VcpuSpawn(#[source] std::io::Error), + + /// Cannot cleanly initialize vCPU TLS. + #[error("cannot cleanly initialize TLS fro vCPU")] + VcpuTlsInit, + + /// Vcpu not present in TLS. + #[error("vCPU not present in the TLS")] + VcpuTlsNotPresent, + + /// Unexpected KVM_RUN exit reason + #[error("Unexpected KVM_RUN exit reason")] + VcpuUnhandledKvmExit, + + /// Pause vcpu failed + #[error("failed to pause vcpus")] + PauseFailed, + + /// Kvm Ioctl Error + #[error("failure in issuing KVM ioctl command")] + Kvm(#[source] kvm_ioctls::Error), + + /// Msr error + #[error("failure to deal with MSRs")] + Msr(vmm_sys_util::fam::Error), + + /// A call to cpuid instruction failed on x86_64. + #[error("failure while configuring CPUID for virtual CPU on x86_64")] + CpuId(dbs_arch::cpuid::Error), + + /// Error configuring the floating point related registers on x86_64. + #[error("failure while configuring the floating point related registers on x86_64")] + FPUConfiguration(dbs_arch::regs::Error), + + /// Cannot set the local interruption due to bad configuration on x86_64. + #[error("cannot set the local interruption due to bad configuration on x86_64")] + LocalIntConfiguration(dbs_arch::interrupts::Error), + + /// Error configuring the MSR registers on x86_64. + #[error("failure while configuring the MSR registers on x86_64")] + MSRSConfiguration(dbs_arch::regs::Error), + + /// Error configuring the general purpose registers on x86_64. 
+ #[error("failure while configuring the general purpose registers on x86_64")] + REGSConfiguration(dbs_arch::regs::Error), + + /// Error configuring the special registers on x86_64. + #[error("failure while configuring the special registers on x86_64")] + SREGSConfiguration(dbs_arch::regs::Error), + + /// Error configuring the page table on x86_64. + #[error("failure while configuring the page table on x86_64")] + PageTable(dbs_boot::Error), + + /// The call to KVM_SET_CPUID2 failed on x86_64. + #[error("failure while calling KVM_SET_CPUID2 on x86_64")] + SetSupportedCpusFailed(#[source] kvm_ioctls::Error), +} + +#[cfg(target_arch = "aarch64")] +/// Errors associated with the wrappers over KVM ioctls. +#[derive(Debug, thiserror::Error)] +pub enum VcpuError { + /// Failed to signal Vcpu. + #[error("cannot signal the vCPU thread")] + SignalVcpu(#[source] vmm_sys_util::errno::Error), + + /// Cannot open the vCPU file descriptor. + #[error("cannot open the vCPU file descriptor")] + VcpuFd(#[source] kvm_ioctls::Error), + + /// Cannot spawn a new vCPU thread. + #[error("cannot spawn vCPU thread")] + VcpuSpawn(#[source] std::io::Error), + + /// Cannot cleanly initialize vCPU TLS. + #[error("cannot cleanly initialize TLS fro vCPU")] + VcpuTlsInit, + + /// Vcpu not present in TLS. + #[error("vCPU not present in the TLS")] + VcpuTlsNotPresent, + + /// Unexpected KVM_RUN exit reason + #[error("Unexpected KVM_RUN exit reason")] + VcpuUnhandledKvmExit, + + /// Pause vcpu failed + #[error("failed to pause vcpus")] + PauseFailed, + + /// Kvm Ioctl Error + #[error("failure in issuing KVM ioctl command")] + Kvm(#[source] kvm_ioctls::Error), + + /// Msr error + #[error("failure to deal with MSRs")] + Msr(vmm_sys_util::fam::Error), + + #[cfg(target_arch = "aarch64")] + /// Error configuring the general purpose aarch64 registers on aarch64. 
+#[error("failure while configuring the general purpose registers on aarch64")] + REGSConfiguration(dbs_arch::regs::Error), + + #[cfg(target_arch = "aarch64")] + /// Error setting up the global interrupt controller on aarch64. + #[error("failure while setting up the global interrupt controller on aarch64")] + SetupGIC(dbs_arch::gic::Error), + + #[cfg(target_arch = "aarch64")] + /// Error getting the Vcpu preferred target on aarch64. + #[error("failure while getting the vCPU preferred target on aarch64")] + VcpuArmPreferredTarget(kvm_ioctls::Error), + + #[cfg(target_arch = "aarch64")] + /// Error doing vCPU Init on aarch64. + #[error("failure while doing vCPU init on aarch64")] + VcpuArmInit(kvm_ioctls::Error), +} + +/// Result for Vcpu related operations. +pub type Result = result::Result; + +/// List of events that the Vcpu can receive. +#[derive(Debug)] +pub enum VcpuEvent { + /// Kill the Vcpu. + Exit, + /// Pause the Vcpu. + Pause, + /// Event that should resume the Vcpu. + Resume, + /// Get vcpu thread tid + Gettid, + + /// Event to revalidate vcpu IoManager cache + RevalidateCache, +} + +/// List of responses that the Vcpu reports. +pub enum VcpuResponse { + /// Vcpu is paused. + Paused, + /// Vcpu is resumed. + Resumed, + /// Vcpu index and thread tid. + Tid(u8, u32), + /// Requested Vcpu operation is not allowed. + NotAllowed, + /// Requested action encountered an error + Error(VcpuError), + /// Vcpu IoManager cache is revalidated + CacheRevalidated, +} + +/// List of events that the vcpu_state_sender can send. +pub enum VcpuStateEvent { + /// (result, response) for hotplug, result 0 means failure, 1 means success. + Hotplug((i32, u32)), +} + +/// Wrapper over vCPU that hides the underlying interactions with the vCPU thread. 
+pub struct VcpuHandle { + event_sender: Sender, + response_receiver: Receiver, + vcpu_thread: thread::JoinHandle<()>, +} + +impl VcpuHandle { + /// Send event to vCPU thread + pub fn send_event(&self, event: VcpuEvent) -> Result<()> { + // Use expect() to crash if the other thread closed this channel. + self.event_sender + .send(event) + .expect("event sender channel closed on vcpu end."); + // Kick the vCPU so it picks up the message. + self.vcpu_thread + .kill(sigrtmin() + VCPU_RTSIG_OFFSET) + .map_err(VcpuError::SignalVcpu)?; + Ok(()) + } + + /// Receive response from vcpu thread + pub fn response_receiver(&self) -> &Receiver { + &self.response_receiver + } + + #[allow(dead_code)] + /// Join the vcpu thread + pub fn join_vcpu_thread(self) -> thread::Result<()> { + self.vcpu_thread.join() + } +} + +#[derive(PartialEq)] +enum VcpuEmulation { + Handled, + Interrupted, + Stopped, +} + +/// A wrapper around creating and using a kvm-based VCPU. +pub struct Vcpu { + // vCPU fd used by the vCPU + fd: Arc, + // vCPU id info + id: u8, + // Io manager Cached for facilitating IO operations + io_mgr: IoManagerCached, + // Records vCPU create time stamp + create_ts: TimestampUs, + + // The receiving end of events channel owned by the vcpu side. + event_receiver: Receiver, + // The transmitting end of the events channel which will be given to the handler. + event_sender: Option>, + // The receiving end of the responses channel which will be given to the handler. + response_receiver: Option>, + // The transmitting end of the responses channel owned by the vcpu side. + response_sender: Sender, + // Event notifier for CPU hotplug. + // After arm adapts to hotplug vcpu, the dead code macro needs to be removed + #[cfg_attr(target_arch = "aarch64", allow(dead_code))] + vcpu_state_event: EventFd, + // CPU hotplug events. 
+ // After arm adapts to hotplug vcpu, the dead code macro needs to be removed + #[cfg_attr(target_arch = "aarch64", allow(dead_code))] + vcpu_state_sender: Sender, + + // An `EventFd` that will be written into when this vcpu exits. + exit_evt: EventFd, + // Whether kvm used supports immediate_exit flag. + support_immediate_exit: bool, + + // CPUID information for the x86_64 CPU + #[cfg(target_arch = "x86_64")] + cpuid: kvm_bindings::CpuId, + + /// Multiprocessor affinity register recorded for aarch64 + #[cfg(target_arch = "aarch64")] + pub(crate) mpidr: u64, +} + +// Using this for easier explicit type-casting to help IDEs interpret the code. +type VcpuCell = Cell>; + +impl Vcpu { + thread_local!(static TLS_VCPU_PTR: VcpuCell = Cell::new(None)); + + /// Associates `self` with the current thread. + /// + /// It is a prerequisite to successfully run `init_thread_local_data()` before using + /// `run_on_thread_local()` on the current thread. + /// This function will return an error if there already is a `Vcpu` present in the TLS. + fn init_thread_local_data(&mut self) -> Result<()> { + Self::TLS_VCPU_PTR.with(|cell: &VcpuCell| { + if cell.get().is_some() { + return Err(VcpuError::VcpuTlsInit); + } + cell.set(Some(self as *const Vcpu)); + Ok(()) + }) + } + + /// Deassociates `self` from the current thread. + /// + /// Should be called if the current `self` had called `init_thread_local_data()` and + /// now needs to move to a different thread. + /// + /// Fails if `self` was not previously associated with the current thread. + fn reset_thread_local_data(&mut self) -> Result<()> { + // Best-effort to clean up TLS. If the `Vcpu` was moved to another thread + // _before_ running this, then there is nothing we can do. 
+ Self::TLS_VCPU_PTR.with(|cell: &VcpuCell| { + if let Some(vcpu_ptr) = cell.get() { + if vcpu_ptr == self as *const Vcpu { + Self::TLS_VCPU_PTR.with(|cell: &VcpuCell| cell.take()); + return Ok(()); + } + } + Err(VcpuError::VcpuTlsNotPresent) + }) + } + + /// Runs `func` for the `Vcpu` associated with the current thread. + /// + /// It requires that `init_thread_local_data()` was run on this thread. + /// + /// Fails if there is no `Vcpu` associated with the current thread. + /// + /// # Safety + /// + /// This is marked unsafe as it allows temporary aliasing through + /// dereferencing from pointer an already borrowed `Vcpu`. + unsafe fn run_on_thread_local(func: F) -> Result<()> + where + F: FnOnce(&Vcpu), + { + Self::TLS_VCPU_PTR.with(|cell: &VcpuCell| { + if let Some(vcpu_ptr) = cell.get() { + // Dereferencing here is safe since `TLS_VCPU_PTR` is populated/non-empty, + // and it is being cleared on `Vcpu::drop` so there is no dangling pointer. + let vcpu_ref: &Vcpu = &*vcpu_ptr; + func(vcpu_ref); + Ok(()) + } else { + Err(VcpuError::VcpuTlsNotPresent) + } + }) + } + + /// Registers a signal handler which makes use of TLS and kvm immediate exit to + /// kick the vcpu running on the current thread, if there is one. + pub fn register_kick_signal_handler() { + extern "C" fn handle_signal(_: c_int, _: *mut siginfo_t, _: *mut c_void) { + // This is safe because it's temporarily aliasing the `Vcpu` object, but we are + // only reading `vcpu.fd` which does not change for the lifetime of the `Vcpu`. + unsafe { + let _ = Vcpu::run_on_thread_local(|vcpu| { + vcpu.fd.set_kvm_immediate_exit(1); + fence(Ordering::Release); + }); + } + } + + register_signal_handler(sigrtmin() + VCPU_RTSIG_OFFSET, handle_signal) + .expect("Failed to register vcpu signal handler"); + } + + /// Returns the cpu index as seen by the guest OS. + pub fn cpu_index(&self) -> u8 { + self.id + } + + /// Moves the vcpu to its own thread and constructs a VcpuHandle. 
+ /// The handle can be used to control the remote vcpu. + pub fn start_threaded( + mut self, + seccomp_filter: BpfProgram, + barrier: Arc, + ) -> Result { + let event_sender = self.event_sender.take().unwrap(); + let response_receiver = self.response_receiver.take().unwrap(); + + let vcpu_thread = thread::Builder::new() + .name(format!("db_vcpu{}", self.cpu_index())) + .spawn(move || { + self.init_thread_local_data() + .expect("Cannot cleanly initialize vcpu TLS."); + barrier.wait(); + self.run(seccomp_filter); + }) + .map_err(VcpuError::VcpuSpawn)?; + + Ok(VcpuHandle { + event_sender, + response_receiver, + vcpu_thread, + }) + } + + /// Extract the vcpu running logic for test mocking. + #[cfg(not(test))] + pub fn emulate(fd: &VcpuFd) -> std::result::Result, kvm_ioctls::Error> { + fd.run() + } + + /// Runs the vCPU in KVM context and handles the kvm exit reason. + /// + /// Returns error or enum specifying whether emulation was handled or interrupted. + fn run_emulation(&mut self) -> Result { + match Vcpu::emulate(&self.fd) { + Ok(run) => match run { + #[cfg(target_arch = "x86_64")] + VcpuExit::IoIn(addr, data) => { + let _ = self.io_mgr.pio_read(addr, data); + METRICS.vcpu.exit_io_in.inc(); + Ok(VcpuEmulation::Handled) + } + #[cfg(target_arch = "x86_64")] + VcpuExit::IoOut(addr, data) => { + if !self.check_io_port_info(addr, data)? 
{ + let _ = self.io_mgr.pio_write(addr, data); + } + METRICS.vcpu.exit_io_out.inc(); + Ok(VcpuEmulation::Handled) + } + VcpuExit::MmioRead(addr, data) => { + let _ = self.io_mgr.mmio_read(addr, data); + METRICS.vcpu.exit_mmio_read.inc(); + Ok(VcpuEmulation::Handled) + } + VcpuExit::MmioWrite(addr, data) => { + #[cfg(target_arch = "aarch64")] + self.check_boot_complete_signal(addr, data); + + let _ = self.io_mgr.mmio_write(addr, data); + METRICS.vcpu.exit_mmio_write.inc(); + Ok(VcpuEmulation::Handled) + } + VcpuExit::Hlt => { + info!("Received KVM_EXIT_HLT signal"); + Err(VcpuError::VcpuUnhandledKvmExit) + } + VcpuExit::Shutdown => { + info!("Received KVM_EXIT_SHUTDOWN signal"); + Err(VcpuError::VcpuUnhandledKvmExit) + } + // Documentation specifies that below kvm exits are considered errors. + VcpuExit::FailEntry => { + METRICS.vcpu.failures.inc(); + error!("Received KVM_EXIT_FAIL_ENTRY signal"); + Err(VcpuError::VcpuUnhandledKvmExit) + } + VcpuExit::InternalError => { + METRICS.vcpu.failures.inc(); + error!("Received KVM_EXIT_INTERNAL_ERROR signal"); + Err(VcpuError::VcpuUnhandledKvmExit) + } + VcpuExit::SystemEvent(event_type, event_flags) => match event_type { + KVM_SYSTEM_EVENT_RESET | KVM_SYSTEM_EVENT_SHUTDOWN => { + info!( + "Received KVM_SYSTEM_EVENT: type: {}, event: {}", + event_type, event_flags + ); + Ok(VcpuEmulation::Stopped) + } + _ => { + METRICS.vcpu.failures.inc(); + error!( + "Received KVM_SYSTEM_EVENT signal type: {}, flag: {}", + event_type, event_flags + ); + Err(VcpuError::VcpuUnhandledKvmExit) + } + }, + r => { + METRICS.vcpu.failures.inc(); + // TODO: Are we sure we want to finish running a vcpu upon + // receiving a vm exit that is not necessarily an error? + error!("Unexpected exit reason on vcpu run: {:?}", r); + Err(VcpuError::VcpuUnhandledKvmExit) + } + }, + // The unwrap on raw_os_error can only fail if we have a logic + // error in our code in which case it is better to panic. 
+ Err(ref e) => { + match e.errno() { + libc::EAGAIN => Ok(VcpuEmulation::Handled), + libc::EINTR => { + self.fd.set_kvm_immediate_exit(0); + // Notify that this KVM_RUN was interrupted. + Ok(VcpuEmulation::Interrupted) + } + _ => { + METRICS.vcpu.failures.inc(); + error!("Failure during vcpu run: {}", e); + #[cfg(target_arch = "x86_64")] + { + error!( + "dump regs: {:?}, dump sregs: {:?}", + self.fd.get_regs(), + self.fd.get_sregs() + ); + } + Err(VcpuError::VcpuUnhandledKvmExit) + } + } + } + } + } + + #[cfg(target_arch = "x86_64")] + // checkout the io port that dragonball used only + fn check_io_port_info(&self, addr: u16, data: &[u8]) -> Result { + let mut checked = false; + + match addr { + // debug info signal + MAGIC_IOPORT_DEBUG_INFO => { + if data.len() == 4 { + let data = unsafe { std::ptr::read(data.as_ptr() as *const u32) }; + log::warn!("KDBG: guest kernel debug info: 0x{:x}", data); + checked = true; + } + } + _ => {} + }; + + Ok(checked) + } + + fn gettid() -> u32 { + nix::unistd::gettid().as_raw() as u32 + } + + fn revalidate_cache(&mut self) -> Result<()> { + self.io_mgr.revalidate_cache(); + + Ok(()) + } + + /// Main loop of the vCPU thread. + /// + /// Runs the vCPU in KVM context in a loop. Handles KVM_EXITs then goes back in. + /// Note that the state of the VCPU and associated VM must be setup first for this to do + /// anything useful. + pub fn run(&mut self, seccomp_filter: BpfProgram) { + // Load seccomp filters for this vCPU thread. + // Execution panics if filters cannot be loaded, use --seccomp-level=0 if skipping filters + // altogether is the desired behaviour. 
+ if let Err(e) = apply_filter(&seccomp_filter) { + if matches!(e, SecError::EmptyFilter) { + info!("vCPU thread {} use empty seccomp filters.", self.id); + } else { + panic!( + "Failed to set the requested seccomp filters on vCPU {}: Error: {}", + self.id, e + ); + } + } + + info!("vcpu {} is running", self.cpu_index()); + + // Start running the machine state in the `Paused` state. + StateMachine::run(self, Self::paused); + } + + // This is the main loop of the `Running` state. + fn running(&mut self) -> StateMachine { + // This loop is here just for optimizing the emulation path. + // No point in ticking the state machine if there are no external events. + loop { + match self.run_emulation() { + // Emulation ran successfully, continue. + Ok(VcpuEmulation::Handled) => { + // We need to break here if kvm doesn't support + // immediate_exit flag. Because the signal sent from vmm + // thread may occurs when handling the vcpu exit events, and + // in this case the external vcpu events may not be handled + // correctly, so we need to check the event_receiver channel + // after handle vcpu exit events to decrease the window that + // doesn't handle the vcpu external events. + if !self.support_immediate_exit { + break; + } + } + // Emulation was interrupted, check external events. + Ok(VcpuEmulation::Interrupted) => break, + // Emulation was stopped due to reset or shutdown. + Ok(VcpuEmulation::Stopped) => return StateMachine::next(Self::waiting_exit), + // Emulation errors lead to vCPU exit. + Err(e) => { + error!("vcpu: {}, run_emulation failed: {:?}", self.id, e); + return StateMachine::next(Self::waiting_exit); + } + } + } + + // By default don't change state. + let mut state = StateMachine::next(Self::running); + + // Break this emulation loop on any transition request/external event. + match self.event_receiver.try_recv() { + // Running ---- Exit ----> Exited + Ok(VcpuEvent::Exit) => { + // Move to 'exited' state. 
+ state = StateMachine::next(Self::exited); + } + // Running ---- Pause ----> Paused + Ok(VcpuEvent::Pause) => { + // Nothing special to do. + self.response_sender + .send(VcpuResponse::Paused) + .expect("failed to send pause status"); + + // TODO: we should call `KVM_KVMCLOCK_CTRL` here to make sure + // TODO continued: the guest soft lockup watchdog does not panic on Resume. + //let _ = self.fd.kvmclock_ctrl(); + + // Move to 'paused' state. + state = StateMachine::next(Self::paused); + } + Ok(VcpuEvent::Resume) => { + self.response_sender + .send(VcpuResponse::Resumed) + .expect("failed to send resume status"); + } + Ok(VcpuEvent::Gettid) => { + self.response_sender + .send(VcpuResponse::Tid(self.cpu_index(), Vcpu::gettid())) + .expect("failed to send vcpu thread tid"); + } + Ok(VcpuEvent::RevalidateCache) => { + self.revalidate_cache() + .map(|()| { + self.response_sender + .send(VcpuResponse::CacheRevalidated) + .expect("failed to revalidate vcpu IoManager cache"); + }) + .map_err(|e| self.response_sender.send(VcpuResponse::Error(e))) + .expect("failed to revalidate vcpu IoManager cache"); + } + // Unhandled exit of the other end. + Err(TryRecvError::Disconnected) => { + // Move to 'exited' state. + state = StateMachine::next(Self::exited); + } + // All other events or lack thereof have no effect on current 'running' state. + Err(TryRecvError::Empty) => (), + } + + state + } + + // This is the main loop of the `Paused` state. + fn paused(&mut self) -> StateMachine { + match self.event_receiver.recv() { + // Paused ---- Exit ----> Exited + Ok(VcpuEvent::Exit) => { + // Move to 'exited' state. + StateMachine::next(Self::exited) + } + // Paused ---- Resume ----> Running + Ok(VcpuEvent::Resume) => { + self.response_sender + .send(VcpuResponse::Resumed) + .expect("failed to send resume status"); + // Move to 'running' state. 
+ StateMachine::next(Self::running) + } + Ok(VcpuEvent::Pause) => { + self.response_sender + .send(VcpuResponse::Paused) + .expect("failed to send pause status"); + // continue 'pause' state. + StateMachine::next(Self::paused) + } + Ok(VcpuEvent::Gettid) => { + self.response_sender + .send(VcpuResponse::Tid(self.cpu_index(), Vcpu::gettid())) + .expect("failed to send vcpu thread tid"); + StateMachine::next(Self::paused) + } + Ok(VcpuEvent::RevalidateCache) => { + self.revalidate_cache() + .map(|()| { + self.response_sender + .send(VcpuResponse::CacheRevalidated) + .expect("failed to revalidate vcpu IoManager cache"); + }) + .map_err(|e| self.response_sender.send(VcpuResponse::Error(e))) + .expect("failed to revalidate vcpu IoManager cache"); + + StateMachine::next(Self::paused) + } + // Unhandled exit of the other end. + Err(_) => { + // Move to 'exited' state. + StateMachine::next(Self::exited) + } + } + } + + // This is the main loop of the `WaitingExit` state. + fn waiting_exit(&mut self) -> StateMachine { + // trigger vmm to stop machine + if let Err(e) = self.exit_evt.write(1) { + METRICS.vcpu.failures.inc(); + error!("Failed signaling vcpu exit event: {}", e); + } + + let mut state = StateMachine::next(Self::waiting_exit); + + match self.event_receiver.recv() { + Ok(VcpuEvent::Exit) => state = StateMachine::next(Self::exited), + Ok(_) => error!( + "wrong state received in waiting exit state on vcpu {}", + self.id + ), + Err(_) => { + error!( + "vcpu channel closed in waiting exit state on vcpu {}", + self.id + ); + state = StateMachine::next(Self::exited); + } + } + + state + } + + // This is the main loop of the `Exited` state. + fn exited(&mut self) -> StateMachine { + // State machine reached its end. 
+ StateMachine::finish(Self::exited) + } +} + +impl Drop for Vcpu { + fn drop(&mut self) { + let _ = self.reset_thread_local_data(); + } +} + +#[cfg(test)] +pub mod tests { + use std::os::unix::io::AsRawFd; + use std::sync::mpsc::{channel, Receiver}; + use std::sync::Mutex; + + use arc_swap::ArcSwap; + use dbs_device::device_manager::IoManager; + use kvm_ioctls::Kvm; + use lazy_static::lazy_static; + + use super::*; + use crate::kvm_context::KvmContext; + + pub enum EmulationCase { + IoIn, + IoOut, + MmioRead, + MmioWrite, + Hlt, + Shutdown, + FailEntry, + InternalError, + Unknown, + SystemEvent(u32, u64), + Error(i32), + } + + lazy_static! { + pub static ref EMULATE_RES: Mutex = Mutex::new(EmulationCase::Unknown); + } + + impl Vcpu { + pub fn emulate(_fd: &VcpuFd) -> std::result::Result, kvm_ioctls::Error> { + let res = &*EMULATE_RES.lock().unwrap(); + match res { + EmulationCase::IoIn => Ok(VcpuExit::IoIn(0, &mut [])), + EmulationCase::IoOut => Ok(VcpuExit::IoOut(0, &[])), + EmulationCase::MmioRead => Ok(VcpuExit::MmioRead(0, &mut [])), + EmulationCase::MmioWrite => Ok(VcpuExit::MmioWrite(0, &[])), + EmulationCase::Hlt => Ok(VcpuExit::Hlt), + EmulationCase::Shutdown => Ok(VcpuExit::Shutdown), + EmulationCase::FailEntry => Ok(VcpuExit::FailEntry), + EmulationCase::InternalError => Ok(VcpuExit::InternalError), + EmulationCase::Unknown => Ok(VcpuExit::Unknown), + EmulationCase::SystemEvent(event_type, event_flags) => { + Ok(VcpuExit::SystemEvent(*event_type, *event_flags)) + } + EmulationCase::Error(e) => Err(kvm_ioctls::Error::new(*e)), + } + } + } + + #[cfg(target_arch = "x86_64")] + fn create_vcpu() -> (Vcpu, Receiver) { + // Call for kvm too frequently would cause error in some host kernel. 
+ std::thread::sleep(std::time::Duration::from_millis(5)); + + let kvm = Kvm::new().unwrap(); + let vm = Arc::new(kvm.create_vm().unwrap()); + let kvm_context = KvmContext::new(Some(kvm.as_raw_fd())).unwrap(); + let vcpu_fd = Arc::new(vm.create_vcpu(0).unwrap()); + let io_manager = IoManagerCached::new(Arc::new(ArcSwap::new(Arc::new(IoManager::new())))); + let supported_cpuid = kvm_context + .supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES) + .unwrap(); + let reset_event_fd = EventFd::new(libc::EFD_NONBLOCK).unwrap(); + let vcpu_state_event = EventFd::new(libc::EFD_NONBLOCK).unwrap(); + let (tx, rx) = channel(); + let time_stamp = TimestampUs::default(); + + let vcpu = Vcpu::new_x86_64( + 0, + vcpu_fd, + io_manager, + supported_cpuid, + reset_event_fd, + vcpu_state_event, + tx, + time_stamp, + false, + ) + .unwrap(); + + (vcpu, rx) + } + + #[cfg(target_arch = "x86_64")] + #[test] + fn test_vcpu_run_emulation() { + let (mut vcpu, _) = create_vcpu(); + + // Io in + *(EMULATE_RES.lock().unwrap()) = EmulationCase::IoIn; + let res = vcpu.run_emulation(); + assert!(matches!(res, Ok(VcpuEmulation::Handled))); + + // Io out + *(EMULATE_RES.lock().unwrap()) = EmulationCase::IoOut; + let res = vcpu.run_emulation(); + assert!(matches!(res, Ok(VcpuEmulation::Handled))); + + // Mmio read + *(EMULATE_RES.lock().unwrap()) = EmulationCase::MmioRead; + let res = vcpu.run_emulation(); + assert!(matches!(res, Ok(VcpuEmulation::Handled))); + + // Mmio write + *(EMULATE_RES.lock().unwrap()) = EmulationCase::MmioWrite; + let res = vcpu.run_emulation(); + assert!(matches!(res, Ok(VcpuEmulation::Handled))); + + // KVM_EXIT_HLT signal + *(EMULATE_RES.lock().unwrap()) = EmulationCase::Hlt; + let res = vcpu.run_emulation(); + assert!(matches!(res, Err(VcpuError::VcpuUnhandledKvmExit))); + + // KVM_EXIT_SHUTDOWN signal + *(EMULATE_RES.lock().unwrap()) = EmulationCase::Shutdown; + let res = vcpu.run_emulation(); + assert!(matches!(res, Err(VcpuError::VcpuUnhandledKvmExit))); + + // 
KVM_EXIT_FAIL_ENTRY signal + *(EMULATE_RES.lock().unwrap()) = EmulationCase::FailEntry; + let res = vcpu.run_emulation(); + assert!(matches!(res, Err(VcpuError::VcpuUnhandledKvmExit))); + + // KVM_EXIT_INTERNAL_ERROR signal + *(EMULATE_RES.lock().unwrap()) = EmulationCase::InternalError; + let res = vcpu.run_emulation(); + assert!(matches!(res, Err(VcpuError::VcpuUnhandledKvmExit))); + + // KVM_SYSTEM_EVENT_RESET + *(EMULATE_RES.lock().unwrap()) = EmulationCase::SystemEvent(KVM_SYSTEM_EVENT_RESET, 0); + let res = vcpu.run_emulation(); + assert!(matches!(res, Ok(VcpuEmulation::Stopped))); + + // KVM_SYSTEM_EVENT_SHUTDOWN + *(EMULATE_RES.lock().unwrap()) = EmulationCase::SystemEvent(KVM_SYSTEM_EVENT_SHUTDOWN, 0); + let res = vcpu.run_emulation(); + assert!(matches!(res, Ok(VcpuEmulation::Stopped))); + + // Other system event + *(EMULATE_RES.lock().unwrap()) = EmulationCase::SystemEvent(0, 0); + let res = vcpu.run_emulation(); + assert!(matches!(res, Err(VcpuError::VcpuUnhandledKvmExit))); + + // Unknown exit reason + *(EMULATE_RES.lock().unwrap()) = EmulationCase::Unknown; + let res = vcpu.run_emulation(); + assert!(matches!(res, Err(VcpuError::VcpuUnhandledKvmExit))); + + // Error: EAGAIN + *(EMULATE_RES.lock().unwrap()) = EmulationCase::Error(libc::EAGAIN); + let res = vcpu.run_emulation(); + assert!(matches!(res, Ok(VcpuEmulation::Handled))); + + // Error: EINTR + *(EMULATE_RES.lock().unwrap()) = EmulationCase::Error(libc::EINTR); + let res = vcpu.run_emulation(); + assert!(matches!(res, Ok(VcpuEmulation::Interrupted))); + + // other error + *(EMULATE_RES.lock().unwrap()) = EmulationCase::Error(libc::EINVAL); + let res = vcpu.run_emulation(); + assert!(matches!(res, Err(VcpuError::VcpuUnhandledKvmExit))); + } + + #[cfg(target_arch = "x86_64")] + #[test] + fn test_vcpu_check_io_port_info() { + let (vcpu, receiver) = create_vcpu(); + + // boot complete signal + let res = vcpu + .check_io_port_info( + MAGIC_IOPORT_SIGNAL_GUEST_BOOT_COMPLETE, + 
&[MAGIC_VALUE_SIGNAL_GUEST_BOOT_COMPLETE], + ) + .unwrap(); + assert!(res); + + // debug info signal + let res = vcpu + .check_io_port_info(MAGIC_IOPORT_DEBUG_INFO, &[0, 0, 0, 0]) + .unwrap(); + assert!(res); + } +} diff --git a/src/dragonball/src/vcpu/vcpu_manager.rs b/src/dragonball/src/vcpu/vcpu_manager.rs new file mode 100644 index 0000000000..189dfc6155 --- /dev/null +++ b/src/dragonball/src/vcpu/vcpu_manager.rs @@ -0,0 +1,1039 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. +// +// Copyright © 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +//! vCPU manager to enable bootstrap and CPU hotplug. +use std::io; +use std::os::unix::io::AsRawFd; +use std::sync::mpsc::{channel, Receiver, RecvError, RecvTimeoutError, Sender}; +use std::sync::{Arc, Barrier, Mutex, RwLock}; +use std::time::Duration; + +#[cfg(all(feature = "hotplug", feature = "dbs-upcall"))] +use dbs_upcall::{DevMgrService, UpcallClient}; +use dbs_utils::epoll_manager::{EpollManager, EventOps, EventSet, Events, MutEventSubscriber}; +use dbs_utils::time::TimestampUs; +use kvm_ioctls::{Cap, VcpuFd, VmFd}; +use log::{debug, error, info}; +use seccompiler::{apply_filter, BpfProgram, Error as SecError}; +use vm_memory::GuestAddress; +use vmm_sys_util::eventfd::EventFd; + +use crate::address_space_manager::GuestAddressSpaceImpl; +use crate::api::v1::InstanceInfo; +use crate::kvm_context::KvmContext; +use crate::vcpu::vcpu_impl::{ + Vcpu, VcpuError, VcpuEvent, VcpuHandle, VcpuResponse, VcpuStateEvent, +}; +use crate::vcpu::VcpuConfig; +use crate::vm::VmConfigInfo; +use crate::IoManagerCached; + +/// the timeout for communication with vcpu threads +const CPU_RECV_TIMEOUT_MS: u64 = 1000; + +/// vCPU manager error 
+#[derive(Debug, thiserror::Error)] +pub enum VcpuManagerError { + /// IO errors in vCPU manager + #[error("IO errors in vCPU manager {0}")] + VcpuIO(#[source] io::Error), + + /// vCPU manager is not initialized + #[error("vcpu manager is not initialized")] + VcpuManagerNotInitialized, + + /// Expected vcpu exceed max count + #[error("expected vcpu exceed max count")] + ExpectedVcpuExceedMax, + + /// vCPU not found + #[error("vcpu not found {0}")] + VcpuNotFound(u8), + + /// Cannot recv vCPU thread tid + #[error("cannot get vCPU thread id")] + VcpuGettid, + + /// vCPU pause failed. + #[error("failure while pausing vCPU thread")] + VcpuPause, + + /// vCPU resume failed. + #[error("failure while resuming vCPU thread")] + VcpuResume, + + /// vCPU save failed. + #[error("failure while save vCPU state")] + VcpuSave, + + /// Vcpu is in unexpected state. + #[error("Vcpu is in unexpected state")] + UnexpectedVcpuResponse, + + /// Vcpu not create + #[error("Vcpu is not create")] + VcpuNotCreate, + + /// The number of max_vcpu reached kvm's limitation + #[error("specified vcpu count {0} is greater than max allowed count {1} by kvm")] + MaxVcpuLimitation(u8, usize), + + /// Revalidate vcpu IoManager cache failed. + #[error("failure while revalidating vcpu IoManager cache")] + VcpuRevalidateCache, + + /// Event fd is already set so there could be some problem in the VMM if we try to reset it. + #[error("Event fd is already set for the vcpu")] + EventAlreadyExist, + + /// Response channel error + #[error("Response channel error: {0}")] + VcpuResponseChannel(RecvError), + + /// Vcpu response timeout + #[error("Vcpu response timeout: {0}")] + VcpuResponseTimeout(RecvTimeoutError), + + /// Cannot build seccomp filters. + #[error("failure while configuring seccomp filters: {0}")] + SeccompFilters(#[source] seccompiler::Error), + + /// Cannot send event to vCPU. 
+ #[error("failure while sending message to vCPU thread: {0}")] + VcpuEvent(#[source] VcpuError), + + /// vCPU Error + #[error("vcpu internal error: {0}")] + Vcpu(#[source] VcpuError), + + #[cfg(feature = "hotplug")] + /// vCPU resize error + #[error("resize vcpu error: {0}")] + VcpuResize(#[source] VcpuResizeError), + + /// Kvm Ioctl Error + #[error("failure in issuing KVM ioctl command: {0}")] + Kvm(#[source] kvm_ioctls::Error), +} + +#[cfg(feature = "hotplug")] +/// Errror associated with resize instance +#[derive(Debug, thiserror::Error)] +pub enum VcpuResizeError { + /// vcpu is in hotplug process + #[error("vcpu is in hotplug process")] + VcpuIsHotplugging, + + /// Cannot update the configuration of the microvm pre boot. + #[error("resize vcpu operation is not allowed after boot")] + UpdateNotAllowedPostBoot, + + /// Expected vcpu exceed max count + #[error("expected vcpu exceed max count")] + ExpectedVcpuExceedMax, + + /// vcpu 0 can't be removed + #[error("vcpu 0 can't be removed")] + Vcpu0CanNotBeRemoved, + + /// Lack removable vcpu + #[error("Removable vcpu not enough, removable vcpu num: {0}, number to remove: {1}, present vcpu count {2}")] + LackRemovableVcpus(u16, u16, u16), + + /// Cannot update the configuration by upcall channel. 
+ #[error("cannot update the configuration by upcall channel: {0}")] + Upcall(#[source] dbs_upcall::UpcallClientError), +} + +/// Result for vCPU manager operations +pub type Result = std::result::Result; + +#[derive(Debug, PartialEq, Copy, Clone)] +enum VcpuAction { + None, + Hotplug, + Hotunplug, +} + +/// Infos related to per vcpu +#[derive(Default)] +pub(crate) struct VcpuInfo { + pub(crate) vcpu: Option, + vcpu_fd: Option>, + handle: Option, + tid: u32, +} + +impl std::fmt::Debug for VcpuInfo { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("VcpuInfo") + .field("vcpu", &self.vcpu.is_some()) + .field("vcpu_fd", &self.vcpu_fd.is_some()) + .field("handle", &self.handle.is_some()) + .field("tid", &self.tid) + .finish() + } +} + +/// Manage all vcpu related actions +pub struct VcpuManager { + pub(crate) vcpu_infos: Vec, + vcpu_config: VcpuConfig, + vcpu_seccomp_filter: BpfProgram, + vcpu_state_event: EventFd, + vcpu_state_sender: Sender, + support_immediate_exit: bool, + + // The purpose of putting a reference of IoManager here is to simplify the + // design of the API when creating vcpus, and the IoManager has numerous OS + // resources that need to be released when vmm exits. However, since + // VcpuManager is referenced by VcpuEpollHandler and VcpuEpollHandler will + // not be released when vmm is closed, we need to release io manager + // manually when we exit all vcpus. + io_manager: Option, + shared_info: Arc>, + vm_as: GuestAddressSpaceImpl, + pub(crate) vm_fd: Arc, + + action_sycn_tx: Option>, + vcpus_in_action: (VcpuAction, Vec), + pub(crate) reset_event_fd: Option, + + #[cfg(all(feature = "hotplug", feature = "dbs-upcall"))] + upcall_channel: Option>>, + + // X86 specific fields. 
+ #[cfg(target_arch = "x86_64")] + pub(crate) supported_cpuid: kvm_bindings::CpuId, +} + +#[allow(clippy::too_many_arguments)] +impl VcpuManager { + /// Get a new VcpuManager instance + pub fn new( + vm_fd: Arc, + kvm_context: &KvmContext, + vm_config_info: &VmConfigInfo, + vm_as: GuestAddressSpaceImpl, + vcpu_seccomp_filter: BpfProgram, + shared_info: Arc>, + io_manager: IoManagerCached, + epoll_manager: EpollManager, + ) -> Result>> { + let support_immediate_exit = kvm_context.kvm().check_extension(Cap::ImmediateExit); + let max_vcpu_count = vm_config_info.max_vcpu_count; + let kvm_max_vcpu_count = kvm_context.get_max_vcpus(); + + // check the max vcpu count in kvm. max_vcpu_count is u8 and kvm_context.get_max_vcpus() + // returns usize, so convert max_vcpu_count to usize instead of converting kvm max vcpu to + // u8, to avoid wraping usize. Otherwise if kvm_max_vcpu_count is greater than 255, it'll + // be casted into a smaller number. + if max_vcpu_count as usize > kvm_max_vcpu_count { + error!( + "vcpu_manager: specified vcpu count {} is greater than max allowed count {} by kvm", + max_vcpu_count, kvm_max_vcpu_count + ); + return Err(VcpuManagerError::MaxVcpuLimitation( + max_vcpu_count, + kvm_max_vcpu_count, + )); + } + + let mut vcpu_infos = Vec::with_capacity(max_vcpu_count.into()); + vcpu_infos.resize_with(max_vcpu_count.into(), Default::default); + + let (tx, rx) = channel(); + let vcpu_state_event = + EventFd::new(libc::EFD_NONBLOCK).map_err(VcpuManagerError::VcpuIO)?; + let vcpu_state_event2 = vcpu_state_event + .try_clone() + .map_err(VcpuManagerError::VcpuIO)?; + + #[cfg(target_arch = "x86_64")] + let supported_cpuid = kvm_context + .supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES) + .map_err(VcpuManagerError::Kvm)?; + #[cfg(target_arch = "x86_64")] + let vpmu_feature_level = match vm_config_info.vpmu_feature { + 1 => dbs_arch::cpuid::VpmuFeatureLevel::LimitedlyEnabled, + 2 => dbs_arch::cpuid::VpmuFeatureLevel::FullyEnabled, + _ => 
dbs_arch::cpuid::VpmuFeatureLevel::Disabled, + }; + + let vcpu_manager = Arc::new(Mutex::new(VcpuManager { + vcpu_infos, + vcpu_config: VcpuConfig { + boot_vcpu_count: vm_config_info.vcpu_count, + max_vcpu_count, + threads_per_core: vm_config_info.cpu_topology.threads_per_core, + cores_per_die: vm_config_info.cpu_topology.cores_per_die, + dies_per_socket: vm_config_info.cpu_topology.dies_per_socket, + sockets: vm_config_info.cpu_topology.sockets, + #[cfg(target_arch = "x86_64")] + vpmu_feature: vpmu_feature_level, + }, + vcpu_seccomp_filter, + vcpu_state_event, + vcpu_state_sender: tx, + support_immediate_exit, + io_manager: Some(io_manager), + shared_info, + vm_as, + vm_fd, + action_sycn_tx: None, + vcpus_in_action: (VcpuAction::None, Vec::new()), + reset_event_fd: None, + #[cfg(all(feature = "hotplug", feature = "dbs-upcall"))] + upcall_channel: None, + #[cfg(target_arch = "x86_64")] + supported_cpuid, + })); + + let handler = Box::new(VcpuEpollHandler { + vcpu_manager: vcpu_manager.clone(), + eventfd: vcpu_state_event2, + rx, + }); + epoll_manager.add_subscriber(handler); + + Ok(vcpu_manager) + } + + /// get vcpu instances in vcpu manager + pub fn vcpus(&self) -> Vec<&Vcpu> { + let mut vcpus = Vec::new(); + for vcpu_info in &self.vcpu_infos { + if let Some(vcpu) = &vcpu_info.vcpu { + vcpus.push(vcpu); + } + } + vcpus + } + + /// get vcpu instances in vcpu manager as mut + pub fn vcpus_mut(&mut self) -> Vec<&mut Vcpu> { + let mut vcpus = Vec::new(); + for vcpu_info in &mut self.vcpu_infos { + if let Some(vcpu) = &mut vcpu_info.vcpu { + vcpus.push(vcpu); + } + } + vcpus + } + + /// add reset event fd for each vcpu, if the reset_event_fd is already set, error will be returned. 
+ pub fn set_reset_event_fd(&mut self, reset_event_fd: EventFd) -> Result<()> { + if self.reset_event_fd.is_some() { + return Err(VcpuManagerError::EventAlreadyExist); + } + self.reset_event_fd = Some(reset_event_fd); + Ok(()) + } + + /// create default num of vcpus for bootup + pub fn create_boot_vcpus( + &mut self, + request_ts: TimestampUs, + entry_addr: GuestAddress, + ) -> Result<()> { + info!("create boot vcpus"); + self.create_vcpus( + self.vcpu_config.boot_vcpu_count, + Some(request_ts), + Some(entry_addr), + )?; + + Ok(()) + } + + /// start the boot vcpus + pub fn start_boot_vcpus(&mut self, vmm_seccomp_filter: BpfProgram) -> Result<()> { + info!("start boot vcpus"); + self.start_vcpus(self.vcpu_config.boot_vcpu_count, vmm_seccomp_filter, true)?; + + Ok(()) + } + + /// create a specified num of vcpu + /// note: we can't create vcpus again until the previously created vcpus are + /// started + pub fn create_vcpus( + &mut self, + vcpu_count: u8, + request_ts: Option, + entry_addr: Option, + ) -> Result> { + info!("create vcpus"); + if vcpu_count > self.vcpu_config.max_vcpu_count { + return Err(VcpuManagerError::ExpectedVcpuExceedMax); + } + + let request_ts = request_ts.unwrap_or_default(); + let mut created_cpus = Vec::new(); + for cpu_id in self.calculate_available_vcpus(vcpu_count) { + self.create_vcpu(cpu_id, request_ts.clone(), entry_addr)?; + created_cpus.push(cpu_id); + } + + Ok(created_cpus) + } + + /// start a specified num of vcpu + pub fn start_vcpus( + &mut self, + vcpu_count: u8, + vmm_seccomp_filter: BpfProgram, + need_resume: bool, + ) -> Result<()> { + info!("start vcpus"); + Vcpu::register_kick_signal_handler(); + self.activate_vcpus(vcpu_count, need_resume)?; + + // Load seccomp filters for the VMM thread. + // Execution panics if filters cannot be loaded, use --seccomp-level=0 if skipping filters + // altogether is the desired behaviour. 
+ if let Err(e) = apply_filter(&vmm_seccomp_filter) { + if !matches!(e, SecError::EmptyFilter) { + return Err(VcpuManagerError::SeccompFilters(e)); + } + } + + Ok(()) + } + + /// pause all vcpus + pub fn pause_all_vcpus(&mut self) -> Result<()> { + self.pause_vcpus(&self.present_vcpus()) + } + + /// resume all vcpus + pub fn resume_all_vcpus(&mut self) -> Result<()> { + self.resume_vcpus(&self.present_vcpus()) + } + + /// exit all vcpus, and never restart again + pub fn exit_all_vcpus(&mut self) -> Result<()> { + self.exit_vcpus(&self.present_vcpus())?; + // clear all vcpu infos + self.vcpu_infos.clear(); + // release io manager's reference manually + self.io_manager.take(); + + Ok(()) + } + + /// revalidate IoManager cache of all vcpus + pub fn revalidate_all_vcpus_cache(&mut self) -> Result<()> { + self.revalidate_vcpus_cache(&self.present_vcpus()) + } + + /// return all present vcpus + pub fn present_vcpus(&self) -> Vec { + self.vcpu_infos + .iter() + .enumerate() + .filter(|(_i, info)| info.handle.is_some()) + .map(|(i, _info)| i as u8) + .collect() + } + + /// Get available vcpus to create with target vcpu_count + /// Argument: + /// * vcpu_count: target vcpu_count online in VcpuManager. + /// Return: + /// * return available vcpu ids to create vcpu . 
+ fn calculate_available_vcpus(&self, vcpu_count: u8) -> Vec { + let present_vcpus_count = self.present_vcpus_count(); + let mut available_vcpus = Vec::new(); + + if present_vcpus_count < vcpu_count { + let mut size = vcpu_count - present_vcpus_count; + for cpu_id in 0..self.vcpu_config.max_vcpu_count { + let info = &self.vcpu_infos[cpu_id as usize]; + if info.handle.is_none() { + available_vcpus.push(cpu_id); + size -= 1; + if size == 0 { + break; + } + } + } + } + + available_vcpus + } + + /// Present vcpus count + fn present_vcpus_count(&self) -> u8 { + self.vcpu_infos + .iter() + .fold(0, |sum, info| sum + info.handle.is_some() as u8) + } + + /// Configure single vcpu + fn configure_single_vcpu( + &mut self, + entry_addr: Option, + vcpu: &mut Vcpu, + ) -> std::result::Result<(), VcpuError> { + vcpu.configure( + &self.vcpu_config, + &self.vm_fd, + &self.vm_as, + entry_addr, + None, + ) + } + + fn create_vcpu( + &mut self, + cpu_index: u8, + request_ts: TimestampUs, + entry_addr: Option, + ) -> Result<()> { + info!("creating vcpu {}", cpu_index); + if self.vcpu_infos.get(cpu_index as usize).is_none() { + return Err(VcpuManagerError::VcpuNotFound(cpu_index)); + } + // We will reuse the kvm's vcpufd after first creation, for we can't + // create vcpufd with same id in one kvm instance. 
+ let kvm_vcpu = match &self.vcpu_infos[cpu_index as usize].vcpu_fd { + Some(vcpu_fd) => vcpu_fd.clone(), + None => { + let vcpu_fd = Arc::new( + self.vm_fd + .create_vcpu(cpu_index as u64) + .map_err(VcpuError::VcpuFd) + .map_err(VcpuManagerError::Vcpu)?, + ); + self.vcpu_infos[cpu_index as usize].vcpu_fd = Some(vcpu_fd.clone()); + vcpu_fd + } + }; + + let mut vcpu = self.create_vcpu_arch(cpu_index, kvm_vcpu, request_ts)?; + self.configure_single_vcpu(entry_addr, &mut vcpu) + .map_err(VcpuManagerError::Vcpu)?; + self.vcpu_infos[cpu_index as usize].vcpu = Some(vcpu); + + Ok(()) + } + + fn start_vcpu(&mut self, cpu_index: u8, barrier: Arc) -> Result<()> { + info!("starting vcpu {}", cpu_index); + if self.vcpu_infos.get(cpu_index as usize).is_none() { + return Err(VcpuManagerError::VcpuNotFound(cpu_index)); + } + if let Some(vcpu) = self.vcpu_infos[cpu_index as usize].vcpu.take() { + let handle = vcpu + .start_threaded(self.vcpu_seccomp_filter.clone(), barrier) + .map_err(VcpuManagerError::Vcpu)?; + self.vcpu_infos[cpu_index as usize].handle = Some(handle); + Ok(()) + } else { + Err(VcpuManagerError::VcpuNotCreate) + } + } + + fn get_vcpus_tid(&mut self, cpu_indexes: &[u8]) -> Result<()> { + for cpu_id in cpu_indexes { + if self.vcpu_infos.get(*cpu_id as usize).is_none() { + return Err(VcpuManagerError::VcpuNotFound(*cpu_id)); + } + if let Some(handle) = &self.vcpu_infos[*cpu_id as usize].handle { + handle + .send_event(VcpuEvent::Gettid) + .map_err(VcpuManagerError::VcpuEvent)?; + } else { + return Err(VcpuManagerError::VcpuNotFound(*cpu_id)); + } + } + + for cpu_id in cpu_indexes { + if let Some(handle) = &self.vcpu_infos[*cpu_id as usize].handle { + match handle + .response_receiver() + .recv_timeout(Duration::from_millis(CPU_RECV_TIMEOUT_MS)) + { + Ok(VcpuResponse::Tid(_, id)) => self.vcpu_infos[*cpu_id as usize].tid = id, + Err(e) => { + error!("vCPU get tid error! 
{:?}", e); + return Err(VcpuManagerError::VcpuGettid); + } + _ => { + error!("vCPU get tid error!"); + return Err(VcpuManagerError::VcpuGettid); + } + } + } else { + return Err(VcpuManagerError::VcpuNotFound(*cpu_id)); + } + } + + // Save all vCPU thread ID to self.shared_info + let tids: Vec<(u8, u32)> = cpu_indexes + .iter() + .map(|cpu_id| (*cpu_id, self.vcpu_infos[*cpu_id as usize].tid)) + .collect(); + + // Append the new started vcpu thread IDs into self.shared_info + self.shared_info + .write() + .unwrap() + .tids + .extend_from_slice(&tids[..]); + + Ok(()) + } + + fn revalidate_vcpus_cache(&mut self, cpu_indexes: &[u8]) -> Result<()> { + for cpu_id in cpu_indexes { + if self.vcpu_infos.get(*cpu_id as usize).is_none() { + return Err(VcpuManagerError::VcpuNotFound(*cpu_id)); + } + if let Some(handle) = &self.vcpu_infos[*cpu_id as usize].handle { + handle + .send_event(VcpuEvent::RevalidateCache) + .map_err(VcpuManagerError::VcpuEvent)?; + } else { + return Err(VcpuManagerError::VcpuNotFound(*cpu_id)); + } + } + + Ok(()) + } + + fn pause_vcpus(&mut self, cpu_indexes: &[u8]) -> Result<()> { + for cpu_id in cpu_indexes { + if self.vcpu_infos.get(*cpu_id as usize).is_none() { + return Err(VcpuManagerError::VcpuNotFound(*cpu_id)); + } + if let Some(handle) = &self.vcpu_infos[*cpu_id as usize].handle { + handle + .send_event(VcpuEvent::Pause) + .map_err(VcpuManagerError::VcpuEvent)?; + } else { + return Err(VcpuManagerError::VcpuNotFound(*cpu_id)); + } + } + + Ok(()) + } + + fn resume_vcpus(&mut self, cpu_indexes: &[u8]) -> Result<()> { + for cpu_id in cpu_indexes { + if self.vcpu_infos.get(*cpu_id as usize).is_none() { + return Err(VcpuManagerError::VcpuNotFound(*cpu_id)); + } + if let Some(handle) = &self.vcpu_infos[*cpu_id as usize].handle { + handle + .send_event(VcpuEvent::Resume) + .map_err(VcpuManagerError::VcpuEvent)?; + } else { + return Err(VcpuManagerError::VcpuNotFound(*cpu_id)); + } + } + + Ok(()) + } + + // exit vcpus and notify the vmm exit event + 
fn exit_vcpus(&mut self, cpu_indexes: &[u8]) -> Result<()> { + info!("exiting vcpus {:?}", cpu_indexes); + for cpu_id in cpu_indexes { + if self.vcpu_infos.get(*cpu_id as usize).is_none() { + return Err(VcpuManagerError::VcpuNotFound(*cpu_id)); + } + if let Some(handle) = &self.vcpu_infos[*cpu_id as usize].handle { + handle + .send_event(VcpuEvent::Exit) + .map_err(VcpuManagerError::VcpuEvent)?; + } else { + return Err(VcpuManagerError::VcpuNotFound(*cpu_id)); + } + } + + for cpu_id in cpu_indexes { + let handle = self.vcpu_infos[*cpu_id as usize].handle.take().unwrap(); + handle + .join_vcpu_thread() + .map_err(|e| error!("vcpu exit error! {:?}", e)) + .ok(); + } + + let tids: &mut Vec<(u8, u32)> = &mut self + .shared_info + .write() + .expect( + "Failed to stop vcpus because shared info couldn't be written due to poisoned lock", + ) + .tids; + + // Here's a trick: since we always stop the vcpus started latest, + // thus it's ok here to remove the stopped vcpus from end to head. + tids.truncate(tids.len() - cpu_indexes.len()); + + Ok(()) + } + + fn stop_vcpus_in_action(&mut self) -> Result<()> { + let vcpus_in_action = self.vcpus_in_action.1.clone(); + self.exit_vcpus(&vcpus_in_action) + } + + fn activate_vcpus(&mut self, vcpu_count: u8, need_resume: bool) -> Result> { + let present_vcpus_count = self.present_vcpus_count(); + if vcpu_count > self.vcpu_config.max_vcpu_count { + return Err(VcpuManagerError::ExpectedVcpuExceedMax); + } else if vcpu_count < present_vcpus_count { + return Ok(Vec::new()); + } + + let available_vcpus = self.calculate_available_vcpus(vcpu_count); + let barrier = Arc::new(Barrier::new(available_vcpus.len() + 1_usize)); + for cpu_id in available_vcpus.iter() { + self.start_vcpu(*cpu_id, barrier.clone())?; + } + barrier.wait(); + + self.get_vcpus_tid(&available_vcpus)?; + if need_resume { + self.resume_vcpus(&available_vcpus)?; + } + + Ok(available_vcpus) + } + + fn sync_action_finish(&mut self, got_error: bool) { + if let Some(tx) = 
self.action_sycn_tx.take() { + if let Err(e) = tx.send(got_error) { + debug!("cpu sync action send to closed channel {}", e); + } + } + } + + fn set_vcpus_action(&mut self, action: VcpuAction, vcpus: Vec) { + self.vcpus_in_action = (action, vcpus); + } + + fn get_vcpus_action(&self) -> VcpuAction { + self.vcpus_in_action.0 + } +} + +#[cfg(target_arch = "x86_64")] +impl VcpuManager { + fn create_vcpu_arch( + &self, + cpu_index: u8, + vcpu_fd: Arc, + request_ts: TimestampUs, + ) -> Result { + // It's safe to unwrap because guest_kernel always exist until vcpu manager done + Vcpu::new_x86_64( + cpu_index, + vcpu_fd, + // safe to unwrap + self.io_manager.as_ref().unwrap().clone(), + self.supported_cpuid.clone(), + self.reset_event_fd.as_ref().unwrap().try_clone().unwrap(), + self.vcpu_state_event.try_clone().unwrap(), + self.vcpu_state_sender.clone(), + request_ts, + self.support_immediate_exit, + ) + .map_err(VcpuManagerError::Vcpu) + } +} + +#[cfg(target_arch = "aarch64")] +impl VcpuManager { + // On aarch64, the vCPUs need to be created (i.e call KVM_CREATE_VCPU) and configured before + // setting up the IRQ chip because the `KVM_CREATE_VCPU` ioctl will return error if the IRQCHIP + // was already initialized. + // Search for `kvm_arch_vcpu_create` in arch/arm/kvm/arm.c. 
+ fn create_vcpu_arch( + &self, + cpu_index: u8, + vcpu_fd: Arc, + request_ts: TimestampUs, + ) -> Result { + Vcpu::new_aarch64( + cpu_index, + vcpu_fd, + // safe to unwrap + self.io_manager.as_ref().unwrap().clone(), + self.reset_event_fd.as_ref().unwrap().try_clone().unwrap(), + self.vcpu_state_event.try_clone().unwrap(), + self.vcpu_state_sender.clone(), + request_ts.clone(), + self.support_immediate_exit, + ) + .map_err(VcpuManagerError::Vcpu) + } +} + +#[cfg(feature = "hotplug")] +mod hotplug { + use std::cmp::Ordering; + + use super::*; + #[cfg(not(test))] + use dbs_upcall::CpuDevRequest; + use dbs_upcall::{DevMgrRequest, DevMgrResponse, UpcallClientRequest, UpcallClientResponse}; + + #[cfg(all(target_arch = "x86_64", not(test)))] + use dbs_boot::mptable::APIC_VERSION; + #[cfg(all(target_arch = "aarch64", not(test)))] + const APIC_VERSION: u8 = 0; + + #[cfg(feature = "dbs-upcall")] + impl VcpuManager { + /// add upcall channel for vcpu manager + pub fn set_upcall_channel( + &mut self, + upcall_channel: Option>>, + ) { + self.upcall_channel = upcall_channel; + } + + /// resize the count of vcpu in runtime + pub fn resize_vcpu( + &mut self, + vcpu_count: u8, + sync_tx: Option>, + ) -> std::result::Result<(), VcpuManagerError> { + if self.get_vcpus_action() != VcpuAction::None { + return Err(VcpuManagerError::VcpuResize( + VcpuResizeError::VcpuIsHotplugging, + )); + } + self.action_sycn_tx = sync_tx; + + if let Some(upcall) = self.upcall_channel.clone() { + let now_vcpu = self.present_vcpus_count(); + info!("resize vcpu: now: {}, desire: {}", now_vcpu, vcpu_count); + match vcpu_count.cmp(&now_vcpu) { + Ordering::Equal => { + info!("resize vcpu: no need to resize"); + self.sync_action_finish(false); + Ok(()) + } + Ordering::Greater => self.do_add_vcpu(vcpu_count, upcall), + Ordering::Less => self.do_del_vcpu(vcpu_count, upcall), + } + } else { + Err(VcpuManagerError::VcpuResize( + VcpuResizeError::UpdateNotAllowedPostBoot, + )) + } + } + + fn do_add_vcpu( + &mut 
self, + vcpu_count: u8, + upcall_client: Arc>, + ) -> std::result::Result<(), VcpuManagerError> { + info!("resize vcpu: add"); + if vcpu_count > self.vcpu_config.max_vcpu_count { + return Err(VcpuManagerError::VcpuResize( + VcpuResizeError::ExpectedVcpuExceedMax, + )); + } + + let created_vcpus = self.create_vcpus(vcpu_count, None, None)?; + let cpu_ids = self.activate_vcpus(vcpu_count, true).map_err(|e| { + // we need to rollback when activate vcpu error + error!("activate vcpu error, rollback! {:?}", e); + let activated_vcpus: Vec = created_vcpus + .iter() + .filter(|&cpu_id| self.vcpu_infos[*cpu_id as usize].handle.is_some()) + .copied() + .collect(); + if let Err(e) = self.exit_vcpus(&activated_vcpus) { + error!("try to rollback error, stop_vcpu: {:?}", e); + } + e + })?; + + let mut cpu_ids_array = [0u8; (u8::MAX as usize) + 1]; + cpu_ids_array[..cpu_ids.len()].copy_from_slice(&cpu_ids[..cpu_ids.len()]); + let req = DevMgrRequest::AddVcpu(CpuDevRequest { + count: cpu_ids.len() as u8, + apic_ids: cpu_ids_array, + apic_ver: APIC_VERSION, + }); + self.send_upcall_action(upcall_client, req)?; + + self.set_vcpus_action(VcpuAction::Hotplug, cpu_ids); + + Ok(()) + } + + fn do_del_vcpu( + &mut self, + vcpu_count: u8, + upcall_client: Arc>, + ) -> std::result::Result<(), VcpuManagerError> { + info!("resize vcpu: delete"); + if vcpu_count == 0 { + return Err(VcpuManagerError::VcpuResize( + VcpuResizeError::Vcpu0CanNotBeRemoved, + )); + } + + let mut cpu_ids = self.calculate_removable_vcpus(); + let cpu_num_to_be_del = (self.present_vcpus_count() - vcpu_count) as usize; + if cpu_num_to_be_del >= cpu_ids.len() { + return Err(VcpuManagerError::VcpuResize( + VcpuResizeError::LackRemovableVcpus( + cpu_ids.len() as u16, + cpu_num_to_be_del as u16, + self.present_vcpus_count() as u16, + ), + )); + } + + cpu_ids.reverse(); + cpu_ids.truncate(cpu_num_to_be_del); + + let mut cpu_ids_array = [0u8; 256]; + cpu_ids_array[..cpu_ids.len()].copy_from_slice(&cpu_ids[..cpu_ids.len()]); + 
let req = DevMgrRequest::DelVcpu(CpuDevRequest { + count: cpu_num_to_be_del as u8, + apic_ids: cpu_ids_array, + apic_ver: APIC_VERSION, + }); + self.send_upcall_action(upcall_client, req)?; + + self.set_vcpus_action(VcpuAction::Hotunplug, cpu_ids); + + Ok(()) + } + + #[cfg(test)] + fn send_upcall_action( + &self, + _upcall_client: Arc>, + _request: DevMgrRequest, + ) -> std::result::Result<(), VcpuManagerError> { + Ok(()) + } + + #[cfg(not(test))] + fn send_upcall_action( + &self, + upcall_client: Arc>, + request: DevMgrRequest, + ) -> std::result::Result<(), VcpuManagerError> { + let vcpu_state_event = self.vcpu_state_event.try_clone().unwrap(); + let vcpu_state_sender = self.vcpu_state_sender.clone(); + + upcall_client + .send_request( + UpcallClientRequest::DevMgr(request), + Box::new(move |result| match result { + UpcallClientResponse::DevMgr(response) => { + if let DevMgrResponse::CpuDev(resp) = response { + vcpu_state_sender + .send(VcpuStateEvent::Hotplug(( + resp.result, + resp.info.apic_id_index, + ))) + .unwrap(); + vcpu_state_event.write(1).unwrap(); + } + } + UpcallClientResponse::UpcallReset => { + vcpu_state_sender + .send(VcpuStateEvent::Hotplug((0, 0))) + .unwrap(); + vcpu_state_event.write(1).unwrap(); + } + #[cfg(test)] + UpcallClientResponse::FakeResponse => { + panic!("shouldn't happen"); + } + }), + ) + .map_err(VcpuResizeError::Upcall) + .map_err(VcpuManagerError::VcpuResize) + } + + /// Get removable vcpus. + /// Return: + /// * return removable vcpu_id with cascade order. + fn calculate_removable_vcpus(&self) -> Vec { + self.present_vcpus() + } + } +} + +struct VcpuEpollHandler { + vcpu_manager: Arc>, + eventfd: EventFd, + rx: Receiver, +} + +impl VcpuEpollHandler { + fn process_cpu_state_event(&mut self, _ops: &mut EventOps) { + // It's level triggered, so it's safe to ignore the result. 
+ let _ = self.eventfd.read(); + while let Ok(event) = self.rx.try_recv() { + match event { + VcpuStateEvent::Hotplug((success, cpu_count)) => { + info!("get vcpu event, cpu_index {}", cpu_count); + self.process_cpu_action(success != 0, cpu_count); + } + } + } + } + + fn process_cpu_action(&self, success: bool, _cpu_index: u32) { + let mut vcpu_manager = self.vcpu_manager.lock().unwrap(); + if success { + match vcpu_manager.get_vcpus_action() { + VcpuAction::Hotplug => { + // Notify hotplug success + vcpu_manager.sync_action_finish(false); + } + VcpuAction::Hotunplug => { + if let Err(e) = vcpu_manager.stop_vcpus_in_action() { + error!("stop vcpus in action error: {:?}", e); + } + // notify hotunplug success + vcpu_manager.sync_action_finish(false); + } + VcpuAction::None => { + error!("cannot be here"); + } + }; + vcpu_manager.set_vcpus_action(VcpuAction::None, Vec::new()); + + vcpu_manager.sync_action_finish(true); + // TODO(sicun): rollback + } + } +} + +impl MutEventSubscriber for VcpuEpollHandler { + fn process(&mut self, events: Events, ops: &mut EventOps) { + let vcpu_state_eventfd = self.eventfd.as_raw_fd(); + + match events.fd() { + fd if fd == vcpu_state_eventfd => self.process_cpu_state_event(ops), + _ => error!("vcpu manager epoll handler: unknown event"), + } + } + + fn init(&mut self, ops: &mut EventOps) { + ops.add(Events::new(&self.eventfd, EventSet::IN)).unwrap(); + } +} diff --git a/src/dragonball/src/vcpu/x86_64.rs b/src/dragonball/src/vcpu/x86_64.rs new file mode 100644 index 0000000000..738d574bba --- /dev/null +++ b/src/dragonball/src/vcpu/x86_64.rs @@ -0,0 +1,149 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. 
+ +use std::sync::mpsc::{channel, Sender}; +use std::sync::Arc; + +use dbs_arch::cpuid::{process_cpuid, VmSpec}; +use dbs_arch::gdt::gdt_entry; +use dbs_utils::time::TimestampUs; +use kvm_bindings::CpuId; +use kvm_ioctls::{VcpuFd, VmFd}; +use log::error; +use vm_memory::{Address, GuestAddress, GuestAddressSpace}; +use vmm_sys_util::eventfd::EventFd; + +use crate::address_space_manager::GuestAddressSpaceImpl; +use crate::metric::{IncMetric, METRICS}; +use crate::vcpu::vcpu_impl::{Result, Vcpu, VcpuError, VcpuStateEvent}; +use crate::vcpu::VcpuConfig; +use crate::IoManagerCached; + +impl Vcpu { + /// Constructs a new VCPU for `vm`. + /// + /// # Arguments + /// + /// * `id` - Represents the CPU number between [0, max vcpus). + /// * `vcpu_fd` - The kvm `VcpuFd` for the vcpu. + /// * `io_mgr` - The io-manager used to access port-io and mmio devices. + /// * `cpuid` - The `CpuId` listing the supported capabilities of this vcpu. + /// * `exit_evt` - An `EventFd` that will be written into when this vcpu + /// exits. + /// * `vcpu_state_event` - The eventfd used to notify the vmm that the + /// state of some vcpu should change. + /// * `vcpu_state_sender` - The channel to send state change message from + /// vcpu thread to vmm thread. + /// * `create_ts` - A timestamp used by the vcpu to calculate its lifetime. + /// * `support_immediate_exit` - whether kvm used supports immediate_exit flag. + #[allow(clippy::too_many_arguments)] + pub fn new_x86_64( + id: u8, + vcpu_fd: Arc, + io_mgr: IoManagerCached, + cpuid: CpuId, + exit_evt: EventFd, + vcpu_state_event: EventFd, + vcpu_state_sender: Sender, + create_ts: TimestampUs, + support_immediate_exit: bool, + ) -> Result { + let (event_sender, event_receiver) = channel(); + let (response_sender, response_receiver) = channel(); + // Initially the cpuid per vCPU is the one supported by this VM. 
+ Ok(Vcpu { + fd: vcpu_fd, + id, + io_mgr, + create_ts, + event_receiver, + event_sender: Some(event_sender), + response_receiver: Some(response_receiver), + response_sender, + vcpu_state_event, + vcpu_state_sender, + exit_evt, + support_immediate_exit, + cpuid, + }) + } + + /// Configures a x86_64 specific vcpu and should be called once per vcpu. + /// + /// # Arguments + /// + /// * `vcpu_config` - The vcpu configuration of this microvm needed for the CPUID configuration. + /// * `vm_fd` - The kvm `VmFd` for the virtual machine this vcpu will get attached to. + /// * `vm_as` - The guest address space used by this microvm. + /// * `kernel_start_addr` - Offset from `vm_as` at which the kernel starts. + /// * `pgtable_addr` - pgtable address for ap vcpu + pub fn configure( + &mut self, + vcpu_config: &VcpuConfig, + _vm_fd: &VmFd, + vm_as: &GuestAddressSpaceImpl, + kernel_start_addr: Option, + _pgtable_addr: Option, + ) -> Result<()> { + self.set_cpuid(vcpu_config)?; + + dbs_arch::regs::setup_msrs(&self.fd).map_err(VcpuError::MSRSConfiguration)?; + if let Some(start_addr) = kernel_start_addr { + dbs_arch::regs::setup_regs( + &self.fd, + start_addr.raw_value() as u64, + dbs_boot::layout::BOOT_STACK_POINTER, + dbs_boot::layout::BOOT_STACK_POINTER, + dbs_boot::layout::ZERO_PAGE_START, + ) + .map_err(VcpuError::REGSConfiguration)?; + dbs_arch::regs::setup_fpu(&self.fd).map_err(VcpuError::FPUConfiguration)?; + let gdt_table: [u64; dbs_boot::layout::BOOT_GDT_MAX as usize] = [ + gdt_entry(0, 0, 0), // NULL + gdt_entry(0xa09b, 0, 0xfffff), // CODE + gdt_entry(0xc093, 0, 0xfffff), // DATA + gdt_entry(0x808b, 0, 0xfffff), // TSS + ]; + let pgtable_addr = + dbs_boot::setup_identity_mapping(&*vm_as.memory()).map_err(VcpuError::PageTable)?; + dbs_arch::regs::setup_sregs( + &*vm_as.memory(), + &self.fd, + pgtable_addr, + &gdt_table, + dbs_boot::layout::BOOT_GDT_OFFSET, + dbs_boot::layout::BOOT_IDT_OFFSET, + ) + .map_err(VcpuError::SREGSConfiguration)?; + } + 
dbs_arch::interrupts::set_lint(&self.fd).map_err(VcpuError::LocalIntConfiguration)?; + + Ok(()) + } + + fn set_cpuid(&mut self, vcpu_config: &VcpuConfig) -> Result<()> { + let cpuid_vm_spec = VmSpec::new( + self.id, + vcpu_config.max_vcpu_count as u8, + vcpu_config.threads_per_core, + vcpu_config.cores_per_die, + vcpu_config.dies_per_socket, + vcpu_config.vpmu_feature, + ) + .map_err(VcpuError::CpuId)?; + process_cpuid(&mut self.cpuid, &cpuid_vm_spec).map_err(|e| { + METRICS.vcpu.filter_cpuid.inc(); + error!("Failure in configuring CPUID for vcpu {}: {:?}", self.id, e); + VcpuError::CpuId(e) + })?; + + self.fd + .set_cpuid2(&self.cpuid) + .map_err(VcpuError::SetSupportedCpusFailed) + } +} diff --git a/src/dragonball/src/vm/mod.rs b/src/dragonball/src/vm/mod.rs index c1510308da..e09ec316bb 100644 --- a/src/dragonball/src/vm/mod.rs +++ b/src/dragonball/src/vm/mod.rs @@ -18,3 +18,79 @@ pub struct NumaRegionInfo { /// vcpu ids belonging to this region pub vcpu_ids: Vec, } + +/// Information for cpu topology to guide guest init +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct CpuTopology { + /// threads per core to indicate hyperthreading is enabled or not + pub threads_per_core: u8, + /// cores per die to guide guest cpu topology init + pub cores_per_die: u8, + /// dies per socket to guide guest cpu topology + pub dies_per_socket: u8, + /// number of sockets + pub sockets: u8, +} + +impl Default for CpuTopology { + fn default() -> Self { + CpuTopology { + threads_per_core: 1, + cores_per_die: 1, + dies_per_socket: 1, + sockets: 1, + } + } +} + +/// Configuration information for virtual machine instance. +#[derive(Clone, Debug, PartialEq)] +pub struct VmConfigInfo { + /// Number of vcpu to start. + pub vcpu_count: u8, + /// Max number of vcpu can be added + pub max_vcpu_count: u8, + /// Enable or disable hyperthreading. + pub ht_enabled: bool, + /// cpu power management. 
+ pub cpu_pm: String, + /// cpu topology information + pub cpu_topology: CpuTopology, + /// vpmu support level + pub vpmu_feature: u8, + + /// Memory type that can be either hugetlbfs or shmem, default is shmem + pub mem_type: String, + /// Memory file path + pub mem_file_path: String, + /// The memory size in MiB. + pub mem_size_mib: usize, + /// reserve memory bytes + pub reserve_memory_bytes: u64, + + /// serial console socket path + pub serial_path: Option, +} + +impl Default for VmConfigInfo { + fn default() -> Self { + VmConfigInfo { + vcpu_count: 1, + max_vcpu_count: 1, + ht_enabled: false, + cpu_pm: String::from("on"), + cpu_topology: CpuTopology { + threads_per_core: 1, + cores_per_die: 1, + dies_per_socket: 1, + sockets: 1, + }, + vpmu_feature: 0, + mem_type: String::from("shmem"), + mem_file_path: String::from(""), + mem_size_mib: 128, + reserve_memory_bytes: 0, + serial_path: None, + } + } +}