Merge pull request #4400 from openanolis/anolis/dragonball-2

runtime-rs: built-in Dragonball sandbox part II - vCPU manager
This commit is contained in:
Bin Liu 2022-06-28 20:41:36 +08:00 committed by GitHub
commit badbbcd8be
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
22 changed files with 3429 additions and 36 deletions

View File

@ -14,18 +14,22 @@ arc-swap = "1.5.0"
bytes = "1.1.0"
dbs-address-space = "0.1.0"
dbs-allocator = "0.1.0"
dbs-arch = "0.1.0"
dbs-boot = "0.2.0"
dbs-device = "0.1.0"
dbs-interrupt = { version = "0.1.0", features = ["kvm-irq"] }
dbs-legacy-devices = "0.1.0"
dbs-upcall = { version = "0.1.0", optional = true }
dbs-utils = "0.1.0"
dbs-virtio-devices = { version = "0.1.0", optional = true, features = ["virtio-mmio"] }
kvm-bindings = "0.5.0"
kvm-ioctls = "0.11.0"
lazy_static = "1.2"
libc = "0.2.39"
linux-loader = "0.4.0"
log = "0.4.14"
nix = "0.23.1"
seccompiler = "0.2.0"
serde = "1.0.27"
serde_derive = "1.0.27"
serde_json = "1.0.9"
@ -41,13 +45,15 @@ slog-term = "2.9.0"
slog-async = "2.7.0"
[features]
acpi = []
atomic-guest-memory = []
hotplug = ["virtio-vsock"]
virtio-vsock = ["dbs-virtio-devices/virtio-vsock", "virtio-queue"]
[patch.'crates-io']
dbs-device = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "84eee5737cc7d85f9921c94a93e6b9dc4ae24a39" }
dbs-interrupt = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "84eee5737cc7d85f9921c94a93e6b9dc4ae24a39" }
dbs-legacy-devices = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "84eee5737cc7d85f9921c94a93e6b9dc4ae24a39" }
dbs-utils = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "84eee5737cc7d85f9921c94a93e6b9dc4ae24a39" }
dbs-virtio-devices = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "84eee5737cc7d85f9921c94a93e6b9dc4ae24a39" }
dbs-upcall = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "84eee5737cc7d85f9921c94a93e6b9dc4ae24a39" }
dbs-device = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "127621db934af5ffba558e44b77afa00cdf62af6" }
dbs-interrupt = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "127621db934af5ffba558e44b77afa00cdf62af6" }
dbs-legacy-devices = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "127621db934af5ffba558e44b77afa00cdf62af6" }
dbs-upcall = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "127621db934af5ffba558e44b77afa00cdf62af6" }
dbs-utils = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "127621db934af5ffba558e44b77afa00cdf62af6" }
dbs-virtio-devices = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "127621db934af5ffba558e44b77afa00cdf62af6" }

View File

@ -17,7 +17,10 @@ and configuration process.
# Documentation
Device: [Device Document](docs/device.md)
vCPU: [vCPU Document](docs/vcpu.md)
API: [API Document](docs/api.md)
Currently, the documents are still being actively added.
You could see the [official documentation](docs/) page for more details.
# Supported Architectures

View File

@ -0,0 +1,7 @@
# API
We provide plenty of APIs for the Kata runtime to interact with the `Dragonball` virtual machine manager.
This document provides the introduction for each of them.
TODO: Details will be added in the Part III PR for `Dragonball`

View File

@ -14,4 +14,7 @@ Currently we have following device manager:
## Device supported
`VIRTIO-VSOCK`
`i8042`
`COM1`
`COM2`

View File

@ -0,0 +1,42 @@
# vCPU
## vCPU Manager
The vCPU manager manages all vCPU-related actions. This document dives into some of its important structure members.
For now, aarch64 vCPU support is still under development, we'll introduce it when we merge `runtime-rs` to the master branch. (issue: #4445)
### vCPU config
`VcpuConfig` is used to configure guest overall CPU info.
`boot_vcpu_count` is used to define the initial vCPU number.
`max_vcpu_count` is used to define the maximum vCPU number, and it serves as the upper boundary for the CPU hotplug feature.
`thread_per_core`, `cores_per_die`, `dies_per_socket` and `socket` are used to define CPU topology.
`vpmu_feature` is used to define `vPMU` feature level.
If `vPMU` feature is `Disabled`, it means `vPMU` feature is off (by default).
If `vPMU` feature is `LimitedlyEnabled`, it means minimal `vPMU` counters are supported (cycles and instructions).
If `vPMU` feature is `FullyEnabled`, it means all `vPMU` counters are supported.
## vCPU State
There are four states for vCPU state machine: `running`, `paused`, `waiting_exit`, `exited`. There is a state machine to maintain the task flow.
When the vCPU is created, it'll turn to `paused` state. After vCPU resource is ready at VMM, it'll send a `Resume` event to the vCPU thread, and then vCPU state will change to `running`.
During the `running` state, VMM will catch vCPU exit and execute different logic according to the exit reason.
If the VMM catches an exit reason that it cannot handle, the state will change to `waiting_exit` and the VMM will stop the virtual machine.
When the state switches to `waiting_exit`, an exit event will be sent to vCPU `exit_evt`, event manager will detect the change in `exit_evt` and set VMM `exit_evt_flag` as 1. A thread serving for VMM event loop will check `exit_evt_flag` and if the flag is 1, it'll stop the VMM.
When the VMM is stopped / destroyed, the state will change to `exited`.
## vCPU Hot plug
Since `Dragonball Sandbox` doesn't support virtualization of ACPI system, we use [`upcall`](https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-upcall) to establish a direct communication channel between `Dragonball` and Guest in order to trigger vCPU hotplug.
To use `upcall`, kernel patches are needed, you can get the patches from [`upcall`](https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-upcall) page, and we'll provide a ready-to-use guest kernel binary for you to try.
vCPU hot plug / hot unplug range is [1, `max_vcpu_count`]. Operations not in this range will be invalid.

View File

@ -0,0 +1,6 @@
// Copyright (C) 2019-2022 Alibaba Cloud. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! API related data structures to configure the vmm.
pub mod v1;

View File

@ -0,0 +1,84 @@
// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// SPDX-License-Identifier: Apache-2.0
use serde_derive::{Deserialize, Serialize};
/// The microvm state.
///
/// When Dragonball starts, the instance state is `Uninitialized`. Once the start_microvm method
/// is called, the state goes from `Uninitialized` to `Starting`. The state is changed to
/// `Running` before the start_microvm method ends. `Halting` and `Halted` are currently
/// unsupported.
#[derive(Copy, Clone, Debug, Deserialize, PartialEq, Serialize)]
pub enum InstanceState {
    /// Microvm is not initialized.
    Uninitialized,
    /// Microvm is starting.
    Starting,
    /// Microvm is running.
    Running,
    /// Microvm is paused.
    Paused,
    /// Microvm received a halt instruction.
    Halting,
    /// Microvm is halted.
    Halted,
    /// Microvm exited (as opposed to the VMM process exiting); carries an `i32` code.
    // NOTE(review): presumably the payload is the exit code of the microvm — confirm at the
    // place where this variant is constructed.
    Exited(i32),
}
/// The state of async actions.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub enum AsyncState {
    /// Uninitialized; this is the value a freshly created `InstanceInfo` starts with.
    Uninitialized,
    /// Success.
    Success,
    /// Failure.
    Failure,
}
/// Strongly typed structure that contains general information about the microVM.
#[derive(Debug, Deserialize, Serialize)]
pub struct InstanceInfo {
    /// The ID of the microVM.
    pub id: String,
    /// The state of the microVM.
    pub state: InstanceState,
    /// The version of the VMM that runs the microVM.
    pub vmm_version: String,
    /// The pid of the current VMM process.
    pub pid: u32,
    /// The state of async actions.
    pub async_state: AsyncState,
    /// List of tids of vcpu threads, stored as `(vcpu index, tid)` pairs.
    pub tids: Vec<(u8, u32)>,
}
impl InstanceInfo {
    /// Create an instance info object for the microVM `id` running on VMM version `vmm_version`.
    ///
    /// The pid is taken from the current process; both the instance state and the async state
    /// start as `Uninitialized`, and the list of vCPU thread ids starts empty.
    pub fn new(id: String, vmm_version: String) -> Self {
        let pid = std::process::id();

        Self {
            id,
            vmm_version,
            pid,
            state: InstanceState::Uninitialized,
            async_state: AsyncState::Uninitialized,
            tids: Vec::new(),
        }
    }
}
impl Default for InstanceInfo {
fn default() -> Self {
InstanceInfo {
id: String::from(""),
state: InstanceState::Uninitialized,
vmm_version: env!("CARGO_PKG_VERSION").to_string(),
pid: std::process::id(),
async_state: AsyncState::Uninitialized,
tids: Vec::new(),
}
}
}

View File

@ -0,0 +1,7 @@
// Copyright (C) 2019-2022 Alibaba Cloud. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! API Version 1 related data structures to configure the vmm.
mod instance_info;

// `AsyncState` is re-exported as well: it is the type of the public
// `InstanceInfo::async_state` field, and since `instance_info` is a private module callers
// would otherwise be unable to name it.
pub use self::instance_info::{AsyncState, InstanceInfo, InstanceState};

View File

@ -29,6 +29,12 @@ use dbs_virtio_devices::{
VirtioDevice,
};
#[cfg(all(feature = "hotplug", feature = "dbs-upcall"))]
use dbs_upcall::{
DevMgrRequest, DevMgrService, MmioDevRequest, UpcallClient, UpcallClientError,
UpcallClientRequest, UpcallClientResponse,
};
use crate::address_space_manager::GuestAddressSpaceImpl;
use crate::error::StartMicrovmError;
use crate::resource_manager::ResourceManager;
@ -83,6 +89,11 @@ pub enum DeviceMgrError {
/// Error from Virtio subsystem.
#[error(transparent)]
Virtio(virtio::Error),
#[cfg(all(feature = "hotplug", feature = "dbs-upcall"))]
/// Failed to hotplug the device.
#[error("failed to hotplug virtual device")]
HotplugDevice(#[source] UpcallClientError),
}
/// Specialized version of `std::result::Result` for device manager operations.
@ -188,6 +199,8 @@ pub struct DeviceOpContext {
logger: slog::Logger,
is_hotplug: bool,
#[cfg(all(feature = "hotplug", feature = "dbs-upcall"))]
upcall_client: Option<Arc<UpcallClient<DevMgrService>>>,
#[cfg(feature = "dbs-virtio-devices")]
virtio_devices: Vec<Arc<DbsMmioV2Device>>,
}
@ -220,6 +233,8 @@ impl DeviceOpContext {
address_space,
logger,
is_hotplug,
#[cfg(all(feature = "hotplug", feature = "dbs-upcall"))]
upcall_client: None,
#[cfg(feature = "dbs-virtio-devices")]
virtio_devices: Vec::new(),
}
@ -236,35 +251,122 @@ impl DeviceOpContext {
&self.logger
}
#[allow(unused_variables)]
fn generate_kernel_boot_args(&mut self, kernel_config: &mut KernelConfigInfo) -> Result<()> {
if !self.is_hotplug {
if self.is_hotplug {
return Err(DeviceMgrError::InvalidOperation);
}
#[cfg(feature = "dbs-virtio-devices")]
let cmdline = kernel_config.kernel_cmdline_mut();
{
let cmdline = kernel_config.kernel_cmdline_mut();
#[cfg(feature = "dbs-virtio-devices")]
for device in self.virtio_devices.iter() {
let (mmio_base, mmio_size, irq) = DeviceManager::get_virtio_device_info(device)?;
for device in self.virtio_devices.iter() {
let (mmio_base, mmio_size, irq) = DeviceManager::get_virtio_device_info(device)?;
// as per doc, [virtio_mmio.]device=<size>@<baseaddr>:<irq> needs to be appended
// to kernel commandline for virtio mmio devices to get recognized
// the size parameter has to be transformed to KiB, so dividing hexadecimal value in
// bytes to 1024; further, the '{}' formatting rust construct will automatically
// transform it to decimal
cmdline
.insert(
"virtio_mmio.device",
&format!("{}K@0x{:08x}:{}", mmio_size / 1024, mmio_base, irq),
)
.map_err(DeviceMgrError::Cmdline)?;
// as per doc, [virtio_mmio.]device=<size>@<baseaddr>:<irq> needs to be appended
// to kernel commandline for virtio mmio devices to get recognized
// the size parameter has to be transformed to KiB, so dividing hexadecimal value in
// bytes to 1024; further, the '{}' formatting rust construct will automatically
// transform it to decimal
cmdline
.insert(
"virtio_mmio.device",
&format!("{}K@0x{:08x}:{}", mmio_size / 1024, mmio_base, irq),
)
.map_err(DeviceMgrError::Cmdline)?;
}
}
Ok(())
}
}
// Stub hotplug operations, compiled only when the `hotplug` feature is disabled.
// Both methods ignore their arguments and always fail with `InvalidOperation`.
//
// NOTE(review): the real implementation is gated on `all(hotplug, dbs-upcall)`, so a build with
// `hotplug` enabled but `dbs-upcall` disabled appears to get neither variant of these methods —
// confirm that this feature combination is not expected to compile.
#[cfg(not(feature = "hotplug"))]
impl DeviceOpContext {
    /// Hotplug insertion is unavailable without the `hotplug` feature; always returns
    /// `DeviceMgrError::InvalidOperation`.
    pub(crate) fn insert_hotplug_mmio_device(
        &self,
        _dev: &Arc<dyn DeviceIo>,
        _callback: Option<()>,
    ) -> Result<()> {
        Err(DeviceMgrError::InvalidOperation)
    }

    /// Hotplug removal is unavailable without the `hotplug` feature; always returns
    /// `DeviceMgrError::InvalidOperation`.
    pub(crate) fn remove_hotplug_mmio_device(
        &self,
        _dev: &Arc<dyn DeviceIo>,
        _callback: Option<()>,
    ) -> Result<()> {
        Err(DeviceMgrError::InvalidOperation)
    }
}
#[cfg(all(feature = "hotplug", feature = "dbs-upcall"))]
impl DeviceOpContext {
    /// Forward a device manager request through the upcall client.
    ///
    /// When `callback` is provided the request is sent with `send_request` and the callback is
    /// handed to the client; otherwise `send_request_without_result` is used.
    ///
    /// Fails with `DeviceMgrError::InvalidOperation` when no upcall client is set, and with
    /// `DeviceMgrError::HotplugDevice` when the client rejects the request.
    fn call_hotplug_device(
        &self,
        req: DevMgrRequest,
        callback: Option<Box<dyn Fn(UpcallClientResponse) + Send>>,
    ) -> Result<()> {
        let client = self
            .upcall_client
            .as_ref()
            .ok_or(DeviceMgrError::InvalidOperation)?;
        let request = UpcallClientRequest::DevMgr(req);

        match callback {
            Some(cb) => {
                client
                    .send_request(request, cb)
                    .map_err(DeviceMgrError::HotplugDevice)?;
            }
            None => {
                client
                    .send_request_without_result(request)
                    .map_err(DeviceMgrError::HotplugDevice)?;
            }
        }

        Ok(())
    }

    /// Describe `dev` as an `MmioDevRequest`, after checking that this is a hotplug context.
    ///
    /// Fails with `DeviceMgrError::InvalidOperation` when the context is not a hotplug one;
    /// errors from `DeviceManager::get_virtio_device_info` are propagated unchanged.
    fn mmio_request_for_hotplug(&self, dev: &Arc<DbsMmioV2Device>) -> Result<MmioDevRequest> {
        if !self.is_hotplug {
            return Err(DeviceMgrError::InvalidOperation);
        }

        let (mmio_base, mmio_size, mmio_irq) = DeviceManager::get_virtio_device_info(dev)?;
        Ok(MmioDevRequest {
            mmio_base,
            mmio_size,
            mmio_irq,
        })
    }

    /// Send an `AddMmioDev` request for the MMIO device `dev`.
    ///
    /// `callback`, when present, is passed along to the upcall client with the request.
    pub(crate) fn insert_hotplug_mmio_device(
        &self,
        dev: &Arc<DbsMmioV2Device>,
        callback: Option<Box<dyn Fn(UpcallClientResponse) + Send>>,
    ) -> Result<()> {
        let mmio_dev = self.mmio_request_for_hotplug(dev)?;
        self.call_hotplug_device(DevMgrRequest::AddMmioDev(mmio_dev), callback)
    }

    /// Send a `DelMmioDev` request for the MMIO device `dev`.
    ///
    /// `callback`, when present, is passed along to the upcall client with the request.
    pub(crate) fn remove_hotplug_mmio_device(
        &self,
        dev: &Arc<DbsMmioV2Device>,
        callback: Option<Box<dyn Fn(UpcallClientResponse) + Send>>,
    ) -> Result<()> {
        let mmio_dev = self.mmio_request_for_hotplug(dev)?;
        self.call_hotplug_device(DevMgrRequest::DelMmioDev(mmio_dev), callback)
    }
}
#[cfg(all(feature = "hotplug", feature = "acpi"))]
impl DeviceOpContext {
// TODO: We will implement this when we develop ACPI virtualization
}
/// Device manager for virtual machines, which manages all device for a virtual machine.
pub struct DeviceManager {
io_manager: Arc<ArcSwap<IoManager>>,
@ -351,7 +453,7 @@ impl DeviceManager {
self.set_guest_kernel_log_stream(dmesg_fifo)
.map_err(|_| StartMicrovmError::EventFd)?;
slog::info!(self.logger, "init console path: {:?}", com1_sock_path);
info!(self.logger, "init console path: {:?}", com1_sock_path);
if let Some(path) = com1_sock_path {
if let Some(legacy_manager) = self.legacy_manager.as_ref() {
let com1 = legacy_manager.get_com1_serial();
@ -387,19 +489,6 @@ impl DeviceManager {
Ok(())
}
/// Restore legacy devices
pub fn restore_legacy_devices(
&mut self,
dmesg_fifo: Option<Box<dyn io::Write + Send>>,
com1_sock_path: Option<String>,
) -> std::result::Result<(), StartMicrovmError> {
self.set_guest_kernel_log_stream(dmesg_fifo)
.map_err(|_| StartMicrovmError::EventFd)?;
slog::info!(self.logger, "restore console path: {:?}", com1_sock_path);
// TODO: restore console
Ok(())
}
/// Reset the console into canonical mode.
pub fn reset_console(&self) -> Result<()> {
self.con_manager.reset_console()

View File

@ -14,6 +14,37 @@ use dbs_virtio_devices::Error as VirtIoError;
use crate::device_manager;
/// Shorthand result type for internal VMM commands.
pub type Result<T> = std::result::Result<T, Error>;
/// Errors associated with the VMM internal logic.
///
/// These errors cannot be generated by direct user input, but can result from bad configuration
/// of the host (for example if Dragonball doesn't have permissions to open the KVM fd).
#[derive(Debug, thiserror::Error)]
pub enum Error {
/// Failure occurs in issuing KVM ioctls and errors will be returned from kvm_ioctls lib.
#[error("failure in issuing KVM ioctl command")]
Kvm(#[source] kvm_ioctls::Error),
/// The host kernel reports an unsupported KVM API version.
#[error("unsupported KVM version {0}")]
KvmApiVersion(i32),
/// Cannot initialize the KVM context due to missing capabilities.
#[error("missing KVM capability")]
KvmCap(kvm_ioctls::Cap),
#[cfg(target_arch = "x86_64")]
#[error("failed to configure MSRs")]
/// Cannot configure MSRs
GuestMSRs(dbs_arch::msr::Error),
/// MSR inner error
#[error("MSR inner error")]
Msr(vmm_sys_util::fam::Error),
}
/// Errors associated with starting the instance.
#[derive(Debug, thiserror::Error)]
pub enum StartMicrovmError {

View File

@ -0,0 +1,60 @@
// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use std::sync::Arc;
use arc_swap::{ArcSwap, Cache};
use dbs_device::device_manager::Error;
use dbs_device::device_manager::IoManager;
/// A specialized version of [`std::result::Result`] for IO manager related operations.
pub type Result<T> = std::result::Result<T, Error>;
/// Wrapper over IoManager to support device hotplug with [`ArcSwap`] and [`Cache`].
///
/// The single field is an `arc_swap` [`Cache`] built on top of the shared
/// `Arc<ArcSwap<IoManager>>`; all accessors go through `Cache::load`.
#[derive(Clone)]
pub struct IoManagerCached(pub(crate) Cache<Arc<ArcSwap<IoManager>>, Arc<IoManager>>);
impl IoManagerCached {
    /// Create a new instance of [`IoManagerCached`] wrapping the shared `io_manager`.
    pub fn new(io_manager: Arc<ArcSwap<IoManager>>) -> Self {
        let cache = Cache::new(io_manager);
        IoManagerCached(cache)
    }

    /// Read data from IO ports.
    #[cfg(target_arch = "x86_64")]
    #[inline]
    pub fn pio_read(&mut self, addr: u16, data: &mut [u8]) -> Result<()> {
        let io_manager = self.load();
        io_manager.pio_read(addr, data)
    }

    /// Write data to IO ports.
    #[cfg(target_arch = "x86_64")]
    #[inline]
    pub fn pio_write(&mut self, addr: u16, data: &[u8]) -> Result<()> {
        let io_manager = self.load();
        io_manager.pio_write(addr, data)
    }

    /// Read data from MMIO address.
    #[inline]
    pub fn mmio_read(&mut self, addr: u64, data: &mut [u8]) -> Result<()> {
        let io_manager = self.load();
        io_manager.mmio_read(addr, data)
    }

    /// Write data to MMIO address.
    #[inline]
    pub fn mmio_write(&mut self, addr: u64, data: &[u8]) -> Result<()> {
        let io_manager = self.load();
        io_manager.mmio_write(addr, data)
    }

    /// Revalidate the inner cache by loading from it and discarding the result.
    #[inline]
    pub fn revalidate_cache(&mut self) {
        let _ = self.load();
    }

    /// Get immutable reference to underlying [`IoManager`].
    #[inline]
    pub fn load(&mut self) -> &IoManager {
        self.0.load()
    }
}

View File

@ -0,0 +1,251 @@
// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the THIRD-PARTY file.
#![allow(dead_code)]
use kvm_bindings::KVM_API_VERSION;
use kvm_ioctls::{Cap, Kvm, VmFd};
use std::os::unix::io::{FromRawFd, RawFd};
use crate::error::{Error, Result};
/// Describes a KVM context that gets attached to the micro VM instance.
/// It gives access to the functionality of the KVM wrapper as long as every required
/// KVM capability is present on the host.
pub struct KvmContext {
    /// Handle to the KVM subsystem.
    kvm: Kvm,
    /// Value of `Kvm::get_nr_memslots()` captured when the context was created.
    max_memslots: usize,
    /// MSR list obtained from `dbs_arch::msr::supported_guest_msrs` at creation time.
    #[cfg(target_arch = "x86_64")]
    supported_msrs: kvm_bindings::MsrList,
}
impl KvmContext {
/// Create a new KVM context object, using the provided `kvm_fd` if one is presented.
pub fn new(kvm_fd: Option<RawFd>) -> Result<Self> {
let kvm = if let Some(fd) = kvm_fd {
// Safe because we expect kvm_fd to contain a valid fd number when is_some() == true.
unsafe { Kvm::from_raw_fd(fd) }
} else {
Kvm::new().map_err(Error::Kvm)?
};
if kvm.get_api_version() != KVM_API_VERSION as i32 {
return Err(Error::KvmApiVersion(kvm.get_api_version()));
}
Self::check_cap(&kvm, Cap::Irqchip)?;
Self::check_cap(&kvm, Cap::Irqfd)?;
Self::check_cap(&kvm, Cap::Ioeventfd)?;
Self::check_cap(&kvm, Cap::UserMemory)?;
#[cfg(target_arch = "x86_64")]
Self::check_cap(&kvm, Cap::SetTssAddr)?;
#[cfg(target_arch = "x86_64")]
let supported_msrs = dbs_arch::msr::supported_guest_msrs(&kvm).map_err(Error::GuestMSRs)?;
let max_memslots = kvm.get_nr_memslots();
Ok(KvmContext {
kvm,
max_memslots,
#[cfg(target_arch = "x86_64")]
supported_msrs,
})
}
/// Get underlying KVM object to access kvm-ioctls interfaces.
pub fn kvm(&self) -> &Kvm {
&self.kvm
}
/// Get the maximum number of memory slots reported by this KVM context.
pub fn max_memslots(&self) -> usize {
self.max_memslots
}
/// Create a virtual machine object.
pub fn create_vm(&self) -> Result<VmFd> {
self.kvm.create_vm().map_err(Error::Kvm)
}
/// Get the max vcpu count supported by kvm
pub fn get_max_vcpus(&self) -> usize {
self.kvm.get_max_vcpus()
}
fn check_cap(kvm: &Kvm, cap: Cap) -> std::result::Result<(), Error> {
if !kvm.check_extension(cap) {
return Err(Error::KvmCap(cap));
}
Ok(())
}
}
#[cfg(target_arch = "x86_64")]
mod x86_64 {
use super::*;
use dbs_arch::msr::*;
use kvm_bindings::{kvm_msr_entry, CpuId, MsrList, Msrs};
use std::collections::HashSet;
impl KvmContext {
/// Get information about supported CPUID of x86 processor.
pub fn supported_cpuid(
&self,
max_entries_count: usize,
) -> std::result::Result<CpuId, kvm_ioctls::Error> {
self.kvm.get_supported_cpuid(max_entries_count)
}
/// Get information about supported MSRs of x86 processor.
pub fn supported_msrs(
&self,
_max_entries_count: usize,
) -> std::result::Result<MsrList, kvm_ioctls::Error> {
Ok(self.supported_msrs.clone())
}
// It's very sensible to manipulate MSRs, so please be careful to change code below.
fn build_msrs_list(kvm: &Kvm) -> Result<Msrs> {
let mut mset: HashSet<u32> = HashSet::new();
let supported_msr_list = kvm.get_msr_index_list().map_err(super::Error::Kvm)?;
for msr in supported_msr_list.as_slice() {
mset.insert(*msr);
}
let mut msrs = vec![
MSR_IA32_APICBASE,
MSR_IA32_SYSENTER_CS,
MSR_IA32_SYSENTER_ESP,
MSR_IA32_SYSENTER_EIP,
MSR_IA32_CR_PAT,
];
let filters_list = vec![
MSR_STAR,
MSR_VM_HSAVE_PA,
MSR_TSC_AUX,
MSR_IA32_TSC_ADJUST,
MSR_IA32_TSCDEADLINE,
MSR_IA32_MISC_ENABLE,
MSR_IA32_BNDCFGS,
MSR_IA32_SPEC_CTRL,
];
for msr in filters_list {
if mset.contains(&msr) {
msrs.push(msr);
}
}
// TODO: several msrs are optional.
// TODO: Since our guests don't support nested-vmx, LMCE nor SGX for now.
// msrs.push(MSR_IA32_FEATURE_CONTROL);
msrs.push(MSR_CSTAR);
msrs.push(MSR_KERNEL_GS_BASE);
msrs.push(MSR_SYSCALL_MASK);
msrs.push(MSR_LSTAR);
msrs.push(MSR_IA32_TSC);
msrs.push(MSR_KVM_SYSTEM_TIME_NEW);
msrs.push(MSR_KVM_WALL_CLOCK_NEW);
// FIXME: check if it's supported.
msrs.push(MSR_KVM_ASYNC_PF_EN);
msrs.push(MSR_KVM_PV_EOI_EN);
msrs.push(MSR_KVM_STEAL_TIME);
msrs.push(MSR_CORE_PERF_FIXED_CTR_CTRL);
msrs.push(MSR_CORE_PERF_GLOBAL_CTRL);
msrs.push(MSR_CORE_PERF_GLOBAL_STATUS);
msrs.push(MSR_CORE_PERF_GLOBAL_OVF_CTRL);
const MAX_FIXED_COUNTERS: u32 = 3;
for i in 0..MAX_FIXED_COUNTERS {
msrs.push(MSR_CORE_PERF_FIXED_CTR0 + i);
}
// FIXME: skip MCE for now.
let mtrr_msrs = vec![
MSR_MTRRdefType,
MSR_MTRRfix64K_00000,
MSR_MTRRfix16K_80000,
MSR_MTRRfix16K_A0000,
MSR_MTRRfix4K_C0000,
MSR_MTRRfix4K_C8000,
MSR_MTRRfix4K_D0000,
MSR_MTRRfix4K_D8000,
MSR_MTRRfix4K_E0000,
MSR_MTRRfix4K_E8000,
MSR_MTRRfix4K_F0000,
MSR_MTRRfix4K_F8000,
];
for mtrr in mtrr_msrs {
msrs.push(mtrr);
}
const MSR_MTRRCAP_VCNT: u32 = 8;
for i in 0..MSR_MTRRCAP_VCNT {
msrs.push(0x200 + 2 * i);
msrs.push(0x200 + 2 * i + 1);
}
let msrs: Vec<kvm_msr_entry> = msrs
.iter()
.map(|reg| kvm_msr_entry {
index: *reg,
reserved: 0,
data: 0,
})
.collect();
Msrs::from_entries(&msrs).map_err(super::Error::Msr)
}
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use kvm_ioctls::Kvm;
    use std::fs::File;
    use std::mem::ManuallyDrop;
    use std::os::unix::fs::MetadataExt;
    use std::os::unix::io::{AsRawFd, FromRawFd};

    /// A context can be created, and a KVM handle refers to the same inode as `/dev/kvm`.
    #[test]
    fn test_create_kvm_context() {
        let c = KvmContext::new(None).unwrap();

        assert!(c.max_memslots >= 32);

        let kvm = Kvm::new().unwrap();
        // SAFETY: the fd is valid for as long as `kvm` is alive. The `File` only borrows it to
        // query metadata, so it is wrapped in `ManuallyDrop`: `kvm` stays the sole owner and the
        // fd is closed exactly once (a plain `File` would close it a second time on drop).
        let f = ManuallyDrop::new(unsafe { File::from_raw_fd(kvm.as_raw_fd()) });
        let m1 = f.metadata().unwrap();
        let m2 = File::open("/dev/kvm").unwrap().metadata().unwrap();

        assert_eq!(m1.dev(), m2.dev());
        assert_eq!(m1.ino(), m2.ino());
    }

    /// Supported CPUID can be queried with a sane entry count and fails with a count of zero.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn test_get_supported_cpu_id() {
        let c = KvmContext::new(None).unwrap();

        let _ = c
            .supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .expect("failed to get supported CPUID");
        assert!(c.supported_cpuid(0).is_err());
    }

    /// A VM can be created from a fresh context.
    #[test]
    fn test_create_vm() {
        let c = KvmContext::new(None).unwrap();

        let _ = c.create_vm().unwrap();
    }
}

View File

@ -1,4 +1,5 @@
// Copyright (C) 2018-2022 Alibaba Cloud. All rights reserved.
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
//! Dragonball is a light-weight virtual machine manager(VMM) based on Linux Kernel-based Virtual
@ -10,13 +11,45 @@
/// Address space manager for virtual machines.
pub mod address_space_manager;
/// API to handle vmm requests.
pub mod api;
/// Structs to maintain configuration information.
pub mod config_manager;
/// Device manager for virtual machines.
pub mod device_manager;
/// Errors related to Virtual machine manager.
pub mod error;
/// KVM operation context for virtual machines.
pub mod kvm_context;
/// Metrics system.
pub mod metric;
/// Resource manager for virtual machines.
pub mod resource_manager;
/// Signal handler for virtual machines.
pub mod signal_handler;
/// Virtual CPU manager for virtual machines.
pub mod vcpu;
/// Virtual machine manager for virtual machines.
pub mod vm;
mod io_manager;
pub use self::io_manager::IoManagerCached;
/// Success exit code.
pub const EXIT_CODE_OK: u8 = 0;
/// Generic error exit code.
pub const EXIT_CODE_GENERIC_ERROR: u8 = 1;
/// Generic exit code for an error considered not possible to occur if the program logic is sound.
pub const EXIT_CODE_UNEXPECTED_ERROR: u8 = 2;
/// Dragonball was shut down after intercepting a restricted system call.
pub const EXIT_CODE_BAD_SYSCALL: u8 = 148;
/// Dragonball was shut down after intercepting `SIGBUS`.
pub const EXIT_CODE_SIGBUS: u8 = 149;
/// Dragonball was shut down after intercepting `SIGSEGV`.
pub const EXIT_CODE_SIGSEGV: u8 = 150;
/// Invalid json passed to the Dragonball process for configuring microvm.
pub const EXIT_CODE_INVALID_JSON: u8 = 151;
/// Bad configuration for microvm's resources, when using a single json.
pub const EXIT_CODE_BAD_CONFIGURATION: u8 = 152;
/// Command line arguments parsing error.
pub const EXIT_CODE_ARG_PARSING: u8 = 153;

View File

@ -0,0 +1,58 @@
// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
use dbs_utils::metric::SharedIncMetric;
use lazy_static::lazy_static;
use serde::Serialize;
pub use dbs_utils::metric::IncMetric;
lazy_static! {
    /// Static instance used for handling metrics.
    ///
    /// Starts out as `DragonballMetrics::default()`.
    pub static ref METRICS: DragonballMetrics = DragonballMetrics::default();
}
/// Metrics specific to VCPUs' mode of functioning.
///
/// Every field is a `SharedIncMetric` counter.
#[derive(Default, Serialize)]
pub struct VcpuMetrics {
    /// Number of KVM exits for handling input IO.
    pub exit_io_in: SharedIncMetric,
    /// Number of KVM exits for handling output IO.
    pub exit_io_out: SharedIncMetric,
    /// Number of KVM exits for handling MMIO reads.
    pub exit_mmio_read: SharedIncMetric,
    /// Number of KVM exits for handling MMIO writes.
    pub exit_mmio_write: SharedIncMetric,
    /// Number of errors during this VCPU's run.
    pub failures: SharedIncMetric,
    /// Failures in configuring the CPUID.
    pub filter_cpuid: SharedIncMetric,
}
/// Metrics for the seccomp filtering.
#[derive(Default, Serialize)]
pub struct SeccompMetrics {
    /// Number of errors inside the seccomp filtering.
    ///
    /// Incremented by the `SIGSYS` signal handler each time it intercepts a bad syscall.
    pub num_faults: SharedIncMetric,
}
/// Metrics related to signals.
///
/// The counters are incremented by the `SIGBUS` / `SIGSEGV` signal handler.
#[derive(Default, Serialize)]
pub struct SignalMetrics {
    /// Number of times that SIGBUS was handled.
    pub sigbus: SharedIncMetric,
    /// Number of times that SIGSEGV was handled.
    pub sigsegv: SharedIncMetric,
}
/// Structure storing all metrics while enforcing serialization support on them.
///
/// Groups the per-area metric structures; a process-wide instance is exposed as `METRICS`.
#[derive(Default, Serialize)]
pub struct DragonballMetrics {
    /// Metrics related to a vcpu's functioning.
    pub vcpu: VcpuMetrics,
    /// Metrics related to seccomp filtering.
    pub seccomp: SeccompMetrics,
    /// Metrics related to signals.
    pub signals: SignalMetrics,
}

View File

@ -0,0 +1,219 @@
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
use libc::{_exit, c_int, c_void, siginfo_t, SIGBUS, SIGSEGV, SIGSYS};
use log::error;
use vmm_sys_util::signal::register_signal_handler;
use crate::metric::{IncMetric, METRICS};
// The offset of `si_syscall` (offending syscall identifier) within the siginfo structure
// expressed as an `(u)int*`.
// Offset `6` for an `i32` field means that the needed information is located at `6 * sizeof(i32)`.
// See /usr/include/linux/signal.h for the C struct definition.
// See https://github.com/rust-lang/libc/issues/716 for why the offset is different in Rust.
const SI_OFF_SYSCALL: isize = 6;
const SYS_SECCOMP_CODE: i32 = 1;
// C library entry points wrapped by `sigrtmin()` / `sigrtmax()` below.
// NOTE(review): presumably these back the C `SIGRTMIN` / `SIGRTMAX` macros — confirm against
// the libc in use.
extern "C" {
    fn __libc_current_sigrtmin() -> c_int;
    fn __libc_current_sigrtmax() -> c_int;
}

/// Gets current sigrtmin, i.e. the value returned by `__libc_current_sigrtmin()`.
pub fn sigrtmin() -> c_int {
    // SAFETY: the foreign function takes no arguments and only returns an integer.
    unsafe { __libc_current_sigrtmin() }
}

/// Gets current sigrtmax, i.e. the value returned by `__libc_current_sigrtmax()`.
pub fn sigrtmax() -> c_int {
    // SAFETY: the foreign function takes no arguments and only returns an integer.
    unsafe { __libc_current_sigrtmax() }
}
/// Signal handler for `SIGSYS`.
///
/// Increments the `seccomp.num_faults` metric, logs an error message and terminates the process
/// with `EXIT_CODE_BAD_SYSCALL`. If the signal information is inconsistent with a seccomp
/// `SIGSYS`, the process is terminated with `EXIT_CODE_UNEXPECTED_ERROR` instead.
extern "C" fn sigsys_handler(num: c_int, info: *mut siginfo_t, _unused: *mut c_void) {
    // SAFETY: we're just reading some fields from a supposedly valid argument.
    let (si_signo, si_code) = unsafe { ((*info).si_signo, (*info).si_code) };

    // Sanity check. The signal is expected to always look like a seccomp SIGSYS.
    let is_seccomp_sigsys = num == si_signo && num == SIGSYS && si_code == SYS_SECCOMP_CODE;
    if !is_seccomp_sigsys {
        // SAFETY: we're terminating the process anyway.
        unsafe { _exit(i32::from(super::EXIT_CODE_UNEXPECTED_ERROR)) };
    }

    // Other signals which might do async unsafe things incompatible with the rest of this
    // function are blocked due to the sa_mask used when registering the signal handler.
    // SAFETY: reads the `i32` located `SI_OFF_SYSCALL` ints into the supposedly valid siginfo.
    let raw_syscall: i32 = unsafe { *(info as *const i32).offset(SI_OFF_SYSCALL) };
    let syscall = raw_syscall as usize;

    // SIGSYS is triggered when bad syscalls are detected. num_faults is only added when SIGSYS
    // is detected so it actually only collects the count for bad syscalls.
    METRICS.seccomp.num_faults.inc();
    error!(
        "Shutting down VM after intercepting a bad syscall ({}).",
        syscall
    );

    // Safe because we're terminating the process anyway. We don't actually do anything when
    // running unit tests.
    #[cfg(not(test))]
    unsafe {
        _exit(i32::from(super::EXIT_CODE_BAD_SYSCALL))
    };
}
/// Signal handler for `SIGBUS` and `SIGSEGV`.
///
/// Bumps the matching `signals` metric, logs an error message and terminates the process with
/// `EXIT_CODE_SIGBUS` or `EXIT_CODE_SIGSEGV`. If the signal information is inconsistent, the
/// process is terminated with `EXIT_CODE_UNEXPECTED_ERROR` instead.
extern "C" fn sigbus_sigsegv_handler(num: c_int, info: *mut siginfo_t, _unused: *mut c_void) {
    // SAFETY: we're just reading some fields from a supposedly valid argument.
    let (si_signo, si_code) = unsafe { ((*info).si_signo, (*info).si_code) };

    // Sanity check. The signal is expected to always be a consistent SIGBUS or SIGSEGV.
    let is_expected_signal = num == si_signo && (num == SIGBUS || num == SIGSEGV);
    if !is_expected_signal {
        // SAFETY: we're terminating the process anyway.
        unsafe { _exit(i32::from(super::EXIT_CODE_UNEXPECTED_ERROR)) };
    }

    // Other signals which might do async unsafe things incompatible with the rest of this
    // function are blocked due to the sa_mask used when registering the signal handler.
    if si_signo == SIGBUS {
        METRICS.signals.sigbus.inc();
    } else if si_signo == SIGSEGV {
        METRICS.signals.sigsegv.inc();
    }

    error!(
        "Shutting down VM after intercepting signal {}, code {}.",
        si_signo, si_code
    );

    // Safe because we're terminating the process anyway. We don't actually do anything when
    // running unit tests.
    #[cfg(not(test))]
    unsafe {
        _exit(i32::from(match si_signo {
            SIGBUS => super::EXIT_CODE_SIGBUS,
            SIGSEGV => super::EXIT_CODE_SIGSEGV,
            _ => super::EXIT_CODE_UNEXPECTED_ERROR,
        }))
    };
}
/// Registers all the required signal handlers.
///
/// Custom handlers are installed for `SIGSYS`, `SIGBUS` and `SIGSEGV`, in that order.
/// Registration stops at the first failure, whose error is returned.
pub fn register_signal_handlers() -> vmm_sys_util::errno::Result<()> {
    // The registered handlers are called in the current thread and interrupt whatever work is
    // being done there, so we have to keep in mind that they must only perform
    // async-signal-safe operations.
    register_signal_handler(SIGSYS, sigsys_handler)
        .and_then(|_| register_signal_handler(SIGBUS, sigbus_sigsegv_handler))
        .and_then(|_| register_signal_handler(SIGSEGV, sigbus_sigsegv_handler))
}
// Unit tests for the signal handlers registered by `register_signal_handlers()`.
#[cfg(test)]
mod tests {
use super::*;
use libc::{cpu_set_t, syscall};
use std::convert::TryInto;
use std::{mem, process, thread};
use seccompiler::{apply_filter, BpfProgram, SeccompAction, SeccompFilter};
// Counts the CPUs in the affinity mask of the calling thread.
// This function is used when running unit tests, so all the unsafes are safe.
fn cpu_count() -> usize {
let mut cpuset: cpu_set_t = unsafe { mem::zeroed() };
unsafe {
libc::CPU_ZERO(&mut cpuset);
}
// pid 0 asks for the affinity mask of the calling thread.
let ret = unsafe {
libc::sched_getaffinity(
0,
mem::size_of::<cpu_set_t>(),
&mut cpuset as *mut cpu_set_t,
)
};
assert_eq!(ret, 0);
// Count every CPU that is set in the returned mask.
let mut num = 0;
for i in 0..libc::CPU_SETSIZE as usize {
if unsafe { libc::CPU_ISSET(i, &cpuset) } {
num += 1;
}
}
num
}
// Installs the handlers, applies a seccomp filter on a helper thread, then triggers SIGSYS
// (through a filtered syscall), SIGBUS and SIGSEGV and checks that the matching metrics moved.
#[test]
fn test_signal_handler() {
let child = thread::spawn(move || {
assert!(register_signal_handlers().is_ok());
// Only the syscalls listed below are expected to pass the filter; `SYS_mkdirat` is
// deliberately left out so that calling it raises SIGSYS.
// NOTE(review): the action order (Trap for mismatch, Allow for match) follows
// seccompiler's `SeccompFilter::new` signature — confirm against the crate docs.
let filter = SeccompFilter::new(
vec![
(libc::SYS_brk, vec![]),
(libc::SYS_exit, vec![]),
(libc::SYS_futex, vec![]),
(libc::SYS_getpid, vec![]),
(libc::SYS_munmap, vec![]),
(libc::SYS_kill, vec![]),
(libc::SYS_rt_sigprocmask, vec![]),
(libc::SYS_rt_sigreturn, vec![]),
(libc::SYS_sched_getaffinity, vec![]),
(libc::SYS_set_tid_address, vec![]),
(libc::SYS_sigaltstack, vec![]),
(libc::SYS_write, vec![]),
]
.into_iter()
.collect(),
SeccompAction::Trap,
SeccompAction::Allow,
std::env::consts::ARCH.try_into().unwrap(),
)
.unwrap();
assert!(apply_filter(&TryInto::<BpfProgram>::try_into(filter).unwrap()).is_ok());
assert_eq!(METRICS.seccomp.num_faults.count(), 0);
// Call the blacklisted `SYS_mkdirat`.
unsafe { syscall(libc::SYS_mkdirat, "/foo/bar\0") };
// Call SIGBUS signal handler.
assert_eq!(METRICS.signals.sigbus.count(), 0);
unsafe {
syscall(libc::SYS_kill, process::id(), SIGBUS);
}
// Call SIGSEGV signal handler.
assert_eq!(METRICS.signals.sigsegv.count(), 0);
unsafe {
syscall(libc::SYS_kill, process::id(), SIGSEGV);
}
});
// The handlers do not terminate the process under `cfg(test)`, so the thread can be joined.
assert!(child.join().is_ok());
// Sanity check.
assert!(cpu_count() > 0);
// Kcov somehow messes with our handler getting the SIGSYS signal when a bad syscall
// is caught, so the following assertion no longer holds. Ideally, we'd have a surefire
// way of either preventing this behaviour, or detecting for certain whether this test is
// run by kcov or not. The best we could do so far is to look at the perceived number of
// available CPUs. Kcov seems to make a single CPU available to the process running the
// tests, so we use this as an heuristic to decide if we check the assertion.
if cpu_count() > 1 {
// The signal handler should let the program continue during unit tests.
assert!(METRICS.seccomp.num_faults.count() >= 1);
}
assert!(METRICS.signals.sigbus.count() >= 1);
assert!(METRICS.signals.sigsegv.count() >= 1);
}
}

View File

@ -0,0 +1,94 @@
// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the THIRD-PARTY file.
use std::sync::mpsc::{channel, Sender};
use std::sync::Arc;
use crate::IoManagerCached;
use dbs_utils::time::TimestampUs;
use kvm_ioctls::{VcpuFd, VmFd};
use vm_memory::GuestAddress;
use vmm_sys_util::eventfd::EventFd;
use crate::address_space_manager::GuestAddressSpaceImpl;
use crate::vcpu::vcpu_impl::{Result, Vcpu, VcpuStateEvent};
use crate::vcpu::VcpuConfig;
// The aarch64 support is still a stub: `configure()` ignores all of its arguments, hence the
// blanket `allow(unused)`.
#[allow(unused)]
impl Vcpu {
    /// Builds a new aarch64 vCPU object around an existing kvm `VcpuFd`.
    ///
    /// Two channels are created here: one carrying events to the vCPU and one carrying its
    /// responses back. The ends meant for the controlling side are stored as `Some(..)` so
    /// they can be handed out later. `mpidr` starts out as `0`.
    ///
    /// # Arguments
    ///
    /// * `id` - Represents the CPU number between [0, max vcpus).
    /// * `vcpu_fd` - The kvm `VcpuFd` for the vcpu.
    /// * `io_mgr` - The io-manager used to access port-io and mmio devices.
    /// * `exit_evt` - An `EventFd` that will be written into when this vcpu
    ///   exits.
    /// * `vcpu_state_event` - The eventfd which can notify vmm state of some
    ///   vcpu should change.
    /// * `vcpu_state_sender` - The channel to send state change message from
    ///   vcpu thread to vmm thread.
    /// * `create_ts` - A timestamp used by the vcpu to calculate its lifetime.
    /// * `support_immediate_exit` - Whether the kvm in use supports the immediate_exit flag.
    pub fn new_aarch64(
        id: u8,
        vcpu_fd: Arc<VcpuFd>,
        io_mgr: IoManagerCached,
        exit_evt: EventFd,
        vcpu_state_event: EventFd,
        vcpu_state_sender: Sender<VcpuStateEvent>,
        create_ts: TimestampUs,
        support_immediate_exit: bool,
    ) -> Result<Self> {
        let (evt_tx, evt_rx) = channel();
        let (resp_tx, resp_rx) = channel();

        let vcpu = Self {
            fd: vcpu_fd,
            id,
            io_mgr,
            create_ts,
            event_receiver: evt_rx,
            event_sender: Some(evt_tx),
            response_receiver: Some(resp_rx),
            response_sender: resp_tx,
            vcpu_state_event,
            vcpu_state_sender,
            support_immediate_exit,
            mpidr: 0,
            exit_evt,
        };

        Ok(vcpu)
    }

    /// Configures an aarch64 specific vcpu.
    ///
    /// Currently a placeholder: none of the arguments are used and `Ok(())` is always returned.
    ///
    /// # Arguments
    ///
    /// * `_vcpu_config` - vCPU config for this vCPU.
    /// * `vm_fd` - The kvm `VmFd` for this microvm.
    /// * `vm_as` - The guest memory address space used by this microvm.
    /// * `kernel_load_addr` - Address at which the kernel is loaded, if any.
    /// * `_pgtable_addr` - pgtable address for ap vcpu (not used in aarch64).
    pub fn configure(
        &mut self,
        _vcpu_config: &VcpuConfig,
        vm_fd: &VmFd,
        vm_as: &GuestAddressSpaceImpl,
        kernel_load_addr: Option<GuestAddress>,
        _pgtable_addr: Option<GuestAddress>,
    ) -> Result<()> {
        // TODO: add arm vcpu configure() function. issue: #4445
        Ok(())
    }

    /// Returns the MPIDR register value recorded for this vCPU.
    pub fn get_mpidr(&self) -> u64 {
        self.mpidr
    }
}

View File

@ -0,0 +1,32 @@
// Copyright (C) 2022 Alibaba Cloud Computing. All rights reserved.
// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// SPDX-License-Identifier: Apache-2.0
mod sm;
pub mod vcpu_impl;
pub mod vcpu_manager;
#[cfg(target_arch = "x86_64")]
use dbs_arch::cpuid::VpmuFeatureLevel;
/// Collection of vCPU configuration values: vCPU counts, CPU topology and, on x86_64 only,
/// the vPMU feature level.
pub struct VcpuConfig {
/// initial vcpu count
pub boot_vcpu_count: u8,
/// max vcpu count for hotplug
pub max_vcpu_count: u8,
/// threads per core for cpu topology information
pub threads_per_core: u8,
/// cores per die for cpu topology information
pub cores_per_die: u8,
/// dies per socket for cpu topology information
pub dies_per_socket: u8,
/// socket number for cpu topology information
pub sockets: u8,
/// vPMU feature level (field only exists on x86_64):
/// if vpmu feature is Disabled, it means vpmu feature is off (by default)
/// if vpmu feature is LimitedlyEnabled, it means minimal vpmu counters are supported (cycles and instructions)
/// if vpmu feature is FullyEnabled, it means all vpmu counters are supported
#[cfg(target_arch = "x86_64")]
pub vpmu_feature: VpmuFeatureLevel,
}

View File

@ -0,0 +1,149 @@
// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
use std::ops::Deref;
/// A minimal state machine driver.
///
/// A `StateMachine<T>` value describes one state of a `T`: it pairs the handler function for
/// that state with a flag telling whether the state is terminal.
///
/// Every handler (`StateFn<T>`) receives the machine by mutable reference and returns the
/// `StateMachine<T>` describing the state to move to, so each state has explicit transitions.
pub struct StateMachine<T> {
    function: StateFn<T>,
    end_state: bool,
}

/// Handler for one state of a `StateMachine<T>`: takes the machine `T` and yields the next
/// state.
type StateFn<T> = fn(&mut T) -> StateMachine<T>;

impl<T> StateMachine<T> {
    /// Wraps a state handler together with its "is final" flag.
    ///
    /// # Arguments
    ///
    /// `function` - the state handler for this state.
    /// `end_state` - whether this state is final.
    pub fn new(function: StateFn<T>, end_state: bool) -> StateMachine<T> {
        Self {
            function,
            end_state,
        }
    }

    /// Builds a non-final state, i.e. one whose handler will be run to obtain the next state.
    ///
    /// # Arguments
    ///
    /// `function` - the state handler for this state.
    pub fn next(function: StateFn<T>) -> StateMachine<T> {
        Self::new(function, false)
    }

    /// Builds a final state. `run()` stops as soon as it reaches a final state and does not
    /// invoke the handler stored in it.
    ///
    /// # Arguments
    ///
    /// `function` - the state handler recorded for this last state.
    pub fn finish(function: StateFn<T>) -> StateMachine<T> {
        Self::new(function, true)
    }

    /// Drives `machine` through its states, beginning with `starting_state_fn`.
    ///
    /// Handlers are called one after another, each returning the next state, until a final
    /// state is returned.
    ///
    /// # Arguments
    ///
    /// `machine` - a mutable reference to the object running through the various states.
    /// `starting_state_fn` - a `fn(&mut T) -> StateMachine<T>` that should be the handler for
    ///                       the initial state.
    pub fn run(machine: &mut T, starting_state_fn: StateFn<T>) {
        // The starting state is never final, so its handler always runs at least once.
        let mut current = Self::next(starting_state_fn);
        while !current.end_state {
            // Execute the handler of the current state to learn the next one.
            current = (current.function)(machine);
        }
    }
}

// Dereferencing a `StateMachine<T>` yields its handler, which allows calling the state value
// directly as if it were the handler function.
impl<T> Deref for StateMachine<T> {
    type Target = StateFn<T>;

    fn deref(&self) -> &Self::Target {
        &self.function
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Machine with states `s1`, `s2` and `s3`; each flag records that the matching handler ran.
    #[derive(Default)]
    struct DummyMachine {
        private_data_s1: bool,
        private_data_s2: bool,
        private_data_s3: bool,
    }

    impl DummyMachine {
        fn new() -> Self {
            Self::default()
        }

        // Snapshot of the three per-state flags, in state order.
        fn visited(&self) -> [bool; 3] {
            [
                self.private_data_s1,
                self.private_data_s2,
                self.private_data_s3,
            ]
        }

        // Simple state-machine: start->s1->s2->s3->done.
        fn run(&mut self) {
            // Nothing may have run before the machine is started.
            assert!(self.visited().iter().all(|&seen| !seen));

            StateMachine::run(self, Self::s1);

            // Every state must have been visited once the machine is done.
            assert!(self.visited().iter().all(|&seen| seen));
        }

        fn s1(&mut self) -> StateMachine<Self> {
            // Each handler flips its own flag exactly once.
            assert!(!self.private_data_s1);
            self.private_data_s1 = true;
            StateMachine::next(Self::s2)
        }

        fn s2(&mut self) -> StateMachine<Self> {
            assert!(!self.private_data_s2);
            self.private_data_s2 = true;
            StateMachine::next(Self::s3)
        }

        fn s3(&mut self) -> StateMachine<Self> {
            assert!(!self.private_data_s3);
            self.private_data_s3 = true;
            // The machine ends here. `s1` is recorded as the final handler on purpose: if it
            // were run again, its assertion would fail.
            StateMachine::finish(Self::s1)
        }
    }

    #[test]
    fn test_sm() {
        let mut dummy = DummyMachine::new();
        dummy.run();
    }
}

View File

@ -0,0 +1,955 @@
// Copyright (C) 2019-2022 Alibaba Cloud. All rights reserved.
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the THIRD-PARTY file.
//! The implementation for per vcpu
use std::cell::Cell;
use std::result;
use std::sync::atomic::{fence, Ordering};
use std::sync::mpsc::{Receiver, Sender, TryRecvError};
use std::sync::{Arc, Barrier};
use std::thread;
use dbs_utils::time::TimestampUs;
use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
use kvm_ioctls::{VcpuExit, VcpuFd};
use libc::{c_int, c_void, siginfo_t};
use log::{error, info};
use seccompiler::{apply_filter, BpfProgram, Error as SecError};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::{register_signal_handler, Killable};
use super::sm::StateMachine;
use crate::metric::{IncMetric, METRICS};
use crate::signal_handler::sigrtmin;
use crate::IoManagerCached;
#[cfg(target_arch = "x86_64")]
#[path = "x86_64.rs"]
mod x86_64;
#[cfg(target_arch = "aarch64")]
#[path = "aarch64.rs"]
mod aarch64;
// Base of the I/O port range reserved for dragonball-specific guest signalling (x86_64 only).
#[cfg(target_arch = "x86_64")]
const MAGIC_IOPORT_BASE: u16 = 0xdbdb;
// Port on which the guest writes a 4-byte debug value that is logged by `check_io_port_info()`.
#[cfg(target_arch = "x86_64")]
const MAGIC_IOPORT_DEBUG_INFO: u16 = MAGIC_IOPORT_BASE;
/// Offset added to `sigrtmin()` to obtain the signal number used to kick Vcpus.
pub const VCPU_RTSIG_OFFSET: i32 = 0;
#[cfg(target_arch = "x86_64")]
/// Errors associated with the wrappers over KVM ioctls.
#[derive(Debug, thiserror::Error)]
pub enum VcpuError {
/// Failed to signal Vcpu.
#[error("cannot signal the vCPU thread")]
SignalVcpu(#[source] vmm_sys_util::errno::Error),
/// Cannot open the vCPU file descriptor.
#[error("cannot open the vCPU file descriptor")]
VcpuFd(#[source] kvm_ioctls::Error),
/// Cannot spawn a new vCPU thread.
#[error("cannot spawn vCPU thread")]
VcpuSpawn(#[source] std::io::Error),
/// Cannot cleanly initialize vCPU TLS.
#[error("cannot cleanly initialize TLS fro vCPU")]
VcpuTlsInit,
/// Vcpu not present in TLS.
#[error("vCPU not present in the TLS")]
VcpuTlsNotPresent,
/// Unexpected KVM_RUN exit reason
#[error("Unexpected KVM_RUN exit reason")]
VcpuUnhandledKvmExit,
/// Pause vcpu failed
#[error("failed to pause vcpus")]
PauseFailed,
/// Kvm Ioctl Error
#[error("failure in issuing KVM ioctl command")]
Kvm(#[source] kvm_ioctls::Error),
/// Msr error
#[error("failure to deal with MSRs")]
Msr(vmm_sys_util::fam::Error),
/// A call to cpuid instruction failed on x86_64.
#[error("failure while configuring CPUID for virtual CPU on x86_64")]
CpuId(dbs_arch::cpuid::Error),
/// Error configuring the floating point related registers on x86_64.
#[error("failure while configuring the floating point related registers on x86_64")]
FPUConfiguration(dbs_arch::regs::Error),
/// Cannot set the local interruption due to bad configuration on x86_64.
#[error("cannot set the local interruption due to bad configuration on x86_64")]
LocalIntConfiguration(dbs_arch::interrupts::Error),
/// Error configuring the MSR registers on x86_64.
#[error("failure while configuring the MSR registers on x86_64")]
MSRSConfiguration(dbs_arch::regs::Error),
/// Error configuring the general purpose registers on x86_64.
#[error("failure while configuring the general purpose registers on x86_64")]
REGSConfiguration(dbs_arch::regs::Error),
/// Error configuring the special registers on x86_64.
#[error("failure while configuring the special registers on x86_64")]
SREGSConfiguration(dbs_arch::regs::Error),
/// Error configuring the page table on x86_64.
#[error("failure while configuring the page table on x86_64")]
PageTable(dbs_boot::Error),
/// The call to KVM_SET_CPUID2 failed on x86_64.
#[error("failure while calling KVM_SET_CPUID2 on x86_64")]
SetSupportedCpusFailed(#[source] kvm_ioctls::Error),
}
/// Errors associated with the wrappers over KVM ioctls (aarch64 flavour).
// The whole enum is gated on aarch64, so the individual variants need no `cfg` of their own.
#[cfg(target_arch = "aarch64")]
#[derive(Debug, thiserror::Error)]
pub enum VcpuError {
    /// Failed to signal Vcpu.
    #[error("cannot signal the vCPU thread")]
    SignalVcpu(#[source] vmm_sys_util::errno::Error),
    /// Cannot open the vCPU file descriptor.
    #[error("cannot open the vCPU file descriptor")]
    VcpuFd(#[source] kvm_ioctls::Error),
    /// Cannot spawn a new vCPU thread.
    #[error("cannot spawn vCPU thread")]
    VcpuSpawn(#[source] std::io::Error),
    /// Cannot cleanly initialize vCPU TLS.
    // The message used to read "TLS fro vCPU"; the typo is fixed here.
    #[error("cannot cleanly initialize TLS for vCPU")]
    VcpuTlsInit,
    /// Vcpu not present in TLS.
    #[error("vCPU not present in the TLS")]
    VcpuTlsNotPresent,
    /// Unexpected KVM_RUN exit reason
    #[error("Unexpected KVM_RUN exit reason")]
    VcpuUnhandledKvmExit,
    /// Pause vcpu failed
    #[error("failed to pause vcpus")]
    PauseFailed,
    /// Kvm Ioctl Error
    #[error("failure in issuing KVM ioctl command")]
    Kvm(#[source] kvm_ioctls::Error),
    /// Msr error
    #[error("failure to deal with MSRs")]
    Msr(vmm_sys_util::fam::Error),
    /// Error configuring the general purpose aarch64 registers on aarch64.
    #[error("failure while configuring the general purpose registers on aarch64")]
    REGSConfiguration(dbs_arch::regs::Error),
    /// Error setting up the global interrupt controller on aarch64.
    #[error("failure while setting up the global interrupt controller on aarch64")]
    SetupGIC(dbs_arch::gic::Error),
    /// Error getting the Vcpu preferred target on aarch64.
    #[error("failure while getting the vCPU preferred target on aarch64")]
    VcpuArmPreferredTarget(kvm_ioctls::Error),
    /// Error doing vCPU Init on aarch64.
    #[error("failure while doing vCPU init on aarch64")]
    VcpuArmInit(kvm_ioctls::Error),
}
/// Result for Vcpu related operations; the error type is always `VcpuError`.
pub type Result<T> = result::Result<T, VcpuError>;
/// List of events that the Vcpu can receive.
///
/// Events are delivered to the vCPU thread over a channel (see `VcpuHandle::send_event()`).
#[derive(Debug)]
pub enum VcpuEvent {
/// Kill the Vcpu.
Exit,
/// Pause the Vcpu.
Pause,
/// Event that should resume the Vcpu.
Resume,
/// Get vcpu thread tid
Gettid,
/// Event to revalidate vcpu IoManager cache
RevalidateCache,
}
/// List of responses that the Vcpu reports.
pub enum VcpuResponse {
/// Vcpu is paused.
Paused,
/// Vcpu is resumed.
Resumed,
/// Vcpu index and thread tid.
Tid(u8, u32),
/// Requested Vcpu operation is not allowed.
NotAllowed,
/// Requested action encountered an error
Error(VcpuError),
/// Vcpu IoManager cache is revalidated
CacheRevalidated,
}
/// List of events that the vcpu_state_sender can send.
pub enum VcpuStateEvent {
/// (result, response) for hotplug, result 0 means failure, 1 means success.
// NOTE(review): the meaning of the `response` value is not visible in this file — confirm
// against the code that sends this event.
Hotplug((i32, u32)),
}
/// Wrapper over vCPU that hides the underlying interactions with the vCPU thread.
pub struct VcpuHandle {
// Sending end of the channel carrying `VcpuEvent`s to the vCPU thread.
event_sender: Sender<VcpuEvent>,
// Receiving end of the channel carrying `VcpuResponse`s from the vCPU thread.
response_receiver: Receiver<VcpuResponse>,
// Join handle of the vCPU thread; also used to signal (kick) that thread.
vcpu_thread: thread::JoinHandle<()>,
}
impl VcpuHandle {
    /// Queues `event` for the vCPU thread and kicks the thread with a signal so that it
    /// notices the event.
    ///
    /// # Errors
    ///
    /// Returns `VcpuError::SignalVcpu` if the signal cannot be delivered.
    ///
    /// # Panics
    ///
    /// Panics if the vCPU side has already closed the event channel.
    pub fn send_event(&self, event: VcpuEvent) -> Result<()> {
        // A closed channel means the vCPU thread is gone; crash rather than carry on.
        self.event_sender
            .send(event)
            .expect("event sender channel closed on vcpu end.");

        // Interrupt the vCPU thread so it picks up the message.
        let kick_signal = sigrtmin() + VCPU_RTSIG_OFFSET;
        self.vcpu_thread
            .kill(kick_signal)
            .map_err(VcpuError::SignalVcpu)
    }

    /// Returns the receiving end of the channel on which the vCPU thread posts its responses.
    pub fn response_receiver(&self) -> &Receiver<VcpuResponse> {
        &self.response_receiver
    }

    /// Consumes the handle and waits for the vCPU thread to terminate.
    #[allow(dead_code)]
    pub fn join_vcpu_thread(self) -> thread::Result<()> {
        let Self { vcpu_thread, .. } = self;
        vcpu_thread.join()
    }
}
// Outcome of a single `run_emulation()` call that did not end in an error.
#[derive(PartialEq)]
enum VcpuEmulation {
// The exit was handled (or KVM_RUN returned EAGAIN); the vCPU can be run again.
Handled,
// KVM_RUN was interrupted (EINTR); external events should be checked.
Interrupted,
// The guest requested a reset or shutdown through a KVM system event.
Stopped,
}
/// A wrapper around creating and using a kvm-based VCPU.
///
/// Besides the kvm file descriptor it owns the vCPU-side ends of the event/response channels,
/// and, until `start_threaded()` takes them, the ends destined for the `VcpuHandle`.
pub struct Vcpu {
// vCPU fd used by the vCPU
fd: Arc<VcpuFd>,
// vCPU id info
id: u8,
// Io manager Cached for facilitating IO operations
io_mgr: IoManagerCached,
// Records vCPU create time stamp
create_ts: TimestampUs,
// The receiving end of events channel owned by the vcpu side.
event_receiver: Receiver<VcpuEvent>,
// The transmitting end of the events channel which will be given to the handler.
event_sender: Option<Sender<VcpuEvent>>,
// The receiving end of the responses channel which will be given to the handler.
response_receiver: Option<Receiver<VcpuResponse>>,
// The transmitting end of the responses channel owned by the vcpu side.
response_sender: Sender<VcpuResponse>,
// Event notifier for CPU hotplug.
// After arm adapts to hotplug vcpu, the dead code macro needs to be removed
#[cfg_attr(target_arch = "aarch64", allow(dead_code))]
vcpu_state_event: EventFd,
// CPU hotplug events.
// After arm adapts to hotplug vcpu, the dead code macro needs to be removed
#[cfg_attr(target_arch = "aarch64", allow(dead_code))]
vcpu_state_sender: Sender<VcpuStateEvent>,
// An `EventFd` that will be written into when this vcpu exits.
exit_evt: EventFd,
// Whether kvm used supports immediate_exit flag.
support_immediate_exit: bool,
// CPUID information for the x86_64 CPU
#[cfg(target_arch = "x86_64")]
cpuid: kvm_bindings::CpuId,
/// Multiprocessor affinity register recorded for aarch64
#[cfg(target_arch = "aarch64")]
pub(crate) mpidr: u64,
}
// Type of the thread-local slot holding a raw pointer to the `Vcpu` bound to the thread.
// Using this for easier explicit type-casting to help IDEs interpret the code.
type VcpuCell = Cell<Option<*const Vcpu>>;
impl Vcpu {
thread_local!(static TLS_VCPU_PTR: VcpuCell = Cell::new(None));
/// Binds `self` to the current thread by storing a pointer to it in thread-local storage.
///
/// This must succeed on the current thread before `run_on_thread_local()` can be used there.
///
/// # Errors
///
/// Returns `VcpuError::VcpuTlsInit` if a `Vcpu` is already registered for this thread.
fn init_thread_local_data(&mut self) -> Result<()> {
    let this = self as *const Vcpu;
    Self::TLS_VCPU_PTR.with(|slot: &VcpuCell| match slot.get() {
        // Some vCPU already claimed this thread.
        Some(_) => Err(VcpuError::VcpuTlsInit),
        None => {
            slot.set(Some(this));
            Ok(())
        }
    })
}
/// Unbinds `self` from the current thread.
///
/// To be called when a `Vcpu` that previously ran `init_thread_local_data()` on this thread
/// is about to go away or move to another thread.
///
/// # Errors
///
/// Returns `VcpuError::VcpuTlsNotPresent` if the thread-local slot is empty or holds a
/// different `Vcpu`.
fn reset_thread_local_data(&mut self) -> Result<()> {
    // Best-effort clean up: if the `Vcpu` was moved to another thread _before_ running this,
    // the pointer stored on the original thread is out of reach.
    let this = self as *const Vcpu;
    Self::TLS_VCPU_PTR.with(|slot: &VcpuCell| match slot.get() {
        Some(registered) if registered == this => {
            slot.take();
            Ok(())
        }
        _ => Err(VcpuError::VcpuTlsNotPresent),
    })
}
/// Calls `func` with the `Vcpu` bound to the current thread.
///
/// `init_thread_local_data()` must have been run on this thread beforehand.
///
/// # Errors
///
/// Returns `VcpuError::VcpuTlsNotPresent` if no `Vcpu` is bound to the current thread.
///
/// # Safety
///
/// This is marked unsafe as it allows temporary aliasing through
/// dereferencing from pointer an already borrowed `Vcpu`.
unsafe fn run_on_thread_local<F>(func: F) -> Result<()>
where
    F: FnOnce(&Vcpu),
{
    Self::TLS_VCPU_PTR.with(|slot: &VcpuCell| match slot.get() {
        None => Err(VcpuError::VcpuTlsNotPresent),
        Some(vcpu_ptr) => {
            // The slot is populated, and `Vcpu::drop` clears it, so the pointer is not
            // dangling when it is dereferenced here.
            func(&*vcpu_ptr);
            Ok(())
        }
    })
}
/// Installs the handler for the vCPU "kick" signal (`sigrtmin() + VCPU_RTSIG_OFFSET`).
///
/// The handler looks up the `Vcpu` bound to the interrupted thread, if any, and sets the kvm
/// immediate-exit flag on it.
///
/// # Panics
///
/// Panics if the signal handler cannot be registered.
pub fn register_kick_signal_handler() {
    extern "C" fn kick_vcpu(_: c_int, _: *mut siginfo_t, _: *mut c_void) {
        // This is safe because it's temporarily aliasing the `Vcpu` object, but we are
        // only reading `vcpu.fd` which does not change for the lifetime of the `Vcpu`.
        unsafe {
            // A thread without a bound `Vcpu` simply has nothing to kick.
            let _ = Vcpu::run_on_thread_local(|vcpu| {
                vcpu.fd.set_kvm_immediate_exit(1);
                fence(Ordering::Release);
            });
        }
    }

    let kick_signal = sigrtmin() + VCPU_RTSIG_OFFSET;
    register_signal_handler(kick_signal, kick_vcpu)
        .expect("Failed to register vcpu signal handler");
}
/// Returns the cpu index as seen by the guest OS (the `id` this `Vcpu` was created with).
pub fn cpu_index(&self) -> u8 {
self.id
}
/// Moves the vcpu onto a dedicated thread named `db_vcpu<index>` and returns the
/// `VcpuHandle` used to control it remotely.
///
/// The new thread binds the vcpu to its thread-local storage, waits on `barrier`, and then
/// enters `run()` with `seccomp_filter`.
///
/// # Errors
///
/// Returns `VcpuError::VcpuSpawn` if the thread cannot be spawned.
///
/// # Panics
///
/// Panics if the handle-side channel ends have already been taken out of `self`.
pub fn start_threaded(
    mut self,
    seccomp_filter: BpfProgram,
    barrier: Arc<Barrier>,
) -> Result<VcpuHandle> {
    // The controlling side keeps these two ends; everything else moves into the thread.
    let event_sender = self.event_sender.take().unwrap();
    let response_receiver = self.response_receiver.take().unwrap();
    let thread_name = format!("db_vcpu{}", self.cpu_index());

    let vcpu_main = move || {
        self.init_thread_local_data()
            .expect("Cannot cleanly initialize vcpu TLS.");
        barrier.wait();
        self.run(seccomp_filter);
    };

    let vcpu_thread = thread::Builder::new()
        .name(thread_name)
        .spawn(vcpu_main)
        .map_err(VcpuError::VcpuSpawn)?;

    Ok(VcpuHandle {
        event_sender,
        response_receiver,
        vcpu_thread,
    })
}
/// Runs the vCPU through `VcpuFd::run()` and returns the resulting exit or error.
///
/// Kept as a separate function so that test builds can replace it with a mock: this version
/// is compiled only when `cfg(not(test))`.
#[cfg(not(test))]
pub fn emulate(fd: &VcpuFd) -> std::result::Result<VcpuExit<'_>, kvm_ioctls::Error> {
fd.run()
}
/// Runs the vCPU in KVM context and handles the kvm exit reason.
///
/// Port-io (x86_64 only) and mmio exits are forwarded to the io-manager and reported as
/// `VcpuEmulation::Handled`; reset/shutdown system events are reported as
/// `VcpuEmulation::Stopped`; an interrupted KVM_RUN (`EINTR`) is reported as
/// `VcpuEmulation::Interrupted`.
///
/// Returns error or enum specifying whether emulation was handled or interrupted. Every
/// error path yields `VcpuError::VcpuUnhandledKvmExit`, except errors propagated from
/// `check_io_port_info()`.
fn run_emulation(&mut self) -> Result<VcpuEmulation> {
match Vcpu::emulate(&self.fd) {
Ok(run) => match run {
#[cfg(target_arch = "x86_64")]
VcpuExit::IoIn(addr, data) => {
// Errors from the io-manager are deliberately ignored.
let _ = self.io_mgr.pio_read(addr, data);
METRICS.vcpu.exit_io_in.inc();
Ok(VcpuEmulation::Handled)
}
#[cfg(target_arch = "x86_64")]
VcpuExit::IoOut(addr, data) => {
// Writes consumed by dragonball's own magic ports are not forwarded to devices.
if !self.check_io_port_info(addr, data)? {
let _ = self.io_mgr.pio_write(addr, data);
}
METRICS.vcpu.exit_io_out.inc();
Ok(VcpuEmulation::Handled)
}
VcpuExit::MmioRead(addr, data) => {
let _ = self.io_mgr.mmio_read(addr, data);
METRICS.vcpu.exit_mmio_read.inc();
Ok(VcpuEmulation::Handled)
}
VcpuExit::MmioWrite(addr, data) => {
#[cfg(target_arch = "aarch64")]
self.check_boot_complete_signal(addr, data);
let _ = self.io_mgr.mmio_write(addr, data);
METRICS.vcpu.exit_mmio_write.inc();
Ok(VcpuEmulation::Handled)
}
VcpuExit::Hlt => {
info!("Received KVM_EXIT_HLT signal");
Err(VcpuError::VcpuUnhandledKvmExit)
}
VcpuExit::Shutdown => {
info!("Received KVM_EXIT_SHUTDOWN signal");
Err(VcpuError::VcpuUnhandledKvmExit)
}
// Documentation specifies that below kvm exits are considered errors.
VcpuExit::FailEntry => {
METRICS.vcpu.failures.inc();
error!("Received KVM_EXIT_FAIL_ENTRY signal");
Err(VcpuError::VcpuUnhandledKvmExit)
}
VcpuExit::InternalError => {
METRICS.vcpu.failures.inc();
error!("Received KVM_EXIT_INTERNAL_ERROR signal");
Err(VcpuError::VcpuUnhandledKvmExit)
}
VcpuExit::SystemEvent(event_type, event_flags) => match event_type {
// Guest-initiated reset or shutdown: stop the vCPU without flagging a failure.
KVM_SYSTEM_EVENT_RESET | KVM_SYSTEM_EVENT_SHUTDOWN => {
info!(
"Received KVM_SYSTEM_EVENT: type: {}, event: {}",
event_type, event_flags
);
Ok(VcpuEmulation::Stopped)
}
_ => {
METRICS.vcpu.failures.inc();
error!(
"Received KVM_SYSTEM_EVENT signal type: {}, flag: {}",
event_type, event_flags
);
Err(VcpuError::VcpuUnhandledKvmExit)
}
},
r => {
METRICS.vcpu.failures.inc();
// TODO: Are we sure we want to finish running a vcpu upon
// receiving a vm exit that is not necessarily an error?
error!("Unexpected exit reason on vcpu run: {:?}", r);
Err(VcpuError::VcpuUnhandledKvmExit)
}
},
// KVM_RUN itself failed: dispatch on the errno carried by the error.
Err(ref e) => {
match e.errno() {
libc::EAGAIN => Ok(VcpuEmulation::Handled),
libc::EINTR => {
// Clear the immediate-exit flag (set to 1 by the kick signal handler) so that
// the next KVM_RUN is not cut short.
self.fd.set_kvm_immediate_exit(0);
// Notify that this KVM_RUN was interrupted.
Ok(VcpuEmulation::Interrupted)
}
_ => {
METRICS.vcpu.failures.inc();
error!("Failure during vcpu run: {}", e);
#[cfg(target_arch = "x86_64")]
{
error!(
"dump regs: {:?}, dump sregs: {:?}",
self.fd.get_regs(),
self.fd.get_sregs()
);
}
Err(VcpuError::VcpuUnhandledKvmExit)
}
}
}
}
}
#[cfg(target_arch = "x86_64")]
// checkout the io port that dragonball used only
fn check_io_port_info(&self, addr: u16, data: &[u8]) -> Result<bool> {
let mut checked = false;
match addr {
// debug info signal
MAGIC_IOPORT_DEBUG_INFO => {
if data.len() == 4 {
let data = unsafe { std::ptr::read(data.as_ptr() as *const u32) };
log::warn!("KDBG: guest kernel debug info: 0x{:x}", data);
checked = true;
}
}
_ => {}
};
Ok(checked)
}
// Returns the id of the calling thread as reported by `nix::unistd::gettid()`, cast to `u32`.
fn gettid() -> u32 {
nix::unistd::gettid().as_raw() as u32
}
// Asks the cached io-manager to revalidate its cache. Always returns `Ok(())`.
fn revalidate_cache(&mut self) -> Result<()> {
self.io_mgr.revalidate_cache();
Ok(())
}
/// Entry point of the vCPU thread.
///
/// Applies `seccomp_filter` to the current thread and then drives the vCPU state machine,
/// starting in the `Paused` state. Note that the state of the VCPU and associated VM must be
/// setup first for this to do anything useful.
///
/// # Panics
///
/// Panics if the seccomp filter cannot be applied for any reason other than being empty.
pub fn run(&mut self, seccomp_filter: BpfProgram) {
    // Load seccomp filters for this vCPU thread.
    // Execution panics if filters cannot be loaded, use --seccomp-level=0 if skipping filters
    // altogether is the desired behaviour.
    match apply_filter(&seccomp_filter) {
        Ok(_) => {}
        // An empty filter is tolerated: the thread simply runs unfiltered.
        Err(SecError::EmptyFilter) => {
            info!("vCPU thread {} use empty seccomp filters.", self.id);
        }
        Err(e) => panic!(
            "Failed to set the requested seccomp filters on vCPU {}: Error: {}",
            self.id, e
        ),
    }

    info!("vcpu {} is running", self.cpu_index());

    // Start running the machine state in the `Paused` state.
    StateMachine::run(self, Self::paused);
}
// Handler of the `Running` state: runs guest code until something requires attention, then
// processes at most one pending external event and returns the next state.
fn running(&mut self) -> StateMachine<Self> {
    // Stay in the emulation loop as long as possible: there is no point in ticking the state
    // machine if there are no external events.
    loop {
        let outcome = match self.run_emulation() {
            Ok(outcome) => outcome,
            // Emulation errors lead to vCPU exit.
            Err(e) => {
                error!("vcpu: {}, run_emulation failed: {:?}", self.id, e);
                return StateMachine::next(Self::waiting_exit);
            }
        };

        match outcome {
            // Emulation was stopped due to reset or shutdown.
            VcpuEmulation::Stopped => return StateMachine::next(Self::waiting_exit),
            // Emulation was interrupted, check external events.
            VcpuEmulation::Interrupted => break,
            // Without the immediate_exit flag, a signal sent by the vmm thread while a vcpu
            // exit was being handled could go unnoticed. Checking the event channel after
            // every handled exit shrinks the window in which external events are missed.
            VcpuEmulation::Handled if !self.support_immediate_exit => break,
            // Emulation ran successfully, continue.
            VcpuEmulation::Handled => {}
        }
    }

    // Process one transition request/external event, if any is pending.
    match self.event_receiver.try_recv() {
        // Running ---- Exit ----> Exited
        Ok(VcpuEvent::Exit) => StateMachine::next(Self::exited),
        // Running ---- Pause ----> Paused
        Ok(VcpuEvent::Pause) => {
            // Nothing special to do.
            self.response_sender
                .send(VcpuResponse::Paused)
                .expect("failed to send pause status");
            // TODO: we should call `KVM_KVMCLOCK_CTRL` here to make sure
            // TODO continued: the guest soft lockup watchdog does not panic on Resume.
            //let _ = self.fd.kvmclock_ctrl();
            StateMachine::next(Self::paused)
        }
        // Already running: acknowledge and stay in this state.
        Ok(VcpuEvent::Resume) => {
            self.response_sender
                .send(VcpuResponse::Resumed)
                .expect("failed to send resume status");
            StateMachine::next(Self::running)
        }
        Ok(VcpuEvent::Gettid) => {
            self.response_sender
                .send(VcpuResponse::Tid(self.cpu_index(), Vcpu::gettid()))
                .expect("failed to send vcpu thread tid");
            StateMachine::next(Self::running)
        }
        Ok(VcpuEvent::RevalidateCache) => {
            self.revalidate_cache()
                .map(|()| {
                    self.response_sender
                        .send(VcpuResponse::CacheRevalidated)
                        .expect("failed to revalidate vcpu IoManager cache");
                })
                .map_err(|e| self.response_sender.send(VcpuResponse::Error(e)))
                .expect("failed to revalidate vcpu IoManager cache");
            StateMachine::next(Self::running)
        }
        // The other end of the channel is gone without an explicit exit request.
        Err(TryRecvError::Disconnected) => StateMachine::next(Self::exited),
        // No pending event: remain in the 'running' state.
        Err(TryRecvError::Empty) => StateMachine::next(Self::running),
    }
}
// Handler of the `Paused` state: blocks until an event arrives and reacts to it.
fn paused(&mut self) -> StateMachine<Self> {
    let event = match self.event_receiver.recv() {
        Ok(event) => event,
        // The other end of the channel is gone without an explicit exit request.
        Err(_) => return StateMachine::next(Self::exited),
    };

    match event {
        // Paused ---- Exit ----> Exited
        VcpuEvent::Exit => return StateMachine::next(Self::exited),
        // Paused ---- Resume ----> Running
        VcpuEvent::Resume => {
            self.response_sender
                .send(VcpuResponse::Resumed)
                .expect("failed to send resume status");
            return StateMachine::next(Self::running);
        }
        // Already paused: acknowledge and stay paused.
        VcpuEvent::Pause => self
            .response_sender
            .send(VcpuResponse::Paused)
            .expect("failed to send pause status"),
        VcpuEvent::Gettid => self
            .response_sender
            .send(VcpuResponse::Tid(self.cpu_index(), Vcpu::gettid()))
            .expect("failed to send vcpu thread tid"),
        VcpuEvent::RevalidateCache => {
            self.revalidate_cache()
                .map(|()| {
                    self.response_sender
                        .send(VcpuResponse::CacheRevalidated)
                        .expect("failed to revalidate vcpu IoManager cache");
                })
                .map_err(|e| self.response_sender.send(VcpuResponse::Error(e)))
                .expect("failed to revalidate vcpu IoManager cache");
        }
    }

    // Every event that did not return above leaves the vCPU paused.
    StateMachine::next(Self::paused)
}
// Handler of the `WaitingExit` state: notifies the vmm through `exit_evt` and then waits for
// the `Exit` event.
fn waiting_exit(&mut self) -> StateMachine<Self> {
    // trigger vmm to stop machine
    if let Err(e) = self.exit_evt.write(1) {
        METRICS.vcpu.failures.inc();
        error!("Failed signaling vcpu exit event: {}", e);
    }

    match self.event_receiver.recv() {
        Ok(VcpuEvent::Exit) => StateMachine::next(Self::exited),
        // Any other event is unexpected here; log it and keep waiting.
        Ok(_) => {
            error!(
                "wrong state received in waiting exit state on vcpu {}",
                self.id
            );
            StateMachine::next(Self::waiting_exit)
        }
        Err(_) => {
            error!(
                "vcpu channel closed in waiting exit state on vcpu {}",
                self.id
            );
            StateMachine::next(Self::exited)
        }
    }
}
// This is the main loop of the `Exited` state.
//
// Terminal state: it performs no work of its own and simply returns
// `StateMachine::finish` for this very state function.
fn exited(&mut self) -> StateMachine<Self> {
// State machine reached its end.
StateMachine::finish(Self::exited)
}
}
// Cleanup hook that runs when a `Vcpu` value is dropped.
impl Drop for Vcpu {
fn drop(&mut self) {
// Best effort: whatever `reset_thread_local_data` returns is explicitly discarded,
// as `drop` has no way to report a failure to its caller.
let _ = self.reset_thread_local_data();
}
}
#[cfg(test)]
pub mod tests {
use std::os::unix::io::AsRawFd;
use std::sync::mpsc::{channel, Receiver};
use std::sync::Mutex;
use arc_swap::ArcSwap;
use dbs_device::device_manager::IoManager;
use kvm_ioctls::Kvm;
use lazy_static::lazy_static;
use super::*;
use crate::kvm_context::KvmContext;
/// Outcome that the test-only `Vcpu::emulate` is scripted to produce.
///
/// Each variant is translated by `Vcpu::emulate` into the matching `VcpuExit` value,
/// except for `Error`, which is turned into a `kvm_ioctls::Error`.
pub enum EmulationCase {
/// Yields `VcpuExit::IoIn` on port 0 with an empty buffer.
IoIn,
/// Yields `VcpuExit::IoOut` on port 0 with an empty buffer.
IoOut,
/// Yields `VcpuExit::MmioRead` at address 0 with an empty buffer.
MmioRead,
/// Yields `VcpuExit::MmioWrite` at address 0 with an empty buffer.
MmioWrite,
/// Yields `VcpuExit::Hlt`.
Hlt,
/// Yields `VcpuExit::Shutdown`.
Shutdown,
/// Yields `VcpuExit::FailEntry`.
FailEntry,
/// Yields `VcpuExit::InternalError`.
InternalError,
/// Yields `VcpuExit::Unknown`.
Unknown,
/// Yields `VcpuExit::SystemEvent` carrying the given event type and flags.
SystemEvent(u32, u64),
/// Makes the emulation fail with `kvm_ioctls::Error::new` of the given error number.
Error(i32),
}
// Global selector of the outcome returned by the test-only `Vcpu::emulate`.
// It starts out as `EmulationCase::Unknown`; tests overwrite it through the mutex
// before running an emulation step.
// NOTE(review): the value is shared by every test in the process, so tests scripting it
// may interfere with each other when run in parallel - confirm.
lazy_static! {
pub static ref EMULATE_RES: Mutex<EmulationCase> = Mutex::new(EmulationCase::Unknown);
}
impl Vcpu {
pub fn emulate(_fd: &VcpuFd) -> std::result::Result<VcpuExit<'_>, kvm_ioctls::Error> {
let res = &*EMULATE_RES.lock().unwrap();
match res {
EmulationCase::IoIn => Ok(VcpuExit::IoIn(0, &mut [])),
EmulationCase::IoOut => Ok(VcpuExit::IoOut(0, &[])),
EmulationCase::MmioRead => Ok(VcpuExit::MmioRead(0, &mut [])),
EmulationCase::MmioWrite => Ok(VcpuExit::MmioWrite(0, &[])),
EmulationCase::Hlt => Ok(VcpuExit::Hlt),
EmulationCase::Shutdown => Ok(VcpuExit::Shutdown),
EmulationCase::FailEntry => Ok(VcpuExit::FailEntry),
EmulationCase::InternalError => Ok(VcpuExit::InternalError),
EmulationCase::Unknown => Ok(VcpuExit::Unknown),
EmulationCase::SystemEvent(event_type, event_flags) => {
Ok(VcpuExit::SystemEvent(*event_type, *event_flags))
}
EmulationCase::Error(e) => Err(kvm_ioctls::Error::new(*e)),
}
}
}
// Test helper: creates a KVM VM with a single vcpu (id 0) and wraps it into a `Vcpu`.
// Returns the `Vcpu` together with the receiving end of its state-event channel.
#[cfg(target_arch = "x86_64")]
fn create_vcpu() -> (Vcpu, Receiver<VcpuStateEvent>) {
    // Call for kvm too frequently would cause error in some host kernel.
    std::thread::sleep(std::time::Duration::from_millis(5));

    let kvm = Kvm::new().unwrap();
    let vm = Arc::new(kvm.create_vm().unwrap());
    let kvm_context = KvmContext::new(Some(kvm.as_raw_fd())).unwrap();
    let fd = Arc::new(vm.create_vcpu(0).unwrap());
    let cpuid = kvm_context
        .supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
        .unwrap();
    let (state_tx, state_rx) = channel();

    let vcpu = Vcpu::new_x86_64(
        0,
        fd,
        IoManagerCached::new(Arc::new(ArcSwap::new(Arc::new(IoManager::new())))),
        cpuid,
        // exit (reset) eventfd
        EventFd::new(libc::EFD_NONBLOCK).unwrap(),
        // vcpu state eventfd
        EventFd::new(libc::EFD_NONBLOCK).unwrap(),
        state_tx,
        TimestampUs::default(),
        false,
    )
    .unwrap();

    (vcpu, state_rx)
}
// Checks how `run_emulation` classifies every scripted outcome of the mocked `Vcpu::emulate`.
#[cfg(target_arch = "x86_64")]
#[test]
fn test_vcpu_run_emulation() {
    // Scripts the mocked emulation with `$case`, runs a single emulation step on `$vcpu`
    // and asserts that the result matches the `$expected` pattern.
    macro_rules! assert_emulation {
        ($vcpu:ident, $case:expr, $expected:pat) => {{
            *EMULATE_RES.lock().unwrap() = $case;
            let res = $vcpu.run_emulation();
            assert!(matches!(res, $expected));
        }};
    }

    let (mut vcpu, _) = create_vcpu();

    // Port and memory mapped I/O exits are handled.
    assert_emulation!(vcpu, EmulationCase::IoIn, Ok(VcpuEmulation::Handled));
    assert_emulation!(vcpu, EmulationCase::IoOut, Ok(VcpuEmulation::Handled));
    assert_emulation!(vcpu, EmulationCase::MmioRead, Ok(VcpuEmulation::Handled));
    assert_emulation!(vcpu, EmulationCase::MmioWrite, Ok(VcpuEmulation::Handled));

    // KVM_EXIT_HLT signal
    assert_emulation!(
        vcpu,
        EmulationCase::Hlt,
        Err(VcpuError::VcpuUnhandledKvmExit)
    );
    // KVM_EXIT_SHUTDOWN signal
    assert_emulation!(
        vcpu,
        EmulationCase::Shutdown,
        Err(VcpuError::VcpuUnhandledKvmExit)
    );
    // KVM_EXIT_FAIL_ENTRY signal
    assert_emulation!(
        vcpu,
        EmulationCase::FailEntry,
        Err(VcpuError::VcpuUnhandledKvmExit)
    );
    // KVM_EXIT_INTERNAL_ERROR signal
    assert_emulation!(
        vcpu,
        EmulationCase::InternalError,
        Err(VcpuError::VcpuUnhandledKvmExit)
    );

    // KVM_SYSTEM_EVENT_RESET
    assert_emulation!(
        vcpu,
        EmulationCase::SystemEvent(KVM_SYSTEM_EVENT_RESET, 0),
        Ok(VcpuEmulation::Stopped)
    );
    // KVM_SYSTEM_EVENT_SHUTDOWN
    assert_emulation!(
        vcpu,
        EmulationCase::SystemEvent(KVM_SYSTEM_EVENT_SHUTDOWN, 0),
        Ok(VcpuEmulation::Stopped)
    );
    // Other system event
    assert_emulation!(
        vcpu,
        EmulationCase::SystemEvent(0, 0),
        Err(VcpuError::VcpuUnhandledKvmExit)
    );

    // Unknown exit reason
    assert_emulation!(
        vcpu,
        EmulationCase::Unknown,
        Err(VcpuError::VcpuUnhandledKvmExit)
    );

    // Error: EAGAIN
    assert_emulation!(
        vcpu,
        EmulationCase::Error(libc::EAGAIN),
        Ok(VcpuEmulation::Handled)
    );
    // Error: EINTR
    assert_emulation!(
        vcpu,
        EmulationCase::Error(libc::EINTR),
        Ok(VcpuEmulation::Interrupted)
    );
    // other error
    assert_emulation!(
        vcpu,
        EmulationCase::Error(libc::EINVAL),
        Err(VcpuError::VcpuUnhandledKvmExit)
    );
}
// Checks that `check_io_port_info` reports both magic I/O port writes as handled.
#[cfg(target_arch = "x86_64")]
#[test]
fn test_vcpu_check_io_port_info() {
    // The receiver is bound to a name (not `_`) so that it stays alive until the test ends.
    let (vcpu, _receiver) = create_vcpu();

    // boot complete signal
    assert!(vcpu
        .check_io_port_info(
            MAGIC_IOPORT_SIGNAL_GUEST_BOOT_COMPLETE,
            &[MAGIC_VALUE_SIGNAL_GUEST_BOOT_COMPLETE],
        )
        .unwrap());

    // debug info signal
    assert!(vcpu
        .check_io_port_info(MAGIC_IOPORT_DEBUG_INFO, &[0, 0, 0, 0])
        .unwrap());
}
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,149 @@
// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the THIRD-PARTY file.
use std::sync::mpsc::{channel, Sender};
use std::sync::Arc;
use dbs_arch::cpuid::{process_cpuid, VmSpec};
use dbs_arch::gdt::gdt_entry;
use dbs_utils::time::TimestampUs;
use kvm_bindings::CpuId;
use kvm_ioctls::{VcpuFd, VmFd};
use log::error;
use vm_memory::{Address, GuestAddress, GuestAddressSpace};
use vmm_sys_util::eventfd::EventFd;
use crate::address_space_manager::GuestAddressSpaceImpl;
use crate::metric::{IncMetric, METRICS};
use crate::vcpu::vcpu_impl::{Result, Vcpu, VcpuError, VcpuStateEvent};
use crate::vcpu::VcpuConfig;
use crate::IoManagerCached;
impl Vcpu {
/// Builds the x86_64 flavour of a [`Vcpu`].
///
/// Two channels are created here: one carrying events to the vcpu and one carrying its
/// responses back. The vcpu keeps the event receiver and the response sender; the
/// opposite ends are stored as `Some(..)` in `event_sender` / `response_receiver`.
/// The function performs no fallible operation and currently always returns `Ok`.
///
/// # Arguments
///
/// * `id` - The CPU number, in the range [0, max vcpus).
/// * `vcpu_fd` - The kvm `VcpuFd` backing this vcpu.
/// * `io_mgr` - The io-manager used to access port-io and mmio devices.
/// * `cpuid` - The `CpuId` listing the capabilities supported for this vcpu; stored as-is.
/// * `exit_evt` - An `EventFd` that will be written into when this vcpu exits.
/// * `vcpu_state_event` - The eventfd which can notify the vmm that the state of some
///   vcpu should change.
/// * `vcpu_state_sender` - The channel to send state change messages from the vcpu
///   thread to the vmm thread.
/// * `create_ts` - A timestamp used by the vcpu to calculate its lifetime.
/// * `support_immediate_exit` - Whether the kvm in use supports the immediate_exit flag.
// One argument per `Vcpu` field that the caller has to supply.
#[allow(clippy::too_many_arguments)]
pub fn new_x86_64(
    id: u8,
    vcpu_fd: Arc<VcpuFd>,
    io_mgr: IoManagerCached,
    cpuid: CpuId,
    exit_evt: EventFd,
    vcpu_state_event: EventFd,
    vcpu_state_sender: Sender<VcpuStateEvent>,
    create_ts: TimestampUs,
    support_immediate_exit: bool,
) -> Result<Self> {
    let (event_tx, event_rx) = channel();
    let (response_tx, response_rx) = channel();

    Ok(Vcpu {
        id,
        fd: vcpu_fd,
        io_mgr,
        // Initially the cpuid per vCPU is the one supported by this VM.
        cpuid,
        create_ts,
        support_immediate_exit,
        exit_evt,
        vcpu_state_event,
        vcpu_state_sender,
        event_receiver: event_rx,
        event_sender: Some(event_tx),
        response_receiver: Some(response_rx),
        response_sender: response_tx,
    })
}
/// Applies the x86_64 specific configuration to this vcpu; should be called once per vcpu.
///
/// CPUID and MSRs are always programmed. Only when `kernel_start_addr` is `Some` are the
/// general purpose registers, the FPU, a boot GDT, identity-mapped page tables and the
/// special registers set up as well. The local interrupt configuration (`set_lint`) is
/// applied last in either case.
///
/// # Arguments
///
/// * `vcpu_config` - The vcpu configuration needed for the CPUID configuration.
/// * `_vm_fd` - The kvm `VmFd` of the virtual machine; currently unused.
/// * `vm_as` - The guest address space; its memory receives the identity-mapped page
///   tables and is handed to the special register setup.
/// * `kernel_start_addr` - Guest address at which the kernel starts. `None` skips the
///   register, FPU and special register setup.
/// * `_pgtable_addr` - Page table address for ap vcpu; currently unused, the page tables
///   are always created through `dbs_boot::setup_identity_mapping`.
///
/// # Errors
///
/// The first failing step aborts the configuration and is reported through the
/// corresponding `VcpuError` variant.
pub fn configure(
    &mut self,
    vcpu_config: &VcpuConfig,
    _vm_fd: &VmFd,
    vm_as: &GuestAddressSpaceImpl,
    kernel_start_addr: Option<GuestAddress>,
    _pgtable_addr: Option<GuestAddress>,
) -> Result<()> {
    self.set_cpuid(vcpu_config)?;
    dbs_arch::regs::setup_msrs(&self.fd).map_err(VcpuError::MSRSConfiguration)?;

    if let Some(entry_point) = kernel_start_addr {
        // The same boot stack constant is passed for both stack-related arguments.
        let boot_stack = dbs_boot::layout::BOOT_STACK_POINTER;
        dbs_arch::regs::setup_regs(
            &self.fd,
            entry_point.raw_value() as u64,
            boot_stack,
            boot_stack,
            dbs_boot::layout::ZERO_PAGE_START,
        )
        .map_err(VcpuError::REGSConfiguration)?;

        dbs_arch::regs::setup_fpu(&self.fd).map_err(VcpuError::FPUConfiguration)?;

        let boot_gdt: [u64; dbs_boot::layout::BOOT_GDT_MAX as usize] = [
            gdt_entry(0, 0, 0),            // NULL
            gdt_entry(0xa09b, 0, 0xfffff), // CODE
            gdt_entry(0xc093, 0, 0xfffff), // DATA
            gdt_entry(0x808b, 0, 0xfffff), // TSS
        ];
        let page_table_base =
            dbs_boot::setup_identity_mapping(&*vm_as.memory()).map_err(VcpuError::PageTable)?;
        dbs_arch::regs::setup_sregs(
            &*vm_as.memory(),
            &self.fd,
            page_table_base,
            &boot_gdt,
            dbs_boot::layout::BOOT_GDT_OFFSET,
            dbs_boot::layout::BOOT_IDT_OFFSET,
        )
        .map_err(VcpuError::SREGSConfiguration)?;
    }

    dbs_arch::interrupts::set_lint(&self.fd).map_err(VcpuError::LocalIntConfiguration)?;

    Ok(())
}
// Derives the CPUID of this vcpu from `vcpu_config` and installs it on the vcpu fd.
//
// A `VmSpec` is built from the vcpu id and the topology/vpmu settings, the stored CPUID
// is processed in place against it, and the result is handed to KVM via `set_cpuid2`.
// A processing failure is counted in the `filter_cpuid` metric and logged before it is
// returned as `VcpuError::CpuId`.
fn set_cpuid(&mut self, vcpu_config: &VcpuConfig) -> Result<()> {
    let spec = VmSpec::new(
        self.id,
        vcpu_config.max_vcpu_count as u8,
        vcpu_config.threads_per_core,
        vcpu_config.cores_per_die,
        vcpu_config.dies_per_socket,
        vcpu_config.vpmu_feature,
    )
    .map_err(VcpuError::CpuId)?;

    if let Err(e) = process_cpuid(&mut self.cpuid, &spec) {
        METRICS.vcpu.filter_cpuid.inc();
        error!("Failure in configuring CPUID for vcpu {}: {:?}", self.id, e);
        return Err(VcpuError::CpuId(e));
    }

    self.fd
        .set_cpuid2(&self.cpuid)
        .map_err(VcpuError::SetSupportedCpusFailed)
}
}

View File

@ -18,3 +18,79 @@ pub struct NumaRegionInfo {
/// vcpu ids belonging to this region
pub vcpu_ids: Vec<u32>,
}
/// CPU topology information used to guide the guest's CPU topology initialization.
///
/// All fields are plain `u8` counters, so equality is a total equivalence relation and the
/// type derives `Eq` in addition to `PartialEq`.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct CpuTopology {
    /// Threads per core; indicates whether hyperthreading is enabled or not.
    pub threads_per_core: u8,
    /// Cores per die, to guide the guest cpu topology init.
    pub cores_per_die: u8,
    /// Dies per socket, to guide the guest cpu topology init.
    pub dies_per_socket: u8,
    /// Number of sockets.
    pub sockets: u8,
}
impl Default for CpuTopology {
fn default() -> Self {
CpuTopology {
threads_per_core: 1,
cores_per_die: 1,
dies_per_socket: 1,
sockets: 1,
}
}
}
/// Configuration information for a virtual machine instance.
///
/// Covers the vCPU setup (counts, topology, hyperthreading, power management, vPMU),
/// the guest memory setup and the optional serial path.
#[derive(Clone, Debug, PartialEq)]
pub struct VmConfigInfo {
/// Number of vCPUs to start.
pub vcpu_count: u8,
/// Maximum number of vCPUs that can be added.
pub max_vcpu_count: u8,
/// Enable or disable hyperthreading.
pub ht_enabled: bool,
/// CPU power management setting; the default value is "on".
pub cpu_pm: String,
/// CPU topology information.
pub cpu_topology: CpuTopology,
/// vPMU support level; the default value is 0.
pub vpmu_feature: u8,
/// Memory type, either "hugetlbfs" or "shmem"; the default is "shmem".
pub mem_type: String,
/// Memory file path; empty by default.
pub mem_file_path: String,
/// The memory size in MiB.
pub mem_size_mib: usize,
/// Reserved memory, in bytes.
pub reserve_memory_bytes: u64,
/// Socket path, `None` by default.
/// NOTE(review): presumably the socket backing the serial console - confirm.
pub serial_path: Option<String>,
}
impl Default for VmConfigInfo {
fn default() -> Self {
VmConfigInfo {
vcpu_count: 1,
max_vcpu_count: 1,
ht_enabled: false,
cpu_pm: String::from("on"),
cpu_topology: CpuTopology {
threads_per_core: 1,
cores_per_die: 1,
dies_per_socket: 1,
sockets: 1,
},
vpmu_feature: 0,
mem_type: String::from("shmem"),
mem_file_path: String::from(""),
mem_size_mib: 128,
reserve_memory_bytes: 0,
serial_path: None,
}
}
}