mirror of
https://github.com/kata-containers/kata-containers.git
synced 2025-08-09 11:58:16 +00:00
Merge pull request #4400 from openanolis/anolis/dragonball-2
runtime-rs: built-in Dragonball sandbox part II - vCPU manager
This commit is contained in:
commit
badbbcd8be
@ -14,18 +14,22 @@ arc-swap = "1.5.0"
|
|||||||
bytes = "1.1.0"
|
bytes = "1.1.0"
|
||||||
dbs-address-space = "0.1.0"
|
dbs-address-space = "0.1.0"
|
||||||
dbs-allocator = "0.1.0"
|
dbs-allocator = "0.1.0"
|
||||||
|
dbs-arch = "0.1.0"
|
||||||
dbs-boot = "0.2.0"
|
dbs-boot = "0.2.0"
|
||||||
dbs-device = "0.1.0"
|
dbs-device = "0.1.0"
|
||||||
dbs-interrupt = { version = "0.1.0", features = ["kvm-irq"] }
|
dbs-interrupt = { version = "0.1.0", features = ["kvm-irq"] }
|
||||||
dbs-legacy-devices = "0.1.0"
|
dbs-legacy-devices = "0.1.0"
|
||||||
|
dbs-upcall = { version = "0.1.0", optional = true }
|
||||||
dbs-utils = "0.1.0"
|
dbs-utils = "0.1.0"
|
||||||
dbs-virtio-devices = { version = "0.1.0", optional = true, features = ["virtio-mmio"] }
|
dbs-virtio-devices = { version = "0.1.0", optional = true, features = ["virtio-mmio"] }
|
||||||
kvm-bindings = "0.5.0"
|
kvm-bindings = "0.5.0"
|
||||||
kvm-ioctls = "0.11.0"
|
kvm-ioctls = "0.11.0"
|
||||||
|
lazy_static = "1.2"
|
||||||
libc = "0.2.39"
|
libc = "0.2.39"
|
||||||
linux-loader = "0.4.0"
|
linux-loader = "0.4.0"
|
||||||
log = "0.4.14"
|
log = "0.4.14"
|
||||||
nix = "0.23.1"
|
nix = "0.23.1"
|
||||||
|
seccompiler = "0.2.0"
|
||||||
serde = "1.0.27"
|
serde = "1.0.27"
|
||||||
serde_derive = "1.0.27"
|
serde_derive = "1.0.27"
|
||||||
serde_json = "1.0.9"
|
serde_json = "1.0.9"
|
||||||
@ -41,13 +45,15 @@ slog-term = "2.9.0"
|
|||||||
slog-async = "2.7.0"
|
slog-async = "2.7.0"
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
|
acpi = []
|
||||||
atomic-guest-memory = []
|
atomic-guest-memory = []
|
||||||
|
hotplug = ["virtio-vsock"]
|
||||||
virtio-vsock = ["dbs-virtio-devices/virtio-vsock", "virtio-queue"]
|
virtio-vsock = ["dbs-virtio-devices/virtio-vsock", "virtio-queue"]
|
||||||
|
|
||||||
[patch.'crates-io']
|
[patch.'crates-io']
|
||||||
dbs-device = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "84eee5737cc7d85f9921c94a93e6b9dc4ae24a39" }
|
dbs-device = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "127621db934af5ffba558e44b77afa00cdf62af6" }
|
||||||
dbs-interrupt = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "84eee5737cc7d85f9921c94a93e6b9dc4ae24a39" }
|
dbs-interrupt = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "127621db934af5ffba558e44b77afa00cdf62af6" }
|
||||||
dbs-legacy-devices = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "84eee5737cc7d85f9921c94a93e6b9dc4ae24a39" }
|
dbs-legacy-devices = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "127621db934af5ffba558e44b77afa00cdf62af6" }
|
||||||
dbs-utils = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "84eee5737cc7d85f9921c94a93e6b9dc4ae24a39" }
|
dbs-upcall = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "127621db934af5ffba558e44b77afa00cdf62af6" }
|
||||||
dbs-virtio-devices = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "84eee5737cc7d85f9921c94a93e6b9dc4ae24a39" }
|
dbs-utils = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "127621db934af5ffba558e44b77afa00cdf62af6" }
|
||||||
dbs-upcall = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "84eee5737cc7d85f9921c94a93e6b9dc4ae24a39" }
|
dbs-virtio-devices = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "127621db934af5ffba558e44b77afa00cdf62af6" }
|
||||||
|
@ -17,7 +17,10 @@ and configuration process.
|
|||||||
# Documentation
|
# Documentation
|
||||||
|
|
||||||
Device: [Device Document](docs/device.md)
|
Device: [Device Document](docs/device.md)
|
||||||
|
vCPU: [vCPU Document](docs/vcpu.md)
|
||||||
|
API: [API Document](docs/api.md)
|
||||||
|
|
||||||
|
Currently, the documentation is still being actively written.
|
||||||
You could see the [official documentation](docs/) page for more details.
|
You could see the [official documentation](docs/) page for more details.
|
||||||
|
|
||||||
# Supported Architectures
|
# Supported Architectures
|
||||||
|
7
src/dragonball/docs/api.md
Normal file
7
src/dragonball/docs/api.md
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
# API
|
||||||
|
|
||||||
|
We provide a rich set of APIs for the Kata runtime to interact with the `Dragonball` virtual machine manager.
|
||||||
|
This document provides the introduction for each of them.
|
||||||
|
|
||||||
|
TODO: Details will be added in the Part III PR for `Dragonball`
|
||||||
|
|
@ -14,4 +14,7 @@ Currently we have following device manager:
|
|||||||
|
|
||||||
## Device supported
|
## Device supported
|
||||||
`VIRTIO-VSOCK`
|
`VIRTIO-VSOCK`
|
||||||
|
`i8042`
|
||||||
|
`COM1`
|
||||||
|
`COM2`
|
||||||
|
|
||||||
|
42
src/dragonball/docs/vcpu.md
Normal file
42
src/dragonball/docs/vcpu.md
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
# vCPU
|
||||||
|
|
||||||
|
## vCPU Manager
|
||||||
|
The vCPU manager manages all vCPU-related actions. In this document we will dive into some of its important structure members.
|
||||||
|
|
||||||
|
For now, aarch64 vCPU support is still under development; we'll introduce it when we merge `runtime-rs` into the master branch. (issue: #4445)
|
||||||
|
|
||||||
|
### vCPU config
|
||||||
|
`VcpuConfig` is used to configure guest overall CPU info.
|
||||||
|
|
||||||
|
`boot_vcpu_count` is used to define the initial vCPU number.
|
||||||
|
|
||||||
|
`max_vcpu_count` is used to define the maximum vCPU number, and it serves as the upper bound for the CPU hotplug feature.
|
||||||
|
|
||||||
|
`thread_per_core`, `cores_per_die`, `dies_per_socket` and `socket` are used to define CPU topology.
|
||||||
|
|
||||||
|
`vpmu_feature` is used to define `vPMU` feature level.
|
||||||
|
If `vPMU` feature is `Disabled`, it means `vPMU` feature is off (by default).
|
||||||
|
If `vPMU` feature is `LimitedlyEnabled`, it means minimal `vPMU` counters are supported (cycles and instructions).
|
||||||
|
If `vPMU` feature is `FullyEnabled`, it means all `vPMU` counters are supported.
|
||||||
|
|
||||||
|
## vCPU State
|
||||||
|
|
||||||
|
The vCPU state machine has four states: `running`, `paused`, `waiting_exit` and `exited`. The state machine is used to maintain the task flow.
|
||||||
|
|
||||||
|
When the vCPU is created, it'll turn to `paused` state. After vCPU resource is ready at VMM, it'll send a `Resume` event to the vCPU thread, and then vCPU state will change to `running`.
|
||||||
|
|
||||||
|
During the `running` state, VMM will catch vCPU exit and execute different logic according to the exit reason.
|
||||||
|
|
||||||
|
If the VMM catches an exit reason that it cannot handle, the state will change to `waiting_exit` and the VMM will stop the virtual machine.
|
||||||
|
When the state switches to `waiting_exit`, an exit event will be sent to the vCPU `exit_evt`. The event manager will detect the change in `exit_evt` and set the VMM `exit_evt_flag` to 1. A thread serving the VMM event loop will check `exit_evt_flag`, and if the flag is 1, it will stop the VMM.
|
||||||
|
|
||||||
|
When the VMM is stopped / destroyed, the state will change to `exited`.
|
||||||
|
|
||||||
|
## vCPU Hot plug
|
||||||
|
Since `Dragonball Sandbox` doesn't support ACPI virtualization, we use [`upcall`](https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-upcall) to establish a direct communication channel between `Dragonball` and the guest in order to trigger vCPU hotplug.
|
||||||
|
|
||||||
|
To use `upcall`, kernel patches are needed. You can get the patches from the [`upcall`](https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-upcall) page, and we'll provide a ready-to-use guest kernel binary for you to try.
|
||||||
|
|
||||||
|
vCPU hot plug / hot unplug range is [1, `max_vcpu_count`]. Operations not in this range will be invalid.
|
||||||
|
|
||||||
|
|
6
src/dragonball/src/api/mod.rs
Normal file
6
src/dragonball/src/api/mod.rs
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
// Copyright (C) 2019-2022 Alibaba Cloud. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
//! API related data structures to configure the vmm.
|
||||||
|
|
||||||
|
pub mod v1;
|
84
src/dragonball/src/api/v1/instance_info.rs
Normal file
84
src/dragonball/src/api/v1/instance_info.rs
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
|
||||||
|
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
use serde_derive::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
/// The microvm state.
|
||||||
|
///
|
||||||
|
/// When Dragonball starts, the instance state is Uninitialized. Once start_microvm method is
|
||||||
|
/// called, the state goes from Uninitialized to Starting. The state is changed to Running until
|
||||||
|
/// the start_microvm method ends. Halting and Halted are currently unsupported.
|
||||||
|
#[derive(Copy, Clone, Debug, Deserialize, PartialEq, Serialize)]
|
||||||
|
pub enum InstanceState {
|
||||||
|
/// Microvm is not initialized.
|
||||||
|
Uninitialized,
|
||||||
|
/// Microvm is starting.
|
||||||
|
Starting,
|
||||||
|
/// Microvm is running.
|
||||||
|
Running,
|
||||||
|
/// Microvm is Paused.
|
||||||
|
Paused,
|
||||||
|
/// Microvm received a halt instruction.
|
||||||
|
Halting,
|
||||||
|
/// Microvm is halted.
|
||||||
|
Halted,
|
||||||
|
/// Microvm exit instead of process exit.
|
||||||
|
Exited(i32),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The state of async actions
|
||||||
|
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
|
||||||
|
pub enum AsyncState {
|
||||||
|
/// Uninitialized
|
||||||
|
Uninitialized,
|
||||||
|
/// Success
|
||||||
|
Success,
|
||||||
|
/// Failure
|
||||||
|
Failure,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The strongly typed that contains general information about the microVM.
|
||||||
|
#[derive(Debug, Deserialize, Serialize)]
|
||||||
|
pub struct InstanceInfo {
|
||||||
|
/// The ID of the microVM.
|
||||||
|
pub id: String,
|
||||||
|
/// The state of the microVM.
|
||||||
|
pub state: InstanceState,
|
||||||
|
/// The version of the VMM that runs the microVM.
|
||||||
|
pub vmm_version: String,
|
||||||
|
/// The pid of the current VMM process.
|
||||||
|
pub pid: u32,
|
||||||
|
/// The state of async actions.
|
||||||
|
pub async_state: AsyncState,
|
||||||
|
/// List of tids of vcpu threads (vcpu index, tid)
|
||||||
|
pub tids: Vec<(u8, u32)>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl InstanceInfo {
|
||||||
|
/// create instance info object with given id, version, and platform type
|
||||||
|
pub fn new(id: String, vmm_version: String) -> Self {
|
||||||
|
InstanceInfo {
|
||||||
|
id,
|
||||||
|
state: InstanceState::Uninitialized,
|
||||||
|
vmm_version,
|
||||||
|
pid: std::process::id(),
|
||||||
|
async_state: AsyncState::Uninitialized,
|
||||||
|
tids: Vec::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for InstanceInfo {
|
||||||
|
fn default() -> Self {
|
||||||
|
InstanceInfo {
|
||||||
|
id: String::from(""),
|
||||||
|
state: InstanceState::Uninitialized,
|
||||||
|
vmm_version: env!("CARGO_PKG_VERSION").to_string(),
|
||||||
|
pid: std::process::id(),
|
||||||
|
async_state: AsyncState::Uninitialized,
|
||||||
|
tids: Vec::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
7
src/dragonball/src/api/v1/mod.rs
Normal file
7
src/dragonball/src/api/v1/mod.rs
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
// Copyright (C) 2019-2022 Alibaba Cloud. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
//! API Version 1 related data structures to configure the vmm.
|
||||||
|
|
||||||
|
mod instance_info;
|
||||||
|
pub use self::instance_info::{InstanceInfo, InstanceState};
|
@ -29,6 +29,12 @@ use dbs_virtio_devices::{
|
|||||||
VirtioDevice,
|
VirtioDevice,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#[cfg(all(feature = "hotplug", feature = "dbs-upcall"))]
|
||||||
|
use dbs_upcall::{
|
||||||
|
DevMgrRequest, DevMgrService, MmioDevRequest, UpcallClient, UpcallClientError,
|
||||||
|
UpcallClientRequest, UpcallClientResponse,
|
||||||
|
};
|
||||||
|
|
||||||
use crate::address_space_manager::GuestAddressSpaceImpl;
|
use crate::address_space_manager::GuestAddressSpaceImpl;
|
||||||
use crate::error::StartMicrovmError;
|
use crate::error::StartMicrovmError;
|
||||||
use crate::resource_manager::ResourceManager;
|
use crate::resource_manager::ResourceManager;
|
||||||
@ -83,6 +89,11 @@ pub enum DeviceMgrError {
|
|||||||
/// Error from Virtio subsystem.
|
/// Error from Virtio subsystem.
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
Virtio(virtio::Error),
|
Virtio(virtio::Error),
|
||||||
|
|
||||||
|
#[cfg(all(feature = "hotplug", feature = "dbs-upcall"))]
|
||||||
|
/// Failed to hotplug the device.
|
||||||
|
#[error("failed to hotplug virtual device")]
|
||||||
|
HotplugDevice(#[source] UpcallClientError),
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Specialized version of `std::result::Result` for device manager operations.
|
/// Specialized version of `std::result::Result` for device manager operations.
|
||||||
@ -188,6 +199,8 @@ pub struct DeviceOpContext {
|
|||||||
logger: slog::Logger,
|
logger: slog::Logger,
|
||||||
is_hotplug: bool,
|
is_hotplug: bool,
|
||||||
|
|
||||||
|
#[cfg(all(feature = "hotplug", feature = "dbs-upcall"))]
|
||||||
|
upcall_client: Option<Arc<UpcallClient<DevMgrService>>>,
|
||||||
#[cfg(feature = "dbs-virtio-devices")]
|
#[cfg(feature = "dbs-virtio-devices")]
|
||||||
virtio_devices: Vec<Arc<DbsMmioV2Device>>,
|
virtio_devices: Vec<Arc<DbsMmioV2Device>>,
|
||||||
}
|
}
|
||||||
@ -220,6 +233,8 @@ impl DeviceOpContext {
|
|||||||
address_space,
|
address_space,
|
||||||
logger,
|
logger,
|
||||||
is_hotplug,
|
is_hotplug,
|
||||||
|
#[cfg(all(feature = "hotplug", feature = "dbs-upcall"))]
|
||||||
|
upcall_client: None,
|
||||||
#[cfg(feature = "dbs-virtio-devices")]
|
#[cfg(feature = "dbs-virtio-devices")]
|
||||||
virtio_devices: Vec::new(),
|
virtio_devices: Vec::new(),
|
||||||
}
|
}
|
||||||
@ -236,35 +251,122 @@ impl DeviceOpContext {
|
|||||||
&self.logger
|
&self.logger
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[allow(unused_variables)]
|
||||||
fn generate_kernel_boot_args(&mut self, kernel_config: &mut KernelConfigInfo) -> Result<()> {
|
fn generate_kernel_boot_args(&mut self, kernel_config: &mut KernelConfigInfo) -> Result<()> {
|
||||||
if !self.is_hotplug {
|
if self.is_hotplug {
|
||||||
return Err(DeviceMgrError::InvalidOperation);
|
return Err(DeviceMgrError::InvalidOperation);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "dbs-virtio-devices")]
|
#[cfg(feature = "dbs-virtio-devices")]
|
||||||
let cmdline = kernel_config.kernel_cmdline_mut();
|
{
|
||||||
|
let cmdline = kernel_config.kernel_cmdline_mut();
|
||||||
|
|
||||||
#[cfg(feature = "dbs-virtio-devices")]
|
for device in self.virtio_devices.iter() {
|
||||||
for device in self.virtio_devices.iter() {
|
let (mmio_base, mmio_size, irq) = DeviceManager::get_virtio_device_info(device)?;
|
||||||
let (mmio_base, mmio_size, irq) = DeviceManager::get_virtio_device_info(device)?;
|
|
||||||
|
|
||||||
// as per doc, [virtio_mmio.]device=<size>@<baseaddr>:<irq> needs to be appended
|
// as per doc, [virtio_mmio.]device=<size>@<baseaddr>:<irq> needs to be appended
|
||||||
// to kernel commandline for virtio mmio devices to get recognized
|
// to kernel commandline for virtio mmio devices to get recognized
|
||||||
// the size parameter has to be transformed to KiB, so dividing hexadecimal value in
|
// the size parameter has to be transformed to KiB, so dividing hexadecimal value in
|
||||||
// bytes to 1024; further, the '{}' formatting rust construct will automatically
|
// bytes to 1024; further, the '{}' formatting rust construct will automatically
|
||||||
// transform it to decimal
|
// transform it to decimal
|
||||||
cmdline
|
cmdline
|
||||||
.insert(
|
.insert(
|
||||||
"virtio_mmio.device",
|
"virtio_mmio.device",
|
||||||
&format!("{}K@0x{:08x}:{}", mmio_size / 1024, mmio_base, irq),
|
&format!("{}K@0x{:08x}:{}", mmio_size / 1024, mmio_base, irq),
|
||||||
)
|
)
|
||||||
.map_err(DeviceMgrError::Cmdline)?;
|
.map_err(DeviceMgrError::Cmdline)?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(not(feature = "hotplug"))]
|
||||||
|
impl DeviceOpContext {
|
||||||
|
pub(crate) fn insert_hotplug_mmio_device(
|
||||||
|
&self,
|
||||||
|
_dev: &Arc<dyn DeviceIo>,
|
||||||
|
_callback: Option<()>,
|
||||||
|
) -> Result<()> {
|
||||||
|
Err(DeviceMgrError::InvalidOperation)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn remove_hotplug_mmio_device(
|
||||||
|
&self,
|
||||||
|
_dev: &Arc<dyn DeviceIo>,
|
||||||
|
_callback: Option<()>,
|
||||||
|
) -> Result<()> {
|
||||||
|
Err(DeviceMgrError::InvalidOperation)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(all(feature = "hotplug", feature = "dbs-upcall"))]
impl DeviceOpContext {
    /// Send a device manager request to the guest through the upcall client.
    ///
    /// When `callback` is provided the request is sent with `send_request` and the
    /// callback is handed over to the client; otherwise the request is sent with
    /// `send_request_without_result`.
    ///
    /// # Errors
    ///
    /// * `DeviceMgrError::InvalidOperation` if no upcall client is attached.
    /// * `DeviceMgrError::HotplugDevice` if the upcall client fails to send the request.
    fn call_hotplug_device(
        &self,
        req: DevMgrRequest,
        callback: Option<Box<dyn Fn(UpcallClientResponse) + Send>>,
    ) -> Result<()> {
        let client = self
            .upcall_client
            .as_ref()
            .ok_or(DeviceMgrError::InvalidOperation)?;
        let request = UpcallClientRequest::DevMgr(req);

        match callback {
            Some(on_response) => {
                client
                    .send_request(request, on_response)
                    .map_err(DeviceMgrError::HotplugDevice)?;
            }
            None => {
                client
                    .send_request_without_result(request)
                    .map_err(DeviceMgrError::HotplugDevice)?;
            }
        }

        Ok(())
    }

    /// Build the MMIO description shared by the hot-add and hot-remove requests.
    ///
    /// Rejects the call with `DeviceMgrError::InvalidOperation` when the context is not
    /// in hotplug mode, before querying the device for its MMIO base, size and irq.
    fn hotplug_mmio_dev_request(&self, dev: &Arc<DbsMmioV2Device>) -> Result<MmioDevRequest> {
        if !self.is_hotplug {
            return Err(DeviceMgrError::InvalidOperation);
        }

        let (mmio_base, mmio_size, mmio_irq) = DeviceManager::get_virtio_device_info(dev)?;
        Ok(MmioDevRequest {
            mmio_base,
            mmio_size,
            mmio_irq,
        })
    }

    /// Ask the guest to add the MMIO device `dev` through an `AddMmioDev` upcall request.
    ///
    /// `callback`, if any, is passed to the upcall client together with the request.
    /// Fails with `DeviceMgrError::InvalidOperation` when the context is not in hotplug
    /// mode or no upcall client is attached.
    pub(crate) fn insert_hotplug_mmio_device(
        &self,
        dev: &Arc<DbsMmioV2Device>,
        callback: Option<Box<dyn Fn(UpcallClientResponse) + Send>>,
    ) -> Result<()> {
        let mmio_dev = self.hotplug_mmio_dev_request(dev)?;
        self.call_hotplug_device(DevMgrRequest::AddMmioDev(mmio_dev), callback)
    }

    /// Ask the guest to remove the MMIO device `dev` through a `DelMmioDev` upcall request.
    ///
    /// `callback`, if any, is passed to the upcall client together with the request.
    /// Fails with `DeviceMgrError::InvalidOperation` when the context is not in hotplug
    /// mode or no upcall client is attached.
    pub(crate) fn remove_hotplug_mmio_device(
        &self,
        dev: &Arc<DbsMmioV2Device>,
        callback: Option<Box<dyn Fn(UpcallClientResponse) + Send>>,
    ) -> Result<()> {
        let mmio_dev = self.hotplug_mmio_dev_request(dev)?;
        self.call_hotplug_device(DevMgrRequest::DelMmioDev(mmio_dev), callback)
    }
}
|
||||||
|
|
||||||
|
#[cfg(all(feature = "hotplug", feature = "acpi"))]
|
||||||
|
impl DeviceOpContext {
|
||||||
|
// TODO: We will implement this when we develop ACPI virtualization
|
||||||
|
}
|
||||||
|
|
||||||
/// Device manager for virtual machines, which manages all device for a virtual machine.
|
/// Device manager for virtual machines, which manages all device for a virtual machine.
|
||||||
pub struct DeviceManager {
|
pub struct DeviceManager {
|
||||||
io_manager: Arc<ArcSwap<IoManager>>,
|
io_manager: Arc<ArcSwap<IoManager>>,
|
||||||
@ -351,7 +453,7 @@ impl DeviceManager {
|
|||||||
self.set_guest_kernel_log_stream(dmesg_fifo)
|
self.set_guest_kernel_log_stream(dmesg_fifo)
|
||||||
.map_err(|_| StartMicrovmError::EventFd)?;
|
.map_err(|_| StartMicrovmError::EventFd)?;
|
||||||
|
|
||||||
slog::info!(self.logger, "init console path: {:?}", com1_sock_path);
|
info!(self.logger, "init console path: {:?}", com1_sock_path);
|
||||||
if let Some(path) = com1_sock_path {
|
if let Some(path) = com1_sock_path {
|
||||||
if let Some(legacy_manager) = self.legacy_manager.as_ref() {
|
if let Some(legacy_manager) = self.legacy_manager.as_ref() {
|
||||||
let com1 = legacy_manager.get_com1_serial();
|
let com1 = legacy_manager.get_com1_serial();
|
||||||
@ -387,19 +489,6 @@ impl DeviceManager {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Restore legacy devices
|
|
||||||
pub fn restore_legacy_devices(
|
|
||||||
&mut self,
|
|
||||||
dmesg_fifo: Option<Box<dyn io::Write + Send>>,
|
|
||||||
com1_sock_path: Option<String>,
|
|
||||||
) -> std::result::Result<(), StartMicrovmError> {
|
|
||||||
self.set_guest_kernel_log_stream(dmesg_fifo)
|
|
||||||
.map_err(|_| StartMicrovmError::EventFd)?;
|
|
||||||
slog::info!(self.logger, "restore console path: {:?}", com1_sock_path);
|
|
||||||
// TODO: restore console
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Reset the console into canonical mode.
|
/// Reset the console into canonical mode.
|
||||||
pub fn reset_console(&self) -> Result<()> {
|
pub fn reset_console(&self) -> Result<()> {
|
||||||
self.con_manager.reset_console()
|
self.con_manager.reset_console()
|
||||||
|
@ -14,6 +14,37 @@ use dbs_virtio_devices::Error as VirtIoError;
|
|||||||
|
|
||||||
use crate::device_manager;
|
use crate::device_manager;
|
||||||
|
|
||||||
|
/// Shorthand result type for internal VMM commands.
|
||||||
|
pub type Result<T> = std::result::Result<T, Error>;
|
||||||
|
|
||||||
|
/// Errors associated with the VMM internal logic.
|
||||||
|
///
|
||||||
|
/// These errors cannot be generated by direct user input, but can result from bad configuration
|
||||||
|
/// of the host (for example if Dragonball doesn't have permissions to open the KVM fd).
|
||||||
|
#[derive(Debug, thiserror::Error)]
|
||||||
|
pub enum Error {
|
||||||
|
/// Failure occurs in issuing KVM ioctls and errors will be returned from kvm_ioctls lib.
|
||||||
|
#[error("failure in issuing KVM ioctl command")]
|
||||||
|
Kvm(#[source] kvm_ioctls::Error),
|
||||||
|
|
||||||
|
/// The host kernel reports an unsupported KVM API version.
|
||||||
|
#[error("unsupported KVM version {0}")]
|
||||||
|
KvmApiVersion(i32),
|
||||||
|
|
||||||
|
/// Cannot initialize the KVM context due to missing capabilities.
|
||||||
|
#[error("missing KVM capability")]
|
||||||
|
KvmCap(kvm_ioctls::Cap),
|
||||||
|
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
#[error("failed to configure MSRs")]
|
||||||
|
/// Cannot configure MSRs
|
||||||
|
GuestMSRs(dbs_arch::msr::Error),
|
||||||
|
|
||||||
|
/// MSR inner error
|
||||||
|
#[error("MSR inner error")]
|
||||||
|
Msr(vmm_sys_util::fam::Error),
|
||||||
|
}
|
||||||
|
|
||||||
/// Errors associated with starting the instance.
|
/// Errors associated with starting the instance.
|
||||||
#[derive(Debug, thiserror::Error)]
|
#[derive(Debug, thiserror::Error)]
|
||||||
pub enum StartMicrovmError {
|
pub enum StartMicrovmError {
|
||||||
|
60
src/dragonball/src/io_manager.rs
Normal file
60
src/dragonball/src/io_manager.rs
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use arc_swap::{ArcSwap, Cache};
|
||||||
|
use dbs_device::device_manager::Error;
|
||||||
|
use dbs_device::device_manager::IoManager;
|
||||||
|
|
||||||
|
/// A specialized version of [`std::result::Result`] for IO manager related operations.
|
||||||
|
pub type Result<T> = std::result::Result<T, Error>;
|
||||||
|
|
||||||
|
/// Wrapper over IoManager to support device hotplug with [`ArcSwap`] and [`Cache`].
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct IoManagerCached(pub(crate) Cache<Arc<ArcSwap<IoManager>>, Arc<IoManager>>);
|
||||||
|
|
||||||
|
impl IoManagerCached {
|
||||||
|
/// Create a new instance of [`IoManagerCached`].
|
||||||
|
pub fn new(io_manager: Arc<ArcSwap<IoManager>>) -> Self {
|
||||||
|
IoManagerCached(Cache::new(io_manager))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
#[inline]
|
||||||
|
/// Read data from IO ports.
|
||||||
|
pub fn pio_read(&mut self, addr: u16, data: &mut [u8]) -> Result<()> {
|
||||||
|
self.0.load().pio_read(addr, data)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
#[inline]
|
||||||
|
/// Write data to IO ports.
|
||||||
|
pub fn pio_write(&mut self, addr: u16, data: &[u8]) -> Result<()> {
|
||||||
|
self.0.load().pio_write(addr, data)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
/// Read data to MMIO address.
|
||||||
|
pub fn mmio_read(&mut self, addr: u64, data: &mut [u8]) -> Result<()> {
|
||||||
|
self.0.load().mmio_read(addr, data)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
/// Write data to MMIO address.
|
||||||
|
pub fn mmio_write(&mut self, addr: u64, data: &[u8]) -> Result<()> {
|
||||||
|
self.0.load().mmio_write(addr, data)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
/// Revalidate the inner cache
|
||||||
|
pub fn revalidate_cache(&mut self) {
|
||||||
|
let _ = self.0.load();
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
/// Get immutable reference to underlying [`IoManager`].
|
||||||
|
pub fn load(&mut self) -> &IoManager {
|
||||||
|
self.0.load()
|
||||||
|
}
|
||||||
|
}
|
251
src/dragonball/src/kvm_context.rs
Normal file
251
src/dragonball/src/kvm_context.rs
Normal file
@ -0,0 +1,251 @@
|
|||||||
|
// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
|
||||||
|
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
//
|
||||||
|
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style license that can be
|
||||||
|
// found in the THIRD-PARTY file.
|
||||||
|
#![allow(dead_code)]
|
||||||
|
use kvm_bindings::KVM_API_VERSION;
|
||||||
|
use kvm_ioctls::{Cap, Kvm, VmFd};
|
||||||
|
use std::os::unix::io::{FromRawFd, RawFd};
|
||||||
|
|
||||||
|
use crate::error::{Error, Result};
|
||||||
|
|
||||||
|
/// Describes a KVM context that gets attached to the micro VM instance.
|
||||||
|
/// It gives access to the functionality of the KVM wrapper as long as every required
|
||||||
|
/// KVM capability is present on the host.
|
||||||
|
pub struct KvmContext {
|
||||||
|
kvm: Kvm,
|
||||||
|
max_memslots: usize,
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
supported_msrs: kvm_bindings::MsrList,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl KvmContext {
|
||||||
|
/// Create a new KVM context object, using the provided `kvm_fd` if one is presented.
|
||||||
|
pub fn new(kvm_fd: Option<RawFd>) -> Result<Self> {
|
||||||
|
let kvm = if let Some(fd) = kvm_fd {
|
||||||
|
// Safe because we expect kvm_fd to contain a valid fd number when is_some() == true.
|
||||||
|
unsafe { Kvm::from_raw_fd(fd) }
|
||||||
|
} else {
|
||||||
|
Kvm::new().map_err(Error::Kvm)?
|
||||||
|
};
|
||||||
|
|
||||||
|
if kvm.get_api_version() != KVM_API_VERSION as i32 {
|
||||||
|
return Err(Error::KvmApiVersion(kvm.get_api_version()));
|
||||||
|
}
|
||||||
|
|
||||||
|
Self::check_cap(&kvm, Cap::Irqchip)?;
|
||||||
|
Self::check_cap(&kvm, Cap::Irqfd)?;
|
||||||
|
Self::check_cap(&kvm, Cap::Ioeventfd)?;
|
||||||
|
Self::check_cap(&kvm, Cap::UserMemory)?;
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
Self::check_cap(&kvm, Cap::SetTssAddr)?;
|
||||||
|
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
let supported_msrs = dbs_arch::msr::supported_guest_msrs(&kvm).map_err(Error::GuestMSRs)?;
|
||||||
|
let max_memslots = kvm.get_nr_memslots();
|
||||||
|
|
||||||
|
Ok(KvmContext {
|
||||||
|
kvm,
|
||||||
|
max_memslots,
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
supported_msrs,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get underlying KVM object to access kvm-ioctls interfaces.
|
||||||
|
pub fn kvm(&self) -> &Kvm {
|
||||||
|
&self.kvm
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the maximum number of memory slots reported by this KVM context.
|
||||||
|
pub fn max_memslots(&self) -> usize {
|
||||||
|
self.max_memslots
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a virtual machine object.
|
||||||
|
pub fn create_vm(&self) -> Result<VmFd> {
|
||||||
|
self.kvm.create_vm().map_err(Error::Kvm)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the max vcpu count supported by kvm
|
||||||
|
pub fn get_max_vcpus(&self) -> usize {
|
||||||
|
self.kvm.get_max_vcpus()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn check_cap(kvm: &Kvm, cap: Cap) -> std::result::Result<(), Error> {
|
||||||
|
if !kvm.check_extension(cap) {
|
||||||
|
return Err(Error::KvmCap(cap));
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
mod x86_64 {
|
||||||
|
use super::*;
|
||||||
|
use dbs_arch::msr::*;
|
||||||
|
use kvm_bindings::{kvm_msr_entry, CpuId, MsrList, Msrs};
|
||||||
|
use std::collections::HashSet;
|
||||||
|
|
||||||
|
impl KvmContext {
|
||||||
|
/// Get information about supported CPUID of x86 processor.
|
||||||
|
pub fn supported_cpuid(
|
||||||
|
&self,
|
||||||
|
max_entries_count: usize,
|
||||||
|
) -> std::result::Result<CpuId, kvm_ioctls::Error> {
|
||||||
|
self.kvm.get_supported_cpuid(max_entries_count)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get information about supported MSRs of x86 processor.
|
||||||
|
pub fn supported_msrs(
|
||||||
|
&self,
|
||||||
|
_max_entries_count: usize,
|
||||||
|
) -> std::result::Result<MsrList, kvm_ioctls::Error> {
|
||||||
|
Ok(self.supported_msrs.clone())
|
||||||
|
}
|
||||||
|
|
||||||
|
// It's very sensible to manipulate MSRs, so please be careful to change code below.
|
||||||
|
fn build_msrs_list(kvm: &Kvm) -> Result<Msrs> {
|
||||||
|
let mut mset: HashSet<u32> = HashSet::new();
|
||||||
|
let supported_msr_list = kvm.get_msr_index_list().map_err(super::Error::Kvm)?;
|
||||||
|
for msr in supported_msr_list.as_slice() {
|
||||||
|
mset.insert(*msr);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut msrs = vec![
|
||||||
|
MSR_IA32_APICBASE,
|
||||||
|
MSR_IA32_SYSENTER_CS,
|
||||||
|
MSR_IA32_SYSENTER_ESP,
|
||||||
|
MSR_IA32_SYSENTER_EIP,
|
||||||
|
MSR_IA32_CR_PAT,
|
||||||
|
];
|
||||||
|
|
||||||
|
let filters_list = vec![
|
||||||
|
MSR_STAR,
|
||||||
|
MSR_VM_HSAVE_PA,
|
||||||
|
MSR_TSC_AUX,
|
||||||
|
MSR_IA32_TSC_ADJUST,
|
||||||
|
MSR_IA32_TSCDEADLINE,
|
||||||
|
MSR_IA32_MISC_ENABLE,
|
||||||
|
MSR_IA32_BNDCFGS,
|
||||||
|
MSR_IA32_SPEC_CTRL,
|
||||||
|
];
|
||||||
|
for msr in filters_list {
|
||||||
|
if mset.contains(&msr) {
|
||||||
|
msrs.push(msr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: several msrs are optional.
|
||||||
|
|
||||||
|
// TODO: Since our guests don't support nested-vmx, LMCE nor SGX for now.
|
||||||
|
// msrs.push(MSR_IA32_FEATURE_CONTROL);
|
||||||
|
|
||||||
|
msrs.push(MSR_CSTAR);
|
||||||
|
msrs.push(MSR_KERNEL_GS_BASE);
|
||||||
|
msrs.push(MSR_SYSCALL_MASK);
|
||||||
|
msrs.push(MSR_LSTAR);
|
||||||
|
msrs.push(MSR_IA32_TSC);
|
||||||
|
|
||||||
|
msrs.push(MSR_KVM_SYSTEM_TIME_NEW);
|
||||||
|
msrs.push(MSR_KVM_WALL_CLOCK_NEW);
|
||||||
|
|
||||||
|
// FIXME: check if it's supported.
|
||||||
|
msrs.push(MSR_KVM_ASYNC_PF_EN);
|
||||||
|
msrs.push(MSR_KVM_PV_EOI_EN);
|
||||||
|
msrs.push(MSR_KVM_STEAL_TIME);
|
||||||
|
|
||||||
|
msrs.push(MSR_CORE_PERF_FIXED_CTR_CTRL);
|
||||||
|
msrs.push(MSR_CORE_PERF_GLOBAL_CTRL);
|
||||||
|
msrs.push(MSR_CORE_PERF_GLOBAL_STATUS);
|
||||||
|
msrs.push(MSR_CORE_PERF_GLOBAL_OVF_CTRL);
|
||||||
|
|
||||||
|
const MAX_FIXED_COUNTERS: u32 = 3;
|
||||||
|
for i in 0..MAX_FIXED_COUNTERS {
|
||||||
|
msrs.push(MSR_CORE_PERF_FIXED_CTR0 + i);
|
||||||
|
}
|
||||||
|
|
||||||
|
// FIXME: skip MCE for now.
|
||||||
|
|
||||||
|
let mtrr_msrs = vec![
|
||||||
|
MSR_MTRRdefType,
|
||||||
|
MSR_MTRRfix64K_00000,
|
||||||
|
MSR_MTRRfix16K_80000,
|
||||||
|
MSR_MTRRfix16K_A0000,
|
||||||
|
MSR_MTRRfix4K_C0000,
|
||||||
|
MSR_MTRRfix4K_C8000,
|
||||||
|
MSR_MTRRfix4K_D0000,
|
||||||
|
MSR_MTRRfix4K_D8000,
|
||||||
|
MSR_MTRRfix4K_E0000,
|
||||||
|
MSR_MTRRfix4K_E8000,
|
||||||
|
MSR_MTRRfix4K_F0000,
|
||||||
|
MSR_MTRRfix4K_F8000,
|
||||||
|
];
|
||||||
|
for mtrr in mtrr_msrs {
|
||||||
|
msrs.push(mtrr);
|
||||||
|
}
|
||||||
|
|
||||||
|
const MSR_MTRRCAP_VCNT: u32 = 8;
|
||||||
|
for i in 0..MSR_MTRRCAP_VCNT {
|
||||||
|
msrs.push(0x200 + 2 * i);
|
||||||
|
msrs.push(0x200 + 2 * i + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
let msrs: Vec<kvm_msr_entry> = msrs
|
||||||
|
.iter()
|
||||||
|
.map(|reg| kvm_msr_entry {
|
||||||
|
index: *reg,
|
||||||
|
reserved: 0,
|
||||||
|
data: 0,
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
Msrs::from_entries(&msrs).map_err(super::Error::Msr)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use kvm_ioctls::Kvm;
    use std::fs::File;
    use std::os::unix::fs::MetadataExt;
    use std::os::unix::io::{AsRawFd, FromRawFd};

    /// A freshly created context reports a sane memslot count, and the fd opened by
    /// `Kvm::new()` refers to the same device node as `/dev/kvm`.
    #[test]
    fn test_create_kvm_context() {
        let c = KvmContext::new(None).unwrap();

        assert!(c.max_memslots >= 32);

        let kvm = Kvm::new().unwrap();
        // SAFETY: the descriptor is open for as long as `kvm` lives. The temporary `File`
        // is only an alias used to query metadata, so it is wrapped in `ManuallyDrop` to
        // keep it from closing a descriptor that `kvm` still owns (which would otherwise
        // result in the same fd being closed twice).
        let f = std::mem::ManuallyDrop::new(unsafe { File::from_raw_fd(kvm.as_raw_fd()) });
        let m1 = f.metadata().unwrap();
        let m2 = File::open("/dev/kvm").unwrap().metadata().unwrap();

        assert_eq!(m1.dev(), m2.dev());
        assert_eq!(m1.ino(), m2.ino());
    }

    /// Querying CPUID with the maximum entry count succeeds, with zero entries it fails.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn test_get_supported_cpu_id() {
        let c = KvmContext::new(None).unwrap();

        let _ = c
            .supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .expect("failed to get supported CPUID");
        assert!(c.supported_cpuid(0).is_err());
    }

    /// A VM can be created from a default context.
    #[test]
    fn test_create_vm() {
        let c = KvmContext::new(None).unwrap();

        let _ = c.create_vm().unwrap();
    }
}
|
@ -1,4 +1,5 @@
|
|||||||
// Copyright (C) 2018-2022 Alibaba Cloud. All rights reserved.
|
// Copyright (C) 2018-2022 Alibaba Cloud. All rights reserved.
|
||||||
|
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||||
// SPDX-License-Identifier: Apache-2.0
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
//! Dragonball is a light-weight virtual machine manager(VMM) based on Linux Kernel-based Virtual
|
//! Dragonball is a light-weight virtual machine manager(VMM) based on Linux Kernel-based Virtual
|
||||||
@ -10,13 +11,45 @@
|
|||||||
|
|
||||||
/// Address space manager for virtual machines.
|
/// Address space manager for virtual machines.
|
||||||
pub mod address_space_manager;
|
pub mod address_space_manager;
|
||||||
|
/// API to handle vmm requests.
|
||||||
|
pub mod api;
|
||||||
/// Structs to maintain configuration information.
|
/// Structs to maintain configuration information.
|
||||||
pub mod config_manager;
|
pub mod config_manager;
|
||||||
/// Device manager for virtual machines.
|
/// Device manager for virtual machines.
|
||||||
pub mod device_manager;
|
pub mod device_manager;
|
||||||
/// Errors related to Virtual machine manager.
|
/// Errors related to Virtual machine manager.
|
||||||
pub mod error;
|
pub mod error;
|
||||||
|
/// KVM operation context for virtual machines.
|
||||||
|
pub mod kvm_context;
|
||||||
|
/// Metrics system.
|
||||||
|
pub mod metric;
|
||||||
/// Resource manager for virtual machines.
|
/// Resource manager for virtual machines.
|
||||||
pub mod resource_manager;
|
pub mod resource_manager;
|
||||||
|
/// Signal handler for virtual machines.
|
||||||
|
pub mod signal_handler;
|
||||||
|
/// Virtual CPU manager for virtual machines.
|
||||||
|
pub mod vcpu;
|
||||||
/// Virtual machine manager for virtual machines.
|
/// Virtual machine manager for virtual machines.
|
||||||
pub mod vm;
|
pub mod vm;
|
||||||
|
|
||||||
|
mod io_manager;
|
||||||
|
pub use self::io_manager::IoManagerCached;
|
||||||
|
|
||||||
|
/// Success exit code.
|
||||||
|
pub const EXIT_CODE_OK: u8 = 0;
|
||||||
|
/// Generic error exit code.
|
||||||
|
pub const EXIT_CODE_GENERIC_ERROR: u8 = 1;
|
||||||
|
/// Generic exit code for an error considered not possible to occur if the program logic is sound.
|
||||||
|
pub const EXIT_CODE_UNEXPECTED_ERROR: u8 = 2;
|
||||||
|
/// Dragonball was shut down after intercepting a restricted system call.
|
||||||
|
pub const EXIT_CODE_BAD_SYSCALL: u8 = 148;
|
||||||
|
/// Dragonball was shut down after intercepting `SIGBUS`.
|
||||||
|
pub const EXIT_CODE_SIGBUS: u8 = 149;
|
||||||
|
/// Dragonball was shut down after intercepting `SIGSEGV`.
|
||||||
|
pub const EXIT_CODE_SIGSEGV: u8 = 150;
|
||||||
|
/// Invalid json passed to the Dragonball process for configuring microvm.
|
||||||
|
pub const EXIT_CODE_INVALID_JSON: u8 = 151;
|
||||||
|
/// Bad configuration for microvm's resources, when using a single json.
|
||||||
|
pub const EXIT_CODE_BAD_CONFIGURATION: u8 = 152;
|
||||||
|
/// Command line arguments parsing error.
|
||||||
|
pub const EXIT_CODE_ARG_PARSING: u8 = 153;
|
||||||
|
58
src/dragonball/src/metric.rs
Normal file
58
src/dragonball/src/metric.rs
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
|
||||||
|
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
use dbs_utils::metric::SharedIncMetric;
|
||||||
|
use lazy_static::lazy_static;
|
||||||
|
use serde::Serialize;
|
||||||
|
|
||||||
|
pub use dbs_utils::metric::IncMetric;
|
||||||
|
|
||||||
|
lazy_static! {
    /// Global metrics instance shared by the whole crate; starts from
    /// `DragonballMetrics::default()`.
    pub static ref METRICS: DragonballMetrics = DragonballMetrics::default();
}
|
||||||
|
|
||||||
|
/// Metrics specific to VCPUs' mode of functioning.
|
||||||
|
#[derive(Default, Serialize)]
|
||||||
|
pub struct VcpuMetrics {
|
||||||
|
/// Number of KVM exits for handling input IO.
|
||||||
|
pub exit_io_in: SharedIncMetric,
|
||||||
|
/// Number of KVM exits for handling output IO.
|
||||||
|
pub exit_io_out: SharedIncMetric,
|
||||||
|
/// Number of KVM exits for handling MMIO reads.
|
||||||
|
pub exit_mmio_read: SharedIncMetric,
|
||||||
|
/// Number of KVM exits for handling MMIO writes.
|
||||||
|
pub exit_mmio_write: SharedIncMetric,
|
||||||
|
/// Number of errors during this VCPU's run.
|
||||||
|
pub failures: SharedIncMetric,
|
||||||
|
/// Failures in configuring the CPUID.
|
||||||
|
pub filter_cpuid: SharedIncMetric,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Metrics for the seccomp filtering.
|
||||||
|
#[derive(Default, Serialize)]
|
||||||
|
pub struct SeccompMetrics {
|
||||||
|
/// Number of errors inside the seccomp filtering.
|
||||||
|
pub num_faults: SharedIncMetric,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Metrics related to signals.
|
||||||
|
#[derive(Default, Serialize)]
|
||||||
|
pub struct SignalMetrics {
|
||||||
|
/// Number of times that SIGBUS was handled.
|
||||||
|
pub sigbus: SharedIncMetric,
|
||||||
|
/// Number of times that SIGSEGV was handled.
|
||||||
|
pub sigsegv: SharedIncMetric,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Structure storing all metrics while enforcing serialization support on them.
|
||||||
|
#[derive(Default, Serialize)]
|
||||||
|
pub struct DragonballMetrics {
|
||||||
|
/// Metrics related to a vcpu's functioning.
|
||||||
|
pub vcpu: VcpuMetrics,
|
||||||
|
/// Metrics related to seccomp filtering.
|
||||||
|
pub seccomp: SeccompMetrics,
|
||||||
|
/// Metrics related to signals.
|
||||||
|
pub signals: SignalMetrics,
|
||||||
|
}
|
219
src/dragonball/src/signal_handler.rs
Normal file
219
src/dragonball/src/signal_handler.rs
Normal file
@ -0,0 +1,219 @@
|
|||||||
|
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
use libc::{_exit, c_int, c_void, siginfo_t, SIGBUS, SIGSEGV, SIGSYS};
|
||||||
|
use log::error;
|
||||||
|
use vmm_sys_util::signal::register_signal_handler;
|
||||||
|
|
||||||
|
use crate::metric::{IncMetric, METRICS};
|
||||||
|
|
||||||
|
// The offset of `si_syscall` (offending syscall identifier) within the siginfo structure
|
||||||
|
// expressed as an `(u)int*`.
|
||||||
|
// Offset `6` for an `i32` field means that the needed information is located at `6 * sizeof(i32)`.
|
||||||
|
// See /usr/include/linux/signal.h for the C struct definition.
|
||||||
|
// See https://github.com/rust-lang/libc/issues/716 for why the offset is different in Rust.
|
||||||
|
const SI_OFF_SYSCALL: isize = 6;
|
||||||
|
|
||||||
|
const SYS_SECCOMP_CODE: i32 = 1;
|
||||||
|
|
||||||
|
// C library entry points reporting the real-time signal range available to the process.
extern "C" {
    // Lower bound of the range; wrapped by `sigrtmin()`.
    fn __libc_current_sigrtmin() -> c_int;
    // Upper bound of the range; wrapped by `sigrtmax()`.
    fn __libc_current_sigrtmax() -> c_int;
}
|
||||||
|
|
||||||
|
/// Gets current sigrtmin
|
||||||
|
pub fn sigrtmin() -> c_int {
|
||||||
|
unsafe { __libc_current_sigrtmin() }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets current sigrtmax
|
||||||
|
pub fn sigrtmax() -> c_int {
|
||||||
|
unsafe { __libc_current_sigrtmax() }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Signal handler for `SIGSYS`.
|
||||||
|
///
|
||||||
|
/// Increments the `seccomp.num_faults` metric, logs an error message and terminates the process
|
||||||
|
/// with a specific exit code.
|
||||||
|
extern "C" fn sigsys_handler(num: c_int, info: *mut siginfo_t, _unused: *mut c_void) {
|
||||||
|
// Safe because we're just reading some fields from a supposedly valid argument.
|
||||||
|
let si_signo = unsafe { (*info).si_signo };
|
||||||
|
let si_code = unsafe { (*info).si_code };
|
||||||
|
|
||||||
|
// Sanity check. The condition should never be true.
|
||||||
|
if num != si_signo || num != SIGSYS || si_code != SYS_SECCOMP_CODE as i32 {
|
||||||
|
// Safe because we're terminating the process anyway.
|
||||||
|
unsafe { _exit(i32::from(super::EXIT_CODE_UNEXPECTED_ERROR)) };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Other signals which might do async unsafe things incompatible with the rest of this
|
||||||
|
// function are blocked due to the sa_mask used when registering the signal handler.
|
||||||
|
let syscall = unsafe { *(info as *const i32).offset(SI_OFF_SYSCALL) as usize };
|
||||||
|
// SIGSYS is triggered when bad syscalls are detected. num_faults is only added when SIGSYS is detected
|
||||||
|
// so it actually only collects the count for bad syscalls.
|
||||||
|
METRICS.seccomp.num_faults.inc();
|
||||||
|
error!(
|
||||||
|
"Shutting down VM after intercepting a bad syscall ({}).",
|
||||||
|
syscall
|
||||||
|
);
|
||||||
|
|
||||||
|
// Safe because we're terminating the process anyway. We don't actually do anything when
|
||||||
|
// running unit tests.
|
||||||
|
#[cfg(not(test))]
|
||||||
|
unsafe {
|
||||||
|
_exit(i32::from(super::EXIT_CODE_BAD_SYSCALL))
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Signal handler for `SIGBUS` and `SIGSEGV`.
|
||||||
|
///
|
||||||
|
/// Logs an error message and terminates the process with a specific exit code.
|
||||||
|
extern "C" fn sigbus_sigsegv_handler(num: c_int, info: *mut siginfo_t, _unused: *mut c_void) {
|
||||||
|
// Safe because we're just reading some fields from a supposedly valid argument.
|
||||||
|
let si_signo = unsafe { (*info).si_signo };
|
||||||
|
let si_code = unsafe { (*info).si_code };
|
||||||
|
|
||||||
|
// Sanity check. The condition should never be true.
|
||||||
|
if num != si_signo || (num != SIGBUS && num != SIGSEGV) {
|
||||||
|
// Safe because we're terminating the process anyway.
|
||||||
|
unsafe { _exit(i32::from(super::EXIT_CODE_UNEXPECTED_ERROR)) };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Other signals which might do async unsafe things incompatible with the rest of this
|
||||||
|
// function are blocked due to the sa_mask used when registering the signal handler.
|
||||||
|
match si_signo {
|
||||||
|
SIGBUS => METRICS.signals.sigbus.inc(),
|
||||||
|
SIGSEGV => METRICS.signals.sigsegv.inc(),
|
||||||
|
_ => (),
|
||||||
|
}
|
||||||
|
|
||||||
|
error!(
|
||||||
|
"Shutting down VM after intercepting signal {}, code {}.",
|
||||||
|
si_signo, si_code
|
||||||
|
);
|
||||||
|
|
||||||
|
// Safe because we're terminating the process anyway. We don't actually do anything when
|
||||||
|
// running unit tests.
|
||||||
|
#[cfg(not(test))]
|
||||||
|
unsafe {
|
||||||
|
_exit(i32::from(match si_signo {
|
||||||
|
SIGBUS => super::EXIT_CODE_SIGBUS,
|
||||||
|
SIGSEGV => super::EXIT_CODE_SIGSEGV,
|
||||||
|
_ => super::EXIT_CODE_UNEXPECTED_ERROR,
|
||||||
|
}))
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Registers all the required signal handlers.
|
||||||
|
///
|
||||||
|
/// Custom handlers are installed for: `SIGBUS`, `SIGSEGV`, `SIGSYS`.
|
||||||
|
pub fn register_signal_handlers() -> vmm_sys_util::errno::Result<()> {
|
||||||
|
// Call to unsafe register_signal_handler which is considered unsafe because it will
|
||||||
|
// register a signal handler which will be called in the current thread and will interrupt
|
||||||
|
// whatever work is done on the current thread, so we have to keep in mind that the registered
|
||||||
|
// signal handler must only do async-signal-safe operations.
|
||||||
|
register_signal_handler(SIGSYS, sigsys_handler)?;
|
||||||
|
register_signal_handler(SIGBUS, sigbus_sigsegv_handler)?;
|
||||||
|
register_signal_handler(SIGSEGV, sigbus_sigsegv_handler)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    use libc::{cpu_set_t, syscall};
    use std::convert::TryInto;
    use std::{mem, process, thread};

    use seccompiler::{apply_filter, BpfProgram, SeccompAction, SeccompFilter};

    // Counts the CPUs in the affinity mask of the calling thread. Test-only helper.
    fn cpu_count() -> usize {
        // SAFETY: `cpu_set_t` is zero-initialised and then cleared with `CPU_ZERO`.
        let mut cpuset: cpu_set_t = unsafe { mem::zeroed() };
        unsafe {
            libc::CPU_ZERO(&mut cpuset);
        }
        // SAFETY: the pointer and the size both describe the local `cpuset`.
        let ret = unsafe {
            libc::sched_getaffinity(
                0,
                mem::size_of::<cpu_set_t>(),
                &mut cpuset as *mut cpu_set_t,
            )
        };
        assert_eq!(ret, 0);

        (0..libc::CPU_SETSIZE as usize)
            .filter(|&cpu| unsafe { libc::CPU_ISSET(cpu, &cpuset) })
            .count()
    }

    #[test]
    fn test_signal_handler() {
        let child = thread::spawn(move || {
            assert!(register_signal_handlers().is_ok());

            // Syscalls that are allowed; anything else triggers `SeccompAction::Trap`.
            let allowed_syscalls = [
                libc::SYS_brk,
                libc::SYS_exit,
                libc::SYS_futex,
                libc::SYS_getpid,
                libc::SYS_munmap,
                libc::SYS_kill,
                libc::SYS_rt_sigprocmask,
                libc::SYS_rt_sigreturn,
                libc::SYS_sched_getaffinity,
                libc::SYS_set_tid_address,
                libc::SYS_sigaltstack,
                libc::SYS_write,
            ];
            let filter = SeccompFilter::new(
                allowed_syscalls
                    .iter()
                    .map(|&nr| (nr, vec![]))
                    .collect(),
                SeccompAction::Trap,
                SeccompAction::Allow,
                std::env::consts::ARCH.try_into().unwrap(),
            )
            .unwrap();

            let program: BpfProgram = filter.try_into().unwrap();
            assert!(apply_filter(&program).is_ok());
            assert_eq!(METRICS.seccomp.num_faults.count(), 0);

            // `SYS_mkdirat` is not on the allow-list, so this call is trapped.
            unsafe { syscall(libc::SYS_mkdirat, "/foo/bar\0") };

            // Trigger the SIGBUS handler.
            assert_eq!(METRICS.signals.sigbus.count(), 0);
            unsafe {
                syscall(libc::SYS_kill, process::id(), SIGBUS);
            }

            // Trigger the SIGSEGV handler.
            assert_eq!(METRICS.signals.sigsegv.count(), 0);
            unsafe {
                syscall(libc::SYS_kill, process::id(), SIGSEGV);
            }
        });
        assert!(child.join().is_ok());

        // Sanity check.
        assert!(cpu_count() > 0);
        // Kcov somehow messes with our handler getting the SIGSYS signal when a bad syscall
        // is caught, so the seccomp assertion does not always hold. There is no reliable
        // way to detect kcov; it seems to expose a single CPU to the test process, so the
        // perceived CPU count is used as a heuristic for whether to check the assertion.
        if cpu_count() > 1 {
            // The signal handler should let the program continue during unit tests.
            assert!(METRICS.seccomp.num_faults.count() >= 1);
        }
        assert!(METRICS.signals.sigbus.count() >= 1);
        assert!(METRICS.signals.sigsegv.count() >= 1);
    }
}
|
94
src/dragonball/src/vcpu/aarch64.rs
Normal file
94
src/dragonball/src/vcpu/aarch64.rs
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
|
||||||
|
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
//
|
||||||
|
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style license that can be
|
||||||
|
// found in the THIRD-PARTY file.
|
||||||
|
|
||||||
|
use std::sync::mpsc::{channel, Sender};
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use crate::IoManagerCached;
|
||||||
|
use dbs_utils::time::TimestampUs;
|
||||||
|
use kvm_ioctls::{VcpuFd, VmFd};
|
||||||
|
use vm_memory::GuestAddress;
|
||||||
|
use vmm_sys_util::eventfd::EventFd;
|
||||||
|
|
||||||
|
use crate::address_space_manager::GuestAddressSpaceImpl;
|
||||||
|
use crate::vcpu::vcpu_impl::{Result, Vcpu, VcpuStateEvent};
|
||||||
|
use crate::vcpu::VcpuConfig;
|
||||||
|
|
||||||
|
#[allow(unused)]
|
||||||
|
impl Vcpu {
|
||||||
|
/// Constructs a new VCPU for `vm`.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// * `id` - Represents the CPU number between [0, max vcpus).
|
||||||
|
/// * `vcpu_fd` - The kvm `VcpuFd` for the vcpu.
|
||||||
|
/// * `io_mgr` - The io-manager used to access port-io and mmio devices.
|
||||||
|
/// * `exit_evt` - An `EventFd` that will be written into when this vcpu
|
||||||
|
/// exits.
|
||||||
|
/// * `vcpu_state_event` - The eventfd which can notify vmm state of some
|
||||||
|
/// vcpu should change.
|
||||||
|
/// * `vcpu_state_sender` - The channel to send state change message from
|
||||||
|
/// vcpu thread to vmm thread.
|
||||||
|
/// * `create_ts` - A timestamp used by the vcpu to calculate its lifetime.
|
||||||
|
/// * `support_immediate_exit` - whether kvm uses supports immediate_exit flag.
|
||||||
|
pub fn new_aarch64(
|
||||||
|
id: u8,
|
||||||
|
vcpu_fd: Arc<VcpuFd>,
|
||||||
|
io_mgr: IoManagerCached,
|
||||||
|
exit_evt: EventFd,
|
||||||
|
vcpu_state_event: EventFd,
|
||||||
|
vcpu_state_sender: Sender<VcpuStateEvent>,
|
||||||
|
create_ts: TimestampUs,
|
||||||
|
support_immediate_exit: bool,
|
||||||
|
) -> Result<Self> {
|
||||||
|
let (event_sender, event_receiver) = channel();
|
||||||
|
let (response_sender, response_receiver) = channel();
|
||||||
|
|
||||||
|
Ok(Vcpu {
|
||||||
|
fd: vcpu_fd,
|
||||||
|
id,
|
||||||
|
io_mgr,
|
||||||
|
create_ts,
|
||||||
|
event_receiver,
|
||||||
|
event_sender: Some(event_sender),
|
||||||
|
response_receiver: Some(response_receiver),
|
||||||
|
response_sender,
|
||||||
|
vcpu_state_event,
|
||||||
|
vcpu_state_sender,
|
||||||
|
support_immediate_exit,
|
||||||
|
mpidr: 0,
|
||||||
|
exit_evt,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Configures an aarch64 specific vcpu.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// * `vcpu_config` - vCPU config for this vCPU status
|
||||||
|
/// * `vm_fd` - The kvm `VmFd` for this microvm.
|
||||||
|
/// * `vm_as` - The guest memory address space used by this microvm.
|
||||||
|
/// * `kernel_load_addr` - Offset from `guest_mem` at which the kernel is loaded.
|
||||||
|
/// * `_pgtable_addr` - pgtable address for ap vcpu (not used in aarch64)
|
||||||
|
pub fn configure(
|
||||||
|
&mut self,
|
||||||
|
_vcpu_config: &VcpuConfig,
|
||||||
|
vm_fd: &VmFd,
|
||||||
|
vm_as: &GuestAddressSpaceImpl,
|
||||||
|
kernel_load_addr: Option<GuestAddress>,
|
||||||
|
_pgtable_addr: Option<GuestAddress>,
|
||||||
|
) -> Result<()> {
|
||||||
|
// TODO: add arm vcpu configure() function. issue: #4445
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets the MPIDR register value.
|
||||||
|
pub fn get_mpidr(&self) -> u64 {
|
||||||
|
self.mpidr
|
||||||
|
}
|
||||||
|
}
|
32
src/dragonball/src/vcpu/mod.rs
Normal file
32
src/dragonball/src/vcpu/mod.rs
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
// Copyright (C) 2022 Alibaba Cloud Computing. All rights reserved.
|
||||||
|
// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
mod sm;
|
||||||
|
pub mod vcpu_impl;
|
||||||
|
pub mod vcpu_manager;
|
||||||
|
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
use dbs_arch::cpuid::VpmuFeatureLevel;
|
||||||
|
|
||||||
|
/// vcpu config collection
|
||||||
|
pub struct VcpuConfig {
|
||||||
|
/// initial vcpu count
|
||||||
|
pub boot_vcpu_count: u8,
|
||||||
|
/// max vcpu count for hotplug
|
||||||
|
pub max_vcpu_count: u8,
|
||||||
|
/// threads per core for cpu topology information
|
||||||
|
pub threads_per_core: u8,
|
||||||
|
/// cores per die for cpu topology information
|
||||||
|
pub cores_per_die: u8,
|
||||||
|
/// dies per socket for cpu topology information
|
||||||
|
pub dies_per_socket: u8,
|
||||||
|
/// socket number for cpu topology information
|
||||||
|
pub sockets: u8,
|
||||||
|
/// if vpmu feature is Disabled, it means vpmu feature is off (by default)
|
||||||
|
/// if vpmu feature is LimitedlyEnabled, it means minimal vpmu counters are supported (cycles and instructions)
|
||||||
|
/// if vpmu feature is FullyEnabled, it means all vpmu counters are supported
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
pub vpmu_feature: VpmuFeatureLevel,
|
||||||
|
}
|
149
src/dragonball/src/vcpu/sm.rs
Normal file
149
src/dragonball/src/vcpu/sm.rs
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
use std::ops::Deref;
|
||||||
|
|
||||||
|
/// Minimal state machine abstraction.
///
/// A `StateMachine<T>` value pairs a state handler for `T` with a flag telling whether the
/// machine has reached its end.
///
/// Every state of `T` is a `StateFn<T>`: a function that handles that state and returns
/// the `StateMachine<T>` describing what comes next, so each state has explicit
/// transitions.
pub struct StateMachine<T> {
    function: StateFn<T>,
    end_state: bool,
}

/// A state handler of a `StateMachine<T>`: takes `&mut T`, handles one state of `T` and
/// returns the following state.
type StateFn<T> = fn(&mut T) -> StateMachine<T>;

impl<T> StateMachine<T> {
    /// Wraps a handler together with its end-state flag.
    ///
    /// # Arguments
    ///
    /// `function` - the state handler for this state.
    /// `end_state` - whether this state is final.
    pub fn new(function: StateFn<T>, end_state: bool) -> StateMachine<T> {
        Self {
            function,
            end_state,
        }
    }

    /// Wraps a handler for a state that still has transitions ahead of it.
    ///
    /// # Arguments
    ///
    /// `function` - the state handler for this state.
    pub fn next(function: StateFn<T>) -> StateMachine<T> {
        Self::new(function, false)
    }

    /// Wraps a handler in a final state. `run` stops as soon as it receives a final
    /// state, so the handler stored here is not invoked by `run`.
    ///
    /// # Arguments
    ///
    /// `function` - the state handler stored in this last state.
    pub fn finish(function: StateFn<T>) -> StateMachine<T> {
        Self::new(function, true)
    }

    /// Drives `machine` through its states, beginning with `starting_state_fn`.
    ///
    /// # Arguments
    ///
    /// `machine` - a mutable reference to the object running through the various states.
    /// `starting_state_fn` - a `fn(&mut T) -> StateMachine<T>` that should be the handler for
    ///                       the initial state.
    pub fn run(machine: &mut T, starting_state_fn: StateFn<T>) {
        // The starting state is never final, so its handler always runs at least once.
        let mut current = Self::next(starting_state_fn);
        loop {
            if current.end_state {
                break;
            }
            // Execute the handler of the current state; its result is the next state.
            current = (current.function)(machine);
        }
    }
}

// Dereferencing a `StateMachine<T>` yields its state handler, so it can be called directly.
impl<T> Deref for StateMachine<T> {
    type Target = StateFn<T>;

    fn deref(&self) -> &Self::Target {
        &self.function
    }
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // Test machine with three states, `s1`, `s2` and `s3`; each flag records that the
    // matching handler has run.
    struct DummyMachine {
        private_data_s1: bool,
        private_data_s2: bool,
        private_data_s3: bool,
    }

    impl DummyMachine {
        fn new() -> Self {
            Self {
                private_data_s1: false,
                private_data_s2: false,
                private_data_s3: false,
            }
        }

        // Drives the machine through start->s1->s2->s3->done.
        fn run(&mut self) {
            // Nothing may have run before the machine is started.
            assert!(!self.private_data_s1);
            assert!(!self.private_data_s2);
            assert!(!self.private_data_s3);

            StateMachine::run(self, Self::s1);

            // Every state must have been visited.
            assert!(self.private_data_s1);
            assert!(self.private_data_s2);
            assert!(self.private_data_s3);
        }

        fn s1(&mut self) -> StateMachine<Self> {
            // The handler must run exactly once: the flag is still clear on entry.
            assert!(!self.private_data_s1);
            self.private_data_s1 = true;
            StateMachine::next(Self::s2)
        }

        fn s2(&mut self) -> StateMachine<Self> {
            // The handler must run exactly once: the flag is still clear on entry.
            assert!(!self.private_data_s2);
            self.private_data_s2 = true;
            StateMachine::next(Self::s3)
        }

        fn s3(&mut self) -> StateMachine<Self> {
            // The handler must run exactly once: the flag is still clear on entry.
            assert!(!self.private_data_s3);
            self.private_data_s3 = true;
            // The machine ends here. `s1` is passed as the final handler on purpose: if it
            // were run again, its assertion would fail.
            StateMachine::finish(Self::s1)
        }
    }

    #[test]
    fn test_sm() {
        let mut machine = DummyMachine::new();
        machine.run();
    }
}
|
955
src/dragonball/src/vcpu/vcpu_impl.rs
Normal file
955
src/dragonball/src/vcpu/vcpu_impl.rs
Normal file
@ -0,0 +1,955 @@
|
|||||||
|
// Copyright (C) 2019-2022 Alibaba Cloud. All rights reserved.
|
||||||
|
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
//
|
||||||
|
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style license that can be
|
||||||
|
// found in the THIRD-PARTY file.
|
||||||
|
|
||||||
|
//! The implementation for per vcpu
|
||||||
|
|
||||||
|
use std::cell::Cell;
|
||||||
|
use std::result;
|
||||||
|
use std::sync::atomic::{fence, Ordering};
|
||||||
|
use std::sync::mpsc::{Receiver, Sender, TryRecvError};
|
||||||
|
use std::sync::{Arc, Barrier};
|
||||||
|
use std::thread;
|
||||||
|
|
||||||
|
use dbs_utils::time::TimestampUs;
|
||||||
|
use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
|
||||||
|
use kvm_ioctls::{VcpuExit, VcpuFd};
|
||||||
|
use libc::{c_int, c_void, siginfo_t};
|
||||||
|
use log::{error, info};
|
||||||
|
use seccompiler::{apply_filter, BpfProgram, Error as SecError};
|
||||||
|
use vmm_sys_util::eventfd::EventFd;
|
||||||
|
use vmm_sys_util::signal::{register_signal_handler, Killable};
|
||||||
|
|
||||||
|
use super::sm::StateMachine;
|
||||||
|
use crate::metric::{IncMetric, METRICS};
|
||||||
|
use crate::signal_handler::sigrtmin;
|
||||||
|
use crate::IoManagerCached;
|
||||||
|
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
#[path = "x86_64.rs"]
|
||||||
|
mod x86_64;
|
||||||
|
|
||||||
|
#[cfg(target_arch = "aarch64")]
|
||||||
|
#[path = "aarch64.rs"]
|
||||||
|
mod aarch64;
|
||||||
|
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
const MAGIC_IOPORT_BASE: u16 = 0xdbdb;
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
const MAGIC_IOPORT_DEBUG_INFO: u16 = MAGIC_IOPORT_BASE;
|
||||||
|
|
||||||
|
/// Signal number (SIGRTMIN) used to kick Vcpus.
|
||||||
|
pub const VCPU_RTSIG_OFFSET: i32 = 0;
|
||||||
|
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
/// Errors associated with the wrappers over KVM ioctls.
|
||||||
|
#[derive(Debug, thiserror::Error)]
|
||||||
|
pub enum VcpuError {
|
||||||
|
/// Failed to signal Vcpu.
|
||||||
|
#[error("cannot signal the vCPU thread")]
|
||||||
|
SignalVcpu(#[source] vmm_sys_util::errno::Error),
|
||||||
|
|
||||||
|
/// Cannot open the vCPU file descriptor.
|
||||||
|
#[error("cannot open the vCPU file descriptor")]
|
||||||
|
VcpuFd(#[source] kvm_ioctls::Error),
|
||||||
|
|
||||||
|
/// Cannot spawn a new vCPU thread.
|
||||||
|
#[error("cannot spawn vCPU thread")]
|
||||||
|
VcpuSpawn(#[source] std::io::Error),
|
||||||
|
|
||||||
|
/// Cannot cleanly initialize vCPU TLS.
|
||||||
|
#[error("cannot cleanly initialize TLS fro vCPU")]
|
||||||
|
VcpuTlsInit,
|
||||||
|
|
||||||
|
/// Vcpu not present in TLS.
|
||||||
|
#[error("vCPU not present in the TLS")]
|
||||||
|
VcpuTlsNotPresent,
|
||||||
|
|
||||||
|
/// Unexpected KVM_RUN exit reason
|
||||||
|
#[error("Unexpected KVM_RUN exit reason")]
|
||||||
|
VcpuUnhandledKvmExit,
|
||||||
|
|
||||||
|
/// Pause vcpu failed
|
||||||
|
#[error("failed to pause vcpus")]
|
||||||
|
PauseFailed,
|
||||||
|
|
||||||
|
/// Kvm Ioctl Error
|
||||||
|
#[error("failure in issuing KVM ioctl command")]
|
||||||
|
Kvm(#[source] kvm_ioctls::Error),
|
||||||
|
|
||||||
|
/// Msr error
|
||||||
|
#[error("failure to deal with MSRs")]
|
||||||
|
Msr(vmm_sys_util::fam::Error),
|
||||||
|
|
||||||
|
/// A call to cpuid instruction failed on x86_64.
|
||||||
|
#[error("failure while configuring CPUID for virtual CPU on x86_64")]
|
||||||
|
CpuId(dbs_arch::cpuid::Error),
|
||||||
|
|
||||||
|
/// Error configuring the floating point related registers on x86_64.
|
||||||
|
#[error("failure while configuring the floating point related registers on x86_64")]
|
||||||
|
FPUConfiguration(dbs_arch::regs::Error),
|
||||||
|
|
||||||
|
/// Cannot set the local interruption due to bad configuration on x86_64.
|
||||||
|
#[error("cannot set the local interruption due to bad configuration on x86_64")]
|
||||||
|
LocalIntConfiguration(dbs_arch::interrupts::Error),
|
||||||
|
|
||||||
|
/// Error configuring the MSR registers on x86_64.
|
||||||
|
#[error("failure while configuring the MSR registers on x86_64")]
|
||||||
|
MSRSConfiguration(dbs_arch::regs::Error),
|
||||||
|
|
||||||
|
/// Error configuring the general purpose registers on x86_64.
|
||||||
|
#[error("failure while configuring the general purpose registers on x86_64")]
|
||||||
|
REGSConfiguration(dbs_arch::regs::Error),
|
||||||
|
|
||||||
|
/// Error configuring the special registers on x86_64.
|
||||||
|
#[error("failure while configuring the special registers on x86_64")]
|
||||||
|
SREGSConfiguration(dbs_arch::regs::Error),
|
||||||
|
|
||||||
|
/// Error configuring the page table on x86_64.
|
||||||
|
#[error("failure while configuring the page table on x86_64")]
|
||||||
|
PageTable(dbs_boot::Error),
|
||||||
|
|
||||||
|
/// The call to KVM_SET_CPUID2 failed on x86_64.
|
||||||
|
#[error("failure while calling KVM_SET_CPUID2 on x86_64")]
|
||||||
|
SetSupportedCpusFailed(#[source] kvm_ioctls::Error),
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(target_arch = "aarch64")]
/// Errors associated with the wrappers over KVM ioctls (aarch64 flavour).
#[derive(Debug, thiserror::Error)]
pub enum VcpuError {
    /// Failed to signal Vcpu.
    #[error("cannot signal the vCPU thread")]
    SignalVcpu(#[source] vmm_sys_util::errno::Error),

    /// Cannot open the vCPU file descriptor.
    #[error("cannot open the vCPU file descriptor")]
    VcpuFd(#[source] kvm_ioctls::Error),

    /// Cannot spawn a new vCPU thread.
    #[error("cannot spawn vCPU thread")]
    VcpuSpawn(#[source] std::io::Error),

    /// Cannot cleanly initialize vCPU TLS (a `Vcpu` is already registered on this thread).
    #[error("cannot cleanly initialize TLS for vCPU")]
    VcpuTlsInit,

    /// Vcpu not present in TLS.
    #[error("vCPU not present in the TLS")]
    VcpuTlsNotPresent,

    /// Unexpected KVM_RUN exit reason.
    #[error("Unexpected KVM_RUN exit reason")]
    VcpuUnhandledKvmExit,

    /// Pausing the vCPUs failed.
    #[error("failed to pause vcpus")]
    PauseFailed,

    /// A KVM ioctl failed.
    #[error("failure in issuing KVM ioctl command")]
    Kvm(#[source] kvm_ioctls::Error),

    /// Failure while handling MSRs.
    #[error("failure to deal with MSRs")]
    Msr(vmm_sys_util::fam::Error),

    // The whole enum is already gated on aarch64, so the variants below need no
    // per-variant `cfg` attribute.
    /// Error configuring the general purpose registers on aarch64.
    #[error("failure while configuring the general purpose registers on aarch64")]
    REGSConfiguration(dbs_arch::regs::Error),

    /// Error setting up the global interrupt controller on aarch64.
    #[error("failure while setting up the global interrupt controller on aarch64")]
    SetupGIC(dbs_arch::gic::Error),

    /// Error getting the Vcpu preferred target on aarch64.
    #[error("failure while getting the vCPU preferred target on aarch64")]
    VcpuArmPreferredTarget(#[source] kvm_ioctls::Error),

    /// Error doing vCPU Init on aarch64.
    #[error("failure while doing vCPU init on aarch64")]
    VcpuArmInit(#[source] kvm_ioctls::Error),
}
|
||||||
|
|
||||||
|
/// Result for Vcpu related operations.
|
||||||
|
pub type Result<T> = result::Result<T, VcpuError>;
|
||||||
|
|
||||||
|
/// Requests that can be delivered to a vCPU thread over its event channel.
#[derive(Debug)]
pub enum VcpuEvent {
    /// Ask the vCPU to exit.
    Exit,
    /// Ask the vCPU to pause.
    Pause,
    /// Ask the vCPU to resume.
    Resume,
    /// Ask the vCPU thread to report its thread id.
    Gettid,

    /// Ask the vCPU to revalidate its IoManager cache.
    RevalidateCache,
}
|
||||||
|
|
||||||
|
/// List of responses that the Vcpu reports.
|
||||||
|
pub enum VcpuResponse {
|
||||||
|
/// Vcpu is paused.
|
||||||
|
Paused,
|
||||||
|
/// Vcpu is resumed.
|
||||||
|
Resumed,
|
||||||
|
/// Vcpu index and thread tid.
|
||||||
|
Tid(u8, u32),
|
||||||
|
/// Requested Vcpu operation is not allowed.
|
||||||
|
NotAllowed,
|
||||||
|
/// Requestion action encountered an error
|
||||||
|
Error(VcpuError),
|
||||||
|
/// Vcpu IoManager cache is revalidated
|
||||||
|
CacheRevalidated,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Events that can be sent through the `vcpu_state_sender` channel.
pub enum VcpuStateEvent {
    /// Hotplug outcome as a `(result, response)` pair; a result of 0 means failure and
    /// 1 means success.
    Hotplug((i32, u32)),
}
|
||||||
|
|
||||||
|
/// Wrapper over vCPU that hides the underlying interactions with the vCPU thread.
|
||||||
|
pub struct VcpuHandle {
|
||||||
|
event_sender: Sender<VcpuEvent>,
|
||||||
|
response_receiver: Receiver<VcpuResponse>,
|
||||||
|
vcpu_thread: thread::JoinHandle<()>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl VcpuHandle {
|
||||||
|
/// Send event to vCPU thread
|
||||||
|
pub fn send_event(&self, event: VcpuEvent) -> Result<()> {
|
||||||
|
// Use expect() to crash if the other thread closed this channel.
|
||||||
|
self.event_sender
|
||||||
|
.send(event)
|
||||||
|
.expect("event sender channel closed on vcpu end.");
|
||||||
|
// Kick the vCPU so it picks up the message.
|
||||||
|
self.vcpu_thread
|
||||||
|
.kill(sigrtmin() + VCPU_RTSIG_OFFSET)
|
||||||
|
.map_err(VcpuError::SignalVcpu)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Receive response from vcpu thread
|
||||||
|
pub fn response_receiver(&self) -> &Receiver<VcpuResponse> {
|
||||||
|
&self.response_receiver
|
||||||
|
}
|
||||||
|
|
||||||
|
#[allow(dead_code)]
|
||||||
|
/// Join the vcpu thread
|
||||||
|
pub fn join_vcpu_thread(self) -> thread::Result<()> {
|
||||||
|
self.vcpu_thread.join()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Outcome of a single `run_emulation()` call that did not fail.
#[derive(PartialEq)]
enum VcpuEmulation {
    // The KVM exit was handled; the vCPU can be run again.
    Handled,
    // KVM_RUN returned EINTR; pending external events should be checked.
    Interrupted,
    // The guest requested a reset or shutdown.
    Stopped,
}
|
||||||
|
|
||||||
|
/// A wrapper around creating and using a kvm-based VCPU.
|
||||||
|
pub struct Vcpu {
|
||||||
|
// vCPU fd used by the vCPU
|
||||||
|
fd: Arc<VcpuFd>,
|
||||||
|
// vCPU id info
|
||||||
|
id: u8,
|
||||||
|
// Io manager Cached for facilitating IO operations
|
||||||
|
io_mgr: IoManagerCached,
|
||||||
|
// Records vCPU create time stamp
|
||||||
|
create_ts: TimestampUs,
|
||||||
|
|
||||||
|
// The receiving end of events channel owned by the vcpu side.
|
||||||
|
event_receiver: Receiver<VcpuEvent>,
|
||||||
|
// The transmitting end of the events channel which will be given to the handler.
|
||||||
|
event_sender: Option<Sender<VcpuEvent>>,
|
||||||
|
// The receiving end of the responses channel which will be given to the handler.
|
||||||
|
response_receiver: Option<Receiver<VcpuResponse>>,
|
||||||
|
// The transmitting end of the responses channel owned by the vcpu side.
|
||||||
|
response_sender: Sender<VcpuResponse>,
|
||||||
|
// Event notifier for CPU hotplug.
|
||||||
|
// After arm adapts to hotplug vcpu, the dead code macro needs to be removed
|
||||||
|
#[cfg_attr(target_arch = "aarch64", allow(dead_code))]
|
||||||
|
vcpu_state_event: EventFd,
|
||||||
|
// CPU hotplug events.
|
||||||
|
// After arm adapts to hotplug vcpu, the dead code macro needs to be removed
|
||||||
|
#[cfg_attr(target_arch = "aarch64", allow(dead_code))]
|
||||||
|
vcpu_state_sender: Sender<VcpuStateEvent>,
|
||||||
|
|
||||||
|
// An `EventFd` that will be written into when this vcpu exits.
|
||||||
|
exit_evt: EventFd,
|
||||||
|
// Whether kvm used supports immediate_exit flag.
|
||||||
|
support_immediate_exit: bool,
|
||||||
|
|
||||||
|
// CPUID information for the x86_64 CPU
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
cpuid: kvm_bindings::CpuId,
|
||||||
|
|
||||||
|
/// Multiprocessor affinity register recorded for aarch64
|
||||||
|
#[cfg(target_arch = "aarch64")]
|
||||||
|
pub(crate) mpidr: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Using this for easier explicit type-casting to help IDEs interpret the code.
|
||||||
|
type VcpuCell = Cell<Option<*const Vcpu>>;
|
||||||
|
|
||||||
|
impl Vcpu {
|
||||||
|
thread_local!(static TLS_VCPU_PTR: VcpuCell = Cell::new(None));
|
||||||
|
|
||||||
|
/// Associates `self` with the current thread.
|
||||||
|
///
|
||||||
|
/// It is a prerequisite to successfully run `init_thread_local_data()` before using
|
||||||
|
/// `run_on_thread_local()` on the current thread.
|
||||||
|
/// This function will return an error if there already is a `Vcpu` present in the TLS.
|
||||||
|
fn init_thread_local_data(&mut self) -> Result<()> {
|
||||||
|
Self::TLS_VCPU_PTR.with(|cell: &VcpuCell| {
|
||||||
|
if cell.get().is_some() {
|
||||||
|
return Err(VcpuError::VcpuTlsInit);
|
||||||
|
}
|
||||||
|
cell.set(Some(self as *const Vcpu));
|
||||||
|
Ok(())
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Deassociates `self` from the current thread.
|
||||||
|
///
|
||||||
|
/// Should be called if the current `self` had called `init_thread_local_data()` and
|
||||||
|
/// now needs to move to a different thread.
|
||||||
|
///
|
||||||
|
/// Fails if `self` was not previously associated with the current thread.
|
||||||
|
fn reset_thread_local_data(&mut self) -> Result<()> {
|
||||||
|
// Best-effort to clean up TLS. If the `Vcpu` was moved to another thread
|
||||||
|
// _before_ running this, then there is nothing we can do.
|
||||||
|
Self::TLS_VCPU_PTR.with(|cell: &VcpuCell| {
|
||||||
|
if let Some(vcpu_ptr) = cell.get() {
|
||||||
|
if vcpu_ptr == self as *const Vcpu {
|
||||||
|
Self::TLS_VCPU_PTR.with(|cell: &VcpuCell| cell.take());
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(VcpuError::VcpuTlsNotPresent)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Runs `func` for the `Vcpu` associated with the current thread.
|
||||||
|
///
|
||||||
|
/// It requires that `init_thread_local_data()` was run on this thread.
|
||||||
|
///
|
||||||
|
/// Fails if there is no `Vcpu` associated with the current thread.
|
||||||
|
///
|
||||||
|
/// # Safety
|
||||||
|
///
|
||||||
|
/// This is marked unsafe as it allows temporary aliasing through
|
||||||
|
/// dereferencing from pointer an already borrowed `Vcpu`.
|
||||||
|
unsafe fn run_on_thread_local<F>(func: F) -> Result<()>
|
||||||
|
where
|
||||||
|
F: FnOnce(&Vcpu),
|
||||||
|
{
|
||||||
|
Self::TLS_VCPU_PTR.with(|cell: &VcpuCell| {
|
||||||
|
if let Some(vcpu_ptr) = cell.get() {
|
||||||
|
// Dereferencing here is safe since `TLS_VCPU_PTR` is populated/non-empty,
|
||||||
|
// and it is being cleared on `Vcpu::drop` so there is no dangling pointer.
|
||||||
|
let vcpu_ref: &Vcpu = &*vcpu_ptr;
|
||||||
|
func(vcpu_ref);
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
Err(VcpuError::VcpuTlsNotPresent)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Registers a signal handler which makes use of TLS and kvm immediate exit to
|
||||||
|
/// kick the vcpu running on the current thread, if there is one.
|
||||||
|
pub fn register_kick_signal_handler() {
|
||||||
|
extern "C" fn handle_signal(_: c_int, _: *mut siginfo_t, _: *mut c_void) {
|
||||||
|
// This is safe because it's temporarily aliasing the `Vcpu` object, but we are
|
||||||
|
// only reading `vcpu.fd` which does not change for the lifetime of the `Vcpu`.
|
||||||
|
unsafe {
|
||||||
|
let _ = Vcpu::run_on_thread_local(|vcpu| {
|
||||||
|
vcpu.fd.set_kvm_immediate_exit(1);
|
||||||
|
fence(Ordering::Release);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
register_signal_handler(sigrtmin() + VCPU_RTSIG_OFFSET, handle_signal)
|
||||||
|
.expect("Failed to register vcpu signal handler");
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the cpu index as seen by the guest OS.
|
||||||
|
pub fn cpu_index(&self) -> u8 {
|
||||||
|
self.id
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Moves the vcpu to its own thread and constructs a VcpuHandle.
|
||||||
|
/// The handle can be used to control the remote vcpu.
|
||||||
|
pub fn start_threaded(
|
||||||
|
mut self,
|
||||||
|
seccomp_filter: BpfProgram,
|
||||||
|
barrier: Arc<Barrier>,
|
||||||
|
) -> Result<VcpuHandle> {
|
||||||
|
let event_sender = self.event_sender.take().unwrap();
|
||||||
|
let response_receiver = self.response_receiver.take().unwrap();
|
||||||
|
|
||||||
|
let vcpu_thread = thread::Builder::new()
|
||||||
|
.name(format!("db_vcpu{}", self.cpu_index()))
|
||||||
|
.spawn(move || {
|
||||||
|
self.init_thread_local_data()
|
||||||
|
.expect("Cannot cleanly initialize vcpu TLS.");
|
||||||
|
barrier.wait();
|
||||||
|
self.run(seccomp_filter);
|
||||||
|
})
|
||||||
|
.map_err(VcpuError::VcpuSpawn)?;
|
||||||
|
|
||||||
|
Ok(VcpuHandle {
|
||||||
|
event_sender,
|
||||||
|
response_receiver,
|
||||||
|
vcpu_thread,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract the vcpu running logic for test mocking.
|
||||||
|
#[cfg(not(test))]
|
||||||
|
pub fn emulate(fd: &VcpuFd) -> std::result::Result<VcpuExit<'_>, kvm_ioctls::Error> {
|
||||||
|
fd.run()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Runs the vCPU in KVM context and handles the kvm exit reason.
|
||||||
|
///
|
||||||
|
/// Returns error or enum specifying whether emulation was handled or interrupted.
|
||||||
|
fn run_emulation(&mut self) -> Result<VcpuEmulation> {
|
||||||
|
match Vcpu::emulate(&self.fd) {
|
||||||
|
Ok(run) => match run {
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
VcpuExit::IoIn(addr, data) => {
|
||||||
|
let _ = self.io_mgr.pio_read(addr, data);
|
||||||
|
METRICS.vcpu.exit_io_in.inc();
|
||||||
|
Ok(VcpuEmulation::Handled)
|
||||||
|
}
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
VcpuExit::IoOut(addr, data) => {
|
||||||
|
if !self.check_io_port_info(addr, data)? {
|
||||||
|
let _ = self.io_mgr.pio_write(addr, data);
|
||||||
|
}
|
||||||
|
METRICS.vcpu.exit_io_out.inc();
|
||||||
|
Ok(VcpuEmulation::Handled)
|
||||||
|
}
|
||||||
|
VcpuExit::MmioRead(addr, data) => {
|
||||||
|
let _ = self.io_mgr.mmio_read(addr, data);
|
||||||
|
METRICS.vcpu.exit_mmio_read.inc();
|
||||||
|
Ok(VcpuEmulation::Handled)
|
||||||
|
}
|
||||||
|
VcpuExit::MmioWrite(addr, data) => {
|
||||||
|
#[cfg(target_arch = "aarch64")]
|
||||||
|
self.check_boot_complete_signal(addr, data);
|
||||||
|
|
||||||
|
let _ = self.io_mgr.mmio_write(addr, data);
|
||||||
|
METRICS.vcpu.exit_mmio_write.inc();
|
||||||
|
Ok(VcpuEmulation::Handled)
|
||||||
|
}
|
||||||
|
VcpuExit::Hlt => {
|
||||||
|
info!("Received KVM_EXIT_HLT signal");
|
||||||
|
Err(VcpuError::VcpuUnhandledKvmExit)
|
||||||
|
}
|
||||||
|
VcpuExit::Shutdown => {
|
||||||
|
info!("Received KVM_EXIT_SHUTDOWN signal");
|
||||||
|
Err(VcpuError::VcpuUnhandledKvmExit)
|
||||||
|
}
|
||||||
|
// Documentation specifies that below kvm exits are considered errors.
|
||||||
|
VcpuExit::FailEntry => {
|
||||||
|
METRICS.vcpu.failures.inc();
|
||||||
|
error!("Received KVM_EXIT_FAIL_ENTRY signal");
|
||||||
|
Err(VcpuError::VcpuUnhandledKvmExit)
|
||||||
|
}
|
||||||
|
VcpuExit::InternalError => {
|
||||||
|
METRICS.vcpu.failures.inc();
|
||||||
|
error!("Received KVM_EXIT_INTERNAL_ERROR signal");
|
||||||
|
Err(VcpuError::VcpuUnhandledKvmExit)
|
||||||
|
}
|
||||||
|
VcpuExit::SystemEvent(event_type, event_flags) => match event_type {
|
||||||
|
KVM_SYSTEM_EVENT_RESET | KVM_SYSTEM_EVENT_SHUTDOWN => {
|
||||||
|
info!(
|
||||||
|
"Received KVM_SYSTEM_EVENT: type: {}, event: {}",
|
||||||
|
event_type, event_flags
|
||||||
|
);
|
||||||
|
Ok(VcpuEmulation::Stopped)
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
METRICS.vcpu.failures.inc();
|
||||||
|
error!(
|
||||||
|
"Received KVM_SYSTEM_EVENT signal type: {}, flag: {}",
|
||||||
|
event_type, event_flags
|
||||||
|
);
|
||||||
|
Err(VcpuError::VcpuUnhandledKvmExit)
|
||||||
|
}
|
||||||
|
},
|
||||||
|
r => {
|
||||||
|
METRICS.vcpu.failures.inc();
|
||||||
|
// TODO: Are we sure we want to finish running a vcpu upon
|
||||||
|
// receiving a vm exit that is not necessarily an error?
|
||||||
|
error!("Unexpected exit reason on vcpu run: {:?}", r);
|
||||||
|
Err(VcpuError::VcpuUnhandledKvmExit)
|
||||||
|
}
|
||||||
|
},
|
||||||
|
// The unwrap on raw_os_error can only fail if we have a logic
|
||||||
|
// error in our code in which case it is better to panic.
|
||||||
|
Err(ref e) => {
|
||||||
|
match e.errno() {
|
||||||
|
libc::EAGAIN => Ok(VcpuEmulation::Handled),
|
||||||
|
libc::EINTR => {
|
||||||
|
self.fd.set_kvm_immediate_exit(0);
|
||||||
|
// Notify that this KVM_RUN was interrupted.
|
||||||
|
Ok(VcpuEmulation::Interrupted)
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
METRICS.vcpu.failures.inc();
|
||||||
|
error!("Failure during vcpu run: {}", e);
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
{
|
||||||
|
error!(
|
||||||
|
"dump regs: {:?}, dump sregs: {:?}",
|
||||||
|
self.fd.get_regs(),
|
||||||
|
self.fd.get_sregs()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Err(VcpuError::VcpuUnhandledKvmExit)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
// checkout the io port that dragonball used only
|
||||||
|
fn check_io_port_info(&self, addr: u16, data: &[u8]) -> Result<bool> {
|
||||||
|
let mut checked = false;
|
||||||
|
|
||||||
|
match addr {
|
||||||
|
// debug info signal
|
||||||
|
MAGIC_IOPORT_DEBUG_INFO => {
|
||||||
|
if data.len() == 4 {
|
||||||
|
let data = unsafe { std::ptr::read(data.as_ptr() as *const u32) };
|
||||||
|
log::warn!("KDBG: guest kernel debug info: 0x{:x}", data);
|
||||||
|
checked = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(checked)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn gettid() -> u32 {
|
||||||
|
nix::unistd::gettid().as_raw() as u32
|
||||||
|
}
|
||||||
|
|
||||||
|
// Asks the cached IO manager to revalidate its cache; currently always returns `Ok(())`.
fn revalidate_cache(&mut self) -> Result<()> {
    self.io_mgr.revalidate_cache();
    Ok(())
}
|
||||||
|
|
||||||
|
/// Main loop of the vCPU thread.
|
||||||
|
///
|
||||||
|
/// Runs the vCPU in KVM context in a loop. Handles KVM_EXITs then goes back in.
|
||||||
|
/// Note that the state of the VCPU and associated VM must be setup first for this to do
|
||||||
|
/// anything useful.
|
||||||
|
pub fn run(&mut self, seccomp_filter: BpfProgram) {
|
||||||
|
// Load seccomp filters for this vCPU thread.
|
||||||
|
// Execution panics if filters cannot be loaded, use --seccomp-level=0 if skipping filters
|
||||||
|
// altogether is the desired behaviour.
|
||||||
|
if let Err(e) = apply_filter(&seccomp_filter) {
|
||||||
|
if matches!(e, SecError::EmptyFilter) {
|
||||||
|
info!("vCPU thread {} use empty seccomp filters.", self.id);
|
||||||
|
} else {
|
||||||
|
panic!(
|
||||||
|
"Failed to set the requested seccomp filters on vCPU {}: Error: {}",
|
||||||
|
self.id, e
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("vcpu {} is running", self.cpu_index());
|
||||||
|
|
||||||
|
// Start running the machine state in the `Paused` state.
|
||||||
|
StateMachine::run(self, Self::paused);
|
||||||
|
}
|
||||||
|
|
||||||
|
// This is the main loop of the `Running` state.
|
||||||
|
fn running(&mut self) -> StateMachine<Self> {
|
||||||
|
// This loop is here just for optimizing the emulation path.
|
||||||
|
// No point in ticking the state machine if there are no external events.
|
||||||
|
loop {
|
||||||
|
match self.run_emulation() {
|
||||||
|
// Emulation ran successfully, continue.
|
||||||
|
Ok(VcpuEmulation::Handled) => {
|
||||||
|
// We need to break here if kvm doesn't support
|
||||||
|
// immediate_exit flag. Because the signal sent from vmm
|
||||||
|
// thread may occurs when handling the vcpu exit events, and
|
||||||
|
// in this case the external vcpu events may not be handled
|
||||||
|
// correctly, so we need to check the event_receiver channel
|
||||||
|
// after handle vcpu exit events to decrease the window that
|
||||||
|
// doesn't handle the vcpu external events.
|
||||||
|
if !self.support_immediate_exit {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Emulation was interrupted, check external events.
|
||||||
|
Ok(VcpuEmulation::Interrupted) => break,
|
||||||
|
// Emulation was stopped due to reset or shutdown.
|
||||||
|
Ok(VcpuEmulation::Stopped) => return StateMachine::next(Self::waiting_exit),
|
||||||
|
// Emulation errors lead to vCPU exit.
|
||||||
|
Err(e) => {
|
||||||
|
error!("vcpu: {}, run_emulation failed: {:?}", self.id, e);
|
||||||
|
return StateMachine::next(Self::waiting_exit);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// By default don't change state.
|
||||||
|
let mut state = StateMachine::next(Self::running);
|
||||||
|
|
||||||
|
// Break this emulation loop on any transition request/external event.
|
||||||
|
match self.event_receiver.try_recv() {
|
||||||
|
// Running ---- Exit ----> Exited
|
||||||
|
Ok(VcpuEvent::Exit) => {
|
||||||
|
// Move to 'exited' state.
|
||||||
|
state = StateMachine::next(Self::exited);
|
||||||
|
}
|
||||||
|
// Running ---- Pause ----> Paused
|
||||||
|
Ok(VcpuEvent::Pause) => {
|
||||||
|
// Nothing special to do.
|
||||||
|
self.response_sender
|
||||||
|
.send(VcpuResponse::Paused)
|
||||||
|
.expect("failed to send pause status");
|
||||||
|
|
||||||
|
// TODO: we should call `KVM_KVMCLOCK_CTRL` here to make sure
|
||||||
|
// TODO continued: the guest soft lockup watchdog does not panic on Resume.
|
||||||
|
//let _ = self.fd.kvmclock_ctrl();
|
||||||
|
|
||||||
|
// Move to 'paused' state.
|
||||||
|
state = StateMachine::next(Self::paused);
|
||||||
|
}
|
||||||
|
Ok(VcpuEvent::Resume) => {
|
||||||
|
self.response_sender
|
||||||
|
.send(VcpuResponse::Resumed)
|
||||||
|
.expect("failed to send resume status");
|
||||||
|
}
|
||||||
|
Ok(VcpuEvent::Gettid) => {
|
||||||
|
self.response_sender
|
||||||
|
.send(VcpuResponse::Tid(self.cpu_index(), Vcpu::gettid()))
|
||||||
|
.expect("failed to send vcpu thread tid");
|
||||||
|
}
|
||||||
|
Ok(VcpuEvent::RevalidateCache) => {
|
||||||
|
self.revalidate_cache()
|
||||||
|
.map(|()| {
|
||||||
|
self.response_sender
|
||||||
|
.send(VcpuResponse::CacheRevalidated)
|
||||||
|
.expect("failed to revalidate vcpu IoManager cache");
|
||||||
|
})
|
||||||
|
.map_err(|e| self.response_sender.send(VcpuResponse::Error(e)))
|
||||||
|
.expect("failed to revalidate vcpu IoManager cache");
|
||||||
|
}
|
||||||
|
// Unhandled exit of the other end.
|
||||||
|
Err(TryRecvError::Disconnected) => {
|
||||||
|
// Move to 'exited' state.
|
||||||
|
state = StateMachine::next(Self::exited);
|
||||||
|
}
|
||||||
|
// All other events or lack thereof have no effect on current 'running' state.
|
||||||
|
Err(TryRecvError::Empty) => (),
|
||||||
|
}
|
||||||
|
|
||||||
|
state
|
||||||
|
}
|
||||||
|
|
||||||
|
// This is the main loop of the `Paused` state.
|
||||||
|
fn paused(&mut self) -> StateMachine<Self> {
|
||||||
|
match self.event_receiver.recv() {
|
||||||
|
// Paused ---- Exit ----> Exited
|
||||||
|
Ok(VcpuEvent::Exit) => {
|
||||||
|
// Move to 'exited' state.
|
||||||
|
StateMachine::next(Self::exited)
|
||||||
|
}
|
||||||
|
// Paused ---- Resume ----> Running
|
||||||
|
Ok(VcpuEvent::Resume) => {
|
||||||
|
self.response_sender
|
||||||
|
.send(VcpuResponse::Resumed)
|
||||||
|
.expect("failed to send resume status");
|
||||||
|
// Move to 'running' state.
|
||||||
|
StateMachine::next(Self::running)
|
||||||
|
}
|
||||||
|
Ok(VcpuEvent::Pause) => {
|
||||||
|
self.response_sender
|
||||||
|
.send(VcpuResponse::Paused)
|
||||||
|
.expect("failed to send pause status");
|
||||||
|
// continue 'pause' state.
|
||||||
|
StateMachine::next(Self::paused)
|
||||||
|
}
|
||||||
|
Ok(VcpuEvent::Gettid) => {
|
||||||
|
self.response_sender
|
||||||
|
.send(VcpuResponse::Tid(self.cpu_index(), Vcpu::gettid()))
|
||||||
|
.expect("failed to send vcpu thread tid");
|
||||||
|
StateMachine::next(Self::paused)
|
||||||
|
}
|
||||||
|
Ok(VcpuEvent::RevalidateCache) => {
|
||||||
|
self.revalidate_cache()
|
||||||
|
.map(|()| {
|
||||||
|
self.response_sender
|
||||||
|
.send(VcpuResponse::CacheRevalidated)
|
||||||
|
.expect("failed to revalidate vcpu IoManager cache");
|
||||||
|
})
|
||||||
|
.map_err(|e| self.response_sender.send(VcpuResponse::Error(e)))
|
||||||
|
.expect("failed to revalidate vcpu IoManager cache");
|
||||||
|
|
||||||
|
StateMachine::next(Self::paused)
|
||||||
|
}
|
||||||
|
// Unhandled exit of the other end.
|
||||||
|
Err(_) => {
|
||||||
|
// Move to 'exited' state.
|
||||||
|
StateMachine::next(Self::exited)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// This is the main loop of the `WaitingExit` state.
|
||||||
|
fn waiting_exit(&mut self) -> StateMachine<Self> {
|
||||||
|
// trigger vmm to stop machine
|
||||||
|
if let Err(e) = self.exit_evt.write(1) {
|
||||||
|
METRICS.vcpu.failures.inc();
|
||||||
|
error!("Failed signaling vcpu exit event: {}", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut state = StateMachine::next(Self::waiting_exit);
|
||||||
|
|
||||||
|
match self.event_receiver.recv() {
|
||||||
|
Ok(VcpuEvent::Exit) => state = StateMachine::next(Self::exited),
|
||||||
|
Ok(_) => error!(
|
||||||
|
"wrong state received in waiting exit state on vcpu {}",
|
||||||
|
self.id
|
||||||
|
),
|
||||||
|
Err(_) => {
|
||||||
|
error!(
|
||||||
|
"vcpu channel closed in waiting exit state on vcpu {}",
|
||||||
|
self.id
|
||||||
|
);
|
||||||
|
state = StateMachine::next(Self::exited);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
state
|
||||||
|
}
|
||||||
|
|
||||||
|
// This is the main loop of the `Exited` state.
|
||||||
|
fn exited(&mut self) -> StateMachine<Self> {
|
||||||
|
// State machine reached its end.
|
||||||
|
StateMachine::finish(Self::exited)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for Vcpu {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
let _ = self.reset_thread_local_data();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
pub mod tests {
|
||||||
|
use std::os::unix::io::AsRawFd;
|
||||||
|
use std::sync::mpsc::{channel, Receiver};
|
||||||
|
use std::sync::Mutex;
|
||||||
|
|
||||||
|
use arc_swap::ArcSwap;
|
||||||
|
use dbs_device::device_manager::IoManager;
|
||||||
|
use kvm_ioctls::Kvm;
|
||||||
|
use lazy_static::lazy_static;
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
use crate::kvm_context::KvmContext;
|
||||||
|
|
||||||
|
/// Result that the mocked `Vcpu::emulate` should produce in tests.
pub enum EmulationCase {
    IoIn,
    IoOut,
    MmioRead,
    MmioWrite,
    Hlt,
    Shutdown,
    FailEntry,
    InternalError,
    Unknown,
    /// System event with the given `(event_type, event_flags)`.
    SystemEvent(u32, u64),
    /// KVM_RUN failure with the given errno.
    Error(i32),
}
|
||||||
|
|
||||||
|
lazy_static! {
    /// Case that the mocked `Vcpu::emulate` reports next; starts out as `Unknown`.
    pub static ref EMULATE_RES: Mutex<EmulationCase> = Mutex::new(EmulationCase::Unknown);
}
|
||||||
|
|
||||||
|
impl Vcpu {
|
||||||
|
pub fn emulate(_fd: &VcpuFd) -> std::result::Result<VcpuExit<'_>, kvm_ioctls::Error> {
|
||||||
|
let res = &*EMULATE_RES.lock().unwrap();
|
||||||
|
match res {
|
||||||
|
EmulationCase::IoIn => Ok(VcpuExit::IoIn(0, &mut [])),
|
||||||
|
EmulationCase::IoOut => Ok(VcpuExit::IoOut(0, &[])),
|
||||||
|
EmulationCase::MmioRead => Ok(VcpuExit::MmioRead(0, &mut [])),
|
||||||
|
EmulationCase::MmioWrite => Ok(VcpuExit::MmioWrite(0, &[])),
|
||||||
|
EmulationCase::Hlt => Ok(VcpuExit::Hlt),
|
||||||
|
EmulationCase::Shutdown => Ok(VcpuExit::Shutdown),
|
||||||
|
EmulationCase::FailEntry => Ok(VcpuExit::FailEntry),
|
||||||
|
EmulationCase::InternalError => Ok(VcpuExit::InternalError),
|
||||||
|
EmulationCase::Unknown => Ok(VcpuExit::Unknown),
|
||||||
|
EmulationCase::SystemEvent(event_type, event_flags) => {
|
||||||
|
Ok(VcpuExit::SystemEvent(*event_type, *event_flags))
|
||||||
|
}
|
||||||
|
EmulationCase::Error(e) => Err(kvm_ioctls::Error::new(*e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
fn create_vcpu() -> (Vcpu, Receiver<VcpuStateEvent>) {
|
||||||
|
// Call for kvm too frequently would cause error in some host kernel.
|
||||||
|
std::thread::sleep(std::time::Duration::from_millis(5));
|
||||||
|
|
||||||
|
let kvm = Kvm::new().unwrap();
|
||||||
|
let vm = Arc::new(kvm.create_vm().unwrap());
|
||||||
|
let kvm_context = KvmContext::new(Some(kvm.as_raw_fd())).unwrap();
|
||||||
|
let vcpu_fd = Arc::new(vm.create_vcpu(0).unwrap());
|
||||||
|
let io_manager = IoManagerCached::new(Arc::new(ArcSwap::new(Arc::new(IoManager::new()))));
|
||||||
|
let supported_cpuid = kvm_context
|
||||||
|
.supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
|
||||||
|
.unwrap();
|
||||||
|
let reset_event_fd = EventFd::new(libc::EFD_NONBLOCK).unwrap();
|
||||||
|
let vcpu_state_event = EventFd::new(libc::EFD_NONBLOCK).unwrap();
|
||||||
|
let (tx, rx) = channel();
|
||||||
|
let time_stamp = TimestampUs::default();
|
||||||
|
|
||||||
|
let vcpu = Vcpu::new_x86_64(
|
||||||
|
0,
|
||||||
|
vcpu_fd,
|
||||||
|
io_manager,
|
||||||
|
supported_cpuid,
|
||||||
|
reset_event_fd,
|
||||||
|
vcpu_state_event,
|
||||||
|
tx,
|
||||||
|
time_stamp,
|
||||||
|
false,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
(vcpu, rx)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(target_arch = "x86_64")]
#[test]
fn test_vcpu_run_emulation() {
    let (mut vcpu, _) = create_vcpu();

    // Stores `case` in the shared `EMULATE_RES` slot and then performs a single
    // `run_emulation()` call, returning its result. The lock guard is a
    // temporary, so it is released before `run_emulation()` is entered.
    let mut emulate = |case: EmulationCase| {
        *(EMULATE_RES.lock().unwrap()) = case;
        vcpu.run_emulation()
    };

    // Io in
    assert!(matches!(
        emulate(EmulationCase::IoIn),
        Ok(VcpuEmulation::Handled)
    ));

    // Io out
    assert!(matches!(
        emulate(EmulationCase::IoOut),
        Ok(VcpuEmulation::Handled)
    ));

    // Mmio read
    assert!(matches!(
        emulate(EmulationCase::MmioRead),
        Ok(VcpuEmulation::Handled)
    ));

    // Mmio write
    assert!(matches!(
        emulate(EmulationCase::MmioWrite),
        Ok(VcpuEmulation::Handled)
    ));

    // KVM_EXIT_HLT signal
    assert!(matches!(
        emulate(EmulationCase::Hlt),
        Err(VcpuError::VcpuUnhandledKvmExit)
    ));

    // KVM_EXIT_SHUTDOWN signal
    assert!(matches!(
        emulate(EmulationCase::Shutdown),
        Err(VcpuError::VcpuUnhandledKvmExit)
    ));

    // KVM_EXIT_FAIL_ENTRY signal
    assert!(matches!(
        emulate(EmulationCase::FailEntry),
        Err(VcpuError::VcpuUnhandledKvmExit)
    ));

    // KVM_EXIT_INTERNAL_ERROR signal
    assert!(matches!(
        emulate(EmulationCase::InternalError),
        Err(VcpuError::VcpuUnhandledKvmExit)
    ));

    // KVM_SYSTEM_EVENT_RESET
    assert!(matches!(
        emulate(EmulationCase::SystemEvent(KVM_SYSTEM_EVENT_RESET, 0)),
        Ok(VcpuEmulation::Stopped)
    ));

    // KVM_SYSTEM_EVENT_SHUTDOWN
    assert!(matches!(
        emulate(EmulationCase::SystemEvent(KVM_SYSTEM_EVENT_SHUTDOWN, 0)),
        Ok(VcpuEmulation::Stopped)
    ));

    // Other system event
    assert!(matches!(
        emulate(EmulationCase::SystemEvent(0, 0)),
        Err(VcpuError::VcpuUnhandledKvmExit)
    ));

    // Unknown exit reason
    assert!(matches!(
        emulate(EmulationCase::Unknown),
        Err(VcpuError::VcpuUnhandledKvmExit)
    ));

    // Error: EAGAIN
    assert!(matches!(
        emulate(EmulationCase::Error(libc::EAGAIN)),
        Ok(VcpuEmulation::Handled)
    ));

    // Error: EINTR
    assert!(matches!(
        emulate(EmulationCase::Error(libc::EINTR)),
        Ok(VcpuEmulation::Interrupted)
    ));

    // other error
    assert!(matches!(
        emulate(EmulationCase::Error(libc::EINVAL)),
        Err(VcpuError::VcpuUnhandledKvmExit)
    ));
}
|
||||||
|
|
||||||
|
#[cfg(target_arch = "x86_64")]
#[test]
fn test_vcpu_check_io_port_info() {
    // The receiver is bound to a name (not discarded with `_`) so it lives
    // until the end of the test and the paired channel stays open.
    // NOTE(review): presumably `check_io_port_info` may send on that channel;
    // confirm against its implementation.
    let (vcpu, _receiver) = create_vcpu();

    // boot complete signal
    let boot_complete_handled = vcpu
        .check_io_port_info(
            MAGIC_IOPORT_SIGNAL_GUEST_BOOT_COMPLETE,
            &[MAGIC_VALUE_SIGNAL_GUEST_BOOT_COMPLETE],
        )
        .unwrap();
    assert!(boot_complete_handled);

    // debug info signal
    let debug_info_handled = vcpu
        .check_io_port_info(MAGIC_IOPORT_DEBUG_INFO, &[0, 0, 0, 0])
        .unwrap();
    assert!(debug_info_handled);
}
|
||||||
|
}
|
1039
src/dragonball/src/vcpu/vcpu_manager.rs
Normal file
1039
src/dragonball/src/vcpu/vcpu_manager.rs
Normal file
File diff suppressed because it is too large
Load Diff
149
src/dragonball/src/vcpu/x86_64.rs
Normal file
149
src/dragonball/src/vcpu/x86_64.rs
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
|
||||||
|
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
//
|
||||||
|
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style license that can be
|
||||||
|
// found in the THIRD-PARTY file.
|
||||||
|
|
||||||
|
use std::sync::mpsc::{channel, Sender};
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use dbs_arch::cpuid::{process_cpuid, VmSpec};
|
||||||
|
use dbs_arch::gdt::gdt_entry;
|
||||||
|
use dbs_utils::time::TimestampUs;
|
||||||
|
use kvm_bindings::CpuId;
|
||||||
|
use kvm_ioctls::{VcpuFd, VmFd};
|
||||||
|
use log::error;
|
||||||
|
use vm_memory::{Address, GuestAddress, GuestAddressSpace};
|
||||||
|
use vmm_sys_util::eventfd::EventFd;
|
||||||
|
|
||||||
|
use crate::address_space_manager::GuestAddressSpaceImpl;
|
||||||
|
use crate::metric::{IncMetric, METRICS};
|
||||||
|
use crate::vcpu::vcpu_impl::{Result, Vcpu, VcpuError, VcpuStateEvent};
|
||||||
|
use crate::vcpu::VcpuConfig;
|
||||||
|
use crate::IoManagerCached;
|
||||||
|
|
||||||
|
impl Vcpu {
|
||||||
|
/// Constructs a new VCPU for `vm`.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// * `id` - Represents the CPU number between [0, max vcpus).
|
||||||
|
/// * `vcpu_fd` - The kvm `VcpuFd` for the vcpu.
|
||||||
|
/// * `io_mgr` - The io-manager used to access port-io and mmio devices.
|
||||||
|
/// * `cpuid` - The `CpuId` listing the supported capabilities of this vcpu.
|
||||||
|
/// * `exit_evt` - An `EventFd` that will be written into when this vcpu
|
||||||
|
/// exits.
|
||||||
|
/// * `vcpu_state_event` - The eventfd which can notify vmm state of some
|
||||||
|
/// vcpu should change.
|
||||||
|
/// * `vcpu_state_sender` - The channel to send state change message from
|
||||||
|
/// vcpu thread to vmm thread.
|
||||||
|
/// * `create_ts` - A timestamp used by the vcpu to calculate its lifetime.
|
||||||
|
/// * `support_immediate_exit` - whether kvm used supports immediate_exit flag.
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
|
pub fn new_x86_64(
|
||||||
|
id: u8,
|
||||||
|
vcpu_fd: Arc<VcpuFd>,
|
||||||
|
io_mgr: IoManagerCached,
|
||||||
|
cpuid: CpuId,
|
||||||
|
exit_evt: EventFd,
|
||||||
|
vcpu_state_event: EventFd,
|
||||||
|
vcpu_state_sender: Sender<VcpuStateEvent>,
|
||||||
|
create_ts: TimestampUs,
|
||||||
|
support_immediate_exit: bool,
|
||||||
|
) -> Result<Self> {
|
||||||
|
let (event_sender, event_receiver) = channel();
|
||||||
|
let (response_sender, response_receiver) = channel();
|
||||||
|
// Initially the cpuid per vCPU is the one supported by this VM.
|
||||||
|
Ok(Vcpu {
|
||||||
|
fd: vcpu_fd,
|
||||||
|
id,
|
||||||
|
io_mgr,
|
||||||
|
create_ts,
|
||||||
|
event_receiver,
|
||||||
|
event_sender: Some(event_sender),
|
||||||
|
response_receiver: Some(response_receiver),
|
||||||
|
response_sender,
|
||||||
|
vcpu_state_event,
|
||||||
|
vcpu_state_sender,
|
||||||
|
exit_evt,
|
||||||
|
support_immediate_exit,
|
||||||
|
cpuid,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Configures a x86_64 specific vcpu and should be called once per vcpu.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// * `vm_config` - The machine configuration of this microvm needed for the CPUID configuration.
|
||||||
|
/// * `vm_fd` - The kvm `VmFd` for the virtual machine this vcpu will get attached to.
|
||||||
|
/// * `vm_memory` - The guest memory used by this microvm.
|
||||||
|
/// * `kernel_start_addr` - Offset from `guest_mem` at which the kernel starts.
|
||||||
|
/// * `pgtable_addr` - pgtable address for ap vcpu
|
||||||
|
pub fn configure(
|
||||||
|
&mut self,
|
||||||
|
vcpu_config: &VcpuConfig,
|
||||||
|
_vm_fd: &VmFd,
|
||||||
|
vm_as: &GuestAddressSpaceImpl,
|
||||||
|
kernel_start_addr: Option<GuestAddress>,
|
||||||
|
_pgtable_addr: Option<GuestAddress>,
|
||||||
|
) -> Result<()> {
|
||||||
|
self.set_cpuid(vcpu_config)?;
|
||||||
|
|
||||||
|
dbs_arch::regs::setup_msrs(&self.fd).map_err(VcpuError::MSRSConfiguration)?;
|
||||||
|
if let Some(start_addr) = kernel_start_addr {
|
||||||
|
dbs_arch::regs::setup_regs(
|
||||||
|
&self.fd,
|
||||||
|
start_addr.raw_value() as u64,
|
||||||
|
dbs_boot::layout::BOOT_STACK_POINTER,
|
||||||
|
dbs_boot::layout::BOOT_STACK_POINTER,
|
||||||
|
dbs_boot::layout::ZERO_PAGE_START,
|
||||||
|
)
|
||||||
|
.map_err(VcpuError::REGSConfiguration)?;
|
||||||
|
dbs_arch::regs::setup_fpu(&self.fd).map_err(VcpuError::FPUConfiguration)?;
|
||||||
|
let gdt_table: [u64; dbs_boot::layout::BOOT_GDT_MAX as usize] = [
|
||||||
|
gdt_entry(0, 0, 0), // NULL
|
||||||
|
gdt_entry(0xa09b, 0, 0xfffff), // CODE
|
||||||
|
gdt_entry(0xc093, 0, 0xfffff), // DATA
|
||||||
|
gdt_entry(0x808b, 0, 0xfffff), // TSS
|
||||||
|
];
|
||||||
|
let pgtable_addr =
|
||||||
|
dbs_boot::setup_identity_mapping(&*vm_as.memory()).map_err(VcpuError::PageTable)?;
|
||||||
|
dbs_arch::regs::setup_sregs(
|
||||||
|
&*vm_as.memory(),
|
||||||
|
&self.fd,
|
||||||
|
pgtable_addr,
|
||||||
|
&gdt_table,
|
||||||
|
dbs_boot::layout::BOOT_GDT_OFFSET,
|
||||||
|
dbs_boot::layout::BOOT_IDT_OFFSET,
|
||||||
|
)
|
||||||
|
.map_err(VcpuError::SREGSConfiguration)?;
|
||||||
|
}
|
||||||
|
dbs_arch::interrupts::set_lint(&self.fd).map_err(VcpuError::LocalIntConfiguration)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn set_cpuid(&mut self, vcpu_config: &VcpuConfig) -> Result<()> {
|
||||||
|
let cpuid_vm_spec = VmSpec::new(
|
||||||
|
self.id,
|
||||||
|
vcpu_config.max_vcpu_count as u8,
|
||||||
|
vcpu_config.threads_per_core,
|
||||||
|
vcpu_config.cores_per_die,
|
||||||
|
vcpu_config.dies_per_socket,
|
||||||
|
vcpu_config.vpmu_feature,
|
||||||
|
)
|
||||||
|
.map_err(VcpuError::CpuId)?;
|
||||||
|
process_cpuid(&mut self.cpuid, &cpuid_vm_spec).map_err(|e| {
|
||||||
|
METRICS.vcpu.filter_cpuid.inc();
|
||||||
|
error!("Failure in configuring CPUID for vcpu {}: {:?}", self.id, e);
|
||||||
|
VcpuError::CpuId(e)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
self.fd
|
||||||
|
.set_cpuid2(&self.cpuid)
|
||||||
|
.map_err(VcpuError::SetSupportedCpusFailed)
|
||||||
|
}
|
||||||
|
}
|
@ -18,3 +18,79 @@ pub struct NumaRegionInfo {
|
|||||||
/// vcpu ids belonging to this region
|
/// vcpu ids belonging to this region
|
||||||
pub vcpu_ids: Vec<u32>,
|
pub vcpu_ids: Vec<u32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Information for cpu topology to guide guest init
|
||||||
|
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
|
||||||
|
pub struct CpuTopology {
|
||||||
|
/// threads per core to indicate hyperthreading is enabled or not
|
||||||
|
pub threads_per_core: u8,
|
||||||
|
/// cores per die to guide guest cpu topology init
|
||||||
|
pub cores_per_die: u8,
|
||||||
|
/// dies per socket to guide guest cpu topology
|
||||||
|
pub dies_per_socket: u8,
|
||||||
|
/// number of sockets
|
||||||
|
pub sockets: u8,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for CpuTopology {
|
||||||
|
fn default() -> Self {
|
||||||
|
CpuTopology {
|
||||||
|
threads_per_core: 1,
|
||||||
|
cores_per_die: 1,
|
||||||
|
dies_per_socket: 1,
|
||||||
|
sockets: 1,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Configuration information for virtual machine instance.
|
||||||
|
#[derive(Clone, Debug, PartialEq)]
|
||||||
|
pub struct VmConfigInfo {
|
||||||
|
/// Number of vcpu to start.
|
||||||
|
pub vcpu_count: u8,
|
||||||
|
/// Max number of vcpu can be added
|
||||||
|
pub max_vcpu_count: u8,
|
||||||
|
/// Enable or disable hyperthreading.
|
||||||
|
pub ht_enabled: bool,
|
||||||
|
/// cpu power management.
|
||||||
|
pub cpu_pm: String,
|
||||||
|
/// cpu topology information
|
||||||
|
pub cpu_topology: CpuTopology,
|
||||||
|
/// vpmu support level
|
||||||
|
pub vpmu_feature: u8,
|
||||||
|
|
||||||
|
/// Memory type that can be either hugetlbfs or shmem, default is shmem
|
||||||
|
pub mem_type: String,
|
||||||
|
/// Memory file path
|
||||||
|
pub mem_file_path: String,
|
||||||
|
/// The memory size in MiB.
|
||||||
|
pub mem_size_mib: usize,
|
||||||
|
/// reserve memory bytes
|
||||||
|
pub reserve_memory_bytes: u64,
|
||||||
|
|
||||||
|
/// sock path
|
||||||
|
pub serial_path: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for VmConfigInfo {
|
||||||
|
fn default() -> Self {
|
||||||
|
VmConfigInfo {
|
||||||
|
vcpu_count: 1,
|
||||||
|
max_vcpu_count: 1,
|
||||||
|
ht_enabled: false,
|
||||||
|
cpu_pm: String::from("on"),
|
||||||
|
cpu_topology: CpuTopology {
|
||||||
|
threads_per_core: 1,
|
||||||
|
cores_per_die: 1,
|
||||||
|
dies_per_socket: 1,
|
||||||
|
sockets: 1,
|
||||||
|
},
|
||||||
|
vpmu_feature: 0,
|
||||||
|
mem_type: String::from("shmem"),
|
||||||
|
mem_file_path: String::from(""),
|
||||||
|
mem_size_mib: 128,
|
||||||
|
reserve_memory_bytes: 0,
|
||||||
|
serial_path: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user