mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-04-10 14:02:59 +00:00
Compare commits
27 Commits
dependabot
...
topic/runt
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a89dfcfd4f | ||
|
|
2b17f9cbda | ||
|
|
1135f19d43 | ||
|
|
fd2e08db9e | ||
|
|
505f30dc26 | ||
|
|
cd5304a959 | ||
|
|
3257746678 | ||
|
|
ff84b5f8ca | ||
|
|
504101d77a | ||
|
|
598b11f206 | ||
|
|
aa2e72f94c | ||
|
|
2cf1001c37 | ||
|
|
fd6375d8d5 | ||
|
|
218077506b | ||
|
|
dca89485f0 | ||
|
|
72fb41d33b | ||
|
|
9e8069569e | ||
|
|
5e1ab0aa7d | ||
|
|
3b155ab0b1 | ||
|
|
31f9a5461b | ||
|
|
98ee385220 | ||
|
|
26ffe1223b | ||
|
|
38382a59c4 | ||
|
|
2bac201364 | ||
|
|
10b24a19c8 | ||
|
|
f133b81579 | ||
|
|
d6546f2a56 |
@@ -37,8 +37,10 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
environment: [
|
||||
{ name: nvidia-gpu, vmm: qemu-nvidia-gpu, runner: amd64-nvidia-a100 },
|
||||
{ name: nvidia-gpu-snp, vmm: qemu-nvidia-gpu-snp, runner: amd64-nvidia-h100-snp },
|
||||
{ name: nvidia-gpu, vmm: qemu-nvidia-gpu, runner: amd64-nvidia-a100, coco: false },
|
||||
{ name: nvidia-gpu (runtime-rs), vmm: qemu-nvidia-gpu-runtime-rs, runner: amd64-nvidia-a100, coco: false },
|
||||
{ name: nvidia-gpu-snp, vmm: qemu-nvidia-gpu-snp, runner: amd64-nvidia-h100-snp, coco: true },
|
||||
{ name: nvidia-gpu-snp (runtime-rs), vmm: qemu-nvidia-gpu-snp-runtime-rs, runner: amd64-nvidia-h100-snp, coco: true },
|
||||
]
|
||||
runs-on: ${{ matrix.environment.runner }}
|
||||
env:
|
||||
@@ -48,9 +50,9 @@ jobs:
|
||||
GH_PR_NUMBER: ${{ inputs.pr-number }}
|
||||
KATA_HYPERVISOR: ${{ matrix.environment.vmm }}
|
||||
KUBERNETES: kubeadm
|
||||
KBS: ${{ matrix.environment.name == 'nvidia-gpu-snp' && 'true' || 'false' }}
|
||||
SNAPSHOTTER: ${{ matrix.environment.name == 'nvidia-gpu-snp' && 'nydus' || '' }}
|
||||
USE_EXPERIMENTAL_SNAPSHOTTER_SETUP: ${{ matrix.environment.name == 'nvidia-gpu-snp' && 'true' || 'false' }}
|
||||
KBS: ${{ matrix.environment.coco && 'true' || 'false' }}
|
||||
SNAPSHOTTER: ${{ matrix.environment.coco && 'nydus' || '' }}
|
||||
USE_EXPERIMENTAL_SNAPSHOTTER_SETUP: ${{ matrix.environment.coco && 'true' || 'false' }}
|
||||
K8S_TEST_HOST_TYPE: baremetal
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
@@ -75,12 +77,12 @@ jobs:
|
||||
run: bash tests/integration/kubernetes/gha-run.sh install-kata-tools kata-tools-artifacts
|
||||
|
||||
- name: Uninstall previous `kbs-client`
|
||||
if: matrix.environment.name != 'nvidia-gpu'
|
||||
if: matrix.environment.coco
|
||||
timeout-minutes: 10
|
||||
run: bash tests/integration/kubernetes/gha-run.sh uninstall-kbs-client
|
||||
|
||||
- name: Deploy CoCo KBS
|
||||
if: matrix.environment.name != 'nvidia-gpu'
|
||||
if: matrix.environment.coco
|
||||
timeout-minutes: 10
|
||||
run: bash tests/integration/kubernetes/gha-run.sh deploy-coco-kbs
|
||||
env:
|
||||
@@ -88,7 +90,7 @@ jobs:
|
||||
KBS_INGRESS: nodeport
|
||||
|
||||
- name: Install `kbs-client`
|
||||
if: matrix.environment.name != 'nvidia-gpu'
|
||||
if: matrix.environment.coco
|
||||
timeout-minutes: 10
|
||||
run: bash tests/integration/kubernetes/gha-run.sh install-kbs-client
|
||||
|
||||
@@ -127,7 +129,7 @@ jobs:
|
||||
run: bash tests/integration/kubernetes/gha-run.sh cleanup
|
||||
|
||||
- name: Delete CoCo KBS
|
||||
if: always() && matrix.environment.name != 'nvidia-gpu'
|
||||
if: always() && matrix.environment.coco
|
||||
timeout-minutes: 10
|
||||
run: |
|
||||
bash tests/integration/kubernetes/gha-run.sh delete-coco-kbs
|
||||
|
||||
2
.github/workflows/run-kata-coco-tests.yaml
vendored
2
.github/workflows/run-kata-coco-tests.yaml
vendored
@@ -53,6 +53,8 @@ jobs:
|
||||
vmm: qemu-tdx
|
||||
- runner: sev-snp
|
||||
vmm: qemu-snp
|
||||
- runner: sev-snp
|
||||
vmm: qemu-snp-runtime-rs
|
||||
runs-on: ${{ matrix.runner }}
|
||||
env:
|
||||
DOCKER_REGISTRY: ${{ inputs.registry }}
|
||||
|
||||
727
Cargo.lock
generated
727
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -11,6 +11,7 @@ members = [
|
||||
"src/libs/kata-types",
|
||||
"src/libs/logging",
|
||||
"src/libs/mem-agent",
|
||||
"src/libs/pod-resources-rs",
|
||||
"src/libs/protocols",
|
||||
"src/libs/runtime-spec",
|
||||
"src/libs/safe-path",
|
||||
@@ -76,8 +77,8 @@ kvm-bindings = "0.14.0"
|
||||
kvm-ioctls = "0.24.0"
|
||||
linux-loader = "0.13.0"
|
||||
seccompiler = "0.5.0"
|
||||
vfio-bindings = "0.6.1"
|
||||
vfio-ioctls = "0.5.0"
|
||||
vfio-bindings = "0.6.2"
|
||||
vfio-ioctls = "0.6.0"
|
||||
virtio-bindings = "0.2.0"
|
||||
virtio-queue = "0.17.0"
|
||||
vm-fdt = "0.3.0"
|
||||
@@ -117,6 +118,7 @@ wasm_container = { path = "src/runtime-rs/crates/runtimes/wasm_container" }
|
||||
|
||||
# Local dependencies from `src/lib`
|
||||
kata-sys-util = { path = "src/libs/kata-sys-util" }
|
||||
pod-resources-rs = { path = "src/libs/pod-resources-rs" }
|
||||
kata-types = { path = "src/libs/kata-types", features = ["safe-path"] }
|
||||
logging = { path = "src/libs/logging" }
|
||||
mem-agent = { path = "src/libs/mem-agent" }
|
||||
|
||||
@@ -250,6 +250,21 @@ pub async fn add_devices(
|
||||
update_spec_devices(logger, spec, dev_updates)
|
||||
}
|
||||
|
||||
pub fn dump_nvidia_cdi_yaml(logger: &Logger) -> Result<()> {
|
||||
let file_path = "/var/run/cdi/nvidia.yaml";
|
||||
let path = PathBuf::from(file_path);
|
||||
|
||||
if !path.exists() {
|
||||
error!(logger, "file does not exist: {}", file_path);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let content = fs::read_to_string(path)?;
|
||||
info!(logger, "===== cdi filepath at {:?} with content: ===== \n {:?}", file_path, content);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument]
|
||||
pub async fn handle_cdi_devices(
|
||||
logger: &Logger,
|
||||
@@ -308,9 +323,11 @@ pub async fn handle_cdi_devices(
|
||||
cdi_timeout.as_secs(),
|
||||
e
|
||||
);
|
||||
time::sleep(Duration::from_secs(1)).await;
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
time::sleep(Duration::from_secs(1)).await;
|
||||
// time::sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
Err(anyhow!(
|
||||
"failed to inject devices after CDI timeout of {} seconds",
|
||||
@@ -561,6 +578,104 @@ fn update_spec_devices(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn parse_pci_bdf_name(name: &str) -> Option<pci::Address> {
|
||||
pci::Address::from_str(name).ok()
|
||||
}
|
||||
|
||||
fn bus_of_addr(addr: &pci::Address) -> Result<String> {
|
||||
// addr.to_string() format: "0000:01:00.0"
|
||||
let s = addr.to_string();
|
||||
let mut parts = s.split(':');
|
||||
|
||||
let domain = parts
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("bad pci address {}", s))?;
|
||||
let bus = parts
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("bad pci address {}", s))?;
|
||||
|
||||
Ok(format!("{domain}:{bus}"))
|
||||
}
|
||||
|
||||
fn unique_bus_from_pci_addresses(addrs: &[pci::Address]) -> Result<String> {
|
||||
let mut buses = addrs.iter().map(bus_of_addr).collect::<Result<Vec<_>>>()?;
|
||||
|
||||
buses.sort();
|
||||
buses.dedup();
|
||||
|
||||
match buses.len() {
|
||||
1 => Ok(buses[0].clone()),
|
||||
0 => Err(anyhow!("no downstream PCI devices found")),
|
||||
_ => Err(anyhow!("multiple downstream buses found: {:?}", buses)),
|
||||
}
|
||||
}
|
||||
|
||||
fn read_single_bus_from_pci_bus_dir(bridgebuspath: &PathBuf) -> Result<String> {
|
||||
let mut files = Vec::new();
|
||||
|
||||
for entry in fs::read_dir(bridgebuspath)? {
|
||||
files.push(entry?);
|
||||
}
|
||||
|
||||
if files.len() != 1 {
|
||||
return Err(anyhow!(
|
||||
"expected exactly one PCI bus in {:?}, got {}",
|
||||
bridgebuspath,
|
||||
files.len()
|
||||
));
|
||||
}
|
||||
|
||||
files[0]
|
||||
.file_name()
|
||||
.into_string()
|
||||
.map_err(|e| anyhow!("bad filename under {:?}: {:?}", bridgebuspath, e))
|
||||
}
|
||||
|
||||
fn infer_bus_from_child_devices(devpath: &PathBuf) -> Result<String> {
|
||||
let mut child_pci_addrs = Vec::new();
|
||||
|
||||
for entry in fs::read_dir(devpath)? {
|
||||
let entry = entry?;
|
||||
let file_type = entry.file_type()?;
|
||||
|
||||
if !file_type.is_dir() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let name = entry.file_name();
|
||||
let name = name
|
||||
.to_str()
|
||||
.ok_or_else(|| anyhow!("non-utf8 filename under {:?}: {:?}", devpath, name))?;
|
||||
|
||||
if let Some(addr) = parse_pci_bdf_name(name) {
|
||||
child_pci_addrs.push(addr);
|
||||
}
|
||||
}
|
||||
|
||||
unique_bus_from_pci_addresses(&child_pci_addrs).with_context(|| {
|
||||
format!(
|
||||
"failed to infer downstream bus from child PCI devices under {:?}",
|
||||
devpath
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
fn get_next_bus_from_bridge(devpath: &PathBuf) -> Result<String> {
|
||||
let bridgebuspath = devpath.join("pci_bus");
|
||||
|
||||
if bridgebuspath.exists() {
|
||||
return read_single_bus_from_pci_bus_dir(&bridgebuspath)
|
||||
.with_context(|| format!("failed to read downstream bus from {:?}", bridgebuspath));
|
||||
}
|
||||
|
||||
infer_bus_from_child_devices(devpath).with_context(|| {
|
||||
format!(
|
||||
"bridge {:?} has no pci_bus directory; fallback to child device scan failed",
|
||||
devpath
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
// pcipath_to_sysfs fetches the sysfs path for a PCI path, relative to
|
||||
// the sysfs path for the PCI host bridge, based on the PCI path
|
||||
// provided.
|
||||
@@ -569,6 +684,10 @@ pub fn pcipath_to_sysfs(root_bus_sysfs: &str, pcipath: &pci::Path) -> Result<Str
|
||||
let mut bus = "0000:00".to_string();
|
||||
let mut relpath = String::new();
|
||||
|
||||
if pcipath.is_empty() {
|
||||
return Err(anyhow!("empty PCI path"));
|
||||
}
|
||||
|
||||
for i in 0..pcipath.len() {
|
||||
let bdf = format!("{}:{}", bus, pcipath[i]);
|
||||
|
||||
@@ -579,26 +698,14 @@ pub fn pcipath_to_sysfs(root_bus_sysfs: &str, pcipath: &pci::Path) -> Result<Str
|
||||
break;
|
||||
}
|
||||
|
||||
// Find out the bus exposed by bridge
|
||||
let bridgebuspath = format!("{root_bus_sysfs}{relpath}/pci_bus");
|
||||
let mut files: Vec<_> = fs::read_dir(&bridgebuspath)?.collect();
|
||||
let devpath = PathBuf::from(root_bus_sysfs).join(relpath.trim_start_matches('/'));
|
||||
|
||||
match files.pop() {
|
||||
Some(busfile) if files.is_empty() => {
|
||||
bus = busfile?
|
||||
.file_name()
|
||||
.into_string()
|
||||
.map_err(|e| anyhow!("Bad filename under {}: {:?}", &bridgebuspath, e))?;
|
||||
}
|
||||
_ => {
|
||||
return Err(anyhow!(
|
||||
"Expected exactly one PCI bus in {}, got {} instead",
|
||||
bridgebuspath,
|
||||
// Adjust to original value as we've already popped
|
||||
files.len() + 1
|
||||
));
|
||||
}
|
||||
};
|
||||
bus = get_next_bus_from_bridge(&devpath).with_context(|| {
|
||||
format!(
|
||||
"failed to resolve next bus for PCI path element {} (device {}) under root {}",
|
||||
i, bdf, root_bus_sysfs
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(relpath)
|
||||
@@ -1150,6 +1257,21 @@ mod tests {
|
||||
assert_eq!(relpath.unwrap(), "/0000:00:02.0/0000:01:03.0/0000:02:04.0");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pcipath_to_sysfs_fallback_child_device_scan() {
|
||||
let testdir = tempdir().expect("failed to create tmpdir");
|
||||
let rootbuspath = testdir.path().to_str().unwrap();
|
||||
|
||||
let path23 = pci::Path::from_str("02/03").unwrap();
|
||||
let bridge2path = format!("{}{}", rootbuspath, "/0000:00:02.0");
|
||||
let child_device_path = format!("{bridge2path}/0000:01:03.0");
|
||||
|
||||
fs::create_dir_all(child_device_path).unwrap();
|
||||
|
||||
let relpath = pcipath_to_sysfs(rootbuspath, &path23);
|
||||
assert_eq!(relpath.unwrap(), "/0000:00:02.0/0000:01:03.0");
|
||||
}
|
||||
|
||||
// We use device specific variants of this for real cases, but
|
||||
// they have some complications that make them troublesome to unit
|
||||
// test
|
||||
|
||||
@@ -69,7 +69,8 @@ impl DeviceHandler for VfioPciDeviceHandler {
|
||||
|
||||
let (root_complex, pcipath) = pcipath_from_dev_tree_path(pcipath)?;
|
||||
|
||||
let guestdev = wait_for_pci_device(ctx.sandbox, root_complex, &pcipath).await?;
|
||||
let guestdev =
|
||||
wait_for_pci_device(ctx.logger, ctx.sandbox, root_complex, &pcipath).await?;
|
||||
if vfio_in_guest {
|
||||
pci_driver_override(ctx.logger, SYSFS_BUS_PCI_PATH, guestdev, "vfio-pci")?;
|
||||
|
||||
@@ -301,24 +302,62 @@ async fn associate_ap_device(apqn: &Apqn, mkvp: &str) -> Result<()> {
|
||||
Ok(apqn.set_associate_state(AssocState::Associated(secret_idx))?)
|
||||
}
|
||||
|
||||
fn pci_addr_from_sysfs_path(sysfs_abs: &Path) -> Result<pci::Address> {
|
||||
// sysfs_abs like: /sys/devices/pci0000:00/0000:00:06.0/0000:02:00.0
|
||||
let name = sysfs_abs
|
||||
.file_name()
|
||||
.ok_or_else(|| anyhow!("bad sysfs path (no file_name): {:?}", sysfs_abs))?
|
||||
.to_str()
|
||||
.ok_or_else(|| anyhow!("bad sysfs path (non-utf8): {:?}", sysfs_abs))?;
|
||||
|
||||
pci::Address::from_str(name)
|
||||
.map_err(|e| anyhow!("failed to parse pci bdf from sysfs '{}': {e}", name))
|
||||
}
|
||||
|
||||
pub async fn wait_for_pci_device(
|
||||
logger: &Logger,
|
||||
sandbox: &Arc<Mutex<Sandbox>>,
|
||||
root_complex: &str,
|
||||
pcipath: &pci::Path,
|
||||
) -> Result<pci::Address> {
|
||||
let root_bus_sysfs = format!("{}{}", SYSFS_DIR, create_pci_root_bus_path(root_complex));
|
||||
let sysfs_rel_path = pcipath_to_sysfs(&root_bus_sysfs, pcipath)?;
|
||||
info!(logger, "Xwait_for_pci_device at {}", pcipath);
|
||||
let root_bus_rel = create_pci_root_bus_path(root_complex); // "/devices/pci0000:00"
|
||||
let root_bus_sysfs = format!("{}{}", SYSFS_DIR, &root_bus_rel); // "/sys/devices/pci0000:00"
|
||||
info!(
|
||||
logger,
|
||||
"wait_for_pci_device: root_bus_sysfs {} pcipath {}", &root_bus_sysfs, pcipath
|
||||
);
|
||||
let sysfs_rel_path = pcipath_to_sysfs(&root_bus_sysfs, pcipath)?; // "/0000:00:06.0/0000:02:00.0"
|
||||
|
||||
// "/sys/devices/pci0000:00/0000:00:06.0/0000:02:00.0"
|
||||
let sysfs_abs = format!("{root_bus_sysfs}{sysfs_rel_path}");
|
||||
let sysfs_abs_path = std::path::PathBuf::from(&sysfs_abs);
|
||||
|
||||
if tokio::fs::metadata(&sysfs_abs_path).await.is_ok() {
|
||||
info!(
|
||||
logger,
|
||||
"wait_for_pci_device: PCI device {} already exists at {}", pcipath, sysfs_abs
|
||||
);
|
||||
return pci_addr_from_sysfs_path(&sysfs_abs_path);
|
||||
} else {
|
||||
info!(
|
||||
logger,
|
||||
"wait_for_pci_device: Waiting uevent for PCI device {} at {}", pcipath, sysfs_abs
|
||||
);
|
||||
}
|
||||
|
||||
let matcher = PciMatcher::new(&sysfs_rel_path, root_complex)?;
|
||||
|
||||
let uev = wait_for_uevent(sandbox, matcher).await?;
|
||||
|
||||
// uev.devpath like "/devices/pci0000:00/0000:00:06.0/0000:02:00.0"
|
||||
let addr = uev
|
||||
.devpath
|
||||
.rsplit('/')
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("Bad device path {:?} in uevent", &uev.devpath))?;
|
||||
let addr = pci::Address::from_str(addr)?;
|
||||
Ok(addr)
|
||||
|
||||
pci::Address::from_str(addr)
|
||||
}
|
||||
|
||||
// Represents an IOMMU group
|
||||
|
||||
@@ -65,7 +65,7 @@ use crate::device::block_device_handler::get_virtio_blk_pci_device_name;
|
||||
use crate::device::network_device_handler::wait_for_ccw_net_interface;
|
||||
#[cfg(not(target_arch = "s390x"))]
|
||||
use crate::device::network_device_handler::wait_for_pci_net_interface;
|
||||
use crate::device::{add_devices, handle_cdi_devices, update_env_pci};
|
||||
use crate::device::{add_devices, handle_cdi_devices, dump_nvidia_cdi_yaml, update_env_pci};
|
||||
use crate::features::get_build_features;
|
||||
use crate::metrics::get_metrics;
|
||||
use crate::mount::baremount;
|
||||
@@ -244,6 +244,8 @@ impl AgentService {
|
||||
// or other entities for a specifc device.
|
||||
// In Kata we only consider the directory "/var/run/cdi", "/etc" may be
|
||||
// readonly
|
||||
info!(sl(), "dump_nvidia_cdi_yaml at path: /var/run/cdi");
|
||||
dump_nvidia_cdi_yaml(&sl())?;
|
||||
handle_cdi_devices(&sl(), &mut oci, "/var/run/cdi", AGENT_CONFIG.cdi_timeout).await?;
|
||||
|
||||
// Handle trusted storage configuration before mounting any storage
|
||||
|
||||
@@ -10,7 +10,7 @@ use std::sync::Arc;
|
||||
|
||||
use crate::storage::{common_storage_handler, new_device, StorageContext, StorageHandler};
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use kata_types::device::{DRIVER_9P_TYPE, DRIVER_OVERLAYFS_TYPE, DRIVER_VIRTIOFS_TYPE};
|
||||
use kata_types::device::{DRIVER_OVERLAYFS_TYPE, DRIVER_VIRTIOFS_TYPE};
|
||||
use kata_types::mount::{StorageDevice, KATA_VOLUME_OVERLAYFS_CREATE_DIR};
|
||||
use protocols::agent::Storage;
|
||||
use tracing::instrument;
|
||||
@@ -69,27 +69,6 @@ impl StorageHandler for OverlayfsHandler {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Virtio9pHandler {}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl StorageHandler for Virtio9pHandler {
|
||||
#[instrument]
|
||||
fn driver_types(&self) -> &[&str] {
|
||||
&[DRIVER_9P_TYPE]
|
||||
}
|
||||
|
||||
#[instrument]
|
||||
async fn create_device(
|
||||
&self,
|
||||
storage: Storage,
|
||||
ctx: &mut StorageContext,
|
||||
) -> Result<Arc<dyn StorageDevice>> {
|
||||
let path = common_storage_handler(ctx.logger, &storage)?;
|
||||
new_device(path)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct VirtioFsHandler {}
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ use tracing::instrument;
|
||||
use self::bind_watcher_handler::BindWatcherHandler;
|
||||
use self::block_handler::{PmemHandler, ScsiHandler, VirtioBlkMmioHandler, VirtioBlkPciHandler};
|
||||
use self::ephemeral_handler::EphemeralHandler;
|
||||
use self::fs_handler::{OverlayfsHandler, Virtio9pHandler, VirtioFsHandler};
|
||||
use self::fs_handler::{OverlayfsHandler, VirtioFsHandler};
|
||||
use self::image_pull_handler::ImagePullHandler;
|
||||
use self::local_handler::LocalHandler;
|
||||
use crate::mount::{baremount, is_mounted, remove_mounts};
|
||||
@@ -134,7 +134,6 @@ lazy_static! {
|
||||
pub static ref STORAGE_HANDLERS: StorageHandlerManager<Arc<dyn StorageHandler>> = {
|
||||
let mut manager: StorageHandlerManager<Arc<dyn StorageHandler>> = StorageHandlerManager::new();
|
||||
let handlers: Vec<Arc<dyn StorageHandler>> = vec![
|
||||
Arc::new(Virtio9pHandler {}),
|
||||
Arc::new(VirtioBlkMmioHandler {}),
|
||||
Arc::new(VirtioBlkPciHandler {}),
|
||||
Arc::new(EphemeralHandler {}),
|
||||
|
||||
@@ -425,7 +425,7 @@ impl SandboxStorages {
|
||||
/// or updated file to a target mount point, or remove the removed file from the target mount point. All WatchableStorage
|
||||
/// target mount points are expected to reside within a single tmpfs, whose root is created by the BindWatcher.
|
||||
///
|
||||
/// This is a temporary workaround to handle config map updates until we get inotify on 9p/virtio-fs.
|
||||
/// This is a temporary workaround to handle config map updates until we get inotify on virtio-fs.
|
||||
/// More context on this:
|
||||
/// - https://github.com/kata-containers/runtime/issues/1505
|
||||
/// - https://github.com/kata-containers/kata-containers/issues/1879
|
||||
|
||||
@@ -706,11 +706,13 @@ impl Region {
|
||||
|
||||
// FIXME: add readonly flag into vfio_dma_map in future PR when it is needed.
|
||||
// issue #8725
|
||||
if let Err(e) = vfio_container.vfio_dma_map(
|
||||
self.start.raw_value() + self.mmaps[i].mmap_offset,
|
||||
self.mmaps[i].mmap_size,
|
||||
host_addr as u64,
|
||||
) {
|
||||
if let Err(e) = unsafe {
|
||||
vfio_container.vfio_dma_map(
|
||||
self.start.raw_value() + self.mmaps[i].mmap_offset,
|
||||
self.mmaps[i].mmap_size as usize,
|
||||
host_addr as *mut u8,
|
||||
)
|
||||
} {
|
||||
error!("vfio dma map failed, pci p2p dma may not work, due to {e:?}");
|
||||
}
|
||||
}
|
||||
@@ -744,7 +746,7 @@ impl Region {
|
||||
|
||||
if let Err(e) = vfio_container.vfio_dma_unmap(
|
||||
self.start.raw_value() + self.mmaps[i].mmap_offset,
|
||||
self.mmaps[i].mmap_size,
|
||||
self.mmaps[i].mmap_size as usize,
|
||||
) {
|
||||
error!("vfio dma unmap failed, pci p2p dma may not work, due to {e:?}");
|
||||
}
|
||||
@@ -771,7 +773,7 @@ impl Region {
|
||||
for i in 0..self.mmaps.len() {
|
||||
if let Err(e) = vfio_container.vfio_dma_unmap(
|
||||
self.start.raw_value() + self.mmaps[i].mmap_offset,
|
||||
self.mmaps[i].mmap_size,
|
||||
self.mmaps[i].mmap_size as usize,
|
||||
) {
|
||||
error!("vfio dma unmap failed, pci p2p dma may not work, due to {e:?}");
|
||||
}
|
||||
@@ -779,11 +781,13 @@ impl Region {
|
||||
self.set_user_memory_region(i, true, vm)?;
|
||||
// FIXME: add readonly flag into vfio_dma_map in future PR when it is needed.
|
||||
// issue #8725
|
||||
if let Err(e) = vfio_container.vfio_dma_map(
|
||||
self.start.raw_value() + self.mmaps[i].mmap_offset,
|
||||
self.mmaps[i].mmap_size,
|
||||
self.mmaps[i].mmap_host_addr,
|
||||
) {
|
||||
if let Err(e) = unsafe {
|
||||
vfio_container.vfio_dma_map(
|
||||
self.start.raw_value() + self.mmaps[i].mmap_offset,
|
||||
self.mmaps[i].mmap_size as usize,
|
||||
self.mmaps[i].mmap_host_addr as *mut u8,
|
||||
)
|
||||
} {
|
||||
error!("vfio dma map failed, pci p2p dma may not work, due to {e:?}");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -499,9 +499,11 @@ impl VfioDeviceMgr {
|
||||
"readonly" => readonly,
|
||||
);
|
||||
//FIXME: add readonly flag when related commit is pushed to upstream vfio-ioctls
|
||||
self.get_vfio_container()?
|
||||
.vfio_dma_map(iova, size, user_addr)
|
||||
.map_err(VfioDeviceError::VfioIoctlError)?;
|
||||
unsafe {
|
||||
self.get_vfio_container()?
|
||||
.vfio_dma_map(iova, size as usize, user_addr as *mut u8)
|
||||
}
|
||||
.map_err(VfioDeviceError::VfioIoctlError)?;
|
||||
self.locked_vm_size += size;
|
||||
Ok(())
|
||||
}
|
||||
@@ -516,7 +518,7 @@ impl VfioDeviceMgr {
|
||||
let size = region.len();
|
||||
|
||||
self.get_vfio_container()?
|
||||
.vfio_dma_unmap(gpa, size)
|
||||
.vfio_dma_unmap(gpa, size as usize)
|
||||
.map_err(VfioDeviceError::VfioIoctlError)?;
|
||||
|
||||
self.locked_vm_size -= size;
|
||||
|
||||
@@ -25,7 +25,7 @@ subprocess = "0.2.8"
|
||||
rand = "0.8.5"
|
||||
thiserror = "1.0.30"
|
||||
hex = "0.4.3"
|
||||
pci-ids = "0.2.5"
|
||||
pci-ids = "0.2.6"
|
||||
mockall = "0.13.1"
|
||||
|
||||
kata-types = { path = "../kata-types" }
|
||||
|
||||
@@ -2,7 +2,12 @@
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
#![allow(dead_code)]
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::pcilibs::pci_manager::{
|
||||
calc_next_power_of_2, PCI_BASE_ADDRESS_MEM_TYPE64, PCI_BASE_ADDRESS_MEM_TYPE_MASK,
|
||||
};
|
||||
|
||||
use super::pci_manager::{MemoryResourceTrait, PCIDevice, PCIDeviceManager, PCIDevices};
|
||||
|
||||
@@ -24,21 +29,24 @@ impl NvidiaPCIDevice {
|
||||
}
|
||||
|
||||
pub fn get_bars_max_addressable_memory(&self) -> (u64, u64) {
|
||||
let mut max_32bit = 2 * 1024 * 1024;
|
||||
let mut max_64bit = 2 * 1024 * 1024;
|
||||
let mut total_32bit = 0u64;
|
||||
let mut total_64bit = 0u64;
|
||||
|
||||
let nvgpu_devices = self.get_pci_devices(Some(self.vendor_id));
|
||||
for dev in nvgpu_devices {
|
||||
let (mem_size_32bit, mem_size_64bit) = dev.resources.get_total_addressable_memory(true);
|
||||
if max_32bit < mem_size_32bit {
|
||||
max_32bit = mem_size_32bit;
|
||||
}
|
||||
if max_64bit < mem_size_64bit {
|
||||
max_64bit = mem_size_64bit;
|
||||
}
|
||||
let (mem_size_32bit, mem_size_64bit) =
|
||||
dev.resources.get_total_addressable_memory(false);
|
||||
total_32bit += mem_size_32bit;
|
||||
total_64bit += mem_size_64bit;
|
||||
}
|
||||
|
||||
(max_32bit * 2, max_64bit)
|
||||
total_32bit = total_32bit.max(2 * 1024 * 1024);
|
||||
total_64bit = total_64bit.max(2 * 1024 * 1024);
|
||||
|
||||
(
|
||||
calc_next_power_of_2(total_32bit) * 2,
|
||||
calc_next_power_of_2(total_64bit),
|
||||
)
|
||||
}
|
||||
|
||||
fn is_vga_controller(&self, device: &PCIDevice) -> bool {
|
||||
@@ -77,6 +85,46 @@ pub fn get_bars_max_addressable_memory() -> (u64, u64) {
|
||||
(max_32bit, max_64bit)
|
||||
}
|
||||
|
||||
pub fn calc_fw_cfg_mmio64_mb(pci_addr: &str) -> u64 {
|
||||
const FALLBACK_MB: u64 = 256 * 1024; // 256GB
|
||||
|
||||
let manager = PCIDeviceManager::new("/sys/bus/pci/devices");
|
||||
let mut cache = HashMap::new();
|
||||
|
||||
let device = match manager
|
||||
.get_device_by_pci_bus_id(pci_addr, None, &mut cache)
|
||||
.ok()
|
||||
.flatten()
|
||||
{
|
||||
Some(dev) => dev,
|
||||
None => return FALLBACK_MB,
|
||||
};
|
||||
|
||||
let mem_64bit_raw: u64 = device
|
||||
.resources
|
||||
.iter()
|
||||
.filter_map(|(_, region)| {
|
||||
if region.end <= region.start {
|
||||
return None;
|
||||
}
|
||||
let flags = region.flags & PCI_BASE_ADDRESS_MEM_TYPE_MASK;
|
||||
if flags != PCI_BASE_ADDRESS_MEM_TYPE64 {
|
||||
return None;
|
||||
}
|
||||
Some(region.end - region.start + 1)
|
||||
})
|
||||
.sum();
|
||||
|
||||
if mem_64bit_raw == 0 {
|
||||
return FALLBACK_MB;
|
||||
}
|
||||
|
||||
// Perform round_up only once, then convert directly to MB
|
||||
// Bytes -> round_up -> MB (strictly aligned with pref64-reserve source)
|
||||
let rounded_bytes = calc_next_power_of_2(mem_64bit_raw);
|
||||
rounded_bytes / (1024 * 1024) // No need for a second round_up
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashMap;
|
||||
|
||||
@@ -4,3 +4,6 @@
|
||||
//
|
||||
mod devices;
|
||||
mod pci_manager;
|
||||
|
||||
pub use devices::calc_fw_cfg_mmio64_mb;
|
||||
pub use devices::get_bars_max_addressable_memory;
|
||||
|
||||
@@ -19,7 +19,7 @@ const UNKNOWN_DEVICE: &str = "UNKNOWN_DEVICE";
|
||||
const UNKNOWN_CLASS: &str = "UNKNOWN_CLASS";
|
||||
|
||||
const PCI_IOV_NUM_BAR: usize = 6;
|
||||
const PCI_BASE_ADDRESS_MEM_TYPE_MASK: u64 = 0x06;
|
||||
pub const PCI_BASE_ADDRESS_MEM_TYPE_MASK: u64 = 0x06;
|
||||
|
||||
pub(crate) const PCI_BASE_ADDRESS_MEM_TYPE32: u64 = 0x00; // 32 bit address
|
||||
pub(crate) const PCI_BASE_ADDRESS_MEM_TYPE64: u64 = 0x04; // 64 bit address
|
||||
@@ -30,7 +30,7 @@ fn address_to_id(address: &str) -> u64 {
|
||||
}
|
||||
|
||||
// Calculate the next power of 2.
|
||||
fn calc_next_power_of_2(mut n: u64) -> u64 {
|
||||
pub fn calc_next_power_of_2(mut n: u64) -> u64 {
|
||||
if n < 1 {
|
||||
return 1_u64;
|
||||
}
|
||||
@@ -67,22 +67,19 @@ impl MemoryResourceTrait for MemoryResources {
|
||||
let mut keys: Vec<_> = self.keys().cloned().collect();
|
||||
keys.sort();
|
||||
|
||||
for (num_bar, key) in keys.into_iter().enumerate() {
|
||||
if key >= PCI_IOV_NUM_BAR || num_bar == PCI_IOV_NUM_BAR {
|
||||
break;
|
||||
}
|
||||
|
||||
for key in keys.into_iter() {
|
||||
if let Some(region) = self.get(&key) {
|
||||
if region.end <= region.start {
|
||||
continue;
|
||||
}
|
||||
|
||||
let flags = region.flags & PCI_BASE_ADDRESS_MEM_TYPE_MASK;
|
||||
let mem_type_32bit = flags == PCI_BASE_ADDRESS_MEM_TYPE32;
|
||||
let mem_type_64bit = flags == PCI_BASE_ADDRESS_MEM_TYPE64;
|
||||
let mem_size = region.end - region.start + 1;
|
||||
|
||||
if mem_type_32bit {
|
||||
mem_size_32bit += mem_size;
|
||||
}
|
||||
if mem_type_64bit {
|
||||
mem_size_64bit += mem_size;
|
||||
match flags {
|
||||
PCI_BASE_ADDRESS_MEM_TYPE32 => mem_size_32bit += mem_size,
|
||||
PCI_BASE_ADDRESS_MEM_TYPE64 => mem_size_64bit += mem_size,
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -148,7 +145,7 @@ impl PCIDeviceManager {
|
||||
Ok(pci_devices)
|
||||
}
|
||||
|
||||
fn get_device_by_pci_bus_id(
|
||||
pub fn get_device_by_pci_bus_id(
|
||||
&self,
|
||||
address: &str,
|
||||
vendor: Option<u16>,
|
||||
|
||||
@@ -257,7 +257,7 @@ pub const KATA_ANNO_CFG_HYPERVISOR_ENABLE_ROOTLESS_HYPERVISOR: &str =
|
||||
"io.katacontainers.config.hypervisor.rootless";
|
||||
|
||||
// Hypervisor Shared File System related annotations
|
||||
/// A sandbox annotation to specify the shared file system type, either inline-virtio-fs (default), virtio-9p, virtio-fs or virtio-fs-nydus.
|
||||
/// A sandbox annotation to specify the shared file system type, either virtio-fs(default), inline-virtio-fs, virtio-fs-nydus or none.
|
||||
pub const KATA_ANNO_CFG_HYPERVISOR_SHARED_FS: &str =
|
||||
"io.katacontainers.config.hypervisor.shared_fs";
|
||||
/// A sandbox annotations to specify virtio-fs vhost-user daemon path.
|
||||
@@ -272,8 +272,6 @@ pub const KATA_ANNO_CFG_HYPERVISOR_VIRTIO_FS_CACHE_SIZE: &str =
|
||||
/// A sandbox annotation to pass options to virtiofsd daemon.
|
||||
pub const KATA_ANNO_CFG_HYPERVISOR_VIRTIO_FS_EXTRA_ARGS: &str =
|
||||
"io.katacontainers.config.hypervisor.virtio_fs_extra_args";
|
||||
/// A sandbox annotation to specify as the msize for 9p shares.
|
||||
pub const KATA_ANNO_CFG_HYPERVISOR_MSIZE_9P: &str = "io.katacontainers.config.hypervisor.msize_9p";
|
||||
/// The initdata annotation passed in when CVM launchs
|
||||
pub const KATA_ANNO_CFG_HYPERVISOR_INIT_DATA: &str =
|
||||
"io.katacontainers.config.hypervisor.cc_init_data";
|
||||
@@ -975,14 +973,6 @@ impl Annotation {
|
||||
hv.shared_fs.virtio_fs_extra_args.push(arg.to_string());
|
||||
}
|
||||
}
|
||||
KATA_ANNO_CFG_HYPERVISOR_MSIZE_9P => match self.get_value::<u32>(key) {
|
||||
Ok(v) => {
|
||||
hv.shared_fs.msize_9p = v.unwrap_or_default();
|
||||
}
|
||||
Err(_e) => {
|
||||
return Err(u32_err);
|
||||
}
|
||||
},
|
||||
KATA_ANNO_CFG_HYPERVISOR_BLOCK_DEV_NUM_QUEUES => {
|
||||
match self.get_value::<usize>(key) {
|
||||
Ok(v) => {
|
||||
|
||||
@@ -47,9 +47,6 @@ pub const DEFAULT_BLOCK_DEVICE_QUEUE_SIZE: u32 = 128;
|
||||
pub const DEFAULT_SHARED_FS_TYPE: &str = "virtio-fs";
|
||||
pub const DEFAULT_VIRTIO_FS_CACHE_MODE: &str = "never";
|
||||
pub const DEFAULT_VIRTIO_FS_DAX_SIZE_MB: u32 = 1024;
|
||||
pub const DEFAULT_SHARED_9PFS_SIZE_MB: u32 = 8 * 1024;
|
||||
pub const MIN_SHARED_9PFS_SIZE_MB: u32 = 4 * 1024;
|
||||
pub const MAX_SHARED_9PFS_SIZE_MB: u32 = 8 * 1024 * 1024;
|
||||
|
||||
pub const DEFAULT_GUEST_HOOK_PATH: &str = "/opt/kata/hooks";
|
||||
pub const DEFAULT_GUEST_DNS_FILE: &str = "/etc/resolv.conf";
|
||||
|
||||
@@ -346,7 +346,7 @@ mod drop_in_directory_handling {
|
||||
|
||||
let dropin_override_data = r#"
|
||||
[hypervisor.qemu]
|
||||
shared_fs = "virtio-9p"
|
||||
shared_fs = "none"
|
||||
[runtime]
|
||||
vfio_mode="vfio"
|
||||
"#;
|
||||
@@ -372,7 +372,7 @@ mod drop_in_directory_handling {
|
||||
assert_eq!(config.hypervisor["qemu"].device_info.default_bridges, 4);
|
||||
assert_eq!(
|
||||
config.hypervisor["qemu"].shared_fs.shared_fs.as_deref(),
|
||||
Some("virtio-9p")
|
||||
Some("none")
|
||||
);
|
||||
assert!(config.runtime.debug);
|
||||
assert!(config.runtime.sandbox_cgroup_only);
|
||||
|
||||
@@ -68,7 +68,6 @@ mod firecracker;
|
||||
pub use self::firecracker::{FirecrackerConfig, HYPERVISOR_NAME_FIRECRACKER};
|
||||
|
||||
const NO_VIRTIO_FS: &str = "none";
|
||||
const VIRTIO_9P: &str = "virtio-9p";
|
||||
const VIRTIO_FS: &str = "virtio-fs";
|
||||
const VIRTIO_FS_INLINE: &str = "inline-virtio-fs";
|
||||
const MAX_BRIDGE_SIZE: u32 = 5;
|
||||
@@ -1419,12 +1418,13 @@ impl SecurityInfo {
|
||||
}
|
||||
}
|
||||
|
||||
/// Configuration information for shared filesystems, such as virtio-9p and virtio-fs.
|
||||
/// Configuration information for shared filesystems, such as virtio-fs-nydus and virtio-fs.
|
||||
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
|
||||
pub struct SharedFsInfo {
|
||||
/// Type of shared file system to use:
|
||||
/// - `virtio-fs` (default)
|
||||
/// - `virtio-9p`
|
||||
/// - `inline-virtio-fs`
|
||||
/// - `virtio-fs-nydus`
|
||||
/// - `none` (disables shared filesystem)
|
||||
pub shared_fs: Option<String>,
|
||||
|
||||
@@ -1466,18 +1466,13 @@ pub struct SharedFsInfo {
|
||||
/// Enables `virtio-fs` DAX (Direct Access) window if `true`.
|
||||
#[serde(default)]
|
||||
pub virtio_fs_is_dax: bool,
|
||||
|
||||
/// This is the `msize` used for 9p shares. It represents the number of bytes
|
||||
/// used for the 9p packet payload.
|
||||
#[serde(default)]
|
||||
pub msize_9p: u32,
|
||||
}
|
||||
|
||||
impl SharedFsInfo {
|
||||
/// Adjusts the shared filesystem configuration after loading from a configuration file.
|
||||
///
|
||||
/// Handles default values for `shared_fs` type, `virtio-fs` specific settings
|
||||
/// (daemon path, cache mode, DAX), and `virtio-9p` msize.
|
||||
/// (daemon path, cache mode, DAX) or `inline-virtio-fs` settings.
|
||||
pub fn adjust_config(&mut self) -> Result<()> {
|
||||
if self.shared_fs.as_deref() == Some(NO_VIRTIO_FS) {
|
||||
self.shared_fs = None;
|
||||
@@ -1490,11 +1485,6 @@ impl SharedFsInfo {
|
||||
match self.shared_fs.as_deref() {
|
||||
Some(VIRTIO_FS) => self.adjust_virtio_fs(false)?,
|
||||
Some(VIRTIO_FS_INLINE) => self.adjust_virtio_fs(true)?,
|
||||
Some(VIRTIO_9P) => {
|
||||
if self.msize_9p == 0 {
|
||||
self.msize_9p = default::DEFAULT_SHARED_9PFS_SIZE_MB;
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
@@ -1504,23 +1494,12 @@ impl SharedFsInfo {
|
||||
/// Validates the shared filesystem configuration.
|
||||
///
|
||||
/// Checks the validity of the selected `shared_fs` type and
|
||||
/// performs specific validations for `virtio-fs` and `virtio-9p` settings.
|
||||
/// performs specific validations for `virtio-fs` and `inline-virtio-fs` settings.
|
||||
pub fn validate(&self) -> Result<()> {
|
||||
match self.shared_fs.as_deref() {
|
||||
None => Ok(()),
|
||||
Some(VIRTIO_FS) => self.validate_virtio_fs(false),
|
||||
Some(VIRTIO_FS_INLINE) => self.validate_virtio_fs(true),
|
||||
Some(VIRTIO_9P) => {
|
||||
if self.msize_9p < default::MIN_SHARED_9PFS_SIZE_MB
|
||||
|| self.msize_9p > default::MAX_SHARED_9PFS_SIZE_MB
|
||||
{
|
||||
return Err(std::io::Error::other(format!(
|
||||
"Invalid 9p configuration msize 0x{:x}, min value is 0x{:x}, max value is 0x{:x}",
|
||||
self.msize_9p,default::MIN_SHARED_9PFS_SIZE_MB, default::MAX_SHARED_9PFS_SIZE_MB
|
||||
)));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
Some(v) => Err(std::io::Error::other(format!("Invalid shared_fs type {v}"))),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -181,6 +181,26 @@ pub struct Runtime {
|
||||
/// If fd passthrough io is enabled, the runtime will attempt to use the specified port instead of the default port.
|
||||
#[serde(default = "default_passfd_listener_port")]
|
||||
pub passfd_listener_port: u32,
|
||||
|
||||
/// pod_resource_api_sock specifies the unix socket for the Kubelet's
|
||||
/// PodResource API endpoint. If empty, kubernetes based cold plug
|
||||
/// will not be attempted. In order for this feature to work, the
|
||||
/// KubeletPodResourcesGet featureGate must be enabled in Kubelet,
|
||||
/// if using Kubelet older than 1.34.
|
||||
|
||||
/// The pod resource API's socket is relative to the Kubelet's root-dir,
|
||||
/// which is defined by the cluster admin, and its location is:
|
||||
/// ${KubeletRootDir}/pod-resources/kubelet.sock
|
||||
|
||||
/// cold_plug_vfio(see hypervisor config) acts as a feature gate:
|
||||
/// cold_plug_vfio = no_port (default) => no cold plug
|
||||
/// cold_plug_vfio != no_port AND pod_resource_api_sock = "" => need
|
||||
/// explicit CDI annotation for cold plug (applies mainly
|
||||
/// to non-k8s cases)
|
||||
/// cold_plug_vfio != no_port AND pod_resource_api_sock != "" => kubelet
|
||||
/// based cold plug.
|
||||
#[serde(default)]
|
||||
pub pod_resource_api_sock: String,
|
||||
}
|
||||
|
||||
fn default_passfd_listener_port() -> u32 {
|
||||
|
||||
@@ -27,8 +27,6 @@ pub const DRIVER_VFIO_AP_TYPE: &str = "vfio-ap";
|
||||
/// DRIVER_VFIO_AP_COLD_TYPE is the device driver for vfio-ap coldplug.
|
||||
pub const DRIVER_VFIO_AP_COLD_TYPE: &str = "vfio-ap-cold";
|
||||
|
||||
/// DRIVER_9P_TYPE is the driver for 9pfs volume.
|
||||
pub const DRIVER_9P_TYPE: &str = "9p";
|
||||
/// DRIVER_EPHEMERAL_TYPE is the driver for ephemeral volume.
|
||||
pub const DRIVER_EPHEMERAL_TYPE: &str = "ephemeral";
|
||||
/// DRIVER_LOCAL_TYPE is the driver for local volume.
|
||||
|
||||
@@ -48,7 +48,6 @@ file_mem_backend = "/dev/shm"
|
||||
valid_file_mem_backends = ["/dev/shm","/dev/snd","./test_file_backend_mem_root"]
|
||||
pflashes = ["/proc/mounts"]
|
||||
enable_debug = true
|
||||
msize_9p = 16384
|
||||
disable_image_nvdimm = true
|
||||
hotplug_vfio_on_root_bus = true
|
||||
pcie_root_port = 2
|
||||
|
||||
@@ -47,7 +47,6 @@ file_mem_backend = "/dev/shm"
|
||||
valid_file_mem_backends = ["/dev/shm"]
|
||||
pflashes = ["/proc/mounts"]
|
||||
enable_debug = true
|
||||
msize_9p = 16384
|
||||
disable_image_nvdimm = true
|
||||
hotplug_vfio_on_root_bus = true
|
||||
pcie_root_port = 2
|
||||
|
||||
22
src/libs/pod-resources-rs/Cargo.toml
Normal file
22
src/libs/pod-resources-rs/Cargo.toml
Normal file
@@ -0,0 +1,22 @@
|
||||
[package]
|
||||
name = "pod-resources-rs"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0.100"
|
||||
tokio = "1.48"
|
||||
tokio-util = "0.7.17"
|
||||
tower = "0.5"
|
||||
hyper-util = { version = "0.1", features = ["tokio"] }
|
||||
# gRPC dependencies for kubelet pod-resources API
|
||||
tonic = "0.14"
|
||||
prost = "0.14"
|
||||
tonic-prost = "0.14"
|
||||
oci-spec = { version = "0.8.1", features = ["runtime"] }
|
||||
container-device-interface = "0.1.2"
|
||||
slog = "2.5.2"
|
||||
slog-scope = "4.4.0"
|
||||
|
||||
[build-dependencies]
|
||||
tonic-prost-build = "0.14"
|
||||
16
src/libs/pod-resources-rs/build.rs
Normal file
16
src/libs/pod-resources-rs/build.rs
Normal file
@@ -0,0 +1,16 @@
|
||||
// Copyright (c) 2026 Ant Group
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
/// This generates Device Plugin code (in v1beta1.rs) from pluginapi.proto
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
tonic_prost_build::configure()
|
||||
.build_server(false) // We only need the client
|
||||
.build_client(true)
|
||||
.out_dir("src/pod_resources")
|
||||
.compile_protos(&["proto/pod_resources.proto"], &["proto"])
|
||||
.expect("failed to compile protos");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
117
src/libs/pod-resources-rs/proto/pod_resources.proto
Normal file
117
src/libs/pod-resources-rs/proto/pod_resources.proto
Normal file
@@ -0,0 +1,117 @@
|
||||
// To regenerate api.pb.go run `hack/update-codegen.sh protobindings`
|
||||
syntax = "proto3";
|
||||
|
||||
package v1;
|
||||
option go_package = "k8s.io/kubelet/pkg/apis/podresources/v1";
|
||||
|
||||
|
||||
// PodResourcesLister is a service provided by the kubelet that provides information about the
|
||||
// node resources consumed by pods and containers on the node
|
||||
service PodResourcesLister {
|
||||
/// List returns the node resources assigned to pods and containers.
|
||||
rpc List(ListPodResourcesRequest) returns (ListPodResourcesResponse) {}
|
||||
/// GetAllocatableResources returns the node resources that are available for assignment to pods and containers.
|
||||
rpc GetAllocatableResources(AllocatableResourcesRequest) returns (AllocatableResourcesResponse) {}
|
||||
/// Get returns the node resources assigned to a specific pod.
|
||||
rpc Get(GetPodResourcesRequest) returns (GetPodResourcesResponse) {}
|
||||
}
|
||||
|
||||
// AllocatableResourcesRequest is the request made to the GetAllocatableResources service
|
||||
message AllocatableResourcesRequest {}
|
||||
|
||||
// AllocatableResourcesResponses contains informations about all the devices known by the kubelet
|
||||
message AllocatableResourcesResponse {
|
||||
repeated ContainerDevices devices = 1;
|
||||
repeated int64 cpu_ids = 2;
|
||||
repeated ContainerMemory memory = 3;
|
||||
}
|
||||
|
||||
// ListPodResourcesRequest is the request made to the PodResources service
|
||||
message ListPodResourcesRequest {}
|
||||
|
||||
// ListPodResourcesResponse is the response returned by List function
|
||||
message ListPodResourcesResponse {
|
||||
repeated PodResources pod_resources = 1;
|
||||
}
|
||||
|
||||
// GetPodResourcesRequest is the request made to the Get service
|
||||
message GetPodResourcesRequest {
|
||||
string pod_name = 1;
|
||||
string pod_namespace = 2;
|
||||
}
|
||||
|
||||
// GetPodResourcesResponse is the response returned by Get function
|
||||
message GetPodResourcesResponse {
|
||||
PodResources pod_resources = 1;
|
||||
}
|
||||
|
||||
// PodResources contains information about the node resources assigned to a pod
|
||||
message PodResources {
|
||||
string name = 1;
|
||||
string namespace = 2;
|
||||
repeated ContainerResources containers = 3;
|
||||
}
|
||||
|
||||
// ContainerResources contains information about the resources assigned to a container
|
||||
message ContainerResources {
|
||||
string name = 1;
|
||||
repeated ContainerDevices devices = 2;
|
||||
repeated int64 cpu_ids = 3;
|
||||
repeated ContainerMemory memory = 4;
|
||||
repeated DynamicResource dynamic_resources = 5;
|
||||
}
|
||||
|
||||
// ContainerDevices contains information about the devices assigned to a container
|
||||
message ContainerDevices {
|
||||
string resource_name = 1;
|
||||
repeated string device_ids = 2;
|
||||
TopologyInfo topology = 3;
|
||||
}
|
||||
|
||||
// ContainerMemory contains information about memory and hugepages assigned to a container
|
||||
message ContainerMemory {
|
||||
string memory_type = 1;
|
||||
uint64 size = 2;
|
||||
TopologyInfo topology = 3;
|
||||
}
|
||||
|
||||
// DynamicResource contains information about the devices assigned to a container by DRA
|
||||
message DynamicResource {
|
||||
// tombstone: removed in 1.31 because claims are no longer associated with one class
|
||||
// string class_name = 1;
|
||||
string claim_name = 2;
|
||||
string claim_namespace = 3;
|
||||
repeated ClaimResource claim_resources = 4;
|
||||
}
|
||||
|
||||
// ClaimResource contains resource information. The driver name/pool name/device name
|
||||
// triplet uniquely identifies the device. Should DRA get extended to other kinds
|
||||
// of resources, then device_name will be empty and other fields will get added.
|
||||
// Each device at the DRA API level may map to zero or more CDI devices.
|
||||
message ClaimResource {
|
||||
repeated CDIDevice cdi_devices = 1;
|
||||
string driver_name = 2;
|
||||
string pool_name = 3;
|
||||
string device_name = 4;
|
||||
}
|
||||
|
||||
// Topology describes hardware topology of the resource
|
||||
message TopologyInfo {
|
||||
repeated NUMANode nodes = 1;
|
||||
}
|
||||
|
||||
// NUMA representation of NUMA node
|
||||
message NUMANode {
|
||||
int64 ID = 1;
|
||||
}
|
||||
|
||||
// CDIDevice specifies a CDI device information
|
||||
message CDIDevice {
|
||||
// Fully qualified CDI device name
|
||||
// for example: vendor.com/gpu=gpudevice1
|
||||
// see more details in the CDI specification:
|
||||
// https://github.com/container-orchestrated-devices/container-device-interface/blob/main/SPEC.md
|
||||
string name = 1;
|
||||
}
|
||||
|
||||
|
||||
82
src/libs/pod-resources-rs/src/lib.rs
Normal file
82
src/libs/pod-resources-rs/src/lib.rs
Normal file
@@ -0,0 +1,82 @@
|
||||
// Copyright (c) 2026 Ant Group
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
pub mod pod_resources;
|
||||
|
||||
use anyhow::{Result, anyhow};
|
||||
use cdi::specs::config::DeviceNode;
|
||||
// use cdi::container_edits::DeviceNode;
|
||||
use cdi::cache::{CdiOption, new_cache, with_auto_refresh};
|
||||
use cdi::spec_dirs::with_spec_dirs;
|
||||
use container_device_interface as cdi;
|
||||
|
||||
use slog::info;
|
||||
use std::sync::Arc;
|
||||
use tokio::time;
|
||||
|
||||
/// DEFAULT_DYNAMIC_CDI_SPEC_PATH is the default directory for dynamic CDI Specs,
|
||||
/// which can be overridden by specifying a different path when creating the cache.
|
||||
const DEFAULT_DYNAMIC_CDI_SPEC_PATH: &str = "/var/run/cdi";
|
||||
/// DEFAULT_STATIC_CDI_SPEC_PATH is the default directory for static CDI Specs,
|
||||
/// which can be overridden by specifying a different path when creating the cache.
|
||||
const DEFAULT_STATIC_CDI_SPEC_PATH: &str = "/etc/cdi";
|
||||
|
||||
#[macro_export]
|
||||
macro_rules! sl {
|
||||
() => {
|
||||
slog_scope::logger()
|
||||
};
|
||||
}
|
||||
|
||||
pub async fn handle_cdi_devices(
|
||||
devices: &[String],
|
||||
_cdi_timeout: time::Duration,
|
||||
) -> Result<Vec<DeviceNode>> {
|
||||
if devices.is_empty() {
|
||||
info!(sl!(), "no pod CDI devices requested.");
|
||||
return Ok(vec![]);
|
||||
}
|
||||
// Explicitly set the cache options to disable auto-refresh and
|
||||
// to use the default spec dirs for dynamic and static CDI Specs
|
||||
let options: Vec<CdiOption> = vec![with_auto_refresh(false), with_spec_dirs(&[DEFAULT_DYNAMIC_CDI_SPEC_PATH, DEFAULT_STATIC_CDI_SPEC_PATH])];
|
||||
let cache: Arc<std::sync::Mutex<cdi::cache::Cache>> = new_cache(options);
|
||||
|
||||
let target_devices = {
|
||||
let mut target_devices = vec![];
|
||||
// Lock cache within this scope, std::sync::Mutex has no Send
|
||||
// and await will not work with time::sleep
|
||||
let mut cache = cache.lock().unwrap();
|
||||
match cache.refresh() {
|
||||
Ok(_) => {}
|
||||
Err(e) => {
|
||||
return Err(anyhow!("Refreshing cache failed: {:?}", e));
|
||||
}
|
||||
}
|
||||
|
||||
for dev in devices.iter() {
|
||||
info!(sl!(), "Requested CDI device with FQN: {}", dev);
|
||||
match cache.get_device(dev) {
|
||||
Some(device) => {
|
||||
info!(
|
||||
sl!(),
|
||||
"Target CDI device: {}",
|
||||
device.get_qualified_name()
|
||||
);
|
||||
if let Some(devnodes) = device.edits().container_edits.device_nodes {
|
||||
target_devices.extend(devnodes.iter().cloned());
|
||||
}
|
||||
}
|
||||
None => {
|
||||
return Err(anyhow!("Failed to get device node for CDI device: {} in cache", dev));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
target_devices
|
||||
};
|
||||
info!(sl!(), "target CDI devices to inject: {:?}", target_devices);
|
||||
|
||||
Ok(target_devices)
|
||||
}
|
||||
120
src/libs/pod-resources-rs/src/pod_resources/mod.rs
Normal file
120
src/libs/pod-resources-rs/src/pod_resources/mod.rs
Normal file
@@ -0,0 +1,120 @@
|
||||
// Copyright (c) 2026 Ant Group
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
pub mod v1;
|
||||
|
||||
use v1::pod_resources_lister_client::PodResourcesListerClient;
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use anyhow::{Context, Result, anyhow};
|
||||
use hyper_util::rt::TokioIo;
|
||||
use tokio::net::UnixStream;
|
||||
use tokio::time::{Duration, timeout};
|
||||
use tonic::transport::{Channel, Endpoint, Uri};
|
||||
use tower::service_fn;
|
||||
|
||||
use crate::pod_resources::v1::GetPodResourcesRequest;
|
||||
const SANDBOX_NAME_ANNOTATION: &str = "io.kubernetes.cri.sandbox-name";
|
||||
const SANDBOX_NAMESPACE_ANNOTATION: &str = "io.kubernetes.cri.sandbox-namespace";
|
||||
pub const DEFAULT_POD_RESOURCES_PATH: &str = "/var/lib/kubelet/pod-resources";
|
||||
pub const DEFAULT_POD_RESOURCES_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
pub const CDI_K8S_PREFIX: &str = "cdi.k8s.io/";
|
||||
const MAX_RECV_MSG_SIZE: usize = 16 * 1024 * 1024; // 16MB
|
||||
|
||||
// Create a gRPC channel to the specified Unix socket
|
||||
async fn create_grpc_channel(socket_path: &str) -> Result<Channel> {
|
||||
let socket_path = socket_path.trim_start_matches("unix://");
|
||||
let socket_path_owned = socket_path.to_string();
|
||||
|
||||
// Create a gRPC endpoint with a timeout
|
||||
let endpoint = Endpoint::try_from("http://[::]:50051")
|
||||
.context("failed to create endpoint")?
|
||||
.timeout(DEFAULT_POD_RESOURCES_TIMEOUT);
|
||||
|
||||
// Connect to the Unix socket using a custom connector
|
||||
let channel = endpoint
|
||||
.connect_with_connector(service_fn(move |_: Uri| {
|
||||
let socket_path = socket_path_owned.clone();
|
||||
async move {
|
||||
let stream = UnixStream::connect(&socket_path).await.map_err(|e| {
|
||||
std::io::Error::new(
|
||||
e.kind(),
|
||||
format!("failed to connect to {}: {}", socket_path, e),
|
||||
)
|
||||
})?;
|
||||
Ok::<_, std::io::Error>(TokioIo::new(stream))
|
||||
}
|
||||
}))
|
||||
.await
|
||||
.context("failed to connect to unix socket")?;
|
||||
|
||||
Ok(channel)
|
||||
}
|
||||
|
||||
pub async fn get_pod_cdi_devices(
|
||||
socket: &str,
|
||||
annotations: &HashMap<String, String>,
|
||||
) -> Result<Vec<String>> {
|
||||
let pod_name = annotations.get(SANDBOX_NAME_ANNOTATION).ok_or_else(|| {
|
||||
anyhow::anyhow!("cold plug: missing annotation {}", SANDBOX_NAME_ANNOTATION)
|
||||
})?;
|
||||
|
||||
let pod_namespace = annotations
|
||||
.get(SANDBOX_NAMESPACE_ANNOTATION)
|
||||
.ok_or_else(|| {
|
||||
anyhow::anyhow!(
|
||||
"cold plug: missing annotation {}",
|
||||
SANDBOX_NAMESPACE_ANNOTATION
|
||||
)
|
||||
})?;
|
||||
|
||||
// Create gRPC channel to kubelet pod-resources socket
|
||||
let channel = create_grpc_channel(socket)
|
||||
.await
|
||||
.context("cold plug: failed to connect to kubelet")?;
|
||||
|
||||
// Create PodResourcesLister client
|
||||
let mut client = PodResourcesListerClient::new(channel)
|
||||
.max_decoding_message_size(MAX_RECV_MSG_SIZE)
|
||||
.max_encoding_message_size(MAX_RECV_MSG_SIZE);
|
||||
|
||||
// Prepare and send GetPodResources request
|
||||
let request = tonic::Request::new(GetPodResourcesRequest {
|
||||
pod_name: pod_name.to_string(),
|
||||
pod_namespace: pod_namespace.to_string(),
|
||||
});
|
||||
|
||||
// Await response with timeout
|
||||
let response = timeout(DEFAULT_POD_RESOURCES_TIMEOUT, client.get(request))
|
||||
.await
|
||||
.context("cold plug: GetPodResources timeout")?
|
||||
.context("cold plug: GetPodResources RPC failed")?;
|
||||
|
||||
// Extract PodResources from response
|
||||
let pod_resources = response
|
||||
.into_inner()
|
||||
.pod_resources
|
||||
.ok_or_else(|| anyhow!("cold plug: PodResources is nil"))?;
|
||||
|
||||
// Format device specifications
|
||||
let format_cdi_device_ids = |resource_name: &str, device_ids: &[String]| -> Vec<String> {
|
||||
device_ids
|
||||
.iter()
|
||||
.map(|id| format!("{}={}", resource_name, id))
|
||||
.collect()
|
||||
};
|
||||
|
||||
// Collect all device specifications from all containers
|
||||
let mut devices = Vec::new();
|
||||
for container in &pod_resources.containers {
|
||||
for device in &container.devices {
|
||||
let cdi_devices = format_cdi_device_ids(&device.resource_name, &device.device_ids);
|
||||
devices.extend(cdi_devices);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(devices)
|
||||
}
|
||||
301
src/libs/pod-resources-rs/src/pod_resources/v1.rs
Normal file
301
src/libs/pod-resources-rs/src/pod_resources/v1.rs
Normal file
@@ -0,0 +1,301 @@
|
||||
// This file is @generated by prost-build.
|
||||
/// AllocatableResourcesRequest is the request made to the GetAllocatableResources service
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
|
||||
pub struct AllocatableResourcesRequest {}
|
||||
/// AllocatableResourcesResponses contains informations about all the devices known by the kubelet
|
||||
#[derive(Clone, PartialEq, ::prost::Message)]
|
||||
pub struct AllocatableResourcesResponse {
|
||||
#[prost(message, repeated, tag = "1")]
|
||||
pub devices: ::prost::alloc::vec::Vec<ContainerDevices>,
|
||||
#[prost(int64, repeated, tag = "2")]
|
||||
pub cpu_ids: ::prost::alloc::vec::Vec<i64>,
|
||||
#[prost(message, repeated, tag = "3")]
|
||||
pub memory: ::prost::alloc::vec::Vec<ContainerMemory>,
|
||||
}
|
||||
/// ListPodResourcesRequest is the request made to the PodResources service
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
|
||||
pub struct ListPodResourcesRequest {}
|
||||
/// ListPodResourcesResponse is the response returned by List function
|
||||
#[derive(Clone, PartialEq, ::prost::Message)]
|
||||
pub struct ListPodResourcesResponse {
|
||||
#[prost(message, repeated, tag = "1")]
|
||||
pub pod_resources: ::prost::alloc::vec::Vec<PodResources>,
|
||||
}
|
||||
/// GetPodResourcesRequest is the request made to the Get service
|
||||
#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
|
||||
pub struct GetPodResourcesRequest {
|
||||
#[prost(string, tag = "1")]
|
||||
pub pod_name: ::prost::alloc::string::String,
|
||||
#[prost(string, tag = "2")]
|
||||
pub pod_namespace: ::prost::alloc::string::String,
|
||||
}
|
||||
/// GetPodResourcesResponse is the response returned by Get function
|
||||
#[derive(Clone, PartialEq, ::prost::Message)]
|
||||
pub struct GetPodResourcesResponse {
|
||||
#[prost(message, optional, tag = "1")]
|
||||
pub pod_resources: ::core::option::Option<PodResources>,
|
||||
}
|
||||
/// PodResources contains information about the node resources assigned to a pod
|
||||
#[derive(Clone, PartialEq, ::prost::Message)]
|
||||
pub struct PodResources {
|
||||
#[prost(string, tag = "1")]
|
||||
pub name: ::prost::alloc::string::String,
|
||||
#[prost(string, tag = "2")]
|
||||
pub namespace: ::prost::alloc::string::String,
|
||||
#[prost(message, repeated, tag = "3")]
|
||||
pub containers: ::prost::alloc::vec::Vec<ContainerResources>,
|
||||
}
|
||||
/// ContainerResources contains information about the resources assigned to a container
|
||||
#[derive(Clone, PartialEq, ::prost::Message)]
|
||||
pub struct ContainerResources {
|
||||
#[prost(string, tag = "1")]
|
||||
pub name: ::prost::alloc::string::String,
|
||||
#[prost(message, repeated, tag = "2")]
|
||||
pub devices: ::prost::alloc::vec::Vec<ContainerDevices>,
|
||||
#[prost(int64, repeated, tag = "3")]
|
||||
pub cpu_ids: ::prost::alloc::vec::Vec<i64>,
|
||||
#[prost(message, repeated, tag = "4")]
|
||||
pub memory: ::prost::alloc::vec::Vec<ContainerMemory>,
|
||||
#[prost(message, repeated, tag = "5")]
|
||||
pub dynamic_resources: ::prost::alloc::vec::Vec<DynamicResource>,
|
||||
}
|
||||
/// ContainerDevices contains information about the devices assigned to a container
|
||||
#[derive(Clone, PartialEq, ::prost::Message)]
|
||||
pub struct ContainerDevices {
|
||||
#[prost(string, tag = "1")]
|
||||
pub resource_name: ::prost::alloc::string::String,
|
||||
#[prost(string, repeated, tag = "2")]
|
||||
pub device_ids: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
|
||||
#[prost(message, optional, tag = "3")]
|
||||
pub topology: ::core::option::Option<TopologyInfo>,
|
||||
}
|
||||
/// ContainerMemory contains information about memory and hugepages assigned to a container
|
||||
#[derive(Clone, PartialEq, ::prost::Message)]
|
||||
pub struct ContainerMemory {
|
||||
#[prost(string, tag = "1")]
|
||||
pub memory_type: ::prost::alloc::string::String,
|
||||
#[prost(uint64, tag = "2")]
|
||||
pub size: u64,
|
||||
#[prost(message, optional, tag = "3")]
|
||||
pub topology: ::core::option::Option<TopologyInfo>,
|
||||
}
|
||||
/// DynamicResource contains information about the devices assigned to a container by DRA
|
||||
#[derive(Clone, PartialEq, ::prost::Message)]
|
||||
pub struct DynamicResource {
|
||||
/// tombstone: removed in 1.31 because claims are no longer associated with one class
|
||||
/// string class_name = 1;
|
||||
#[prost(string, tag = "2")]
|
||||
pub claim_name: ::prost::alloc::string::String,
|
||||
#[prost(string, tag = "3")]
|
||||
pub claim_namespace: ::prost::alloc::string::String,
|
||||
#[prost(message, repeated, tag = "4")]
|
||||
pub claim_resources: ::prost::alloc::vec::Vec<ClaimResource>,
|
||||
}
|
||||
/// ClaimResource contains resource information. The driver name/pool name/device name
|
||||
/// triplet uniquely identifies the device. Should DRA get extended to other kinds
|
||||
/// of resources, then device_name will be empty and other fields will get added.
|
||||
/// Each device at the DRA API level may map to zero or more CDI devices.
|
||||
#[derive(Clone, PartialEq, ::prost::Message)]
|
||||
pub struct ClaimResource {
|
||||
#[prost(message, repeated, tag = "1")]
|
||||
pub cdi_devices: ::prost::alloc::vec::Vec<CdiDevice>,
|
||||
#[prost(string, tag = "2")]
|
||||
pub driver_name: ::prost::alloc::string::String,
|
||||
#[prost(string, tag = "3")]
|
||||
pub pool_name: ::prost::alloc::string::String,
|
||||
#[prost(string, tag = "4")]
|
||||
pub device_name: ::prost::alloc::string::String,
|
||||
}
|
||||
/// Topology describes hardware topology of the resource
|
||||
#[derive(Clone, PartialEq, ::prost::Message)]
|
||||
pub struct TopologyInfo {
|
||||
#[prost(message, repeated, tag = "1")]
|
||||
pub nodes: ::prost::alloc::vec::Vec<NumaNode>,
|
||||
}
|
||||
/// NUMA representation of NUMA node
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
|
||||
pub struct NumaNode {
|
||||
#[prost(int64, tag = "1")]
|
||||
pub id: i64,
|
||||
}
|
||||
/// CDIDevice specifies a CDI device information
|
||||
#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
|
||||
pub struct CdiDevice {
|
||||
/// Fully qualified CDI device name
|
||||
/// for example: vendor.com/gpu=gpudevice1
|
||||
/// see more details in the CDI specification:
|
||||
/// <https://github.com/container-orchestrated-devices/container-device-interface/blob/main/SPEC.md>
|
||||
#[prost(string, tag = "1")]
|
||||
pub name: ::prost::alloc::string::String,
|
||||
}
|
||||
/// Generated client implementations.
|
||||
pub mod pod_resources_lister_client {
|
||||
#![allow(
|
||||
unused_variables,
|
||||
dead_code,
|
||||
missing_docs,
|
||||
clippy::wildcard_imports,
|
||||
clippy::let_unit_value,
|
||||
)]
|
||||
use tonic::codegen::*;
|
||||
use tonic::codegen::http::Uri;
|
||||
/// PodResourcesLister is a service provided by the kubelet that provides information about the
|
||||
/// node resources consumed by pods and containers on the node
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PodResourcesListerClient<T> {
|
||||
inner: tonic::client::Grpc<T>,
|
||||
}
|
||||
impl PodResourcesListerClient<tonic::transport::Channel> {
|
||||
/// Attempt to create a new client by connecting to a given endpoint.
|
||||
pub async fn connect<D>(dst: D) -> Result<Self, tonic::transport::Error>
|
||||
where
|
||||
D: TryInto<tonic::transport::Endpoint>,
|
||||
D::Error: Into<StdError>,
|
||||
{
|
||||
let conn = tonic::transport::Endpoint::new(dst)?.connect().await?;
|
||||
Ok(Self::new(conn))
|
||||
}
|
||||
}
|
||||
impl<T> PodResourcesListerClient<T>
|
||||
where
|
||||
T: tonic::client::GrpcService<tonic::body::Body>,
|
||||
T::Error: Into<StdError>,
|
||||
T::ResponseBody: Body<Data = Bytes> + std::marker::Send + 'static,
|
||||
<T::ResponseBody as Body>::Error: Into<StdError> + std::marker::Send,
|
||||
{
|
||||
pub fn new(inner: T) -> Self {
|
||||
let inner = tonic::client::Grpc::new(inner);
|
||||
Self { inner }
|
||||
}
|
||||
pub fn with_origin(inner: T, origin: Uri) -> Self {
|
||||
let inner = tonic::client::Grpc::with_origin(inner, origin);
|
||||
Self { inner }
|
||||
}
|
||||
pub fn with_interceptor<F>(
|
||||
inner: T,
|
||||
interceptor: F,
|
||||
) -> PodResourcesListerClient<InterceptedService<T, F>>
|
||||
where
|
||||
F: tonic::service::Interceptor,
|
||||
T::ResponseBody: Default,
|
||||
T: tonic::codegen::Service<
|
||||
http::Request<tonic::body::Body>,
|
||||
Response = http::Response<
|
||||
<T as tonic::client::GrpcService<tonic::body::Body>>::ResponseBody,
|
||||
>,
|
||||
>,
|
||||
<T as tonic::codegen::Service<
|
||||
http::Request<tonic::body::Body>,
|
||||
>>::Error: Into<StdError> + std::marker::Send + std::marker::Sync,
|
||||
{
|
||||
PodResourcesListerClient::new(InterceptedService::new(inner, interceptor))
|
||||
}
|
||||
/// Compress requests with the given encoding.
|
||||
///
|
||||
/// This requires the server to support it otherwise it might respond with an
|
||||
/// error.
|
||||
#[must_use]
|
||||
pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self {
|
||||
self.inner = self.inner.send_compressed(encoding);
|
||||
self
|
||||
}
|
||||
/// Enable decompressing responses.
|
||||
#[must_use]
|
||||
pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self {
|
||||
self.inner = self.inner.accept_compressed(encoding);
|
||||
self
|
||||
}
|
||||
/// Limits the maximum size of a decoded message.
|
||||
///
|
||||
/// Default: `4MB`
|
||||
#[must_use]
|
||||
pub fn max_decoding_message_size(mut self, limit: usize) -> Self {
|
||||
self.inner = self.inner.max_decoding_message_size(limit);
|
||||
self
|
||||
}
|
||||
/// Limits the maximum size of an encoded message.
|
||||
///
|
||||
/// Default: `usize::MAX`
|
||||
#[must_use]
|
||||
pub fn max_encoding_message_size(mut self, limit: usize) -> Self {
|
||||
self.inner = self.inner.max_encoding_message_size(limit);
|
||||
self
|
||||
}
|
||||
/// / List returns the node resources assigned to pods and containers.
|
||||
pub async fn list(
|
||||
&mut self,
|
||||
request: impl tonic::IntoRequest<super::ListPodResourcesRequest>,
|
||||
) -> std::result::Result<
|
||||
tonic::Response<super::ListPodResourcesResponse>,
|
||||
tonic::Status,
|
||||
> {
|
||||
self.inner
|
||||
.ready()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
tonic::Status::unknown(
|
||||
format!("Service was not ready: {}", e.into()),
|
||||
)
|
||||
})?;
|
||||
let codec = tonic_prost::ProstCodec::default();
|
||||
let path = http::uri::PathAndQuery::from_static(
|
||||
"/v1.PodResourcesLister/List",
|
||||
);
|
||||
let mut req = request.into_request();
|
||||
req.extensions_mut()
|
||||
.insert(GrpcMethod::new("v1.PodResourcesLister", "List"));
|
||||
self.inner.unary(req, path, codec).await
|
||||
}
|
||||
/// / GetAllocatableResources returns the node resources that are available for assignment to pods and containers.
|
||||
pub async fn get_allocatable_resources(
|
||||
&mut self,
|
||||
request: impl tonic::IntoRequest<super::AllocatableResourcesRequest>,
|
||||
) -> std::result::Result<
|
||||
tonic::Response<super::AllocatableResourcesResponse>,
|
||||
tonic::Status,
|
||||
> {
|
||||
self.inner
|
||||
.ready()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
tonic::Status::unknown(
|
||||
format!("Service was not ready: {}", e.into()),
|
||||
)
|
||||
})?;
|
||||
let codec = tonic_prost::ProstCodec::default();
|
||||
let path = http::uri::PathAndQuery::from_static(
|
||||
"/v1.PodResourcesLister/GetAllocatableResources",
|
||||
);
|
||||
let mut req = request.into_request();
|
||||
req.extensions_mut()
|
||||
.insert(
|
||||
GrpcMethod::new("v1.PodResourcesLister", "GetAllocatableResources"),
|
||||
);
|
||||
self.inner.unary(req, path, codec).await
|
||||
}
|
||||
/// / Get returns the node resources assigned to a specific pod.
|
||||
pub async fn get(
|
||||
&mut self,
|
||||
request: impl tonic::IntoRequest<super::GetPodResourcesRequest>,
|
||||
) -> std::result::Result<
|
||||
tonic::Response<super::GetPodResourcesResponse>,
|
||||
tonic::Status,
|
||||
> {
|
||||
self.inner
|
||||
.ready()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
tonic::Status::unknown(
|
||||
format!("Service was not ready: {}", e.into()),
|
||||
)
|
||||
})?;
|
||||
let codec = tonic_prost::ProstCodec::default();
|
||||
let path = http::uri::PathAndQuery::from_static(
|
||||
"/v1.PodResourcesLister/Get",
|
||||
);
|
||||
let mut req = request.into_request();
|
||||
req.extensions_mut().insert(GrpcMethod::new("v1.PodResourcesLister", "Get"));
|
||||
self.inner.unary(req, path, codec).await
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -493,7 +493,7 @@ message SharedMount {
|
||||
// could have been defined through the Mount list of the OCI specification.
|
||||
message Storage {
|
||||
// Driver is used to define the way the storage is passed through the
|
||||
// virtual machine. It can be "9p", "blk", or something else, but for
|
||||
// virtual machine. It can be "blk", or something else, but for
|
||||
// all cases, this will define if some extra steps are required before
|
||||
// this storage gets mounted into the container.
|
||||
string driver = 1;
|
||||
@@ -509,7 +509,7 @@ message Storage {
|
||||
string source = 3;
|
||||
// Fstype represents the filesystem that needs to be used to mount the
|
||||
// storage inside the VM. For instance, it could be "xfs" for block
|
||||
// device, "9p" for shared filesystem, or "tmpfs" for shared /dev/shm.
|
||||
// device, or "tmpfs" for shared /dev/shm.
|
||||
string fstype = 4;
|
||||
// Options describes the additional options that might be needed to
|
||||
// mount properly the storage filesystem.
|
||||
|
||||
@@ -29,3 +29,4 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
shim = { path = "crates/shim" }
|
||||
common = { workspace = true }
|
||||
runtimes = { workspace = true }
|
||||
pod-resources-rs = { workspace = true }
|
||||
|
||||
@@ -26,23 +26,14 @@ ARCH_DIR = arch
|
||||
ARCH_FILE_SUFFIX = -options.mk
|
||||
ARCH_FILE = $(ARCH_DIR)/$(ARCH)$(ARCH_FILE_SUFFIX)
|
||||
|
||||
ifeq ($(ARCH), s390x)
|
||||
UNSUPPORTED_ARCHS := s390x powerpc64le riscv64gc
|
||||
|
||||
ifeq ($(filter $(ARCH), $(UNSUPPORTED_ARCHS)),$(ARCH))
|
||||
default: runtime show-header
|
||||
test:
|
||||
@echo "s390x is not currently supported"
|
||||
@echo "$(ARCH) is not currently supported"
|
||||
exit 0
|
||||
install: install-runtime install-configs
|
||||
else ifeq ($(ARCH), powerpc64le)
|
||||
default: runtime show-header
|
||||
test:
|
||||
@echo "powerpc64le is not currently supported"
|
||||
exit 0
|
||||
install: install-runtime install-configs
|
||||
else ifeq ($(ARCH), riscv64gc)
|
||||
default: runtime show-header
|
||||
test:
|
||||
@echo "RISC-V 64 is not currently supported"
|
||||
exit 0
|
||||
else
|
||||
##TARGET default: build code
|
||||
default: runtime show-header
|
||||
@@ -133,6 +124,9 @@ FCVALIDJAILERPATHS = [\"$(FCJAILERPATH)\"]
|
||||
PKGLIBEXECDIR := $(LIBEXECDIR)/$(PROJECT_DIR)
|
||||
|
||||
# EDK2 firmware names per architecture
|
||||
ifeq ($(ARCH), x86_64)
|
||||
EDK2_NAME := ovmf
|
||||
endif
|
||||
ifeq ($(ARCH), aarch64)
|
||||
EDK2_NAME := aavmf
|
||||
endif
|
||||
@@ -145,6 +139,15 @@ ifneq (,$(QEMUCMD))
|
||||
endif
|
||||
endif
|
||||
|
||||
# Firmware path for qemu-nvidia-gpu (OVMF / AAVMF); parity with src/runtime/Makefile FIRMWAREPATH_NV
|
||||
FIRMWAREPATH_NV :=
|
||||
ifeq ($(ARCH), x86_64)
|
||||
FIRMWAREPATH_NV := $(PREFIXDEPS)/share/$(EDK2_NAME)/OVMF.fd
|
||||
endif
|
||||
ifeq ($(ARCH), aarch64)
|
||||
FIRMWAREPATH_NV := $(PREFIXDEPS)/share/$(EDK2_NAME)/AAVMF_CODE.fd
|
||||
endif
|
||||
|
||||
KERNELVERITYPARAMS ?= ""
|
||||
|
||||
# TDX
|
||||
@@ -365,10 +368,80 @@ ifneq (,$(QEMUCMD))
|
||||
|
||||
CONFIGS += $(CONFIG_QEMU_COCO_DEV)
|
||||
|
||||
CONFIG_FILE_QEMU_NVIDIA_GPU = configuration-qemu-nvidia-gpu-runtime-rs.toml
|
||||
CONFIG_QEMU_NVIDIA_GPU = config/$(CONFIG_FILE_QEMU_NVIDIA_GPU)
|
||||
CONFIG_QEMU_NVIDIA_GPU_IN = $(CONFIG_QEMU_NVIDIA_GPU).in
|
||||
|
||||
CONFIG_PATH_QEMU_NVIDIA_GPU = $(abspath $(CONFDIR)/$(CONFIG_FILE_QEMU_NVIDIA_GPU))
|
||||
CONFIG_PATHS += $(CONFIG_PATH_QEMU_NVIDIA_GPU)
|
||||
|
||||
SYSCONFIG_QEMU_NVIDIA_GPU = $(abspath $(SYSCONFDIR)/$(CONFIG_FILE_QEMU_NVIDIA_GPU))
|
||||
SYSCONFIG_PATHS += $(SYSCONFIG_QEMU_NVIDIA_GPU)
|
||||
|
||||
CONFIGS += $(CONFIG_QEMU_NVIDIA_GPU)
|
||||
|
||||
CONFIG_FILE_QEMU_NVIDIA_GPU_SNP = configuration-qemu-nvidia-gpu-snp-runtime-rs.toml
|
||||
CONFIG_QEMU_NVIDIA_GPU_SNP = config/$(CONFIG_FILE_QEMU_NVIDIA_GPU_SNP)
|
||||
CONFIG_QEMU_NVIDIA_GPU_SNP_IN = $(CONFIG_QEMU_NVIDIA_GPU_SNP).in
|
||||
|
||||
CONFIG_PATH_QEMU_NVIDIA_GPU_SNP = $(abspath $(CONFDIR)/$(CONFIG_FILE_QEMU_NVIDIA_GPU_SNP))
|
||||
CONFIG_PATHS += $(CONFIG_PATH_QEMU_NVIDIA_GPU_SNP)
|
||||
|
||||
SYSCONFIG_QEMU_NVIDIA_GPU_SNP = $(abspath $(SYSCONFDIR)/$(CONFIG_FILE_QEMU_NVIDIA_GPU_SNP))
|
||||
SYSCONFIG_PATHS += $(SYSCONFIG_QEMU_NVIDIA_GPU_SNP)
|
||||
|
||||
CONFIGS += $(CONFIG_QEMU_NVIDIA_GPU_SNP)
|
||||
|
||||
CONFIG_FILE_QEMU_NVIDIA_GPU_TDX = configuration-qemu-nvidia-gpu-tdx-runtime-rs.toml
|
||||
CONFIG_QEMU_NVIDIA_GPU_TDX = config/$(CONFIG_FILE_QEMU_NVIDIA_GPU_TDX)
|
||||
CONFIG_QEMU_NVIDIA_GPU_TDX_IN = $(CONFIG_QEMU_NVIDIA_GPU_TDX).in
|
||||
|
||||
CONFIG_PATH_QEMU_NVIDIA_GPU_TDX = $(abspath $(CONFDIR)/$(CONFIG_FILE_QEMU_NVIDIA_GPU_TDX))
|
||||
CONFIG_PATHS += $(CONFIG_PATH_QEMU_NVIDIA_GPU_TDX)
|
||||
|
||||
SYSCONFIG_QEMU_NVIDIA_GPU_TDX = $(abspath $(SYSCONFDIR)/$(CONFIG_FILE_QEMU_NVIDIA_GPU_TDX))
|
||||
SYSCONFIG_PATHS += $(SYSCONFIG_QEMU_NVIDIA_GPU_TDX)
|
||||
|
||||
CONFIGS += $(CONFIG_QEMU_NVIDIA_GPU_TDX)
|
||||
|
||||
KERNELTYPE_QEMU = uncompressed
|
||||
KERNEL_NAME_QEMU = $(call MAKE_KERNEL_NAME,$(KERNELTYPE_QEMU))
|
||||
KERNELPATH_QEMU = $(KERNELDIR)/$(KERNEL_NAME_QEMU)
|
||||
|
||||
# NVIDIA GPU reference stack (parity with Go configuration-qemu-nvidia-gpu.toml.in)
|
||||
KERNELTYPE_NV = compressed
|
||||
KERNELNAME_NV = $(call MAKE_KERNEL_NAME_NV,$(KERNELTYPE_NV))
|
||||
KERNELPATH_NV = $(KERNELDIR)/$(KERNELNAME_NV)
|
||||
IMAGENAME_NV = $(PROJECT_TAG)-nvidia-gpu.img
|
||||
IMAGEPATH_NV = $(PKGDATADIR)/$(IMAGENAME_NV)
|
||||
KERNELPARAMS_NV := cgroup_no_v1=all pci=realloc pci=nocrs pci=assign-busses
|
||||
KERNELVERITYPARAMS_NV ?=
|
||||
DEFAULTVCPUS_NV := 1
|
||||
DEFAULTMEMORY_NV := 8192
|
||||
DEFAULTTIMEOUT_NV := 1200
|
||||
DEFAULTPCIEROOTPORT_NV := 8
|
||||
DEFDISABLEIMAGENVDIMM_NV := true
|
||||
DEFSANDBOXCGROUPONLY_NV := true
|
||||
DEFSTATICRESOURCEMGMT_NV := true
|
||||
DEFVFIOMODE_NV := guest-kernel
|
||||
DEFKUBELETROOTDIR := /var/lib/kubelet
|
||||
DEFPODRESOURCEAPISOCK_NV := "$(DEFKUBELETROOTDIR)/pod-resources/kubelet.sock"
|
||||
# NVIDIA profile: rootfs block driver (avoid virtio-pmem/DAX for this stack)
|
||||
VMROOTFSDRIVER_NV := virtio-blk-pci
|
||||
# Agent client: hybrid vsock retry delay (ms); vsock path uses this as initial backoff (clamped in agent)
|
||||
DEFDIALTIMEOUTMS_NV := 1000
|
||||
# Go qemu-nvidia-gpu dial_timeout is 1200s; runtime-rs uses ms for reconnect_timeout_ms
|
||||
DEFRECONNECTTIMEOUTMS_NV := 1200000
|
||||
|
||||
# NVIDIA GPU confidential (SNP) reference stack
|
||||
IMAGENAME_CONFIDENTIAL_NV = $(PROJECT_TAG)-nvidia-gpu-confidential.img
|
||||
IMAGEPATH_CONFIDENTIAL_NV = $(PKGDATADIR)/$(IMAGENAME_CONFIDENTIAL_NV)
|
||||
KERNELNAME_CONFIDENTIAL_NV = $(call MAKE_KERNEL_NAME_NV,compressed)
|
||||
KERNELPATH_CONFIDENTIAL_NV = $(KERNELDIR)/$(KERNELNAME_CONFIDENTIAL_NV)
|
||||
KERNELVERITYPARAMS_CONFIDENTIAL_NV ?=
|
||||
FIRMWARESNPPATH_NV := $(FIRMWARE_SNP_PATH)
|
||||
FIRMWARETDVFPATH_NV := $(FIRMWARETDVFPATH)
|
||||
|
||||
KERNEL_NAME_QEMU_SE = kata-containers-se.img
|
||||
KERNELPATH_QEMU_SE = $(KERNELDIR)/$(KERNEL_NAME_QEMU_SE)
|
||||
|
||||
@@ -475,6 +548,7 @@ USER_VARS += CONFIG_QEMU_IN
|
||||
USER_VARS += CONFIG_QEMU_SE_IN
|
||||
USER_VARS += CONFIG_REMOTE_IN
|
||||
USER_VARS += CONFIG_QEMU_COCO_DEV_IN
|
||||
USER_VARS += CONFIG_QEMU_NVIDIA_GPU_IN
|
||||
USER_VARS += DESTDIR
|
||||
USER_VARS += HYPERVISOR
|
||||
USER_VARS += USE_BUILTIN_DB
|
||||
@@ -500,8 +574,10 @@ USER_VARS += FCVALIDJAILERPATHS
|
||||
USER_VARS += DEFMAXMEMSZ_FC
|
||||
USER_VARS += SYSCONFIG
|
||||
USER_VARS += IMAGENAME
|
||||
USER_VARS += IMAGENAME_NV
|
||||
USER_VARS += IMAGECONFIDENTIALNAME
|
||||
USER_VARS += IMAGEPATH
|
||||
USER_VARS += IMAGEPATH_NV
|
||||
USER_VARS += IMAGECONFIDENTIALPATH
|
||||
USER_VARS += INITRDNAME
|
||||
USER_VARS += INITRDCONFIDENTIALNAME
|
||||
@@ -516,12 +592,15 @@ USER_VARS += KERNELDIR
|
||||
USER_VARS += KERNELTYPE
|
||||
USER_VARS += KERNELPATH_DB
|
||||
USER_VARS += KERNELPATH_QEMU
|
||||
USER_VARS += KERNELPATH_NV
|
||||
USER_VARS += KERNELPATH_QEMU_SE
|
||||
USER_VARS += KERNELPATH_FC
|
||||
USER_VARS += KERNELPATH_COCO
|
||||
USER_VARS += KERNELPATH
|
||||
USER_VARS += KERNELVIRTIOFSPATH
|
||||
USER_VARS += FIRMWAREPATH
|
||||
USER_VARS += FIRMWAREPATH_NV
|
||||
USER_VARS += FIRMWAREVOLUMEPATH
|
||||
USER_VARS += MACHINEACCELERATORS
|
||||
USER_VARS += CPUFEATURES
|
||||
USER_VARS += DEFMACHINETYPE_CLH
|
||||
@@ -626,6 +705,31 @@ USER_VARS += FIRMWARE_SNP_PATH
|
||||
USER_VARS += KERNELTDXPARAMS
|
||||
USER_VARS += DEFSHAREDFS_QEMU_TDX_VIRTIOFS
|
||||
USER_VARS += FIRMWARETDVFPATH
|
||||
USER_VARS += DEFPODRESOURCEAPISOCK
|
||||
USER_VARS += KERNELPARAMS_NV
|
||||
USER_VARS += KERNELVERITYPARAMS_NV
|
||||
USER_VARS += DEFAULTVCPUS_NV
|
||||
USER_VARS += DEFAULTMEMORY_NV
|
||||
USER_VARS += DEFAULTTIMEOUT_NV
|
||||
USER_VARS += DEFAULTPCIEROOTPORT_NV
|
||||
USER_VARS += DEFDISABLEIMAGENVDIMM_NV
|
||||
USER_VARS += DEFSANDBOXCGROUPONLY_NV
|
||||
USER_VARS += DEFSTATICRESOURCEMGMT_NV
|
||||
USER_VARS += DEFVFIOMODE_NV
|
||||
USER_VARS += DEFKUBELETROOTDIR
|
||||
USER_VARS += DEFPODRESOURCEAPISOCK_NV
|
||||
USER_VARS += VMROOTFSDRIVER_NV
|
||||
USER_VARS += DEFDIALTIMEOUTMS_NV
|
||||
USER_VARS += DEFRECONNECTTIMEOUTMS_NV
|
||||
USER_VARS += IMAGENAME_CONFIDENTIAL_NV
|
||||
USER_VARS += IMAGEPATH_CONFIDENTIAL_NV
|
||||
USER_VARS += KERNELNAME_CONFIDENTIAL_NV
|
||||
USER_VARS += KERNELPATH_CONFIDENTIAL_NV
|
||||
USER_VARS += KERNELVERITYPARAMS_CONFIDENTIAL_NV
|
||||
USER_VARS += FIRMWARESNPPATH_NV
|
||||
USER_VARS += FIRMWARETDVFPATH_NV
|
||||
USER_VARS += CONFIG_QEMU_NVIDIA_GPU_SNP_IN
|
||||
USER_VARS += CONFIG_QEMU_NVIDIA_GPU_TDX_IN
|
||||
|
||||
SOURCES := \
|
||||
$(shell find . 2>&1 | grep -E '.*\.rs$$') \
|
||||
@@ -665,6 +769,9 @@ GENERATED_VARS = \
|
||||
CONFIG_FC_IN \
|
||||
CONFIG_QEMU_TDX_IN \
|
||||
CONFIG_QEMU_SNP_IN \
|
||||
CONFIG_QEMU_NVIDIA_GPU_IN \
|
||||
CONFIG_QEMU_NVIDIA_GPU_SNP_IN \
|
||||
CONFIG_QEMU_NVIDIA_GPU_TDX_IN \
|
||||
$(USER_VARS)
|
||||
|
||||
|
||||
@@ -716,6 +823,10 @@ define MAKE_KERNEL_NAME
|
||||
$(if $(findstring uncompressed,$1),vmlinux.container,vmlinuz.container)
|
||||
endef
|
||||
|
||||
define MAKE_KERNEL_NAME_NV
|
||||
$(if $(findstring uncompressed,$1),vmlinux-nvidia-gpu.container,vmlinuz-nvidia-gpu.container)
|
||||
endef
|
||||
|
||||
.DEFAULT_GOAL := default
|
||||
|
||||
GENERATED_FILES += $(CONFIGS)
|
||||
|
||||
@@ -174,7 +174,6 @@ guest_hook_path = ""
|
||||
# Shared file system type:
|
||||
# - inline-virtio-fs (default)
|
||||
# - virtio-fs
|
||||
# - virtio-9p
|
||||
# - virtio-fs-nydus
|
||||
# "inline-virtio-fs" is the same as "virtio-fs", but it is running in the same process
|
||||
# of shim, does not need an external virtiofsd process.
|
||||
|
||||
@@ -179,7 +179,6 @@ disable_block_device_use = @DEFDISABLEBLOCK@
|
||||
|
||||
# Shared file system type:
|
||||
# - virtio-fs (default)
|
||||
# - virtio-9p
|
||||
# - virtio-fs-nydus
|
||||
# - none
|
||||
shared_fs = "@DEFSHAREDFS_QEMU_COCO_DEV_VIRTIOFS@"
|
||||
|
||||
@@ -0,0 +1,825 @@
|
||||
# Copyright (c) 2017-2019 Intel Corporation
|
||||
# Copyright (c) 2021 Adobe Inc.
|
||||
# Copyright (c) 2024 IBM Corp.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
|
||||
# XXX: WARNING: this file is auto-generated.
|
||||
# XXX:
|
||||
# XXX: Source file: "@CONFIG_QEMU_NVIDIA_GPU_IN@"
|
||||
# XXX: Project:
|
||||
# XXX: Name: @PROJECT_NAME@
|
||||
# XXX: Type: @PROJECT_TYPE@
|
||||
|
||||
[hypervisor.qemu]
|
||||
path = "@QEMUPATH@"
|
||||
kernel = "@KERNELPATH_NV@"
|
||||
image = "@IMAGEPATH_NV@"
|
||||
machine_type = "@MACHINETYPE@"
|
||||
|
||||
# rootfs filesystem type:
|
||||
# - ext4 (default)
|
||||
# - xfs
|
||||
# - erofs
|
||||
rootfs_type = @DEFROOTFSTYPE@
|
||||
|
||||
# Block storage driver to be used for the VM rootfs is backed
|
||||
# by a block device. This is virtio-blk-pci, virtio-blk-mmio or nvdimm
|
||||
# (default @VMROOTFSDRIVER_NV@ via Makefile VMROOTFSDRIVER_NV)
|
||||
vm_rootfs_driver = "@VMROOTFSDRIVER_NV@"
|
||||
|
||||
# Enable running QEMU VMM as a non-root user.
|
||||
# By default QEMU VMM run as root. When this is set to true, QEMU VMM process runs as
|
||||
# a non-root random user. See documentation for the limitations of this mode.
|
||||
rootless = false
|
||||
|
||||
# List of valid annotation names for the hypervisor
|
||||
# Each member of the list is a regular expression, which is the base name
|
||||
# of the annotation, e.g. "path" for io.katacontainers.config.hypervisor.path"
|
||||
enable_annotations = @DEFENABLEANNOTATIONS@
|
||||
|
||||
# List of valid annotations values for the hypervisor
|
||||
# Each member of the list is a path pattern as described by glob(3).
|
||||
# The default if not set is empty (all annotations rejected.)
|
||||
# Your distribution recommends: @QEMUVALIDHYPERVISORPATHS@
|
||||
valid_hypervisor_paths = @QEMUVALIDHYPERVISORPATHS@
|
||||
|
||||
# Optional space-separated list of options to pass to the guest kernel.
|
||||
# For example, use `kernel_params = "vsyscall=emulate"` if you are having
|
||||
# trouble running pre-2.15 glibc.
|
||||
#
|
||||
# WARNING: - any parameter specified here will take priority over the default
|
||||
# parameter value of the same name used to start the virtual machine.
|
||||
# Do not set values here unless you understand the impact of doing so as you
|
||||
# may stop the virtual machine from booting.
|
||||
# To see the list of default parameters, enable hypervisor debug, create a
|
||||
# container and look for 'default-kernel-parameters' log entries.
|
||||
kernel_params = "@KERNELPARAMS_NV@"
|
||||
|
||||
# Optional dm-verity parameters (comma-separated key=value list):
|
||||
# root_hash=...,salt=...,data_blocks=...,data_block_size=...,hash_block_size=...
|
||||
# These are used by the runtime to assemble dm-verity kernel params.
|
||||
kernel_verity_params = "@KERNELVERITYPARAMS_NV@"
|
||||
|
||||
# Path to the firmware.
|
||||
# If you want that qemu uses the default firmware leave this option empty
|
||||
firmware = "@FIRMWAREPATH@"
|
||||
|
||||
# Path to the firmware volume.
|
||||
# firmware TDVF or OVMF can be split into FIRMWARE_VARS.fd (UEFI variables
|
||||
# as configuration) and FIRMWARE_CODE.fd (UEFI program image). UEFI variables
|
||||
# can be customized per each user while UEFI code is kept same.
|
||||
firmware_volume = "@FIRMWAREVOLUMEPATH@"
|
||||
|
||||
# Machine accelerators
|
||||
# comma-separated list of machine accelerators to pass to the hypervisor.
|
||||
# For example, `machine_accelerators = "nosmm,nosmbus,nosata,nopit,static-prt,nofw"`
|
||||
machine_accelerators = "@MACHINEACCELERATORS@"
|
||||
|
||||
# Qemu seccomp sandbox feature
|
||||
# comma-separated list of seccomp sandbox features to control the syscall access.
|
||||
# For example, `seccompsandbox= "on,obsolete=deny,spawn=deny,resourcecontrol=deny"`
|
||||
# Note: "elevateprivileges=deny" doesn't work with daemonize option, so it's removed from the seccomp sandbox
|
||||
# Another note: enabling this feature may reduce performance, you may enable
|
||||
# /proc/sys/net/core/bpf_jit_enable to reduce the impact. see https://man7.org/linux/man-pages/man8/bpfc.8.html
|
||||
# Recommended value when enabling: "on,obsolete=deny,spawn=deny,resourcecontrol=deny"
|
||||
seccomp_sandbox = "@DEFSECCOMPSANDBOXPARAM@"
|
||||
|
||||
# CPU features
|
||||
# comma-separated list of cpu features to pass to the cpu
|
||||
# For example, `cpu_features = "pmu=off,vmx=off"
|
||||
cpu_features = "@CPUFEATURES@"
|
||||
|
||||
# Default number of vCPUs per SB/VM:
|
||||
# unspecified or 0 --> will be set to @DEFVCPUS@
|
||||
# < 0 --> will be set to the actual number of physical cores
|
||||
# > 0 <= number of physical cores --> will be set to the specified number
|
||||
# > number of physical cores --> will be set to the actual number of physical cores
|
||||
default_vcpus = @DEFAULTVCPUS_NV@
|
||||
|
||||
# Default maximum number of vCPUs per SB/VM:
|
||||
# unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number
|
||||
# of vCPUs supported by KVM if that number is exceeded
|
||||
# > 0 <= number of physical cores --> will be set to the specified number
|
||||
# > number of physical cores --> will be set to the actual number of physical cores or to the maximum number
|
||||
# of vCPUs supported by KVM if that number is exceeded
|
||||
# WARNING: Depending of the architecture, the maximum number of vCPUs supported by KVM is used when
|
||||
# the actual number of physical cores is greater than it.
|
||||
# WARNING: Be aware that this value impacts the virtual machine's memory footprint and CPU
|
||||
# the hotplug functionality. For example, `default_maxvcpus = 240` specifies that until 240 vCPUs
|
||||
# can be added to a SB/VM, but the memory footprint will be big. Another example, with
|
||||
# `default_maxvcpus = 8` the memory footprint will be small, but 8 will be the maximum number of
|
||||
# vCPUs supported by the SB/VM. In general, we recommend that you do not edit this variable,
|
||||
# unless you know what are you doing.
|
||||
# NOTICE: on arm platform with gicv2 interrupt controller, set it to 8.
|
||||
default_maxvcpus = @DEFMAXVCPUS_QEMU@
|
||||
|
||||
# Bridges can be used to hot plug devices.
|
||||
# Limitations:
|
||||
# * Currently only pci bridges are supported
|
||||
# * Until 30 devices per bridge can be hot plugged.
|
||||
# * Until 5 PCI bridges can be cold plugged per VM.
|
||||
# This limitation could be a bug in qemu or in the kernel
|
||||
# Default number of bridges per SB/VM:
|
||||
# unspecified or 0 --> will be set to @DEFBRIDGES@
|
||||
# > 1 <= 5 --> will be set to the specified number
|
||||
# > 5 --> will be set to 5
|
||||
default_bridges = @DEFBRIDGES@
|
||||
|
||||
# Reclaim guest freed memory.
|
||||
# Enabling this will result in the VM balloon device having f_reporting=on set.
|
||||
# Then the hypervisor will use it to reclaim guest freed memory.
|
||||
# This is useful for reducing the amount of memory used by a VM.
|
||||
# Enabling this feature may sometimes reduce the speed of memory access in
|
||||
# the VM.
|
||||
#
|
||||
# Default false
|
||||
reclaim_guest_freed_memory = false
|
||||
|
||||
# Default memory size in MiB for SB/VM.
|
||||
# If unspecified then it will be set @DEFMEMSZ@ MiB.
|
||||
default_memory = @DEFAULTMEMORY_NV@
|
||||
#
|
||||
# Default memory slots per SB/VM.
|
||||
# If unspecified then it will be set @DEFMEMSLOTS@.
|
||||
# This is will determine the times that memory will be hotadded to sandbox/VM.
|
||||
memory_slots = @DEFMEMSLOTS@
|
||||
|
||||
# Default maximum memory in MiB per SB / VM
|
||||
# unspecified or == 0 --> will be set to the actual amount of physical RAM
|
||||
# > 0 <= amount of physical RAM --> will be set to the specified number
|
||||
# > amount of physical RAM --> will be set to the actual amount of physical RAM
|
||||
default_maxmemory = @DEFMAXMEMSZ@
|
||||
|
||||
# The size in MiB will be plused to max memory of hypervisor.
|
||||
# It is the memory address space for the NVDIMM devie.
|
||||
# If set block storage driver (block_device_driver) to "nvdimm",
|
||||
# should set memory_offset to the size of block device.
|
||||
# Default 0
|
||||
memory_offset = 0
|
||||
|
||||
# Specifies virtio-mem will be enabled or not.
|
||||
# Please note that this option should be used with the command
|
||||
# "echo 1 > /proc/sys/vm/overcommit_memory".
|
||||
# Default false
|
||||
enable_virtio_mem = false
|
||||
|
||||
# Disable block device from being used for a container's rootfs.
|
||||
# In case of a storage driver like devicemapper where a container's
|
||||
# root file system is backed by a block device, the block device is passed
|
||||
# directly to the hypervisor for performance reasons.
|
||||
# This flag prevents the block device from being passed to the hypervisor,
|
||||
# virtio-fs is used instead to pass the rootfs.
|
||||
disable_block_device_use = @DEFDISABLEBLOCK@
|
||||
|
||||
# Shared file system type:
|
||||
# - virtio-fs (default)
|
||||
# - virtio-fs-nydus
|
||||
# - none
|
||||
shared_fs = "@DEFSHAREDFS_QEMU_VIRTIOFS@"
|
||||
|
||||
# Path to vhost-user-fs daemon.
|
||||
virtio_fs_daemon = "@DEFVIRTIOFSDAEMON@"
|
||||
|
||||
# List of valid annotations values for the virtiofs daemon
|
||||
# The default if not set is empty (all annotations rejected.)
|
||||
# Your distribution recommends: @DEFVALIDVIRTIOFSDAEMONPATHS@
|
||||
valid_virtio_fs_daemon_paths = @DEFVALIDVIRTIOFSDAEMONPATHS@
|
||||
|
||||
# Default size of DAX cache in MiB
|
||||
virtio_fs_cache_size = @DEFVIRTIOFSCACHESIZE@
|
||||
|
||||
# Default size of virtqueues
|
||||
virtio_fs_queue_size = @DEFVIRTIOFSQUEUESIZE@
|
||||
|
||||
# Extra args for virtiofsd daemon
|
||||
#
|
||||
# Format example:
|
||||
# ["--arg1=xxx", "--arg2=yyy"]
|
||||
# Examples:
|
||||
# Set virtiofsd log level to debug : ["--log-level=debug"]
|
||||
#
|
||||
# see `virtiofsd -h` for possible options.
|
||||
virtio_fs_extra_args = @DEFVIRTIOFSEXTRAARGS@
|
||||
|
||||
# Cache mode:
|
||||
#
|
||||
# - never
|
||||
# Metadata, data, and pathname lookup are not cached in guest. They are
|
||||
# always fetched from host and any changes are immediately pushed to host.
|
||||
#
|
||||
# - auto
|
||||
# Metadata and pathname lookup cache expires after a configured amount of
|
||||
# time (default is 1 second). Data is cached while the file is open (close
|
||||
# to open consistency).
|
||||
#
|
||||
# - always
|
||||
# Metadata, data, and pathname lookup are cached in guest and never expire.
|
||||
virtio_fs_cache = "@DEFVIRTIOFSCACHE@"
|
||||
|
||||
# Block device driver to be used by the hypervisor when a container's
|
||||
# storage is backed by a block device or a file. This driver facilitates attaching
|
||||
# the storage directly to the guest VM.
|
||||
#
|
||||
# Examples include:
|
||||
# - virtio-blk-pci
|
||||
# - virtio-blk-ccw
|
||||
# - virtio-scsi
|
||||
# - nvidmm
|
||||
block_device_driver = "@DEFBLOCKSTORAGEDRIVER_QEMU@"
|
||||
|
||||
# aio is the I/O mechanism used by qemu
|
||||
# Options:
|
||||
#
|
||||
# - threads
|
||||
# Pthread based disk I/O.
|
||||
#
|
||||
# - native
|
||||
# Native Linux I/O.
|
||||
#
|
||||
# - io_uring
|
||||
# Linux io_uring API. This provides the fastest I/O operations on Linux, requires kernel>5.1 and
|
||||
# qemu >=5.0.
|
||||
block_device_aio = "@DEFBLOCKDEVICEAIO_QEMU@"
|
||||
|
||||
# Specifies cache-related options will be set to block devices or not.
|
||||
# Default false
|
||||
block_device_cache_set = false
|
||||
|
||||
# Specifies cache-related options for block devices.
|
||||
# Denotes whether use of O_DIRECT (bypass the host page cache) is enabled.
|
||||
# Default false
|
||||
block_device_cache_direct = false
|
||||
|
||||
# Specifies cache-related options for block devices.
|
||||
# Denotes whether flush requests for the device are ignored.
|
||||
# Default false
|
||||
block_device_cache_noflush = false
|
||||
|
||||
# Enable iothreads (data-plane) to be used. This causes IO to be
|
||||
# handled in a separate IO thread. This is currently only implemented
|
||||
# for SCSI.
|
||||
#
|
||||
enable_iothreads = @DEFENABLEIOTHREADS@
|
||||
|
||||
# Virtio queue size. Size: byte. default 128
|
||||
queue_size = 128
|
||||
|
||||
# Block device multi-queue, default 1
|
||||
num_queues = 1
|
||||
|
||||
# Enable pre allocation of VM RAM, default false
|
||||
# Enabling this will result in lower container density
|
||||
# as all of the memory will be allocated and locked
|
||||
# This is useful when you want to reserve all the memory
|
||||
# upfront or in the cases where you want memory latencies
|
||||
# to be very predictable
|
||||
# Default false
|
||||
enable_mem_prealloc = false
|
||||
|
||||
# Enable huge pages for VM RAM, default false
|
||||
# Enabling this will result in the VM memory
|
||||
# being allocated using huge pages.
|
||||
# This is useful when you want to use vhost-user network
|
||||
# stacks within the container. This will automatically
|
||||
# result in memory pre allocation
|
||||
enable_hugepages = false
|
||||
|
||||
# Enable vhost-user storage device, default false
|
||||
# Enabling this will result in some Linux reserved block type
|
||||
# major range 240-254 being chosen to represent vhost-user devices.
|
||||
enable_vhost_user_store = @DEFENABLEVHOSTUSERSTORE@
|
||||
|
||||
# The base directory specifically used for vhost-user devices.
|
||||
# Its sub-path "block" is used for block devices; "block/sockets" is
|
||||
# where we expect vhost-user sockets to live; "block/devices" is where
|
||||
# simulated block device nodes for vhost-user devices to live.
|
||||
vhost_user_store_path = "@DEFVHOSTUSERSTOREPATH@"
|
||||
|
||||
# Enable vIOMMU, default false
|
||||
# Enabling this will result in the VM having a vIOMMU device
|
||||
# This will also add the following options to the kernel's
|
||||
# command line: intel_iommu=on,iommu=pt
|
||||
enable_iommu = false
|
||||
|
||||
# Enable IOMMU_PLATFORM, default false
|
||||
# Enabling this will result in the VM device having iommu_platform=on set
|
||||
enable_iommu_platform = false
|
||||
|
||||
# List of valid annotations values for the vhost user store path
|
||||
# The default if not set is empty (all annotations rejected.)
|
||||
# Your distribution recommends: @DEFVALIDVHOSTUSERSTOREPATHS@
|
||||
valid_vhost_user_store_paths = @DEFVALIDVHOSTUSERSTOREPATHS@
|
||||
|
||||
# The timeout for reconnecting on non-server spdk sockets when the remote end goes away.
|
||||
# qemu will delay this many seconds and then attempt to reconnect.
|
||||
# Zero disables reconnecting, and the default is zero.
|
||||
vhost_user_reconnect_timeout_sec = 0
|
||||
|
||||
# Enable file based guest memory support. The default is an empty string which
|
||||
# will disable this feature. In the case of virtio-fs, this is enabled
|
||||
# automatically and '/dev/shm' is used as the backing folder.
|
||||
# This option will be ignored if VM templating is enabled.
|
||||
file_mem_backend = "@DEFFILEMEMBACKEND@"
|
||||
|
||||
# List of valid annotations values for the file_mem_backend annotation
|
||||
# The default if not set is empty (all annotations rejected.)
|
||||
# Your distribution recommends: @DEFVALIDFILEMEMBACKENDS@
|
||||
valid_file_mem_backends = @DEFVALIDFILEMEMBACKENDS@
|
||||
|
||||
# -pflash can add image file to VM. The arguments of it should be in format
|
||||
# of ["/path/to/flash0.img", "/path/to/flash1.img"]
|
||||
pflashes = []
|
||||
|
||||
# This option changes the default hypervisor and kernel parameters
|
||||
# to enable debug output where available.
|
||||
#
|
||||
# Default false
|
||||
enable_debug = false
|
||||
|
||||
# This option allows to add an extra HMP or QMP socket when `enable_debug = true`
|
||||
#
|
||||
# WARNING: Anyone with access to the extra socket can take full control of
|
||||
# Qemu. This is for debugging purpose only and must *NEVER* be used in
|
||||
# production.
|
||||
#
|
||||
# Valid values are :
|
||||
# - "hmp"
|
||||
# - "qmp"
|
||||
# - "qmp-pretty" (same as "qmp" with pretty json formatting)
|
||||
#
|
||||
# If set to the empty string "", no extra monitor socket is added. This is
|
||||
# the default.
|
||||
extra_monitor_socket = ""
|
||||
|
||||
# Disable the customizations done in the runtime when it detects
|
||||
# that it is running on top a VMM. This will result in the runtime
|
||||
# behaving as it would when running on bare metal.
|
||||
#
|
||||
disable_nesting_checks = true
|
||||
|
||||
# If false and nvdimm is supported, use nvdimm device to plug guest image.
|
||||
# Otherwise virtio-block device is used.
|
||||
#
|
||||
# nvdimm is not supported when `confidential_guest = true`.
|
||||
#
|
||||
# Default is false
|
||||
disable_image_nvdimm = @DEFDISABLEIMAGENVDIMM_NV@
|
||||
|
||||
# VFIO devices are hotplugged on a bridge by default.
|
||||
# Enable hotplugging on root bus. This may be required for devices with
|
||||
# a large PCI bar, as this is a current limitation with hotplugging on
|
||||
# a bridge.
|
||||
# Default false
|
||||
hotplug_vfio_on_root_bus = false
|
||||
|
||||
# Enable hot-plugging of VFIO devices to a bridge-port,
|
||||
# root-port or switch-port.
|
||||
# The default setting is "no-port"
|
||||
hot_plug_vfio = "no-port"
|
||||
|
||||
# In a confidential compute environment hot-plugging can compromise
|
||||
# security.
|
||||
# Enable cold-plugging of VFIO devices to a bridge-port,
|
||||
# root-port or switch-port.
|
||||
# The default setting is "no-port", which means disabled.
|
||||
cold_plug_vfio = "root-port"
|
||||
|
||||
# Before hot plugging a PCIe device, you need to add a pcie_root_port device.
|
||||
# Use this parameter when using some large PCI bar devices, such as Nvidia GPU
|
||||
# The value means the number of pcie_root_port
|
||||
# This value is valid when hotplug_vfio_on_root_bus is true and machine_type is "q35"
|
||||
# Default 0
|
||||
pcie_root_port = @DEFAULTPCIEROOTPORT_NV@
|
||||
|
||||
# Before hot plugging a PCIe device onto a switch port, you need add a pcie_switch_port device fist.
|
||||
# Use this parameter when using some large PCI bar devices, such as Nvidia GPU
|
||||
# The value means how many devices attached onto pcie_switch_port will be created.
|
||||
# This value is valid when hotplug_vfio_on_root_bus is true, and machine_type is "q35"
|
||||
# Default 0
|
||||
pcie_switch_port = 0
|
||||
|
||||
# If vhost-net backend for virtio-net is not desired, set to true. Default is false, which trades off
|
||||
# security (vhost-net runs ring0) for network I/O performance.
|
||||
disable_vhost_net = false
|
||||
|
||||
#
|
||||
# Default entropy source.
|
||||
# The path to a host source of entropy (including a real hardware RNG)
|
||||
# /dev/urandom and /dev/random are two main options.
|
||||
# Be aware that /dev/random is a blocking source of entropy. If the host
|
||||
# runs out of entropy, the VMs boot time will increase leading to get startup
|
||||
# timeouts.
|
||||
# The source of entropy /dev/urandom is non-blocking and provides a
|
||||
# generally acceptable source of entropy. It should work well for pretty much
|
||||
# all practical purposes.
|
||||
entropy_source = "@DEFENTROPYSOURCE@"
|
||||
|
||||
# List of valid annotations values for entropy_source
|
||||
# The default if not set is empty (all annotations rejected.)
|
||||
# Your distribution recommends: @DEFVALIDENTROPYSOURCES@
|
||||
valid_entropy_sources = @DEFVALIDENTROPYSOURCES@
|
||||
|
||||
# Path to OCI hook binaries in the *guest rootfs*.
|
||||
# This does not affect host-side hooks which must instead be added to
|
||||
# the OCI spec passed to the runtime.
|
||||
#
|
||||
# You can create a rootfs with hooks by customizing the osbuilder scripts:
|
||||
# https://github.com/kata-containers/kata-containers/tree/main/tools/osbuilder
|
||||
#
|
||||
# Hooks must be stored in a subdirectory of guest_hook_path according to their
|
||||
# hook type, i.e. "guest_hook_path/{prestart,poststart,poststop}".
|
||||
# The agent will scan these directories for executable files and add them, in
|
||||
# lexicographical order, to the lifecycle of the guest container.
|
||||
# Hooks are executed in the runtime namespace of the guest. See the official documentation:
|
||||
# https://github.com/opencontainers/runtime-spec/blob/v1.0.1/config.md#posix-platform-hooks
|
||||
# Warnings will be logged if any error is encountered while scanning for hooks,
|
||||
# but it will not abort container execution.
|
||||
# Recommended value when enabling: "/usr/share/oci/hooks"
|
||||
guest_hook_path = ""
|
||||
|
||||
# Enable connection to Quote Generation Service (QGS)
|
||||
# The "tdx_quote_generation_service_socket_port" parameter configures how QEMU connects to the TDX Quote Generation Service (QGS).
|
||||
# This connection is essential for Trusted Domain (TD) attestation, as QGS signs the TDREPORT sent by QEMU via the GetQuote hypercall.
|
||||
# By default QGS runs on vsock port 4050, but can be modified by the host admin. For QEMU's tdx-guest object, this connection needs to
|
||||
# be specified in a JSON format, for example:
|
||||
# -object '{"qom-type":"tdx-guest","id":"tdx","quote-generation-socket":{"type":"vsock","cid":"2","port":"4050"}}'
|
||||
# It's important to note that setting "tdx_quote_generation_service_socket_port" to 0 enables communication via Unix Domain Sockets (UDS).
|
||||
# To activate UDS, the QGS service itself must be launched with the "-port=0" parameter and the UDS will always be located at /var/run/tdx-qgs/qgs.socket.
|
||||
# -object '{"qom-type":"tdx-guest","id":"tdx","quote-generation-socket":{"type":"unix","path":"/var/run/tdx-qgs/qgs.socket"}}'
|
||||
tdx_quote_generation_service_socket_port = @QEMUTDXQUOTEGENERATIONSERVICESOCKETPORT@
|
||||
|
||||
#
|
||||
# Use rx Rate Limiter to control network I/O inbound bandwidth(size in bits/sec for SB/VM).
|
||||
# In Qemu, we use classful qdiscs HTB(Hierarchy Token Bucket) to discipline traffic.
|
||||
# Default 0-sized value means unlimited rate.
|
||||
rx_rate_limiter_max_rate = 0
|
||||
# Use tx Rate Limiter to control network I/O outbound bandwidth(size in bits/sec for SB/VM).
|
||||
# In Qemu, we use classful qdiscs HTB(Hierarchy Token Bucket) and ifb(Intermediate Functional Block)
|
||||
# to discipline traffic.
|
||||
# Default 0-sized value means unlimited rate.
|
||||
tx_rate_limiter_max_rate = 0
|
||||
# network_queues configures the number of virtio-net queue pairs (RX/TX) exposed to the guest.
|
||||
# Setting network_queues = N creates N RX queues and N TX queues (i.e., N queue pairs).
|
||||
# More queues can improve network throughput and reduce per-queue contention by allowing packet processing to scale
|
||||
# across multiple vCPUs/threads (subject to host/guest capabilities and backend configuration such as vhost-net).
|
||||
# Increasing this value consumes more resources (e.g., virtqueue state, interrupts/MSI-X vectors, backend threads),
|
||||
# so it should typically not exceed the number of vCPUs or the practical parallelism of the networking backend.
|
||||
# Default: 1, Range: 1..=256
|
||||
network_queues = @DEFNETQUEUES@
|
||||
|
||||
# Set where to save the guest memory dump file.
|
||||
# If set, when GUEST_PANICKED event occurred,
|
||||
# guest memeory will be dumped to host filesystem under guest_memory_dump_path,
|
||||
# This directory will be created automatically if it does not exist.
|
||||
#
|
||||
# The dumped file(also called vmcore) can be processed with crash or gdb.
|
||||
#
|
||||
# WARNING:
|
||||
# Dump guest's memory can take very long depending on the amount of guest memory
|
||||
# and use much disk space.
|
||||
# Recommended value when enabling: "/var/crash/kata"
|
||||
guest_memory_dump_path = ""
|
||||
|
||||
# If enable paging.
|
||||
# Basically, if you want to use "gdb" rather than "crash",
|
||||
# or need the guest-virtual addresses in the ELF vmcore,
|
||||
# then you should enable paging.
|
||||
#
|
||||
# See: https://www.qemu.org/docs/master/qemu-qmp-ref.html#Dump-guest-memory for details
|
||||
guest_memory_dump_paging = false
|
||||
|
||||
# use legacy serial for guest console if available and implemented for architecture. Default false
|
||||
use_legacy_serial = false
|
||||
|
||||
# disable applying SELinux on the VMM process (default false)
|
||||
disable_selinux = @DEFDISABLESELINUX@
|
||||
|
||||
# disable applying SELinux on the container process
|
||||
# If set to false, the type `container_t` is applied to the container process by default.
|
||||
# Note: To enable guest SELinux, the guest rootfs must be CentOS that is created and built
|
||||
# with `SELINUX=yes`.
|
||||
# (default: true)
|
||||
disable_guest_selinux = @DEFDISABLEGUESTSELINUX@
|
||||
|
||||
|
||||
[hypervisor.qemu.factory]
|
||||
# VM templating support. Once enabled, new VMs are created from template
|
||||
# using vm cloning. They will share the same initial kernel, initramfs and
|
||||
# agent memory by mapping it readonly. It helps speeding up new container
|
||||
# creation and saves a lot of memory if there are many kata containers running
|
||||
# on the same host.
|
||||
#
|
||||
# When disabled, new VMs are created from scratch.
|
||||
#
|
||||
# Note: Requires "initrd=" to be set ("image=" is not supported).
|
||||
#
|
||||
# Default false
|
||||
enable_template = false
|
||||
|
||||
# Specifies the path of template.
|
||||
#
|
||||
# Default "/run/vc/vm/template"
|
||||
template_path = "/run/vc/vm/template"
|
||||
|
||||
[agent.@PROJECT_TYPE@]
|
||||
# If enabled, make the agent display debug-level messages.
|
||||
# (default: disabled)
|
||||
enable_debug = false
|
||||
|
||||
# Enable agent tracing.
|
||||
#
|
||||
# If enabled, the agent will generate OpenTelemetry trace spans.
|
||||
#
|
||||
# Notes:
|
||||
#
|
||||
# - If the runtime also has tracing enabled, the agent spans will be
|
||||
# associated with the appropriate runtime parent span.
|
||||
# - If enabled, the runtime will wait for the container to shutdown,
|
||||
# increasing the container shutdown time slightly.
|
||||
#
|
||||
# (default: disabled)
|
||||
enable_tracing = false
|
||||
|
||||
# Comma separated list of kernel modules and their parameters.
|
||||
# These modules will be loaded in the guest kernel using modprobe(8).
|
||||
# The following example can be used to load two kernel modules with parameters
|
||||
# - kernel_modules=["e1000e InterruptThrottleRate=3000,3000,3000 EEE=1", "i915 enable_ppgtt=0"]
|
||||
# The first word is considered as the module name and the rest as its parameters.
|
||||
# Container will not be started when:
|
||||
# * A kernel module is specified and the modprobe command is not installed in the guest
|
||||
# or it fails loading the module.
|
||||
# * The module is not available in the guest or it doesn't meet the guest kernel
|
||||
# requirements, like architecture and version.
|
||||
#
|
||||
kernel_modules = []
|
||||
|
||||
# Enable debug console.
|
||||
|
||||
# If enabled, users can connect to the guest OS running inside the hypervisor
|
||||
# through "kata-runtime exec <sandbox-id>" command
|
||||
|
||||
debug_console_enabled = false
|
||||
|
||||
# Agent dial backoff in milliseconds (retry delay for hybrid vsock / remote;
|
||||
# vsock uses this as initial backoff, clamped by the agent implementation).
|
||||
# Makefile DEFDIALTIMEOUTMS_NV; total patience is reconnect_timeout_ms.
|
||||
dial_timeout_ms = @DEFDIALTIMEOUTMS_NV@
|
||||
|
||||
# Total time budget (ms) for the agent client to connect (vsock deadline).
|
||||
# Makefile DEFRECONNECTTIMEOUTMS_NV (default 1200000 ms = Go qemu-nvidia-gpu dial_timeout 1200 s).
|
||||
# For hybrid vsock, retry count is reconnect_timeout_ms / dial_timeout_ms.
|
||||
reconnect_timeout_ms = @DEFRECONNECTTIMEOUTMS_NV@
|
||||
|
||||
# Create Container Request Timeout
|
||||
# This timeout value is used to set the maximum duration for the agent to process a CreateContainerRequest.
|
||||
# It's also used to ensure that workloads, especially those involving large image pulls within the guest,
|
||||
# have sufficient time to complete.
|
||||
#
|
||||
# Effective Timeout Determination:
|
||||
# The effective timeout for a CreateContainerRequest is determined by taking the minimum of the following two values:
|
||||
# - create_container_timeout: The timeout value configured for creating containers (default: 30 seconds).
|
||||
# - runtime-request-timeout: The timeout value specified in the Kubelet configuration described as the link below:
|
||||
# (https://kubernetes.io/docs/reference/command-line-tools-reference/kubelet/#:~:text=runtime%2Drequest%2Dtimeout)
|
||||
# Defaults to @DEFAULTTIMEOUT_NV@ second(s)
|
||||
create_container_timeout = @DEFAULTTIMEOUT_NV@
|
||||
|
||||
[agent.@PROJECT_TYPE@.mem_agent]
|
||||
# Control the mem-agent function enable or disable.
|
||||
# Default to false
|
||||
mem_agent_enable = false
|
||||
|
||||
# Control the mem-agent memcg function disable or enable
|
||||
# Default to false
|
||||
memcg_disable = false
|
||||
|
||||
# Control the mem-agent function swap enable or disable.
|
||||
# Default to false
|
||||
memcg_swap = false
|
||||
|
||||
# Control the mem-agent function swappiness max number.
|
||||
# Default to 50
|
||||
memcg_swappiness_max = 50
|
||||
|
||||
# Control the mem-agent memcg function wait period seconds
|
||||
# Default to 600
|
||||
memcg_period_secs = 600
|
||||
|
||||
# Control the mem-agent memcg wait period PSI percent limit.
|
||||
# If the percentage of memory and IO PSI stall time within
|
||||
# the memcg waiting period for a cgroup exceeds this value,
|
||||
# then the aging and eviction for this cgroup will not be
|
||||
# executed after this waiting period.
|
||||
# Default to 1
|
||||
memcg_period_psi_percent_limit = 1
|
||||
|
||||
# Control the mem-agent memcg eviction PSI percent limit.
|
||||
# If the percentage of memory and IO PSI stall time for a cgroup
|
||||
# exceeds this value during an eviction cycle, the eviction for
|
||||
# this cgroup will immediately stop and will not resume until
|
||||
# the next memcg waiting period.
|
||||
# Default to 1
|
||||
memcg_eviction_psi_percent_limit = 1
|
||||
|
||||
# Control the mem-agent memcg eviction run aging count min.
|
||||
# A cgroup will only perform eviction when the number of aging cycles
|
||||
# in memcg is greater than or equal to memcg_eviction_run_aging_count_min.
|
||||
# Default to 3
|
||||
memcg_eviction_run_aging_count_min = 3
|
||||
|
||||
# Control the mem-agent compact function disable or enable
|
||||
# Default to false
|
||||
compact_disable = false
|
||||
|
||||
# Control the mem-agent compaction function wait period seconds
|
||||
# Default to 600
|
||||
compact_period_secs = 600
|
||||
|
||||
# Control the mem-agent compaction function wait period PSI percent limit.
|
||||
# If the percentage of memory and IO PSI stall time within
|
||||
# the compaction waiting period exceeds this value,
|
||||
# then the compaction will not be executed after this waiting period.
|
||||
# Default to 1
|
||||
compact_period_psi_percent_limit = 1
|
||||
|
||||
# Control the mem-agent compaction function compact PSI percent limit.
|
||||
# During compaction, the percentage of memory and IO PSI stall time
|
||||
# is checked every second. If this percentage exceeds
|
||||
# compact_psi_percent_limit, the compaction process will stop.
|
||||
# Default to 5
|
||||
compact_psi_percent_limit = 5
|
||||
|
||||
# Control the maximum number of seconds for each compaction of mem-agent compact function.
|
||||
# Default to 300
|
||||
compact_sec_max = 300
|
||||
|
||||
# Control the mem-agent compaction function compact order.
|
||||
# compact_order is used together with compact_threshold.
|
||||
# Default to 9
|
||||
compact_order = 9
|
||||
|
||||
# Control the mem-agent compaction function compact threshold.
|
||||
# compact_threshold is the pages number.
|
||||
# When examining the /proc/pagetypeinfo, if there's an increase in the
|
||||
# number of movable pages of orders smaller than the compact_order
|
||||
# compared to the amount following the previous compaction,
|
||||
# and this increase surpasses a certain threshold—specifically,
|
||||
# more than 'compact_threshold' number of pages.
|
||||
# Or the number of free pages has decreased by 'compact_threshold'
|
||||
# since the previous compaction.
|
||||
# then the system should initiate another round of memory compaction.
|
||||
# Default to 1024
|
||||
compact_threshold = 1024
|
||||
|
||||
# Control the mem-agent compaction function force compact times.
|
||||
# After one compaction, if there has not been a compaction within
|
||||
# the next compact_force_times times, a compaction will be forced
|
||||
# regardless of the system's memory situation.
|
||||
# If compact_force_times is set to 0, will do force compaction each time.
|
||||
# If compact_force_times is set to 18446744073709551615, will never do force compaction.
|
||||
# Default to 18446744073709551615
|
||||
# Note: Using a large but valid u64 value (within i64::MAX range) instead of u64::MAX to avoid TOML parser issues
|
||||
# Using 9223372036854775807 (i64::MAX) which is effectively "never" for practical purposes
|
||||
compact_force_times = 9223372036854775807
|
||||
|
||||
[runtime]
|
||||
# If enabled, the runtime will log additional debug messages to the
|
||||
# system log
|
||||
# (default: disabled)
|
||||
enable_debug = false
|
||||
|
||||
# Internetworking model
|
||||
# Determines how the VM should be connected to the
|
||||
# container network interface
|
||||
# Options:
|
||||
#
|
||||
# - macvtap
|
||||
# Used when the Container network interface can be bridged using
|
||||
# macvtap.
|
||||
#
|
||||
# - none
|
||||
# Used when customize network. Only creates a tap device. No veth pair.
|
||||
#
|
||||
# - tcfilter
|
||||
# Uses tc filter rules to redirect traffic from the network interface
|
||||
# provided by plugin to a tap interface connected to the VM.
|
||||
#
|
||||
internetworking_model = "@DEFNETWORKMODEL_QEMU@"
|
||||
|
||||
name = "@RUNTIMENAME@"
|
||||
hypervisor_name = "@HYPERVISOR_QEMU@"
|
||||
agent_name = "@PROJECT_TYPE@"
|
||||
|
||||
# disable guest seccomp
|
||||
# Determines whether container seccomp profiles are passed to the virtual
|
||||
# machine and applied by the kata agent. If set to true, seccomp is not applied
|
||||
# within the guest
|
||||
# (default: true)
|
||||
disable_guest_seccomp = @DEFDISABLEGUESTSECCOMP@
|
||||
|
||||
# vCPUs pinning settings
|
||||
# if enabled, each vCPU thread will be scheduled to a fixed CPU
|
||||
# qualified condition: num(vCPU threads) == num(CPUs in sandbox's CPUSet)
|
||||
enable_vcpus_pinning = false
|
||||
|
||||
# Apply a custom SELinux security policy to the container process inside the VM.
|
||||
# This is used when you want to apply a type other than the default `container_t`,
|
||||
# so general users should not uncomment and apply it.
|
||||
# (format: "user:role:type")
|
||||
# Note: You cannot specify MCS policy with the label because the sensitivity levels and
|
||||
# categories are determined automatically by high-level container runtimes such as containerd.
|
||||
# Example value when enabling: "system_u:system_r:container_t"
|
||||
guest_selinux_label = "@DEFGUESTSELINUXLABEL@"
|
||||
|
||||
# If enabled, the runtime will create opentracing.io traces and spans.
|
||||
# (See https://www.jaegertracing.io/docs/getting-started).
|
||||
# (default: disabled)
|
||||
enable_tracing = false
|
||||
|
||||
# Set the full url to the Jaeger HTTP Thrift collector.
|
||||
# The default if not set will be "http://localhost:14268/api/traces"
|
||||
jaeger_endpoint = ""
|
||||
|
||||
# Sets the username to be used if basic auth is required for Jaeger.
|
||||
jaeger_user = ""
|
||||
|
||||
# Sets the password to be used if basic auth is required for Jaeger.
|
||||
jaeger_password = ""
|
||||
|
||||
# If enabled, the runtime will not create a network namespace for shim and hypervisor processes.
|
||||
# This option may have some potential impacts to your host. It should only be used when you know what you're doing.
|
||||
# `disable_new_netns` conflicts with `internetworking_model=tcfilter` and `internetworking_model=macvtap`. It works only
|
||||
# with `internetworking_model=none`. The tap device will be in the host network namespace and can connect to a bridge
|
||||
# (like OVS) directly.
|
||||
# (default: false)
|
||||
disable_new_netns = false
|
||||
|
||||
# if enabled, the runtime will add all the kata processes inside one dedicated cgroup.
|
||||
# The container cgroups in the host are not created, just one single cgroup per sandbox.
|
||||
# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox.
|
||||
# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation.
|
||||
# The sandbox cgroup is constrained if there is no container type annotation.
|
||||
# See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType
|
||||
sandbox_cgroup_only = @DEFSANDBOXCGROUPONLY_NV@
|
||||
|
||||
# If enabled, the runtime will attempt to determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In
|
||||
# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. This is generally helpful
|
||||
# when a hardware architecture or hypervisor solutions is utilized which does not support CPU and/or memory hotplug.
|
||||
# Compatibility for determining appropriate sandbox (VM) size:
|
||||
# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O
|
||||
# does not yet support sandbox sizing annotations.
|
||||
# - When running single containers using a tool like ctr, container sizing information will be available.
|
||||
static_sandbox_resource_mgmt = @DEFSTATICRESOURCEMGMT_NV@
|
||||
|
||||
# If specified, sandbox_bind_mounts identifies host paths to be mounted (ro) into the sandboxes shared path.
|
||||
# This is only valid if filesystem sharing is utilized. The provided path(s) will be bindmounted into the shared fs directory.
|
||||
# If defaults are utilized, these mounts should be available in the guest at `/run/kata-containers/shared/containers/sandbox-mounts`
|
||||
# These will not be exposed to the container workloads, and are only provided for potential guest services.
|
||||
sandbox_bind_mounts = @DEFBINDMOUNTS@
|
||||
|
||||
# VFIO Mode
|
||||
# Determines how VFIO devices should be presented to the container.
|
||||
# Options:
|
||||
#
|
||||
# - vfio
|
||||
# Matches behaviour of OCI runtimes (e.g. runc) as much as
|
||||
# possible. VFIO devices will appear in the container as VFIO
|
||||
# character devices under /dev/vfio. The exact names may differ
|
||||
# from the host (they need to match the VM's IOMMU group numbers
|
||||
# rather than the host's)
|
||||
#
|
||||
# - guest-kernel
|
||||
# This is a Kata-specific behaviour that's useful in certain cases.
|
||||
# The VFIO device is managed by whatever driver in the VM kernel
|
||||
# claims it. This means it will appear as one or more device nodes
|
||||
# or network interfaces depending on the nature of the device.
|
||||
# Using this mode requires specially built workloads that know how
|
||||
# to locate the relevant device interfaces within the VM.
|
||||
#
|
||||
vfio_mode = "@DEFVFIOMODE_NV@"
|
||||
|
||||
# If enabled, the runtime will not create Kubernetes emptyDir mounts on the guest filesystem. Instead, emptyDir mounts will
|
||||
# be created on the host and shared via virtio-fs. This is potentially slower, but allows sharing of files from host to guest.
|
||||
disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
|
||||
|
||||
# Enabled experimental feature list, format: ["a", "b"].
|
||||
# Experimental features are features not stable enough for production,
|
||||
# they may break compatibility, and are prepared for a big version bump.
|
||||
# Supported experimental features:
|
||||
# (default: [])
|
||||
experimental = @DEFAULTEXPFEATURES@
|
||||
|
||||
# If enabled, user can run pprof tools with shim v2 process through kata-monitor.
|
||||
# (default: false)
|
||||
enable_pprof = false
|
||||
|
||||
# kubelet_root_dir is the kubelet root directory used to match ConfigMap/Secret
|
||||
# volume paths for propagation. Override for distros that use a different path
|
||||
# (e.g. k0s: /var/lib/k0s/kubelet).
|
||||
kubelet_root_dir = "@DEFKUBELETROOTDIR@"
|
||||
|
||||
# pod_resource_api_sock specifies the unix socket for the Kubelet's PodResource API endpoint.
|
||||
# When set (together with a non-"no-port" cold_plug_vfio), the runtime can cold-plug
|
||||
# devices discovered via the Pod Resources API. Path is typically under kubelet root-dir.
|
||||
pod_resource_api_sock = "@DEFPODRESOURCEAPISOCK_NV@"
|
||||
@@ -0,0 +1,762 @@
|
||||
# Copyright (c) 2017-2019 Intel Corporation
|
||||
# Copyright (c) 2021 Adobe Inc.
|
||||
# Copyright (c) 2024 IBM Corp.
|
||||
# Copyright (c) 2025-2026 NVIDIA Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
|
||||
# XXX: WARNING: this file is auto-generated.
|
||||
# XXX:
|
||||
# XXX: Source file: "@CONFIG_QEMU_NVIDIA_GPU_SNP_IN@"
|
||||
# XXX: Project:
|
||||
# XXX: Name: @PROJECT_NAME@
|
||||
# XXX: Type: @PROJECT_TYPE@
|
||||
|
||||
[hypervisor.qemu]
|
||||
path = "@QEMUPATH@"
|
||||
kernel = "@KERNELPATH_CONFIDENTIAL_NV@"
|
||||
image = "@IMAGEPATH_CONFIDENTIAL_NV@"
|
||||
machine_type = "@MACHINETYPE@"
|
||||
|
||||
# Enable confidential guest support.
|
||||
# Toggling that setting may trigger different hardware features, ranging
|
||||
# from memory encryption to both memory and CPU-state encryption and integrity.
|
||||
# The Kata Containers runtime dynamically detects the available feature set and
|
||||
# aims at enabling the largest possible one, returning an error if none is
|
||||
# available, or none is supported by the hypervisor.
|
||||
#
|
||||
# Known limitations:
|
||||
# * Does not work by design:
|
||||
# - CPU Hotplug
|
||||
# - Memory Hotplug
|
||||
# - NVDIMM devices
|
||||
#
|
||||
# Default false
|
||||
confidential_guest = true
|
||||
|
||||
# Enable AMD SEV-SNP confidential guests
|
||||
# In case of using confidential guests on AMD hardware that supports SEV-SNP,
|
||||
# the following enables SEV-SNP guests. Default true
|
||||
sev_snp_guest = true
|
||||
|
||||
# SNP 'ID Block' and 'ID Authentication Information Structure'.
|
||||
# If one of snp_id_block or snp_id_auth is specified, the other must be specified, too.
|
||||
# Notice that the default SNP policy of QEMU (0x30000) is used by Kata, if not explicitly
|
||||
# set via 'snp_guest_policy' option. The IDBlock contains the guest policy as field, and
|
||||
# it must match the value from 'snp_guest_policy' or, if unset, the QEMU default policy.
|
||||
#
|
||||
# 96-byte, base64-encoded blob to provide the 'ID Block' structure for the
|
||||
# SNP_LAUNCH_FINISH command defined in the SEV-SNP firmware ABI (QEMU default: all-zero)
|
||||
snp_id_block = ""
|
||||
# 4096-byte, base64-encoded blob to provide the 'ID Authentication Information Structure'
|
||||
# for the SNP_LAUNCH_FINISH command defined in the SEV-SNP firmware ABI (QEMU default: all-zero)
|
||||
snp_id_auth = ""
|
||||
|
||||
# SNP Guest Policy, the 'POLICY' parameter to the SNP_LAUNCH_START command.
|
||||
# If unset, the QEMU default policy (0x30000) will be used.
|
||||
# Notice that the guest policy is enforced at VM launch, and your pod VMs
|
||||
# won't start at all if the policy denies it. This will be indicated by a
|
||||
# 'SNP_LAUNCH_START' error.
|
||||
snp_guest_policy = 196608
|
||||
|
||||
# rootfs filesystem type:
|
||||
# - ext4 (default)
|
||||
# - xfs
|
||||
# - erofs
|
||||
rootfs_type = @DEFROOTFSTYPE@
|
||||
|
||||
# Block storage driver to be used for the VM rootfs is backed
|
||||
# by a block device. This is virtio-blk-pci, virtio-blk-mmio or nvdimm
|
||||
# (default @VMROOTFSDRIVER_NV@ via Makefile VMROOTFSDRIVER_NV)
|
||||
vm_rootfs_driver = "@VMROOTFSDRIVER_NV@"
|
||||
|
||||
# Enable running QEMU VMM as a non-root user.
|
||||
# By default the QEMU VMM runs as root. When this is set to true, the QEMU VMM process runs as
|
||||
# a non-root random user. See documentation for the limitations of this mode.
|
||||
rootless = false
|
||||
|
||||
# List of valid annotation names for the hypervisor
|
||||
# Each member of the list is a regular expression, which is the base name
|
||||
# of the annotation, e.g. "path" for io.katacontainers.config.hypervisor.path"
|
||||
enable_annotations = @DEFENABLEANNOTATIONS_COCO@
|
||||
|
||||
# List of valid annotations values for the hypervisor
|
||||
# Each member of the list is a path pattern as described by glob(3).
|
||||
# The default if not set is empty (all annotations rejected.)
|
||||
# Your distribution recommends: @QEMUVALIDHYPERVISORPATHS@
|
||||
valid_hypervisor_paths = @QEMUVALIDHYPERVISORPATHS@
|
||||
|
||||
# Optional space-separated list of options to pass to the guest kernel.
|
||||
# For example, use `kernel_params = "vsyscall=emulate"` if you are having
|
||||
# trouble running pre-2.15 glibc.
|
||||
#
|
||||
# WARNING: - any parameter specified here will take priority over the default
|
||||
# parameter value of the same name used to start the virtual machine.
|
||||
# Do not set values here unless you understand the impact of doing so as you
|
||||
# may stop the virtual machine from booting.
|
||||
# To see the list of default parameters, enable hypervisor debug, create a
|
||||
# container and look for 'default-kernel-parameters' log entries.
|
||||
kernel_params = "@KERNELPARAMS_NV@"
|
||||
|
||||
# Optional dm-verity parameters (comma-separated key=value list):
|
||||
# root_hash=...,salt=...,data_blocks=...,data_block_size=...,hash_block_size=...
|
||||
# These are used by the runtime to assemble dm-verity kernel params.
|
||||
kernel_verity_params = "@KERNELVERITYPARAMS_CONFIDENTIAL_NV@"
|
||||
|
||||
# Path to the firmware.
|
||||
# If you want that qemu uses the default firmware leave this option empty
|
||||
firmware = "@FIRMWARESNPPATH_NV@"
|
||||
|
||||
# Path to the firmware volume.
|
||||
# firmware TDVF or OVMF can be split into FIRMWARE_VARS.fd (UEFI variables
|
||||
# as configuration) and FIRMWARE_CODE.fd (UEFI program image). UEFI variables
|
||||
# can be customized per each user while UEFI code is kept same.
|
||||
firmware_volume = "@FIRMWAREVOLUMEPATH@"
|
||||
|
||||
# Machine accelerators
|
||||
# comma-separated list of machine accelerators to pass to the hypervisor.
|
||||
# For example, `machine_accelerators = "nosmm,nosmbus,nosata,nopit,static-prt,nofw"`
|
||||
machine_accelerators = "@MACHINEACCELERATORS@"
|
||||
|
||||
# Qemu seccomp sandbox feature
|
||||
# comma-separated list of seccomp sandbox features to control the syscall access.
|
||||
# For example, `seccompsandbox= "on,obsolete=deny,spawn=deny,resourcecontrol=deny"`
|
||||
# Note: "elevateprivileges=deny" doesn't work with daemonize option, so it's removed from the seccomp sandbox
|
||||
# Another note: enabling this feature may reduce performance, you may enable
|
||||
# /proc/sys/net/core/bpf_jit_enable to reduce the impact. see https://man7.org/linux/man-pages/man8/bpfc.8.html
|
||||
# Recommended value when enabling: "on,obsolete=deny,spawn=deny,resourcecontrol=deny"
|
||||
seccomp_sandbox = "@DEFSECCOMPSANDBOXPARAM@"
|
||||
|
||||
# CPU features
|
||||
# comma-separated list of cpu features to pass to the cpu
|
||||
# For example, `cpu_features = "pmu=off,vmx=off"
|
||||
cpu_features = "@CPUFEATURES@"
|
||||
|
||||
# Default number of vCPUs per SB/VM:
|
||||
# unspecified or 0 --> will be set to @DEFVCPUS@
|
||||
# < 0 --> will be set to the actual number of physical cores
|
||||
# > 0 <= number of physical cores --> will be set to the specified number
|
||||
# > number of physical cores --> will be set to the actual number of physical cores
|
||||
default_vcpus = @DEFAULTVCPUS_NV@
|
||||
|
||||
# Default maximum number of vCPUs per SB/VM:
|
||||
# unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number
|
||||
# of vCPUs supported by KVM if that number is exceeded
|
||||
# > 0 <= number of physical cores --> will be set to the specified number
|
||||
# > number of physical cores --> will be set to the actual number of physical cores or to the maximum number
|
||||
# of vCPUs supported by KVM if that number is exceeded
|
||||
# WARNING: Depending of the architecture, the maximum number of vCPUs supported by KVM is used when
|
||||
# the actual number of physical cores is greater than it.
|
||||
# WARNING: Be aware that this value impacts the virtual machine's memory footprint and CPU
|
||||
# hotplug functionality. For example, `default_maxvcpus = 240` specifies that until 240 vCPUs
|
||||
# can be added to a SB/VM, but the memory footprint will be big. Another example, with
|
||||
# `default_maxvcpus = 8` the memory footprint will be small, but 8 will be the maximum number of
|
||||
# vCPUs supported by the SB/VM. In general, we recommend that you do not edit this variable,
|
||||
# unless you know what are you doing.
|
||||
# NOTICE: on arm platform with gicv2 interrupt controller, set it to 8.
|
||||
default_maxvcpus = @DEFMAXVCPUS_QEMU@
|
||||
|
||||
# Bridges can be used to hot plug devices.
|
||||
# Limitations:
|
||||
# * Currently only pci bridges are supported
|
||||
# * Until 30 devices per bridge can be hot plugged.
|
||||
# * Until 5 PCI bridges can be cold plugged per VM.
|
||||
# This limitation could be a bug in qemu or in the kernel
|
||||
# Default number of bridges per SB/VM:
|
||||
# unspecified or 0 --> will be set to @DEFBRIDGES@
|
||||
# > 1 <= 5 --> will be set to the specified number
|
||||
# > 5 --> will be set to 5
|
||||
default_bridges = @DEFBRIDGES@
|
||||
|
||||
# Reclaim guest freed memory.
|
||||
# Enabling this will result in the VM balloon device having f_reporting=on set.
|
||||
# Then the hypervisor will use it to reclaim guest freed memory.
|
||||
# This is useful for reducing the amount of memory used by a VM.
|
||||
# Enabling this feature may sometimes reduce the speed of memory access in
|
||||
# the VM.
|
||||
#
|
||||
# Default false
|
||||
reclaim_guest_freed_memory = false
|
||||
|
||||
# Default memory size in MiB for SB/VM.
|
||||
# If unspecified then it will be set to @DEFAULTMEMORY_NV@ MiB.
|
||||
default_memory = @DEFAULTMEMORY_NV@
|
||||
#
|
||||
# Default memory slots per SB/VM.
|
||||
# If unspecified then it will be set @DEFMEMSLOTS@.
|
||||
# This determines the number of times memory can be hot-added to the sandbox/VM.
|
||||
memory_slots = @DEFMEMSLOTS@
|
||||
|
||||
# Default maximum memory in MiB per SB / VM
|
||||
# unspecified or == 0 --> will be set to the actual amount of physical RAM
|
||||
# > 0 <= amount of physical RAM --> will be set to the specified number
|
||||
# > amount of physical RAM --> will be set to the actual amount of physical RAM
|
||||
default_maxmemory = @DEFMAXMEMSZ@
|
||||
|
||||
# The size in MiB will be added to the max memory of the hypervisor.
|
||||
# It is the memory address space for the NVDIMM device.
|
||||
# If set block storage driver (block_device_driver) to "nvdimm",
|
||||
# should set memory_offset to the size of block device.
|
||||
# Default 0
|
||||
memory_offset = 0
|
||||
|
||||
# Specifies virtio-mem will be enabled or not.
|
||||
# Please note that this option should be used with the command
|
||||
# "echo 1 > /proc/sys/vm/overcommit_memory".
|
||||
# Default false
|
||||
enable_virtio_mem = false
|
||||
|
||||
# Disable block device from being used for a container's rootfs.
|
||||
# In case of a storage driver like devicemapper where a container's
|
||||
# root file system is backed by a block device, the block device is passed
|
||||
# directly to the hypervisor for performance reasons.
|
||||
# This flag prevents the block device from being passed to the hypervisor,
|
||||
# virtio-fs is used instead to pass the rootfs.
|
||||
disable_block_device_use = @DEFDISABLEBLOCK@
|
||||
|
||||
# Shared file system type:
|
||||
# - virtio-fs (default)
|
||||
# - virtio-fs-nydus
|
||||
# - none
|
||||
shared_fs = "none"
|
||||
|
||||
# Path to vhost-user-fs daemon.
|
||||
virtio_fs_daemon = "@DEFVIRTIOFSDAEMON@"
|
||||
|
||||
# List of valid annotations values for the virtiofs daemon
|
||||
# The default if not set is empty (all annotations rejected.)
|
||||
# Your distribution recommends: @DEFVALIDVIRTIOFSDAEMONPATHS@
|
||||
valid_virtio_fs_daemon_paths = @DEFVALIDVIRTIOFSDAEMONPATHS@
|
||||
|
||||
# Default size of DAX cache in MiB
|
||||
virtio_fs_cache_size = @DEFVIRTIOFSCACHESIZE@
|
||||
|
||||
# Default size of virtqueues
|
||||
virtio_fs_queue_size = @DEFVIRTIOFSQUEUESIZE@
|
||||
|
||||
# Extra args for virtiofsd daemon
|
||||
#
|
||||
# Format example:
|
||||
# ["--arg1=xxx", "--arg2=yyy"]
|
||||
# Examples:
|
||||
# Set virtiofsd log level to debug : ["--log-level=debug"]
|
||||
#
|
||||
# see `virtiofsd -h` for possible options.
|
||||
virtio_fs_extra_args = @DEFVIRTIOFSEXTRAARGS@
|
||||
|
||||
# Cache mode:
|
||||
#
|
||||
# - never
|
||||
# Metadata, data, and pathname lookup are not cached in guest. They are
|
||||
# always fetched from host and any changes are immediately pushed to host.
|
||||
#
|
||||
# - auto
|
||||
# Metadata and pathname lookup cache expires after a configured amount of
|
||||
# time (default is 1 second). Data is cached while the file is open (close
|
||||
# to open consistency).
|
||||
#
|
||||
# - always
|
||||
# Metadata, data, and pathname lookup are cached in guest and never expire.
|
||||
virtio_fs_cache = "@DEFVIRTIOFSCACHE@"
|
||||
|
||||
# Block device driver to be used by the hypervisor when a container's
|
||||
# storage is backed by a block device or a file. This driver facilitates attaching
|
||||
# the storage directly to the guest VM.
|
||||
#
|
||||
# Examples include:
|
||||
# - virtio-blk-pci
|
||||
# - virtio-blk-ccw
|
||||
# - virtio-scsi
|
||||
# - nvdimm
|
||||
block_device_driver = "@DEFBLOCKSTORAGEDRIVER_QEMU@"
|
||||
|
||||
# aio is the I/O mechanism used by qemu
|
||||
# Options:
|
||||
#
|
||||
# - threads
|
||||
# Pthread based disk I/O.
|
||||
#
|
||||
# - native
|
||||
# Native Linux I/O.
|
||||
#
|
||||
# - io_uring
|
||||
# Linux io_uring API. This provides the fastest I/O operations on Linux, requires kernel>5.1 and
|
||||
# qemu >=5.0.
|
||||
block_device_aio = "@DEFBLOCKDEVICEAIO_QEMU@"
|
||||
|
||||
# Specifies cache-related options will be set to block devices or not.
|
||||
# Default false
|
||||
block_device_cache_set = false
|
||||
|
||||
# Specifies cache-related options for block devices.
|
||||
# Denotes whether use of O_DIRECT (bypass the host page cache) is enabled.
|
||||
# Default false
|
||||
block_device_cache_direct = false
|
||||
|
||||
# Specifies cache-related options for block devices.
|
||||
# Denotes whether flush requests for the device are ignored.
|
||||
# Default false
|
||||
block_device_cache_noflush = false
|
||||
|
||||
# Enable iothreads (data-plane) to be used. This causes IO to be
|
||||
# handled in a separate IO thread. This is currently only implemented
|
||||
# for SCSI.
|
||||
#
|
||||
enable_iothreads = @DEFENABLEIOTHREADS@
|
||||
|
||||
# Independent IOThreads enables IO to be processed in a separate thread, it is
|
||||
# for QEMU hotplug device attach to iothread, like virtio-blk.
|
||||
indep_iothreads = @DEFINDEPIOTHREADS@
|
||||
|
||||
# Virtio queue size. Size: byte. default 128
|
||||
queue_size = 128
|
||||
|
||||
# Block device multi-queue, default 1
|
||||
num_queues = 1
|
||||
|
||||
# Enable pre allocation of VM RAM, default false
|
||||
# Enabling this will result in lower container density
|
||||
# as all of the memory will be allocated and locked
|
||||
# This is useful when you want to reserve all the memory
|
||||
# upfront or in the cases where you want memory latencies
|
||||
# to be very predictable
|
||||
# Default false
|
||||
enable_mem_prealloc = false
|
||||
|
||||
# Enable huge pages for VM RAM, default false
|
||||
# Enabling this will result in the VM memory
|
||||
# being allocated using huge pages.
|
||||
# This is useful when you want to use vhost-user network
|
||||
# stacks within the container. This will automatically
|
||||
# result in memory pre allocation
|
||||
enable_hugepages = false
|
||||
|
||||
# Enable vhost-user storage device, default false
|
||||
# Enabling this will result in some Linux reserved block type
|
||||
# major range 240-254 being chosen to represent vhost-user devices.
|
||||
enable_vhost_user_store = @DEFENABLEVHOSTUSERSTORE@
|
||||
|
||||
# The base directory specifically used for vhost-user devices.
|
||||
# Its sub-path "block" is used for block devices; "block/sockets" is
|
||||
# where we expect vhost-user sockets to live; "block/devices" is where
|
||||
# simulated block device nodes for vhost-user devices to live.
|
||||
vhost_user_store_path = "@DEFVHOSTUSERSTOREPATH@"
|
||||
|
||||
# Enable vIOMMU, default false
|
||||
# Enabling this will result in the VM having a vIOMMU device
|
||||
# This will also add the following options to the kernel's
|
||||
# command line: intel_iommu=on,iommu=pt
|
||||
enable_iommu = false
|
||||
|
||||
# Enable IOMMU_PLATFORM, default false
|
||||
# Enabling this will result in the VM device having iommu_platform=on set
|
||||
enable_iommu_platform = false
|
||||
|
||||
# List of valid annotations values for the vhost user store path
|
||||
# The default if not set is empty (all annotations rejected.)
|
||||
# Your distribution recommends: @DEFVALIDVHOSTUSERSTOREPATHS@
|
||||
valid_vhost_user_store_paths = @DEFVALIDVHOSTUSERSTOREPATHS@
|
||||
|
||||
# The timeout for reconnecting on non-server spdk sockets when the remote end goes away.
|
||||
# qemu will delay this many seconds and then attempt to reconnect.
|
||||
# Zero disables reconnecting, and the default is zero.
|
||||
vhost_user_reconnect_timeout_sec = 0
|
||||
|
||||
# Enable file based guest memory support. The default is an empty string which
|
||||
# will disable this feature. In the case of virtio-fs, this is enabled
|
||||
# automatically and '/dev/shm' is used as the backing folder.
|
||||
# This option will be ignored if VM templating is enabled.
|
||||
file_mem_backend = "@DEFFILEMEMBACKEND@"
|
||||
|
||||
# List of valid annotations values for the file_mem_backend annotation
|
||||
# The default if not set is empty (all annotations rejected.)
|
||||
# Your distribution recommends: @DEFVALIDFILEMEMBACKENDS@
|
||||
valid_file_mem_backends = @DEFVALIDFILEMEMBACKENDS@
|
||||
|
||||
# -pflash can add image file to VM. The arguments of it should be in format
|
||||
# of ["/path/to/flash0.img", "/path/to/flash1.img"]
|
||||
pflashes = []
|
||||
|
||||
# This option changes the default hypervisor and kernel parameters
|
||||
# to enable debug output where available.
|
||||
#
|
||||
# Default false
|
||||
enable_debug = false
|
||||
|
||||
# This option allows to add an extra HMP or QMP socket when `enable_debug = true`
|
||||
#
|
||||
# WARNING: Anyone with access to the extra socket can take full control of
|
||||
# Qemu. This is for debugging purpose only and must *NEVER* be used in
|
||||
# production.
|
||||
#
|
||||
# Valid values are :
|
||||
# - "hmp"
|
||||
# - "qmp"
|
||||
# - "qmp-pretty" (same as "qmp" with pretty json formatting)
|
||||
#
|
||||
# If set to the empty string "", no extra monitor socket is added. This is
|
||||
# the default.
|
||||
extra_monitor_socket = ""
|
||||
|
||||
# Disable the customizations done in the runtime when it detects
|
||||
# that it is running on top a VMM. This will result in the runtime
|
||||
# behaving as it would when running on bare metal.
|
||||
#
|
||||
disable_nesting_checks = true
|
||||
|
||||
# If false and nvdimm is supported, use nvdimm device to plug guest image.
|
||||
# Otherwise virtio-block device is used.
|
||||
#
|
||||
# nvdimm is not supported when `confidential_guest = true`.
|
||||
#
|
||||
# Default is false
|
||||
disable_image_nvdimm = true
|
||||
|
||||
# VFIO devices are hotplugged on a bridge by default.
|
||||
# Enable hotplugging on root bus. This may be required for devices with
|
||||
# a large PCI bar, as this is a current limitation with hotplugging on
|
||||
# a bridge.
|
||||
# Default false
|
||||
hotplug_vfio_on_root_bus = false
|
||||
|
||||
# Enable hot-plugging of VFIO devices to a bridge-port,
|
||||
# root-port or switch-port.
|
||||
# The default setting is "no-port"
|
||||
hot_plug_vfio = "no-port"
|
||||
|
||||
# In a confidential compute environment hot-plugging can compromise
|
||||
# security.
|
||||
# Enable cold-plugging of VFIO devices to a bridge-port,
|
||||
# root-port or switch-port.
|
||||
# The default setting is "no-port", which means disabled.
|
||||
cold_plug_vfio = "root-port"
|
||||
|
||||
# Before hot plugging a PCIe device, you need to add a pcie_root_port device.
|
||||
# Use this parameter when using some large PCI bar devices, such as Nvidia GPU
|
||||
# The value means the number of pcie_root_port
|
||||
# This value is valid when hotplug_vfio_on_root_bus is true and machine_type is "q35"
|
||||
# Default 0
|
||||
pcie_root_port = @DEFAULTPCIEROOTPORT_NV@
|
||||
|
||||
# Before hot plugging a PCIe device onto a switch port, you need to add a pcie_switch_port device first.
|
||||
# Use this parameter when using some large PCI bar devices, such as Nvidia GPU
|
||||
# The value means how many devices attached onto pcie_switch_port will be created.
|
||||
# This value is valid when hotplug_vfio_on_root_bus is true, and machine_type is "q35"
|
||||
# Default 0
|
||||
pcie_switch_port = 0
|
||||
|
||||
# If vhost-net backend for virtio-net is not desired, set to true. Default is false, which trades off
|
||||
# security (vhost-net runs ring0) for network I/O performance.
|
||||
disable_vhost_net = false
|
||||
|
||||
#
|
||||
# Default entropy source.
|
||||
# The path to a host source of entropy (including a real hardware RNG)
|
||||
# /dev/urandom and /dev/random are two main options.
|
||||
# Be aware that /dev/random is a blocking source of entropy. If the host
|
||||
# runs out of entropy, the VMs boot time will increase leading to get startup
|
||||
# timeouts.
|
||||
# The source of entropy /dev/urandom is non-blocking and provides a
|
||||
# generally acceptable source of entropy. It should work well for pretty much
|
||||
# all practical purposes.
|
||||
entropy_source = "@DEFENTROPYSOURCE@"
|
||||
|
||||
# List of valid annotations values for entropy_source
|
||||
# The default if not set is empty (all annotations rejected.)
|
||||
# Your distribution recommends: @DEFVALIDENTROPYSOURCES@
|
||||
valid_entropy_sources = @DEFVALIDENTROPYSOURCES@
|
||||
|
||||
# Path to OCI hook binaries in the *guest rootfs*.
|
||||
# This does not affect host-side hooks which must instead be added to
|
||||
# the OCI spec passed to the runtime.
|
||||
#
|
||||
# You can create a rootfs with hooks by customizing the osbuilder scripts:
|
||||
# https://github.com/kata-containers/kata-containers/tree/main/tools/osbuilder
|
||||
#
|
||||
# Hooks must be stored in a subdirectory of guest_hook_path according to their
|
||||
# hook type, i.e. "guest_hook_path/{prestart,poststart,poststop}".
|
||||
# The agent will scan these directories for executable files and add them, in
|
||||
# lexicographical order, to the lifecycle of the guest container.
|
||||
# Hooks are executed in the runtime namespace of the guest. See the official documentation:
|
||||
# https://github.com/opencontainers/runtime-spec/blob/v1.0.1/config.md#posix-platform-hooks
|
||||
# Warnings will be logged if any error is encountered while scanning for hooks,
|
||||
# but it will not abort container execution.
|
||||
# Recommended value when enabling: "/usr/share/oci/hooks"
|
||||
guest_hook_path = ""
|
||||
|
||||
#
|
||||
# Use rx Rate Limiter to control network I/O inbound bandwidth(size in bits/sec for SB/VM).
|
||||
# In Qemu, we use classful qdiscs HTB(Hierarchy Token Bucket) to discipline traffic.
|
||||
# Default 0-sized value means unlimited rate.
|
||||
rx_rate_limiter_max_rate = 0
|
||||
# Use tx Rate Limiter to control network I/O outbound bandwidth(size in bits/sec for SB/VM).
|
||||
# In Qemu, we use classful qdiscs HTB(Hierarchy Token Bucket) and ifb(Intermediate Functional Block)
|
||||
# to discipline traffic.
|
||||
# Default 0-sized value means unlimited rate.
|
||||
tx_rate_limiter_max_rate = 0
|
||||
# network_queues configures the number of virtio-net queue pairs (RX/TX) exposed to the guest.
|
||||
# Setting network_queues = N creates N RX queues and N TX queues (i.e., N queue pairs).
|
||||
# More queues can improve network throughput and reduce per-queue contention by allowing packet processing to scale
|
||||
# across multiple vCPUs/threads (subject to host/guest capabilities and backend configuration such as vhost-net).
|
||||
# Increasing this value consumes more resources (e.g., virtqueue state, interrupts/MSI-X vectors, backend threads),
|
||||
# so it should typically not exceed the number of vCPUs or the practical parallelism of the networking backend.
|
||||
# Default: 1, Range: 1..=256
|
||||
network_queues = @DEFNETQUEUES@
|
||||
|
||||
# Set where to save the guest memory dump file.
|
||||
# If set, when GUEST_PANICKED event occurred,
|
||||
# guest memory will be dumped to host filesystem under guest_memory_dump_path,
|
||||
# This directory will be created automatically if it does not exist.
|
||||
#
|
||||
# The dumped file(also called vmcore) can be processed with crash or gdb.
|
||||
#
|
||||
# WARNING:
|
||||
# Dump guest's memory can take very long depending on the amount of guest memory
|
||||
# and use much disk space.
|
||||
# Recommended value when enabling: "/var/crash/kata"
|
||||
guest_memory_dump_path = ""
|
||||
|
||||
# If enable paging.
|
||||
# Basically, if you want to use "gdb" rather than "crash",
|
||||
# or need the guest-virtual addresses in the ELF vmcore,
|
||||
# then you should enable paging.
|
||||
#
|
||||
# See: https://www.qemu.org/docs/master/qemu-qmp-ref.html#Dump-guest-memory for details
|
||||
guest_memory_dump_paging = false
|
||||
|
||||
# use legacy serial for guest console if available and implemented for architecture. Default false
|
||||
use_legacy_serial = false
|
||||
|
||||
# disable applying SELinux on the VMM process (default false)
|
||||
disable_selinux = @DEFDISABLESELINUX@
|
||||
|
||||
# disable applying SELinux on the container process
|
||||
# If set to false, the type `container_t` is applied to the container process by default.
|
||||
# Note: To enable guest SELinux, the guest rootfs must be CentOS that is created and built
|
||||
# with `SELINUX=yes`.
|
||||
# (default: true)
|
||||
disable_guest_selinux = @DEFDISABLEGUESTSELINUX@
|
||||
|
||||
|
||||
[hypervisor.qemu.factory]
|
||||
# VM templating support. Once enabled, new VMs are created from template
|
||||
# using vm cloning. They will share the same initial kernel, initramfs and
|
||||
# agent memory by mapping it readonly. It helps speeding up new container
|
||||
# creation and saves a lot of memory if there are many kata containers running
|
||||
# on the same host.
|
||||
#
|
||||
# When disabled, new VMs are created from scratch.
|
||||
#
|
||||
# Note: Requires "initrd=" to be set ("image=" is not supported).
|
||||
#
|
||||
# Default false
|
||||
enable_template = false
|
||||
|
||||
# Specifies the path of template.
|
||||
#
|
||||
# Default "/run/vc/vm/template"
|
||||
template_path = "/run/vc/vm/template"
|
||||
|
||||
[agent.@PROJECT_TYPE@]
|
||||
# If enabled, make the agent display debug-level messages.
|
||||
# (default: disabled)
|
||||
enable_debug = false
|
||||
|
||||
# Enable agent tracing.
|
||||
#
|
||||
# If enabled, the agent will generate OpenTelemetry trace spans.
|
||||
#
|
||||
# Notes:
|
||||
#
|
||||
# - If the runtime also has tracing enabled, the agent spans will be
|
||||
# associated with the appropriate runtime parent span.
|
||||
# - If enabled, the runtime will wait for the container to shutdown,
|
||||
# increasing the container shutdown time slightly.
|
||||
#
|
||||
# (default: disabled)
|
||||
enable_tracing = false
|
||||
|
||||
# Comma separated list of kernel modules and their parameters.
|
||||
# These modules will be loaded in the guest kernel using modprobe(8).
|
||||
# The following example can be used to load two kernel modules with parameters
|
||||
# - kernel_modules=["e1000e InterruptThrottleRate=3000,3000,3000 EEE=1", "i915 enable_ppgtt=0"]
|
||||
# The first word is considered as the module name and the rest as its parameters.
|
||||
# Container will not be started when:
|
||||
# * A kernel module is specified and the modprobe command is not installed in the guest
|
||||
# or it fails loading the module.
|
||||
# * The module is not available in the guest or it doesn't meet the guest kernel
|
||||
# requirements, like architecture and version.
|
||||
#
|
||||
kernel_modules = []
|
||||
|
||||
# Enable debug console.
|
||||
|
||||
# If enabled, user can connect guest OS running inside hypervisor
|
||||
# through "kata-runtime exec <sandbox-id>" command
|
||||
|
||||
debug_console_enabled = false
|
||||
|
||||
# Agent dial backoff in milliseconds (retry delay for hybrid vsock / remote;
|
||||
# vsock uses this as initial backoff, clamped by the agent implementation).
|
||||
# Makefile DEFDIALTIMEOUTMS_NV; total patience is reconnect_timeout_ms.
|
||||
dial_timeout_ms = @DEFDIALTIMEOUTMS_NV@
|
||||
|
||||
# Total time budget (ms) for the agent client to connect (vsock deadline).
|
||||
# Makefile DEFRECONNECTTIMEOUTMS_NV (default 1200000 ms = Go qemu-nvidia-gpu dial_timeout 1200 s).
|
||||
# For hybrid vsock, retry count is reconnect_timeout_ms / dial_timeout_ms.
|
||||
reconnect_timeout_ms = @DEFRECONNECTTIMEOUTMS_NV@
|
||||
|
||||
# Create Container Request Timeout
|
||||
# This timeout value is used to set the maximum duration for the agent to process a CreateContainerRequest.
|
||||
# It's also used to ensure that workloads, especially those involving large image pulls within the guest,
|
||||
# have sufficient time to complete.
|
||||
#
|
||||
# Effective Timeout Determination:
|
||||
# The effective timeout for a CreateContainerRequest is determined by taking the minimum of the following two values:
|
||||
# - create_container_timeout: The timeout value configured for creating containers (default: 30 seconds).
|
||||
# - runtime-request-timeout: The timeout value specified in the Kubelet configuration described as the link below:
|
||||
# (https://kubernetes.io/docs/reference/command-line-tools-reference/kubelet/#:~:text=runtime%2Drequest%2Dtimeout)
|
||||
# Defaults to @DEFAULTTIMEOUT_NV@ second(s)
|
||||
create_container_timeout = @DEFAULTTIMEOUT_NV@
|
||||
|
||||
[runtime]
|
||||
# If enabled, the runtime will log additional debug messages to the
|
||||
# system log
|
||||
# (default: disabled)
|
||||
enable_debug = false
|
||||
|
||||
# Internetworking model
|
||||
# Determines how the VM should be connected to the
|
||||
# the container network interface
|
||||
# Options:
|
||||
#
|
||||
# - macvtap
|
||||
# Used when the Container network interface can be bridged using
|
||||
# macvtap.
|
||||
#
|
||||
# - none
|
||||
# Used when customize network. Only creates a tap device. No veth pair.
|
||||
#
|
||||
# - tcfilter
|
||||
# Uses tc filter rules to redirect traffic from the network interface
|
||||
# provided by plugin to a tap interface connected to the VM.
|
||||
#
|
||||
internetworking_model = "@DEFNETWORKMODEL_QEMU@"
|
||||
|
||||
name = "@RUNTIMENAME@"
|
||||
hypervisor_name = "@HYPERVISOR_QEMU@"
|
||||
agent_name = "@PROJECT_TYPE@"
|
||||
|
||||
# disable guest seccomp
|
||||
# Determines whether container seccomp profiles are passed to the virtual
|
||||
# machine and applied by the kata agent. If set to true, seccomp is not applied
|
||||
# within the guest
|
||||
# (default: true)
|
||||
disable_guest_seccomp = @DEFDISABLEGUESTSECCOMP@
|
||||
|
||||
# vCPUs pinning settings
|
||||
# if enabled, each vCPU thread will be scheduled to a fixed CPU
|
||||
# qualified condition: num(vCPU threads) == num(CPUs in sandbox's CPUSet)
|
||||
enable_vcpus_pinning = false
|
||||
|
||||
# Apply a custom SELinux security policy to the container process inside the VM.
|
||||
# This is used when you want to apply a type other than the default `container_t`,
|
||||
# so general users should not uncomment and apply it.
|
||||
# (format: "user:role:type")
|
||||
# Note: You cannot specify MCS policy with the label because the sensitivity levels and
|
||||
# categories are determined automatically by high-level container runtimes such as containerd.
|
||||
# Example value when enabling: "system_u:system_r:container_t"
|
||||
guest_selinux_label = "@DEFGUESTSELINUXLABEL@"
|
||||
|
||||
# If enabled, the runtime will create opentracing.io traces and spans.
|
||||
# (See https://www.jaegertracing.io/docs/getting-started).
|
||||
# (default: disabled)
|
||||
enable_tracing = false
|
||||
|
||||
# Set the full url to the Jaeger HTTP Thrift collector.
|
||||
# The default if not set will be "http://localhost:14268/api/traces"
|
||||
jaeger_endpoint = ""
|
||||
|
||||
# Sets the username to be used if basic auth is required for Jaeger.
|
||||
jaeger_user = ""
|
||||
|
||||
# Sets the password to be used if basic auth is required for Jaeger.
|
||||
jaeger_password = ""
|
||||
|
||||
# If enabled, the runtime will not create a network namespace for shim and hypervisor processes.
|
||||
# This option may have some potential impacts to your host. It should only be used when you know what you're doing.
|
||||
# `disable_new_netns` conflicts with `internetworking_model=tcfilter` and `internetworking_model=macvtap`. It works only
|
||||
# with `internetworking_model=none`. The tap device will be in the host network namespace and can connect to a bridge
|
||||
# (like OVS) directly.
|
||||
# (default: false)
|
||||
disable_new_netns = false
|
||||
|
||||
# if enabled, the runtime will add all the kata processes inside one dedicated cgroup.
|
||||
# The container cgroups in the host are not created, just one single cgroup per sandbox.
|
||||
# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox.
|
||||
# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation.
|
||||
# The sandbox cgroup is constrained if there is no container type annotation.
|
||||
# See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType
|
||||
sandbox_cgroup_only = @DEFSANDBOXCGROUPONLY_NV@
|
||||
|
||||
# If enabled, the runtime will attempt to determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In
|
||||
# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. This is generally helpful
|
||||
# when a hardware architecture or hypervisor solutions is utilized which does not support CPU and/or memory hotplug.
|
||||
# Compatibility for determining appropriate sandbox (VM) size:
|
||||
# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O
|
||||
# does not yet support sandbox sizing annotations.
|
||||
# - When running single containers using a tool like ctr, container sizing information will be available.
|
||||
static_sandbox_resource_mgmt = @DEFSTATICRESOURCEMGMT_NV@
|
||||
|
||||
# If specified, sandbox_bind_mounts identifies host paths to be mounted (ro) into the sandbox's shared path.
|
||||
# This is only valid if filesystem sharing is utilized. The provided path(s) will be bindmounted into the shared fs directory.
|
||||
# If defaults are utilized, these mounts should be available in the guest at `/run/kata-containers/shared/containers/sandbox-mounts`
|
||||
# These will not be exposed to the container workloads, and are only provided for potential guest services.
|
||||
sandbox_bind_mounts = @DEFBINDMOUNTS@
|
||||
|
||||
# VFIO Mode
|
||||
# Determines how VFIO devices should be presented to the container.
|
||||
# Options:
|
||||
#
|
||||
# - vfio
|
||||
# Matches behaviour of OCI runtimes (e.g. runc) as much as
|
||||
# possible. VFIO devices will appear in the container as VFIO
|
||||
# character devices under /dev/vfio. The exact names may differ
|
||||
# from the host (they need to match the VM's IOMMU group numbers
|
||||
# rather than the host's)
|
||||
#
|
||||
# - guest-kernel
|
||||
# This is a Kata-specific behaviour that's useful in certain cases.
|
||||
# The VFIO device is managed by whatever driver in the VM kernel
|
||||
# claims it. This means it will appear as one or more device nodes
|
||||
# or network interfaces depending on the nature of the device.
|
||||
# Using this mode requires specially built workloads that know how
|
||||
# to locate the relevant device interfaces within the VM.
|
||||
#
|
||||
vfio_mode = "@DEFVFIOMODE_NV@"
|
||||
|
||||
# If enabled, the runtime will not create Kubernetes emptyDir mounts on the guest filesystem. Instead, emptyDir mounts will
|
||||
# be created on the host and shared via virtio-fs. This is potentially slower, but allows sharing of files from host to guest.
|
||||
disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
|
||||
|
||||
# Enabled experimental feature list, format: ["a", "b"].
|
||||
# Experimental features are features not stable enough for production,
|
||||
# they may break compatibility, and are prepared for a big version bump.
|
||||
# Supported experimental features:
|
||||
# (default: [])
|
||||
experimental = @DEFAULTEXPFEATURES@
|
||||
|
||||
# If enabled, user can run pprof tools with shim v2 process through kata-monitor.
|
||||
# (default: false)
|
||||
enable_pprof = false
|
||||
|
||||
# kubelet_root_dir is the kubelet root directory used to match ConfigMap/Secret
|
||||
# volume paths for propagation. Override for distros that use a different path
|
||||
# (e.g. k0s: /var/lib/k0s/kubelet).
|
||||
kubelet_root_dir = "@DEFKUBELETROOTDIR@"
|
||||
|
||||
# pod_resource_api_sock specifies the unix socket for the Kubelet's PodResource API endpoint.
|
||||
# When set (together with a non-"no-port" cold_plug_vfio), the runtime can cold-plug
|
||||
# devices discovered via the Pod Resources API. Path is typically under kubelet root-dir.
|
||||
pod_resource_api_sock = "@DEFPODRESOURCEAPISOCK_NV@"
|
||||
@@ -0,0 +1,738 @@
|
||||
# Copyright (c) 2017-2019 Intel Corporation
|
||||
# Copyright (c) 2021 Adobe Inc.
|
||||
# Copyright (c) 2024 IBM Corp.
|
||||
# Copyright (c) 2025-2026 NVIDIA Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
|
||||
# XXX: WARNING: this file is auto-generated.
|
||||
# XXX:
|
||||
# XXX: Source file: "@CONFIG_QEMU_NVIDIA_GPU_TDX_IN@"
|
||||
# XXX: Project:
|
||||
# XXX: Name: @PROJECT_NAME@
|
||||
# XXX: Type: @PROJECT_TYPE@
|
||||
|
||||
[hypervisor.qemu]
|
||||
path = "@QEMUPATH@"
|
||||
kernel = "@KERNELPATH_CONFIDENTIAL_NV@"
|
||||
image = "@IMAGEPATH_CONFIDENTIAL_NV@"
|
||||
machine_type = "@MACHINETYPE@"
|
||||
tdx_quote_generation_service_socket_port = @QEMUTDXQUOTEGENERATIONSERVICESOCKETPORT@
|
||||
|
||||
# Enable confidential guest support.
|
||||
# Toggling that setting may trigger different hardware features, ranging
|
||||
# from memory encryption to both memory and CPU-state encryption and integrity.
|
||||
# The Kata Containers runtime dynamically detects the available feature set and
|
||||
# aims at enabling the largest possible one, returning an error if none is
|
||||
# available, or none is supported by the hypervisor.
|
||||
#
|
||||
# Known limitations:
|
||||
# * Does not work by design:
|
||||
# - CPU Hotplug
|
||||
# - Memory Hotplug
|
||||
# - NVDIMM devices
|
||||
#
|
||||
# Default false
|
||||
confidential_guest = true
|
||||
|
||||
# rootfs filesystem type:
|
||||
# - ext4 (default)
|
||||
# - xfs
|
||||
# - erofs
|
||||
rootfs_type = @DEFROOTFSTYPE@
|
||||
|
||||
# Block storage driver to be used for the VM rootfs is backed
|
||||
# by a block device. This is virtio-blk-pci, virtio-blk-mmio or nvdimm
|
||||
# (default @VMROOTFSDRIVER_NV@ via Makefile VMROOTFSDRIVER_NV)
|
||||
vm_rootfs_driver = "@VMROOTFSDRIVER_NV@"
|
||||
|
||||
# Enable running QEMU VMM as a non-root user.
|
||||
# By default QEMU VMM run as root. When this is set to true, QEMU VMM process runs as
|
||||
# a non-root random user. See documentation for the limitations of this mode.
|
||||
rootless = false
|
||||
|
||||
# List of valid annotation names for the hypervisor
|
||||
# Each member of the list is a regular expression, which is the base name
|
||||
# of the annotation, e.g. "path" for io.katacontainers.config.hypervisor.path"
|
||||
enable_annotations = @DEFENABLEANNOTATIONS_COCO@
|
||||
|
||||
# List of valid annotations values for the hypervisor
|
||||
# Each member of the list is a path pattern as described by glob(3).
|
||||
# The default if not set is empty (all annotations rejected.)
|
||||
# Your distribution recommends: @QEMUVALIDHYPERVISORPATHS@
|
||||
valid_hypervisor_paths = @QEMUVALIDHYPERVISORPATHS@
|
||||
|
||||
# Optional space-separated list of options to pass to the guest kernel.
|
||||
# For example, use `kernel_params = "vsyscall=emulate"` if you are having
|
||||
# trouble running pre-2.15 glibc.
|
||||
#
|
||||
# WARNING: - any parameter specified here will take priority over the default
|
||||
# parameter value of the same name used to start the virtual machine.
|
||||
# Do not set values here unless you understand the impact of doing so as you
|
||||
# may stop the virtual machine from booting.
|
||||
# To see the list of default parameters, enable hypervisor debug, create a
|
||||
# container and look for 'default-kernel-parameters' log entries.
|
||||
kernel_params = "@KERNELPARAMS_NV@"
|
||||
|
||||
# Optional dm-verity parameters (comma-separated key=value list):
|
||||
# root_hash=...,salt=...,data_blocks=...,data_block_size=...,hash_block_size=...
|
||||
# These are used by the runtime to assemble dm-verity kernel params.
|
||||
kernel_verity_params = "@KERNELVERITYPARAMS_CONFIDENTIAL_NV@"
|
||||
|
||||
# Path to the firmware.
|
||||
# If you want that qemu uses the default firmware leave this option empty
|
||||
firmware = "@FIRMWARETDVFPATH_NV@"
|
||||
|
||||
# Path to the firmware volume.
|
||||
# firmware TDVF or OVMF can be split into FIRMWARE_VARS.fd (UEFI variables
|
||||
# as configuration) and FIRMWARE_CODE.fd (UEFI program image). UEFI variables
|
||||
# can be customized per each user while UEFI code is kept same.
|
||||
firmware_volume = "@FIRMWAREVOLUMEPATH@"
|
||||
|
||||
# Machine accelerators
|
||||
# comma-separated list of machine accelerators to pass to the hypervisor.
|
||||
# For example, `machine_accelerators = "nosmm,nosmbus,nosata,nopit,static-prt,nofw"`
|
||||
machine_accelerators = "@MACHINEACCELERATORS@"
|
||||
|
||||
# Qemu seccomp sandbox feature
|
||||
# comma-separated list of seccomp sandbox features to control the syscall access.
|
||||
# For example, `seccompsandbox= "on,obsolete=deny,spawn=deny,resourcecontrol=deny"`
|
||||
# Note: "elevateprivileges=deny" doesn't work with daemonize option, so it's removed from the seccomp sandbox
|
||||
# Another note: enabling this feature may reduce performance, you may enable
|
||||
# /proc/sys/net/core/bpf_jit_enable to reduce the impact. see https://man7.org/linux/man-pages/man8/bpfc.8.html
|
||||
# Recommended value when enabling: "on,obsolete=deny,spawn=deny,resourcecontrol=deny"
|
||||
seccomp_sandbox = "@DEFSECCOMPSANDBOXPARAM@"
|
||||
|
||||
# CPU features
|
||||
# comma-separated list of cpu features to pass to the cpu
|
||||
# For example, `cpu_features = "pmu=off,vmx=off"
|
||||
cpu_features = "@CPUFEATURES@"
|
||||
|
||||
# Default number of vCPUs per SB/VM:
|
||||
# unspecified or 0 --> will be set to @DEFVCPUS@
|
||||
# < 0 --> will be set to the actual number of physical cores
|
||||
# > 0 <= number of physical cores --> will be set to the specified number
|
||||
# > number of physical cores --> will be set to the actual number of physical cores
|
||||
default_vcpus = @DEFAULTVCPUS_NV@
|
||||
|
||||
# Default maximum number of vCPUs per SB/VM:
|
||||
# unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number
|
||||
# of vCPUs supported by KVM if that number is exceeded
|
||||
# > 0 <= number of physical cores --> will be set to the specified number
|
||||
# > number of physical cores --> will be set to the actual number of physical cores or to the maximum number
|
||||
# of vCPUs supported by KVM if that number is exceeded
|
||||
# WARNING: Depending of the architecture, the maximum number of vCPUs supported by KVM is used when
|
||||
# the actual number of physical cores is greater than it.
|
||||
# WARNING: Be aware that this value impacts the virtual machine's memory footprint and CPU
|
||||
# hotplug functionality. For example, `default_maxvcpus = 240` specifies that up to 240 vCPUs
|
||||
# can be added to a SB/VM, but the memory footprint will be big. Another example, with
|
||||
# `default_maxvcpus = 8` the memory footprint will be small, but 8 will be the maximum number of
|
||||
# vCPUs supported by the SB/VM. In general, we recommend that you do not edit this variable,
|
||||
# unless you know what are you doing.
|
||||
# NOTICE: on arm platform with gicv2 interrupt controller, set it to 8.
|
||||
default_maxvcpus = @DEFMAXVCPUS_QEMU@
|
||||
|
||||
# Bridges can be used to hot plug devices.
|
||||
# Limitations:
|
||||
# * Currently only pci bridges are supported
|
||||
# * Up to 30 devices per bridge can be hot plugged.
|
||||
# * Up to 5 PCI bridges can be cold plugged per VM.
|
||||
# This limitation could be a bug in qemu or in the kernel
|
||||
# Default number of bridges per SB/VM:
|
||||
# unspecified or 0 --> will be set to @DEFBRIDGES@
|
||||
# > 1 <= 5 --> will be set to the specified number
|
||||
# > 5 --> will be set to 5
|
||||
default_bridges = @DEFBRIDGES@
|
||||
|
||||
# Reclaim guest freed memory.
|
||||
# Enabling this will result in the VM balloon device having f_reporting=on set.
|
||||
# Then the hypervisor will use it to reclaim guest freed memory.
|
||||
# This is useful for reducing the amount of memory used by a VM.
|
||||
# Enabling this feature may sometimes reduce the speed of memory access in
|
||||
# the VM.
|
||||
#
|
||||
# Default false
|
||||
reclaim_guest_freed_memory = false
|
||||
|
||||
# Default memory size in MiB for SB/VM.
|
||||
# If unspecified then it will be set @DEFMEMSZ@ MiB.
|
||||
default_memory = @DEFAULTMEMORY_NV@
|
||||
#
|
||||
# Default memory slots per SB/VM.
|
||||
# If unspecified then it will be set @DEFMEMSLOTS@.
|
||||
# This determines the number of times that memory can be hot-added to the sandbox/VM.
|
||||
memory_slots = @DEFMEMSLOTS@
|
||||
|
||||
# Default maximum memory in MiB per SB / VM
|
||||
# unspecified or == 0 --> will be set to the actual amount of physical RAM
|
||||
# > 0 <= amount of physical RAM --> will be set to the specified number
|
||||
# > amount of physical RAM --> will be set to the actual amount of physical RAM
|
||||
default_maxmemory = @DEFMAXMEMSZ@
|
||||
|
||||
# This size in MiB will be added to the hypervisor's maximum memory.
|
||||
# It is the memory address space for the NVDIMM device.
|
||||
# If the block storage driver (block_device_driver) is set to "nvdimm",
|
||||
# should set memory_offset to the size of block device.
|
||||
# Default 0
|
||||
memory_offset = 0
|
||||
|
||||
# Specifies virtio-mem will be enabled or not.
|
||||
# Please note that this option should be used with the command
|
||||
# "echo 1 > /proc/sys/vm/overcommit_memory".
|
||||
# Default false
|
||||
enable_virtio_mem = false
|
||||
|
||||
# Disable block device from being used for a container's rootfs.
|
||||
# In case of a storage driver like devicemapper where a container's
|
||||
# root file system is backed by a block device, the block device is passed
|
||||
# directly to the hypervisor for performance reasons.
|
||||
# This flag prevents the block device from being passed to the hypervisor,
|
||||
# virtio-fs is used instead to pass the rootfs.
|
||||
disable_block_device_use = @DEFDISABLEBLOCK@
|
||||
|
||||
# Shared file system type:
|
||||
# - virtio-fs (default)
|
||||
# - virtio-fs-nydus
|
||||
# - none
|
||||
shared_fs = "@DEFSHAREDFS_QEMU_TDX_VIRTIOFS@"
|
||||
|
||||
# Path to vhost-user-fs daemon.
|
||||
virtio_fs_daemon = "@DEFVIRTIOFSDAEMON@"
|
||||
|
||||
# List of valid annotations values for the virtiofs daemon
|
||||
# The default if not set is empty (all annotations rejected.)
|
||||
# Your distribution recommends: @DEFVALIDVIRTIOFSDAEMONPATHS@
|
||||
valid_virtio_fs_daemon_paths = @DEFVALIDVIRTIOFSDAEMONPATHS@
|
||||
|
||||
# Default size of DAX cache in MiB
|
||||
virtio_fs_cache_size = @DEFVIRTIOFSCACHESIZE@
|
||||
|
||||
# Default size of virtqueues
|
||||
virtio_fs_queue_size = @DEFVIRTIOFSQUEUESIZE@
|
||||
|
||||
# Extra args for virtiofsd daemon
|
||||
#
|
||||
# Format example:
|
||||
# ["--arg1=xxx", "--arg2=yyy"]
|
||||
# Examples:
|
||||
# Set virtiofsd log level to debug : ["--log-level=debug"]
|
||||
#
|
||||
# see `virtiofsd -h` for possible options.
|
||||
virtio_fs_extra_args = @DEFVIRTIOFSEXTRAARGS@
|
||||
|
||||
# Cache mode:
|
||||
#
|
||||
# - never
|
||||
# Metadata, data, and pathname lookup are not cached in guest. They are
|
||||
# always fetched from host and any changes are immediately pushed to host.
|
||||
#
|
||||
# - auto
|
||||
# Metadata and pathname lookup cache expires after a configured amount of
|
||||
# time (default is 1 second). Data is cached while the file is open (close
|
||||
# to open consistency).
|
||||
#
|
||||
# - always
|
||||
# Metadata, data, and pathname lookup are cached in guest and never expire.
|
||||
virtio_fs_cache = "@DEFVIRTIOFSCACHE@"
|
||||
|
||||
# Block device driver to be used by the hypervisor when a container's
|
||||
# storage is backed by a block device or a file. This driver facilitates attaching
|
||||
# the storage directly to the guest VM.
|
||||
#
|
||||
# Examples include:
|
||||
# - virtio-blk-pci
|
||||
# - virtio-blk-ccw
|
||||
# - virtio-scsi
|
||||
# - nvdimm
|
||||
block_device_driver = "@DEFBLOCKSTORAGEDRIVER_QEMU@"
|
||||
|
||||
# aio is the I/O mechanism used by qemu
|
||||
# Options:
|
||||
#
|
||||
# - threads
|
||||
# Pthread based disk I/O.
|
||||
#
|
||||
# - native
|
||||
# Native Linux I/O.
|
||||
#
|
||||
# - io_uring
|
||||
# Linux io_uring API. This provides the fastest I/O operations on Linux, requires kernel>5.1 and
|
||||
# qemu >=5.0.
|
||||
block_device_aio = "@DEFBLOCKDEVICEAIO_QEMU@"
|
||||
|
||||
# Specifies cache-related options will be set to block devices or not.
|
||||
# Default false
|
||||
block_device_cache_set = false
|
||||
|
||||
# Specifies cache-related options for block devices.
|
||||
# Denotes whether use of O_DIRECT (bypass the host page cache) is enabled.
|
||||
# Default false
|
||||
block_device_cache_direct = false
|
||||
|
||||
# Specifies cache-related options for block devices.
|
||||
# Denotes whether flush requests for the device are ignored.
|
||||
# Default false
|
||||
block_device_cache_noflush = false
|
||||
|
||||
# Enable iothreads (data-plane) to be used. This causes IO to be
|
||||
# handled in a separate IO thread. This is currently only implemented
|
||||
# for SCSI.
|
||||
#
|
||||
enable_iothreads = @DEFENABLEIOTHREADS@
|
||||
|
||||
# Independent IOThreads enables IO to be processed in a separate thread, it is
|
||||
# for QEMU hotplug device attach to iothread, like virtio-blk.
|
||||
indep_iothreads = @DEFINDEPIOTHREADS@
|
||||
|
||||
# Virtio queue size. Size: byte. default 128
|
||||
queue_size = 128
|
||||
|
||||
# Block device multi-queue, default 1
|
||||
num_queues = 1
|
||||
|
||||
# Enable pre allocation of VM RAM, default false
|
||||
# Enabling this will result in lower container density
|
||||
# as all of the memory will be allocated and locked
|
||||
# This is useful when you want to reserve all the memory
|
||||
# upfront or in the cases where you want memory latencies
|
||||
# to be very predictable
|
||||
# Default false
|
||||
enable_mem_prealloc = false
|
||||
|
||||
# Enable huge pages for VM RAM, default false
|
||||
# Enabling this will result in the VM memory
|
||||
# being allocated using huge pages.
|
||||
# This is useful when you want to use vhost-user network
|
||||
# stacks within the container. This will automatically
|
||||
# result in memory pre allocation
|
||||
enable_hugepages = false
|
||||
|
||||
# Enable vhost-user storage device, default false
|
||||
# Enabling this will result in some Linux reserved block type
|
||||
# major range 240-254 being chosen to represent vhost-user devices.
|
||||
enable_vhost_user_store = @DEFENABLEVHOSTUSERSTORE@
|
||||
|
||||
# The base directory specifically used for vhost-user devices.
|
||||
# Its sub-path "block" is used for block devices; "block/sockets" is
|
||||
# where we expect vhost-user sockets to live; "block/devices" is where
|
||||
# simulated block device nodes for vhost-user devices live.
|
||||
vhost_user_store_path = "@DEFVHOSTUSERSTOREPATH@"
|
||||
|
||||
# Enable vIOMMU, default false
|
||||
# Enabling this will result in the VM having a vIOMMU device
|
||||
# This will also add the following options to the kernel's
|
||||
# command line: intel_iommu=on,iommu=pt
|
||||
enable_iommu = false
|
||||
|
||||
# Enable IOMMU_PLATFORM, default false
|
||||
# Enabling this will result in the VM device having iommu_platform=on set
|
||||
enable_iommu_platform = false
|
||||
|
||||
# List of valid annotations values for the vhost user store path
|
||||
# The default if not set is empty (all annotations rejected.)
|
||||
# Your distribution recommends: @DEFVALIDVHOSTUSERSTOREPATHS@
|
||||
valid_vhost_user_store_paths = @DEFVALIDVHOSTUSERSTOREPATHS@
|
||||
|
||||
# The timeout for reconnecting on non-server spdk sockets when the remote end goes away.
|
||||
# qemu will delay this many seconds and then attempt to reconnect.
|
||||
# Zero disables reconnecting, and the default is zero.
|
||||
vhost_user_reconnect_timeout_sec = 0
|
||||
|
||||
# Enable file based guest memory support. The default is an empty string which
|
||||
# will disable this feature. In the case of virtio-fs, this is enabled
|
||||
# automatically and '/dev/shm' is used as the backing folder.
|
||||
# This option will be ignored if VM templating is enabled.
|
||||
file_mem_backend = "@DEFFILEMEMBACKEND@"
|
||||
|
||||
# List of valid annotations values for the file_mem_backend annotation
|
||||
# The default if not set is empty (all annotations rejected.)
|
||||
# Your distribution recommends: @DEFVALIDFILEMEMBACKENDS@
|
||||
valid_file_mem_backends = @DEFVALIDFILEMEMBACKENDS@
|
||||
|
||||
# -pflash can add image file to VM. The arguments of it should be in format
|
||||
# of ["/path/to/flash0.img", "/path/to/flash1.img"]
|
||||
pflashes = []
|
||||
|
||||
# This option changes the default hypervisor and kernel parameters
|
||||
# to enable debug output where available.
|
||||
#
|
||||
# Default false
|
||||
enable_debug = false
|
||||
|
||||
# This option allows to add an extra HMP or QMP socket when `enable_debug = true`
|
||||
#
|
||||
# WARNING: Anyone with access to the extra socket can take full control of
|
||||
# Qemu. This is for debugging purpose only and must *NEVER* be used in
|
||||
# production.
|
||||
#
|
||||
# Valid values are :
|
||||
# - "hmp"
|
||||
# - "qmp"
|
||||
# - "qmp-pretty" (same as "qmp" with pretty json formatting)
|
||||
#
|
||||
# If set to the empty string "", no extra monitor socket is added. This is
|
||||
# the default.
|
||||
extra_monitor_socket = ""
|
||||
|
||||
# Disable the customizations done in the runtime when it detects
|
||||
# that it is running on top of a VMM. This will result in the runtime
|
||||
# behaving as it would when running on bare metal.
|
||||
#
|
||||
disable_nesting_checks = true
|
||||
|
||||
# If false and nvdimm is supported, use nvdimm device to plug guest image.
|
||||
# Otherwise virtio-block device is used.
|
||||
#
|
||||
# nvdimm is not supported when `confidential_guest = true`.
|
||||
#
|
||||
# Default is false
|
||||
disable_image_nvdimm = true
|
||||
|
||||
# VFIO devices are hotplugged on a bridge by default.
|
||||
# Enable hotplugging on root bus. This may be required for devices with
|
||||
# a large PCI bar, as this is a current limitation with hotplugging on
|
||||
# a bridge.
|
||||
# Default false
|
||||
hotplug_vfio_on_root_bus = false
|
||||
|
||||
# Enable hot-plugging of VFIO devices to a bridge-port,
|
||||
# root-port or switch-port.
|
||||
# The default setting is "no-port"
|
||||
hot_plug_vfio = "no-port"
|
||||
|
||||
# In a confidential compute environment hot-plugging can compromise
|
||||
# security.
|
||||
# Enable cold-plugging of VFIO devices to a bridge-port,
|
||||
# root-port or switch-port.
|
||||
# The default setting is "no-port", which means disabled.
|
||||
cold_plug_vfio = "root-port"
|
||||
|
||||
# Before hot plugging a PCIe device, you need to add a pcie_root_port device.
|
||||
# Use this parameter when using some large PCI bar devices, such as Nvidia GPU
|
||||
# The value means the number of pcie_root_port
|
||||
# This value is valid when hotplug_vfio_on_root_bus is true and machine_type is "q35"
|
||||
# Default 0
|
||||
pcie_root_port = @DEFAULTPCIEROOTPORT_NV@
|
||||
|
||||
# Before hot plugging a PCIe device onto a switch port, you need to add a pcie_switch_port device first.
|
||||
# Use this parameter when using some large PCI bar devices, such as Nvidia GPU
|
||||
# The value means how many devices attached onto pcie_switch_port will be created.
|
||||
# This value is valid when hotplug_vfio_on_root_bus is true, and machine_type is "q35"
|
||||
# Default 0
|
||||
pcie_switch_port = 0
|
||||
|
||||
# If vhost-net backend for virtio-net is not desired, set to true. Default is false, which trades off
|
||||
# security (vhost-net runs ring0) for network I/O performance.
|
||||
disable_vhost_net = false
|
||||
|
||||
#
|
||||
# Default entropy source.
|
||||
# The path to a host source of entropy (including a real hardware RNG)
|
||||
# /dev/urandom and /dev/random are two main options.
|
||||
# Be aware that /dev/random is a blocking source of entropy. If the host
|
||||
# runs out of entropy, the VM's boot time will increase, leading to startup
|
||||
# timeouts.
|
||||
# The source of entropy /dev/urandom is non-blocking and provides a
|
||||
# generally acceptable source of entropy. It should work well for pretty much
|
||||
# all practical purposes.
|
||||
entropy_source = "@DEFENTROPYSOURCE@"
|
||||
|
||||
# List of valid annotations values for entropy_source
|
||||
# The default if not set is empty (all annotations rejected.)
|
||||
# Your distribution recommends: @DEFVALIDENTROPYSOURCES@
|
||||
valid_entropy_sources = @DEFVALIDENTROPYSOURCES@
|
||||
|
||||
# Path to OCI hook binaries in the *guest rootfs*.
|
||||
# This does not affect host-side hooks which must instead be added to
|
||||
# the OCI spec passed to the runtime.
|
||||
#
|
||||
# You can create a rootfs with hooks by customizing the osbuilder scripts:
|
||||
# https://github.com/kata-containers/kata-containers/tree/main/tools/osbuilder
|
||||
#
|
||||
# Hooks must be stored in a subdirectory of guest_hook_path according to their
|
||||
# hook type, i.e. "guest_hook_path/{prestart,poststart,poststop}".
|
||||
# The agent will scan these directories for executable files and add them, in
|
||||
# lexicographical order, to the lifecycle of the guest container.
|
||||
# Hooks are executed in the runtime namespace of the guest. See the official documentation:
|
||||
# https://github.com/opencontainers/runtime-spec/blob/v1.0.1/config.md#posix-platform-hooks
|
||||
# Warnings will be logged if any error is encountered while scanning for hooks,
|
||||
# but it will not abort container execution.
|
||||
# Recommended value when enabling: "/usr/share/oci/hooks"
|
||||
guest_hook_path = ""
|
||||
|
||||
#
|
||||
# Use rx Rate Limiter to control network I/O inbound bandwidth(size in bits/sec for SB/VM).
|
||||
# In Qemu, we use classful qdiscs HTB(Hierarchy Token Bucket) to discipline traffic.
|
||||
# Default 0-sized value means unlimited rate.
|
||||
rx_rate_limiter_max_rate = 0
|
||||
# Use tx Rate Limiter to control network I/O outbound bandwidth(size in bits/sec for SB/VM).
|
||||
# In Qemu, we use classful qdiscs HTB(Hierarchy Token Bucket) and ifb(Intermediate Functional Block)
|
||||
# to discipline traffic.
|
||||
# Default 0-sized value means unlimited rate.
|
||||
tx_rate_limiter_max_rate = 0
|
||||
# network_queues configures the number of virtio-net queue pairs (RX/TX) exposed to the guest.
|
||||
# Setting network_queues = N creates N RX queues and N TX queues (i.e., N queue pairs).
|
||||
# More queues can improve network throughput and reduce per-queue contention by allowing packet processing to scale
|
||||
# across multiple vCPUs/threads (subject to host/guest capabilities and backend configuration such as vhost-net).
|
||||
# Increasing this value consumes more resources (e.g., virtqueue state, interrupts/MSI-X vectors, backend threads),
|
||||
# so it should typically not exceed the number of vCPUs or the practical parallelism of the networking backend.
|
||||
# Default: 1, Range: 1..=256
|
||||
network_queues = @DEFNETQUEUES@
|
||||
|
||||
# Set where to save the guest memory dump file.
|
||||
# If set, when GUEST_PANICKED event occurred,
|
||||
# guest memory will be dumped to the host filesystem under guest_memory_dump_path.
|
||||
# This directory will be created automatically if it does not exist.
|
||||
#
|
||||
# The dumped file(also called vmcore) can be processed with crash or gdb.
|
||||
#
|
||||
# WARNING:
|
||||
# Dump guest's memory can take very long depending on the amount of guest memory
|
||||
# and use much disk space.
|
||||
# Recommended value when enabling: "/var/crash/kata"
|
||||
guest_memory_dump_path = ""
|
||||
|
||||
# If enable paging.
|
||||
# Basically, if you want to use "gdb" rather than "crash",
|
||||
# or need the guest-virtual addresses in the ELF vmcore,
|
||||
# then you should enable paging.
|
||||
#
|
||||
# See: https://www.qemu.org/docs/master/qemu-qmp-ref.html#Dump-guest-memory for details
|
||||
guest_memory_dump_paging = false
|
||||
|
||||
# use legacy serial for guest console if available and implemented for architecture. Default false
|
||||
use_legacy_serial = false
|
||||
|
||||
# disable applying SELinux on the VMM process (default false)
|
||||
disable_selinux = @DEFDISABLESELINUX@
|
||||
|
||||
# disable applying SELinux on the container process
|
||||
# If set to false, the type `container_t` is applied to the container process by default.
|
||||
# Note: To enable guest SELinux, the guest rootfs must be CentOS that is created and built
|
||||
# with `SELINUX=yes`.
|
||||
# (default: true)
|
||||
disable_guest_selinux = @DEFDISABLEGUESTSELINUX@
|
||||
|
||||
|
||||
[hypervisor.qemu.factory]
|
||||
# VM templating support. Once enabled, new VMs are created from template
|
||||
# using vm cloning. They will share the same initial kernel, initramfs and
|
||||
# agent memory by mapping it readonly. It helps speeding up new container
|
||||
# creation and saves a lot of memory if there are many kata containers running
|
||||
# on the same host.
|
||||
#
|
||||
# When disabled, new VMs are created from scratch.
|
||||
#
|
||||
# Note: Requires "initrd=" to be set ("image=" is not supported).
|
||||
#
|
||||
# Default false
|
||||
enable_template = false
|
||||
|
||||
# Specifies the path of template.
|
||||
#
|
||||
# Default "/run/vc/vm/template"
|
||||
template_path = "/run/vc/vm/template"
|
||||
|
||||
[agent.@PROJECT_TYPE@]
|
||||
# If enabled, make the agent display debug-level messages.
|
||||
# (default: disabled)
|
||||
enable_debug = false
|
||||
|
||||
# Enable agent tracing.
|
||||
#
|
||||
# If enabled, the agent will generate OpenTelemetry trace spans.
|
||||
#
|
||||
# Notes:
|
||||
#
|
||||
# - If the runtime also has tracing enabled, the agent spans will be
|
||||
# associated with the appropriate runtime parent span.
|
||||
# - If enabled, the runtime will wait for the container to shutdown,
|
||||
# increasing the container shutdown time slightly.
|
||||
#
|
||||
# (default: disabled)
|
||||
enable_tracing = false
|
||||
|
||||
# Comma separated list of kernel modules and their parameters.
|
||||
# These modules will be loaded in the guest kernel using modprobe(8).
|
||||
# The following example can be used to load two kernel modules with parameters
|
||||
# - kernel_modules=["e1000e InterruptThrottleRate=3000,3000,3000 EEE=1", "i915 enable_ppgtt=0"]
|
||||
# The first word is considered as the module name and the rest as its parameters.
|
||||
# Container will not be started when:
|
||||
# * A kernel module is specified and the modprobe command is not installed in the guest
|
||||
# or it fails loading the module.
|
||||
# * The module is not available in the guest or it doesn't meet the guest kernel
|
||||
# requirements, like architecture and version.
|
||||
#
|
||||
kernel_modules = []
|
||||
|
||||
# Enable debug console.
|
||||
|
||||
# If enabled, user can connect guest OS running inside hypervisor
|
||||
# through "kata-runtime exec <sandbox-id>" command
|
||||
|
||||
debug_console_enabled = false
|
||||
|
||||
# Agent dial backoff in milliseconds (retry delay for hybrid vsock / remote;
|
||||
# vsock uses this as initial backoff, clamped by the agent implementation).
|
||||
# Makefile DEFDIALTIMEOUTMS_NV; total patience is reconnect_timeout_ms.
|
||||
dial_timeout_ms = @DEFDIALTIMEOUTMS_NV@
|
||||
|
||||
# Total time budget (ms) for the agent client to connect (vsock deadline).
|
||||
# Makefile DEFRECONNECTTIMEOUTMS_NV (default 1200000 ms = Go qemu-nvidia-gpu dial_timeout 1200 s).
|
||||
# For hybrid vsock, retry count is reconnect_timeout_ms / dial_timeout_ms.
|
||||
reconnect_timeout_ms = @DEFRECONNECTTIMEOUTMS_NV@
|
||||
|
||||
# Create Container Request Timeout
|
||||
# This timeout value is used to set the maximum duration for the agent to process a CreateContainerRequest.
|
||||
# It's also used to ensure that workloads, especially those involving large image pulls within the guest,
|
||||
# have sufficient time to complete.
|
||||
#
|
||||
# Effective Timeout Determination:
|
||||
# The effective timeout for a CreateContainerRequest is determined by taking the minimum of the following two values:
|
||||
# - create_container_timeout: The timeout value configured for creating containers (default: 30 seconds).
|
||||
# - runtime-request-timeout: The timeout value specified in the Kubelet configuration described as the link below:
|
||||
# (https://kubernetes.io/docs/reference/command-line-tools-reference/kubelet/#:~:text=runtime%2Drequest%2Dtimeout)
|
||||
# Defaults to @DEFCREATECONTAINERTIMEOUT@ second(s)
|
||||
create_container_timeout = @DEFAULTTIMEOUT_NV@
|
||||
|
||||
[runtime]
|
||||
# If enabled, the runtime will log additional debug messages to the
|
||||
# system log
|
||||
# (default: disabled)
|
||||
enable_debug = false
|
||||
|
||||
# Internetworking model
|
||||
# Determines how the VM should be connected to the
|
||||
# container network interface
|
||||
# Options:
|
||||
#
|
||||
# - macvtap
|
||||
# Used when the Container network interface can be bridged using
|
||||
# macvtap.
|
||||
#
|
||||
# - none
|
||||
# Used when customize network. Only creates a tap device. No veth pair.
|
||||
#
|
||||
# - tcfilter
|
||||
# Uses tc filter rules to redirect traffic from the network interface
|
||||
# provided by plugin to a tap interface connected to the VM.
|
||||
#
|
||||
internetworking_model = "@DEFNETWORKMODEL_QEMU@"
|
||||
|
||||
name = "@RUNTIMENAME@"
|
||||
hypervisor_name = "@HYPERVISOR_QEMU@"
|
||||
agent_name = "@PROJECT_TYPE@"
|
||||
|
||||
# disable guest seccomp
|
||||
# Determines whether container seccomp profiles are passed to the virtual
|
||||
# machine and applied by the kata agent. If set to true, seccomp is not applied
|
||||
# within the guest
|
||||
# (default: true)
|
||||
disable_guest_seccomp = @DEFDISABLEGUESTSECCOMP@
|
||||
|
||||
# vCPUs pinning settings
|
||||
# if enabled, each vCPU thread will be scheduled to a fixed CPU
|
||||
# qualified condition: num(vCPU threads) == num(CPUs in sandbox's CPUSet)
|
||||
enable_vcpus_pinning = false
|
||||
|
||||
# Apply a custom SELinux security policy to the container process inside the VM.
|
||||
# This is used when you want to apply a type other than the default `container_t`,
|
||||
# so general users should not uncomment and apply it.
|
||||
# (format: "user:role:type")
|
||||
# Note: You cannot specify MCS policy with the label because the sensitivity levels and
|
||||
# categories are determined automatically by high-level container runtimes such as containerd.
|
||||
# Example value when enabling: "system_u:system_r:container_t"
|
||||
guest_selinux_label = "@DEFGUESTSELINUXLABEL@"
|
||||
|
||||
# If enabled, the runtime will create opentracing.io traces and spans.
|
||||
# (See https://www.jaegertracing.io/docs/getting-started).
|
||||
# (default: disabled)
|
||||
enable_tracing = false
|
||||
|
||||
# Set the full url to the Jaeger HTTP Thrift collector.
|
||||
# The default if not set will be "http://localhost:14268/api/traces"
|
||||
jaeger_endpoint = ""
|
||||
|
||||
# Sets the username to be used if basic auth is required for Jaeger.
|
||||
jaeger_user = ""
|
||||
|
||||
# Sets the password to be used if basic auth is required for Jaeger.
|
||||
jaeger_password = ""
|
||||
|
||||
# If enabled, the runtime will not create a network namespace for shim and hypervisor processes.
|
||||
# This option may have some potential impacts to your host. It should only be used when you know what you're doing.
|
||||
# `disable_new_netns` conflicts with `internetworking_model=tcfilter` and `internetworking_model=macvtap`. It works only
|
||||
# with `internetworking_model=none`. The tap device will be in the host network namespace and can connect to a bridge
|
||||
# (like OVS) directly.
|
||||
# (default: false)
|
||||
disable_new_netns = false
|
||||
|
||||
# if enabled, the runtime will add all the kata processes inside one dedicated cgroup.
|
||||
# The container cgroups in the host are not created, just one single cgroup per sandbox.
|
||||
# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox.
|
||||
# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation.
|
||||
# The sandbox cgroup is constrained if there is no container type annotation.
|
||||
# See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType
|
||||
sandbox_cgroup_only = @DEFSANDBOXCGROUPONLY_NV@
|
||||
|
||||
# If enabled, the runtime will attempt to determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In
|
||||
# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. This is generally helpful
|
||||
# when a hardware architecture or hypervisor solutions is utilized which does not support CPU and/or memory hotplug.
|
||||
# Compatibility for determining appropriate sandbox (VM) size:
|
||||
# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O
|
||||
# does not yet support sandbox sizing annotations.
|
||||
# - When running single containers using a tool like ctr, container sizing information will be available.
|
||||
static_sandbox_resource_mgmt = @DEFSTATICRESOURCEMGMT_NV@
|
||||
|
||||
# If specified, sandbox_bind_mounts identifies host paths to be mounted (ro) into the sandbox's shared path.
|
||||
# This is only valid if filesystem sharing is utilized. The provided path(s) will be bindmounted into the shared fs directory.
|
||||
# If defaults are utilized, these mounts should be available in the guest at `/run/kata-containers/shared/containers/sandbox-mounts`
|
||||
# These will not be exposed to the container workloads, and are only provided for potential guest services.
|
||||
sandbox_bind_mounts = @DEFBINDMOUNTS@
|
||||
|
||||
# VFIO Mode
|
||||
# Determines how VFIO devices should be presented to the container.
|
||||
# Options:
|
||||
#
|
||||
# - vfio
|
||||
# Matches behaviour of OCI runtimes (e.g. runc) as much as
|
||||
# possible. VFIO devices will appear in the container as VFIO
|
||||
# character devices under /dev/vfio. The exact names may differ
|
||||
# from the host (they need to match the VM's IOMMU group numbers
|
||||
# rather than the host's)
|
||||
#
|
||||
# - guest-kernel
|
||||
# This is a Kata-specific behaviour that's useful in certain cases.
|
||||
# The VFIO device is managed by whatever driver in the VM kernel
|
||||
# claims it. This means it will appear as one or more device nodes
|
||||
# or network interfaces depending on the nature of the device.
|
||||
# Using this mode requires specially built workloads that know how
|
||||
# to locate the relevant device interfaces within the VM.
|
||||
#
|
||||
vfio_mode = "@DEFVFIOMODE_NV@"
|
||||
|
||||
# If enabled, the runtime will not create Kubernetes emptyDir mounts on the guest filesystem. Instead, emptyDir mounts will
|
||||
# be created on the host and shared via virtio-fs. This is potentially slower, but allows sharing of files from host to guest.
|
||||
disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
|
||||
|
||||
# Enabled experimental feature list, format: ["a", "b"].
|
||||
# Experimental features are features not stable enough for production,
|
||||
# they may break compatibility, and are prepared for a big version bump.
|
||||
# Supported experimental features:
|
||||
# (default: [])
|
||||
experimental = @DEFAULTEXPFEATURES@
|
||||
|
||||
# If enabled, user can run pprof tools with shim v2 process through kata-monitor.
|
||||
# (default: false)
|
||||
enable_pprof = false
|
||||
|
||||
# kubelet_root_dir is the kubelet root directory used to match ConfigMap/Secret
|
||||
# volume paths for propagation. Override for distros that use a different path
|
||||
# (e.g. k0s: /var/lib/k0s/kubelet).
|
||||
kubelet_root_dir = "@DEFKUBELETROOTDIR@"
|
||||
|
||||
# pod_resource_api_sock specifies the unix socket for the Kubelet's PodResource API endpoint.
|
||||
# When set (together with a non-"no-port" cold_plug_vfio), the runtime can cold-plug
|
||||
# devices discovered via the Pod Resources API. Path is typically under kubelet root-dir.
|
||||
pod_resource_api_sock = "@DEFPODRESOURCEAPISOCK_NV@"
|
||||
@@ -163,7 +163,6 @@ disable_block_device_use = @DEFDISABLEBLOCK@
|
||||
|
||||
# Shared file system type:
|
||||
# - virtio-fs (default)
|
||||
# - virtio-9p
|
||||
# - virtio-fs-nydus
|
||||
# - none
|
||||
shared_fs = "@DEFSHAREDFS_QEMU_VIRTIOFS@"
|
||||
|
||||
@@ -162,7 +162,6 @@ disable_block_device_use = @DEFDISABLEBLOCK@
|
||||
|
||||
# Shared file system type:
|
||||
# - virtio-fs (default)
|
||||
# - virtio-9p
|
||||
# - virtio-fs-nydus
|
||||
# - none
|
||||
shared_fs = "@DEFSHAREDFS_QEMU_SEL_VIRTIOFS@"
|
||||
|
||||
@@ -3,11 +3,8 @@
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
use std::{
|
||||
os::unix::prelude::{AsRawFd, FromRawFd},
|
||||
time::Duration,
|
||||
};
|
||||
use std::os::unix::prelude::{AsRawFd, FromRawFd};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use async_trait::async_trait;
|
||||
@@ -31,70 +28,94 @@ impl Vsock {
|
||||
#[async_trait]
|
||||
impl Sock for Vsock {
|
||||
async fn connect(&self, config: &ConnectConfig) -> Result<Stream> {
|
||||
let mut last_err = None;
|
||||
let retry_times = config.reconnect_timeout_ms / config.dial_timeout_ms;
|
||||
let sock_addr = VsockAddr::new(self.vsock_cid, self.port);
|
||||
let connect_once = || {
|
||||
// Create socket fd
|
||||
let socket = socket(
|
||||
AddressFamily::Vsock,
|
||||
SockType::Stream,
|
||||
SockFlag::empty(),
|
||||
None,
|
||||
)
|
||||
.context("failed to create vsock socket")?;
|
||||
let deadline = Instant::now() + Duration::from_millis(config.reconnect_timeout_ms);
|
||||
|
||||
// Wrap the socket fd in a UnixStream, so that it is closed when
|
||||
// anything fails.
|
||||
// We MUST NOT reuse a vsock socket which has failed a connection
|
||||
// attempt before, since a ECONNRESET error marks the whole socket as
|
||||
// broken and non-reusable.
|
||||
let socket = unsafe { std::os::unix::net::UnixStream::from_raw_fd(socket) };
|
||||
let mut backoff = Duration::from_millis(config.dial_timeout_ms);
|
||||
|
||||
// Connect the socket to vsock server.
|
||||
connect(socket.as_raw_fd(), &sock_addr)
|
||||
.with_context(|| format!("failed to connect to {sock_addr}"))?;
|
||||
let min_backoff = Duration::from_millis(10);
|
||||
let max_backoff = Duration::from_millis(500);
|
||||
if backoff < min_backoff {
|
||||
backoff = min_backoff;
|
||||
} else if backoff > max_backoff {
|
||||
backoff = max_backoff;
|
||||
}
|
||||
|
||||
// Started from tokio v1.44.0+, it would panic when giving
|
||||
// `from_std()` a blocking socket. A workaround is to set the
|
||||
// socket to non-blocking, see [1].
|
||||
//
|
||||
// https://github.com/tokio-rs/tokio/issues/7172
|
||||
socket
|
||||
.set_nonblocking(true)
|
||||
.context("failed to set non-blocking")?;
|
||||
let mut last_err: Option<anyhow::Error> = None;
|
||||
let mut attempts: u64 = 0;
|
||||
|
||||
// Finally, convert the std UnixSocket to tokio's UnixSocket.
|
||||
UnixStream::from_std(socket).context("from_std")
|
||||
};
|
||||
while Instant::now() < deadline {
|
||||
attempts += 1;
|
||||
|
||||
for i in 0..retry_times {
|
||||
match connect_once() {
|
||||
let sa = sock_addr;
|
||||
let res: Result<UnixStream> = tokio::task::spawn_blocking(move || -> Result<UnixStream> {
|
||||
// Create socket fd
|
||||
let fd = socket(
|
||||
AddressFamily::Vsock,
|
||||
SockType::Stream,
|
||||
SockFlag::empty(),
|
||||
None,
|
||||
)
|
||||
.context("failed to create vsock socket")?;
|
||||
|
||||
// Wrap fd so it closes on error
|
||||
let socket = unsafe { std::os::unix::net::UnixStream::from_raw_fd(fd) };
|
||||
|
||||
// Blocking connect (usually returns quickly for vsock)
|
||||
connect(socket.as_raw_fd(), &sa)
|
||||
.with_context(|| format!("failed to connect to {sa}"))?;
|
||||
|
||||
// Tokio requires non-blocking std socket before from_std()
|
||||
socket.set_nonblocking(true).context("failed to set non-blocking")?;
|
||||
|
||||
UnixStream::from_std(socket).context("from_std")
|
||||
})
|
||||
.await
|
||||
.context("vsock: connect task join failed")?;
|
||||
|
||||
match res {
|
||||
Ok(stream) => {
|
||||
info!(sl!(), "vsock: connected to {:?}", self);
|
||||
info!(
|
||||
sl!(),
|
||||
"vsock: connected to {:?} after {} attempts",
|
||||
self,
|
||||
attempts
|
||||
);
|
||||
return Ok(Stream::Vsock(stream));
|
||||
}
|
||||
Err(e) => {
|
||||
last_err = Some(e);
|
||||
|
||||
let now = Instant::now();
|
||||
if now >= deadline {
|
||||
break;
|
||||
}
|
||||
|
||||
let remaining = deadline.saturating_duration_since(now);
|
||||
let sleep_dur = std::cmp::min(backoff, remaining);
|
||||
|
||||
trace!(
|
||||
sl!(),
|
||||
"vsock: failed to connect to {:?}, err {:?}, attempts {}, will retry after {} ms",
|
||||
"vsock: failed to connect to {:?}, attempts {}, retry after {:?}, err {:?}",
|
||||
self,
|
||||
e,
|
||||
i,
|
||||
config.dial_timeout_ms,
|
||||
attempts,
|
||||
sleep_dur,
|
||||
last_err.as_ref().unwrap(),
|
||||
);
|
||||
last_err = Some(e);
|
||||
tokio::time::sleep(Duration::from_millis(config.dial_timeout_ms)).await;
|
||||
|
||||
tokio::time::sleep(sleep_dur).await;
|
||||
|
||||
backoff = std::cmp::min(backoff.saturating_mul(2), max_backoff);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Safe to unwrap the last_err, as this line will be unreachable if
|
||||
// no errors occurred.
|
||||
Err(anyhow!(
|
||||
"vsock: failed to connect to {:?}, err {:?}",
|
||||
"vsock: failed to connect to {:?} within {:?} (attempts={}), last_err={:?}",
|
||||
self,
|
||||
last_err.unwrap()
|
||||
Duration::from_millis(config.reconnect_timeout_ms),
|
||||
attempts,
|
||||
last_err
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -36,7 +36,8 @@ qapi-spec = "0.3.2"
|
||||
qapi-qmp = "0.15.0"
|
||||
hyperlocal = { workspace = true }
|
||||
hyper = { workspace = true, features = ["client"] }
|
||||
|
||||
regex = "1"
|
||||
once_cell = "1.21.3"
|
||||
# Local dependencies
|
||||
kata-sys-util = { workspace = true }
|
||||
kata-types = { workspace = true }
|
||||
|
||||
@@ -45,7 +45,7 @@ pub const DEFAULT_FS_QUEUES: usize = 1;
|
||||
const DEFAULT_FS_QUEUE_SIZE: u16 = 1024;
|
||||
|
||||
impl CloudHypervisorInner {
|
||||
pub(crate) async fn add_device(&mut self, device: DeviceType) -> Result<DeviceType> {
|
||||
pub(crate) async fn add_device(&mut self, device: DeviceType) -> Result<()> {
|
||||
if self.state != VmmState::VmRunning {
|
||||
// If the VM is not running, add the device to the pending list to
|
||||
// be handled later.
|
||||
@@ -80,10 +80,12 @@ impl CloudHypervisorInner {
|
||||
}
|
||||
}
|
||||
|
||||
return Ok(device);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
self.handle_add_device(device).await
|
||||
let _devicex = self.handle_add_device(device).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_add_device(&mut self, device: DeviceType) -> Result<DeviceType> {
|
||||
|
||||
@@ -106,7 +106,7 @@ impl Hypervisor for CloudHypervisor {
|
||||
inner.save_vm().await
|
||||
}
|
||||
|
||||
async fn add_device(&self, device: DeviceType) -> Result<DeviceType> {
|
||||
async fn add_device(&self, device: DeviceType) -> Result<()> {
|
||||
let mut inner = self.inner.write().await;
|
||||
inner.add_device(device).await
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
use std::{collections::HashMap, path::PathBuf, sync::Arc};
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use kata_sys_util::rand::RandomBytes;
|
||||
@@ -12,7 +12,8 @@ use kata_types::config::hypervisor::{BlockDeviceInfo, TopologyConfigInfo, VIRTIO
|
||||
use tokio::sync::{Mutex, RwLock};
|
||||
|
||||
use crate::{
|
||||
vhost_user_blk::VhostUserBlkDevice, BlockConfig, BlockDevice, HybridVsockDevice, Hypervisor,
|
||||
vfio_device::VfioDeviceModernHandle, vhost_user_blk::VhostUserBlkDevice, BlockConfig,
|
||||
BlockConfigModern, BlockDevice, BlockDeviceModernHandle, HybridVsockDevice, Hypervisor,
|
||||
NetworkDevice, PCIePortDevice, ProtectionDevice, ShareFsDevice, VfioDevice, VhostUserConfig,
|
||||
VhostUserNetDevice, VsockDevice, KATA_BLK_DEV_TYPE, KATA_CCW_DEV_TYPE, KATA_MMIO_BLK_DEV_TYPE,
|
||||
KATA_NVDIMM_DEV_TYPE, KATA_SCSI_DEV_TYPE, VIRTIO_BLOCK_CCW, VIRTIO_BLOCK_MMIO,
|
||||
@@ -231,6 +232,19 @@ impl DeviceManager {
|
||||
return Some(device_id.to_string());
|
||||
}
|
||||
}
|
||||
DeviceType::VfioModern(device) => {
|
||||
// device is an Arc<Mutex<VfioDeviceModern>>; lock it to access vfio_config
|
||||
if device.lock().await.config.iommu_group_devnode
|
||||
== PathBuf::from(host_path.clone())
|
||||
{
|
||||
return Some(device_id.to_string());
|
||||
}
|
||||
}
|
||||
DeviceType::BlockModern(device) => {
|
||||
if device.lock().await.config.path_on_host == host_path.clone() {
|
||||
return Some(device_id.to_string());
|
||||
}
|
||||
}
|
||||
DeviceType::VhostUserBlk(device) => {
|
||||
if device.config.socket_path == host_path {
|
||||
return Some(device_id.to_string());
|
||||
@@ -301,6 +315,17 @@ impl DeviceManager {
|
||||
.await
|
||||
.context("failed to create device")?
|
||||
}
|
||||
DeviceConfig::BlockCfgModern(config) => {
|
||||
// try to find the device, if found and just return id.
|
||||
if let Some(device_matched_id) = self.find_device(config.path_on_host.clone()).await
|
||||
{
|
||||
return Ok(device_matched_id);
|
||||
}
|
||||
|
||||
self.create_block_device_modern(config, device_id.clone())
|
||||
.await
|
||||
.context("failed to create block device modern")?
|
||||
}
|
||||
DeviceConfig::VfioCfg(config) => {
|
||||
let mut vfio_dev_config = config.clone();
|
||||
let dev_host_path = vfio_dev_config.host_path.clone();
|
||||
@@ -315,6 +340,22 @@ impl DeviceManager {
|
||||
&vfio_dev_config,
|
||||
)?))
|
||||
}
|
||||
DeviceConfig::VfioModernCfg(config) => {
|
||||
let dev_host_path = config.host_path.clone();
|
||||
if let Some(device_matched_id) = self.find_device(dev_host_path.clone()).await {
|
||||
return Ok(device_matched_id);
|
||||
}
|
||||
|
||||
let virt_path = self.get_dev_virt_path(&config.dev_type, false)?;
|
||||
let mut vfio_base = config.clone();
|
||||
vfio_base.iommu_group_devnode = PathBuf::from(dev_host_path);
|
||||
vfio_base.virt_path = virt_path;
|
||||
|
||||
Arc::new(Mutex::new(VfioDeviceModernHandle::new(
|
||||
device_id.clone(),
|
||||
&vfio_base,
|
||||
)?))
|
||||
}
|
||||
DeviceConfig::VhostUserBlkCfg(config) => {
|
||||
// try to find the device, found and just return id.
|
||||
if let Some(dev_id_matched) = self.find_device(config.socket_path.clone()).await {
|
||||
@@ -445,6 +486,61 @@ impl DeviceManager {
|
||||
))))
|
||||
}
|
||||
|
||||
async fn create_block_device_modern(
|
||||
&mut self,
|
||||
config: &BlockConfigModern,
|
||||
device_id: String,
|
||||
) -> Result<ArcMutexDevice> {
|
||||
let mut block_config = config.clone();
|
||||
let mut is_pmem = false;
|
||||
|
||||
match block_config.driver_option.as_str() {
|
||||
// convert the block driver to kata type
|
||||
VIRTIO_BLOCK_MMIO => {
|
||||
block_config.driver_option = KATA_MMIO_BLK_DEV_TYPE.to_string();
|
||||
}
|
||||
VIRTIO_BLOCK_PCI => {
|
||||
block_config.driver_option = KATA_BLK_DEV_TYPE.to_string();
|
||||
}
|
||||
VIRTIO_BLOCK_CCW => {
|
||||
block_config.driver_option = KATA_CCW_DEV_TYPE.to_string();
|
||||
}
|
||||
VIRTIO_PMEM => {
|
||||
block_config.driver_option = KATA_NVDIMM_DEV_TYPE.to_string();
|
||||
is_pmem = true;
|
||||
}
|
||||
VIRTIO_SCSI => {
|
||||
block_config.driver_option = KATA_SCSI_DEV_TYPE.to_string();
|
||||
}
|
||||
_ => {
|
||||
return Err(anyhow!(
|
||||
"unsupported driver type {}",
|
||||
block_config.driver_option
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
// generate virt path
|
||||
if let Some(virt_path) = self.get_dev_virt_path(DEVICE_TYPE_BLOCK, is_pmem)? {
|
||||
block_config.index = virt_path.0;
|
||||
block_config.virt_path = virt_path.1;
|
||||
}
|
||||
|
||||
// if the path on host is empty, we need to get device host path from the device major and minor number
|
||||
// Otherwise, it might be rawfile based block device, the host path is already passed from the runtime,
|
||||
// so we don't need to do anything here.
|
||||
if block_config.path_on_host.is_empty() {
|
||||
block_config.path_on_host =
|
||||
get_host_path(DEVICE_TYPE_BLOCK, config.major, config.minor)
|
||||
.context("failed to get host path")?;
|
||||
}
|
||||
|
||||
Ok(Arc::new(Mutex::new(BlockDeviceModernHandle::new(
|
||||
device_id,
|
||||
block_config,
|
||||
))))
|
||||
}
|
||||
|
||||
async fn create_block_device(
|
||||
&mut self,
|
||||
config: &BlockConfig,
|
||||
|
||||
@@ -7,10 +7,12 @@
|
||||
mod port_device;
|
||||
mod protection_device;
|
||||
mod vfio;
|
||||
pub mod vfio_device;
|
||||
mod vhost_user;
|
||||
pub mod vhost_user_blk;
|
||||
mod vhost_user_net;
|
||||
mod virtio_blk;
|
||||
mod virtio_blk_modern;
|
||||
mod virtio_fs;
|
||||
mod virtio_net;
|
||||
mod virtio_vsock;
|
||||
@@ -21,6 +23,7 @@ pub use vfio::{
|
||||
bind_device_to_host, bind_device_to_vfio, get_vfio_device, HostDevice, VfioBusMode, VfioConfig,
|
||||
VfioDevice,
|
||||
};
|
||||
pub use vfio_device::{VfioDeviceBase, VfioDeviceModern, VfioDeviceModernHandle};
|
||||
pub use vhost_user::{VhostUserConfig, VhostUserDevice, VhostUserType};
|
||||
pub use vhost_user_net::VhostUserNetDevice;
|
||||
pub use virtio_blk::{
|
||||
@@ -28,6 +31,7 @@ pub use virtio_blk::{
|
||||
KATA_MMIO_BLK_DEV_TYPE, KATA_NVDIMM_DEV_TYPE, KATA_SCSI_DEV_TYPE, VIRTIO_BLOCK_CCW,
|
||||
VIRTIO_BLOCK_MMIO, VIRTIO_BLOCK_PCI, VIRTIO_PMEM,
|
||||
};
|
||||
pub use virtio_blk_modern::{BlockConfigModern, BlockDeviceModern, BlockDeviceModernHandle};
|
||||
pub use virtio_fs::{
|
||||
ShareFsConfig, ShareFsDevice, ShareFsMountConfig, ShareFsMountOperation, ShareFsMountType,
|
||||
};
|
||||
|
||||
@@ -177,6 +177,9 @@ pub struct HostDevice {
|
||||
/// PCI device information (Domain)
|
||||
pub domain: String,
|
||||
|
||||
// iommufd for vfio device
|
||||
pub iommufd: String,
|
||||
|
||||
/// PCI device information (BDF): "bus:slot:function"
|
||||
pub bus_slot_func: String,
|
||||
|
||||
@@ -531,13 +534,13 @@ impl Device for VfioDevice {
|
||||
|
||||
// do add device for vfio device
|
||||
match h.add_device(DeviceType::Vfio(self.clone())).await {
|
||||
Ok(dev) => {
|
||||
// Update device info with the one received from device attach
|
||||
if let DeviceType::Vfio(vfio) = dev {
|
||||
self.config = vfio.config;
|
||||
self.devices = vfio.devices;
|
||||
self.allocated = true;
|
||||
}
|
||||
Ok(_dev) => {
|
||||
// // Update device info with the one received from device attach
|
||||
// if let DeviceType::Vfio(vfio) = dev {
|
||||
// self.config = vfio.config;
|
||||
// self.devices = vfio.devices;
|
||||
// self.allocated = true;
|
||||
// }
|
||||
|
||||
update_pcie_device!(self, pcie_topo)?;
|
||||
|
||||
|
||||
@@ -0,0 +1,769 @@
|
||||
// Copyright (c) 2026 Ant Group
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use regex::Regex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::fs;
|
||||
use std::os::unix::fs::{FileTypeExt, MetadataExt};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
/// Path constants for VFIO and IOMMU sysfs/dev interfaces
|
||||
const DEV_VFIO: &str = "/dev/vfio";
|
||||
const SYS_IOMMU_GROUPS: &str = "/sys/kernel/iommu_groups";
|
||||
const SYS_PCI_DEVS: &str = "/sys/bus/pci/devices";
|
||||
const DEV_IOMMU: &str = "/dev/iommu";
|
||||
const DEV_VFIO_DEVICES: &str = "/dev/vfio/devices";
|
||||
const SYS_CLASS_VFIO_DEV: &str = "/sys/class/vfio-dev";
|
||||
|
||||
// const SYS_MDEV_DEVS: &str = "/sys/bus/mdev/devices";
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct VfioIommufdBackend {
|
||||
/// Host global IOMMUFD device node (/dev/iommu)
|
||||
pub iommufd_dev: PathBuf,
|
||||
/// The per-device VFIO cdev nodes required for this assignment
|
||||
pub cdevs: Vec<VfioCdev>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
|
||||
pub struct VfioDevice {
|
||||
pub id: String,
|
||||
pub device_type: VfioDeviceType,
|
||||
pub bus_mode: VfioBusMode,
|
||||
|
||||
/// Metadata for Legacy VFIO backend
|
||||
pub iommu_group: Option<VfioGroup>,
|
||||
pub iommu_group_id: Option<u32>,
|
||||
|
||||
/// Metadata for IOMMUFD backend
|
||||
pub iommufd: Option<VfioIommufdBackend>,
|
||||
|
||||
/// Common device information
|
||||
pub devices: Vec<DeviceInfo>,
|
||||
/// The representative primary device for this assignment unit
|
||||
pub primary: DeviceInfo,
|
||||
pub labels: BTreeMap<String, String>,
|
||||
pub health: Health,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum VfioDeviceType {
|
||||
#[default]
|
||||
Normal,
|
||||
MediatedPci,
|
||||
MediatedAp,
|
||||
Error,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum VfioBusMode {
|
||||
#[default]
|
||||
Mmio,
|
||||
Pci,
|
||||
Ccw,
|
||||
}
|
||||
|
||||
/// PCI Bus-Device-Function (BDF) Address representation
|
||||
#[derive(Debug, Default, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct BdfAddress {
|
||||
pub domain: u16,
|
||||
pub bus: u8,
|
||||
pub device: u8,
|
||||
pub function: u8,
|
||||
}
|
||||
|
||||
impl BdfAddress {
|
||||
pub fn new(domain: u16, bus: u8, device: u8, function: u8) -> Self {
|
||||
Self {
|
||||
domain,
|
||||
bus,
|
||||
device,
|
||||
function,
|
||||
}
|
||||
}
|
||||
|
||||
/// Parses a BDF string in formats like "0000:01:00.0" or "01:00.0"
|
||||
pub fn parse(s: &str) -> Result<Self> {
|
||||
let parts: Vec<&str> = s.split(':').collect();
|
||||
|
||||
let (domain, bus_str, bus_dev_func) = match parts.len() {
|
||||
2 => (0u16, parts[0], parts[1]),
|
||||
3 => {
|
||||
let domain = u16::from_str_radix(parts[0], 16).context("Invalid domain hex")?;
|
||||
(domain, parts[1], parts[2])
|
||||
}
|
||||
_ => return Err(anyhow!("Invalid BDF format: {}", s)),
|
||||
};
|
||||
|
||||
let bus = u8::from_str_radix(bus_str, 16).context("Invalid bus hex")?;
|
||||
|
||||
let dev_func: Vec<&str> = bus_dev_func.split('.').collect();
|
||||
if dev_func.len() != 2 {
|
||||
return Err(anyhow!("Invalid device.function format"));
|
||||
}
|
||||
|
||||
let device = u8::from_str_radix(dev_func[0], 16).context("Invalid device hex")?;
|
||||
let function = u8::from_str_radix(dev_func[1], 16).context("Invalid function hex")?;
|
||||
|
||||
Ok(Self {
|
||||
domain,
|
||||
bus,
|
||||
device,
|
||||
function,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn to_short_string(&self) -> String {
|
||||
format!("{:02x}:{:02x}.{:x}", self.bus, self.device, self.function)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for BdfAddress {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{:04x}:{:02x}:{:02x}.{:x}",
|
||||
self.domain, self.bus, self.device, self.function
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub enum DeviceAddress {
|
||||
Pci(BdfAddress),
|
||||
Ccw(String),
|
||||
Mmio(String),
|
||||
MdevUuid(String),
|
||||
}
|
||||
|
||||
impl std::fmt::Display for DeviceAddress {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
DeviceAddress::Pci(bdf) => write!(f, "{bdf}"),
|
||||
DeviceAddress::Ccw(s) => write!(f, "{s}"),
|
||||
DeviceAddress::Mmio(s) => write!(f, "{s}"),
|
||||
DeviceAddress::MdevUuid(s) => write!(f, "{s}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for DeviceAddress {
|
||||
fn default() -> Self {
|
||||
DeviceAddress::Pci(BdfAddress::default())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
|
||||
pub struct DeviceInfo {
|
||||
/// Logical address on the specific bus
|
||||
pub addr: DeviceAddress,
|
||||
|
||||
/// Hardware identification (may be missing for non-PCI/mdev)
|
||||
pub vendor_id: Option<String>,
|
||||
pub device_id: Option<String>,
|
||||
pub class_code: Option<u32>,
|
||||
|
||||
/// Active kernel driver (e.g., "vfio-pci")
|
||||
pub driver: Option<String>,
|
||||
|
||||
/// Parent IOMMU group (critical for legacy passthrough)
|
||||
pub iommu_group_id: Option<u32>,
|
||||
|
||||
/// Proximity to CPU/Memory (sysfs reports -1 for no specific node)
|
||||
pub numa_node: Option<i32>,
|
||||
|
||||
/// Canonical path in sysfs
|
||||
pub sysfs_path: PathBuf,
|
||||
|
||||
/// VFIO character device node (e.g., /dev/vfio/devices/vfio0)
|
||||
/// Only populated if the kernel/hardware supports device-centric VFIO
|
||||
pub vfio_cdev: Option<VfioCdev>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum Health {
|
||||
#[default]
|
||||
Healthy,
|
||||
Unhealthy,
|
||||
Unknown,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
|
||||
pub struct VfioGroup {
|
||||
pub group_id: u32,
|
||||
pub devnode: PathBuf,
|
||||
pub vfio_ctl: PathBuf,
|
||||
/// Aggregated VFIO cdev nodes for all devices within this group
|
||||
pub vfio_cdevs: Vec<PathBuf>,
|
||||
pub devices: Vec<DeviceInfo>,
|
||||
// primary device used for labeling and identification
|
||||
pub primary: DeviceInfo,
|
||||
pub labels: BTreeMap<String, String>,
|
||||
pub is_viable: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct VfioCdev {
|
||||
/// Instance name (e.g., "vfio0")
|
||||
pub name: String,
|
||||
/// Device node path (/dev/vfio/devices/vfio0)
|
||||
pub devnode: PathBuf,
|
||||
/// Character device major number
|
||||
pub major: Option<u32>,
|
||||
/// Character device minor number
|
||||
pub minor: Option<u32>,
|
||||
pub sysfs_path: PathBuf,
|
||||
/// Associated PCI BDF if applicable
|
||||
pub bdf: Option<String>,
|
||||
pub group_id: Option<u32>,
|
||||
}
|
||||
|
||||
fn read_trim(path: impl AsRef<Path>) -> Option<String> {
|
||||
fs::read_to_string(path.as_ref())
|
||||
.ok()
|
||||
.map(|s| s.trim().to_string())
|
||||
}
|
||||
|
||||
fn parse_i32(path: impl AsRef<Path>) -> Option<i32> {
|
||||
read_trim(path).and_then(|s| s.parse::<i32>().ok())
|
||||
}
|
||||
|
||||
fn driver_name(pci_dev_path: &Path) -> Option<String> {
|
||||
let link = fs::read_link(pci_dev_path.join("driver")).ok()?;
|
||||
link.file_name().map(|n| n.to_string_lossy().to_string())
|
||||
}
|
||||
|
||||
fn parse_bdf_str(s: &str) -> Result<BdfAddress> {
|
||||
// Standard format: "0000:65:00.0"
|
||||
let re = Regex::new(
|
||||
r"^(?P<d>[0-9a-fA-F]{4}):(?P<b>[0-9a-fA-F]{2}):(?P<dev>[0-9a-fA-F]{2})\.(?P<f>[0-7])$",
|
||||
)
|
||||
.unwrap();
|
||||
let cap = re
|
||||
.captures(s)
|
||||
.ok_or_else(|| anyhow!("invalid BDF format: {s}"))?;
|
||||
Ok(BdfAddress {
|
||||
domain: u16::from_str_radix(&cap["d"], 16)?,
|
||||
bus: u8::from_str_radix(&cap["b"], 16)?,
|
||||
device: u8::from_str_radix(&cap["dev"], 16)?,
|
||||
function: (cap["f"]).parse::<u8>()?,
|
||||
})
|
||||
}
|
||||
|
||||
/// Scans sysfs to find all PCI devices belonging to a specific IOMMU group
|
||||
fn discover_group_devices(group_id: u32) -> Result<Vec<DeviceInfo>> {
|
||||
let mut out = vec![];
|
||||
let group_dir = Path::new(SYS_IOMMU_GROUPS)
|
||||
.join(group_id.to_string())
|
||||
.join("devices");
|
||||
|
||||
for ent in
|
||||
fs::read_dir(&group_dir).context(format!("Failed to read {}", group_dir.display()))?
|
||||
{
|
||||
let ent = ent?;
|
||||
let bdf_str = ent.file_name().to_string_lossy().to_string();
|
||||
let pci_path = Path::new(SYS_PCI_DEVS).join(&bdf_str);
|
||||
|
||||
if !pci_path.exists() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let bdf = parse_bdf_str(&bdf_str)?;
|
||||
let vendor_id = read_trim(pci_path.join("vendor"));
|
||||
let device_id = read_trim(pci_path.join("device"));
|
||||
let class_code = read_trim(pci_path.join("class"))
|
||||
.as_deref()
|
||||
.and_then(parse_class_code_u32);
|
||||
let driver = driver_name(&pci_path);
|
||||
|
||||
let numa_node =
|
||||
parse_i32(pci_path.join("numa_node")).and_then(|n| if n < 0 { None } else { Some(n) });
|
||||
|
||||
out.push(DeviceInfo {
|
||||
addr: DeviceAddress::Pci(bdf),
|
||||
vendor_id,
|
||||
device_id,
|
||||
class_code,
|
||||
driver,
|
||||
iommu_group_id: Some(group_id),
|
||||
numa_node,
|
||||
sysfs_path: pci_path,
|
||||
vfio_cdev: None, // Populated later
|
||||
});
|
||||
}
|
||||
|
||||
// Ensure deterministic ordering
|
||||
out.sort_by(|a, b| a.sysfs_path.cmp(&b.sysfs_path));
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
/// Generates descriptive labels for an IOMMU group (e.g., identifying GPUs)
|
||||
fn build_group_labels(devs: &[DeviceInfo]) -> BTreeMap<String, String> {
|
||||
let mut labels = BTreeMap::new();
|
||||
let mut gpu = false;
|
||||
let mut vendor: Option<String> = None;
|
||||
|
||||
for d in devs {
|
||||
if vendor.is_none() {
|
||||
vendor = d.vendor_id.clone();
|
||||
}
|
||||
|
||||
// PCI Class Code layout: 0xBBSSPP (Base Class, Sub Class, Programming Interface)
|
||||
if let Some(class_code) = d.class_code {
|
||||
let base = ((class_code >> 16) & 0xff) as u8;
|
||||
let sub = ((class_code >> 8) & 0xff) as u8;
|
||||
|
||||
// Base 0x03 = Display controller
|
||||
// Sub 0x00 = VGA compatible, 0x02 = 3D controller (NVIDIA/AMD)
|
||||
if base == 0x03 && (sub == 0x00 || sub == 0x02) {
|
||||
gpu = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(v) = vendor {
|
||||
labels.insert("vendor".into(), v);
|
||||
}
|
||||
labels.insert("gpu".into(), gpu.to_string());
|
||||
labels
|
||||
}
|
||||
|
||||
/// Validates that an IOMMU group can be safely passed through.
|
||||
/// Note: Bridges and Host Controllers in the group are ignored as they cannot be passed to guests.
|
||||
fn validate_group_basic(devices: &[DeviceInfo]) -> bool {
|
||||
// Current minimal check: group must not be empty.
|
||||
// Production logic may include blacklisting specific device classes.
|
||||
for device in devices.iter() {
|
||||
if let DeviceAddress::Pci(bdf) = &device.addr {
|
||||
// filter host or PCI bridge
|
||||
let bdf_str = bdf.to_string();
|
||||
// Filter out host or PCI bridges (cannot be passed through)
|
||||
if filter_bridge_device(&bdf_str, 0x0600).is_some() {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
!devices.is_empty()
|
||||
}
|
||||
|
||||
fn get_device_property(device_bdf: &str, property: &str) -> Result<String> {
|
||||
let dev_sys_path = Path::new(SYS_PCI_DEVS).join(device_bdf);
|
||||
let cfg_path = fs::read_to_string(dev_sys_path.join(property)).with_context(|| {
|
||||
format!(
|
||||
"failed to read property {} for device {}",
|
||||
property, device_bdf
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(cfg_path.trim().to_string())
|
||||
}
|
||||
|
||||
/// Filters for Host or PCI bridges within an IOMMU group.
|
||||
/// PCI Bridge: Class 0x0604, Host Bridge: Class 0x0600.
|
||||
fn filter_bridge_device(bdf: &str, bitmask: u64) -> Option<u64> {
|
||||
let device_class = get_device_property(bdf, "class").unwrap_or_default();
|
||||
|
||||
if device_class.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
match device_class.parse::<u32>() {
|
||||
Ok(cid_u32) => {
|
||||
// PCI class code is 24 bits, shift right 8 to get base+sub class
|
||||
let class_code = u64::from(cid_u32) >> 8;
|
||||
if class_code & bitmask == bitmask {
|
||||
Some(class_code)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_class_code_u32(s: &str) -> Option<u32> {
|
||||
let t = s.trim().strip_prefix("0x").unwrap_or(s.trim());
|
||||
u32::from_str_radix(t, 16).ok()
|
||||
}
|
||||
|
||||
/// Determines device priority for selection as the 'Primary' device of a group.
|
||||
/// GPUs take precedence, followed by Network and Storage controllers.
|
||||
fn class_priority(class_code: Option<u32>) -> u8 {
|
||||
let Some(c) = class_code else { return 255 };
|
||||
let base = ((c >> 16) & 0xff) as u8;
|
||||
let sub = ((c >> 8) & 0xff) as u8;
|
||||
|
||||
match (base, sub) {
|
||||
(0x03, 0x00) | (0x03, 0x02) => 0, // VGA/3D GPU
|
||||
(0x02, _) => 10, // Network controller
|
||||
(0x01, _) => 20, // Mass storage
|
||||
_ => 100, // Other
|
||||
}
|
||||
}
|
||||
|
||||
/// Picks the most significant device in a group to act as the primary identifier.
|
||||
fn select_primary_device(devs: &[DeviceInfo]) -> DeviceInfo {
|
||||
assert!(!devs.is_empty());
|
||||
|
||||
devs.iter()
|
||||
.min_by(|a, b| {
|
||||
let pa = class_priority(a.class_code);
|
||||
let pb = class_priority(b.class_code);
|
||||
if pa != pb {
|
||||
return pa.cmp(&pb);
|
||||
}
|
||||
|
||||
// Fallback to function number if classes are identical
|
||||
let fa = match &a.addr {
|
||||
DeviceAddress::Pci(bdf) => bdf.function,
|
||||
_ => u8::MAX,
|
||||
};
|
||||
let fb = match &b.addr {
|
||||
DeviceAddress::Pci(bdf) => bdf.function,
|
||||
_ => u8::MAX,
|
||||
};
|
||||
fa.cmp(&fb)
|
||||
})
|
||||
.cloned()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn is_char_dev(p: &Path) -> bool {
|
||||
fs::metadata(p)
|
||||
.map(|m| m.file_type().is_char_device())
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
/// Extracts the IOMMU group ID from a PCI device's sysfs link.
|
||||
fn vfio_group_id_from_pci(bdf: &str) -> Option<u32> {
|
||||
let link = fs::read_link(Path::new(SYS_PCI_DEVS).join(bdf).join("iommu_group")).ok()?;
|
||||
link.file_name()?.to_string_lossy().parse::<u32>().ok()
|
||||
}
|
||||
|
||||
/// Locates the VFIO character device (cdev) for a given PCI BDF.
|
||||
/// Path: /sys/bus/pci/devices/<bdf>/vfio-dev/vfioX
|
||||
fn discover_vfio_cdev_for_pci(bdf: &str, gid: u32) -> Option<VfioCdev> {
|
||||
let pci_path = Path::new(SYS_PCI_DEVS).join(bdf);
|
||||
let vfio_dev_dir = pci_path.join("vfio-dev");
|
||||
let rd = fs::read_dir(&vfio_dev_dir).ok()?;
|
||||
for e in rd.flatten() {
|
||||
let name = e.file_name().to_string_lossy().to_string();
|
||||
if !name.starts_with("vfio") {
|
||||
continue;
|
||||
}
|
||||
return discover_vfio_cdev_by_name(&name, Some(bdf.to_string()), Some(gid));
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Extracts major/minor device numbers from a file's metadata.
|
||||
fn stat_major_minor(path: &Path) -> Option<(u32, u32)> {
|
||||
let md = fs::metadata(path).ok()?;
|
||||
let rdev = md.rdev();
|
||||
Some((linux_major(rdev), linux_minor(rdev)))
|
||||
}
|
||||
|
||||
fn discover_vfio_cdev_by_name(
|
||||
vfio_name: &str,
|
||||
bdf: Option<String>,
|
||||
gid: Option<u32>,
|
||||
) -> Option<VfioCdev> {
|
||||
let devnode = Path::new(DEV_VFIO_DEVICES).join(vfio_name);
|
||||
if !is_char_dev(&devnode) {
|
||||
return None;
|
||||
}
|
||||
let (major, minor) = stat_major_minor(&devnode).unwrap_or((0, 0));
|
||||
Some(VfioCdev {
|
||||
name: vfio_name.to_string(),
|
||||
devnode,
|
||||
major: if major == 0 && minor == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(major)
|
||||
},
|
||||
minor: if major == 0 && minor == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(minor)
|
||||
},
|
||||
sysfs_path: Path::new(SYS_CLASS_VFIO_DEV).join(vfio_name),
|
||||
bdf,
|
||||
group_id: gid,
|
||||
})
|
||||
}
|
||||
|
||||
/// Discovers the VFIO device context based on a /dev/vfio/devices/vfio<X> path.
|
||||
pub fn discover_vfio_device(vfio_device: &Path) -> Result<VfioDevice> {
|
||||
if vfio_device.exists() && is_char_dev(vfio_device) {
|
||||
let vfio_name = vfio_device
|
||||
.file_name()
|
||||
.ok_or_else(|| anyhow!("Invalid vfio device path"))?
|
||||
.to_string_lossy()
|
||||
.to_string();
|
||||
|
||||
// Resolve VFIO name to BDF via sysfs symlink
|
||||
let dev_link = fs::read_link(
|
||||
Path::new(SYS_CLASS_VFIO_DEV)
|
||||
.join(&vfio_name)
|
||||
.join("device"),
|
||||
)
|
||||
.with_context(|| format!("failed to read sysfs device link for {}", vfio_name))?;
|
||||
|
||||
let bdf = dev_link
|
||||
.file_name()
|
||||
.ok_or_else(|| anyhow!("Malformed vfio-dev symlink for {}", vfio_name))?
|
||||
.to_string_lossy()
|
||||
.to_string();
|
||||
|
||||
// Resolve BDF to IOMMU group. On iommufd-first hosts there is often no legacy
|
||||
// /dev/vfio/<gid> node — only /dev/vfio/devices/vfioX cdevs exist — so use the
|
||||
// cdev we were given as the group char dev when legacy is absent.
|
||||
let gid = vfio_group_id_from_pci(&bdf)
|
||||
.ok_or_else(|| anyhow!("could not resolve IOMMU group for {}", bdf))?;
|
||||
let legacy = Path::new(DEV_VFIO).join(gid.to_string());
|
||||
let group_devnode = if legacy.exists() && is_char_dev(&legacy) {
|
||||
legacy
|
||||
} else {
|
||||
vfio_device.to_path_buf()
|
||||
};
|
||||
discover_vfio_device_for_iommu_group(gid, group_devnode)
|
||||
} else {
|
||||
Err(anyhow!("vfio device {} not found", vfio_device.display()))
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_dev_vfio_group_id(s: &str) -> Option<u32> {
|
||||
// Extracts numeric ID from "/dev/vfio/12" or just "12"
|
||||
let base = Path::new(s).file_name()?.to_string_lossy();
|
||||
base.parse::<u32>().ok()
|
||||
}
|
||||
|
||||
/// Per-device cdev under iommufd (`/dev/vfio/devices/vfioN`). Matches Go
|
||||
/// `strings.HasPrefix(HostPath, IommufdDevPath)` in `pkg/device/drivers/vfio.go`, not only
|
||||
/// [`Path::starts_with`]: component-wise path prefix can disagree with string prefix for some
|
||||
/// `OsStr` forms, so we use the same string rule as the Go runtime.
|
||||
fn is_iommufd_devices_cdev_path(path: &Path) -> bool {
|
||||
let s = path.to_string_lossy();
|
||||
if !s.starts_with(DEV_VFIO_DEVICES) {
|
||||
return false;
|
||||
}
|
||||
match s.as_bytes().get(DEV_VFIO_DEVICES.len()) {
|
||||
None => true,
|
||||
Some(b'/') => true,
|
||||
Some(_) => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Main entry point: Discovers a VFIO device unit based on an IOMMU group path (/dev/vfio/<X>)
|
||||
///
|
||||
/// CDI / device plugins often pass the per-device cdev (`/dev/vfio/devices/vfioX`) as the only
|
||||
/// host path; that is stored as `iommu_group_devnode` without setting `iommu_device_node`.
|
||||
/// Treat those like [`discover_vfio_device`].
|
||||
pub fn discover_vfio_group_device(host_path: PathBuf) -> Result<VfioDevice> {
|
||||
if is_iommufd_devices_cdev_path(&host_path) {
|
||||
return discover_vfio_device(&host_path);
|
||||
}
|
||||
let gid = parse_dev_vfio_group_id(&host_path.to_string_lossy())
|
||||
.ok_or_else(|| anyhow!("Invalid VFIO group path: {}", host_path.display()))?;
|
||||
discover_vfio_device_for_iommu_group(gid, host_path)
|
||||
}
|
||||
|
||||
/// Builds [`VfioDevice`] for IOMMU group `gid`.
|
||||
///
|
||||
/// `group_devnode` is the char device used to represent the group for metadata/health:
|
||||
/// typically `/dev/vfio/<gid>` (legacy) or `/dev/vfio/devices/vfioX` when legacy nodes are absent.
|
||||
fn discover_vfio_device_for_iommu_group(gid: u32, group_devnode: PathBuf) -> Result<VfioDevice> {
|
||||
let vfio_ctl = Path::new(DEV_VFIO).join("vfio");
|
||||
if !vfio_ctl.exists() {
|
||||
return Err(anyhow!("VFIO control node missing: {}", vfio_ctl.display()));
|
||||
}
|
||||
|
||||
let devnode = group_devnode;
|
||||
let mut devices = discover_group_devices(gid)?;
|
||||
if devices.is_empty() {
|
||||
return Err(anyhow!("IOMMU group {} contains no PCI devices", gid));
|
||||
}
|
||||
|
||||
// Populate per-device VFIO cdevs (required for IOMMUFD backend)
|
||||
for d in devices.iter_mut() {
|
||||
if let DeviceAddress::Pci(bdf) = &d.addr {
|
||||
d.vfio_cdev = discover_vfio_cdev_for_pci(&bdf.to_string(), gid);
|
||||
}
|
||||
}
|
||||
|
||||
let labels = build_group_labels(&devices);
|
||||
let is_viable = validate_group_basic(&devices);
|
||||
let primary_device = select_primary_device(&devices);
|
||||
|
||||
let group = VfioGroup {
|
||||
group_id: gid,
|
||||
devnode: devnode.clone(),
|
||||
vfio_ctl: vfio_ctl.clone(),
|
||||
devices: devices.clone(),
|
||||
primary: primary_device.clone(),
|
||||
labels: labels.clone(),
|
||||
is_viable,
|
||||
vfio_cdevs: devices
|
||||
.iter()
|
||||
.filter_map(|d| d.vfio_cdev.as_ref().map(|c| c.devnode.clone()))
|
||||
.collect(),
|
||||
};
|
||||
|
||||
// Construct IOMMUFD backend context (Best-effort discovery)
|
||||
let iommufd_backend = {
|
||||
let iommu_dev = PathBuf::from(DEV_IOMMU);
|
||||
if is_char_dev(&iommu_dev) {
|
||||
let mut cdevs: Vec<VfioCdev> =
|
||||
devices.iter().filter_map(|d| d.vfio_cdev.clone()).collect();
|
||||
cdevs.sort_by(|a, b| a.devnode.cmp(&b.devnode));
|
||||
cdevs.dedup_by(|a, b| a.devnode == b.devnode);
|
||||
if !cdevs.is_empty() {
|
||||
Some(VfioIommufdBackend {
|
||||
iommufd_dev: iommu_dev,
|
||||
cdevs,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
let health = if is_viable && devnode.exists() && is_char_dev(&devnode) {
|
||||
Health::Healthy
|
||||
} else {
|
||||
Health::Unhealthy
|
||||
};
|
||||
|
||||
Ok(VfioDevice {
|
||||
id: format!("vfio-group-{}", gid),
|
||||
device_type: VfioDeviceType::Normal,
|
||||
bus_mode: VfioBusMode::Pci,
|
||||
iommu_group: Some(group),
|
||||
iommu_group_id: Some(gid),
|
||||
iommufd: iommufd_backend,
|
||||
devices,
|
||||
primary: primary_device,
|
||||
labels,
|
||||
health,
|
||||
})
|
||||
}
|
||||
|
||||
/// Resolves an IOMMUFD-style VFIO device cdev (/dev/vfio/devices/vfioX)
|
||||
/// back to its PCI BDF and IOMMU group ID.
|
||||
#[allow(dead_code)]
|
||||
pub fn vfio_cdev_to_bdf_and_group(vfio_cdev: impl AsRef<Path>) -> Result<(String, u32)> {
|
||||
let vfio_cdev = vfio_cdev.as_ref();
|
||||
|
||||
let (major, minor) = major_minor_from_char_device(vfio_cdev).context(format!(
|
||||
"Failed to get major/minor for {}",
|
||||
vfio_cdev.display()
|
||||
))?;
|
||||
|
||||
// Map char device to its sysfs entry
|
||||
let sys_dev_char = PathBuf::from(format!("/sys/dev/char/{major}:{minor}"));
|
||||
let resolved = fs::canonicalize(&sys_dev_char)
|
||||
.context(format!("failed to canonicalize {}", sys_dev_char.display()))?;
|
||||
|
||||
// Parse the sysfs path to find the associated PCI device
|
||||
let bdf = extract_last_pci_bdf(&resolved)
|
||||
.context(format!("no PCI BDF found in path {}", resolved.display()))?;
|
||||
|
||||
// Get IOMMU group, with a fallback to manual path scanning if the symlink is missing
|
||||
let group_id = iommu_group_id_for_bdf(&bdf).or_else(|primary_err| {
|
||||
group_id_from_path(&resolved).map_err(|fallback_err| {
|
||||
anyhow!(
|
||||
"failed to resolve group for BDF {bdf}: {primary_err}; fallback scan also failed: {fallback_err}"
|
||||
)
|
||||
})
|
||||
})?;
|
||||
|
||||
Ok((bdf, group_id))
|
||||
}
|
||||
|
||||
/// Extract (major, minor) from a char device node.
|
||||
/// Uses Linux's encoding macros (same logic as gnu libc major()/minor()).
|
||||
fn major_minor_from_char_device(p: &Path) -> Result<(u32, u32)> {
|
||||
let md = fs::metadata(p).context(format!("stat failed for {}", p.display()))?;
|
||||
if !md.file_type().is_char_device() {
|
||||
return Err(anyhow!("{} is not a character device", p.display()));
|
||||
}
|
||||
|
||||
let rdev = md.rdev();
|
||||
Ok((linux_major(rdev), linux_minor(rdev)))
|
||||
}
|
||||
|
||||
/// Decodes the major number from a Linux dev_t (glibc-compatible encoding).
#[inline]
fn linux_major(dev: u64) -> u32 {
    // The low 12 major bits live at bit 8; the extended bits live at bit 32.
    let low = (dev >> 8) & 0xfff;
    let high = (dev >> 32) & 0xfffff000;
    (low | high) as u32
}
|
||||
|
||||
/// Decodes the minor number from a Linux dev_t (glibc-compatible encoding).
#[inline]
fn linux_minor(dev: u64) -> u32 {
    // The low 8 minor bits sit at bit 0; the extended bits sit at bit 12.
    let low = dev & 0xff;
    let high = (dev >> 12) & 0xfffff00;
    (low | high) as u32
}
|
||||
|
||||
/// Extracts the final PCI BDF in a sysfs path string.
|
||||
/// Handles nested bridge paths like: .../pci0000:00/0000:00:01.0/0000:01:00.0/vfio-dev/...
|
||||
fn extract_last_pci_bdf(p: &Path) -> Result<String> {
|
||||
static RE: once_cell::sync::Lazy<Regex> = once_cell::sync::Lazy::new(|| {
|
||||
Regex::new(r"(?i)\b[0-9a-f]{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-7]\b").unwrap()
|
||||
});
|
||||
|
||||
let s = p.to_string_lossy();
|
||||
RE.find_iter(&s)
|
||||
.last()
|
||||
.map(|m| m.as_str().to_owned())
|
||||
.ok_or_else(|| anyhow!("no PCI BDF found in path: {}", s))
|
||||
}
|
||||
|
||||
/// Resolve iommu group id from `/sys/bus/pci/devices/<BDF>/iommu_group`.
|
||||
fn iommu_group_id_for_bdf(bdf: &str) -> Result<u32> {
|
||||
let iommu_link = PathBuf::from(format!("/sys/bus/pci/devices/{bdf}/iommu_group"));
|
||||
let target = fs::read_link(&iommu_link).context("failed to read iommu_group symlink")?;
|
||||
|
||||
target
|
||||
.file_name()
|
||||
.ok_or_else(|| anyhow!("link target {} invalid", target.display()))?
|
||||
.to_string_lossy()
|
||||
.parse::<u32>()
|
||||
.context("failed to parse group ID from filename")
|
||||
}
|
||||
|
||||
fn group_id_from_path(p: &Path) -> Result<u32> {
|
||||
static RE: once_cell::sync::Lazy<Regex> =
|
||||
once_cell::sync::Lazy::new(|| Regex::new(r"/iommu_groups/(\d+)(/|$)").unwrap());
|
||||
|
||||
let s = p.to_string_lossy();
|
||||
let caps = RE
|
||||
.captures(&s)
|
||||
.ok_or_else(|| anyhow!("no iommu_groups component in path"))?;
|
||||
|
||||
caps.get(1)
|
||||
.unwrap()
|
||||
.as_str()
|
||||
.parse::<u32>()
|
||||
.context("parse group id")
|
||||
}
|
||||
|
||||
/// Returns true when `host_path` names a legacy VFIO group node, i.e. exactly
/// `/dev/vfio/<N>` for a decimal group number (trailing slashes tolerated).
#[allow(dead_code)]
pub fn is_dev_vfio_group_path(host_path: &str) -> bool {
    const PREFIX: &str = "/dev/vfio/";

    host_path
        .trim_end_matches('/')
        .strip_prefix(PREFIX)
        .map_or(false, |group| {
            // Valid only when the remainder is non-empty and all digits.
            !group.is_empty() && group.chars().all(|c| c.is_ascii_digit())
        })
}
|
||||
@@ -0,0 +1,326 @@
|
||||
// Copyright (c) 2026 Ant Group
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use async_trait::async_trait;
|
||||
use kata_sys_util::pcilibs::get_bars_max_addressable_memory;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::Mutex;
|
||||
|
||||
use crate::device::pci_path::PciPath;
|
||||
use crate::device::topology::{PCIePort, PCIeTopology};
|
||||
use crate::device::util::{do_decrease_count, do_increase_count};
|
||||
use crate::device::{Device, DeviceType, PCIeDevice};
|
||||
use crate::vfio_device::core::{VfioDevice, discover_vfio_device, discover_vfio_group_device};
|
||||
use crate::Hypervisor;
|
||||
|
||||
/// Identifies a specific port on a PCI bus: (bus_name, bus_slot, port_id),
/// where bus_name = rp<port_id> for root ports.
pub type BusPortId = (String, u32, u32);

/// Static, hypervisor-independent description of a VFIO passthrough device,
/// as assembled from configuration / CDI before runtime discovery.
#[derive(Debug, Default, Clone)]
pub struct VfioDeviceBase {
    /// Host device path, typically /dev/vfio/N (legacy group node)
    pub host_path: String,

    /// Primary PCI Bus-Device-Function (BDF) address
    pub host_bdf: String,

    /// All BDFs belonging to the same logical device or IOMMU group
    pub host_bdfs: Vec<String>,

    /// The (bus_name, bus_slot, port_id) triple the device is attached to
    /// (see [`BusPortId`])
    pub bus_port_id: BusPortId,

    /// Specifies the PCIe port type (e.g., Root Port, Downstream Port)
    pub port: PCIePort,

    /// Character device node for the IOMMU group (/dev/vfio/X)
    pub iommu_group_devnode: PathBuf,

    /// Character device node for the specific VFIO device
    /// (/dev/vfio/devices/vfioX); only available on iommufd-capable hosts
    pub iommu_device_node: Option<PathBuf>,

    /// The guest-side PCI path representing the device's BDF address in the VM
    pub guest_pci_path: Option<PciPath>,

    /// Device classification: "block" or "char"
    pub dev_type: String,

    /// Underlying bus architecture: "pci" or "ccw"
    pub bus_type: String,

    /// Represents the device's path as it appears inside the VM guest,
    /// independent of the host container's mount namespace.
    /// format: Option<(device_index, path_name)>
    pub virt_path: Option<(u64, String)>,

    /// Prefix used for host device identification. Examples:
    /// - Physical Endpoint: "physical_nic_"
    /// - Mediated Device: "vfio_mdev_"
    /// - PCI Passthrough: "vfio_device_"
    /// - VFIO Volume: "vfio_vol_"
    /// - VFIO NVMe: "vfio_nvme_"
    pub hostdev_prefix: String,
}
|
||||
|
||||
/// Runtime state of a modern (iommufd-aware) VFIO device: the discovered
/// device context plus allocation and attachment bookkeeping.
#[derive(Debug, Default, Clone)]
pub struct VfioDeviceModern {
    /// Unique device id within the device manager
    pub device_id: String,
    /// Discovered VFIO device context (group, cdevs, health, ...)
    pub device: VfioDevice,
    /// The static configuration this device was built from
    pub config: VfioDeviceBase,

    /// Configuration options passed to the vfio-pci handler in kata-agent
    pub device_options: Vec<String>,

    /// Indicates if the host device has been allocated to a specific guest
    pub is_allocated: bool,

    /// Reference count for active attachments
    pub attach_count: u64,

    /// Maximum addressable memory reserved for MMIO BARs
    pub memory_reserve: u64,

    /// Maximum addressable memory reserved for 64-bit prefetchable BARs
    pub pref64_reserve: u64,
}
|
||||
|
||||
/// Path used for [`discover_vfio_group_device`] when `iommu_device_node` is unset.
|
||||
/// CDI cold-plug often only fills `host_path`; `iommu_group_devnode` may still be empty until
|
||||
/// device_manager copies `host_path` — treat those as the same node.
|
||||
fn vfio_modern_group_discovery_path(base: &VfioDeviceBase) -> PathBuf {
|
||||
if !base.iommu_group_devnode.as_os_str().is_empty() {
|
||||
base.iommu_group_devnode.clone()
|
||||
} else {
|
||||
PathBuf::from(base.host_path.trim())
|
||||
}
|
||||
}
|
||||
|
||||
impl VfioDeviceModern {
|
||||
pub fn new(device_id: String, base: &VfioDeviceBase) -> Result<Self> {
|
||||
// For modern VFIO devices, we require the specific device cdev path to be provided in the configuration.
|
||||
// This allows us to directly discover the device context without needing to resolve group devices.
|
||||
// If the device node is not provided, we can optionally fallback to group device discovery,
|
||||
// but this is less efficient and may not be supported in all environments.
|
||||
let device = if let Some(ref node) = base.iommu_device_node {
|
||||
if !node.as_os_str().is_empty() {
|
||||
discover_vfio_device(node)?
|
||||
} else {
|
||||
discover_vfio_group_device(vfio_modern_group_discovery_path(base))?
|
||||
}
|
||||
} else {
|
||||
discover_vfio_group_device(vfio_modern_group_discovery_path(base))?
|
||||
};
|
||||
let (memory_reserve, pref64_reserve) = get_bars_max_addressable_memory();
|
||||
|
||||
Ok(Self {
|
||||
device_id,
|
||||
device,
|
||||
config: base.clone(),
|
||||
device_options: Vec::new(),
|
||||
is_allocated: false,
|
||||
attach_count: 0,
|
||||
memory_reserve,
|
||||
pref64_reserve,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Thread-safe handle for managing modern VFIO devices using asynchronous locking.
///
/// Cloning the handle is cheap (an `Arc` refcount bump); all clones share the
/// same mutex-guarded device state.
#[derive(Clone, Debug)]
pub struct VfioDeviceModernHandle {
    /// Shared, mutex-guarded device state.
    pub inner: Arc<Mutex<VfioDeviceModern>>,
}
|
||||
|
||||
impl VfioDeviceModernHandle {
    /// Builds the underlying [`VfioDeviceModern`] (running device discovery)
    /// and wraps it in a shared handle.
    pub fn new(device_id: String, base: &VfioDeviceBase) -> Result<Self> {
        let vfio_device = VfioDeviceModern::new(device_id, base)?;
        Ok(Self {
            inner: Arc::new(Mutex::new(vfio_device)),
        })
    }

    /// Returns a clone of the shared inner state (refcount bump only).
    pub fn arc(&self) -> Arc<Mutex<VfioDeviceModern>> {
        self.inner.clone()
    }

    /// Scoped read access: Executes a closure within the device lock.
    pub async fn with<R>(&self, f: impl FnOnce(&VfioDeviceModern) -> R) -> R {
        let guard = self.inner.lock().await;
        f(&guard)
    }

    /// Scoped write access: Executes a mutating closure within the device lock.
    pub async fn with_mut<R>(&self, f: impl FnOnce(&mut VfioDeviceModern) -> R) -> R {
        let mut guard = self.inner.lock().await;
        f(&mut guard)
    }

    /// Returns the device id (clones under the lock).
    pub async fn device_id(&self) -> String {
        self.inner.lock().await.device_id.clone()
    }

    /// Returns a point-in-time copy of the static configuration.
    pub async fn vfio_config(&self) -> VfioDeviceBase {
        self.inner.lock().await.config.clone()
    }

    /// Returns a point-in-time copy of the discovered device context.
    pub async fn vfio_device(&self) -> VfioDevice {
        self.inner.lock().await.device.clone()
    }

    /// Returns the current attach reference count.
    pub async fn attach_count(&self) -> u64 {
        self.inner.lock().await.attach_count
    }

    /// Marks the device as allocated (or not) to a guest.
    pub async fn set_allocated(&self, allocated: bool) {
        self.inner.lock().await.is_allocated = allocated;
    }

    /// Replaces the stored configuration wholesale.
    pub async fn update_config(&self, cfg: VfioDeviceBase) {
        self.inner.lock().await.config = cfg;
    }
}
|
||||
|
||||
#[async_trait]
|
||||
impl Device for VfioDeviceModernHandle {
|
||||
/// Attaches the VFIO device to the hypervisor and registers it in the PCIe topology.
|
||||
async fn attach(
|
||||
&mut self,
|
||||
pcie_topo: &mut Option<&mut PCIeTopology>,
|
||||
h: &dyn Hypervisor,
|
||||
) -> Result<()> {
|
||||
// Check if device is already attached
|
||||
if self
|
||||
.increase_attach_count()
|
||||
.await
|
||||
.context("failed to increase attach count")?
|
||||
{
|
||||
warn!(
|
||||
sl!(),
|
||||
"The device {:?} is already attached; multi-attach is not allowed.",
|
||||
self.device_id().await
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Register the device in the virtual PCIe topology if provided
|
||||
match pcie_topo {
|
||||
Some(topo) => self.register(topo).await?,
|
||||
None => return Ok(()),
|
||||
}
|
||||
|
||||
// Request Hypervisor to perform the actual hardware passthrough
|
||||
if let Err(e) = h.add_device(DeviceType::VfioModern(self.arc())).await {
|
||||
error!(sl!(), "failed to attach vfio device: {:?}", e);
|
||||
|
||||
// Rollback state on failure
|
||||
self.decrease_attach_count().await?;
|
||||
if let Some(topo) = pcie_topo {
|
||||
self.unregister(topo).await?;
|
||||
}
|
||||
return Err(e);
|
||||
}
|
||||
warn!(sl!(), "The device {:?} is already attached", self.arc());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Detaches the VFIO device from the hypervisor and releases topology resources.
|
||||
async fn detach(
|
||||
&mut self,
|
||||
pcie_topo: &mut Option<&mut PCIeTopology>,
|
||||
h: &dyn Hypervisor,
|
||||
) -> Result<Option<u64>> {
|
||||
// Only proceed with detachment if reference count reaches zero
|
||||
if self
|
||||
.decrease_attach_count()
|
||||
.await
|
||||
.context("failed to decrease attach count")?
|
||||
{
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
if let Err(e) = h
|
||||
.remove_device(DeviceType::VfioModern(self.inner.clone()))
|
||||
.await
|
||||
{
|
||||
// Rollback: increment count if hypervisor fails to remove the device
|
||||
self.increase_attach_count().await?;
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
// Retrieve device index if a virtual path exists
|
||||
let virt = self.with(|d| d.config.virt_path.clone()).await;
|
||||
let device_index = virt.map(|(idx, _)| idx);
|
||||
|
||||
// Unregister from PCIe topology
|
||||
if let Some(topo) = pcie_topo {
|
||||
self.unregister(topo).await?;
|
||||
}
|
||||
|
||||
Ok(device_index)
|
||||
}
|
||||
|
||||
async fn update(&mut self, _h: &dyn Hypervisor) -> Result<()> {
|
||||
// Updates are typically not required for VFIO passthrough devices
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn increase_attach_count(&mut self) -> Result<bool> {
|
||||
let mut guard = self.inner.lock().await;
|
||||
do_increase_count(&mut guard.attach_count)
|
||||
}
|
||||
|
||||
async fn decrease_attach_count(&mut self) -> Result<bool> {
|
||||
let mut guard = self.inner.lock().await;
|
||||
do_decrease_count(&mut guard.attach_count)
|
||||
}
|
||||
|
||||
async fn get_device_info(&self) -> DeviceType {
|
||||
DeviceType::VfioModern(self.arc())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl PCIeDevice for VfioDeviceModernHandle {
|
||||
/// Reserves a bus and port in the PCIe topology for this device.
|
||||
async fn register(&mut self, topo: &mut PCIeTopology) -> Result<()> {
|
||||
let device_id = self.device_id().await;
|
||||
let port_type = self.with(|d| d.config.port).await;
|
||||
|
||||
// Reserve the bus based on the specified port type
|
||||
let bus_port_id = match topo.reserve_bus_for_device(&device_id, port_type)? {
|
||||
Some(id) => id,
|
||||
None => return Err(anyhow::anyhow!("can not get bus port")),
|
||||
};
|
||||
|
||||
self.with_mut(|d| {
|
||||
// Update the configuration with the assigned bus/port
|
||||
d.config.bus_port_id = bus_port_id;
|
||||
d.is_allocated = true;
|
||||
// // Clear static guest PCI path; use dynamically assigned path after hotplug
|
||||
// d.config.guest_pci_path = None;
|
||||
})
|
||||
.await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Releases the reserved PCIe resources and resets attachment state.
|
||||
async fn unregister(&mut self, topo: &mut PCIeTopology) -> Result<()> {
|
||||
let device_id = self.device_id().await;
|
||||
topo.release_bus_for_device(&device_id)?;
|
||||
|
||||
self.with_mut(|d| {
|
||||
d.is_allocated = false;
|
||||
d.config.bus_port_id.0.clear();
|
||||
d.config.guest_pci_path = None;
|
||||
})
|
||||
.await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,72 @@
|
||||
// Copyright (c) 2026 Ant Group
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
mod core;
|
||||
mod device;
|
||||
|
||||
pub use core::{discover_vfio_group_device, VfioDevice};
|
||||
pub use device::VfioDeviceBase;
|
||||
pub use device::VfioDeviceModern;
|
||||
pub use device::VfioDeviceModernHandle;
|
||||
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::Result;
|
||||
|
||||
const DEV_VFIO_CTL: &str = "/dev/vfio/vfio";
|
||||
const DEV_IOMMU: &str = "/dev/iommu";
|
||||
const DEV_VFIO_DEVICES: &str = "/dev/vfio/devices";
|
||||
|
||||
/// Which VFIO userspace API flavour should drive the device.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VfioBackendChoice {
    /// legacy VFIO group/container: /dev/vfio/vfio + /dev/vfio/<group>
    LegacyGroup,
    /// iommufd backend: /dev/iommu + /dev/vfio/devices/vfioX
    Iommufd,
}

/// Snapshot of the VFIO-related device nodes present on this host,
/// as probed by [`detect_vfio_host_caps`].
#[derive(Debug, Default, Clone)]
pub struct VfioHostCaps {
    pub has_vfio_ctl: bool, // /dev/vfio/vfio exists
    pub has_iommufd: bool, // /dev/iommu exists
    pub has_vfio_cdev: bool, // /dev/vfio/devices exists and contains vfio*
}
|
||||
|
||||
pub fn detect_vfio_host_caps() -> VfioHostCaps {
|
||||
let has_vfio_ctl = Path::new(DEV_VFIO_CTL).exists();
|
||||
let has_iommufd = Path::new(DEV_IOMMU).exists();
|
||||
|
||||
let has_vfio_cdev = match fs::read_dir(DEV_VFIO_DEVICES) {
|
||||
Ok(rd) => rd
|
||||
.flatten()
|
||||
.any(|e| e.file_name().to_string_lossy().starts_with("vfio")),
|
||||
Err(_) => false,
|
||||
};
|
||||
|
||||
VfioHostCaps {
|
||||
has_vfio_ctl,
|
||||
has_iommufd,
|
||||
has_vfio_cdev,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn choose_vfio_backend(caps: &VfioHostCaps) -> Result<VfioBackendChoice> {
|
||||
// Prefer iommufd when fully supported
|
||||
if caps.has_iommufd && caps.has_vfio_cdev {
|
||||
return Ok(VfioBackendChoice::Iommufd);
|
||||
}
|
||||
|
||||
// Fallback to legacy VFIO container/group
|
||||
if caps.has_vfio_ctl {
|
||||
return Ok(VfioBackendChoice::LegacyGroup);
|
||||
}
|
||||
|
||||
Err(anyhow::anyhow!(
|
||||
"No usable VFIO backend: caps={:?}. Need (/dev/iommu + /dev/vfio/devices/vfio*) \
|
||||
for iommufd, or /dev/vfio/vfio for legacy.",
|
||||
caps
|
||||
))
|
||||
}
|
||||
@@ -149,11 +149,7 @@ impl Device for BlockDevice {
|
||||
}
|
||||
|
||||
match h.add_device(DeviceType::Block(self.clone())).await {
|
||||
Ok(dev) => {
|
||||
// Update device info with the one received from device attach
|
||||
if let DeviceType::Block(blk) = dev {
|
||||
self.config = blk.config;
|
||||
}
|
||||
Ok(_dev) => {
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
|
||||
@@ -0,0 +1,210 @@
|
||||
// Copyright (c) 2022-2023 Alibaba Cloud
|
||||
// Copyright (c) 2022-2023 Ant Group
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::Mutex;
|
||||
|
||||
use crate::device::pci_path::PciPath;
|
||||
use crate::device::topology::PCIeTopology;
|
||||
use crate::device::util::do_decrease_count;
|
||||
use crate::device::util::do_increase_count;
|
||||
use crate::device::Device;
|
||||
use crate::device::DeviceType;
|
||||
use crate::Hypervisor as hypervisor;
|
||||
use anyhow::{Context, Result};
|
||||
use async_trait::async_trait;
|
||||
|
||||
/// Asynchronous I/O engine used by the block device backend.
#[derive(Clone, Copy, Debug, Default)]
pub enum BlockDeviceAio {
    /// IoUring is the Linux io_uring I/O implementation (the default).
    #[default]
    IoUring,

    /// Native is the native Linux AIO implementation.
    Native,

    /// Threads is the pthread asynchronous I/O implementation.
    Threads,
}
|
||||
|
||||
impl BlockDeviceAio {
|
||||
pub fn new(aio: &str) -> Self {
|
||||
match aio {
|
||||
"native" => BlockDeviceAio::Native,
|
||||
"threads" => BlockDeviceAio::Threads,
|
||||
_ => BlockDeviceAio::IoUring,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for BlockDeviceAio {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let to_string = match *self {
|
||||
BlockDeviceAio::Native => "native".to_string(),
|
||||
BlockDeviceAio::Threads => "threads".to_string(),
|
||||
_ => "iouring".to_string(),
|
||||
};
|
||||
write!(f, "{to_string}")
|
||||
}
|
||||
}
|
||||
|
||||
/// Full configuration for a modern block device: host path, guest placement,
/// and virtio tuning knobs.
#[derive(Debug, Clone, Default)]
pub struct BlockConfigModern {
    /// Path of the drive.
    pub path_on_host: String,

    /// If set to true, the drive is opened in read-only mode. Otherwise, the
    /// drive is opened as read-write.
    pub is_readonly: bool,

    /// Don't close `path_on_host` file when dropping the device.
    pub no_drop: bool,

    /// Specifies cache-related options for block devices.
    /// Denotes whether use of O_DIRECT (bypass the host page cache) is enabled.
    /// If not set, use configuration block_device_cache_direct.
    pub is_direct: Option<bool>,

    /// device index
    pub index: u64,

    /// blkdev_aio defines the type of asynchronous I/O the block device should use.
    pub blkdev_aio: BlockDeviceAio,

    /// driver type for block device
    pub driver_option: String,

    /// device path in guest
    pub virt_path: String,

    /// pci path is the slot at which the drive is attached
    pub pci_path: Option<PciPath>,

    /// scsi_addr of the block device, in case the device is attached using SCSI driver
    /// scsi_addr is of the format SCSI-Id:LUN
    pub scsi_addr: Option<String>,

    /// device attach count
    /// NOTE(review): BlockDeviceModern carries its own attach_count, which is
    /// the one the Device impl mutates — confirm whether this field is still used.
    pub attach_count: u64,

    /// device major number
    pub major: i64,

    /// device minor number
    pub minor: i64,

    /// virtio queue size. size: byte
    pub queue_size: u32,

    /// block device multi-queue
    pub num_queues: usize,
}
|
||||
|
||||
/// Runtime state of a modern block device: id, attach refcount, and config.
#[derive(Debug, Clone, Default)]
pub struct BlockDeviceModern {
    /// Unique device id within the device manager
    pub device_id: String,
    /// Reference count for active attachments (the one the Device impl mutates)
    pub attach_count: u64,
    /// Full block-device configuration
    pub config: BlockConfigModern,
}

/// Thread-safe, cheaply-cloneable handle around [`BlockDeviceModern`];
/// all clones share one mutex-guarded state.
#[derive(Debug, Clone)]
pub struct BlockDeviceModernHandle {
    // Shared state; access via the async accessors on the handle.
    inner: Arc<Mutex<BlockDeviceModern>>,
}
|
||||
|
||||
impl BlockDeviceModernHandle {
    /// Wraps a fresh `BlockDeviceModern` (attach count 0) in a shared handle.
    pub fn new(device_id: String, config: BlockConfigModern) -> Self {
        Self {
            inner: Arc::new(Mutex::new(BlockDeviceModern {
                device_id,
                attach_count: 0,
                config,
            })),
        }
    }

    /// Returns a clone of the shared inner state (refcount bump only).
    pub fn arc(&self) -> Arc<Mutex<BlockDeviceModern>> {
        self.inner.clone()
    }

    /// Returns a point-in-time copy of the device configuration.
    pub async fn snapshot_config(&self) -> BlockConfigModern {
        self.inner.lock().await.config.clone()
    }

    /// Returns the device id (clones under the lock).
    pub async fn device_id(&self) -> String {
        self.inner.lock().await.device_id.clone()
    }

    /// Returns the current attach reference count.
    pub async fn attach_count(&self) -> u64 {
        self.inner.lock().await.attach_count
    }
}
|
||||
|
||||
#[async_trait]
|
||||
impl Device for BlockDeviceModernHandle {
|
||||
async fn attach(
|
||||
&mut self,
|
||||
_pcie_topo: &mut Option<&mut PCIeTopology>,
|
||||
h: &dyn hypervisor,
|
||||
) -> Result<()> {
|
||||
// increase attach count, skip attach the device if the device is already attached
|
||||
if self
|
||||
.increase_attach_count()
|
||||
.await
|
||||
.context("failed to increase attach count")?
|
||||
{
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if let Err(e) = h.add_device(DeviceType::BlockModern(self.arc())).await {
|
||||
error!(sl!(), "failed to attach vfio device: {:?}", e);
|
||||
self.decrease_attach_count().await?;
|
||||
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn detach(
|
||||
&mut self,
|
||||
_pcie_topo: &mut Option<&mut PCIeTopology>,
|
||||
h: &dyn hypervisor,
|
||||
) -> Result<Option<u64>> {
|
||||
// get the count of device detached, skip detach once it reaches the 0
|
||||
if self
|
||||
.decrease_attach_count()
|
||||
.await
|
||||
.context("failed to decrease attach count")?
|
||||
{
|
||||
return Ok(None);
|
||||
}
|
||||
if let Err(e) = h.remove_device(DeviceType::BlockModern(self.arc())).await {
|
||||
self.increase_attach_count().await?;
|
||||
return Err(e);
|
||||
}
|
||||
Ok(Some(self.snapshot_config().await.index))
|
||||
}
|
||||
|
||||
async fn update(&mut self, _h: &dyn hypervisor) -> Result<()> {
|
||||
// There's no need to do update for virtio-blk
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn get_device_info(&self) -> DeviceType {
|
||||
DeviceType::BlockModern(self.inner.clone())
|
||||
}
|
||||
|
||||
async fn increase_attach_count(&mut self) -> Result<bool> {
|
||||
let mut guard = self.inner.lock().await;
|
||||
do_increase_count(&mut guard.attach_count)
|
||||
}
|
||||
|
||||
async fn decrease_attach_count(&mut self) -> Result<bool> {
|
||||
let mut guard = self.inner.lock().await;
|
||||
do_decrease_count(&mut guard.attach_count)
|
||||
}
|
||||
}
|
||||
@@ -5,16 +5,19 @@
|
||||
//
|
||||
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::device::driver::vhost_user_blk::VhostUserBlkDevice;
|
||||
use crate::vfio_device::{VfioDeviceBase, VfioDeviceModern};
|
||||
use crate::{
|
||||
BlockConfig, BlockDevice, HybridVsockConfig, HybridVsockDevice, Hypervisor as hypervisor,
|
||||
NetworkConfig, NetworkDevice, PCIePortDevice, PortDeviceConfig, ProtectionDevice,
|
||||
ProtectionDeviceConfig, ShareFsConfig, ShareFsDevice, VfioConfig, VfioDevice, VhostUserConfig,
|
||||
VhostUserNetDevice, VsockConfig, VsockDevice,
|
||||
BlockConfig, BlockConfigModern, BlockDevice, BlockDeviceModern, HybridVsockConfig,
|
||||
HybridVsockDevice, Hypervisor as hypervisor, NetworkConfig, NetworkDevice, PCIePortDevice,
|
||||
PortDeviceConfig, ProtectionDevice, ProtectionDeviceConfig, ShareFsConfig, ShareFsDevice,
|
||||
VfioConfig, VfioDevice, VhostUserConfig, VhostUserNetDevice, VsockConfig, VsockDevice,
|
||||
};
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use tokio::sync::Mutex;
|
||||
|
||||
use self::topology::PCIeTopology;
|
||||
|
||||
@@ -29,11 +32,13 @@ pub mod util;
|
||||
#[derive(Debug)]
|
||||
pub enum DeviceConfig {
|
||||
BlockCfg(BlockConfig),
|
||||
BlockCfgModern(BlockConfigModern),
|
||||
VhostUserBlkCfg(VhostUserConfig),
|
||||
NetworkCfg(NetworkConfig),
|
||||
VhostUserNetworkCfg(VhostUserConfig),
|
||||
ShareFsCfg(ShareFsConfig),
|
||||
VfioCfg(VfioConfig),
|
||||
VfioModernCfg(VfioDeviceBase),
|
||||
VsockCfg(VsockConfig),
|
||||
HybridVsockCfg(HybridVsockConfig),
|
||||
ProtectionDevCfg(ProtectionDeviceConfig),
|
||||
@@ -43,8 +48,10 @@ pub enum DeviceConfig {
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum DeviceType {
|
||||
Block(BlockDevice),
|
||||
BlockModern(Arc<Mutex<BlockDeviceModern>>),
|
||||
VhostUserBlk(VhostUserBlkDevice),
|
||||
Vfio(VfioDevice),
|
||||
VfioModern(Arc<Mutex<VfioDeviceModern>>),
|
||||
Network(NetworkDevice),
|
||||
VhostUserNetwork(VhostUserNetDevice),
|
||||
ShareFs(ShareFsDevice),
|
||||
|
||||
@@ -227,7 +227,7 @@ impl std::fmt::Display for PCIePortBusPrefix {
|
||||
/// PCIePort distinguishes between different types of PCIe ports.
|
||||
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash)]
|
||||
pub enum PCIePort {
|
||||
// NoPort is for disabling VFIO hotplug/coldplug
|
||||
// No root-port/switch, VFIO does not occupy ports
|
||||
#[default]
|
||||
NoPort,
|
||||
|
||||
@@ -327,15 +327,19 @@ pub enum AvailableNode {
|
||||
|
||||
/// In-memory model of the guest's PCIe topology (root complex, root ports,
/// switches) plus bookkeeping of which ports have been handed to devices.
#[derive(Clone, Debug, Default)]
pub struct PCIeTopology {
    pub mode: PCIePort, // only one mode exists
    pub hypervisor_name: String,
    pub root_complex: PCIeRootComplex,

    /// Number of PCI bridges to create
    pub bridges: u32,
    /// Whether devices are cold-plugged (attached before VM start)
    pub cold_plug: bool,
    pub pcie_root_ports: u32,
    pub pcie_switch_ports: u32,
    pub hotplug_vfio_on_root_bus: bool,
    // pcie_port_devices keeps track of the devices attached to different types of PCI ports.
    pub pcie_port_devices: HashMap<u32, TopologyPortDevice>,
    // Reservations made by reserve_bus_for_device: device_id -> (bus, bus_slot, port_id)
    pub reserved_bus: HashMap<String, (String, u32, u32)>,
}
|
||||
|
||||
impl PCIeTopology {
|
||||
@@ -354,15 +358,172 @@ impl PCIeTopology {
|
||||
let total_rp = topo_config.device_info.pcie_root_port;
|
||||
let total_swp = topo_config.device_info.pcie_switch_port;
|
||||
|
||||
Some(Self {
|
||||
let mode = match (total_rp, total_swp) {
|
||||
(0, 0) => PCIePort::NoPort,
|
||||
(r, 0) if r > 0 => PCIePort::RootPort,
|
||||
(r, s) if r > 0 && s > 0 => PCIePort::SwitchPort,
|
||||
(0, s) if s > 0 => {
|
||||
// Cannot attach switch without rootport
|
||||
// Here you can choose to return None or error directly; since new() returns Option, None is safer
|
||||
return None;
|
||||
}
|
||||
_ => return None,
|
||||
};
|
||||
|
||||
let mut topo = Self {
|
||||
hypervisor_name: topo_config.hypervisor_name.to_owned(),
|
||||
root_complex,
|
||||
bridges: topo_config.device_info.default_bridges,
|
||||
cold_plug: true,
|
||||
pcie_root_ports: total_rp,
|
||||
pcie_switch_ports: total_swp,
|
||||
hotplug_vfio_on_root_bus: topo_config.device_info.hotplug_vfio_on_root_bus,
|
||||
pcie_port_devices: HashMap::new(),
|
||||
})
|
||||
mode,
|
||||
reserved_bus: HashMap::new(),
|
||||
};
|
||||
|
||||
// Initialize port structures (only in RootPort/SwitchPort mode)
|
||||
if mode != PCIePort::NoPort {
|
||||
// First create rootports
|
||||
let _ = topo.add_root_ports_on_bus(total_rp);
|
||||
// Then create switch based on mode
|
||||
if mode == PCIePort::SwitchPort {
|
||||
// Your existing strategy method can be reused:
|
||||
let _ = topo.add_switch_ports_with_strategy(1, total_swp, Strategy::SingleRootPort);
|
||||
}
|
||||
}
|
||||
|
||||
Some(topo)
|
||||
}
|
||||
|
||||
/// Reserves a bus/slot/port triple for `device_id`, claiming (and, in
/// RootPort mode, creating on demand) a free port. Idempotent: a repeated
/// call for the same device returns the existing reservation.
///
/// Returns `Ok(None)` when hotplug ports are disabled (`NoPort`).
pub fn reserve_bus_for_device(
    &mut self,
    device_id: &str,
    mode: PCIePort,
) -> Result<Option<(String, u32, u32)>> {
    // Idempotency: hand back the reservation made earlier for this id.
    if let Some(bus) = self.reserved_bus.get(device_id) {
        return Ok(Some(bus.clone()));
    }

    let bus_port_id = match mode {
        PCIePort::NoPort => return Ok(None),
        PCIePort::RootPort => {
            let rp = self
                .find_free_root_port()
                .ok_or_else(|| anyhow!("no free root port"))?;
            // Slot/port numbers are offset from the port id (id + 9 / id + 2),
            // presumably to keep low slots free for other virtio PCI devices —
            // TODO(review): confirm the exact reservation scheme.
            (rp.port_id(), rp.id + 9, rp.id + 2)
        }
        PCIePort::SwitchPort => {
            let dp = self
                .find_free_switch_down_port()
                .ok_or_else(|| anyhow!("no free switch downstream port"))?;
            (dp.port_id(), dp.id + 9, dp.id + 2)
        }
    };

    self.reserved_bus
        .insert(device_id.to_string(), bus_port_id.clone());

    Ok(Some(bus_port_id))
}
||||
|
||||
pub fn release_bus_for_device(&mut self, device_id: &str) -> Result<()> {
|
||||
let bus = match self.reserved_bus.remove(device_id) {
|
||||
Some(b) => b,
|
||||
None => return Ok(()),
|
||||
};
|
||||
|
||||
match self.mode {
|
||||
PCIePort::NoPort => Ok(()),
|
||||
PCIePort::RootPort => {
|
||||
self.release_root_port(&bus.0);
|
||||
Ok(())
|
||||
}
|
||||
PCIePort::SwitchPort => {
|
||||
self.release_switch_down_port(&bus.0);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Finds (or, when cold-plug is enabled, creates) an unallocated root port,
/// marks it allocated, and returns a clone of the claimed port.
fn find_free_root_port(&mut self) -> Option<TopologyPortDevice> {
    // 1. First, try to find an existing unallocated port.
    // We sort the IDs to ensure we use them in sequential order (0, 1, 2...).
    let mut ids: Vec<u32> = self.pcie_port_devices.keys().cloned().collect();
    ids.sort();

    for id in ids {
        if let Some(rp) = self.pcie_port_devices.get_mut(&id) {
            if !rp.allocated {
                rp.allocated = true;
                return Some(rp.clone());
            }
        }
    }

    // 2. If no free port is found and cold_plug is enabled, add a new root port.
    if self.cold_plug {
        // Determine the next ID: if empty, start from 0; otherwise, max_id + 1.
        let next_id = self
            .pcie_port_devices
            .keys()
            .max()
            .map(|&id| id + 1)
            .unwrap_or(0);

        let new_port = TopologyPortDevice {
            id: next_id,
            bus: "pcie.0".to_string(), // Root ports are attached to pcie.0
            allocated: true,           // Mark as allocated immediately
            connected_switch: None,
        };

        // Store the newly created port into the map
        self.pcie_port_devices.insert(next_id, new_port.clone());

        return Some(new_port);
    }

    // No ports available and cannot create new ones
    None
}
|
||||
|
||||
fn find_free_switch_down_port(&mut self) -> Option<SwitchDownPort> {
|
||||
for rp in self.pcie_port_devices.values_mut() {
|
||||
if let Some(sw) = rp.connected_switch.as_mut() {
|
||||
for dp in sw.switch_ports.values_mut() {
|
||||
if !dp.allocated {
|
||||
dp.allocated = true;
|
||||
return Some(dp.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
fn release_root_port(&mut self, bus: &str) {
|
||||
if let Some(id) = bus.strip_prefix("rp").and_then(|s| s.parse::<u32>().ok()) {
|
||||
if let Some(rp) = self.pcie_port_devices.get_mut(&id) {
|
||||
rp.allocated = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn release_switch_down_port(&mut self, bus: &str) {
|
||||
if let Some(id) = bus.strip_prefix("swdp").and_then(|s| s.parse::<u32>().ok()) {
|
||||
for rp in self.pcie_port_devices.values_mut() {
|
||||
if let Some(sw) = rp.connected_switch.as_mut() {
|
||||
if let Some(dp) = sw.switch_ports.get_mut(&id) {
|
||||
dp.allocated = false;
|
||||
dp.connected_device = None;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert_device(&mut self, ep: &mut PCIeEndpoint) -> Option<PciPath> {
|
||||
@@ -566,10 +727,15 @@ impl PCIeTopology {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Adds a root port to pcie bus
|
||||
/// Adds root ports `0..num_root_ports` on the root bus. Skips IDs that already exist so this
|
||||
/// is safe when [`PCIeTopology::new`] already created the same ports from TOML and
|
||||
/// [`crate::device::driver::port_device::PCIePortDevice`] attaches again (same pattern as
|
||||
/// [`Self::add_switch_ports_single_root_port`]).
|
||||
pub fn add_root_ports_on_bus(&mut self, num_root_ports: u32) -> Result<()> {
|
||||
for index in 0..num_root_ports {
|
||||
self.add_pcie_root_port(index)?;
|
||||
if !self.pcie_port_devices.contains_key(&index) {
|
||||
self.add_pcie_root_port(index)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -826,8 +992,9 @@ mod tests {
|
||||
assert!(topology.add_root_ports_on_bus(3).is_ok());
|
||||
assert_eq!(topology.pcie_port_devices.len(), 3);
|
||||
|
||||
// Adding more root ports than available should fail
|
||||
assert!(topology.add_root_ports_on_bus(1).is_err());
|
||||
// Idempotent: matches PCIeTopology::new pre-seeding + PortDevice::attach calling again.
|
||||
assert!(topology.add_root_ports_on_bus(3).is_ok());
|
||||
assert_eq!(topology.pcie_port_devices.len(), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -38,7 +38,7 @@ const VIRTIO_FS: &str = "virtio-fs";
|
||||
const INLINE_VIRTIO_FS: &str = "inline-virtio-fs";
|
||||
|
||||
impl DragonballInner {
|
||||
pub(crate) async fn add_device(&mut self, device: DeviceType) -> Result<DeviceType> {
|
||||
pub(crate) async fn add_device(&mut self, device: DeviceType) -> Result<()> {
|
||||
if self.state == VmmState::NotReady {
|
||||
info!(sl!(), "VMM not ready, queueing device {}", device);
|
||||
|
||||
@@ -46,7 +46,7 @@ impl DragonballInner {
|
||||
// start_vm would pop the devices in an right order
|
||||
// to add the devices.
|
||||
self.pending_devices.insert(0, device.clone());
|
||||
return Ok(device);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
info!(sl!(), "dragonball add device {:?}", &device);
|
||||
@@ -54,13 +54,10 @@ impl DragonballInner {
|
||||
DeviceType::Network(network) => {
|
||||
self.add_net_device(&network.config)
|
||||
.context("add net device")?;
|
||||
Ok(DeviceType::Network(network))
|
||||
}
|
||||
DeviceType::Vfio(mut hostdev) => {
|
||||
self.add_vfio_device(&mut hostdev)
|
||||
.context("add vfio device")?;
|
||||
|
||||
Ok(DeviceType::Vfio(hostdev))
|
||||
}
|
||||
DeviceType::Block(mut block) => {
|
||||
let use_pci_bus = if block.config.driver_option == KATA_BLK_DEV_TYPE {
|
||||
@@ -85,8 +82,6 @@ impl DragonballInner {
|
||||
block.config.pci_path = Some(PciPath::try_from(slot as u32)?);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(DeviceType::Block(block))
|
||||
}
|
||||
DeviceType::VhostUserBlk(block) => {
|
||||
self.add_block_device(
|
||||
@@ -98,24 +93,21 @@ impl DragonballInner {
|
||||
None,
|
||||
)
|
||||
.context("add vhost user based block device")?;
|
||||
Ok(DeviceType::VhostUserBlk(block))
|
||||
}
|
||||
DeviceType::HybridVsock(hvsock) => {
|
||||
self.add_hvsock(&hvsock.config).context("add vsock")?;
|
||||
Ok(DeviceType::HybridVsock(hvsock))
|
||||
}
|
||||
DeviceType::ShareFs(sharefs) => {
|
||||
self.add_share_fs_device(&sharefs.config)
|
||||
.context("add share fs device")?;
|
||||
Ok(DeviceType::ShareFs(sharefs))
|
||||
}
|
||||
DeviceType::VhostUserNetwork(dev) => {
|
||||
self.add_vhost_user_net_device(&dev.config)
|
||||
.context("add vhost-user-net device")?;
|
||||
Ok(DeviceType::VhostUserNetwork(dev))
|
||||
}
|
||||
DeviceType::Vsock(_) | DeviceType::Protection(_) | DeviceType::PortDevice(_) => todo!(),
|
||||
_ => todo!(),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn remove_device(&mut self, device: DeviceType) -> Result<()> {
|
||||
|
||||
@@ -142,7 +142,7 @@ impl Hypervisor for Dragonball {
|
||||
inner.resize_vcpu(old_vcpus, new_vcpus).await
|
||||
}
|
||||
|
||||
async fn add_device(&self, device: DeviceType) -> Result<DeviceType> {
|
||||
async fn add_device(&self, device: DeviceType) -> Result<()> {
|
||||
let mut inner = self.inner.write().await;
|
||||
inner.add_device(device.clone()).await
|
||||
}
|
||||
|
||||
@@ -110,12 +110,9 @@ impl Hypervisor for Firecracker {
|
||||
inner.save_vm().await
|
||||
}
|
||||
|
||||
async fn add_device(&self, device: DeviceType) -> Result<DeviceType> {
|
||||
async fn add_device(&self, device: DeviceType) -> Result<()> {
|
||||
let mut inner = self.inner.write().await;
|
||||
match inner.add_device(device.clone()).await {
|
||||
Ok(_) => Ok(device),
|
||||
Err(err) => Err(err),
|
||||
}
|
||||
inner.add_device(device.clone()).await
|
||||
}
|
||||
|
||||
async fn remove_device(&self, device: DeviceType) -> Result<()> {
|
||||
|
||||
@@ -115,7 +115,7 @@ pub trait Hypervisor: std::fmt::Debug + Send + Sync {
|
||||
async fn resize_memory(&self, new_mem_mb: u32) -> Result<(u32, MemoryConfig)>;
|
||||
|
||||
// device manager
|
||||
async fn add_device(&self, device: DeviceType) -> Result<DeviceType>;
|
||||
async fn add_device(&self, device: DeviceType) -> Result<()>;
|
||||
async fn remove_device(&self, device: DeviceType) -> Result<()>;
|
||||
async fn update_device(&self, device: DeviceType) -> Result<()>;
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -5,7 +5,9 @@
|
||||
|
||||
use super::cmdline_generator::{get_network_device, QemuCmdLine};
|
||||
use super::qmp::Qmp;
|
||||
use crate::device::pci_path::PciPath;
|
||||
use crate::device::topology::PCIePort;
|
||||
use crate::qemu::cmdline_generator::VfioDeviceConfig;
|
||||
use crate::qemu::qmp::get_qmp_socket_path;
|
||||
use crate::{
|
||||
device::driver::ProtectionDeviceConfig, hypervisor_persist::HypervisorState, selinux,
|
||||
@@ -21,6 +23,7 @@ use crate::utils::{
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use async_trait::async_trait;
|
||||
use kata_sys_util::netns::NetnsGuard;
|
||||
use kata_sys_util::pcilibs::calc_fw_cfg_mmio64_mb;
|
||||
use kata_types::build_path;
|
||||
use kata_types::config::hypervisor::{RootlessUser, VIRTIO_BLK_CCW};
|
||||
use kata_types::rootless::is_rootless;
|
||||
@@ -32,7 +35,7 @@ use nix::unistd::{setgid, setuid, Gid, Uid};
|
||||
use persist::sandbox_persist::Persist;
|
||||
use qapi_qmp::MigrationStatus;
|
||||
use std::cmp::Ordering;
|
||||
use std::convert::TryInto;
|
||||
use std::convert::{TryFrom, TryInto};
|
||||
use std::path::Path;
|
||||
use std::process::Stdio;
|
||||
use std::time::Duration;
|
||||
@@ -208,6 +211,60 @@ impl QemuInner {
|
||||
_ => info!(sl!(), "no need to add {} ports", port_type),
|
||||
}
|
||||
}
|
||||
DeviceType::VfioModern(vfio_dev) => {
|
||||
// To avoid holding the lock for too long, we first snapshot the necessary VFIO parameters,
|
||||
// then release the lock before doing the coldplug via cmdline,
|
||||
// and finally re-acquire the lock to update the guest PCI path after coldplug.
|
||||
let (devices, memory_reserve, _pref64_reserve, bus_port_id) = {
|
||||
let vfio_device = vfio_dev.lock().await;
|
||||
let devices = vfio_device
|
||||
.device
|
||||
.iommu_group
|
||||
.as_ref()
|
||||
.map(|g| g.clone().devices)
|
||||
.unwrap_or_default();
|
||||
|
||||
(
|
||||
devices,
|
||||
vfio_device.memory_reserve,
|
||||
vfio_device.pref64_reserve,
|
||||
vfio_device.config.bus_port_id.clone(),
|
||||
)
|
||||
};
|
||||
|
||||
// Cold plug devices
|
||||
for dev in devices.iter() {
|
||||
let host_bdf = dev.addr.to_string();
|
||||
let fw_cfg_mmio64_mb = calc_fw_cfg_mmio64_mb(&host_bdf);
|
||||
let pref64_reserve = fw_cfg_mmio64_mb * 1024 * 1024;
|
||||
|
||||
let vfio_cfg = VfioDeviceConfig::new(
|
||||
host_bdf,
|
||||
bus_port_id.1 as u16,
|
||||
bus_port_id.1 + 1,
|
||||
)
|
||||
.with_vfio_bus(bus_port_id.0.clone())
|
||||
.with_fw_cfg_mmio_size(fw_cfg_mmio64_mb);
|
||||
|
||||
cmdline.add_pcie_vfio_device(vfio_cfg, memory_reserve, pref64_reserve)?;
|
||||
}
|
||||
|
||||
// Write back with lock
|
||||
let pci_path = PciPath::try_from(format!("{:02x}/00", bus_port_id.1).as_str())?;
|
||||
|
||||
{
|
||||
let mut vfio_device = vfio_dev.lock().await;
|
||||
// Update the guest PCI path for the VFIO device after coldplug,
|
||||
// which will be used for device mapping into from Guest to Container Environment.
|
||||
vfio_device.config.guest_pci_path = Some(pci_path.clone());
|
||||
}
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"Completed VFIOModern coldplug with returned guest pci path: {:?}",
|
||||
pci_path
|
||||
);
|
||||
}
|
||||
_ => info!(sl!(), "qemu cmdline: unsupported device: {:?}", device),
|
||||
}
|
||||
}
|
||||
@@ -813,17 +870,17 @@ use crate::device::DeviceType;
|
||||
|
||||
// device manager part of Hypervisor
|
||||
impl QemuInner {
|
||||
pub(crate) async fn add_device(&mut self, mut device: DeviceType) -> Result<DeviceType> {
|
||||
pub(crate) async fn add_device(&mut self, device: DeviceType) -> Result<()> {
|
||||
info!(sl!(), "QemuInner::add_device() {}", device);
|
||||
let is_qemu_ready_to_hotplug = self.qmp.is_some();
|
||||
if is_qemu_ready_to_hotplug {
|
||||
// hypervisor is running already
|
||||
device = self.hotplug_device(device)?;
|
||||
self.hotplug_device(device).await?;
|
||||
} else {
|
||||
// store the device to coldplug it later, on hypervisor launch
|
||||
self.devices.push(device.clone());
|
||||
}
|
||||
Ok(device)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn remove_device(&mut self, device: DeviceType) -> Result<()> {
|
||||
@@ -834,7 +891,7 @@ impl QemuInner {
|
||||
))
|
||||
}
|
||||
|
||||
fn hotplug_device(&mut self, device: DeviceType) -> Result<DeviceType> {
|
||||
async fn hotplug_device(&mut self, device: DeviceType) -> Result<()> {
|
||||
let qmp = match self.qmp {
|
||||
Some(ref mut qmp) => qmp,
|
||||
None => return Err(anyhow!("QMP not initialized")),
|
||||
@@ -879,8 +936,6 @@ impl QemuInner {
|
||||
block_device.config.scsi_addr = Some(addr);
|
||||
}
|
||||
}
|
||||
|
||||
return Ok(DeviceType::Block(block_device));
|
||||
}
|
||||
DeviceType::Vfio(mut vfiodev) => {
|
||||
// FIXME: the first one might not the true device we want to passthrough.
|
||||
@@ -899,12 +954,115 @@ impl QemuInner {
|
||||
&vfiodev.driver_type,
|
||||
&vfiodev.bus,
|
||||
)?;
|
||||
|
||||
return Ok(DeviceType::Vfio(vfiodev));
|
||||
}
|
||||
_ => info!(sl!(), "hotplugging of {:#?} is unsupported", device),
|
||||
DeviceType::BlockModern(block_device) => {
|
||||
info!(sl!(), "Starting QMP hotplug for BlockModern device");
|
||||
|
||||
// Fisrt, snapshot parameters within the lock.
|
||||
// Do not hold the lock across the 'await' point of the hotplug operation to avoid blocking.
|
||||
let (index, path_on_host, aio, is_direct, is_readonly, no_drop, driver) = {
|
||||
let cfg = &block_device.lock().await.config;
|
||||
(
|
||||
cfg.index,
|
||||
cfg.path_on_host.clone(),
|
||||
cfg.blkdev_aio.to_string(),
|
||||
cfg.is_direct,
|
||||
cfg.is_readonly,
|
||||
cfg.no_drop,
|
||||
self.config.blockdev_info.block_device_driver.clone(),
|
||||
)
|
||||
};
|
||||
|
||||
// Second, execute the asynchronous hotplug without holding the lock.
|
||||
let (pci_path, scsi_addr) = qmp
|
||||
.hotplug_block_device(
|
||||
&driver,
|
||||
index,
|
||||
&path_on_host,
|
||||
&aio,
|
||||
is_direct,
|
||||
is_readonly,
|
||||
no_drop,
|
||||
)
|
||||
.context("hotplug block device")?;
|
||||
|
||||
// Third, re-acquire the lock to write back results.
|
||||
{
|
||||
let mut dev = block_device.lock().await;
|
||||
let cfg = &mut dev.config;
|
||||
if let Some(p) = pci_path {
|
||||
cfg.pci_path = Some(p);
|
||||
}
|
||||
if let Some(s) = scsi_addr {
|
||||
cfg.scsi_addr = Some(s);
|
||||
}
|
||||
info!(sl!(), "Completed BlockModern hotplug: {:?}", &cfg);
|
||||
}
|
||||
}
|
||||
DeviceType::VfioModern(vfiodev) => {
|
||||
// Snapshot VFIO parameters inside the lock.
|
||||
let (hostdev_id, sysfs_path, address, driver_type, bus) = {
|
||||
let vfio_device = vfiodev.lock().await;
|
||||
let hostdev_id = vfio_device.device_id.clone();
|
||||
let device = &vfio_device.device;
|
||||
|
||||
// FIXME: The first device in the group might not be the actual device intended for passthrough.
|
||||
// Multi-function support is tracked via issue #11292.
|
||||
let primary_device = device
|
||||
.clone()
|
||||
.iommu_group
|
||||
.ok_or_else(|| anyhow!("IOMMU group missing for VFIO device"))?
|
||||
.primary;
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"QMP hotplug VFIO primary_device address: {:?}", &primary_device.addr
|
||||
);
|
||||
|
||||
let sysfs_path = primary_device.sysfs_path.display().to_string();
|
||||
let driver_type = primary_device
|
||||
.driver
|
||||
.clone()
|
||||
.ok_or_else(|| anyhow!("Driver type missing for primary device"))?;
|
||||
let address = format!("{}", primary_device.addr);
|
||||
|
||||
(
|
||||
hostdev_id,
|
||||
sysfs_path,
|
||||
address,
|
||||
driver_type,
|
||||
vfio_device.config.bus_port_id.0.clone(),
|
||||
)
|
||||
};
|
||||
|
||||
// Execute hotplug outside the lock.
|
||||
let guest_pci_path = qmp.hotplug_vfio_device(
|
||||
&hostdev_id,
|
||||
&sysfs_path,
|
||||
&address,
|
||||
&driver_type,
|
||||
&bus,
|
||||
)?;
|
||||
|
||||
// Write the resulting Guest PCI Path back within the lock.
|
||||
{
|
||||
let mut vfio_device = vfiodev.lock().await;
|
||||
if let Some(p) = guest_pci_path {
|
||||
// Very important to write back the guest pci path for VFIO devices.
|
||||
vfio_device.config.guest_pci_path = Some(p);
|
||||
}
|
||||
info!(
|
||||
sl!(),
|
||||
"Completed VFIOModern hotplug for device ID: {}", hostdev_id
|
||||
);
|
||||
}
|
||||
}
|
||||
_ => info!(
|
||||
sl!(),
|
||||
"Hotplugging for {:#?} is currently unsupported", device
|
||||
),
|
||||
}
|
||||
Ok(device)
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
mod cmdline_generator;
|
||||
mod inner;
|
||||
mod qmp;
|
||||
// mod extra2;
|
||||
|
||||
use crate::device::DeviceType;
|
||||
use crate::hypervisor_persist::HypervisorState;
|
||||
@@ -105,7 +106,7 @@ impl Hypervisor for Qemu {
|
||||
inner.save_vm().await
|
||||
}
|
||||
|
||||
async fn add_device(&self, device: DeviceType) -> Result<DeviceType> {
|
||||
async fn add_device(&self, device: DeviceType) -> Result<()> {
|
||||
let mut inner = self.inner.write().await;
|
||||
inner.add_device(device).await
|
||||
}
|
||||
|
||||
@@ -33,7 +33,7 @@ use std::time::Instant;
|
||||
|
||||
/// default qmp connection read timeout
|
||||
const DEFAULT_QMP_READ_TIMEOUT: u64 = 250;
|
||||
const DEFAULT_QMP_CONNECT_DEADLINE_MS: u64 = 5000;
|
||||
const DEFAULT_QMP_CONNECT_DEADLINE_MS: u64 = 50000;
|
||||
const DEFAULT_QMP_RETRY_SLEEP_MS: u64 = 50;
|
||||
|
||||
pub struct Qmp {
|
||||
@@ -72,7 +72,7 @@ impl Qmp {
|
||||
let stream = UnixStream::connect(qmp_sock_path)?;
|
||||
|
||||
stream
|
||||
.set_read_timeout(Some(Duration::from_millis(DEFAULT_QMP_READ_TIMEOUT)))
|
||||
.set_read_timeout(Some(Duration::from_millis(5000)))
|
||||
.context("set qmp read timeout")?;
|
||||
|
||||
let mut qmp = Qmp {
|
||||
@@ -292,6 +292,63 @@ impl Qmp {
|
||||
Ok(hotplugged_mem_size)
|
||||
}
|
||||
|
||||
/// Hotplug an iommufd QOM object in QEMU and return the object id.
|
||||
#[allow(dead_code)]
|
||||
pub fn hotplug_iommufd(
|
||||
&mut self,
|
||||
suffix_or_id: &str,
|
||||
external_fdname: Option<&str>,
|
||||
) -> Result<String> {
|
||||
// Object id in QEMU (also used as fdname for getfd)
|
||||
let obj_id = if suffix_or_id.starts_with("iommufd") {
|
||||
suffix_or_id.to_string()
|
||||
} else {
|
||||
format!("iommufd{suffix_or_id}")
|
||||
};
|
||||
|
||||
{
|
||||
let file = std::fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.open("/dev/iommu")
|
||||
.context("open /dev/iommu failed")?;
|
||||
self.pass_fd(file.as_raw_fd(), &obj_id)?;
|
||||
}
|
||||
|
||||
let obj = match external_fdname {
|
||||
None => {
|
||||
qmp::object_add(qapi_qmp::ObjectOptions::iommufd {
|
||||
id: obj_id.clone(),
|
||||
iommufd: qapi_qmp::IOMMUFDProperties { fd: None },
|
||||
})
|
||||
}
|
||||
Some(_fdname) => {
|
||||
qmp::object_add(qapi_qmp::ObjectOptions::iommufd {
|
||||
id: obj_id.clone(),
|
||||
iommufd: qapi_qmp::IOMMUFDProperties {
|
||||
fd: Some(obj_id.to_string()),
|
||||
},
|
||||
})
|
||||
}
|
||||
};
|
||||
|
||||
match self.qmp.execute(&obj) {
|
||||
Ok(_) => Ok(obj_id),
|
||||
Err(e) => {
|
||||
let msg = format!("{e:#}");
|
||||
if msg.contains("duplicate ID")
|
||||
|| msg.contains("already exists")
|
||||
|| msg.contains("exists")
|
||||
{
|
||||
Ok(obj_id)
|
||||
} else {
|
||||
Err(anyhow!(e))
|
||||
.with_context(|| format!("object-add iommufd failed (id={obj_id})"))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn hotplug_memory(&mut self, size: u64) -> Result<()> {
|
||||
let memdev_idx = self
|
||||
.qmp
|
||||
|
||||
@@ -246,9 +246,9 @@ impl RemoteInner {
|
||||
todo!()
|
||||
}
|
||||
|
||||
pub(crate) async fn add_device(&self, device: DeviceType) -> Result<DeviceType> {
|
||||
pub(crate) async fn add_device(&self, _device: DeviceType) -> Result<()> {
|
||||
warn!(sl!(), "RemoteInner::add_device(): NOT YET IMPLEMENTED");
|
||||
Ok(device)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn remove_device(&self, _device: DeviceType) -> Result<()> {
|
||||
|
||||
@@ -86,7 +86,7 @@ impl Hypervisor for Remote {
|
||||
inner.save_vm().await
|
||||
}
|
||||
|
||||
async fn add_device(&self, device: DeviceType) -> Result<DeviceType> {
|
||||
async fn add_device(&self, device: DeviceType) -> Result<()> {
|
||||
let inner = self.inner.write().await;
|
||||
inner.add_device(device).await
|
||||
}
|
||||
|
||||
@@ -10,14 +10,14 @@ use agent::types::Device;
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct DeviceInfo {
|
||||
pub class_id: String,
|
||||
pub vendor_id: String,
|
||||
pub host_path: PathBuf,
|
||||
}
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct ContainerDevice {
|
||||
pub device_info: Option<DeviceInfo>,
|
||||
pub device: Device,
|
||||
|
||||
@@ -18,7 +18,8 @@ mod manager_inner;
|
||||
pub mod network;
|
||||
pub mod resource_persist;
|
||||
use hypervisor::{
|
||||
BlockConfig, HybridVsockConfig, PortDeviceConfig, ProtectionDeviceConfig, VsockConfig,
|
||||
vfio_device::VfioDeviceBase, BlockConfig, HybridVsockConfig, PortDeviceConfig,
|
||||
ProtectionDeviceConfig, VsockConfig,
|
||||
};
|
||||
use network::NetworkConfig;
|
||||
pub mod rootfs;
|
||||
@@ -39,6 +40,7 @@ pub enum ResourceConfig {
|
||||
HybridVsock(HybridVsockConfig),
|
||||
Vsock(VsockConfig),
|
||||
Protection(ProtectionDeviceConfig),
|
||||
VfioDeviceModern(VfioDeviceBase),
|
||||
PortDevice(PortDeviceConfig),
|
||||
InitData(BlockConfig),
|
||||
}
|
||||
|
||||
@@ -211,6 +211,11 @@ impl ResourceManagerInner {
|
||||
.await
|
||||
.context("do handle initdata block device failed.")?;
|
||||
}
|
||||
ResourceConfig::VfioDeviceModern(vfiobase) => {
|
||||
do_handle_device(&self.device_manager, &DeviceConfig::VfioModernCfg(vfiobase))
|
||||
.await
|
||||
.context("do handle vfio device failed.")?;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@@ -481,50 +486,118 @@ impl ResourceManagerInner {
|
||||
.await
|
||||
.context("do handle device")?;
|
||||
|
||||
// vfio mode: vfio-pci and vfio-pci-gk for x86_64
|
||||
// - vfio-pci, devices appear as VFIO character devices under /dev/vfio in container.
|
||||
// - vfio-pci-gk, devices are managed by whatever driver in Guest kernel.
|
||||
// - vfio-ap, devices appear as VFIO character devices under /dev/vfio in container for ccw devices.
|
||||
let vfio_mode = match self.toml_config.runtime.vfio_mode.as_str() {
|
||||
"vfio" => {
|
||||
if bus_type == "ccw" {
|
||||
"vfio-ap".to_string()
|
||||
} else {
|
||||
"vfio-pci".to_string()
|
||||
}
|
||||
}
|
||||
_ => "vfio-pci-gk".to_string(),
|
||||
};
|
||||
if let DeviceType::VfioModern(vfio_dev) = device_info.clone() {
|
||||
info!(sl!(), "device info: {:?}", vfio_dev.lock().await);
|
||||
let vfio_device = vfio_dev.lock().await;
|
||||
let guest_pci_path = vfio_device
|
||||
.config
|
||||
.guest_pci_path
|
||||
.clone()
|
||||
.unwrap_or_default();
|
||||
let host_bdf = vfio_device.device.primary.addr.to_string();
|
||||
info!(
|
||||
sl!(),
|
||||
"vfio device guest pci path: {:?}, host bdf: {:?}",
|
||||
guest_pci_path,
|
||||
&host_bdf
|
||||
);
|
||||
|
||||
// create agent device
|
||||
if let DeviceType::Vfio(device) = device_info {
|
||||
let device_options = sort_options_by_pcipath(device.device_options);
|
||||
// vfio mode: vfio-pci and vfio-pci-gk for x86_64
|
||||
// - vfio-pci, devices appear as VFIO character devices under /dev/vfio in container.
|
||||
// - vfio-pci-gk, devices are managed by whatever driver in Guest kernel.
|
||||
// - vfio-ap, devices appear as VFIO character devices under /dev/vfio in container for ccw devices.
|
||||
let vfio_mode = match self.toml_config.runtime.vfio_mode.as_str() {
|
||||
"vfio" => {
|
||||
if bus_type == "ccw" {
|
||||
"vfio-ap".to_string()
|
||||
} else {
|
||||
"vfio-pci".to_string()
|
||||
}
|
||||
}
|
||||
_ => "vfio-pci-gk".to_string(),
|
||||
};
|
||||
let device_options = vec![format!("{}={}", host_bdf, guest_pci_path)];
|
||||
let agent_device = Device {
|
||||
id: device.device_id, // just for kata-agent
|
||||
id: vfio_device.clone().device_id, // just for kata-agent
|
||||
container_path: d.path().display().to_string().clone(),
|
||||
field_type: vfio_mode,
|
||||
options: device_options,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let device_info = if let Some(device_vendor_class) =
|
||||
&device.devices.first().unwrap().device_vendor_class
|
||||
{
|
||||
let vendor_class = device_vendor_class
|
||||
.get_vendor_class_id()
|
||||
.context("get vendor class failed")?;
|
||||
Some(DeviceInfo {
|
||||
vendor_id: vendor_class.0.to_owned(),
|
||||
class_id: vendor_class.1.to_owned(),
|
||||
host_path: d.path().clone(),
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let device_info = Some(DeviceInfo {
|
||||
vendor_id: vfio_device
|
||||
.device
|
||||
.primary
|
||||
.vendor_id
|
||||
.clone()
|
||||
.unwrap_or_default(),
|
||||
class_id: format!(
|
||||
"{:#08x}",
|
||||
vfio_device.device.primary.class_code.unwrap_or_default()
|
||||
),
|
||||
host_path: d.path().clone(),
|
||||
});
|
||||
info!(
|
||||
sl!(),
|
||||
"vfio device info for agent: {:?}",
|
||||
device_info.clone()
|
||||
);
|
||||
info!(
|
||||
sl!(),
|
||||
"agent device info for agent: {:?}",
|
||||
agent_device.clone()
|
||||
);
|
||||
devices.push(ContainerDevice {
|
||||
device_info,
|
||||
device: agent_device,
|
||||
});
|
||||
} else {
|
||||
// vfio mode: vfio-pci and vfio-pci-gk for x86_64
|
||||
// - vfio-pci, devices appear as VFIO character devices under /dev/vfio in container.
|
||||
// - vfio-pci-gk, devices are managed by whatever driver in Guest kernel.
|
||||
// - vfio-ap, devices appear as VFIO character devices under /dev/vfio in container for ccw devices.
|
||||
let vfio_mode = match self.toml_config.runtime.vfio_mode.as_str() {
|
||||
"vfio" => {
|
||||
if bus_type == "ccw" {
|
||||
"vfio-ap".to_string()
|
||||
} else {
|
||||
"vfio-pci".to_string()
|
||||
}
|
||||
}
|
||||
_ => "vfio-pci-gk".to_string(),
|
||||
};
|
||||
|
||||
// create agent device
|
||||
if let DeviceType::Vfio(device) = device_info {
|
||||
let device_options = sort_options_by_pcipath(device.device_options);
|
||||
let agent_device = Device {
|
||||
id: device.device_id, // just for kata-agent
|
||||
container_path: d.path().display().to_string().clone(),
|
||||
field_type: vfio_mode,
|
||||
options: device_options,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let device_info = if let Some(device_vendor_class) =
|
||||
&device.devices.first().unwrap().device_vendor_class
|
||||
{
|
||||
let vendor_class = device_vendor_class
|
||||
.get_vendor_class_id()
|
||||
.context("get vendor class failed")?;
|
||||
Some(DeviceInfo {
|
||||
vendor_id: vendor_class.0.to_owned(),
|
||||
class_id: vendor_class.1.to_owned(),
|
||||
host_path: d.path().clone(),
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
devices.push(ContainerDevice {
|
||||
device_info,
|
||||
device: agent_device,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
|
||||
@@ -11,7 +11,7 @@ use hypervisor::{
|
||||
device_manager::{do_handle_device, get_block_device_info, DeviceManager},
|
||||
DeviceConfig,
|
||||
},
|
||||
BlockConfig, BlockDeviceAio,
|
||||
BlockConfigModern,
|
||||
};
|
||||
use kata_types::mount::DirectVolumeMountInfo;
|
||||
use nix::sys::{stat, stat::SFlag};
|
||||
@@ -58,17 +58,17 @@ impl RawblockVolume {
|
||||
));
|
||||
}
|
||||
|
||||
let block_config = BlockConfig {
|
||||
let block_config = BlockConfigModern {
|
||||
path_on_host: mount_info.device.clone(),
|
||||
driver_option: blkdev_info.block_device_driver,
|
||||
blkdev_aio: BlockDeviceAio::new(&blkdev_info.block_device_aio),
|
||||
// blkdev_aio: BlockDeviceAio::new(&blkdev_info.block_device_aio),
|
||||
num_queues: blkdev_info.num_queues,
|
||||
queue_size: blkdev_info.queue_size,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// create and insert block device into Kata VM
|
||||
let device_info = do_handle_device(d, &DeviceConfig::BlockCfg(block_config.clone()))
|
||||
let device_info = do_handle_device(d, &DeviceConfig::BlockCfgModern(block_config.clone()))
|
||||
.await
|
||||
.context("do handle device failed.")?;
|
||||
|
||||
|
||||
@@ -86,6 +86,32 @@ pub async fn handle_block_volume(
|
||||
// BlockVolume.
|
||||
// safe here, device_info is correct and only unwrap it.
|
||||
let mut device_id = String::new();
|
||||
|
||||
if let DeviceType::BlockModern(device_mod) = device_info.clone() {
|
||||
let device = &device_mod.lock().await;
|
||||
let blk_driver = device.config.driver_option.clone();
|
||||
// blk, mmioblk
|
||||
storage.driver = blk_driver.clone();
|
||||
storage.source = match blk_driver.as_str() {
|
||||
KATA_BLK_DEV_TYPE => {
|
||||
if let Some(pci_path) = &device.config.pci_path {
|
||||
pci_path.to_string()
|
||||
} else {
|
||||
return Err(anyhow!("block driver is blk but no pci path exists"));
|
||||
}
|
||||
}
|
||||
KATA_SCSI_DEV_TYPE => {
|
||||
if let Some(scsi_addr) = &device.config.scsi_addr {
|
||||
scsi_addr.to_string()
|
||||
} else {
|
||||
return Err(anyhow!("block driver is scsi but no scsi address exists"));
|
||||
}
|
||||
}
|
||||
_ => device.config.virt_path.clone(),
|
||||
};
|
||||
device_id = device.device_id.clone();
|
||||
}
|
||||
|
||||
if let DeviceType::Block(device) = device_info {
|
||||
let blk_driver = device.config.driver_option;
|
||||
// blk, mmioblk
|
||||
|
||||
@@ -41,6 +41,7 @@ kata-types = { workspace = true }
|
||||
protocols = { workspace = true }
|
||||
protobuf = { workspace = true }
|
||||
kata-sys-util = { workspace = true }
|
||||
pod-resources-rs = { workspace = true }
|
||||
logging = { workspace = true }
|
||||
runtime-spec = { workspace = true }
|
||||
shim-interface = { workspace = true }
|
||||
|
||||
@@ -8,6 +8,7 @@ license = { workspace = true }
|
||||
[dependencies]
|
||||
anyhow = { workspace = true }
|
||||
async-trait = { workspace = true }
|
||||
container-device-interface = "0.1.2"
|
||||
awaitgroup = "0.6.0"
|
||||
containerd-shim-protos = { workspace = true }
|
||||
libc = { workspace = true }
|
||||
@@ -30,6 +31,7 @@ agent = { workspace = true }
|
||||
common = { workspace = true }
|
||||
hypervisor = { workspace = true, features = ["cloud-hypervisor"] }
|
||||
kata-sys-util = { workspace = true }
|
||||
pod-resources-rs = { workspace = true }
|
||||
kata-types = { workspace = true }
|
||||
logging = { workspace = true }
|
||||
runtime-spec = { workspace = true }
|
||||
|
||||
@@ -24,7 +24,7 @@ use common::{
|
||||
};
|
||||
|
||||
use containerd_shim_protos::events::task::{TaskExit, TaskOOM};
|
||||
use hypervisor::VsockConfig;
|
||||
use hypervisor::device::topology::PCIePort;
|
||||
use hypervisor::HYPERVISOR_FIRECRACKER;
|
||||
use hypervisor::HYPERVISOR_REMOTE;
|
||||
#[cfg(feature = "dragonball")]
|
||||
@@ -37,6 +37,7 @@ use hypervisor::{
|
||||
use hypervisor::{BlockConfig, Hypervisor};
|
||||
use hypervisor::{BlockDeviceAio, PortDeviceConfig};
|
||||
use hypervisor::{ProtectionDeviceConfig, SevSnpConfig, TdxConfig};
|
||||
use hypervisor::{VfioDeviceBase, VsockConfig};
|
||||
use kata_sys_util::hooks::HookStates;
|
||||
use kata_sys_util::protection::{available_guest_protection, GuestProtection};
|
||||
use kata_sys_util::spec::load_oci_spec;
|
||||
@@ -47,6 +48,8 @@ use kata_types::config::{hypervisor::Factory, TomlConfig};
|
||||
use kata_types::initdata::{calculate_initdata_digest, ProtectedPlatform};
|
||||
use oci_spec::runtime as oci;
|
||||
use persist::{self, sandbox_persist::Persist};
|
||||
use container_device_interface::specs::config::DeviceNode as CdiSpecDeviceNode;
|
||||
use pod_resources_rs::handle_cdi_devices;
|
||||
use protobuf::SpecialFields;
|
||||
use resource::coco_data::initdata::{
|
||||
kata_shared_init_data_path, InitDataConfig, KATA_INIT_DATA_IMAGE,
|
||||
@@ -56,14 +59,26 @@ use resource::manager::ManagerArgs;
|
||||
use resource::network::{dan_config_path, DanNetworkConfig, NetworkConfig, NetworkWithNetNsConfig};
|
||||
use resource::{ResourceConfig, ResourceManager};
|
||||
use runtime_spec as spec;
|
||||
use std::path::Path;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use strum::Display;
|
||||
use tokio::sync::{mpsc::Sender, Mutex, RwLock};
|
||||
use tokio::time;
|
||||
use tracing::instrument;
|
||||
|
||||
pub(crate) const VIRTCONTAINER: &str = "virt_container";
|
||||
|
||||
/// Host path for CDI device nodes (`hostPath` when set, else `path`). Needed because
|
||||
/// `container_device_interface::specs::config::DeviceNode` fields are crate-private.
|
||||
fn cdi_device_node_host_path(dn: &CdiSpecDeviceNode) -> Option<String> {
|
||||
serde_json::to_value(dn).ok().and_then(|v| {
|
||||
v.get("hostPath")
|
||||
.or_else(|| v.get("path"))
|
||||
.and_then(|p| p.as_str())
|
||||
.map(String::from)
|
||||
})
|
||||
}
|
||||
|
||||
pub struct SandboxRestoreArgs {
|
||||
pub sid: String,
|
||||
pub toml_config: TomlConfig,
|
||||
@@ -161,7 +176,7 @@ impl VirtSandbox {
|
||||
async fn prepare_for_start_sandbox(
|
||||
&self,
|
||||
id: &str,
|
||||
network_env: SandboxNetworkEnv,
|
||||
sandbox_config: &SandboxConfig,
|
||||
) -> Result<Vec<ResourceConfig>> {
|
||||
let mut resource_configs = vec![];
|
||||
|
||||
@@ -172,6 +187,7 @@ impl VirtSandbox {
|
||||
.context("failed to prepare vm socket config")?;
|
||||
resource_configs.push(vm_socket_config);
|
||||
|
||||
let network_env: SandboxNetworkEnv = sandbox_config.network_env.clone();
|
||||
// prepare network config
|
||||
if !network_env.network_created {
|
||||
if let Some(network_resource) = self.prepare_network_resource(&network_env).await {
|
||||
@@ -207,6 +223,17 @@ impl VirtSandbox {
|
||||
None
|
||||
};
|
||||
|
||||
let vfio_devices = self.prepare_coldplug_cdi_devices(sandbox_config).await?;
|
||||
if !vfio_devices.is_empty() {
|
||||
info!(
|
||||
sl!(),
|
||||
"prepare pod devices {vfio_devices:?} for sandbox done."
|
||||
);
|
||||
resource_configs.extend(vfio_devices);
|
||||
} else {
|
||||
info!(sl!(), "no pod devices to prepare for sandbox.");
|
||||
}
|
||||
|
||||
// prepare protection device config
|
||||
if let Some(protection_dev_config) = self
|
||||
.prepare_protection_device_config(&self.hypervisor.hypervisor_config().await, init_data)
|
||||
@@ -252,6 +279,64 @@ impl VirtSandbox {
|
||||
}
|
||||
}
|
||||
|
||||
async fn prepare_coldplug_cdi_devices(
|
||||
&self,
|
||||
sandbox_config: &SandboxConfig,
|
||||
) -> Result<Vec<ResourceConfig>> {
|
||||
let config = self.resource_manager.config().await;
|
||||
let pod_resource_socket = &config.runtime.pod_resource_api_sock;
|
||||
info!(
|
||||
sl!(),
|
||||
"sandbox pod_resource_socket: {:?}", pod_resource_socket
|
||||
);
|
||||
// If pod_resource_socket is empty, we should treat it as not support such function.
|
||||
if !Path::new(pod_resource_socket).exists() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let annotations = &sandbox_config.annotations;
|
||||
info!(sl!(), "sandbox annotations: {:?}", annotations);
|
||||
|
||||
let cdi_devices = (pod_resources_rs::pod_resources::get_pod_cdi_devices(
|
||||
pod_resource_socket,
|
||||
annotations,
|
||||
)
|
||||
.await)
|
||||
.unwrap_or_default();
|
||||
info!(sl!(), "pod cdi devices: {:?}", cdi_devices);
|
||||
|
||||
let device_nodes =
|
||||
handle_cdi_devices(&cdi_devices, time::Duration::from_secs(100))
|
||||
.await?;
|
||||
let paths: Vec<String> = device_nodes
|
||||
.iter()
|
||||
.filter_map(cdi_device_node_host_path)
|
||||
.collect();
|
||||
|
||||
// FQN: nvidia.com/gpu=X
|
||||
let mut vfio_configs = Vec::new();
|
||||
for path in paths.iter() {
|
||||
let dev_info = VfioDeviceBase {
|
||||
host_path: path.clone(),
|
||||
// CDI passes the per-device cdev (e.g. /dev/vfio/devices/vfio0); device_manager
|
||||
// also copies host_path here — set early so configs are self-consistent in logs
|
||||
// and any code path that runs before that assignment still discovers VFIO correctly.
|
||||
iommu_group_devnode: PathBuf::from(path),
|
||||
dev_type: "c".to_string(),
|
||||
// bus_type: bus_type.clone(),
|
||||
port: PCIePort::RootPort,
|
||||
hostdev_prefix: "vfio_device".to_owned(),
|
||||
..Default::default()
|
||||
};
|
||||
vfio_configs.push(dev_info);
|
||||
}
|
||||
|
||||
Ok(vfio_configs
|
||||
.into_iter()
|
||||
.map(ResourceConfig::VfioDeviceModern)
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn prepare_network_resource(
|
||||
&self,
|
||||
network_env: &SandboxNetworkEnv,
|
||||
@@ -602,9 +687,7 @@ impl Sandbox for VirtSandbox {
|
||||
|
||||
// generate device and setup before start vm
|
||||
// should after hypervisor.prepare_vm
|
||||
let resources = self
|
||||
.prepare_for_start_sandbox(id, sandbox_config.network_env.clone())
|
||||
.await?;
|
||||
let resources = self.prepare_for_start_sandbox(id, sandbox_config).await?;
|
||||
|
||||
self.resource_manager
|
||||
.prepare_before_start_vm(resources)
|
||||
@@ -786,7 +869,7 @@ impl Sandbox for VirtSandbox {
|
||||
// generate device and setup before start vm
|
||||
// should after hypervisor.prepare_vm
|
||||
let resources = self
|
||||
.prepare_for_start_sandbox(id, sandbox_config.network_env.clone())
|
||||
.prepare_for_start_sandbox(id, sandbox_config)
|
||||
.await
|
||||
.context("prepare resources before start vm")?;
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
module github.com/kata-containers/kata-containers/src/runtime
|
||||
|
||||
// Keep in sync with version in versions.yaml
|
||||
go 1.25.8
|
||||
go 1.25.9
|
||||
|
||||
// WARNING: Do NOT use `replace` directives as those break dependabot:
|
||||
// https://github.com/kata-containers/kata-containers/issues/11020
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
module kata-containers/csi-kata-directvolume
|
||||
|
||||
// Keep in sync with version in versions.yaml
|
||||
go 1.25.8
|
||||
go 1.25.9
|
||||
|
||||
// WARNING: Do NOT use `replace` directives as those break dependabot:
|
||||
// https://github.com/kata-containers/kata-containers/issues/11020
|
||||
|
||||
@@ -184,8 +184,6 @@ pub struct HypervisorInfo {
|
||||
#[serde(default)]
|
||||
virtio_fs_daemon: String,
|
||||
#[serde(default)]
|
||||
msize_9p: u32,
|
||||
#[serde(default)]
|
||||
memory_slots: u32,
|
||||
#[serde(default)]
|
||||
pcie_root_port: u32,
|
||||
@@ -417,7 +415,6 @@ pub fn get_hypervisor_info(
|
||||
.clone()
|
||||
.unwrap_or_else(|| String::from("none")),
|
||||
virtio_fs_daemon: hypervisor_config.shared_fs.virtio_fs_daemon.to_string(),
|
||||
msize_9p: hypervisor_config.shared_fs.msize_9p,
|
||||
memory_slots: hypervisor_config.memory_info.memory_slots,
|
||||
pcie_root_port: hypervisor_config.device_info.pcie_root_port,
|
||||
hotplug_vfio_on_rootbus: hypervisor_config.device_info.hotplug_vfio_on_root_bus,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
module github.com/kata-containers/kata-containers/src/tools/log-parser
|
||||
|
||||
// Keep in sync with version in versions.yaml
|
||||
go 1.25.8
|
||||
go 1.25.9
|
||||
|
||||
require (
|
||||
github.com/BurntSushi/toml v1.1.0
|
||||
|
||||
@@ -635,7 +635,7 @@ function helm_helper() {
|
||||
base_values_file="${helm_chart_dir}/try-kata-nvidia-gpu.values.yaml"
|
||||
fi
|
||||
;;
|
||||
qemu-snp|qemu-tdx|qemu-se|qemu-se-runtime-rs|qemu-cca|qemu-coco-dev|qemu-coco-dev-runtime-rs)
|
||||
qemu-snp|qemu-snp-runtime-rs|qemu-tdx|qemu-se|qemu-se-runtime-rs|qemu-cca|qemu-coco-dev|qemu-coco-dev-runtime-rs)
|
||||
# Use TEE example file
|
||||
if [[ -f "${helm_chart_dir}/try-kata-tee.values.yaml" ]]; then
|
||||
base_values_file="${helm_chart_dir}/try-kata-tee.values.yaml"
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
module github.com/kata-containers/tests
|
||||
|
||||
// Keep in sync with version in versions.yaml
|
||||
go 1.25.8
|
||||
go 1.25.9
|
||||
|
||||
// WARNING: Do NOT use `replace` directives as those break dependabot:
|
||||
// https://github.com/kata-containers/kata-containers/issues/11020
|
||||
|
||||
@@ -11,7 +11,7 @@ source "${BATS_TEST_DIRNAME}/../../common.bash"
|
||||
load "${BATS_TEST_DIRNAME}/confidential_kbs.sh"
|
||||
|
||||
SUPPORTED_GPU_TEE_HYPERVISORS=("qemu-nvidia-gpu-snp" "qemu-nvidia-gpu-tdx")
|
||||
SUPPORTED_TEE_HYPERVISORS=("qemu-snp" "qemu-tdx" "qemu-se" "qemu-se-runtime-rs" "${SUPPORTED_GPU_TEE_HYPERVISORS[@]}")
|
||||
SUPPORTED_TEE_HYPERVISORS=("qemu-snp" "qemu-snp-runtime-rs" "qemu-tdx" "qemu-se" "qemu-se-runtime-rs" "${SUPPORTED_GPU_TEE_HYPERVISORS[@]}")
|
||||
SUPPORTED_NON_TEE_HYPERVISORS=("qemu-coco-dev" "qemu-coco-dev-runtime-rs")
|
||||
|
||||
function setup_unencrypted_confidential_pod() {
|
||||
@@ -36,7 +36,7 @@ function get_remote_command_per_hypervisor() {
|
||||
qemu-se*)
|
||||
echo "cd /sys/firmware/uv; cat prot_virt_guest | grep 1"
|
||||
;;
|
||||
qemu-snp)
|
||||
qemu-snp|qemu-snp-runtime-rs)
|
||||
echo "dmesg | grep \"Memory Encryption Features active:.*SEV-SNP\""
|
||||
;;
|
||||
qemu-tdx)
|
||||
|
||||
@@ -187,7 +187,7 @@ function deploy_kata() {
|
||||
|
||||
# Workaround to avoid modifying the workflow yaml files
|
||||
case "${KATA_HYPERVISOR}" in
|
||||
qemu-tdx|qemu-snp|qemu-nvidia-gpu-*)
|
||||
qemu-tdx|qemu-snp|qemu-snp-runtime-rs|qemu-nvidia-gpu-*)
|
||||
USE_EXPERIMENTAL_SETUP_SNAPSHOTTER=true
|
||||
SNAPSHOTTER="nydus"
|
||||
EXPERIMENTAL_FORCE_GUEST_PULL=false
|
||||
@@ -447,7 +447,7 @@ function cleanup() {
|
||||
}
|
||||
|
||||
function deploy_snapshotter() {
|
||||
if [[ "${KATA_HYPERVISOR}" == "qemu-tdx" || "${KATA_HYPERVISOR}" == "qemu-snp" ]]; then
|
||||
if [[ "${KATA_HYPERVISOR}" == "qemu-tdx" || "${KATA_HYPERVISOR}" == "qemu-snp" || "${KATA_HYPERVISOR}" == "qemu-snp-runtime-rs" ]]; then
|
||||
echo "[Skip] ${SNAPSHOTTER} is pre-installed in the TEE machine"
|
||||
return
|
||||
fi
|
||||
@@ -461,7 +461,7 @@ function deploy_snapshotter() {
|
||||
}
|
||||
|
||||
function cleanup_snapshotter() {
|
||||
if [[ "${KATA_HYPERVISOR}" == "qemu-tdx" || "${KATA_HYPERVISOR}" == "qemu-snp" ]]; then
|
||||
if [[ "${KATA_HYPERVISOR}" == "qemu-tdx" || "${KATA_HYPERVISOR}" == "qemu-snp" || "${KATA_HYPERVISOR}" == "qemu-snp-runtime-rs" ]]; then
|
||||
echo "[Skip] ${SNAPSHOTTER} is pre-installed in the TEE machine"
|
||||
return
|
||||
fi
|
||||
|
||||
@@ -146,15 +146,22 @@ setup() {
|
||||
kbs_set_cpu0_resource_policy
|
||||
|
||||
# get measured artifacts from qemu command line of previous test
|
||||
# Go runtime logs: "launching <path> with: [<args>]"
|
||||
# runtime-rs logs: "qemu args: <args>"
|
||||
log_line=$(sudo journalctl -r -x -t kata | grep -m 1 'launching.*qemu.*with:' || true)
|
||||
qemu_cmd=$(echo "$log_line" | sed 's/.*with: \[\(.*\)\]".*/\1/')
|
||||
if [[ -n "$log_line" ]]; then
|
||||
qemu_cmd=$(echo "$log_line" | sed 's/.*with: \[\(.*\)\]".*/\1/')
|
||||
else
|
||||
log_line=$(sudo journalctl -r -x -t kata | grep -m 1 'qemu args:' || true)
|
||||
qemu_cmd=$(echo "$log_line" | sed 's/.*qemu args: //')
|
||||
fi
|
||||
[[ -n "$qemu_cmd" ]] || { echo "Could not find QEMU command line"; return 1; }
|
||||
|
||||
kernel_path=$(echo "$qemu_cmd" | grep -oP -- '-kernel \K[^ ]+')
|
||||
initrd_path=$(echo "$qemu_cmd" | grep -oP -- '-initrd \K[^ ]+' || true)
|
||||
firmware_path=$(echo "$qemu_cmd" | grep -oP -- '-bios \K[^ ]+')
|
||||
vcpu_count=$(echo "$qemu_cmd" | grep -oP -- '-smp \K\d+')
|
||||
append=$(echo "$qemu_cmd" | sed -n 's/.*-append \(.*\) -bios.*/\1/p')
|
||||
append=$(echo "$qemu_cmd" | grep -oP -- '-append \K.*?(?= -(smp|bios) )')
|
||||
# Remove escape backslashes for quotes from output for dm-mod.create parameters
|
||||
append="${append//\\\"/\"}"
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ setup() {
|
||||
[ "${KATA_HYPERVISOR}" == "qemu-se-runtime-rs" ] && skip "Requires CPU hotplug which isn't supported on ${KATA_HYPERVISOR} yet"
|
||||
[[ "${KATA_HYPERVISOR}" == qemu-coco-dev* ]] && skip "Requires CPU hotplug which disabled by static_sandbox_resource_mgmt"
|
||||
( [ "${KATA_HYPERVISOR}" == "qemu-tdx" ] || [ "${KATA_HYPERVISOR}" == "qemu-snp" ] || \
|
||||
[ "${KATA_HYPERVISOR}" == "qemu-se" ] ) \
|
||||
[ "${KATA_HYPERVISOR}" == "qemu-snp-runtime-rs" ] || [ "${KATA_HYPERVISOR}" == "qemu-se" ] ) \
|
||||
&& skip "TEEs do not support memory / CPU hotplug"
|
||||
|
||||
pod_name="constraints-cpu-test"
|
||||
@@ -121,7 +121,7 @@ teardown() {
|
||||
[ "${KATA_HYPERVISOR}" == "qemu-se-runtime-rs" ] && skip "Requires CPU hotplug which isn't supported on ${KATA_HYPERVISOR} yet"
|
||||
[[ "${KATA_HYPERVISOR}" == qemu-coco-dev* ]] && skip "Requires CPU hotplug which disabled by static_sandbox_resource_mgmt"
|
||||
( [ "${KATA_HYPERVISOR}" == "qemu-tdx" ] || [ "${KATA_HYPERVISOR}" == "qemu-snp" ] || \
|
||||
[ "${KATA_HYPERVISOR}" == "qemu-se" ] ) \
|
||||
[ "${KATA_HYPERVISOR}" == "qemu-snp-runtime-rs" ] || [ "${KATA_HYPERVISOR}" == "qemu-se" ] ) \
|
||||
&& skip "TEEs do not support memory / CPU hotplug"
|
||||
|
||||
# Debugging information
|
||||
|
||||
@@ -9,14 +9,18 @@ load "${BATS_TEST_DIRNAME}/../../common.bash"
|
||||
load "${BATS_TEST_DIRNAME}/lib.sh"
|
||||
load "${BATS_TEST_DIRNAME}/tests_common.sh"
|
||||
|
||||
# Currently only the Go runtime provides the config path used here.
|
||||
# If a Rust hypervisor runs this test, mirror the enabling_hypervisor
|
||||
# pattern in tests/common.bash to select the correct runtime-rs config.
|
||||
shim_config_file="/opt/kata/share/defaults/kata-containers/configuration-${KATA_HYPERVISOR}.toml"
|
||||
case "${KATA_HYPERVISOR}" in
|
||||
*-runtime-rs)
|
||||
shim_config_file="/opt/kata/share/defaults/kata-containers/runtime-rs/runtimes/${KATA_HYPERVISOR}/configuration-${KATA_HYPERVISOR}.toml"
|
||||
;;
|
||||
*)
|
||||
shim_config_file="/opt/kata/share/defaults/kata-containers/runtimes/${KATA_HYPERVISOR}/configuration-${KATA_HYPERVISOR}.toml"
|
||||
;;
|
||||
esac
|
||||
|
||||
check_and_skip() {
|
||||
case "${KATA_HYPERVISOR}" in
|
||||
qemu-tdx|qemu-coco-dev|qemu-snp)
|
||||
qemu-tdx|qemu-coco-dev|qemu-snp|qemu-snp-runtime-rs)
|
||||
if [ "$(uname -m)" == "s390x" ]; then
|
||||
skip "measured rootfs tests not implemented for s390x"
|
||||
fi
|
||||
|
||||
@@ -93,7 +93,7 @@ else
|
||||
"k8s-nvidia-nim-service.bats")
|
||||
fi
|
||||
|
||||
SUPPORTED_HYPERVISORS=("qemu-nvidia-gpu" "qemu-nvidia-gpu-snp" "qemu-nvidia-gpu-tdx")
|
||||
SUPPORTED_HYPERVISORS=("qemu-nvidia-gpu" "qemu-nvidia-gpu-snp" "qemu-nvidia-gpu-tdx" "qemu-nvidia-gpu-runtime-rs" "qemu-nvidia-gpu-snp-runtime-rs" "qemu-nvidia-gpu-tdx-runtime-rs")
|
||||
export KATA_HYPERVISOR="${KATA_HYPERVISOR:-qemu-nvidia-gpu}"
|
||||
# shellcheck disable=SC2076 # intentionally use literal string matching
|
||||
if [[ ! " ${SUPPORTED_HYPERVISORS[*]} " =~ " ${KATA_HYPERVISOR} " ]]; then
|
||||
|
||||
@@ -138,7 +138,7 @@ add_runtime_handler_annotations() {
|
||||
fi
|
||||
|
||||
case "${KATA_HYPERVISOR}" in
|
||||
qemu-coco-dev | qemu-snp | qemu-tdx | qemu-coco-dev-runtime-rs)
|
||||
qemu-coco-dev | qemu-snp | qemu-snp-runtime-rs | qemu-tdx | qemu-coco-dev-runtime-rs)
|
||||
info "Add runtime handler annotations for ${KATA_HYPERVISOR}"
|
||||
local handler_value="kata-${KATA_HYPERVISOR}"
|
||||
for K8S_TEST_YAML in runtimeclass_workloads_work/*.yaml
|
||||
|
||||
@@ -82,7 +82,7 @@ auto_generate_policy_enabled() {
|
||||
|
||||
is_coco_platform() {
|
||||
case "${KATA_HYPERVISOR}" in
|
||||
"qemu-tdx"|"qemu-snp"|"qemu-coco-dev"|"qemu-coco-dev-runtime-rs"|"qemu-nvidia-gpu-tdx"|"qemu-nvidia-gpu-snp")
|
||||
"qemu-tdx"|"qemu-snp"|"qemu-snp-runtime-rs"|"qemu-coco-dev"|"qemu-coco-dev-runtime-rs"|"qemu-nvidia-gpu-tdx"|"qemu-nvidia-gpu-snp")
|
||||
return 0
|
||||
;;
|
||||
*)
|
||||
@@ -148,7 +148,7 @@ install_genpolicy_drop_ins() {
|
||||
# 20-* OCI version overlay
|
||||
if [[ "${KATA_HOST_OS:-}" == "cbl-mariner" ]]; then
|
||||
cp "${examples_dir}/20-oci-1.2.0-drop-in.json" "${settings_d}/"
|
||||
elif is_k3s_or_rke2 || is_nvidia_gpu_platform || [[ "${KATA_HYPERVISOR}" == "qemu-snp" ]] || [[ "${KATA_HYPERVISOR}" == "qemu-tdx" ]] || [[ -n "${CONTAINER_ENGINE_VERSION:-}" ]]; then
|
||||
elif is_k3s_or_rke2 || is_nvidia_gpu_platform || [[ "${KATA_HYPERVISOR}" == "qemu-snp" ]] || [[ "${KATA_HYPERVISOR}" == "qemu-snp-runtime-rs" ]] || [[ "${KATA_HYPERVISOR}" == "qemu-tdx" ]] || [[ -n "${CONTAINER_ENGINE_VERSION:-}" ]]; then
|
||||
cp "${examples_dir}/20-oci-1.3.0-drop-in.json" "${settings_d}/"
|
||||
fi
|
||||
|
||||
@@ -340,7 +340,7 @@ hard_coded_policy_tests_enabled() {
|
||||
# CI is testing hard-coded policies just on a the platforms listed here. Outside of CI,
|
||||
# users can enable testing of the same policies (plus the auto-generated policies) by
|
||||
# specifying AUTO_GENERATE_POLICY=yes.
|
||||
local -r enabled_hypervisors=("qemu-coco-dev" "qemu-snp" "qemu-tdx" "qemu-coco-dev-runtime-rs")
|
||||
local -r enabled_hypervisors=("qemu-coco-dev" "qemu-snp" "qemu-snp-runtime-rs" "qemu-tdx" "qemu-coco-dev-runtime-rs")
|
||||
for enabled_hypervisor in "${enabled_hypervisors[@]}"
|
||||
do
|
||||
if [[ "${enabled_hypervisor}" == "${KATA_HYPERVISOR}" ]]; then
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
module example.com/m
|
||||
|
||||
// Keep in sync with version in versions.yaml
|
||||
go 1.25.8
|
||||
go 1.25.9
|
||||
|
||||
require (
|
||||
github.com/BurntSushi/toml v1.3.2
|
||||
|
||||
@@ -30,8 +30,11 @@ const ALL_SHIMS: &[&str] = &[
|
||||
"qemu-coco-dev",
|
||||
"qemu-coco-dev-runtime-rs",
|
||||
"qemu-nvidia-gpu",
|
||||
"qemu-nvidia-gpu-runtime-rs",
|
||||
"qemu-nvidia-gpu-snp",
|
||||
"qemu-nvidia-gpu-snp-runtime-rs",
|
||||
"qemu-nvidia-gpu-tdx",
|
||||
"qemu-nvidia-gpu-tdx-runtime-rs",
|
||||
"qemu-runtime-rs",
|
||||
"qemu-se",
|
||||
"qemu-se-runtime-rs",
|
||||
@@ -440,14 +443,46 @@ fn add_kata_deploy_warning(config_file: &Path) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Atomically replace a file with a symlink.
|
||||
///
|
||||
/// Creates the symlink at a temporary path first, then renames it over the
|
||||
/// original so the original is preserved if symlink creation fails.
|
||||
fn atomic_symlink_replace(file_path: &str, symlink_target: &str) -> Result<()> {
|
||||
let temp_symlink = format!("{}.tmp-link", file_path);
|
||||
|
||||
// Clean up any stale temp symlink from a previous interrupted run
|
||||
if Path::new(&temp_symlink).exists() || Path::new(&temp_symlink).is_symlink() {
|
||||
let _ = fs::remove_file(&temp_symlink);
|
||||
}
|
||||
|
||||
std::os::unix::fs::symlink(symlink_target, &temp_symlink).with_context(|| {
|
||||
format!(
|
||||
"Failed to create temporary symlink {} -> {}",
|
||||
temp_symlink, symlink_target
|
||||
)
|
||||
})?;
|
||||
|
||||
fs::rename(&temp_symlink, file_path).map_err(|err| {
|
||||
let _ = fs::remove_file(&temp_symlink);
|
||||
anyhow::anyhow!(
|
||||
"Failed to atomically replace {} with symlink to {}: {}",
|
||||
file_path,
|
||||
symlink_target,
|
||||
err
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Set up the runtime directory structure for a shim.
|
||||
/// Creates: {config_path}/runtimes/{shim}/
|
||||
/// {config_path}/runtimes/{shim}/config.d/
|
||||
/// {config_path}/runtimes/{shim}/configuration-{shim}.toml (copy of original)
|
||||
///
|
||||
/// Note: We copy the config file instead of symlinking because kata-containers'
|
||||
/// ResolvePath uses filepath.EvalSymlinks, which would resolve to the original
|
||||
/// location and look for config.d there instead of in our per-shim directory.
|
||||
/// After copying, the original config file is replaced with a symlink pointing
|
||||
/// to the runtime copy. This way the runtime's ResolvePath / EvalSymlinks resolves
|
||||
/// the symlink and finds config.d next to the real file in the per-shim directory.
|
||||
fn setup_runtime_directory(config: &Config, shim: &str) -> Result<()> {
|
||||
let original_config_dir = format!(
|
||||
"/host{}",
|
||||
@@ -466,9 +501,9 @@ fn setup_runtime_directory(config: &Config, shim: &str) -> Result<()> {
|
||||
fs::create_dir_all(&config_d_dir)
|
||||
.with_context(|| format!("Failed to create config.d directory: {}", config_d_dir))?;
|
||||
|
||||
// Copy the original config file to the runtime directory
|
||||
let original_config_file = format!("{}/configuration-{}.toml", original_config_dir, shim);
|
||||
let dest_config_file = format!("{}/configuration-{}.toml", runtime_config_dir, shim);
|
||||
let config_filename = format!("configuration-{}.toml", shim);
|
||||
let original_config_file = format!("{}/{}", original_config_dir, config_filename);
|
||||
let dest_config_file = format!("{}/{}", runtime_config_dir, config_filename);
|
||||
|
||||
// Only copy if original exists
|
||||
if Path::new(&original_config_file).exists() {
|
||||
@@ -481,7 +516,7 @@ fn setup_runtime_directory(config: &Config, shim: &str) -> Result<()> {
|
||||
})?;
|
||||
}
|
||||
|
||||
// Copy the base config file
|
||||
// Copy the base config file to the runtime directory
|
||||
fs::copy(&original_config_file, &dest_config_file).with_context(|| {
|
||||
format!(
|
||||
"Failed to copy config: {} -> {}",
|
||||
@@ -493,13 +528,37 @@ fn setup_runtime_directory(config: &Config, shim: &str) -> Result<()> {
|
||||
add_kata_deploy_warning(Path::new(&dest_config_file))?;
|
||||
|
||||
info!(" Copied base config: {}", dest_config_file);
|
||||
|
||||
let symlink_target = format!("runtimes/{}/{}", shim, config_filename);
|
||||
atomic_symlink_replace(&original_config_file, &symlink_target)?;
|
||||
|
||||
info!(
|
||||
" Symlinked original config: {} -> {}",
|
||||
original_config_file, symlink_target
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Remove the runtime directory for a shim during cleanup
|
||||
/// Remove the runtime directory for a shim during cleanup.
|
||||
/// Also removes the symlink at the original config location that was created
|
||||
/// by setup_runtime_directory.
|
||||
fn remove_runtime_directory(config: &Config, shim: &str) -> Result<()> {
|
||||
// Remove the symlink at the original config location (if present)
|
||||
let original_config_dir = format!(
|
||||
"/host{}",
|
||||
utils::get_kata_containers_original_config_path(shim, &config.dest_dir)
|
||||
);
|
||||
let original_config_file = format!("{}/configuration-{}.toml", original_config_dir, shim);
|
||||
let original_path = Path::new(&original_config_file);
|
||||
if original_path.is_symlink() {
|
||||
fs::remove_file(&original_config_file).with_context(|| {
|
||||
format!("Failed to remove config symlink: {}", original_config_file)
|
||||
})?;
|
||||
log::debug!("Removed config symlink: {}", original_config_file);
|
||||
}
|
||||
|
||||
let runtime_config_dir = format!(
|
||||
"/host{}",
|
||||
utils::get_kata_containers_config_path(shim, &config.dest_dir)
|
||||
@@ -528,7 +587,7 @@ fn remove_runtime_directory(config: &Config, shim: &str) -> Result<()> {
|
||||
}
|
||||
|
||||
async fn configure_shim_config(config: &Config, shim: &str, container_runtime: &str) -> Result<()> {
|
||||
// Set up the runtime directory structure with symlink to original config
|
||||
// Set up the runtime directory: copy config to per-shim dir and replace original with symlink
|
||||
setup_runtime_directory(config, shim)?;
|
||||
|
||||
let runtime_config_dir = format!(
|
||||
@@ -540,11 +599,11 @@ async fn configure_shim_config(config: &Config, shim: &str, container_runtime: &
|
||||
let kata_config_file =
|
||||
Path::new(&runtime_config_dir).join(format!("configuration-{shim}.toml"));
|
||||
|
||||
// The configuration file (symlink) should exist after setup_runtime_directory()
|
||||
// The configuration file should exist after setup_runtime_directory()
|
||||
if !kata_config_file.exists() {
|
||||
return Err(anyhow::anyhow!(
|
||||
"Configuration file not found: {kata_config_file:?}. This file should have been \
|
||||
symlinked from the original config. Check that the shim '{}' has a valid configuration \
|
||||
copied from the original config. Check that the shim '{}' has a valid configuration \
|
||||
file in the artifacts.",
|
||||
shim
|
||||
));
|
||||
@@ -609,7 +668,9 @@ fn get_qemu_share_name(shim: &str) -> Option<String> {
|
||||
let share_name = match shim {
|
||||
"qemu-cca" => "qemu-cca-experimental",
|
||||
"qemu-nvidia-gpu-snp" => "qemu-snp-experimental",
|
||||
"qemu-nvidia-gpu-snp-runtime-rs" => "qemu-snp-experimental",
|
||||
"qemu-nvidia-gpu-tdx" => "qemu-tdx-experimental",
|
||||
"qemu-nvidia-gpu-tdx-runtime-rs" => "qemu-tdx-experimental",
|
||||
_ => "qemu",
|
||||
};
|
||||
|
||||
@@ -999,8 +1060,11 @@ mod tests {
|
||||
#[case("qemu-coco-dev", "qemu")]
|
||||
#[case("qemu-cca", "qemu")]
|
||||
#[case("qemu-nvidia-gpu", "qemu")]
|
||||
#[case("qemu-nvidia-gpu-tdx", "qemu")]
|
||||
#[case("qemu-nvidia-gpu-runtime-rs", "qemu")]
|
||||
#[case("qemu-nvidia-gpu-snp", "qemu")]
|
||||
#[case("qemu-nvidia-gpu-snp-runtime-rs", "qemu")]
|
||||
#[case("qemu-nvidia-gpu-tdx", "qemu")]
|
||||
#[case("qemu-nvidia-gpu-tdx-runtime-rs", "qemu")]
|
||||
#[case("qemu-runtime-rs", "qemu")]
|
||||
#[case("qemu-coco-dev-runtime-rs", "qemu")]
|
||||
#[case("qemu-se-runtime-rs", "qemu")]
|
||||
@@ -1144,4 +1208,141 @@ mod tests {
|
||||
"following the symlink should yield the real content"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_atomic_symlink_replace_creates_symlink() {
|
||||
let tmpdir = tempfile::tempdir().unwrap();
|
||||
|
||||
// Create the original file and the target it will point to
|
||||
let target_dir = tmpdir.path().join("runtimes/qemu");
|
||||
fs::create_dir_all(&target_dir).unwrap();
|
||||
let target_file = target_dir.join("configuration-qemu.toml");
|
||||
fs::write(&target_file, "real config content").unwrap();
|
||||
|
||||
let original = tmpdir.path().join("configuration-qemu.toml");
|
||||
fs::write(&original, "original content").unwrap();
|
||||
|
||||
let symlink_target = "runtimes/qemu/configuration-qemu.toml";
|
||||
atomic_symlink_replace(original.to_str().unwrap(), symlink_target).unwrap();
|
||||
|
||||
assert!(original.is_symlink(), "original should now be a symlink");
|
||||
assert_eq!(
|
||||
fs::read_link(&original).unwrap().to_str().unwrap(),
|
||||
symlink_target
|
||||
);
|
||||
assert_eq!(
|
||||
fs::read_to_string(&original).unwrap(),
|
||||
"real config content",
|
||||
"reading through the symlink should yield the target's content"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_atomic_symlink_replace_is_idempotent() {
|
||||
let tmpdir = tempfile::tempdir().unwrap();
|
||||
|
||||
let target_dir = tmpdir.path().join("runtimes/qemu");
|
||||
fs::create_dir_all(&target_dir).unwrap();
|
||||
let target_file = target_dir.join("configuration-qemu.toml");
|
||||
fs::write(&target_file, "config content").unwrap();
|
||||
|
||||
let original = tmpdir.path().join("configuration-qemu.toml");
|
||||
fs::write(&original, "original").unwrap();
|
||||
|
||||
let symlink_target = "runtimes/qemu/configuration-qemu.toml";
|
||||
|
||||
// First call
|
||||
atomic_symlink_replace(original.to_str().unwrap(), symlink_target).unwrap();
|
||||
assert!(original.is_symlink());
|
||||
|
||||
// Second call (e.g. re-install) should succeed and still be a valid symlink
|
||||
atomic_symlink_replace(original.to_str().unwrap(), symlink_target).unwrap();
|
||||
assert!(original.is_symlink());
|
||||
assert_eq!(
|
||||
fs::read_link(&original).unwrap().to_str().unwrap(),
|
||||
symlink_target
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_atomic_symlink_replace_cleans_stale_temp() {
|
||||
let tmpdir = tempfile::tempdir().unwrap();
|
||||
|
||||
let original = tmpdir.path().join("configuration-qemu.toml");
|
||||
fs::write(&original, "original").unwrap();
|
||||
|
||||
// Simulate a stale temp symlink from an interrupted previous run
|
||||
let stale_temp = tmpdir.path().join("configuration-qemu.toml.tmp-link");
|
||||
std::os::unix::fs::symlink("stale-target", &stale_temp).unwrap();
|
||||
assert!(stale_temp.is_symlink());
|
||||
|
||||
let target_dir = tmpdir.path().join("runtimes/qemu");
|
||||
fs::create_dir_all(&target_dir).unwrap();
|
||||
fs::write(target_dir.join("configuration-qemu.toml"), "content").unwrap();
|
||||
|
||||
let symlink_target = "runtimes/qemu/configuration-qemu.toml";
|
||||
atomic_symlink_replace(original.to_str().unwrap(), symlink_target).unwrap();
|
||||
|
||||
assert!(original.is_symlink());
|
||||
assert_eq!(
|
||||
fs::read_link(&original).unwrap().to_str().unwrap(),
|
||||
symlink_target
|
||||
);
|
||||
// Temp file should not linger
|
||||
assert!(!stale_temp.exists() && !stale_temp.is_symlink());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_setup_and_remove_runtime_directory_symlink() {
|
||||
let tmpdir = tempfile::tempdir().unwrap();
|
||||
|
||||
// Simulate the directory layout that setup_runtime_directory expects
|
||||
// (after copy_artifacts has run), using a Go shim as example.
|
||||
let defaults_dir = tmpdir.path().join("share/defaults/kata-containers");
|
||||
fs::create_dir_all(&defaults_dir).unwrap();
|
||||
|
||||
let config_filename = "configuration-qemu.toml";
|
||||
let original_config = defaults_dir.join(config_filename);
|
||||
fs::write(
|
||||
&original_config,
|
||||
"[hypervisor.qemu]\npath = \"/usr/bin/qemu\"",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// Create the runtime directory and copy the config (mimics setup_runtime_directory)
|
||||
let runtime_dir = defaults_dir.join("runtimes/qemu");
|
||||
let config_d_dir = runtime_dir.join("config.d");
|
||||
fs::create_dir_all(&config_d_dir).unwrap();
|
||||
|
||||
let dest_config = runtime_dir.join(config_filename);
|
||||
fs::copy(&original_config, &dest_config).unwrap();
|
||||
|
||||
// Atomically replace the original with a symlink
|
||||
let symlink_target = format!("runtimes/qemu/{}", config_filename);
|
||||
atomic_symlink_replace(original_config.to_str().unwrap(), &symlink_target).unwrap();
|
||||
|
||||
// Verify: original is now a symlink
|
||||
assert!(original_config.is_symlink());
|
||||
assert_eq!(
|
||||
fs::read_link(&original_config).unwrap().to_str().unwrap(),
|
||||
symlink_target
|
||||
);
|
||||
|
||||
// Verify: reading through the symlink yields the real file content
|
||||
assert_eq!(
|
||||
fs::read_to_string(&original_config).unwrap(),
|
||||
fs::read_to_string(&dest_config).unwrap()
|
||||
);
|
||||
|
||||
// Verify: config.d is next to the real file (the resolved path)
|
||||
assert!(dest_config.parent().unwrap().join("config.d").is_dir());
|
||||
|
||||
// Simulate remove_runtime_directory: remove symlink then runtime dir
|
||||
assert!(original_config.is_symlink());
|
||||
fs::remove_file(&original_config).unwrap();
|
||||
assert!(!original_config.exists() && !original_config.is_symlink());
|
||||
|
||||
fs::remove_dir_all(&runtime_dir).unwrap();
|
||||
assert!(!runtime_dir.exists());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -718,8 +718,8 @@ fn parse_custom_runtimes() -> Result<Vec<CustomRuntime>> {
|
||||
/// Returns only shims that are supported for that architecture
|
||||
fn get_default_shims_for_arch(arch: &str) -> &'static str {
|
||||
match arch {
|
||||
"x86_64" => "clh cloud-hypervisor dragonball fc qemu qemu-coco-dev qemu-coco-dev-runtime-rs qemu-runtime-rs qemu-nvidia-gpu qemu-nvidia-gpu-snp qemu-nvidia-gpu-tdx qemu-snp qemu-snp-runtime-rs qemu-tdx qemu-tdx-runtime-rs",
|
||||
"aarch64" => "clh cloud-hypervisor dragonball fc qemu qemu-runtime-rs qemu-nvidia-gpu qemu-cca",
|
||||
"x86_64" => "clh cloud-hypervisor dragonball fc qemu qemu-coco-dev qemu-coco-dev-runtime-rs qemu-runtime-rs qemu-nvidia-gpu qemu-nvidia-gpu-runtime-rs qemu-nvidia-gpu-snp qemu-nvidia-gpu-snp-runtime-rs qemu-nvidia-gpu-tdx qemu-nvidia-gpu-tdx-runtime-rs qemu-snp qemu-snp-runtime-rs qemu-tdx qemu-tdx-runtime-rs",
|
||||
"aarch64" => "clh cloud-hypervisor dragonball fc qemu qemu-runtime-rs qemu-nvidia-gpu qemu-nvidia-gpu-runtime-rs qemu-cca",
|
||||
"s390x" => "qemu qemu-runtime-rs qemu-se qemu-se-runtime-rs qemu-coco-dev qemu-coco-dev-runtime-rs",
|
||||
"ppc64le" => "qemu",
|
||||
_ => "qemu", // Fallback to qemu for unknown architectures
|
||||
|
||||
@@ -10,6 +10,9 @@ pub const RUST_SHIMS: &[&str] = &[
|
||||
"cloud-hypervisor",
|
||||
"dragonball",
|
||||
"qemu-runtime-rs",
|
||||
"qemu-nvidia-gpu-runtime-rs",
|
||||
"qemu-nvidia-gpu-snp-runtime-rs",
|
||||
"qemu-nvidia-gpu-tdx-runtime-rs",
|
||||
"qemu-coco-dev-runtime-rs",
|
||||
"qemu-se-runtime-rs",
|
||||
"qemu-snp-runtime-rs",
|
||||
|
||||
@@ -97,8 +97,11 @@ scheduling:
|
||||
"qemu-tdx" (dict "memory" "2048Mi" "cpu" "1.0")
|
||||
"qemu-tdx-runtime-rs" (dict "memory" "2048Mi" "cpu" "1.0")
|
||||
"qemu-nvidia-gpu" (dict "memory" "10240Mi" "cpu" "1.0")
|
||||
"qemu-nvidia-gpu-runtime-rs" (dict "memory" "10240Mi" "cpu" "1.0")
|
||||
"qemu-nvidia-gpu-snp" (dict "memory" "10240Mi" "cpu" "1.0")
|
||||
"qemu-nvidia-gpu-snp-runtime-rs" (dict "memory" "10240Mi" "cpu" "1.0")
|
||||
"qemu-nvidia-gpu-tdx" (dict "memory" "10240Mi" "cpu" "1.0")
|
||||
"qemu-nvidia-gpu-tdx-runtime-rs" (dict "memory" "10240Mi" "cpu" "1.0")
|
||||
"qemu-cca" (dict "memory" "2048Mi" "cpu" "1.0")
|
||||
"stratovirt" (dict "memory" "130Mi" "cpu" "250m")
|
||||
"remote" (dict "memory" "120Mi" "cpu" "250m")
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# Example values file to enable NVIDIA GPU shims
|
||||
# This includes all NVIDIA GPU-enabled shims: qemu-nvidia-gpu, qemu-nvidia-gpu-snp, qemu-nvidia-gpu-tdx
|
||||
# This includes all NVIDIA GPU-enabled shims: qemu-nvidia-gpu, qemu-nvidia-gpu-runtime-rs, qemu-nvidia-gpu-snp, qemu-nvidia-gpu-snp-runtime-rs, qemu-nvidia-gpu-tdx, qemu-nvidia-gpu-tdx-runtime-rs
|
||||
#
|
||||
# Usage:
|
||||
# helm install kata-deploy oci://ghcr.io/kata-containers/kata-deploy-charts/kata-deploy \
|
||||
@@ -31,6 +31,20 @@ shims:
|
||||
nodeSelector:
|
||||
nvidia.com/cc.ready.state: "false"
|
||||
|
||||
qemu-nvidia-gpu-runtime-rs:
|
||||
enabled: true
|
||||
supportedArches:
|
||||
- amd64
|
||||
- arm64
|
||||
allowedHypervisorAnnotations: []
|
||||
containerd:
|
||||
snapshotter: ""
|
||||
runtimeClass:
|
||||
# This label is automatically added by gpu-operator. Override it
|
||||
# if you want to use a different label.
|
||||
nodeSelector:
|
||||
nvidia.com/cc.ready.state: "false"
|
||||
|
||||
qemu-nvidia-gpu-snp:
|
||||
enabled: true
|
||||
supportedArches:
|
||||
@@ -53,6 +67,28 @@ shims:
|
||||
nvidia.com/cc.ready.state: "true"
|
||||
amd.feature.node.kubernetes.io/snp: "true"
|
||||
|
||||
qemu-nvidia-gpu-snp-runtime-rs:
|
||||
enabled: true
|
||||
supportedArches:
|
||||
- amd64
|
||||
allowedHypervisorAnnotations: []
|
||||
containerd:
|
||||
snapshotter: "nydus"
|
||||
forceGuestPull: false
|
||||
crio:
|
||||
guestPull: true
|
||||
agent:
|
||||
httpsProxy: ""
|
||||
noProxy: ""
|
||||
runtimeClass:
|
||||
# These labels are automatically added by gpu-operator and NFD
|
||||
# respectively. Override if you want to use a different label.
|
||||
# If you don't have NFD, you need to add the snp label by other
|
||||
# means to your SNP nodes.
|
||||
nodeSelector:
|
||||
nvidia.com/cc.ready.state: "true"
|
||||
amd.feature.node.kubernetes.io/snp: "true"
|
||||
|
||||
qemu-nvidia-gpu-tdx:
|
||||
enabled: true
|
||||
supportedArches:
|
||||
@@ -75,6 +111,28 @@ shims:
|
||||
nvidia.com/cc.ready.state: "true"
|
||||
intel.feature.node.kubernetes.io/tdx: "true"
|
||||
|
||||
qemu-nvidia-gpu-tdx-runtime-rs:
|
||||
enabled: true
|
||||
supportedArches:
|
||||
- amd64
|
||||
allowedHypervisorAnnotations: []
|
||||
containerd:
|
||||
snapshotter: "nydus"
|
||||
forceGuestPull: false
|
||||
crio:
|
||||
guestPull: true
|
||||
agent:
|
||||
httpsProxy: ""
|
||||
noProxy: ""
|
||||
runtimeClass:
|
||||
# These labels are automatically added by gpu-operator and NFD
|
||||
# respectively. Override if you want to use a different label.
|
||||
# If you don't have NFD, you need to add the tdx label by other
|
||||
# means to your TDX nodes.
|
||||
nodeSelector:
|
||||
nvidia.com/cc.ready.state: "true"
|
||||
intel.feature.node.kubernetes.io/tdx: "true"
|
||||
|
||||
# Default shim per architecture (prefer NVIDIA GPU shims)
|
||||
defaultShim:
|
||||
amd64: qemu-nvidia-gpu # Can be changed to qemu-nvidia-gpu-snp or qemu-nvidia-gpu-tdx if preferred
|
||||
|
||||
@@ -146,6 +146,20 @@ shims:
|
||||
nodeSelector:
|
||||
nvidia.com/cc.ready.state: "false"
|
||||
|
||||
qemu-nvidia-gpu-runtime-rs:
|
||||
enabled: ~
|
||||
supportedArches:
|
||||
- amd64
|
||||
- arm64
|
||||
allowedHypervisorAnnotations: []
|
||||
containerd:
|
||||
snapshotter: ""
|
||||
runtimeClass:
|
||||
# This label is automatically added by gpu-operator. Override it
|
||||
# if you want to use a different label.
|
||||
nodeSelector:
|
||||
nvidia.com/cc.ready.state: "false"
|
||||
|
||||
qemu-nvidia-gpu-snp:
|
||||
enabled: ~
|
||||
supportedArches:
|
||||
@@ -168,6 +182,24 @@ shims:
|
||||
nvidia.com/cc.ready.state: "true"
|
||||
amd.feature.node.kubernetes.io/snp: "true"
|
||||
|
||||
qemu-nvidia-gpu-snp-runtime-rs:
|
||||
enabled: ~
|
||||
supportedArches:
|
||||
- amd64
|
||||
allowedHypervisorAnnotations: []
|
||||
containerd:
|
||||
snapshotter: "nydus"
|
||||
forceGuestPull: false
|
||||
crio:
|
||||
guestPull: true
|
||||
agent:
|
||||
httpsProxy: ""
|
||||
noProxy: ""
|
||||
runtimeClass:
|
||||
nodeSelector:
|
||||
nvidia.com/cc.ready.state: "true"
|
||||
amd.feature.node.kubernetes.io/snp: "true"
|
||||
|
||||
qemu-nvidia-gpu-tdx:
|
||||
enabled: ~
|
||||
supportedArches:
|
||||
@@ -190,6 +222,24 @@ shims:
|
||||
nvidia.com/cc.ready.state: "true"
|
||||
intel.feature.node.kubernetes.io/tdx: "true"
|
||||
|
||||
qemu-nvidia-gpu-tdx-runtime-rs:
|
||||
enabled: ~
|
||||
supportedArches:
|
||||
- amd64
|
||||
allowedHypervisorAnnotations: []
|
||||
containerd:
|
||||
snapshotter: "nydus"
|
||||
forceGuestPull: false
|
||||
crio:
|
||||
guestPull: true
|
||||
agent:
|
||||
httpsProxy: ""
|
||||
noProxy: ""
|
||||
runtimeClass:
|
||||
nodeSelector:
|
||||
nvidia.com/cc.ready.state: "true"
|
||||
intel.feature.node.kubernetes.io/tdx: "true"
|
||||
|
||||
qemu-snp:
|
||||
enabled: ~
|
||||
supportedArches:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
module module-path
|
||||
|
||||
// Keep in sync with version in versions.yaml
|
||||
go 1.25.8
|
||||
go 1.25.9
|
||||
|
||||
require (
|
||||
github.com/sirupsen/logrus v1.9.3
|
||||
|
||||
@@ -470,12 +470,12 @@ languages:
|
||||
description: "Google's 'go' language"
|
||||
notes: "'version' is the default minimum version used by this project."
|
||||
# When updating this, also update in go.mod files.
|
||||
version: "1.25.8"
|
||||
version: "1.25.9"
|
||||
meta:
|
||||
description: |
|
||||
'newest-version' is the latest version known to work when
|
||||
building Kata
|
||||
newest-version: "1.25.8"
|
||||
newest-version: "1.25.9"
|
||||
|
||||
rust:
|
||||
description: "Rust language"
|
||||
|
||||
Reference in New Issue
Block a user