Merge pull request #4582 from BbolroC/vfio-ap

agent: Bring in VFIO-AP device handling again
This commit is contained in:
Fabiano Fidêncio 2023-03-20 11:43:13 +01:00 committed by GitHub
commit 2fe0733dcb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 522 additions and 192 deletions

1
src/agent/Cargo.lock generated
View File

@ -801,6 +801,7 @@ dependencies = [
"async-recursion",
"async-trait",
"capctl",
"cfg-if 1.0.0",
"cgroups-rs",
"clap",
"futures",

View File

@ -48,6 +48,7 @@ slog-scope = "4.1.2"
slog-stdlog = "4.0.0"
log = "0.4.11"
cfg-if = "1.0.0"
prometheus = { version = "0.13.0", features = ["process"] }
procfs = "0.12.0"
anyhow = "1.0.32"

79
src/agent/src/ap.rs Normal file
View File

@ -0,0 +1,79 @@
// Copyright (c) IBM Corp. 2023
//
// SPDX-License-Identifier: Apache-2.0
//
use std::fmt;
use std::str::FromStr;
use anyhow::{anyhow, Context};
// IBM Adjunct Processor (AP) is used for cryptographic operations
// by IBM Crypto Express hardware security modules on IBM zSystem & LinuxONE (s390x).
// In Linux, virtual cryptographic devices are called AP queues.
// The name of an AP queue respects a format <xx>.<xxxx> in hexadecimal notation [1, p.467]:
// - <xx> is an adapter ID
// - <xxxx> is an adapter domain ID
// [1] https://www.ibm.com/docs/en/linuxonibm/pdf/lku5dd05.pdf
#[derive(Debug)]
pub struct Address {
pub adapter_id: u8,
pub adapter_domain: u16,
}
impl Address {
pub fn new(adapter_id: u8, adapter_domain: u16) -> Address {
Address {
adapter_id,
adapter_domain,
}
}
}
impl FromStr for Address {
type Err = anyhow::Error;
fn from_str(s: &str) -> anyhow::Result<Self> {
let split: Vec<&str> = s.split('.').collect();
if split.len() != 2 {
return Err(anyhow!(
"Wrong AP bus format. It needs to be in the form <xx>.<xxxx> (e.g. 0a.003f), got {:?}",
s
));
}
let adapter_id = u8::from_str_radix(split[0], 16).context(format!(
"Wrong AP bus format. AP ID needs to be in the form <xx> (e.g. 0a), got {:?}",
split[0]
))?;
let adapter_domain = u16::from_str_radix(split[1], 16).context(format!(
"Wrong AP bus format. AP domain needs to be in the form <xxxx> (e.g. 003f), got {:?}",
split[1]
))?;
Ok(Address::new(adapter_id, adapter_domain))
}
}
impl fmt::Display for Address {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(f, "{:02x}.{:04x}", self.adapter_id, self.adapter_domain)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_from_str() {
let device = Address::from_str("a.1").unwrap();
assert_eq!(format!("{}", device), "0a.0001");
assert!(Address::from_str("").is_err());
assert!(Address::from_str(".").is_err());
assert!(Address::from_str("0.0.0").is_err());
assert!(Address::from_str("0g.0000").is_err());
assert!(Address::from_str("0a.10000").is_err());
}
}

View File

@ -16,13 +16,12 @@ use std::str::FromStr;
use std::sync::Arc;
use tokio::sync::Mutex;
#[cfg(target_arch = "s390x")]
use crate::ccw;
use crate::linux_abi::*;
use crate::pci;
use crate::sandbox::Sandbox;
use crate::uevent::{wait_for_uevent, Uevent, UeventMatcher};
use anyhow::{anyhow, Context, Result};
use cfg_if::cfg_if;
use oci::{LinuxDeviceCgroup, LinuxResources, Spec};
use protocols::agent::Device;
use tracing::instrument;
@ -46,14 +45,22 @@ pub const DRIVER_NVDIMM_TYPE: &str = "nvdimm";
pub const DRIVER_EPHEMERAL_TYPE: &str = "ephemeral";
pub const DRIVER_LOCAL_TYPE: &str = "local";
pub const DRIVER_WATCHABLE_BIND_TYPE: &str = "watchable-bind";
// VFIO device to be bound to a guest kernel driver
pub const DRIVER_VFIO_GK_TYPE: &str = "vfio-gk";
// VFIO device to be bound to vfio-pci and made available inside the
// VFIO PCI device to be bound to a guest kernel driver
pub const DRIVER_VFIO_PCI_GK_TYPE: &str = "vfio-pci-gk";
// VFIO PCI device to be bound to vfio-pci and made available inside the
// container as a VFIO device node
pub const DRIVER_VFIO_TYPE: &str = "vfio";
pub const DRIVER_VFIO_PCI_TYPE: &str = "vfio-pci";
pub const DRIVER_VFIO_AP_TYPE: &str = "vfio-ap";
pub const DRIVER_OVERLAYFS_TYPE: &str = "overlayfs";
pub const FS_TYPE_HUGETLB: &str = "hugetlbfs";
cfg_if! {
if #[cfg(target_arch = "s390x")] {
use crate::ap;
use crate::ccw;
}
}
#[instrument]
pub fn online_device(path: &str) -> Result<()> {
fs::write(path, "1")?;
@ -280,7 +287,7 @@ pub async fn get_virtio_blk_ccw_device_name(
sandbox: &Arc<Mutex<Sandbox>>,
device: &ccw::Device,
) -> Result<String> {
let matcher = VirtioBlkCCWMatcher::new(&create_ccw_root_bus_path(), device);
let matcher = VirtioBlkCCWMatcher::new(CCW_ROOT_BUS_PATH, device);
let uev = wait_for_uevent(sandbox, matcher).await?;
let devname = uev.devname;
return match Path::new(SYSTEM_DEV_PATH).join(&devname).to_str() {
@ -401,6 +408,39 @@ async fn get_vfio_device_name(sandbox: &Arc<Mutex<Sandbox>>, grp: IommuGroup) ->
Ok(format!("{}/{}", SYSTEM_DEV_PATH, &uev.devname))
}
#[cfg(target_arch = "s390x")]
#[derive(Debug)]
struct ApMatcher {
syspath: String,
}
#[cfg(target_arch = "s390x")]
impl ApMatcher {
fn new(address: ap::Address) -> ApMatcher {
ApMatcher {
syspath: format!(
"{}/card{:02x}/{}",
AP_ROOT_BUS_PATH, address.adapter_id, address
),
}
}
}
#[cfg(target_arch = "s390x")]
impl UeventMatcher for ApMatcher {
fn is_match(&self, uev: &Uevent) -> bool {
uev.action == "add" && uev.devpath == self.syspath
}
}
#[cfg(target_arch = "s390x")]
#[instrument]
async fn wait_for_ap_device(sandbox: &Arc<Mutex<Sandbox>>, address: ap::Address) -> Result<()> {
let matcher = ApMatcher::new(address);
wait_for_uevent(sandbox, matcher).await?;
Ok(())
}
/// Scan SCSI bus for the given SCSI address(SCSI-Id and LUN)
#[instrument]
fn scan_scsi_bus(scsi_addr: &str) -> Result<()> {
@ -699,7 +739,7 @@ async fn virtio_nvdimm_device_handler(
Ok(DevNumUpdate::from_vm_path(&device.vm_path)?.into())
}
fn split_vfio_option(opt: &str) -> Option<(&str, &str)> {
fn split_vfio_pci_option(opt: &str) -> Option<(&str, &str)> {
let mut tokens = opt.split('=');
let hostbdf = tokens.next()?;
let path = tokens.next()?;
@ -714,14 +754,18 @@ fn split_vfio_option(opt: &str) -> Option<(&str, &str)> {
// Each option should have the form "DDDD:BB:DD.F=<pcipath>"
// DDDD:BB:DD.F is the device's PCI address in the host
// <pcipath> is a PCI path to the device in the guest (see pci.rs)
async fn vfio_device_handler(device: &Device, sandbox: &Arc<Mutex<Sandbox>>) -> Result<SpecUpdate> {
let vfio_in_guest = device.field_type != DRIVER_VFIO_GK_TYPE;
#[instrument]
async fn vfio_pci_device_handler(
device: &Device,
sandbox: &Arc<Mutex<Sandbox>>,
) -> Result<SpecUpdate> {
let vfio_in_guest = device.field_type != DRIVER_VFIO_PCI_GK_TYPE;
let mut pci_fixups = Vec::<(pci::Address, pci::Address)>::new();
let mut group = None;
for opt in device.options.iter() {
let (host, pcipath) =
split_vfio_option(opt).ok_or_else(|| anyhow!("Malformed VFIO option {:?}", opt))?;
let (host, pcipath) = split_vfio_pci_option(opt)
.ok_or_else(|| anyhow!("Malformed VFIO PCI option {:?}", opt))?;
let host =
pci::Address::from_str(host).context("Bad host PCI address in VFIO option {:?}")?;
let pcipath = pci::Path::from_str(pcipath)?;
@ -763,6 +807,28 @@ async fn vfio_device_handler(device: &Device, sandbox: &Arc<Mutex<Sandbox>>) ->
})
}
// The VFIO AP (Adjunct Processor) device handler takes all the APQNs provided as device options
// and awaits them. It sets the minimum AP rescan time of 5 seconds and temporarily adds that
// amount to the hotplug timeout.
#[cfg(target_arch = "s390x")]
#[instrument]
async fn vfio_ap_device_handler(
device: &Device,
sandbox: &Arc<Mutex<Sandbox>>,
) -> Result<SpecUpdate> {
// Force AP bus rescan
fs::write(AP_SCANS_PATH, "1")?;
for apqn in device.options.iter() {
wait_for_ap_device(sandbox, ap::Address::from_str(apqn)?).await?;
}
Ok(Default::default())
}
#[cfg(not(target_arch = "s390x"))]
async fn vfio_ap_device_handler(_: &Device, _: &Arc<Mutex<Sandbox>>) -> Result<SpecUpdate> {
Err(anyhow!("AP is only supported on s390x"))
}
#[instrument]
pub async fn add_devices(
devices: &[Device],
@ -828,7 +894,10 @@ async fn add_device(device: &Device, sandbox: &Arc<Mutex<Sandbox>>) -> Result<Sp
DRIVER_MMIO_BLK_TYPE => virtiommio_blk_device_handler(device, sandbox).await,
DRIVER_NVDIMM_TYPE => virtio_nvdimm_device_handler(device, sandbox).await,
DRIVER_SCSI_TYPE => virtio_scsi_device_handler(device, sandbox).await,
DRIVER_VFIO_GK_TYPE | DRIVER_VFIO_TYPE => vfio_device_handler(device, sandbox).await,
DRIVER_VFIO_PCI_GK_TYPE | DRIVER_VFIO_PCI_TYPE => {
vfio_pci_device_handler(device, sandbox).await
}
DRIVER_VFIO_AP_TYPE => vfio_ap_device_handler(device, sandbox).await,
_ => Err(anyhow!("Unknown device type {}", device.field_type)),
}
}
@ -1378,7 +1447,7 @@ mod tests {
#[cfg(target_arch = "s390x")]
#[tokio::test]
async fn test_virtio_blk_ccw_matcher() {
let root_bus = create_ccw_root_bus_path();
let root_bus = CCW_ROOT_BUS_PATH;
let subsystem = "block";
let devname = "vda";
let relpath = "0.0.0002";
@ -1487,13 +1556,13 @@ mod tests {
}
#[test]
fn test_split_vfio_option() {
fn test_split_vfio_pci_option() {
assert_eq!(
split_vfio_option("0000:01:00.0=02/01"),
split_vfio_pci_option("0000:01:00.0=02/01"),
Some(("0000:01:00.0", "02/01"))
);
assert_eq!(split_vfio_option("0000:01:00.0=02/01=rubbish"), None);
assert_eq!(split_vfio_option("0000:01:00.0"), None);
assert_eq!(split_vfio_pci_option("0000:01:00.0=02/01=rubbish"), None);
assert_eq!(split_vfio_pci_option("0000:01:00.0"), None);
}
#[test]
@ -1572,4 +1641,35 @@ mod tests {
// Test dev2
assert!(pci_iommu_group(&syspci, dev2).is_err());
}
#[cfg(target_arch = "s390x")]
#[tokio::test]
async fn test_vfio_ap_matcher() {
let subsystem = "ap";
let card = "0a";
let relpath = format!("{}.0001", card);
let mut uev = Uevent::default();
uev.action = U_EVENT_ACTION_ADD.to_string();
uev.subsystem = subsystem.to_string();
uev.devpath = format!("{}/card{}/{}", AP_ROOT_BUS_PATH, card, relpath);
let ap_address = ap::Address::from_str(&relpath).unwrap();
let matcher = ApMatcher::new(ap_address);
assert!(matcher.is_match(&uev));
let mut uev_remove = uev.clone();
uev_remove.action = U_EVENT_ACTION_REMOVE.to_string();
assert!(!matcher.is_match(&uev_remove));
let mut uev_other_device = uev.clone();
uev_other_device.devpath = format!(
"{}/card{}/{}",
AP_ROOT_BUS_PATH,
card,
format!("{}.0002", card)
);
assert!(!matcher.is_match(&uev_other_device));
}
}

View File

@ -3,6 +3,8 @@
// SPDX-License-Identifier: Apache-2.0
//
use cfg_if::cfg_if;
/// Linux ABI related constants.
#[cfg(target_arch = "aarch64")]
@ -64,10 +66,14 @@ pub fn create_pci_root_bus_path() -> String {
ret
}
#[cfg(target_arch = "s390x")]
pub fn create_ccw_root_bus_path() -> String {
String::from("/devices/css0")
cfg_if! {
if #[cfg(target_arch = "s390x")] {
pub const CCW_ROOT_BUS_PATH: &str = "/devices/css0";
pub const AP_ROOT_BUS_PATH: &str = "/devices/ap";
pub const AP_SCANS_PATH: &str = "/sys/bus/ap/scans";
}
}
// From https://www.kernel.org/doc/Documentation/acpi/namespace.txt
// The Linux kernel's core ACPI subsystem creates struct acpi_device
// objects for ACPI namespace objects representing devices, power resources

View File

@ -20,6 +20,7 @@ extern crate scopeguard;
extern crate slog;
use anyhow::{anyhow, Context, Result};
use cfg_if::cfg_if;
use clap::{AppSettings, Parser};
use nix::fcntl::OFlag;
use nix::sys::socket::{self, AddressFamily, SockFlag, SockType, VsockAddr};
@ -34,8 +35,6 @@ use std::process::exit;
use std::sync::Arc;
use tracing::{instrument, span};
#[cfg(target_arch = "s390x")]
mod ccw;
mod config;
mod console;
mod device;
@ -74,6 +73,13 @@ use tokio::{
mod rpc;
mod tracer;
cfg_if! {
if #[cfg(target_arch = "s390x")] {
mod ap;
mod ccw;
}
}
const NAME: &str = "kata-agent";
lazy_static! {

View File

@ -258,15 +258,24 @@ const (
// VFIODeviceErrorType is the error type of VFIO device
VFIODeviceErrorType VFIODeviceType = iota
// VFIODeviceNormalType is a normal VFIO device type
VFIODeviceNormalType
// VFIOPCIDeviceNormalType is a normal VFIO PCI device type
VFIOPCIDeviceNormalType
// VFIODeviceMediatedType is a VFIO mediated device type
VFIODeviceMediatedType
// VFIOPCIDeviceMediatedType is a VFIO PCI mediated device type
VFIOPCIDeviceMediatedType
// VFIOAPDeviceMediatedType is a VFIO AP mediated device type
VFIOAPDeviceMediatedType
)
// VFIODev represents a VFIO drive used for hotplugging
type VFIODev struct {
type VFIODev interface {
GetID() *string
GetType() VFIODeviceType
GetSysfsDev() *string
}
// VFIOPCIDev represents a VFIO PCI device used for hotplugging
type VFIOPCIDev struct {
// ID is used to identify this drive in the hypervisor options.
ID string
@ -298,6 +307,44 @@ type VFIODev struct {
IsPCIe bool
}
func (d VFIOPCIDev) GetID() *string {
return &d.ID
}
func (d VFIOPCIDev) GetType() VFIODeviceType {
return d.Type
}
func (d VFIOPCIDev) GetSysfsDev() *string {
return &d.SysfsDev
}
type VFIOAPDev struct {
// ID is used to identify this drive in the hypervisor options.
ID string
// sysfsdev of VFIO mediated device
SysfsDev string
// APDevices are the Adjunct Processor devices assigned to the mdev
APDevices []string
// Type of VFIO device
Type VFIODeviceType
}
func (d VFIOAPDev) GetID() *string {
return &d.ID
}
func (d VFIOAPDev) GetType() VFIODeviceType {
return d.Type
}
func (d VFIOAPDev) GetSysfsDev() *string {
return &d.SysfsDev
}
// RNGDev represents a random number generator device
type RNGDev struct {
// ID is used to identify the device in the hypervisor options.

View File

@ -89,18 +89,47 @@ func readPCIProperty(propertyPath string) (string, error) {
return strings.Split(string(buf), "\n")[0], nil
}
func GetVFIODeviceType(deviceFileName string) config.VFIODeviceType {
func GetVFIODeviceType(deviceFilePath string) (config.VFIODeviceType, error) {
deviceFileName := filepath.Base(deviceFilePath)
//For example, 0000:04:00.0
tokens := strings.Split(deviceFileName, ":")
vfioDeviceType := config.VFIODeviceErrorType
if len(tokens) == 3 {
vfioDeviceType = config.VFIODeviceNormalType
} else {
//For example, 83b8f4f2-509f-382f-3c1e-e6bfe0fa1001
tokens = strings.Split(deviceFileName, "-")
if len(tokens) == 5 {
vfioDeviceType = config.VFIODeviceMediatedType
}
return config.VFIOPCIDeviceNormalType, nil
}
return vfioDeviceType
//For example, 83b8f4f2-509f-382f-3c1e-e6bfe0fa1001
tokens = strings.Split(deviceFileName, "-")
if len(tokens) != 5 {
return config.VFIODeviceErrorType, fmt.Errorf("Incorrect tokens found while parsing VFIO details: %s", deviceFileName)
}
deviceSysfsDev, err := GetSysfsDev(deviceFilePath)
if err != nil {
return config.VFIODeviceErrorType, err
}
if strings.HasPrefix(deviceSysfsDev, vfioAPSysfsDir) {
return config.VFIOAPDeviceMediatedType, nil
}
return config.VFIOPCIDeviceMediatedType, nil
}
// GetSysfsDev returns the sysfsdev of mediated device
// Expected input string format is absolute path to the sysfs dev node
// eg. /sys/kernel/iommu_groups/0/devices/f79944e4-5a3d-11e8-99ce-479cbab002e4
func GetSysfsDev(sysfsDevStr string) (string, error) {
return filepath.EvalSymlinks(sysfsDevStr)
}
// GetAPVFIODevices retrieves all APQNs associated with a mediated VFIO-AP
// device
func GetAPVFIODevices(sysfsdev string) ([]string, error) {
data, err := os.ReadFile(filepath.Join(sysfsdev, "matrix"))
if err != nil {
return []string{}, err
}
// Split by newlines, omitting final newline
return strings.Split(string(data[:len(data)-1]), "\n"), nil
}

View File

@ -30,6 +30,7 @@ const (
iommuGroupPath = "/sys/bus/pci/devices/%s/iommu_group"
vfioDevPath = "/dev/vfio/%s"
pcieRootPortPrefix = "rp"
vfioAPSysfsDir = "/sys/devices/vfio_ap"
)
var (
@ -85,19 +86,42 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece
if err != nil {
return err
}
vfio := &config.VFIODev{
ID: utils.MakeNameID("vfio", device.DeviceInfo.ID+strconv.Itoa(i), maxDevIDSize),
Type: vfioDeviceType,
BDF: deviceBDF,
SysfsDev: deviceSysfsDev,
IsPCIe: isPCIeDevice(deviceBDF),
Class: getPCIDeviceProperty(deviceBDF, PCISysFsDevicesClass),
}
device.VfioDevs = append(device.VfioDevs, vfio)
if vfio.IsPCIe {
vfio.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, len(AllPCIeDevs))
AllPCIeDevs[vfio.BDF] = true
id := utils.MakeNameID("vfio", device.DeviceInfo.ID+strconv.Itoa(i), maxDevIDSize)
var vfio config.VFIODev
switch vfioDeviceType {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
isPCIe := isPCIeDevice(deviceBDF)
// Do not directly assign to `vfio` -- need to access field still
vfioPCI := config.VFIOPCIDev{
ID: id,
Type: vfioDeviceType,
BDF: deviceBDF,
SysfsDev: deviceSysfsDev,
IsPCIe: isPCIe,
Class: getPCIDeviceProperty(deviceBDF, PCISysFsDevicesClass),
}
if isPCIe {
vfioPCI.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, len(AllPCIeDevs))
AllPCIeDevs[deviceBDF] = true
}
vfio = vfioPCI
case config.VFIOAPDeviceMediatedType:
devices, err := GetAPVFIODevices(deviceSysfsDev)
if err != nil {
return err
}
vfio = config.VFIOAPDev{
ID: id,
SysfsDev: deviceSysfsDev,
Type: config.VFIOAPDeviceMediatedType,
APDevices: devices,
}
default:
return fmt.Errorf("Failed to append device: VFIO device type unrecognized")
}
device.VfioDevs = append(device.VfioDevs, &vfio)
}
coldPlug := device.DeviceInfo.ColdPlug
@ -192,31 +216,60 @@ func (device *VFIODevice) Load(ds config.DeviceState) {
device.GenericDevice.Load(ds)
for _, dev := range ds.VFIODevs {
device.VfioDevs = append(device.VfioDevs, &config.VFIODev{
ID: dev.ID,
Type: config.VFIODeviceType(dev.Type),
BDF: dev.BDF,
SysfsDev: dev.SysfsDev,
})
var vfio config.VFIODev
vfioDeviceType := (*device.VfioDevs[0]).GetType()
switch vfioDeviceType {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
bdf := ""
if pciDev, ok := (*dev).(config.VFIOPCIDev); ok {
bdf = pciDev.BDF
}
vfio = config.VFIOPCIDev{
ID: *(*dev).GetID(),
Type: config.VFIODeviceType((*dev).GetType()),
BDF: bdf,
SysfsDev: *(*dev).GetSysfsDev(),
}
case config.VFIOAPDeviceMediatedType:
vfio = config.VFIOAPDev{
ID: *(*dev).GetID(),
SysfsDev: *(*dev).GetSysfsDev(),
}
default:
deviceLogger().WithError(
fmt.Errorf("VFIO device type unrecognized"),
).Error("Failed to append device")
return
}
device.VfioDevs = append(device.VfioDevs, &vfio)
}
}
// It should implement GetAttachCount() and DeviceID() as api.Device implementation
// here it shares function from *GenericDevice so we don't need duplicate codes
func getVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceSysfsDev string, vfioDeviceType config.VFIODeviceType, err error) {
vfioDeviceType = GetVFIODeviceType(deviceFileName)
sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
vfioDeviceType, err = GetVFIODeviceType(sysfsDevStr)
if err != nil {
return deviceBDF, deviceSysfsDev, vfioDeviceType, err
}
switch vfioDeviceType {
case config.VFIODeviceNormalType:
case config.VFIOPCIDeviceNormalType:
// Get bdf of device eg. 0000:00:1c.0
deviceBDF = getBDF(deviceFileName)
// Get sysfs path used by cloud-hypervisor
deviceSysfsDev = filepath.Join(config.SysBusPciDevicesPath, deviceFileName)
case config.VFIODeviceMediatedType:
case config.VFIOPCIDeviceMediatedType:
// Get sysfsdev of device eg. /sys/devices/pci0000:00/0000:00:02.0/f79944e4-5a3d-11e8-99ce-479cbab002e4
sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
deviceSysfsDev, err = getSysfsDev(sysfsDevStr)
deviceSysfsDev, err = GetSysfsDev(sysfsDevStr)
deviceBDF = getBDF(getMediatedBDF(deviceSysfsDev))
case config.VFIOAPDeviceMediatedType:
sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
deviceSysfsDev, err = GetSysfsDev(sysfsDevStr)
default:
err = fmt.Errorf("Incorrect tokens found while parsing vfio details: %s", deviceFileName)
}
@ -244,13 +297,6 @@ func getBDF(deviceSysStr string) string {
return tokens[1]
}
// getSysfsDev returns the sysfsdev of mediated device
// Expected input string format is absolute path to the sysfs dev node
// eg. /sys/kernel/iommu_groups/0/devices/f79944e4-5a3d-11e8-99ce-479cbab002e4
func getSysfsDev(sysfsDevStr string) (string, error) {
return filepath.EvalSymlinks(sysfsDevStr)
}
// BindDevicetoVFIO binds the device to vfio driver after unbinding from host.
// Will be called by a network interface or a generic pcie device.
func BindDevicetoVFIO(bdf, hostDriver, vendorDeviceID string) (string, error) {

View File

@ -32,9 +32,9 @@ func TestGetVFIODetails(t *testing.T) {
deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(d.deviceStr, "")
switch vfioDeviceType {
case config.VFIODeviceNormalType:
case config.VFIOPCIDeviceNormalType:
assert.Equal(t, d.expectedStr, deviceBDF)
case config.VFIODeviceMediatedType:
case config.VFIOPCIDeviceMediatedType, config.VFIOAPDeviceMediatedType:
assert.Equal(t, d.expectedStr, deviceSysfsDev)
default:
assert.NotNil(t, err)

View File

@ -857,12 +857,12 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error {
defer cancel()
// Create the clh device config via the constructor to ensure default values are properly assigned
clhDevice := *chclient.NewDeviceConfig(device.SysfsDev)
clhDevice := *chclient.NewDeviceConfig(*(*device).GetSysfsDev())
pciInfo, _, err := cl.VmAddDevicePut(ctx, clhDevice)
if err != nil {
return fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err))
}
clh.devicesIds[device.ID] = pciInfo.GetId()
clh.devicesIds[*(*device).GetID()] = pciInfo.GetId()
// clh doesn't use bridges, so the PCI path is simply the slot
// number of the device. This will break if clh starts using
@ -879,7 +879,14 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error {
return fmt.Errorf("Unexpected PCI address %q from clh hotplug", pciInfo.Bdf)
}
device.GuestPciPath, err = types.PciPathFromString(tokens[0])
guestPciPath, err := types.PciPathFromString(tokens[0])
pciDevice, ok := (*device).(config.VFIOPCIDev)
if !ok {
return fmt.Errorf("VFIO device %+v is not PCI, only PCI is supported in Cloud Hypervisor", device)
}
pciDevice.GuestPciPath = guestPciPath
*device = pciDevice
return err
}
@ -923,7 +930,7 @@ func (clh *cloudHypervisor) HotplugRemoveDevice(ctx context.Context, devInfo int
case BlockDev:
deviceID = clhDriveIndexToID(devInfo.(*config.BlockDrive).Index)
case VfioDev:
deviceID = devInfo.(*config.VFIODev).ID
deviceID = *devInfo.(config.VFIODev).GetID()
default:
clh.Logger().WithFields(log.Fields{"devInfo": devInfo,
"deviceType": devType}).Error("HotplugRemoveDevice: unsupported device")

View File

@ -624,7 +624,7 @@ func TestCloudHypervisorHotplugRemoveDevice(t *testing.T) {
_, err = clh.HotplugRemoveDevice(context.Background(), &config.BlockDrive{}, BlockDev)
assert.NoError(err, "Hotplug remove block device expected no error")
_, err = clh.HotplugRemoveDevice(context.Background(), &config.VFIODev{}, VfioDev)
_, err = clh.HotplugRemoveDevice(context.Background(), &config.VFIOPCIDev{}, VfioDev)
assert.NoError(err, "Hotplug remove vfio block device expected no error")
_, err = clh.HotplugRemoveDevice(context.Background(), nil, NetDev)

View File

@ -77,38 +77,39 @@ const (
)
var (
checkRequestTimeout = 30 * time.Second
defaultRequestTimeout = 60 * time.Second
errorMissingOCISpec = errors.New("Missing OCI specification")
defaultKataHostSharedDir = "/run/kata-containers/shared/sandboxes/"
defaultKataGuestSharedDir = "/run/kata-containers/shared/containers/"
defaultKataGuestNydusRootDir = "/run/kata-containers/shared/"
mountGuestTag = "kataShared"
defaultKataGuestSandboxDir = "/run/kata-containers/sandbox/"
type9pFs = "9p"
typeVirtioFS = "virtiofs"
typeOverlayFS = "overlay"
kata9pDevType = "9p"
kataMmioBlkDevType = "mmioblk"
kataBlkDevType = "blk"
kataBlkCCWDevType = "blk-ccw"
kataSCSIDevType = "scsi"
kataNvdimmDevType = "nvdimm"
kataVirtioFSDevType = "virtio-fs"
kataOverlayDevType = "overlayfs"
kataWatchableBindDevType = "watchable-bind"
kataVfioDevType = "vfio" // VFIO device to used as VFIO in the container
kataVfioGuestKernelDevType = "vfio-gk" // VFIO device for consumption by the guest kernel
sharedDir9pOptions = []string{"trans=virtio,version=9p2000.L,cache=mmap", "nodev"}
sharedDirVirtioFSOptions = []string{}
sharedDirVirtioFSDaxOptions = "dax"
shmDir = "shm"
kataEphemeralDevType = "ephemeral"
defaultEphemeralPath = filepath.Join(defaultKataGuestSandboxDir, kataEphemeralDevType)
grpcMaxDataSize = int64(1024 * 1024)
localDirOptions = []string{"mode=0777"}
maxHostnameLen = 64
GuestDNSFile = "/etc/resolv.conf"
checkRequestTimeout = 30 * time.Second
defaultRequestTimeout = 60 * time.Second
errorMissingOCISpec = errors.New("Missing OCI specification")
defaultKataHostSharedDir = "/run/kata-containers/shared/sandboxes/"
defaultKataGuestSharedDir = "/run/kata-containers/shared/containers/"
defaultKataGuestNydusRootDir = "/run/kata-containers/shared/"
mountGuestTag = "kataShared"
defaultKataGuestSandboxDir = "/run/kata-containers/sandbox/"
type9pFs = "9p"
typeVirtioFS = "virtiofs"
typeOverlayFS = "overlay"
kata9pDevType = "9p"
kataMmioBlkDevType = "mmioblk"
kataBlkDevType = "blk"
kataBlkCCWDevType = "blk-ccw"
kataSCSIDevType = "scsi"
kataNvdimmDevType = "nvdimm"
kataVirtioFSDevType = "virtio-fs"
kataOverlayDevType = "overlayfs"
kataWatchableBindDevType = "watchable-bind"
kataVfioPciDevType = "vfio-pci" // VFIO PCI device to used as VFIO in the container
kataVfioPciGuestKernelDevType = "vfio-pci-gk" // VFIO PCI device for consumption by the guest kernel
kataVfioApDevType = "vfio-ap"
sharedDir9pOptions = []string{"trans=virtio,version=9p2000.L,cache=mmap", "nodev"}
sharedDirVirtioFSOptions = []string{}
sharedDirVirtioFSDaxOptions = "dax"
shmDir = "shm"
kataEphemeralDevType = "ephemeral"
defaultEphemeralPath = filepath.Join(defaultKataGuestSandboxDir, kataEphemeralDevType)
grpcMaxDataSize = int64(1024 * 1024)
localDirOptions = []string{"mode=0777"}
maxHostnameLen = 64
GuestDNSFile = "/etc/resolv.conf"
)
const (
@ -1117,20 +1118,25 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c *
groupNum := filepath.Base(dev.ContainerPath)
// Each /dev/vfio/NN device represents a VFIO group, which
// could include several PCI devices. So we give group
// information in the main structure, then list each
// individual PCI device in the Options array.
// For VFIO-PCI, each /dev/vfio/NN device represents a VFIO group,
// which could include several PCI devices. So we give group
// information in the main structure, then list each individual PCI
// device in the Options array.
//
// Each option is formatted as "DDDD:BB:DD.F=<pcipath>"
// DDDD:BB:DD.F is the device's PCI address on the
// *host*. <pcipath> is the device's PCI path in the guest
// (see qomGetPciPath() for details).
//
// For VFIO-AP, one VFIO group could include several queue devices. They are
// identified by APQNs (Adjunct Processor Queue Numbers), which do not differ
// between host and guest. They are passed as options so they can be awaited
// by the agent.
kataDevice := &grpc.Device{
ContainerPath: dev.ContainerPath,
Type: kataVfioDevType,
Type: kataVfioPciDevType,
Id: groupNum,
Options: make([]string, len(devList)),
Options: nil,
}
// We always pass the device information to the agent, since
@ -1138,11 +1144,18 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c *
// on the vfio_mode, we need to use a different device type so
// the agent can handle it properly
if c.sandbox.config.VfioMode == config.VFIOModeGuestKernel {
kataDevice.Type = kataVfioGuestKernelDevType
kataDevice.Type = kataVfioPciGuestKernelDevType
}
for i, pciDev := range devList {
kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", pciDev.BDF, pciDev.GuestPciPath)
if (*devList[0]).GetType() == config.VFIOAPDeviceMediatedType {
kataDevice.Type = kataVfioApDevType
kataDevice.Options = (*devList[0]).(config.VFIOAPDev).APDevices
} else {
kataDevice.Options = make([]string, len(devList))
for i, device := range devList {
pciDevice := (*device).(config.VFIOPCIDev)
kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", pciDevice.BDF, pciDevice.GuestPciPath)
}
}
return kataDevice

View File

@ -1713,7 +1713,7 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op
return err
}
devID := device.ID
devID := *(*device).GetID()
machineType := q.HypervisorConfig().HypervisorMachineType
if op == AddDevice {
@ -1730,29 +1730,31 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op
// for pc machine type instead of bridge. This is useful for devices that require
// a large PCI BAR which is a currently a limitation with PCI bridges.
if q.state.HotplugVFIOOnRootBus {
// In case MachineType is q35, a PCIe device is hotplugged on a PCIe Root Port.
switch machineType {
case QemuQ35:
if device.IsPCIe && q.state.PCIeRootPort <= 0 {
q.Logger().WithField("dev-id", device.ID).Warn("VFIO device is a PCIe device. It's recommended to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for q35")
device.Bus = ""
switch (*device).GetType() {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
// In case MachineType is q35, a PCIe device is hotplugged on a PCIe Root Port.
pciDevice, ok := (*device).(config.VFIOPCIDev)
if !ok {
return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device)
}
default:
device.Bus = ""
}
switch machineType {
case QemuQ35:
if pciDevice.IsPCIe && q.state.PCIeRootPort <= 0 {
q.Logger().WithField("dev-id", (*device).GetID()).Warn("VFIO device is a PCIe device. It's recommended to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for q35")
pciDevice.Bus = ""
}
default:
pciDevice.Bus = ""
}
*device = pciDevice
switch device.Type {
case config.VFIODeviceNormalType:
err = q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, device.BDF, device.Bus, romFile)
case config.VFIODeviceMediatedType:
if utils.IsAPVFIOMediatedDevice(device.SysfsDev) {
err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev)
if pciDevice.Type == config.VFIOPCIDeviceNormalType {
err = q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, pciDevice.BDF, pciDevice.Bus, romFile)
} else {
err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, device.SysfsDev, "", device.Bus, romFile)
err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, *(*device).GetSysfsDev(), "", pciDevice.Bus, romFile)
}
default:
return fmt.Errorf("Incorrect VFIO device type found")
case config.VFIOAPDeviceMediatedType:
err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, *(*device).GetSysfsDev())
}
} else {
addr, bridge, err := q.arch.addDeviceToBridge(ctx, devID, types.PCI)
@ -1766,15 +1768,17 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op
}
}()
switch device.Type {
case config.VFIODeviceNormalType:
err = q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, device.BDF, addr, bridge.ID, romFile)
case config.VFIODeviceMediatedType:
if utils.IsAPVFIOMediatedDevice(device.SysfsDev) {
err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev)
} else {
err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, device.SysfsDev, addr, bridge.ID, romFile)
switch (*device).GetType() {
case config.VFIOPCIDeviceNormalType:
pciDevice, ok := (*device).(config.VFIOPCIDev)
if !ok {
return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device)
}
err = q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, pciDevice.BDF, addr, bridge.ID, romFile)
case config.VFIOPCIDeviceMediatedType:
err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, *(*device).GetSysfsDev(), addr, bridge.ID, romFile)
case config.VFIOAPDeviceMediatedType:
err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, *(*device).GetSysfsDev())
default:
return fmt.Errorf("Incorrect VFIO device type found")
}
@ -1782,13 +1786,24 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op
if err != nil {
return err
}
// XXX: Depending on whether we're doing root port or
// bridge hotplug, and how the bridge is set up in
// other parts of the code, we may or may not already
// have information about the slot number of the
// bridge and or the device. For simplicity, just
// query both of them back from qemu
device.GuestPciPath, err = q.qomGetPciPath(devID)
switch (*device).GetType() {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
pciDevice, ok := (*device).(config.VFIOPCIDev)
if !ok {
return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device)
}
// XXX: Depending on whether we're doing root port or
// bridge hotplug, and how the bridge is set up in
// other parts of the code, we may or may not already
// have information about the slot number of the
// bridge and or the device. For simplicity, just
// query both of them back from qemu
guestPciPath, err := q.qomGetPciPath(devID)
pciDevice.GuestPciPath = guestPciPath
*device = pciDevice
return err
}
return err
} else {
q.Logger().WithField("dev-id", devID).Info("Start hot-unplug VFIO device")

View File

@ -675,16 +675,17 @@ func (q *qemuArchBase) appendVhostUserDevice(ctx context.Context, devices []govm
}
func (q *qemuArchBase) appendVFIODevice(devices []govmmQemu.Device, vfioDev config.VFIODev) []govmmQemu.Device {
if vfioDev.BDF == "" {
pciDevice := vfioDev.(config.VFIOPCIDev)
if pciDevice.BDF == "" {
return devices
}
devices = append(devices,
govmmQemu.VFIODevice{
BDF: vfioDev.BDF,
VendorID: vfioDev.VendorID,
DeviceID: vfioDev.DeviceID,
Bus: vfioDev.Bus,
BDF: pciDevice.BDF,
VendorID: pciDevice.VendorID,
DeviceID: pciDevice.DeviceID,
Bus: pciDevice.Bus,
},
)

View File

@ -463,7 +463,7 @@ func TestQemuArchBaseAppendVFIODevice(t *testing.T) {
},
}
vfDevice := config.VFIODev{
vfDevice := config.VFIOPCIDev{
BDF: bdf,
}
@ -483,7 +483,7 @@ func TestQemuArchBaseAppendVFIODeviceWithVendorDeviceID(t *testing.T) {
},
}
vfDevice := config.VFIODev{
vfDevice := config.VFIOPCIDev{
BDF: bdf,
VendorID: vendorID,
DeviceID: deviceID,

View File

@ -1856,11 +1856,15 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy
// adding a group of VFIO devices
for _, dev := range vfioDevices {
if _, err := s.hypervisor.HotplugAddDevice(ctx, dev, VfioDev); err != nil {
bdf := ""
if pciDevice, ok := (*dev).(config.VFIOPCIDev); ok {
bdf = pciDevice.BDF
}
s.Logger().
WithFields(logrus.Fields{
"sandbox": s.id,
"vfio-device-ID": dev.ID,
"vfio-device-BDF": dev.BDF,
"vfio-device-ID": (*dev).GetID(),
"vfio-device-BDF": bdf,
}).WithError(err).Error("failed to hotplug VFIO device")
return err
}
@ -1909,11 +1913,15 @@ func (s *Sandbox) HotplugRemoveDevice(ctx context.Context, device api.Device, de
// remove a group of VFIO devices
for _, dev := range vfioDevices {
if _, err := s.hypervisor.HotplugRemoveDevice(ctx, dev, VfioDev); err != nil {
bdf := ""
if pciDevice, ok := (*dev).(config.VFIOPCIDev); ok {
bdf = pciDevice.BDF
}
s.Logger().WithError(err).
WithFields(logrus.Fields{
"sandbox": s.id,
"vfio-device-ID": dev.ID,
"vfio-device-BDF": dev.BDF,
"vfio-device-ID": (*dev).GetID(),
"vfio-device-BDF": bdf,
}).Error("failed to hot unplug VFIO device")
return err
}

View File

@ -89,8 +89,7 @@ func FindContextID() (*os.File, uint64, error) {
const (
procMountsFile = "/proc/mounts"
fieldsPerLine = 6
vfioAPSysfsDir = "vfio_ap"
fieldsPerLine = 6
)
const (
@ -142,18 +141,6 @@ func GetDevicePathAndFsTypeOptions(mountPoint string) (devicePath, fsType string
}
}
// IsAPVFIOMediatedDevice decides whether a device is a VFIO-AP device
// by checking for the existence of "vfio_ap" in the path
func IsAPVFIOMediatedDevice(sysfsdev string) bool {
split := strings.Split(sysfsdev, string(os.PathSeparator))
for _, el := range split {
if el == vfioAPSysfsDir {
return true
}
}
return false
}
func waitProcessUsingPidfd(pid int, timeoutSecs uint, logger *logrus.Entry) (bool, error) {
pidfd, err := unix.PidfdOpen(pid, 0)

View File

@ -63,19 +63,3 @@ func TestGetDevicePathAndFsTypeOptionsSuccessful(t *testing.T) {
assert.Equal(fstype, fstypeOut)
assert.Equal(fsOptions, optsOut)
}
func TestIsAPVFIOMediatedDeviceFalse(t *testing.T) {
assert := assert.New(t)
// Should be false for a PCI device
isAPMdev := IsAPVFIOMediatedDevice("/sys/bus/pci/devices/0000:00:02.0/a297db4a-f4c2-11e6-90f6-d3b88d6c9525")
assert.False(isAPMdev)
}
func TestIsAPVFIOMediatedDeviceTrue(t *testing.T) {
assert := assert.New(t)
// Typical AP sysfsdev
isAPMdev := IsAPVFIOMediatedDevice("/sys/devices/vfio_ap/matrix/a297db4a-f4c2-11e6-90f6-d3b88d6c9525")
assert.True(isAPMdev)
}