diff --git a/.github/workflows/run-vfio-tests.yaml b/.github/workflows/run-vfio-tests.yaml index b5aa739cec..4542ec1a3a 100644 --- a/.github/workflows/run-vfio-tests.yaml +++ b/.github/workflows/run-vfio-tests.yaml @@ -45,4 +45,5 @@ jobs: path: kata-artifacts - name: Run vfio tests + timeout-minutes: 15 run: bash tests/functional/vfio/gha-run.sh run diff --git a/src/runtime/config/configuration-clh.toml.in b/src/runtime/config/configuration-clh.toml.in index b9eb0d8d88..623357e613 100644 --- a/src/runtime/config/configuration-clh.toml.in +++ b/src/runtime/config/configuration-clh.toml.in @@ -184,12 +184,22 @@ block_device_driver = "virtio-blk" # Disable the 'seccomp' feature from Cloud Hypervisor, default false # disable_seccomp = true +# Enable vIOMMU, default false +# Enabling this will result in the VM having a vIOMMU device +# This will also add the following options to the kernel's +# command line: iommu=pt +#enable_iommu = true + # This option changes the default hypervisor and kernel parameters # to enable debug output where available. # # Default false #enable_debug = true +# Enable hot-plugging of VFIO devices to a root-port. +# The default setting is "no-port" +#hot_plug_vfio = "root-port" + # Path to OCI hook binaries in the *guest rootfs*. # This does not affect host-side hooks which must instead be added to # the OCI spec passed to the runtime. 
diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index f577df5263..a4e17b2390 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -1680,8 +1680,8 @@ func checkConfig(config oci.RuntimeConfig) error { // Only allow one of the following settings for cold-plug: // no-port, root-port, switch-port func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineType string, hypervisorType virtcontainers.HypervisorType) error { - if hypervisorType != virtcontainers.QemuHypervisor { - kataUtilsLogger.Warn("Advanced PCIe Topology only available for QEMU hypervisor, ignoring hot(cold)_vfio_port setting") + if hypervisorType != virtcontainers.QemuHypervisor && hypervisorType != virtcontainers.ClhHypervisor { + kataUtilsLogger.Warn("Advanced PCIe Topology only available for QEMU/CLH hypervisor, ignoring hot(cold)_vfio_port setting") return nil } @@ -1696,6 +1696,14 @@ func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineT if machineType != "q35" && machineType != "virt" { return nil } + if hypervisorType == virtcontainers.ClhHypervisor { + if coldPlug != config.NoPort { + return fmt.Errorf("cold-plug not supported on CLH") + } + if hotPlug != config.RootPort { + return fmt.Errorf("only hot-plug=%s supported on CLH", config.RootPort) + } + } var port config.PCIePort if coldPlug != config.NoPort { @@ -1704,10 +1712,6 @@ func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineT if hotPlug != config.NoPort { port = hotPlug } - if port == config.NoPort { - return fmt.Errorf("invalid vfio_port=%s setting, use on of %s, %s, %s", - port, config.BridgePort, config.RootPort, config.SwitchPort) - } if port == config.BridgePort || port == config.RootPort || port == config.SwitchPort { return nil } diff --git a/src/runtime/pkg/katautils/create_test.go b/src/runtime/pkg/katautils/create_test.go index 2608003784..903e68d95d 100644 --- 
a/src/runtime/pkg/katautils/create_test.go +++ b/src/runtime/pkg/katautils/create_test.go @@ -18,8 +18,9 @@ import ( "syscall" "testing" + config "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" ktu "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils" "github.com/kata-containers/kata-containers/src/runtime/pkg/oci" vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/compatoci" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/vcmock" @@ -419,3 +420,32 @@ func TestCreateContainer(t *testing.T) { assert.NoError(err) } } + +func TestVfioChecksClh(t *testing.T) { + assert := assert.New(t) + + // Check valid CLH vfio configs + f := func(coldPlug, hotPlug config.PCIePort) error { + return checkPCIeConfig(coldPlug, hotPlug, defaultMachineType, vc.ClhHypervisor) + } + assert.NoError(f(config.NoPort, config.NoPort)) + assert.NoError(f(config.NoPort, config.RootPort)) + assert.Error(f(config.RootPort, config.RootPort)) + assert.Error(f(config.RootPort, config.NoPort)) + assert.Error(f(config.NoPort, config.SwitchPort)) +} + +func TestVfioCheckQemu(t *testing.T) { + assert := assert.New(t) + + // Check valid Qemu vfio configs + f := func(coldPlug, hotPlug config.PCIePort) error { + return checkPCIeConfig(coldPlug, hotPlug, defaultMachineType, vc.QemuHypervisor) + } + + assert.NoError(f(config.NoPort, config.NoPort)) + assert.NoError(f(config.RootPort, config.NoPort)) + assert.NoError(f(config.NoPort, config.RootPort)) + assert.Error(f(config.RootPort, config.RootPort)) + assert.Error(f(config.SwitchPort, config.RootPort)) +} diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index ff92b89ecd..04aba85457 100644 --- a/src/runtime/virtcontainers/clh.go +++ 
b/src/runtime/virtcontainers/clh.go @@ -490,6 +490,13 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net } clh.vmconfig.Payload.SetKernel(kernelPath) + clh.vmconfig.Platform = chclient.NewPlatformConfig() + platform := clh.vmconfig.Platform + platform.SetNumPciSegments(2) + if clh.config.IOMMU { + platform.SetIommuSegments([]int32{0}) + } + if clh.config.ConfidentialGuest { if err := clh.enableProtection(); err != nil { return err @@ -528,6 +535,9 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net // start the guest kernel with 'quiet' in non-debug mode params = append(params, Param{"quiet", ""}) } + if clh.config.IOMMU { + params = append(params, Param{"iommu", "pt"}) + } // Followed by extra kernel parameters defined in the configuration file params = append(params, clh.config.KernelParams...) @@ -536,6 +546,7 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net // set random device generator to hypervisor clh.vmconfig.Rng = chclient.NewRngConfig(clh.config.EntropySource) + clh.vmconfig.Rng.SetIommu(clh.config.IOMMU) // set the initial root/boot disk of hypervisor assetPath, assetType, err := clh.config.ImageOrInitrdAssetPath() @@ -561,6 +572,7 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net } else { pmem := chclient.NewPmemConfig(assetPath) *pmem.DiscardWrites = true + pmem.SetIommu(clh.config.IOMMU) if clh.vmconfig.Pmem != nil { *clh.vmconfig.Pmem = append(*clh.vmconfig.Pmem, *pmem) @@ -594,6 +606,7 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net clh.vmconfig.Console = chclient.NewConsoleConfig(cctOFF) } + clh.vmconfig.Console.SetIommu(clh.config.IOMMU) cpu_topology := chclient.NewCpuTopology() cpu_topology.ThreadsPerCore = func(i int32) *int32 { return &i }(1) @@ -836,6 +849,7 @@ func (clh *cloudHypervisor) hotplugAddBlockDevice(drive *config.BlockDrive) erro queueSize := int32(1024) 
clhDisk.NumQueues = &queues clhDisk.QueueSize = &queueSize + clhDisk.SetIommu(clh.config.IOMMU) diskRateLimiterConfig := clh.getDiskRateLimiterConfig() if diskRateLimiterConfig != nil { @@ -861,6 +875,7 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error { // Create the clh device config via the constructor to ensure default values are properly assigned clhDevice := *chclient.NewDeviceConfig(device.SysfsDev) + clhDevice.SetIommu(clh.config.IOMMU) pciInfo, _, err := cl.VmAddDevicePut(ctx, clhDevice) if err != nil { return fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err)) @@ -1535,6 +1550,7 @@ func (clh *cloudHypervisor) addVSock(cid int64, path string) { }).Info("Adding HybridVSock") clh.vmconfig.Vsock = chclient.NewVsockConfig(cid, path) + clh.vmconfig.Vsock.SetIommu(clh.config.IOMMU) } func (clh *cloudHypervisor) getRateLimiterConfig(bwSize, bwOneTimeBurst, opsSize, opsOneTimeBurst int64) *chclient.RateLimiterConfig { @@ -1604,6 +1620,7 @@ func (clh *cloudHypervisor) addNet(e Endpoint) error { if netRateLimiterConfig != nil { net.SetRateLimiterConfig(*netRateLimiterConfig) } + net.SetIommu(clh.config.IOMMU) if clh.netDevices != nil { *clh.netDevices = append(*clh.netDevices, *net) @@ -1636,6 +1653,7 @@ func (clh *cloudHypervisor) addVolume(volume types.Volume) error { } fs := chclient.NewFsConfig(volume.MountTag, vfsdSockPath, numQueues, queueSize) + fs.SetPciSegment(1) clh.vmconfig.Fs = &[]chclient.FsConfig{*fs} clh.Logger().Debug("Adding share volume to hypervisor: ", volume.MountTag) diff --git a/src/runtime/virtcontainers/clh_test.go b/src/runtime/virtcontainers/clh_test.go index e456f1dcf7..cc559f4414 100644 --- a/src/runtime/virtcontainers/clh_test.go +++ b/src/runtime/virtcontainers/clh_test.go @@ -68,6 +68,7 @@ func newClhConfig() (HypervisorConfig, error) { NetRateLimiterBwOneTimeBurst: int64(0), NetRateLimiterOpsMaxRate: int64(0), NetRateLimiterOpsOneTimeBurst: int64(0), + HotPlugVFIO: 
config.NoPort, }, nil } diff --git a/tests/common.bash b/tests/common.bash index 08b82601bf..fc0ed0a7f9 100644 --- a/tests/common.bash +++ b/tests/common.bash @@ -158,7 +158,7 @@ function clean_env_ctr() info "Wait until the containers gets removed" for task_id in "${running_tasks[@]}"; do - sudo ctr t kill -a -s SIGTERM ${task_id} >/dev/null 2>&1 + sudo timeout -s SIGKILL 30s ctr t kill -a -s SIGTERM ${task_id} >/dev/null 2>&1 || true sleep 0.5 done diff --git a/tests/functional/vfio/gha-run.sh b/tests/functional/vfio/gha-run.sh index f4cb608de6..97c72f80b7 100755 --- a/tests/functional/vfio/gha-run.sh +++ b/tests/functional/vfio/gha-run.sh @@ -15,10 +15,33 @@ source "${vfio_dir}/../../common.bash" function install_dependencies() { info "Installing the dependencies needed for running the vfio tests" + ( + source /etc/os-release || source /usr/lib/os-release + case "${ID}" in + ubuntu) + # cloud image dependencies + deps=(xorriso curl qemu-utils openssh-client) + + sudo apt-get update + sudo apt-get install -y ${deps[@]} qemu-system-x86 + ;; + fedora) + # cloud image dependencies + deps=(xorriso curl qemu-img openssh) + + sudo dnf install -y ${deps[@]} qemu-system-x86-core + ;; + + *) + die "Unsupported distro: ${ID}" + ;; + esac + ) } function run() { info "Running cri-containerd tests using ${KATA_HYPERVISOR} hypervisor" + "${vfio_dir}"/vfio_fedora_vm_wrapper.sh } function main() { diff --git a/tests/functional/vfio/guest-kernel.json.in b/tests/functional/vfio/guest-kernel.json.in new file mode 100644 index 0000000000..31c0af9f08 --- /dev/null +++ b/tests/functional/vfio/guest-kernel.json.in @@ -0,0 +1,176 @@ +# +# Copyright (c) 2021 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +{ + "ociVersion": "1.0.0-rc2-dev", + "platform": { + "os": "linux", + "arch": "amd64" + }, + "annotations": { + "io.katacontainers.config.hypervisor.enable_iommu": "false", + "io.katacontainers.config.runtime.vfio_mode": "guest-kernel" + }, + "process": { + "terminal": 
false, + "consoleSize": { + "height": 0, + "width": 0 + }, + "user": { + "uid": 0, + "gid": 0 + }, + "args": [ "/bin/tail", "-f", "/dev/null" ], + "env": [ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm" + ], + "cwd": "/", + "rlimits": [{ + "type": "RLIMIT_NOFILE", + "hard": 1024, + "soft": 1024 + }], + "noNewPrivileges": true + }, + "root": { + "path": "@ROOTFS@", + "readonly": false + }, + "hostname": "vfio-test", + "mounts": [{ + "destination": "/proc", + "type": "proc", + "source": "proc" + }, + { + "destination": "/dev", + "type": "tmpfs", + "source": "tmpfs", + "options": [ + "nosuid", + "strictatime", + "mode=755", + "size=65536k" + ] + }, + { + "destination": "/dev/pts", + "type": "devpts", + "source": "devpts", + "options": [ + "nosuid", + "noexec", + "newinstance", + "ptmxmode=0666", + "mode=0620", + "gid=5" + ] + }, + { + "destination": "/dev/shm", + "type": "tmpfs", + "source": "shm", + "options": [ + "nosuid", + "noexec", + "nodev", + "mode=1777", + "size=65536k" + ] + }, + { + "destination": "/dev/mqueue", + "type": "mqueue", + "source": "mqueue", + "options": [ + "nosuid", + "noexec", + "nodev" + ] + }, + { + "destination": "/sys", + "type": "sysfs", + "source": "sysfs", + "options": [ + "nosuid", + "noexec", + "nodev", + "ro" + ] + }, + { + "destination": "/sys/fs/cgroup", + "type": "cgroup", + "source": "cgroup", + "options": [ + "nosuid", + "noexec", + "nodev", + "relatime", + "ro" + ] + } + ], + "hooks": {}, + "linux": { + "devices": [{ + "path": "@VFIO_PATH@", + "type": "c", + "major": @VFIO_MAJOR@, + "minor": @VFIO_MINOR@, + "fileMode": 384, + "uid": 0, + "gid": 0 + }], + "cgroupsPath": "kata/vfiotest", + "resources": { + "devices": [ + {"allow":false,"access":"rwm"}, + {"allow":true,"type":"c","major":1,"minor":3,"access":"rwm"}, + {"allow":true,"type":"c","major":1,"minor":5,"access":"rwm"}, + {"allow":true,"type":"c","major":1,"minor":8,"access":"rwm"}, + 
{"allow":true,"type":"c","major":1,"minor":9,"access":"rwm"}, + {"allow":true,"type":"c","major":5,"minor":0,"access":"rwm"}, + {"allow":true,"type":"c","major":5,"minor":1,"access":"rwm"}, + {"allow": true,"access": "rwm","major": @VFIO_MAJOR@,"minor": @VFIO_MINOR@,"type": "c"} + ] + }, + "namespaces": [{ + "type": "pid" + }, + { + "type": "network" + }, + { + "type": "ipc" + }, + { + "type": "uts" + }, + { + "type": "mount" + } + ], + "maskedPaths": [ + "/proc/kcore", + "/proc/latency_stats", + "/proc/timer_list", + "/proc/timer_stats", + "/proc/sched_debug", + "/sys/firmware" + ], + "readonlyPaths": [ + "/proc/asound", + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger" + ] + } +} diff --git a/tests/functional/vfio/run.sh b/tests/functional/vfio/run.sh new file mode 100755 index 0000000000..4f36709a88 --- /dev/null +++ b/tests/functional/vfio/run.sh @@ -0,0 +1,350 @@ +#!/bin/bash +# +# Copyright (c) 2021 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +set -x +set -o errexit +set -o nounset +set -o pipefail +set -o errtrace + +script_path=$(dirname "$0") +source "${script_path}/../../common.bash" + +addr= +tmp_data_dir="$(mktemp -d)" +rootfs_tar="${tmp_data_dir}/rootfs.tar" +trap cleanup EXIT + +# kata-runtime options +SANDBOX_CGROUP_ONLY="" +HYPERVISOR= +MACHINE_TYPE= +IMAGE_TYPE= + +cleanup() { + clean_env_ctr + sudo rm -rf "${tmp_data_dir}" + + [ -n "${host_pci:-}" ] && sudo driverctl unset-override "${host_pci}" +} + +host_pci_addr() { + lspci -D | grep "Ethernet controller" | grep "Virtio.*network device" | tail -1 | cut -d' ' -f1 +} + +get_vfio_path() { + local addr="$1" + echo "/dev/vfio/$(basename $(realpath /sys/bus/pci/drivers/vfio-pci/${addr}/iommu_group))" +} + +pull_rootfs() { + # pull and export busybox image in tar file + local image="quay.io/prometheus/busybox:latest" + sudo -E ctr i pull ${image} + sudo -E ctr i export "${rootfs_tar}" "${image}" + sudo chown ${USER}:${USER} "${rootfs_tar}" + sync 
+} + +create_bundle() { + local bundle_dir="$1" + mkdir -p "${bundle_dir}" + + # extract busybox rootfs + local rootfs_dir="${bundle_dir}/rootfs" + mkdir -p "${rootfs_dir}" + local layers_dir="$(mktemp -d)" + tar -C "${layers_dir}" -pxf "${rootfs_tar}" + for ((i=0;i<$(cat ${layers_dir}/manifest.json | jq -r ".[].Layers | length");i++)); do + tar -C ${rootfs_dir} -xf ${layers_dir}/$(cat ${layers_dir}/manifest.json | jq -r ".[].Layers[${i}]") + done + sync + + # Copy config.json + cp -a "${script_path}/config.json" "${bundle_dir}/config.json" +} + +run_container() { + local container_id="$1" + local bundle_dir="$2" + + sudo -E ctr run -d --runtime io.containerd.kata.v2 --config "${bundle_dir}/config.json" "${container_id}" +} + + +get_ctr_cmd_output() { + local container_id="$1" + shift + timeout 30s sudo -E ctr t exec --exec-id 2 "${container_id}" "${@}" +} + +check_guest_kernel() { + local container_id="$1" + # For vfio_mode=guest-kernel, the device should be bound to + # the guest kernel's native driver. To check this has worked, + # we look for an ethernet device named 'eth*' + get_ctr_cmd_output "${container_id}" ip a | grep "eth" || die "Missing VFIO network interface" +} + +check_vfio() { + local cid="$1" + # For vfio_mode=vfio, the device should be bound to the guest + # vfio-pci driver. + + # Check the control device is visible + get_ctr_cmd_output "${cid}" ls /dev/vfio/vfio || die "Couldn't find VFIO control device in container" + + # The device should *not* cause an ethernet interface to appear + ! 
get_ctr_cmd_output "${cid}" ip a | grep "eth" || die "Unexpected network interface" + + # There should be exactly one VFIO group device (there might + # be multiple IOMMU groups in the VM, but only one device + # should be bound to the VFIO driver, so there should still + # only be one VFIO device + group="$(get_ctr_cmd_output "${cid}" ls /dev/vfio | grep -v vfio)" + if [ $(echo "${group}" | wc -w) != "1" ] ; then + die "Expected exactly one VFIO group got: ${group}" + fi + + # There should be two devices in the IOMMU group: the ethernet + # device we care about, plus the PCIe to PCI bridge device + devs="$(get_ctr_cmd_output "${cid}" ls /sys/kernel/iommu_groups/"${group}"/devices)" + num_devices=$(echo "${devs}" | wc -w) + if [ "${HYPERVISOR}" = "qemu" ] && [ "${num_devices}" != "2" ] ; then + die "Expected exactly two devices got: ${devs}" + fi + if [ "${HYPERVISOR}" = "clh" ] && [ "${num_devices}" != "1" ] ; then + die "Expected exactly one device got: ${devs}" + fi + + # The bridge device will always sort first, because it is on + # bus zero, whereas the NIC will be on a non-zero bus + guest_pci=$(echo "${devs}" | tail -1) + + # This is a roundabout way of getting the environment + # variable, but to use the more obvious "echo $PCIDEVICE_..." + # we would have to escape the '$' enough to not be expanded + # before it's injected into the container, but not so much + # that it *is* expanded by the shell within the container. + # Doing that with another shell function in between is very + # fragile, so do it this way instead. 
+ guest_env="$(get_ctr_cmd_output "${cid}" env | grep ^PCIDEVICE_VIRTIO_NET | sed s/^[^=]*=//)" + if [ "${guest_env}" != "${guest_pci}" ]; then + die "PCIDEVICE variable was \"${guest_env}\" instead of \"${guest_pci}\"" + fi +} + +get_dmesg() { + local container_id="$1" + get_ctr_cmd_output "${container_id}" dmesg +} + +# Show help about this script +help(){ +cat << EOF +Usage: $0 [-h] [options] + Description: + This script runs a kata container and passthrough a vfio device + Options: + -h, Help + -i , Specify initrd or image + -m , Specify kata-runtime machine type for qemu hypervisor + -p , Specify kata-runtime hypervisor + -s , Set sandbox_cgroup_only in the configuration file +EOF +} + +setup_configuration_file() { + local qemu_config_file="configuration-qemu.toml" + local clh_config_file="configuration-clh.toml" + local image_file="/opt/kata/share/kata-containers/kata-containers.img" + local initrd_file="/opt/kata/share/kata-containers/kata-containers-initrd.img" + local kata_config_file="" + + for file in $(kata-runtime --kata-show-default-config-paths); do + if [ ! -f "${file}" ]; then + continue + fi + + kata_config_file="${file}" + config_dir=$(dirname ${file}) + config_filename="" + + if [ "$HYPERVISOR" = "qemu" ]; then + config_filename="${qemu_config_file}" + elif [ "$HYPERVISOR" = "clh" ]; then + config_filename="${clh_config_file}" + fi + + config_file="${config_dir}/${config_filename}" + if [ -f "${config_file}" ]; then + rm -f "${kata_config_file}" + cp -a $(realpath "${config_file}") "${kata_config_file}" + break + fi + done + + # machine type applies to configuration.toml and configuration-qemu.toml + if [ -n "$MACHINE_TYPE" ]; then + if [ "$HYPERVISOR" = "qemu" ]; then + sed -i 's|^machine_type.*|machine_type = "'${MACHINE_TYPE}'"|g' "${kata_config_file}" + else + warn "Variable machine_type only applies to qemu. 
It will be ignored" + fi + fi + + # Make sure we have set hot_plug_vfio to a reasonable value + if [ "$HYPERVISOR" = "qemu" ]; then + sed -i -e 's|^#*.*hot_plug_vfio.*|hot_plug_vfio = "bridge-port"|' "${kata_config_file}" + elif [ "$HYPERVISOR" = "clh" ]; then + sed -i -e 's|^#*.*hot_plug_vfio.*|hot_plug_vfio = "root-port"|' "${kata_config_file}" + fi + + if [ -n "${SANDBOX_CGROUP_ONLY}" ]; then + sed -i 's|^sandbox_cgroup_only.*|sandbox_cgroup_only='${SANDBOX_CGROUP_ONLY}'|g' "${kata_config_file}" + fi + + # Change to initrd or image depending on user input. + # Non-default configs must be changed to specify either initrd or image, image is default. + if [ "$IMAGE_TYPE" = "initrd" ]; then + if $(grep -q "^image.*" ${kata_config_file}); then + if $(grep -q "^initrd.*" ${kata_config_file}); then + sed -i '/^image.*/d' "${kata_config_file}" + else + sed -i 's|^image.*|initrd = "'${initrd_file}'"|g' "${kata_config_file}" + fi + fi + else + if $(grep -q "^initrd.*" ${kata_config_file}); then + if $(grep -q "^image.*" ${kata_config_file}); then + sed -i '/^initrd.*/d' "${kata_config_file}" + else + sed -i 's|^initrd.*|image = "'${image_file}'"|g' "${kata_config_file}" + fi + fi + fi + + # enable debug + sed -i -e 's/^#\(enable_debug\).*=.*$/\1 = true/g' \ + -e 's/^#\(debug_console_enabled\).*=.*$/\1 = true/g' \ + -e 's/^kernel_params = "\(.*\)"/kernel_params = "\1 mitigations=off agent.log=debug"/g' \ + "${kata_config_file}" + + # enable VFIO relevant hypervisor annotations + sed -i -e 's/^\(enable_annotations\).*=.*$/\1 = ["enable_iommu"]/' \ + "${kata_config_file}" +} + +run_test_container() { + local container_id="$1" + local bundle_dir="$2" + local config_json_in="$3" + local host_pci="$4" + + # generate final config.json + sed -e '/^#.*/d' \ + -e 's|@VFIO_PATH@|'"${vfio_device}"'|g' \ + -e 's|@VFIO_MAJOR@|'"${vfio_major}"'|g' \ + -e 's|@VFIO_MINOR@|'"${vfio_minor}"'|g' \ + -e 's|@VFIO_CTL_MAJOR@|'"${vfio_ctl_major}"'|g' \ + -e 
's|@VFIO_CTL_MINOR@|'"${vfio_ctl_minor}"'|g' \ + -e 's|@ROOTFS@|'"${bundle_dir}/rootfs"'|g' \ + -e 's|@HOST_PCI@|'"${host_pci}"'|g' \ + "${config_json_in}" > "${script_path}/config.json" + + create_bundle "${bundle_dir}" + + # run container + run_container "${container_id}" "${bundle_dir}" + + # output VM dmesg + get_dmesg "${container_id}" +} + +main() { + local OPTIND + while getopts "hi:m:p:s:" opt;do + case ${opt} in + h) + help + exit 0; + ;; + i) + IMAGE_TYPE="${OPTARG}" + ;; + m) + MACHINE_TYPE="${OPTARG}" + ;; + p) + HYPERVISOR="${OPTARG}" + ;; + s) + SANDBOX_CGROUP_ONLY="${OPTARG}" + ;; + ?) + # parse failure + help + die "Failed to parse arguments" + ;; + esac + done + shift $((OPTIND-1)) + + # + # Get the device ready on the host + # + setup_configuration_file + + restart_containerd_service + sudo modprobe vfio + sudo modprobe vfio-pci + + host_pci=$(host_pci_addr) + [ -n "${host_pci}" ] || die "virtio ethernet controller PCI address not found" + + cat /proc/cmdline | grep -q "intel_iommu=on" || \ + die "intel_iommu=on not found in kernel cmdline" + + sudo driverctl set-override "${host_pci}" vfio-pci + + vfio_device="$(get_vfio_path "${host_pci}")" + [ -e "${vfio_device}" ] || die "vfio device not found" + vfio_major="$(printf '%d' $(stat -c '0x%t' ${vfio_device}))" + vfio_minor="$(printf '%d' $(stat -c '0x%T' ${vfio_device}))" + + [ -e "/dev/vfio/vfio" ] || die "vfio control device not found" + vfio_ctl_major="$(printf '%d' $(stat -c '0x%t' /dev/vfio/vfio))" + vfio_ctl_minor="$(printf '%d' $(stat -c '0x%T' /dev/vfio/vfio))" + + # Get the rootfs we'll use for all tests + pull_rootfs + + # + # Run the tests + # + + # test for guest-kernel mode + guest_kernel_cid="vfio-guest-kernel-${RANDOM}" + run_test_container "${guest_kernel_cid}" \ + "${tmp_data_dir}/vfio-guest-kernel" \ + "${script_path}/guest-kernel.json.in" \ + "${host_pci}" + check_guest_kernel "${guest_kernel_cid}" + + # Remove the container so we can re-use the device for the next test + 
clean_env_ctr + + # test for vfio mode + vfio_cid="vfio-vfio-${RANDOM}" + run_test_container "${vfio_cid}" \ + "${tmp_data_dir}/vfio-vfio" \ + "${script_path}/vfio.json.in" \ + "${host_pci}" + check_vfio "${vfio_cid}" +} + +main $@ diff --git a/tests/functional/vfio/vfio.json.in b/tests/functional/vfio/vfio.json.in new file mode 100644 index 0000000000..19667c01e7 --- /dev/null +++ b/tests/functional/vfio/vfio.json.in @@ -0,0 +1,187 @@ +# +# Copyright (c) 2021 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +{ + "ociVersion": "1.0.0-rc2-dev", + "platform": { + "os": "linux", + "arch": "amd64" + }, + "annotations": { + "io.katacontainers.config.hypervisor.enable_iommu": "true", + "io.katacontainers.config.runtime.vfio_mode": "vfio" + }, + "process": { + "terminal": false, + "consoleSize": { + "height": 0, + "width": 0 + }, + "user": { + "uid": 0, + "gid": 0 + }, + "args": [ "/bin/tail", "-f", "/dev/null" ], + "env": [ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm", + "PCIDEVICE_VIRTIO_NET=@HOST_PCI@" + ], + "cwd": "/", + "rlimits": [{ + "type": "RLIMIT_NOFILE", + "hard": 1024, + "soft": 1024 + }], + "noNewPrivileges": true + }, + "root": { + "path": "@ROOTFS@", + "readonly": false + }, + "hostname": "vfio-test", + "mounts": [{ + "destination": "/proc", + "type": "proc", + "source": "proc" + }, + { + "destination": "/dev", + "type": "tmpfs", + "source": "tmpfs", + "options": [ + "nosuid", + "strictatime", + "mode=755", + "size=65536k" + ] + }, + { + "destination": "/dev/pts", + "type": "devpts", + "source": "devpts", + "options": [ + "nosuid", + "noexec", + "newinstance", + "ptmxmode=0666", + "mode=0620", + "gid=5" + ] + }, + { + "destination": "/dev/shm", + "type": "tmpfs", + "source": "shm", + "options": [ + "nosuid", + "noexec", + "nodev", + "mode=1777", + "size=65536k" + ] + }, + { + "destination": "/dev/mqueue", + "type": "mqueue", + "source": "mqueue", + "options": [ + "nosuid", + "noexec", + "nodev" + ] + 
}, + { + "destination": "/sys", + "type": "sysfs", + "source": "sysfs", + "options": [ + "nosuid", + "noexec", + "nodev", + "ro" + ] + }, + { + "destination": "/sys/fs/cgroup", + "type": "cgroup", + "source": "cgroup", + "options": [ + "nosuid", + "noexec", + "nodev", + "relatime", + "ro" + ] + } + ], + "hooks": {}, + "linux": { + "devices": [{ + "path": "/dev/vfio/vfio", + "type": "c", + "major": @VFIO_CTL_MAJOR@, + "minor": @VFIO_CTL_MINOR@, + "fileMode": 438, + "uid": 0, + "gid": 0 + }, + { + "path": "@VFIO_PATH@", + "type": "c", + "major": @VFIO_MAJOR@, + "minor": @VFIO_MINOR@, + "fileMode": 384, + "uid": 0, + "gid": 0 + }], + "cgroupsPath": "kata/vfiotest", + "resources": { + "devices": [ + {"allow":false,"access":"rwm"}, + {"allow":true,"type":"c","major":1,"minor":3,"access":"rwm"}, + {"allow":true,"type":"c","major":1,"minor":5,"access":"rwm"}, + {"allow":true,"type":"c","major":1,"minor":8,"access":"rwm"}, + {"allow":true,"type":"c","major":1,"minor":9,"access":"rwm"}, + {"allow":true,"type":"c","major":5,"minor":0,"access":"rwm"}, + {"allow":true,"type":"c","major":5,"minor":1,"access":"rwm"}, + {"allow": true,"access": "rwm","major": @VFIO_CTL_MAJOR@,"minor": @VFIO_CTL_MINOR@,"type": "c"}, + {"allow": true,"access": "rwm","major": @VFIO_MAJOR@,"minor": @VFIO_MINOR@,"type": "c"} + ] + }, + "namespaces": [{ + "type": "pid" + }, + { + "type": "network" + }, + { + "type": "ipc" + }, + { + "type": "uts" + }, + { + "type": "mount" + } + ], + "maskedPaths": [ + "/proc/kcore", + "/proc/latency_stats", + "/proc/timer_list", + "/proc/timer_stats", + "/proc/sched_debug", + "/sys/firmware" + ], + "readonlyPaths": [ + "/proc/asound", + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger" + ] + } +} diff --git a/tests/functional/vfio/vfio_fedora_vm_wrapper.sh b/tests/functional/vfio/vfio_fedora_vm_wrapper.sh new file mode 100755 index 0000000000..bddd034459 --- /dev/null +++ b/tests/functional/vfio/vfio_fedora_vm_wrapper.sh @@ -0,0 +1,329 
@@
+#!/bin/bash
+#
+# Copyright (c) 2020 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+
+# Boot a Fedora cloud image in a QEMU VM that supports VFIO (vIOMMU enabled),
+# then run the VFIO functional tests inside that VM over SSH.
+
+set -o xtrace
+set -o errexit
+set -o nounset
+set -o pipefail
+set -o errtrace
+
+cidir=$(readlink -f "$(dirname "$0")")
+
+source /etc/os-release || source /usr/lib/os-release
+#
+source "${cidir}/../../common.bash"
+export WORKSPACE="${WORKSPACE:-${HOME}}"
+export GIT_URL="https://github.com/kata-containers/kata-containers.git"
+export KATA_HYPERVISOR="${KATA_HYPERVISOR:-qemu}"
+#
+
+# Default the proxy variables so that 'nounset' does not abort the script
+# when they are not exported by the caller.
+http_proxy=${http_proxy:-}
+https_proxy=${https_proxy:-}
+no_proxy=${no_proxy:-}
+vm_ip="127.0.15.1"
+vm_port="10022"
+# Don't save data in /tmp, we need it after rebooting the system
+data_dir="${HOME}/functional-vfio-test"
+ssh_key_file="${data_dir}/key"
+arch=$(uname -m)
+artifacts_dir="${WORKSPACE}/artifacts"
+
+# Force-stop every qemu-system-${arch} process on the host.
+# Best effort: killall exits non-zero when no such process exists, which must
+# not abort the caller (EXIT trap or retry loop) under 'errexit'.
+kill_vms() {
+	sudo killall -9 "qemu-system-${arch}" || true
+}
+
+# EXIT trap: fetch the guest artifacts, stop the VM and dump diagnostics
+# (guest journal and host dmesg) as collapsible log groups.
+cleanup() {
+	mkdir -p "${artifacts_dir}"
+	sudo chown -R "${USER}" "${artifacts_dir}"
+	# The glob is quoted so it is expanded on the guest side by scp, not
+	# locally against whatever already exists in the host artifacts dir.
+	scp_vm "${artifacts_dir}/*" "${artifacts_dir}" || true
+	kill_vms
+
+	echo "::group::L2 journal"
+	# The journal only exists if the guest script got far enough to write it.
+	cat "${artifacts_dir}/journal.log" || true
+	echo "::endgroup::"
+
+	echo "::group::L1 dmesg"
+	sudo dmesg
+	echo "::endgroup::"
+}
+
+# Generate a fresh passphrase-less RSA key pair used to log into the VM.
+create_ssh_key() {
+	rm -f "${ssh_key_file}"
+	ssh-keygen -f "${ssh_key_file}" -t rsa -N ""
+}
+
+# Write the config-drive meta_data.json.
+#   $1: destination file
+create_meta_data() {
+	file="$1"
+	cat <<EOF > "${file}"
+{
+  "uuid": "d1b4aafa-5d75-4f9c-87eb-2ceabe110c39",
+  "hostname": "test"
+}
+EOF
+}
+
+# Write the cloud-config user_data: creates the test user, configures dnf and
+# the proxies, and installs /home/${USER}/run.sh which runs the VFIO tests.
+#   $1: destination file
+#   $2: SSH public key file authorised for the test user
+create_user_data() {
+	file="$1"
+	ssh_pub_key_file="$2"
+
+	ssh_pub_key="$(cat "${ssh_pub_key_file}")"
+	dnf_proxy=""
+	service_proxy=""
+	docker_user_proxy="{}"
+	# Forward the CI-related environment to the guest as single-quoted
+	# 'export' lines, indented so they sit inside the YAML literal block.
+	environment=$(env | grep -E "ghprb|WORKSPACE|KATA|GIT|JENKINS|_PROXY|_proxy" | \
+		sed -e "s/'/'\"'\"'/g" \
+		-e "s/\(^[[:alnum:]_]\+\)=/\1='/" \
+		-e "s/$/'/" \
+		-e 's/^/    export /')
+
+	if [ -n "${http_proxy}" ] && [ -n "${https_proxy}" ]; then
+		dnf_proxy="proxy=${http_proxy}"
+		service_proxy='[Service]
+    Environment="HTTP_PROXY='${http_proxy}'" "HTTPS_PROXY='${https_proxy}'" "NO_PROXY='${no_proxy}'"'
+		docker_user_proxy='{"proxies": { "default": {
+    "httpProxy": "'${http_proxy}'",
+    "httpsProxy": "'${https_proxy}'",
+    "noProxy": "'${no_proxy}'"
+    } } }'
+	fi
+
+	# Unquoted here-doc: plain ${var} is expanded now on the host, while
+	# \$var / \$(cmd) is left for the guest to evaluate at run time.
+	cat <<EOF > "${file}"
+#cloud-config
+package_upgrade: false
+runcmd:
+- chown -R ${USER}:${USER} /home/${USER}
+- touch /.done
+users:
+- gecos: User
+  gid: "1000"
+  lock-passwd: true
+  name: ${USER}
+  shell: /bin/bash
+  ssh-authorized-keys:
+  - ${ssh_pub_key}
+  sudo: ALL=(ALL) NOPASSWD:ALL
+  uid: "1000"
+write_files:
+- content: |
+    [main]
+    fastestmirror=True
+    gpgcheck=1
+    max_parallel_downloads=10
+    installonly_limit=2
+    clean_requirements_on_remove=True
+    keepcache=True
+    ip_resolve=4
+  path: /etc/dnf/dnf.conf
+- content: |
+${environment}
+  path: /etc/environment
+- content: |
+    ${service_proxy}
+  path: /etc/systemd/system/docker.service.d/http-proxy.conf
+- content: |
+    ${service_proxy}
+  path: /etc/systemd/system/containerd.service.d/http-proxy.conf
+- content: |
+    ${docker_user_proxy}
+  path: ${HOME}/.docker/config.json
+- content: |
+    ${docker_user_proxy}
+  path: /root/.docker/config.json
+- content: |
+    set -x
+    set -o errexit
+    set -o nounset
+    set -o pipefail
+    set -o errtrace
+    . /etc/environment
+    . /etc/os-release
+
+    [ "\$ID" = "fedora" ] || (echo >&2 "\$0 only supports Fedora"; exit 1)
+
+    echo "${dnf_proxy}" | sudo tee -a /etc/dnf/dnf.conf
+
+    for i in \$(seq 1 50); do
+        [ -f /.done ] && break
+        echo "waiting for cloud-init to finish"
+        sleep 5;
+    done
+
+    export DEBUG=true
+    export GOPATH=\${WORKSPACE}/go
+    export PATH=\${GOPATH}/bin:/usr/local/go/bin:/usr/sbin:\${PATH}
+    export GOROOT="/usr/local/go"
+
+    # Make sure the packages were installed
+    # Sometimes cloud-init is unable to install them
+    sudo dnf install -y git wget pciutils driverctl
+
+    git config --global user.email "foo@bar"
+    git config --global user.name "Foo Bar"
+
+    sudo mkdir -p /workspace
+    sudo mount -t 9p -o access=any,trans=virtio,version=9p2000.L workspace /workspace
+    mkdir -p ${artifacts_dir}
+    trap "cd /workspace; sudo journalctl -b0 > ${artifacts_dir}/journal.log || true; sudo chown -R \${USER} ${artifacts_dir}" EXIT
+
+    pushd /workspace
+    source tests/common.bash
+    ensure_yq
+    cri_containerd=\$(get_from_kata_deps "externals.containerd.lts")
+    cri_tools=\$(get_from_kata_deps "externals.critools.latest")
+    install_cri_containerd \${cri_containerd}
+    install_cri_tools \${cri_tools}
+
+    kata_tarball_dir="kata-artifacts"
+    install_kata
+
+    sudo /workspace/tests/functional/vfio/run.sh -s false -p \${KATA_HYPERVISOR} -m q35 -i image
+    sudo /workspace/tests/functional/vfio/run.sh -s true -p \${KATA_HYPERVISOR} -m q35 -i image
+
+  path: /home/${USER}/run.sh
+  permissions: '0755'
+EOF
+}
+
+# Build the config-drive ISO (volume label "config-2") holding the meta and
+# user data generated above.
+#   $1: path of the ISO to create (any existing file is replaced)
+create_config_iso() {
+	iso_file="$1"
+	ssh_pub_key_file="${ssh_key_file}.pub"
+	iso_data_dir="${data_dir}/d"
+	meta_data_file="${iso_data_dir}/openstack/latest/meta_data.json"
+	user_data_file="${iso_data_dir}/openstack/latest/user_data"
+
+	mkdir -p "$(dirname "${user_data_file}")"
+
+	create_meta_data "${meta_data_file}"
+	create_user_data "${user_data_file}" "${ssh_pub_key_file}"
+
+	rm -f "${iso_file}"
+
+	xorriso -as mkisofs -R -V config-2 -o "${iso_file}" "${iso_data_dir}"
+}
+
+# Download (once, cached per Fedora version) the Fedora cloud image, copy it
+# to the requested location, patch its boot entries and grow it by 20G.
+#   $1: path of the raw disk image to produce
+pull_fedora_cloud_image() {
+	fedora_img="$1"
+	fedora_version=38
+	# Add a version to the image cache, otherwise the tests are going to
+	# use always the same image without rebuilding it, regardless the version
+	# set in fedora_version
+	fedora_img_cache="${fedora_img}.cache.${fedora_version}"
+	fedora_img_url="https://download.fedoraproject.org/pub/fedora/linux/releases/${fedora_version}/Cloud/${arch}/images/Fedora-Cloud-Base-${fedora_version}-1.6.${arch}.raw.xz"
+
+	if [ ! -f "${fedora_img_cache}" ]; then
+		# -f: fail on HTTP errors instead of saving the error page as the image
+		curl -fsSL "${fedora_img_url}" -o "${fedora_img_cache}.xz"
+		xz -f -d "${fedora_img_cache}.xz"
+	fi
+
+	cp -a "${fedora_img_cache}" "${fedora_img}"
+
+	# setup cloud image
+	sudo losetup -D
+	loop=$(sudo losetup --show -Pf "${fedora_img}")
+	sudo mount "${loop}p2" /mnt
+
+	# add intel_iommu=on to the guest kernel command line
+	kernelopts="intel_iommu=on iommu=pt selinux=0 mitigations=off idle=poll kvm.tdp_mmu=0"
+	entries=$(sudo ls /mnt/loader/entries/)
+	for entry in ${entries}; do
+		sudo sed -i '/^options / s/$/ '"${kernelopts}"' /g' /mnt/loader/entries/"${entry}"
+	done
+	sudo sed -i 's|kernelopts="|kernelopts="'"${kernelopts}"'|g' /mnt/grub2/grub.cfg
+	sudo sed -i 's|kernelopts=|kernelopts='"${kernelopts}"'|g' /mnt/grub2/grubenv
+
+	# cleanup
+	sudo umount -R /mnt/
+	sudo losetup -d "${loop}"
+
+	qemu-img resize -f raw "${fedora_img}" +20G
+}
+
+# Reload the KVM modules with the TDP MMU disabled.
+reload_kvm() {
+	# TDP_MMU is buggy on Hyper-V until v6.3/v6.4
+	sudo rmmod kvm-intel kvm-amd kvm || true
+	sudo modprobe kvm tdp_mmu=0
+	sudo modprobe kvm-intel || true
+	sudo modprobe kvm-amd || true
+}
+
+# Start the daemonized QEMU VM with an emulated Intel IOMMU, two virtio-net
+# devices, SSH forwarded to ${vm_ip}:${vm_port}, and the repository shared
+# over 9p under the mount tag "workspace".
+#   $1: raw disk image to boot
+#   $2: config-drive ISO attached as a CD-ROM
+# NOTE(review): repo_root_dir is not defined in this script; presumably it
+# comes from the sourced common.bash - confirm.
+run_vm() {
+	image="$1"
+	config_iso="$2"
+	disable_modern="off"
+	hostname="$(hostname)"
+	memory="8192M"
+	cpus=2
+	machine_type="q35"
+
+	reload_kvm
+
+	sudo /usr/bin/qemu-system-${arch} -m "${memory}" -smp cpus="${cpus}" \
+		-cpu host,host-phys-bits \
+		-machine ${machine_type},accel=kvm,kernel_irqchip=split \
+		-device intel-iommu,intremap=on,caching-mode=on,device-iotlb=on \
+		-drive file=${image},if=virtio,aio=threads,format=raw \
+		-drive file=${config_iso},if=virtio,media=cdrom \
+		-daemonize -enable-kvm -device virtio-rng-pci -display none -vga none \
+		-netdev user,hostfwd=tcp:${vm_ip}:${vm_port}-:22,hostname="${hostname}",id=net0 \
+		-device virtio-net-pci,netdev=net0,disable-legacy=on,disable-modern="${disable_modern}",iommu_platform=on,ats=on \
+		-netdev user,id=net1 \
+		-device virtio-net-pci,netdev=net1,disable-legacy=on,disable-modern="${disable_modern}",iommu_platform=on,ats=on \
+		-fsdev local,path=${repo_root_dir},security_model=passthrough,id=fs0 \
+		-device virtio-9p-pci,fsdev=fs0,mount_tag=workspace
+
+}
+
+# Run a command inside the VM over SSH.
+#   $@: command line, joined into a single remote command string
+ssh_vm() {
+	cmd="$*"
+	ssh -q -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i "${ssh_key_file}" -p "${vm_port}" "${USER}@${vm_ip}" "${cmd}"
+}
+
+# Copy files from the VM to the host.
+#   $1: source path (or glob) on the guest
+#   $2: destination on the host
+scp_vm() {
+	guest_src=$1
+	host_dest=$2
+	scp -q -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i "${ssh_key_file}" -P "${vm_port}" ${USER}@${vm_ip}:${guest_src} ${host_dest}
+}
+
+# Poll the VM over SSH, up to 30 attempts 5 seconds apart.
+# Returns 0 as soon as SSH works, 1 if it never does.
+wait_for_vm() {
+	for i in $(seq 1 30); do
+		if ssh_vm true; then
+			return 0
+		fi
+		info "waiting for VM to start"
+		sleep 5
+	done
+	return 1
+}
+
+# Entry point: prepare key, config ISO and disk image, boot the VM (retrying
+# up to 5 times with a fresh image) and run the guest-side test script.
+main() {
+	trap cleanup EXIT
+
+	config_iso_file="${data_dir}/config.iso"
+	fedora_img="${data_dir}/image.img"
+
+	mkdir -p "${data_dir}"
+
+	create_ssh_key
+
+	create_config_iso "${config_iso_file}"
+
+	for i in $(seq 1 5); do
+		pull_fedora_cloud_image "${fedora_img}"
+		run_vm "${fedora_img}" "${config_iso_file}"
+		if wait_for_vm; then
+			break
+		fi
+		info "Couldn't connect to the VM. Stopping VM and starting a new one."
+		kill_vms
+	done
+
+	ssh_vm "/home/${USER}/run.sh"
+}
+
+main "$@"
diff --git a/tools/packaging/kernel/configs/fragments/whitelist.conf b/tools/packaging/kernel/configs/fragments/whitelist.conf
index 25198cebdc..e6b2711511 100644
--- a/tools/packaging/kernel/configs/fragments/whitelist.conf
+++ b/tools/packaging/kernel/configs/fragments/whitelist.conf
@@ -22,3 +22,4 @@ CONFIG_ARM64_UAO
 CONFIG_VFIO_MDEV_DEVICE
 CONFIG_SPECULATION_MITIGATIONS
 CONFIG_X86_SGX
+CONFIG_VIRTIO_IOMMU
diff --git a/tools/packaging/kernel/configs/fragments/x86_64/vfio.conf b/tools/packaging/kernel/configs/fragments/x86_64/vfio.conf
index 09a4bf02f1..e052e10e66 100644
--- a/tools/packaging/kernel/configs/fragments/x86_64/vfio.conf
+++ b/tools/packaging/kernel/configs/fragments/x86_64/vfio.conf
@@ -1,3 +1,4 @@
 # x86 specific items we need in order to handle vfio_mode=vfio devices
 CONFIG_INTEL_IOMMU=y
 CONFIG_IRQ_REMAP=y
+CONFIG_VIRTIO_IOMMU=y
diff --git a/tools/packaging/kernel/kata_config_version b/tools/packaging/kernel/kata_config_version
index dee79f1094..ee977b5ecd 100644
--- a/tools/packaging/kernel/kata_config_version
+++ b/tools/packaging/kernel/kata_config_version
@@ -1 +1 @@
-114
+115