hv: vmsi: add vmsix on msi emulation support

Some passthrough devices require multiple MSI vectors, but don't
support MSI-X. Meanwhile, the Linux kernel doesn't support contiguous
vector allocation.
On a native platform, this issue can be mitigated by the IOMMU via
interrupt remapping. However, ACRN has no vIOMMU, so vMSI-X on MSI
emulation is one way to mitigate the problem on ACRN.

This patch adds MSI-X emulation on top of the MSI capability.
For a device that needs MSI-X emulation, the HV hides the MSI
capability and presents an MSI-X capability to the guest.
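For example, for a hypothetical device whose physical MSI capability
sits at config offset 0xd0, the guest would instead enumerate an MSI-X
capability there (an illustrative layout; the values follow what
init_vmsix_on_msi programs below, and the 0xd0 offset is made up):

        0xd0: cap ID 0x11 (MSI-X); message control: table size = MSI vector count - 1
        0xd4: MSI-X table: offset 0, BIR = the reused (previously unused) BAR
        0xd8: PBA: offset 2048, BIR = the same BAR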

The guest driver may need to be modified to request MSI-X vectors.
For example:
        ret = pci_alloc_irq_vectors(pdev, 1, STMMAC_MSI_VEC_MAX,
-                                   PCI_IRQ_MSI);
+                                   PCI_IRQ_MSI | PCI_IRQ_MSIX);

To enable MSI-X emulation, the device must:
1. Be listed in the vmsix_on_msi_devs array (see the sketch after this list).
2. Support MSI, but not MSI-X.
3. Support per-vector masking in its MSI capability.
4. Have an unused BAR.
5. Have a driver that does not rely on the PBA for functionality.
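For illustration, listing a device is a board-configuration change; a
minimal sketch, assuming only the bdf field that
need_vmsix_on_msi_emulation actually reads (the struct name and the
00:1e.4 BDF below are hypothetical):

        #define MAX_VMSIX_ON_MSI_PDEVS_NUM 1U
        /* hypothetical element type; only the bdf field is assumed */
        struct vmsix_on_msi_dev {
                union pci_bdf bdf;
        };
        const struct vmsix_on_msi_dev vmsix_on_msi_devs[MAX_VMSIX_ON_MSI_PDEVS_NUM] = {
                {.bdf.bits = {.b = 0x00U, .d = 0x1eU, .f = 0x4U}},
        };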

Tracked-On: #4831
Signed-off-by: Binbin Wu <binbin.wu@intel.com>
Acked-by: Eddie Dong <eddie.dong@intel.com>
Binbin Wu 2020-05-29 23:37:54 +08:00 committed by wenlingz
parent da1788c9a3
commit 6be27cdcab
9 changed files with 236 additions and 2 deletions

@@ -301,6 +301,7 @@ VP_DM_C_SRCS += dm/vpci/vpci_bridge.c
VP_DM_C_SRCS += dm/vpci/pci_pt.c
VP_DM_C_SRCS += dm/vpci/vmsi.c
VP_DM_C_SRCS += dm/vpci/vmsix.c
VP_DM_C_SRCS += dm/vpci/vmsix_on_msi.c
VP_DM_C_SRCS += dm/vpci/vsriov.c
VP_DM_C_SRCS += arch/x86/guest/vlapic.c
VP_DM_C_SRCS += arch/x86/guest/pm.c

@@ -374,6 +374,7 @@ void init_vdev_pt(struct pci_vdev *vdev, bool is_pf_vdev)
/* Initialize the vdev BARs except SRIOV VF, VF BARs are initialized directly from create_vf function */
if (vdev->phyfun == NULL) {
init_bars(vdev, is_pf_vdev);
init_vmsix_on_msi(vdev);
if (is_prelaunched_vm(vpci2vm(vdev->vpci)) && (!is_pf_vdev)) {
pci_command = (uint16_t)pci_pdev_read_cfg(vdev->pdev->bdf, PCIR_COMMAND, 2U);

@@ -174,7 +174,11 @@ static void rw_vmsix_table(struct pci_vdev *vdev, struct mmio_request *mmio, uin
/* Write to pci_vdev */
(void)memcpy_s((void *)entry + entry_offset, (size_t)mmio->size,
&mmio->value, (size_t)mmio->size);
-		remap_one_vmsix_entry(vdev, index);
+		if (vdev->msix.is_vmsix_on_msi) {
+			remap_one_vmsix_entry_on_msi(vdev, index);
+		} else {
+			remap_one_vmsix_entry(vdev, index);
+		}
} else {
pr_err("%s, Only DWORD and QWORD are permitted", __func__);
}
@@ -205,6 +209,15 @@ int32_t vmsix_handle_table_mmio_access(struct io_request *io_req, void *handler_
if (msixtable_access(vdev, (uint32_t)offset)) {
rw_vmsix_table(vdev, mmio, (uint32_t)offset);
} else if (vdev->msix.is_vmsix_on_msi) {
/* According to the PCI spec, the PBA is read-only.
 * Don't emulate the PBA from the physical device status; reads simply return 0.
 */
if (mmio->direction == REQUEST_READ) {
mmio->value = 0UL;
} else {
ret = -EINVAL;
}
} else {
hva = hpa2hva(vdev->msix.mmio_hpa + offset);

@@ -0,0 +1,204 @@
/*
* Copyright (C) 2020 Intel Corporation. All rights reserved.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#include <vm.h>
#include <ptdev.h>
#include <assign.h>
#include <vpci.h>
#include <vtd.h>
#include <board.h>
#include "vpci_priv.h"
#define PER_VECTOR_MASK_CAP 0x0100U
/* Pre-assumptions for vMSI-X on MSI emulation:
 * 1. The device is in the vmsix_on_msi_devs array.
 * 2. The device supports the MSI capability, including per-vector masking.
 * 3. The device doesn't support the MSI-X capability.
 * 4. The device has an unused BAR (this condition is checked inside init_vmsix_on_msi).
 * 5. HV doesn't emulate the PBA from the physical device status; the device driver should not rely on the PBA
 *    for functionality.
 */
static bool need_vmsix_on_msi_emulation(__unused struct pci_pdev *pdev, __unused uint16_t *vector_count)
{
bool ret = false;
#if (MAX_VMSIX_ON_MSI_PDEVS_NUM > 0)
uint16_t msgctrl;
uint32_t i;
for (i = 0U; i < MAX_VMSIX_ON_MSI_PDEVS_NUM; i++) {
if (pdev->bdf.value == vmsix_on_msi_devs[i].bdf.value) {
if ((pdev->msi_capoff != 0U) && (pdev->msix.capoff == 0U)) {
msgctrl = (uint16_t)pci_pdev_read_cfg(pdev->bdf, pdev->msi_capoff + PCIR_MSI_CTRL, 2U);
*vector_count = 1U << ((msgctrl & PCIM_MSICTRL_MMC_MASK) >> 1U);
if ((*vector_count > 1U) && ((msgctrl & PER_VECTOR_MASK_CAP) != 0U)) {
ret = true;
}
}
break;
}
}
#endif
return ret;
}
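/* Reserve a contiguous block of IRTEs, one per supported MSI vector, so that
 * each emulated MSI-X vector i can later be remapped through irte_start + i.
 */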
void reserve_vmsix_on_msi_irtes(struct pci_pdev *pdev)
{
struct intr_source intr_src;
uint16_t count = 0;
int32_t ret;
if (need_vmsix_on_msi_emulation(pdev, &count)) {
intr_src.is_msi = true;
intr_src.src.msi.value = pdev->bdf.value;
ret = dmar_reserve_irte(&intr_src, count, &pdev->irte_start);
if ((ret == 0) && (pdev->irte_start != INVALID_IRTE_ID)) {
pdev->irte_count = count;
}
}
}
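/* The MSI per-vector Mask Bits register is at capoff + 0x10 for a 64-bit
 * address capability and at capoff + 0xC for a 32-bit one.
 */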
static inline uint32_t get_mask_bits_offset(const struct pci_vdev *vdev)
{
return vdev->msi.is_64bit ? (vdev->msix.capoff + 0x10U) : (vdev->msix.capoff + 0xCU);
}
/**
* @pre vdev != NULL
* @pre vdev->pdev != NULL
*/
void init_vmsix_on_msi(struct pci_vdev *vdev)
{
struct pci_pdev *pdev = vdev->pdev;
uint32_t i;
/* irte_count > 1 only when the device needs vMSI-X on MSI emulation and IRTEs were reserved successfully */
if (pdev->irte_count > 1U) {
/* find an unused BAR */
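/* a BAR is unused if its physical base is 0; the extra increment below
 * skips the upper half of a 64-bit BAR
 */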
for (i = 0U; i < vdev->nr_bars; i++) {
if (vdev->vbars[i].base_hpa == 0UL) {
break;
}
if (vdev->vbars[i].type == PCIBAR_MEM64) {
i++;
}
}
if (i < vdev->nr_bars) {
vdev->msix.capoff = pdev->msi_capoff;
vdev->msi.capoff = 0U;
vdev->msix.is_vmsix_on_msi = true;
/* For a device supporting MSI with per-vector masking, the MSI capability is at least 20 bytes long */
vdev->msix.caplen = MSIX_CAPLEN;
vdev->msix.table_bar = i;
vdev->msix.table_offset = 0U;
vdev->msix.table_count = pdev->irte_count;
/* capability ID */
pci_vdev_write_vcfg(vdev, vdev->msix.capoff, 1U, 0x11U);
/* message control: MSI-X disabled, function unmasked */
pci_vdev_write_vcfg(vdev, vdev->msix.capoff + 2U, 2U, pdev->irte_count - 1U);
/* Init MSIX table vBAR, offset is 0 */
pci_vdev_write_vcfg(vdev, vdev->msix.capoff + 4U, 4U, i);
/* Init PBA table vBAR, offset is 2048 */
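/* bits 2:0 of the PBA register encode the BIR, the upper bits the PBA offset;
 * 2048U + i equals (2048U | i) because i < 6
 */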
pci_vdev_write_vcfg(vdev, vdev->msix.capoff + 8U, 4U, 2048U + i);
vdev->vbars[i].type = PCIBAR_MEM32;
vdev->vbars[i].size = 4096U;
vdev->vbars[i].base_hpa = 0x0UL;
vdev->vbars[i].mask = 0xFFFFF000U & PCI_BASE_ADDRESS_MEM_MASK;
/* fixed for memory, 32bit, non-prefetchable */
vdev->vbars[i].fixed = 0U;
/* About the MSI-X BAR GPA:
 * - For Service VM: on first-time init it is programmed as 0; the OS will program
 *   the value later, and the value is stored in vdev->vbars[MSI-X_BAR_ID].base_gpa.
 *   When the device is assigned to a UOS and then assigned back to the SOS, the stored base
 *   GPA will be used.
 * - For post-launched VM: the GPA is assigned by the device model.
 * - For pre-launched VM: not supported yet.
 */
vdev->msix.mmio_gpa = vdev->vbars[i].base_gpa;
vdev_pt_write_vbar(vdev, i, (uint32_t)(vdev->vbars[i].base_gpa & 0xFFFFFFFFUL));
}
}
}
void write_vmsix_cap_reg_on_msi(struct pci_vdev *vdev, uint32_t offset, uint32_t bytes, uint32_t val)
{
uint16_t old_msgctrl, msgctrl;
uint16_t msi_msgctrl;
old_msgctrl = (uint16_t)pci_vdev_read_vcfg(vdev, vdev->msix.capoff + PCIR_MSIX_CTRL, 2U);
/* Write to vdev */
pci_vdev_write_vcfg(vdev, offset, bytes, val);
msgctrl = (uint16_t)pci_vdev_read_vcfg(vdev, vdev->msix.capoff + PCIR_MSIX_CTRL, 2U);
if (((old_msgctrl ^ msgctrl) & (PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK)) != 0U) {
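/* the guest toggled MSI-X Enable and/or Function Mask: mirror the
 * change into the physical MSI capability below
 */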
msi_msgctrl = (uint16_t)pci_pdev_read_cfg(vdev->pdev->bdf, offset, 2U);
msi_msgctrl = msi_msgctrl & (~PCIM_MSICTRL_MME_MASK);
msi_msgctrl &= ~PCIM_MSICTRL_MSI_ENABLE;
/* If MSI Enable is being set, make sure INTxDIS bit is set */
if ((msgctrl & PCIM_MSIXCTRL_MSIX_ENABLE) != 0U) {
enable_disable_pci_intx(vdev->pdev->bdf, false);
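/* copy MMC (bits 3:1) into MME (bits 6:4) so the function enables
 * all the vectors it supports
 */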
msi_msgctrl |= (msi_msgctrl & PCIM_MSICTRL_MMC_MASK) << 3U;
msi_msgctrl |= PCIM_MSICTRL_MSI_ENABLE;
}
pci_pdev_write_cfg(vdev->pdev->bdf, offset, 2U, msi_msgctrl);
if ((msgctrl & PCIM_MSIXCTRL_FUNCTION_MASK) != 0U) {
pci_pdev_write_cfg(vdev->pdev->bdf, get_mask_bits_offset(vdev), 4U, 0xFFFFFFFFU);
}
}
}
void remap_one_vmsix_entry_on_msi(struct pci_vdev *vdev, uint32_t index)
{
const struct msix_table_entry *ventry;
uint32_t mask_bits;
uint32_t vector_mask = 1U << index;
struct msi_info info = {};
union pci_bdf pbdf = vdev->pdev->bdf;
union irte_index ir_index;
int32_t ret = 0;
uint32_t capoff = vdev->msix.capoff;
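/* mask this vector at the physical MSI level while its routing is updated;
 * it is unmasked again below only if the remap succeeds and the guest has
 * not masked the virtual entry
 */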
mask_bits = pci_pdev_read_cfg(pbdf, get_mask_bits_offset(vdev), 4U);
mask_bits |= vector_mask;
pci_pdev_write_cfg(pbdf, get_mask_bits_offset(vdev), 4U, mask_bits);
ventry = &vdev->msix.table_entries[index];
if ((ventry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0U) {
info.addr.full = vdev->msix.table_entries[index].addr;
info.data.full = vdev->msix.table_entries[index].data;
ret = ptirq_prepare_msix_remap(vpci2vm(vdev->vpci), vdev->bdf.value, pbdf.value,
(uint16_t)index, &info, vdev->pdev->irte_start + (uint16_t)index);
if (ret == 0) {
if (!vdev->msix.is_vmsix_on_msi_programmed) {
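/* one-time programming of the physical MSI address/data in VT-d
 * remappable format: SHV = 1 selects subhandle addressing, so the
 * vector number carried in the MSI data is added to the base IRTE
 * index and vector i uses irte_start + i
 */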
ir_index.index = vdev->pdev->irte_start;
info.addr.ir_bits.shv = 1U;
info.addr.ir_bits.intr_index_high = ir_index.bits.index_high;
info.addr.ir_bits.intr_index_low = ir_index.bits.index_low;
pci_pdev_write_cfg(pbdf, capoff + PCIR_MSI_ADDR, 0x4U, (uint32_t)info.addr.full);
if (vdev->msi.is_64bit) {
pci_pdev_write_cfg(pbdf, capoff + PCIR_MSI_ADDR_HIGH, 0x4U,
(uint32_t)(info.addr.full >> 32U));
pci_pdev_write_cfg(pbdf, capoff + PCIR_MSI_DATA_64BIT, 0x2U,
(uint16_t)info.data.full);
} else {
pci_pdev_write_cfg(pbdf, capoff + PCIR_MSI_DATA, 0x2U,
(uint16_t)info.data.full);
}
vdev->msix.is_vmsix_on_msi_programmed = true;
}
mask_bits &= ~vector_mask;
}
}
pci_pdev_write_cfg(pbdf, get_mask_bits_offset(vdev), 4U, mask_bits);
}

@@ -500,7 +500,11 @@ static int32_t write_pt_dev_cfg(struct pci_vdev *vdev, uint32_t offset,
} else if (msicap_access(vdev, offset)) {
write_vmsi_cap_reg(vdev, offset, bytes, val);
} else if (msixcap_access(vdev, offset)) {
-		write_vmsix_cap_reg(vdev, offset, bytes, val);
+		if (vdev->msix.is_vmsix_on_msi) {
+			write_vmsix_cap_reg_on_msi(vdev, offset, bytes, val);
+		} else {
+			write_vmsix_cap_reg(vdev, offset, bytes, val);
+		}
} else if (sriovcap_access(vdev, offset)) {
write_sriov_cap_reg(vdev, offset, bytes, val);
} else {

@@ -128,6 +128,10 @@ void read_vmsix_cap_reg(const struct pci_vdev *vdev, uint32_t offset, uint32_t b
void write_vmsix_cap_reg(struct pci_vdev *vdev, uint32_t offset, uint32_t bytes, uint32_t val);
void deinit_vmsix(struct pci_vdev *vdev);
void init_vmsix_on_msi(struct pci_vdev *vdev);
void write_vmsix_cap_reg_on_msi(struct pci_vdev *vdev, uint32_t offset, uint32_t bytes, uint32_t val);
void remap_one_vmsix_entry_on_msi(struct pci_vdev *vdev, uint32_t index);
void init_vsriov(struct pci_vdev *vdev);
void read_sriov_cap_reg(const struct pci_vdev *vdev, uint32_t offset, uint32_t bytes, uint32_t *val);
void write_sriov_cap_reg(struct pci_vdev *vdev, uint32_t offset, uint32_t bytes, uint32_t val);

@@ -767,6 +767,7 @@ struct pci_pdev *init_pdev(uint16_t pbdf, uint32_t drhd_index)
pdev->drhd_index = drhd_index;
num_pci_pdev++;
reserve_vmsix_on_msi_irtes(pdev);
} else {
pr_err("%s, %x:%x.%x unsupported headed type: 0x%x\n",
__func__, bdf.bits.b, bdf.bits.d, bdf.bits.f, hdr_type);

@@ -67,6 +67,8 @@ struct pci_msix {
uint32_t table_bar;
uint32_t table_offset;
uint32_t table_count;
bool is_vmsix_on_msi;
bool is_vmsix_on_msi_programmed;
};
/* SRIOV capability structure */

@@ -228,6 +228,9 @@ struct pci_pdev {
/* IOMMU responsible for DMA and Interrupt Remapping for this device */
uint32_t drhd_index;
/* Used for vMSI-x on MSI emulation */
uint16_t irte_start;
uint16_t irte_count;
/* The bar info of the physical PCI device. */
uint32_t nr_bars; /* 6 for normal device, 2 for bridge, 1 for cardbus */
@@ -359,4 +362,5 @@ bool is_plat_hidden_pdev(union pci_bdf bdf);
bool pdev_need_bar_restore(const struct pci_pdev *pdev);
void pdev_restore_bar(const struct pci_pdev *pdev);
void pci_switch_to_mmio_cfg_ops(void);
void reserve_vmsix_on_msi_irtes(struct pci_pdev *pdev);
#endif /* PCI_H_ */