diff --git a/hypervisor/Makefile b/hypervisor/Makefile
index 068e20c22..dcf837943 100644
--- a/hypervisor/Makefile
+++ b/hypervisor/Makefile
@@ -301,6 +301,7 @@ VP_DM_C_SRCS += dm/vpci/vpci_bridge.c
 VP_DM_C_SRCS += dm/vpci/pci_pt.c
 VP_DM_C_SRCS += dm/vpci/vmsi.c
 VP_DM_C_SRCS += dm/vpci/vmsix.c
+VP_DM_C_SRCS += dm/vpci/vmsix_on_msi.c
 VP_DM_C_SRCS += dm/vpci/vsriov.c
 VP_DM_C_SRCS += arch/x86/guest/vlapic.c
 VP_DM_C_SRCS += arch/x86/guest/pm.c
diff --git a/hypervisor/dm/vpci/pci_pt.c b/hypervisor/dm/vpci/pci_pt.c
index 429e2e21d..541a7060f 100644
--- a/hypervisor/dm/vpci/pci_pt.c
+++ b/hypervisor/dm/vpci/pci_pt.c
@@ -374,6 +374,7 @@ void init_vdev_pt(struct pci_vdev *vdev, bool is_pf_vdev)
 	/* Initialize the vdev BARs except SRIOV VF, VF BARs are initialized directly from create_vf function */
 	if (vdev->phyfun == NULL) {
 		init_bars(vdev, is_pf_vdev);
+		init_vmsix_on_msi(vdev);
 		if (is_prelaunched_vm(vpci2vm(vdev->vpci)) && (!is_pf_vdev)) {
 			pci_command = (uint16_t)pci_pdev_read_cfg(vdev->pdev->bdf, PCIR_COMMAND, 2U);
 
diff --git a/hypervisor/dm/vpci/vmsix.c b/hypervisor/dm/vpci/vmsix.c
index 588c56328..a90fb84e6 100644
--- a/hypervisor/dm/vpci/vmsix.c
+++ b/hypervisor/dm/vpci/vmsix.c
@@ -174,7 +174,11 @@ static void rw_vmsix_table(struct pci_vdev *vdev, struct mmio_request *mmio, uin
 				/* Write to pci_vdev */
 				(void)memcpy_s((void *)entry + entry_offset, (size_t)mmio->size,
 						&mmio->value, (size_t)mmio->size);
-				remap_one_vmsix_entry(vdev, index);
+				if (vdev->msix.is_vmsix_on_msi) {
+					remap_one_vmsix_entry_on_msi(vdev, index);
+				} else {
+					remap_one_vmsix_entry(vdev, index);
+				}
 			} else {
 				pr_err("%s, Only DWORD and QWORD are permitted", __func__);
 			}
@@ -205,6 +209,15 @@ int32_t vmsix_handle_table_mmio_access(struct io_request *io_req, void *handler_
 
 		if (msixtable_access(vdev, (uint32_t)offset)) {
 			rw_vmsix_table(vdev, mmio, (uint32_t)offset);
+		} else if (vdev->msix.is_vmsix_on_msi) {
+			/* According to PCI spec, PBA is read-only.
+			 * Don't emulate PBA according to the device status, just return 0.
+			 */
+			if (mmio->direction == REQUEST_READ) {
+				mmio->value = 0UL;
+			} else {
+				ret = -EINVAL;
+			}
 		} else {
 			hva = hpa2hva(vdev->msix.mmio_hpa + offset);
 
diff --git a/hypervisor/dm/vpci/vmsix_on_msi.c b/hypervisor/dm/vpci/vmsix_on_msi.c
new file mode 100644
index 000000000..5fa14a813
--- /dev/null
+++ b/hypervisor/dm/vpci/vmsix_on_msi.c
@@ -0,0 +1,241 @@
+/*
+ * Copyright (C) 2020 Intel Corporation. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "vpci_priv.h"
+
+#define PER_VECTOR_MASK_CAP 0x0100U
+
+/* Pre-assumptions for vMSI-x on MSI emulation:
+ * 1. The device is in vmsix_on_msi_devs array.
+ * 2. The device should support MSI capability as well as per-vector mask
+ * 3. The device doesn't support MSI-x capability.
+ * 4. The device should have an unused BAR (this condition is checked inside init_vmsix_on_msi).
+ * 5. HV doesn't emulate PBA according to physical device status, the device driver should not rely on PBA
+ *    for functionality.
+ *
+ * When conditions 1-3 hold, *vector_count receives the vector count decoded from the MSI Multiple Message
+ * Capable field; true is returned only if that count is above 1 and per-vector masking is supported.
+ */
+static bool need_vmsix_on_msi_emulation(__unused struct pci_pdev *pdev, __unused uint16_t *vector_count)
+{
+	bool ret = false;
+#if (MAX_VMSIX_ON_MSI_PDEVS_NUM > 0)
+	uint16_t msgctrl;
+	uint32_t i;
+
+	for (i = 0U; i < MAX_VMSIX_ON_MSI_PDEVS_NUM; i++) {
+		if (pdev->bdf.value == vmsix_on_msi_devs[i].bdf.value) {
+			if ((pdev->msi_capoff != 0U) && (pdev->msix.capoff == 0U)) {
+				msgctrl = (uint16_t)pci_pdev_read_cfg(pdev->bdf, pdev->msi_capoff + PCIR_MSI_CTRL, 2U);
+				*vector_count = 1U << ((msgctrl & PCIM_MSICTRL_MMC_MASK) >> 1U);
+				if ((*vector_count > 1U) && ((msgctrl & PER_VECTOR_MASK_CAP) != 0U)) {
+					ret = true;
+				}
+			}
+			break;
+		}
+	}
+#endif
+
+	return ret;
+}
+
+/**
+ * Reserve interrupt remapping table entries (IRTEs) for a device that needs vMSI-x on MSI emulation.
+ *
+ * pdev->irte_count is updated only when the reservation succeeds and yields a valid pdev->irte_start.
+ *
+ * @pre pdev != NULL
+ */
+void reserve_vmsix_on_msi_irtes(struct pci_pdev *pdev)
+{
+	struct intr_source intr_src;
+	uint16_t count = 0U;
+	int32_t ret;
+
+	if (need_vmsix_on_msi_emulation(pdev, &count)) {
+		intr_src.is_msi = true;
+		intr_src.src.msi.value = pdev->bdf.value;
+		ret = dmar_reserve_irte(&intr_src, count, &pdev->irte_start);
+		if ((ret == 0) && (pdev->irte_start != INVALID_IRTE_ID)) {
+			pdev->irte_count = count;
+		}
+	}
+}
+
+/* Offset of the physical MSI Mask Bits register, which depends on whether the MSI capability is 64-bit */
+static inline uint32_t get_mask_bits_offset(const struct pci_vdev *vdev)
+{
+	return vdev->msi.is_64bit ? (vdev->msix.capoff + 0x10U) : (vdev->msix.capoff + 0xCU);
+}
+
+/**
+ * Present the physical MSI capability to the guest as a virtual MSI-X capability.
+ *
+ * Nothing is changed unless more than one IRTE was reserved for the device and an unused BAR is found;
+ * that BAR then hosts the virtual MSI-X table (offset 0) and PBA (offset 2048).
+ *
+ * @pre vdev != NULL
+ * @pre vdev->pdev != NULL
+ */
+void init_vmsix_on_msi(struct pci_vdev *vdev)
+{
+	struct pci_pdev *pdev = vdev->pdev;
+	uint32_t i;
+
+	/* irte_count > 1 only when the device needs vMSI-x on MSI emulation and IRTEs are reserved successfully */
+	if (pdev->irte_count > 1U) {
+		/* find an unused BAR */
+		for (i = 0U; i < vdev->nr_bars; i++) {
+			if (vdev->vbars[i].base_hpa == 0UL) {
+				break;
+			}
+			if (vdev->vbars[i].type == PCIBAR_MEM64) {
+				i++;
+			}
+		}
+		if (i < vdev->nr_bars) {
+			vdev->msix.capoff = pdev->msi_capoff;
+			vdev->msi.capoff = 0U;
+			vdev->msix.is_vmsix_on_msi = true;
+			/* For a device support MSI with per-vector mask, the length of MSI cap is at least 20 bytes */
+			vdev->msix.caplen = MSIX_CAPLEN;
+			vdev->msix.table_bar = i;
+			vdev->msix.table_offset = 0U;
+			vdev->msix.table_count = pdev->irte_count;
+
+			/* capability ID */
+			pci_vdev_write_vcfg(vdev, vdev->msix.capoff, 1U, 0x11U);
+			/* message control, MSI-X Disabled, Function unmasked */
+			pci_vdev_write_vcfg(vdev, vdev->msix.capoff + 2U, 2U, pdev->irte_count - 1U);
+			/* Init MSIX table vBAR, offset is 0 */
+			pci_vdev_write_vcfg(vdev, vdev->msix.capoff + 4U, 4U, i);
+			/* Init PBA table vBAR, offset is 2048 */
+			pci_vdev_write_vcfg(vdev, vdev->msix.capoff + 8U, 4U, 2048U + i);
+
+			vdev->vbars[i].type = PCIBAR_MEM32;
+			vdev->vbars[i].size = 4096U;
+			vdev->vbars[i].base_hpa = 0x0UL;
+			vdev->vbars[i].mask = 0xFFFFF000U & PCI_BASE_ADDRESS_MEM_MASK;
+			/* fixed for memory, 32bit, non-prefetchable */
+			vdev->vbars[i].fixed = 0U;
+
+			/* About MSI-x bar GPA:
+			 * - For Service VM: when first time init, it is programmed as 0, then OS will program
+			 *   the value later and the value is stored in vdev->vbars[MSI-X_BAR_ID].base_gpa.
+			 *   When the device is assigned to UOS and then assigned back to SOS, the stored base
+			 *   GPA will be used.
+			 * - For Post-launched VM: The GPA is assigned by device model.
+			 * - For Pre-launched VM: Not supported yet.
+			 */
+			vdev->msix.mmio_gpa = vdev->vbars[i].base_gpa;
+			vdev_pt_write_vbar(vdev, i, (uint32_t)(vdev->vbars[i].base_gpa & 0xFFFFFFFFUL));
+		}
+	}
+}
+
+/**
+ * Handle a guest write to the virtual MSI-X capability and mirror MSI-X Enable / Function Mask changes to
+ * the physical MSI capability.
+ *
+ * The physical MSI Message Control register is addressed by its own offset rather than by 'offset': the
+ * guest write that changes the control bits may start at another offset within the capability.
+ *
+ * @pre vdev != NULL
+ * @pre vdev->pdev != NULL
+ */
+void write_vmsix_cap_reg_on_msi(struct pci_vdev *vdev, uint32_t offset, uint32_t bytes, uint32_t val)
+{
+	uint16_t old_msgctrl, msgctrl;
+	uint16_t msi_msgctrl;
+
+	old_msgctrl = (uint16_t)pci_vdev_read_vcfg(vdev, vdev->msix.capoff + PCIR_MSIX_CTRL, 2U);
+	/* Write to vdev */
+	pci_vdev_write_vcfg(vdev, offset, bytes, val);
+	msgctrl = (uint16_t)pci_vdev_read_vcfg(vdev, vdev->msix.capoff + PCIR_MSIX_CTRL, 2U);
+
+	if (((old_msgctrl ^ msgctrl) & (PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK)) != 0U) {
+		msi_msgctrl = (uint16_t)pci_pdev_read_cfg(vdev->pdev->bdf, vdev->msix.capoff + PCIR_MSI_CTRL, 2U);
+
+		msi_msgctrl = msi_msgctrl & (~PCIM_MSICTRL_MME_MASK);
+		msi_msgctrl &= ~PCIM_MSICTRL_MSI_ENABLE;
+
+		/* If MSI Enable is being set, make sure INTxDIS bit is set */
+		if ((msgctrl & PCIM_MSIXCTRL_MSIX_ENABLE) != 0U) {
+			enable_disable_pci_intx(vdev->pdev->bdf, false);
+			msi_msgctrl |= (msi_msgctrl & PCIM_MSICTRL_MMC_MASK) << 3U;
+			msi_msgctrl |= PCIM_MSICTRL_MSI_ENABLE;
+		}
+		pci_pdev_write_cfg(vdev->pdev->bdf, vdev->msix.capoff + PCIR_MSI_CTRL, 2U, msi_msgctrl);
+
+		if ((msgctrl & PCIM_MSIXCTRL_FUNCTION_MASK) != 0U) {
+			pci_pdev_write_cfg(vdev->pdev->bdf, get_mask_bits_offset(vdev), 4U, 0xFFFFFFFFU);
+		}
+	}
+}
+
+/**
+ * Re-program the interrupt remapping for one virtual MSI-X table entry.
+ *
+ * The matching physical MSI vector is masked while the entry is processed and is unmasked again only
+ * when the virtual entry is unmasked and the remapping succeeds.
+ *
+ * @pre vdev != NULL
+ * @pre vdev->pdev != NULL
+ * @pre index < 32U, as it is used as a bit position in the 32-bit MSI Mask Bits register
+ */
+void remap_one_vmsix_entry_on_msi(struct pci_vdev *vdev, uint32_t index)
+{
+	const struct msix_table_entry *ventry;
+	uint32_t mask_bits;
+	uint32_t vector_mask = 1U << index;
+	struct msi_info info = {};
+	union pci_bdf pbdf = vdev->pdev->bdf;
+	union irte_index ir_index;
+	int32_t ret = 0;
+	uint32_t capoff = vdev->msix.capoff;
+
+	mask_bits = pci_pdev_read_cfg(pbdf, get_mask_bits_offset(vdev), 4U);
+	mask_bits |= vector_mask;
+	pci_pdev_write_cfg(pbdf, get_mask_bits_offset(vdev), 4U, mask_bits);
+
+	ventry = &vdev->msix.table_entries[index];
+	if ((ventry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0U) {
+		info.addr.full = vdev->msix.table_entries[index].addr;
+		info.data.full = vdev->msix.table_entries[index].data;
+
+		ret = ptirq_prepare_msix_remap(vpci2vm(vdev->vpci), vdev->bdf.value, pbdf.value,
+			(uint16_t)index, &info, vdev->pdev->irte_start + (uint16_t)index);
+		if (ret == 0) {
+			/* Program the physical MSI address/data only once, using the first reserved IRTE */
+			if (!vdev->msix.is_vmsix_on_msi_programmed) {
+				ir_index.index = vdev->pdev->irte_start;
+				info.addr.ir_bits.shv = 1U;
+				info.addr.ir_bits.intr_index_high = ir_index.bits.index_high;
+				info.addr.ir_bits.intr_index_low = ir_index.bits.index_low;
+				pci_pdev_write_cfg(pbdf, capoff + PCIR_MSI_ADDR, 0x4U, (uint32_t)info.addr.full);
+				if (vdev->msi.is_64bit) {
+					pci_pdev_write_cfg(pbdf, capoff + PCIR_MSI_ADDR_HIGH, 0x4U,
+						(uint32_t)(info.addr.full >> 32U));
+					pci_pdev_write_cfg(pbdf, capoff + PCIR_MSI_DATA_64BIT, 0x2U,
+						(uint16_t)info.data.full);
+				} else {
+					pci_pdev_write_cfg(pbdf, capoff + PCIR_MSI_DATA, 0x2U,
+						(uint16_t)info.data.full);
+				}
+				vdev->msix.is_vmsix_on_msi_programmed = true;
+			}
+			mask_bits &= ~vector_mask;
+		}
+	}
+	pci_pdev_write_cfg(pbdf, get_mask_bits_offset(vdev), 4U, mask_bits);
+}
diff --git a/hypervisor/dm/vpci/vpci.c b/hypervisor/dm/vpci/vpci.c
index 7cbf1172b..bc5c4449a 100644
--- a/hypervisor/dm/vpci/vpci.c
+++ b/hypervisor/dm/vpci/vpci.c
@@ -500,7 +500,11 @@ static int32_t write_pt_dev_cfg(struct pci_vdev *vdev, uint32_t offset,
 	} else if (msicap_access(vdev, offset)) {
 		write_vmsi_cap_reg(vdev, offset, bytes, val);
 	} else if (msixcap_access(vdev, offset)) {
-		write_vmsix_cap_reg(vdev, offset, bytes, val);
+		if (vdev->msix.is_vmsix_on_msi) {
+			write_vmsix_cap_reg_on_msi(vdev, offset, bytes, val);
+		} else {
+			write_vmsix_cap_reg(vdev, offset, bytes, val);
+		}
 	} else if (sriovcap_access(vdev, offset)) {
 		write_sriov_cap_reg(vdev, offset, bytes, val);
 	} else {
diff --git a/hypervisor/dm/vpci/vpci_priv.h b/hypervisor/dm/vpci/vpci_priv.h
index 749678c74..769926a8a 100644
--- a/hypervisor/dm/vpci/vpci_priv.h
+++ b/hypervisor/dm/vpci/vpci_priv.h
@@ -128,6 +128,10 @@ void read_vmsix_cap_reg(const struct pci_vdev *vdev, uint32_t offset, uint32_t b
 void write_vmsix_cap_reg(struct pci_vdev *vdev, uint32_t offset, uint32_t bytes, uint32_t val);
 void deinit_vmsix(struct pci_vdev *vdev);
 
+void init_vmsix_on_msi(struct pci_vdev *vdev);
+void write_vmsix_cap_reg_on_msi(struct pci_vdev *vdev, uint32_t offset, uint32_t bytes, uint32_t val);
+void remap_one_vmsix_entry_on_msi(struct pci_vdev *vdev, uint32_t index);
+
 void init_vsriov(struct pci_vdev *vdev);
 void read_sriov_cap_reg(const struct pci_vdev *vdev, uint32_t offset, uint32_t bytes, uint32_t *val);
 void write_sriov_cap_reg(struct pci_vdev *vdev, uint32_t offset, uint32_t bytes, uint32_t val);
diff --git a/hypervisor/hw/pci.c b/hypervisor/hw/pci.c
index 76dcdd9b4..4e5a2a09a 100644
--- a/hypervisor/hw/pci.c
+++ b/hypervisor/hw/pci.c
@@ -767,6 +767,7 @@ struct pci_pdev *init_pdev(uint16_t pbdf, uint32_t drhd_index)
 
 			pdev->drhd_index = drhd_index;
 			num_pci_pdev++;
+			reserve_vmsix_on_msi_irtes(pdev);
 		} else {
 			pr_err("%s, %x:%x.%x unsupported headed type: 0x%x\n",
 				__func__, bdf.bits.b, bdf.bits.d, bdf.bits.f, hdr_type);
diff --git a/hypervisor/include/dm/vpci.h b/hypervisor/include/dm/vpci.h
index cd3679af2..e0e2f7170 100644
--- a/hypervisor/include/dm/vpci.h
+++ b/hypervisor/include/dm/vpci.h
@@ -67,6 +67,8 @@ struct pci_msix {
 	uint32_t  table_bar;
 	uint32_t  table_offset;
 	uint32_t  table_count;
+	bool is_vmsix_on_msi;
+	bool is_vmsix_on_msi_programmed;
 };
 
 /* SRIOV capability structure */
diff --git a/hypervisor/include/hw/pci.h b/hypervisor/include/hw/pci.h
index 997dc3c81..d18ca728e 100644
--- a/hypervisor/include/hw/pci.h
+++ b/hypervisor/include/hw/pci.h
@@ -228,6 +228,9 @@ struct pci_pdev {
 
 	/* IOMMU responsible for DMA and Interrupt Remapping for this device */
 	uint32_t drhd_index;
+	/* Used for vMSI-x on MSI emulation */
+	uint16_t irte_start;
+	uint16_t irte_count;
 
 	/* The bar info of the physical PCI device. */
 	uint32_t nr_bars; /* 6 for normal device, 2 for bridge, 1 for cardbus */
@@ -359,4 +362,5 @@ bool is_plat_hidden_pdev(union pci_bdf bdf);
 bool pdev_need_bar_restore(const struct pci_pdev *pdev);
 void pdev_restore_bar(const struct pci_pdev *pdev);
 void pci_switch_to_mmio_cfg_ops(void);
+void reserve_vmsix_on_msi_irtes(struct pci_pdev *pdev);
 #endif /* PCI_H_ */