/*
 * Copyright (C) 2018-2022 Intel Corporation.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#ifdef CONFIG_SECURITY_VM_FIXUP
#include
#endif
#include
#include

/* Local variables */

/* pre-assumption: TRUSTY_RAM_SIZE is 2M aligned */
static struct page post_user_vm_sworld_memory[MAX_POST_VM_NUM][TRUSTY_RAM_SIZE >> PAGE_SHIFT] __aligned(MEM_2M);

static struct acrn_vm vm_array[CONFIG_MAX_VM_NUM] __aligned(PAGE_SIZE);

static struct acrn_vm *service_vm_ptr = NULL;

void *get_sworld_memory_base(void)
{
	return post_user_vm_sworld_memory;
}

uint16_t get_unused_vmid(void)
{
	uint16_t vm_id;
	struct acrn_vm_config *vm_config;

	for (vm_id = 0; vm_id < CONFIG_MAX_VM_NUM; vm_id++) {
		vm_config = get_vm_config(vm_id);
		if ((vm_config->name[0] == '\0') && ((vm_config->guest_flags & GUEST_FLAG_STATIC_VM) == 0U)) {
			break;
		}
	}
	return (vm_id < CONFIG_MAX_VM_NUM) ? (vm_id) : (ACRN_INVALID_VMID);
}

uint16_t get_vmid_by_name(const char *name)
{
	uint16_t vm_id;

	for (vm_id = 0U; vm_id < CONFIG_MAX_VM_NUM; vm_id++) {
		if ((*name != '\0') && vm_has_matched_name(vm_id, name)) {
			break;
		}
	}
	return (vm_id < CONFIG_MAX_VM_NUM) ? (vm_id) : (ACRN_INVALID_VMID);
}

/**
 * @pre vm != NULL
 */
bool is_poweroff_vm(const struct acrn_vm *vm)
{
	return (vm->state == VM_POWERED_OFF);
}

/**
 * @pre vm != NULL
 */
bool is_created_vm(const struct acrn_vm *vm)
{
	return (vm->state == VM_CREATED);
}

/**
 * @pre vm != NULL
 */
bool is_paused_vm(const struct acrn_vm *vm)
{
	return (vm->state == VM_PAUSED);
}

bool is_service_vm(const struct acrn_vm *vm)
{
	return (vm != NULL) && (get_vm_config(vm->vm_id)->load_order == SERVICE_VM);
}

/**
 * @pre vm != NULL
 * @pre vm->vmid < CONFIG_MAX_VM_NUM
 */
bool is_postlaunched_vm(const struct acrn_vm *vm)
{
	return (get_vm_config(vm->vm_id)->load_order == POST_LAUNCHED_VM);
}

/**
 * @pre vm != NULL
 * @pre vm->vmid < CONFIG_MAX_VM_NUM
 */
bool is_prelaunched_vm(const struct acrn_vm *vm)
{
	struct acrn_vm_config *vm_config;

	vm_config = get_vm_config(vm->vm_id);
	return (vm_config->load_order == PRE_LAUNCHED_VM);
}

/**
 * @pre vm != NULL && vm_config != NULL && vm->vmid < CONFIG_MAX_VM_NUM
 */
bool is_lapic_pt_configured(const struct acrn_vm *vm)
{
	struct acrn_vm_config *vm_config = get_vm_config(vm->vm_id);

	return ((vm_config->guest_flags & GUEST_FLAG_LAPIC_PASSTHROUGH) != 0U);
}

/**
 * @pre vm != NULL && vm_config != NULL && vm->vmid < CONFIG_MAX_VM_NUM
 */
bool is_pmu_pt_configured(const struct acrn_vm *vm)
{
	struct acrn_vm_config *vm_config = get_vm_config(vm->vm_id);

	return ((vm_config->guest_flags & GUEST_FLAG_PMU_PASSTHROUGH) != 0U);
}

/**
 * @pre vm != NULL && vm_config != NULL && vm->vmid < CONFIG_MAX_VM_NUM
 */
bool is_rt_vm(const struct acrn_vm *vm)
{
	struct acrn_vm_config *vm_config = get_vm_config(vm->vm_id);

	return ((vm_config->guest_flags & GUEST_FLAG_RT) != 0U);
}

/**
 * @pre vm != NULL && vm_config != NULL && vm->vmid < CONFIG_MAX_VM_NUM
 *
 * A stateful VM is a VM that keeps its own state (such as an internal file cache)
 * and will experience state loss (file system corruption) if it is forcibly powered down.
 */
bool is_stateful_vm(const struct acrn_vm *vm)
{
	struct acrn_vm_config *vm_config = get_vm_config(vm->vm_id);

	/* A TEE VM does not have its own state. The TAs flush their content by
	 * themselves, so neither the HV nor the OS needs to care about that state.
	 */
	return ((vm_config->guest_flags & GUEST_FLAG_TEE) == 0U);
}
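/*
 * Illustrative sketch (not part of the ACRN sources): how a dynamic VM-creation
 * path might combine get_vmid_by_name() and get_unused_vmid() to pick a slot.
 * The helper name example_pick_vmid_for() is hypothetical; only the two lookup
 * APIs and ACRN_INVALID_VMID come from this file.
 */
static uint16_t example_pick_vmid_for(const char *name)
{
	/* Prefer a statically configured VM whose name already matches. */
	uint16_t vm_id = get_vmid_by_name(name);

	if (vm_id == ACRN_INVALID_VMID) {
		/* Otherwise grab a free, non-static slot in the VM configuration array. */
		vm_id = get_unused_vmid();
	}
	return vm_id;	/* still ACRN_INVALID_VMID if every slot is taken */
}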
/**
 * @pre vm != NULL && vm_config != NULL && vm->vmid < CONFIG_MAX_VM_NUM
 */
bool is_nvmx_configured(const struct acrn_vm *vm)
{
	struct acrn_vm_config *vm_config = get_vm_config(vm->vm_id);

	return ((vm_config->guest_flags & GUEST_FLAG_NVMX_ENABLED) != 0U);
}

/**
 * @pre vm != NULL && vm_config != NULL && vm->vmid < CONFIG_MAX_VM_NUM
 */
bool is_vcat_configured(const struct acrn_vm *vm)
{
	struct acrn_vm_config *vm_config = get_vm_config(vm->vm_id);

	return ((vm_config->guest_flags & GUEST_FLAG_VCAT_ENABLED) != 0U);
}

/**
 * @pre vm != NULL && vm_config != NULL && vm->vmid < CONFIG_MAX_VM_NUM
 */
bool is_static_configured_vm(const struct acrn_vm *vm)
{
	struct acrn_vm_config *vm_config = get_vm_config(vm->vm_id);

	return ((vm_config->guest_flags & GUEST_FLAG_STATIC_VM) != 0U);
}

/**
 * @brief VT-d posted-interrupt (PI) mode can possibly be used for PTDEVs assigned
 * to this VM if the platform supports VT-d PI and LAPIC passthrough is not
 * configured for this VM.
 * However, only single-destination IRQs can be posted, so meeting these two
 * conditions does not necessarily mean posted mode will be used for every PTDEV
 * belonging to the VM; it is used only for a PTDEV whose IRQ is single-destination.
 * @pre vm != NULL
 */
bool is_pi_capable(const struct acrn_vm *vm)
{
	return (platform_caps.pi && (!is_lapic_pt_configured(vm)));
}

struct acrn_vm *get_highest_severity_vm(bool runtime)
{
	uint16_t vm_id, highest_vm_id = 0U;

	for (vm_id = 1U; vm_id < CONFIG_MAX_VM_NUM; vm_id++) {
		if (runtime && is_poweroff_vm(get_vm_from_vmid(vm_id))) {
			/* A VM that does not exist or has been shut down cannot be the highest-severity VM */
			continue;
		}

		if (get_vm_severity(vm_id) > get_vm_severity(highest_vm_id)) {
			highest_vm_id = vm_id;
		}
	}

	return get_vm_from_vmid(highest_vm_id);
}

/**
 * @pre vm != NULL && vm_config != NULL && vm->vmid < CONFIG_MAX_VM_NUM
 */
bool vm_hide_mtrr(const struct acrn_vm *vm)
{
	struct acrn_vm_config *vm_config = get_vm_config(vm->vm_id);

	return ((vm_config->guest_flags & GUEST_FLAG_HIDE_MTRR) != 0U);
}

/**
 * @brief Initialize the I/O bitmap for \p vm
 *
 * @param vm The VM whose I/O bitmap is to be initialized
 */
static void setup_io_bitmap(struct acrn_vm *vm)
{
	if (is_service_vm(vm)) {
		(void)memset(vm->arch_vm.io_bitmap, 0x00U, PAGE_SIZE * 2U);
	} else {
		/* block all I/O port access from the guest */
		(void)memset(vm->arch_vm.io_bitmap, 0xFFU, PAGE_SIZE * 2U);
	}
}

/**
 * return a pointer to the virtual machine structure associated with
 * this VM ID
 *
 * @pre vm_id < CONFIG_MAX_VM_NUM
 */
struct acrn_vm *get_vm_from_vmid(uint16_t vm_id)
{
	return &vm_array[vm_id];
}

/* return a pointer to the virtual machine structure of the Service VM */
struct acrn_vm *get_service_vm(void)
{
	ASSERT(service_vm_ptr != NULL, "service_vm_ptr is NULL");

	return service_vm_ptr;
}

/**
 * @pre vm_config != NULL
 */
static inline uint16_t get_configured_bsp_pcpu_id(const struct acrn_vm_config *vm_config)
{
	/*
	 * The least significant set bit represents the pCPU ID of the BSP;
	 * vm_config->cpu_affinity has been sanitized to contain only valid pCPU IDs.
	 */
	return ffs64(vm_config->cpu_affinity);
}
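/*
 * Worked example for get_configured_bsp_pcpu_id() (illustrative only, not part
 * of the ACRN sources): with cpu_affinity = 0x0CUL (pCPUs 2 and 3), ffs64()
 * returns 2, so pCPU 2 hosts the BSP and the remaining bits host the APs.
 */
static inline void example_bsp_pcpu_selection(void)
{
	uint64_t cpu_affinity = 0x0CUL;			/* hypothetical affinity: pCPU2 | pCPU3 */
	uint16_t bsp_pcpu = ffs64(cpu_affinity);	/* lowest set bit -> 2U */
	uint64_t ap_mask = cpu_affinity & ~(1UL << bsp_pcpu);	/* 0x08UL -> pCPU 3 */

	(void)bsp_pcpu;
	(void)ap_mask;
}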
/**
 * @pre vm != NULL && vm_config != NULL
 */
static void prepare_prelaunched_vm_memmap(struct acrn_vm *vm, const struct acrn_vm_config *vm_config)
{
	uint64_t base_hpa;
	uint64_t base_gpa;
	uint64_t remaining_entry_size;
	uint32_t hpa_index;
	uint64_t base_size;
	uint32_t i;
	struct vm_hpa_regions tmp_vm_hpa;
	const struct e820_entry *entry;

	hpa_index = 0U;
	tmp_vm_hpa = vm_config->memory.host_regions[0];

	for (i = 0U; i < vm->e820_entry_num; i++) {
		entry = &(vm->e820_entries[i]);

		if (entry->length == 0UL) {
			continue;
		} else {
			if (is_software_sram_enabled() && (entry->baseaddr == PRE_RTVM_SW_SRAM_BASE_GPA) &&
					((vm_config->guest_flags & GUEST_FLAG_RT) != 0U)) {
				/* pass through Software SRAM to pre-RTVM */
				ept_add_mr(vm, (uint64_t *)vm->arch_vm.nworld_eptp, get_software_sram_base(),
					PRE_RTVM_SW_SRAM_BASE_GPA, get_software_sram_size(), EPT_RWX | EPT_WB);
				continue;
			}
		}

		if ((entry->type == E820_TYPE_RESERVED) && (entry->baseaddr > MEM_1M)) {
			continue;
		}

		base_gpa = entry->baseaddr;
		remaining_entry_size = entry->length;

		while ((hpa_index < vm_config->memory.region_num) && (remaining_entry_size > 0)) {
			base_hpa = tmp_vm_hpa.start_hpa;
			base_size = min(remaining_entry_size, tmp_vm_hpa.size_hpa);

			if (tmp_vm_hpa.size_hpa > remaining_entry_size) {
				/* from low to high */
				tmp_vm_hpa.start_hpa += base_size;
				tmp_vm_hpa.size_hpa -= base_size;
			} else {
				hpa_index++;
				if (hpa_index < vm_config->memory.region_num) {
					tmp_vm_hpa = vm_config->memory.host_regions[hpa_index];
				}
			}

			if (entry->type != E820_TYPE_RESERVED) {
				ept_add_mr(vm, (uint64_t *)vm->arch_vm.nworld_eptp, base_hpa, base_gpa,
					base_size, EPT_RWX | EPT_WB);
			} else {
				/* GPAs under 1MB are always backed by physical memory */
				ept_add_mr(vm, (uint64_t *)vm->arch_vm.nworld_eptp, base_hpa, base_gpa,
					base_size, EPT_RWX | EPT_UNCACHED);
			}

			remaining_entry_size -= base_size;
			base_gpa += base_size;
		}
	}

	for (i = 0U; i < MAX_MMIO_DEV_NUM; i++) {
		/* Now we include the TPM2 event log region in ACPI NVS, so we need to
		 * delete this potential mapping first.
		 */
		(void)deassign_mmio_dev(vm, &vm_config->mmiodevs[i]);

		(void)assign_mmio_dev(vm, &vm_config->mmiodevs[i]);

#ifdef P2SB_VGPIO_DM_ENABLED
		if ((vm_config->pt_p2sb_bar) && (vm_config->mmiodevs[i].res[0].host_pa == P2SB_BAR_ADDR)) {
			register_vgpio_handler(vm, &vm_config->mmiodevs[i].res[0]);
		}
#endif
	}
}
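/*
 * Illustrative sketch of the GPA->HPA splitting done in prepare_prelaunched_vm_memmap()
 * above (not part of the ACRN sources). A single guest e820 RAM entry may be backed by
 * several host_regions[] chunks; the sizes below are hypothetical.
 */
static void example_split_guest_ram_entry(void)
{
	/* Guest e820 entry: 512 MiB of RAM at GPA 0x100000. */
	uint64_t base_gpa = 0x100000UL;
	uint64_t remaining = 512UL * 1024UL * 1024UL;
	/* Host backing: one 256 MiB chunk and one 1 GiB chunk (only 256 MiB of it is used). */
	uint64_t chunk_size[2] = {256UL * 1024UL * 1024UL, 1024UL * 1024UL * 1024UL};
	uint32_t idx = 0U;

	while ((idx < 2U) && (remaining > 0UL)) {
		uint64_t map_size = min(remaining, chunk_size[idx]);

		/* prepare_prelaunched_vm_memmap() would call ept_add_mr() for this piece here. */
		remaining -= map_size;
		base_gpa += map_size;
		if (chunk_size[idx] <= map_size) {
			idx++;				/* host chunk exhausted, move to the next one */
		} else {
			chunk_size[idx] -= map_size;	/* consume the host chunk from low to high */
		}
	}
}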
static void deny_pci_bar_access(struct acrn_vm *service_vm, const struct pci_pdev *pdev)
{
	uint32_t idx;
	struct pci_vbar vbar = {};
	uint64_t base = 0UL, size = 0UL, mask;
	uint64_t *pml4_page;

	pml4_page = (uint64_t *)service_vm->arch_vm.nworld_eptp;

	for (idx = 0; idx < pdev->nr_bars; idx++) {
		vbar.bar_type.bits = pdev->bars[idx].phy_bar;
		if (!is_pci_reserved_bar(&vbar)) {
			base = pdev->bars[idx].phy_bar;
			size = pdev->bars[idx].size_mask;
			if (is_pci_mem64lo_bar(&vbar)) {
				idx++;
				base |= (((uint64_t)pdev->bars[idx].phy_bar) << 32UL);
				size |= (((uint64_t)pdev->bars[idx].size_mask) << 32UL);
			}

			mask = (is_pci_io_bar(&vbar)) ? PCI_BASE_ADDRESS_IO_MASK : PCI_BASE_ADDRESS_MEM_MASK;
			base &= mask;
			size &= mask;
			size = size & ~(size - 1UL);

			if ((base != 0UL)) {
				if (is_pci_io_bar(&vbar)) {
					base &= 0xffffU;
					deny_guest_pio_access(service_vm, base, size);
				} else {
					/* A passthrough device requires its MMIO BAR base to be 4K aligned. */
					ASSERT((base & PAGE_MASK) != 0U, "%02x:%02x.%d bar[%d] 0x%lx, is not 4K aligned!",
						pdev->bdf.bits.b, pdev->bdf.bits.d, pdev->bdf.bits.f, idx, base);
					size = round_page_up(size);
					ept_del_mr(service_vm, pml4_page, base, size);
				}
			}
		}
	}
}

static void deny_pdevs(struct acrn_vm *service_vm, struct acrn_vm_pci_dev_config *pci_devs, uint16_t pci_dev_num)
{
	uint16_t i;

	for (i = 0; i < pci_dev_num; i++) {
		if (pci_devs[i].pdev != NULL) {
			deny_pci_bar_access(service_vm, pci_devs[i].pdev);
		}
	}
}

static void deny_hv_owned_devices(struct acrn_vm *service_vm)
{
	uint16_t pio_address;
	uint32_t nbytes, i;

	const struct pci_pdev **hv_owned = get_hv_owned_pdevs();

	for (i = 0U; i < get_hv_owned_pdev_num(); i++) {
		deny_pci_bar_access(service_vm, hv_owned[i]);
	}

	if (get_pio_dbg_uart_cfg(&pio_address, &nbytes)) {
		deny_guest_pio_access(service_vm, pio_address, nbytes);
	}
}
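/*
 * Worked example for the BAR-size math in deny_pci_bar_access() above (illustrative
 * only, not part of the ACRN sources). Suppose a 32-bit MMIO BAR reports size_mask
 * 0xFFFF0000 after the sizing write: masking with PCI_BASE_ADDRESS_MEM_MASK keeps
 * 0xFFFF0000, and size & ~(size - 1UL) isolates the lowest set bit, 0x10000, i.e. a
 * 64 KiB BAR. That range is what gets removed from the Service VM's EPT or PIO bitmap.
 */
static inline uint64_t example_bar_size_from_mask(uint64_t size_mask)
{
	uint64_t size = size_mask & PCI_BASE_ADDRESS_MEM_MASK;

	return size & ~(size - 1UL);	/* 0xFFFF0000 -> 0x10000 (64 KiB) */
}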
/**
 * @param[inout] vm pointer to a vm descriptor
 *
 * @pre vm != NULL
 * @pre is_service_vm(vm) == true
 */
static void prepare_service_vm_memmap(struct acrn_vm *vm)
{
	uint16_t vm_id;
	uint32_t i;
	uint64_t hv_hpa;
	uint64_t service_vm_high64_max_ram = MEM_4G;
	struct acrn_vm_config *vm_config;
	uint64_t *pml4_page = (uint64_t *)vm->arch_vm.nworld_eptp;
	struct epc_section *epc_secs;

	const struct e820_entry *entry;
	uint32_t entries_count = vm->e820_entry_num;
	const struct e820_entry *p_e820 = vm->e820_entries;
	struct pci_mmcfg_region *pci_mmcfg;
	uint64_t trampoline_memory_size = round_page_up((uint64_t)(&ld_trampoline_end - &ld_trampoline_start));

	pr_dbg("Service VM e820 layout:\n");
	for (i = 0U; i < entries_count; i++) {
		entry = p_e820 + i;
		pr_dbg("e820 table: %d type: 0x%x", i, entry->type);
		pr_dbg("BaseAddress: 0x%016lx length: 0x%016lx\n", entry->baseaddr, entry->length);
		if (entry->type == E820_TYPE_RAM) {
			service_vm_high64_max_ram = max((entry->baseaddr + entry->length), service_vm_high64_max_ram);
		}
	}

	/* create the real EPT map for [0, service_vm_high64_max_ram) with the UC attribute */
	ept_add_mr(vm, pml4_page, 0UL, 0UL, service_vm_high64_max_ram, EPT_RWX | EPT_UNCACHED);

	/* update RAM entries to the WB attribute */
	for (i = 0U; i < entries_count; i++) {
		entry = p_e820 + i;
		if (entry->type == E820_TYPE_RAM) {
			ept_modify_mr(vm, pml4_page, entry->baseaddr, entry->length, EPT_WB, EPT_MT_MASK);
		}
	}

	/* Unmap all platform EPC resources from the Service VM.
	 * This range has already been marked as reserved by the BIOS in E820,
	 * so any Service VM access to EPC resources will cause an EPT violation.
	 */
	epc_secs = get_phys_epc();
	for (i = 0U; (i < MAX_EPC_SECTIONS) && (epc_secs[i].size != 0UL); i++) {
		ept_del_mr(vm, pml4_page, epc_secs[i].base, epc_secs[i].size);
	}

	/* Unmap the hypervisor itself for safety:
	 * the Service VM will trigger an EPT violation if it accesses hypervisor memory.
	 */
	hv_hpa = hva2hpa((void *)(get_hv_image_base()));
	ept_del_mr(vm, pml4_page, hv_hpa, get_hv_ram_size());

	/* unmap pre-launched VM memory */
	for (vm_id = 0U; vm_id < CONFIG_MAX_VM_NUM; vm_id++) {
		vm_config = get_vm_config(vm_id);
		if (vm_config->load_order == PRE_LAUNCHED_VM) {
			for (i = 0; i < vm_config->memory.region_num; i++) {
				ept_del_mr(vm, pml4_page, vm_config->memory.host_regions[i].start_hpa,
					vm_config->memory.host_regions[i].size_hpa);
			}
			/* Remove MMIO/IO BARs of the pre-launched VM's passthrough devices */
			deny_pdevs(vm, vm_config->pci_devs, vm_config->pci_dev_num);
		}

		for (i = 0U; i < MAX_MMIO_DEV_NUM; i++) {
			(void)deassign_mmio_dev(vm, &vm_config->mmiodevs[i]);
		}
	}

	/* Unmap the AP trampoline code for security.
	 * This buffer is guaranteed to be page aligned.
	 */
	ept_del_mr(vm, pml4_page, get_trampoline_start16_paddr(), trampoline_memory_size);

	/* unmap the PCIe MMCONFIG region since it is owned by the hypervisor */
	pci_mmcfg = get_mmcfg_region();
	ept_del_mr(vm, (uint64_t *)vm->arch_vm.nworld_eptp, pci_mmcfg->address, get_pci_mmcfg_size(pci_mmcfg));

	if (is_software_sram_enabled()) {
		/*
		 * Native Software SRAM resources shall be assigned to either a pre-launched RTVM
		 * or the Service VM. Software SRAM support for post-launched RTVMs is virtualized
		 * in the Service VM.
		 *
		 * 1) Native Software SRAM resources are assigned to a pre-launched RTVM:
		 *    - Remove the Software SRAM regions from the Service VM EPT, to prevent
		 *      the Service VM from using clflush to flush the Software SRAM cache.
		 *    - PRE_RTVM_SW_SRAM_MAX_SIZE is the size of the Software SRAM that the
		 *      pre-launched RTVM uses, presumed to start from the Software SRAM base.
		 *      For other cases, PRE_RTVM_SW_SRAM_MAX_SIZE should be defined as 0,
		 *      and no region will be removed from the Service VM EPT.
		 *
		 * 2) Native Software SRAM resources are assigned to the Service VM:
		 *    - Software SRAM regions are added to the Service VM EPT by default
		 *      with memory type UC.
		 *    - However, the Service VM needs to access the Software SRAM regions
		 *      when virtualizing them for post-launched RTVMs.
		 *    - So the memory type of the Software SRAM regions in the EPT shall be
		 *      updated to EPT_WB.
		 */
#if (PRE_RTVM_SW_SRAM_MAX_SIZE > 0U)
		ept_del_mr(vm, pml4_page, service_vm_hpa2gpa(get_software_sram_base()), PRE_RTVM_SW_SRAM_MAX_SIZE);
#else
		ept_modify_mr(vm, pml4_page, service_vm_hpa2gpa(get_software_sram_base()),
			get_software_sram_size(), EPT_WB, EPT_MT_MASK);
#endif
	}

	/* Unmap the Intel IOMMU register pages:
	 * the Service VM can detect the IOMMU capability in its ACPI tables and may therefore
	 * access IOMMU hardware resources, which is not expected, as the IOMMU hardware is
	 * owned by the hypervisor.
	 */
	for (i = 0U; i < plat_dmar_info.drhd_count; i++) {
		ept_del_mr(vm, pml4_page, plat_dmar_info.drhd_units[i].reg_base_addr, PAGE_SIZE);
	}
}

/* Add EPT mappings of EPC resources for the VM */
static void prepare_epc_vm_memmap(struct acrn_vm *vm)
{
	struct epc_map *vm_epc_maps;
	uint32_t i;

	if (is_vsgx_supported(vm->vm_id)) {
		vm_epc_maps = get_epc_mapping(vm->vm_id);
		for (i = 0U; (i < MAX_EPC_SECTIONS) && (vm_epc_maps[i].size != 0UL); i++) {
			ept_add_mr(vm, (uint64_t *)vm->arch_vm.nworld_eptp, vm_epc_maps[i].hpa,
				vm_epc_maps[i].gpa, vm_epc_maps[i].size, EPT_RWX | EPT_WB);
		}
	}
}

/**
 * @brief get the bitmap of pCPUs whose vCPUs have LAPIC passthrough enabled
 *
 * @param[in] vm pointer to vm data structure
 * @pre vm != NULL
 *
 * @return pCPU bitmap
 */
static uint64_t lapic_pt_enabled_pcpu_bitmap(struct acrn_vm *vm)
{
	uint16_t i;
	struct acrn_vcpu *vcpu;
	uint64_t bitmap = 0UL;

	if (is_lapic_pt_configured(vm)) {
		foreach_vcpu(i, vm, vcpu) {
			if (is_x2apic_enabled(vcpu_vlapic(vcpu))) {
				bitmap_set_nolock(pcpuid_from_vcpu(vcpu), &bitmap);
			}
		}
	}

	return bitmap;
}

void prepare_vm_identical_memmap(struct acrn_vm *vm, uint16_t e820_entry_type, uint64_t prot_orig)
{
	const struct e820_entry *entry;
	const struct e820_entry *p_e820 = vm->e820_entries;
	uint32_t entries_count = vm->e820_entry_num;
	uint64_t *pml4_page = (uint64_t *)vm->arch_vm.nworld_eptp;
	uint32_t i;

	for (i = 0U; i < entries_count; i++) {
		entry = p_e820 + i;
		if (entry->type == e820_entry_type) {
			ept_add_mr(vm, pml4_page, entry->baseaddr, entry->baseaddr, entry->length, prot_orig);
		}
	}
}
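/*
 * Illustrative sketch (not part of the ACRN sources): prepare_vm_identical_memmap()
 * builds 1:1 GPA==HPA mappings for every e820 entry of a given type. A hypothetical
 * caller could identity-map all RAM entries with write-back caching like this.
 */
static void example_identity_map_ram(struct acrn_vm *vm)
{
	/* Map each E820_TYPE_RAM entry of the VM 1:1 with RWX permissions and WB caching. */
	prepare_vm_identical_memmap(vm, E820_TYPE_RAM, EPT_RWX | EPT_WB);
}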
/**
 * @pre vm_id < CONFIG_MAX_VM_NUM && vm_config != NULL && rtn_vm != NULL
 * @pre vm->state == VM_POWERED_OFF
 */
int32_t create_vm(uint16_t vm_id, uint64_t pcpu_bitmap, struct acrn_vm_config *vm_config, struct acrn_vm **rtn_vm)
{
	struct acrn_vm *vm = NULL;
	int32_t status = 0;
	uint16_t pcpu_id;

	/* Allocate memory for the virtual machine */
	vm = &vm_array[vm_id];
	vm->vm_id = vm_id;
	vm->hw.created_vcpus = 0U;

	init_ept_pgtable(&vm->arch_vm.ept_pgtable, vm->vm_id);
	vm->arch_vm.nworld_eptp = pgtable_create_root(&vm->arch_vm.ept_pgtable);

	(void)memcpy_s(&vm->name[0], MAX_VM_NAME_LEN, &vm_config->name[0], MAX_VM_NAME_LEN);

	if (is_service_vm(vm)) {
		/* Only for the Service VM */
		create_service_vm_e820(vm);
		prepare_service_vm_memmap(vm);

		status = init_vm_boot_info(vm);
	} else {
		/* For PRE_LAUNCHED_VM and POST_LAUNCHED_VM */
		if ((vm_config->guest_flags & GUEST_FLAG_SECURE_WORLD_ENABLED) != 0U) {
			vm->sworld_control.flag.supported = 1U;
		}
		if (vm->sworld_control.flag.supported != 0UL) {
			uint16_t service_vm_id = (get_service_vm())->vm_id;
			uint16_t page_idx = vmid_2_rel_vmid(service_vm_id, vm_id) - 1U;

			ept_add_mr(vm, (uint64_t *)vm->arch_vm.nworld_eptp,
				hva2hpa(post_user_vm_sworld_memory[page_idx]),
				TRUSTY_EPT_REBASE_GPA, TRUSTY_RAM_SIZE, EPT_WB | EPT_RWX);
		}
		if (vm_config->name[0] == '\0') {
			/* if the VM name is not configured, derive one from the VM ID */
			snprintf(vm_config->name, 16, "ACRN VM_%d", vm_id);
		}

		if (vm_config->load_order == PRE_LAUNCHED_VM) {
			/*
			 * If a pre-launched VM has the GUEST_FLAG_TEE flag set, it is a special
			 * pre-launched VM called the TEE VM, which needs a special memmap,
			 * e.g. mapping the REE VM into its address space. Otherwise,
			 * just use the standard pre-launched VM memmap.
			 */
			if ((vm_config->guest_flags & GUEST_FLAG_TEE) != 0U) {
				prepare_tee_vm_memmap(vm, vm_config);
			} else {
				create_prelaunched_vm_e820(vm);
				prepare_prelaunched_vm_memmap(vm, vm_config);
			}
			status = init_vm_boot_info(vm);
		}
	}

	if (status == 0) {
		prepare_epc_vm_memmap(vm);

		spinlock_init(&vm->vlapic_mode_lock);
		spinlock_init(&vm->ept_lock);
		spinlock_init(&vm->emul_mmio_lock);
		spinlock_init(&vm->arch_vm.iwkey_backup_lock);

		vm->arch_vm.vlapic_mode = VM_VLAPIC_XAPIC;
		vm->arch_vm.vm_mwait_cap = has_monitor_cap();
		vm->intr_inject_delay_delta = 0UL;
		vm->nr_emul_mmio_regions = 0U;
		vm->vcpuid_entry_nr = 0U;

		/* Set up the I/O bitmap such that a VM exit occurs on
		 * the selected I/O ranges
		 */
		setup_io_bitmap(vm);

		init_guest_pm(vm);

		if (is_nvmx_configured(vm)) {
			init_nested_vmx(vm);
		}

		if (!is_lapic_pt_configured(vm)) {
			vpic_init(vm);
		}

		if (is_rt_vm(vm) || !is_postlaunched_vm(vm)) {
			vrtc_init(vm);
		}

		if (is_service_vm(vm)) {
			deny_hv_owned_devices(vm);
		}

#ifdef CONFIG_SECURITY_VM_FIXUP
		passthrough_smbios(vm, get_acrn_boot_info());
#endif

		status = init_vpci(vm);
		if (status == 0) {
			enable_iommu();

			/* Create the virtual UARTs */
			init_legacy_vuarts(vm, vm_config->vuart);

			register_reset_port_handler(vm);

			/* the vPIC wire_mode default is INTR */
			vm->wire_mode = VPIC_WIRE_INTR;

			/* Init a fully emulated vIOAPIC instance:
			 * present a virtual IOAPIC to the guest as a placeholder interrupt controller,
			 * even if the guest uses a passthrough LAPIC. This satisfies some guest OSes,
			 * even though the vIOAPIC functionality does not work in that case.
			 */
			vioapic_init(vm);

			/* Populate the returned VM handle */
			*rtn_vm = vm;
			vm->sw.io_shared_page = NULL;
			if ((vm_config->load_order == POST_LAUNCHED_VM) &&
					((vm_config->guest_flags & GUEST_FLAG_IO_COMPLETION_POLLING) != 0U)) {
				/* enable I/O completion polling mode per the guest flags in vm_config */
				vm->sw.is_polling_ioreq = true;
			}
			status = set_vcpuid_entries(vm);
			if (status == 0) {
				vm->state = VM_CREATED;
			}
		}
	}

	if (status == 0) {
		/* We rely on the following assumptions:
		 * 1) The vCPUs used by the Service VM have been offlined by the DM before a User VM reuses them.
		 * 2) pcpu_bitmap has passed sanitization and is valid for vCPU creation.
		 */
		vm->hw.cpu_affinity = pcpu_bitmap;

		uint64_t tmp64 = pcpu_bitmap;
		while (tmp64 != 0UL) {
			pcpu_id = ffs64(tmp64);
			bitmap_clear_nolock(pcpu_id, &tmp64);
			status = prepare_vcpu(vm, pcpu_id);
			if (status != 0) {
				break;
			}
		}
	}

	if (status == 0) {
		uint16_t i;

		for (i = 0U; i < vm_config->pt_intx_num; i++) {
			status = ptirq_add_intx_remapping(vm, vm_config->pt_intx[i].virt_gsi,
					vm_config->pt_intx[i].phys_gsi, false);
			if (status != 0) {
				ptirq_remove_configured_intx_remappings(vm);
				break;
			}
		}
	}

	if ((status != 0) && (vm->arch_vm.nworld_eptp != NULL)) {
		(void)memset(vm->arch_vm.nworld_eptp, 0U, PAGE_SIZE);
	}

	return status;
}

static bool is_ready_for_system_shutdown(void)
{
	bool ret = true;
	uint16_t vm_id;
	struct acrn_vm *vm;

	for (vm_id = 0U; vm_id < CONFIG_MAX_VM_NUM; vm_id++) {
		vm = get_vm_from_vmid(vm_id);
		/* TODO: Update code to cover hybrid mode */
		if (!is_poweroff_vm(vm) && is_stateful_vm(vm)) {
			ret = false;
			break;
		}
	}

	return ret;
}
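/*
 * Illustrative sketch of the pcpu_bitmap walk used in create_vm() above (not part of
 * the ACRN sources): ffs64() yields the lowest set bit and bitmap_clear_nolock()
 * retires it, so each pCPU in the affinity mask is visited exactly once, low to high.
 */
static void example_walk_pcpu_bitmap(uint64_t pcpu_bitmap)
{
	uint64_t tmp64 = pcpu_bitmap;	/* e.g. 0x0DUL visits pCPU 0, 2, 3 in that order */

	while (tmp64 != 0UL) {
		uint16_t pcpu_id = ffs64(tmp64);

		bitmap_clear_nolock(pcpu_id, &tmp64);
		/* create_vm() calls prepare_vcpu(vm, pcpu_id) at this point. */
	}
}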
static int32_t offline_lapic_pt_enabled_pcpus(const struct acrn_vm *vm, uint64_t pcpu_mask)
{
	int32_t ret = 0;
	uint16_t i;
	uint64_t mask = pcpu_mask;
	const struct acrn_vcpu *vcpu = NULL;
	uint16_t this_pcpu_id = get_pcpu_id();

	if (bitmap_test(this_pcpu_id, &mask)) {
		bitmap_clear_nolock(this_pcpu_id, &mask);
		if (vm->state == VM_POWERED_OFF) {
			/*
			 * If the current pCPU needs to offline itself,
			 * it will be done after shutdown_vm() completes
			 * in the idle thread.
			 */
			make_pcpu_offline(this_pcpu_id);
		} else {
			/*
			 * The current pCPU cannot offline itself here
			 */
			pr_warn("%s: cannot offline self(%u)",
				__func__, this_pcpu_id);
			ret = -EINVAL;
		}
	}

	foreach_vcpu(i, vm, vcpu) {
		if (bitmap_test(pcpuid_from_vcpu(vcpu), &mask)) {
			make_pcpu_offline(pcpuid_from_vcpu(vcpu));
		}
	}

	wait_pcpus_offline(mask);

	if (!start_pcpus(mask)) {
		pr_fatal("Failed to start all cpus in mask(0x%lx)", mask);
		ret = -ETIMEDOUT;
	}
	return ret;
}

/*
 * @pre vm != NULL
 * @pre vm->state == VM_PAUSED
 */
int32_t shutdown_vm(struct acrn_vm *vm)
{
	uint16_t i;
	uint64_t mask;
	struct acrn_vcpu *vcpu = NULL;
	struct acrn_vm_config *vm_config = NULL;
	int32_t ret = 0;

	/* Only a paused VM is allowed to shut down */
	vm->state = VM_POWERED_OFF;

	if (is_service_vm(vm)) {
		sbuf_reset();
	}

	ptirq_remove_configured_intx_remappings(vm);

	deinit_legacy_vuarts(vm);

	deinit_vpci(vm);

	deinit_emul_io(vm);

	/* Free EPT-allocated resources assigned to the VM */
	destroy_ept(vm);

	mask = lapic_pt_enabled_pcpu_bitmap(vm);
	if (mask != 0UL) {
		ret = offline_lapic_pt_enabled_pcpus(vm, mask);
	}

	foreach_vcpu(i, vm, vcpu) {
		offline_vcpu(vcpu);
	}

	/* once guest_flags are no longer used, clear the DM-owned bits */
	vm_config = get_vm_config(vm->vm_id);
	vm_config->guest_flags &= ~DM_OWNED_GUEST_FLAG_MASK;

	if (!is_static_configured_vm(vm)) {
		memset(vm_config->name, 0U, MAX_VM_NAME_LEN);
	}

	if (is_ready_for_system_shutdown()) {
		/* If no guest is running, shut down the system */
		shutdown_system();
	}

	/* Return status to caller */
	return ret;
}

/**
 * @pre vm != NULL
 * @pre vm->state == VM_CREATED
 */
void start_vm(struct acrn_vm *vm)
{
	struct acrn_vcpu *bsp = NULL;

	vm->state = VM_RUNNING;

	/* Only start the BSP (vid = 0) and let the BSP start the other APs */
	bsp = vcpu_from_vid(vm, BSP_CPU_ID);
	vcpu_make_request(bsp, ACRN_REQUEST_INIT_VMCS);
	launch_vcpu(bsp);
}

/**
 * @pre vm != NULL
 * @pre vm->state == VM_PAUSED
 */
int32_t reset_vm(struct acrn_vm *vm)
{
	uint16_t i;
	uint64_t mask;
	struct acrn_vcpu *vcpu = NULL;
	int32_t ret = 0;

	mask = lapic_pt_enabled_pcpu_bitmap(vm);
	if (mask != 0UL) {
		ret = offline_lapic_pt_enabled_pcpus(vm, mask);
	}

	foreach_vcpu(i, vm, vcpu) {
		reset_vcpu(vcpu, COLD_RESET);
	}

	/*
	 * Set the VM vLAPIC state to VM_VLAPIC_XAPIC
	 */
	vm->arch_vm.vlapic_mode = VM_VLAPIC_XAPIC;

	if (is_service_vm(vm)) {
		(void)prepare_os_image(vm);
	}

	reset_vm_ioreqs(vm);
	reset_vioapics(vm);
	destroy_secure_world(vm, false);
	vm->sworld_control.flag.active = 0UL;
	vm->arch_vm.iwkey_backup_status = 0UL;

	vm->state = VM_CREATED;

	return ret;
}

/**
 * @pre vm != NULL
 */
void poweroff_if_rt_vm(struct acrn_vm *vm)
{
	if (is_rt_vm(vm) && !is_paused_vm(vm) && !is_poweroff_vm(vm)) {
		vm->state = VM_READY_TO_POWEROFF;
	}
}

/**
 * @pre vm != NULL
 */
void pause_vm(struct acrn_vm *vm)
{
	uint16_t i;
	struct acrn_vcpu *vcpu = NULL;

	/* For an RTVM, its vCPUs can only be paused when it is powering itself off */
	if (((!is_rt_vm(vm)) && (vm->state == VM_RUNNING)) ||
			((is_rt_vm(vm)) && (vm->state == VM_READY_TO_POWEROFF)) ||
			(vm->state == VM_CREATED)) {
		foreach_vcpu(i, vm, vcpu) {
			zombie_vcpu(vcpu, VCPU_ZOMBIE);
		}
		vm->state = VM_PAUSED;
	}
}
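/*
 * Illustrative sketch (not part of the ACRN sources): the preconditions above imply
 * that a VM is paused before it is shut down. The helper name example_teardown_vm()
 * is hypothetical; get_vm_lock()/put_vm_lock() are the state-lock helpers defined at
 * the end of this file and assumed to be declared in this module's header.
 */
static int32_t example_teardown_vm(struct acrn_vm *vm)
{
	int32_t ret;

	get_vm_lock(vm);
	pause_vm(vm);			/* VM_RUNNING/VM_CREATED -> VM_PAUSED */
	ret = shutdown_vm(vm);		/* requires vm->state == VM_PAUSED */
	put_vm_lock(vm);

	return ret;
}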
/**
 * @brief Resume a VM from the S3 state
 *
 * To resume the VM after the guest enters S3:
 * - reset the BSP
 * - the BSP will be put into real mode with its entry set to wakeup_vec
 * - init_vmcs for the BSP. We can call init_vmcs here because we know the current
 *   pCPU is mapped to the BSP of this VM.
 *
 * @param[in] vm pointer to vm data structure
 * @param[in] wakeup_vec the resume address of the VM
 *
 * @pre vm != NULL
 * @pre is_service_vm(vm) && vm->state == VM_PAUSED
 */
void resume_vm_from_s3(struct acrn_vm *vm, uint32_t wakeup_vec)
{
	struct acrn_vcpu *bsp = vcpu_from_vid(vm, BSP_CPU_ID);

	vm->state = VM_RUNNING;

	reset_vcpu(bsp, POWER_ON_RESET);

	/* When the Service VM resumes from S3, it returns to real mode
	 * with its entry set to wakeup_vec.
	 */
	set_vcpu_startup_entry(bsp, wakeup_vec);

	init_vmcs(bsp);
	launch_vcpu(bsp);
}

static uint8_t loaded_pre_vm_nr = 0U;

/**
 * Prepare to create the vm/vcpu for the VM
 *
 * @pre vm_id < CONFIG_MAX_VM_NUM && vm_config != NULL
 */
int32_t prepare_vm(uint16_t vm_id, struct acrn_vm_config *vm_config)
{
	int32_t err = 0;
	struct acrn_vm *vm = NULL;

#ifdef CONFIG_SECURITY_VM_FIXUP
	security_vm_fixup(vm_id);
#endif
	if (get_vmid_by_name(vm_config->name) != vm_id) {
		pr_err("Invalid VM name: %s", vm_config->name);
		err = -1;
	} else {
		/* The Service VM and pre-launched VMs launch on all pCPUs defined in vm_config->cpu_affinity */
		err = create_vm(vm_id, vm_config->cpu_affinity, vm_config, &vm);
	}

	if (err == 0) {
		if (is_prelaunched_vm(vm)) {
			build_vrsdp(vm);
		}

		if (is_service_vm(vm)) {
			/* We need to ensure that all modules of the pre-launched VMs have already been
			 * loaded before loading the Service VM modules; otherwise a pre-launched VM's
			 * module could be corrupted, because the Service VM kernel might pick any usable
			 * RAM to extract its kernel when KASLR is enabled.
			 * In case the pre-launched VMs fail to load and would otherwise deadlock this
			 * loop, use a 10000 ms timeout to break out of the wait.
			 */
			uint64_t start_tick = cpu_ticks();

			while (loaded_pre_vm_nr != PRE_VM_NUM) {
				uint64_t timeout = ticks_to_ms(cpu_ticks() - start_tick);

				if (timeout > 10000U) {
					pr_err("Loading pre-launched VMs timeout!");
					break;
				}
			}
		}

		err = prepare_os_image(vm);

		if (is_prelaunched_vm(vm)) {
			loaded_pre_vm_nr++;
		}
	}

	return err;
}

/**
 * @pre vm_config != NULL
 * @Application constraint: The validity of vm_config->cpu_affinity should be guaranteed before run-time.
 */
void launch_vms(uint16_t pcpu_id)
{
	uint16_t vm_id;
	struct acrn_vm_config *vm_config;

	for (vm_id = 0U; vm_id < CONFIG_MAX_VM_NUM; vm_id++) {
		vm_config = get_vm_config(vm_id);
		if (((vm_config->guest_flags & GUEST_FLAG_REE) != 0U) &&
				((vm_config->guest_flags & GUEST_FLAG_TEE) != 0U)) {
			ASSERT(false, "%s: Wrong VM (VM id: %u) configuration, can't set both REE and TEE flags",
				__func__, vm_id);
		}

		if ((vm_config->load_order == SERVICE_VM) || (vm_config->load_order == PRE_LAUNCHED_VM)) {
			if (pcpu_id == get_configured_bsp_pcpu_id(vm_config)) {
				if (vm_config->load_order == SERVICE_VM) {
					service_vm_ptr = &vm_array[vm_id];
				}

				/*
				 * We can only start a VM when there is no error in prepare_vm;
				 * otherwise, the corresponding error is printed out.
				 *
				 * A REE VM can only be started when the notification from the TEE VM
				 * arrives, so skip start_vm here for REE and start it in the TEE
				 * hypercall HC_TEE_VCPU_BOOT_DONE.
				 */
				if (prepare_vm(vm_id, vm_config) == 0) {
					if ((vm_config->guest_flags & GUEST_FLAG_REE) != 0U) {
						/* Nothing to do here; the REE VM is started in the TEE hypercall */
					} else {
						start_vm(get_vm_from_vmid(vm_id));
						pr_acrnlog("Start VM id: %x name: %s", vm_id, vm_config->name);
					}
				}
			}
		}
	}
}
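/*
 * Illustrative sketch of the polling-with-timeout pattern used in prepare_vm() above
 * (not part of the ACRN sources): cpu_ticks() gives a monotonic timestamp and
 * ticks_to_ms() converts the elapsed delta, so the loop gives up after roughly
 * timeout_ms milliseconds. The predicate example_condition_met() is a hypothetical
 * stand-in for "all pre-launched VMs are loaded".
 */
static bool example_wait_with_timeout(bool (*example_condition_met)(void), uint64_t timeout_ms)
{
	uint64_t start_tick = cpu_ticks();
	bool ok = true;

	while (!example_condition_met()) {
		if (ticks_to_ms(cpu_ticks() - start_tick) > timeout_ms) {
			ok = false;	/* the condition never became true in time */
			break;
		}
	}
	return ok;
}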
/*
 * @brief Update the vLAPIC state of a VM
 *
 * The vLAPICs of a VM switch between modes in an asynchronous fashion. This API
 * captures the "transition" state triggered when one vLAPIC switches mode.
 * When the VM is created, the state is set to "xAPIC" as all vLAPICs are set up
 * in xAPIC mode.
 *
 * Upon reset, all LAPICs switch to xAPIC mode according to SDM 10.12.5.
 * Even when a VM uses x2APIC mode for its vLAPICs, in the reset or shutdown flow the
 * vLAPIC state moves to "xAPIC" directly without going through "transition".
 *
 * VM_VLAPIC_X2APIC - All the online vCPUs/vLAPICs of this VM use x2APIC mode
 * VM_VLAPIC_XAPIC - All the online vCPUs/vLAPICs of this VM use xAPIC mode
 * VM_VLAPIC_DISABLED - All the online vCPUs/vLAPICs of this VM are in disabled mode
 * VM_VLAPIC_TRANSITION - The online vCPUs/vLAPICs of this VM are in the middle of a transition
 *
 * @pre vm != NULL
 */
void update_vm_vlapic_state(struct acrn_vm *vm)
{
	uint16_t i;
	struct acrn_vcpu *vcpu;
	uint16_t vcpus_in_x2apic, vcpus_in_xapic;
	enum vm_vlapic_mode vlapic_mode = VM_VLAPIC_XAPIC;

	vcpus_in_x2apic = 0U;
	vcpus_in_xapic = 0U;
	spinlock_obtain(&vm->vlapic_mode_lock);
	foreach_vcpu(i, vm, vcpu) {
		/* Skip vCPUs in any state other than VCPU_RUNNING, as they may be offline. */
		if (vcpu->state == VCPU_RUNNING) {
			if (is_x2apic_enabled(vcpu_vlapic(vcpu))) {
				vcpus_in_x2apic++;
			} else if (is_xapic_enabled(vcpu_vlapic(vcpu))) {
				vcpus_in_xapic++;
			} else {
				/*
				 * The vCPU is using its vLAPIC in disabled mode
				 */
			}
		}
	}

	if ((vcpus_in_x2apic == 0U) && (vcpus_in_xapic == 0U)) {
		/*
		 * Both vcpus_in_x2apic and vcpus_in_xapic are zero:
		 * VM_VLAPIC_DISABLED
		 */
		vlapic_mode = VM_VLAPIC_DISABLED;
	} else if ((vcpus_in_x2apic != 0U) && (vcpus_in_xapic != 0U)) {
		/*
		 * Both vcpus_in_x2apic and vcpus_in_xapic are non-zero:
		 * VM_VLAPIC_TRANSITION
		 */
		vlapic_mode = VM_VLAPIC_TRANSITION;
	} else if (vcpus_in_x2apic != 0U) {
		/*
		 * Only vcpus_in_x2apic is non-zero:
		 * VM_VLAPIC_X2APIC
		 */
		vlapic_mode = VM_VLAPIC_X2APIC;
	} else {
		/*
		 * Only vcpus_in_xapic is non-zero:
		 * VM_VLAPIC_XAPIC
		 */
		vlapic_mode = VM_VLAPIC_XAPIC;
	}

	vm->arch_vm.vlapic_mode = vlapic_mode;
	spinlock_release(&vm->vlapic_mode_lock);
}

/*
 * @brief Check the vLAPIC mode of a VM
 *
 * @pre vm != NULL
 */
enum vm_vlapic_mode check_vm_vlapic_mode(const struct acrn_vm *vm)
{
	enum vm_vlapic_mode vlapic_mode;

	vlapic_mode = vm->arch_vm.vlapic_mode;
	return vlapic_mode;
}

/**
 * Return true if there is at least one RT VM; otherwise return false.
 */
bool has_rt_vm(void)
{
	uint16_t vm_id;

	for (vm_id = 0U; vm_id < CONFIG_MAX_VM_NUM; vm_id++) {
		if (is_rt_vm(get_vm_from_vmid(vm_id))) {
			break;
		}
	}

	return (vm_id != CONFIG_MAX_VM_NUM);
}

void make_shutdown_vm_request(uint16_t pcpu_id)
{
	bitmap_set_lock(NEED_SHUTDOWN_VM, &per_cpu(pcpu_flag, pcpu_id));
	if (get_pcpu_id() != pcpu_id) {
		send_single_ipi(pcpu_id, NOTIFY_VCPU_VECTOR);
	}
}

bool need_shutdown_vm(uint16_t pcpu_id)
{
	return bitmap_test_and_clear_lock(NEED_SHUTDOWN_VM, &per_cpu(pcpu_flag, pcpu_id));
}

/*
 * @pre vm != NULL
 */
void get_vm_lock(struct acrn_vm *vm)
{
	spinlock_obtain(&vm->vm_state_lock);
}

/*
 * @pre vm != NULL
 */
void put_vm_lock(struct acrn_vm *vm)
{
	spinlock_release(&vm->vm_state_lock);
}
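
/*
 * Illustrative sketch (not part of the ACRN sources): get_vm_lock()/put_vm_lock()
 * serialize VM state transitions, so a hypothetical caller that wants to start a
 * freshly created VM would bracket the state check and start_vm() with the lock.
 */
static void example_start_if_created(uint16_t vm_id)
{
	struct acrn_vm *vm = get_vm_from_vmid(vm_id);

	get_vm_lock(vm);
	if (is_created_vm(vm)) {
		start_vm(vm);	/* moves the VM to VM_RUNNING */
	}
	put_vm_lock(vm);
}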