From 731c4836dde78d123f2c5d9421cfb6c4d3431d23 Mon Sep 17 00:00:00 2001
From: Jason Chen CJ
Date: Mon, 17 Dec 2018 13:56:42 +0800
Subject: [PATCH] modularization: separate vmx.c into two parts

one part is pure vmx operations, which stays in vmx.c;
the other part is vmcs operations, which are vcpu related; move them
into vmcs.c

Changes to be committed:
	modified:   Makefile
	copied:     arch/x86/vmx.c -> arch/x86/vmcs.c
	modified:   arch/x86/vmx.c
	modified:   arch/x86/vmx_asm.S
	modified:   include/arch/x86/hv_arch.h
	new file:   include/arch/x86/vmcs.h
	modified:   include/arch/x86/vmx.h

Tracked-On: #1842
Signed-off-by: Jason Chen CJ
---
 hypervisor/Makefile                   |   1 +
 hypervisor/arch/x86/vmcs.c            | 969 ++++++++++++++++++++++++++
 hypervisor/arch/x86/vmx.c             | 960 +------------------------
 hypervisor/arch/x86/vmx_asm.S         |   2 +-
 hypervisor/include/arch/x86/hv_arch.h |   1 +
 hypervisor/include/arch/x86/vmcs.h    |  85 +++
 hypervisor/include/arch/x86/vmx.h     |  79 +--
 7 files changed, 1061 insertions(+), 1036 deletions(-)
 create mode 100644 hypervisor/arch/x86/vmcs.c
 create mode 100644 hypervisor/include/arch/x86/vmcs.h

diff --git a/hypervisor/Makefile b/hypervisor/Makefile
index c9e88bac5..e805c65d0 100644
--- a/hypervisor/Makefile
+++ b/hypervisor/Makefile
@@ -151,6 +151,7 @@ C_SRCS += arch/x86/io.c
 C_SRCS += arch/x86/virq.c
 C_SRCS += arch/x86/vmexit.c
 C_SRCS += arch/x86/vmx.c
+C_SRCS += arch/x86/vmcs.c
 C_SRCS += arch/x86/assign.c
 C_SRCS += arch/x86/trusty.c
 C_SRCS += arch/x86/cpu_state_tbl.c
diff --git a/hypervisor/arch/x86/vmcs.c b/hypervisor/arch/x86/vmcs.c
new file mode 100644
index 000000000..a8a2f625b
--- /dev/null
+++ b/hypervisor/arch/x86/vmcs.c
@@ -0,0 +1,969 @@
+/*
+ * Copyright (C) 2018 Intel Corporation. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * this file contains vmcs operations which are vcpu related
+ */
+
+#include
+#include
+#include
+#ifdef CONFIG_EFI_STUB
+extern struct efi_context* efi_ctx;
+#endif
+
+#define REAL_MODE_BSP_INIT_CODE_SEL	(0xf000U)
+#define REAL_MODE_DATA_SEG_AR		(0x0093U)
+#define REAL_MODE_CODE_SEG_AR		(0x009fU)
+#define PROTECTED_MODE_DATA_SEG_AR	(0xc093U)
+#define PROTECTED_MODE_CODE_SEG_AR	(0xc09bU)
+#define DR7_INIT_VALUE			(0x400UL)
+#define LDTR_AR	(0x0082U) /* LDT, type must be 2, refer to SDM Vol3 26.3.1.2 */
+#define TR_AR	(0x008bU) /* TSS (busy), refer to SDM Vol3 26.3.1.2 */
+
+static uint64_t cr0_host_mask;
+static uint64_t cr0_always_on_mask;
+static uint64_t cr0_always_off_mask;
+static uint64_t cr4_host_mask;
+static uint64_t cr4_always_on_mask;
+static uint64_t cr4_always_off_mask;
+
+bool is_vmx_disabled(void)
+{
+	uint64_t msr_val;
+
+	/* Read Feature Control MSR */
+	msr_val = msr_read(MSR_IA32_FEATURE_CONTROL);
+
+	/* Check if feature control is locked and vmx cannot be enabled */
+	if ((msr_val & MSR_IA32_FEATURE_CONTROL_LOCK) != 0U &&
+	    (msr_val & MSR_IA32_FEATURE_CONTROL_VMX_NO_SMX) == 0U) {
+		return true;
+	}
+	return false;
+}
+
+static void init_cr0_cr4_host_mask(void)
+{
+	static bool inited = false;
+	uint64_t fixed0, fixed1;
+
+	if (!inited) {
+		/* Read the CR0 fixed0 / fixed1 MSR registers */
+		fixed0 = msr_read(MSR_IA32_VMX_CR0_FIXED0);
+		fixed1 = msr_read(MSR_IA32_VMX_CR0_FIXED1);
+
+		cr0_host_mask = ~(fixed0 ^ fixed1);
+		/* Add the bits hv wants to trap */
+		cr0_host_mask |= CR0_TRAP_MASK;
+		/* CR0: clear PE/PG from the always-on bits due to the
+		 * "unrestricted guest" feature */
+		cr0_always_on_mask = fixed0 & (~(CR0_PE | CR0_PG));
+		cr0_always_off_mask = ~fixed1;
+		/* SDM 2.5
+		 * bits 63:32 of CR0 and CR4 are reserved and must be written
+		 * zero. We could merge them into the always-off mask.
+		 */
+		cr0_always_off_mask |= 0xFFFFFFFF00000000UL;
+
+		/* Read the CR4 fixed0 / fixed1 MSR registers */
+		fixed0 = msr_read(MSR_IA32_VMX_CR4_FIXED0);
+		fixed1 = msr_read(MSR_IA32_VMX_CR4_FIXED1);
+
+		cr4_host_mask = ~(fixed0 ^ fixed1);
+		/* Add the bits hv wants to trap */
+		cr4_host_mask |= CR4_TRAP_MASK;
+		cr4_always_on_mask = fixed0;
+		/* Record the bits fixed to 0 for CR4, including reserved bits */
+		cr4_always_off_mask = ~fixed1;
+		/* SDM 2.5
+		 * bits 63:32 of CR0 and CR4 are reserved and must be written
+		 * zero. We could merge them into the always-off mask.
+		 */
+		cr4_always_off_mask |= 0xFFFFFFFF00000000UL;
+		cr4_always_off_mask |= CR4_RESERVED_MASK;
+		inited = true;
+	}
+
+	exec_vmwrite(VMX_CR0_MASK, cr0_host_mask);
+	/* Output CR0 mask value */
+	pr_dbg("CR0 mask value: 0x%016llx", cr0_host_mask);
+
+	exec_vmwrite(VMX_CR4_MASK, cr4_host_mask);
+	/* Output CR4 mask value */
+	pr_dbg("CR4 mask value: 0x%016llx", cr4_host_mask);
+}
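+/*
+ * Illustrative sketch (added for clarity, not part of the original patch
+ * logic): how the FIXED0/FIXED1 MSR pair above determines the host-owned
+ * mask. A CR bit is flexible only when FIXED0 reports 0 and FIXED1 reports
+ * 1 for that position; every other combination pins the bit, so it must be
+ * host-owned.
+ */
+static inline uint64_t calc_cr_host_mask(uint64_t fixed0, uint64_t fixed1,
+	uint64_t trap_mask)
+{
+	/* fixed0 ^ fixed1 has a 1 exactly at the guest-flexible positions */
+	uint64_t flexible = fixed0 ^ fixed1;
+
+	/* non-flexible bits are host-owned; add the bits hv wants to trap */
+	return (~flexible) | trap_mask;
+}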
+
+uint64_t vmx_rdmsr_pat(const struct acrn_vcpu *vcpu)
+{
+	/*
+	 * note: if run_ctx->cr0.CD is set, the actual value in the guest's
+	 * IA32_PAT MSR is PAT_ALL_UC_VALUE, which may be different from
+	 * the saved value guest_msrs[MSR_IA32_PAT]
+	 */
+	return vcpu_get_guest_msr(vcpu, MSR_IA32_PAT);
+}
+
+int32_t vmx_wrmsr_pat(struct acrn_vcpu *vcpu, uint64_t value)
+{
+	uint32_t i;
+	uint64_t field;
+
+	for (i = 0U; i < 8U; i++) {
+		field = (value >> (i * 8U)) & 0xffUL;
+		if (pat_mem_type_invalid(field) ||
+				((PAT_FIELD_RSV_BITS & field) != 0UL)) {
+			pr_err("invalid guest IA32_PAT: 0x%016llx", value);
+			return -EINVAL;
+		}
+	}
+
+	vcpu_set_guest_msr(vcpu, MSR_IA32_PAT, value);
+
+	/*
+	 * If context->cr0.CD is set, we defer any further requests to write
+	 * the guest's IA32_PAT until guest's CR0.CD is cleared
+	 */
+	if ((vcpu_get_cr0(vcpu) & CR0_CD) == 0UL) {
+		exec_vmwrite64(VMX_GUEST_IA32_PAT_FULL, value);
+	}
+
+	return 0;
+}
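+/*
+ * Worked example (illustration only, not part of the original change):
+ * IA32_PAT packs eight memory-type entries into eight byte-wide fields,
+ * which is why the validation loop above walks 8 one-byte fields. Valid
+ * encodings per SDM Vol3 Table 11-10: UC=0, WC=1, WT=4, WP=5, WB=6, UC-=7;
+ * the upper bits of each byte are the reserved bits checked above.
+ */
+static inline uint64_t make_guest_pat(const uint8_t types[8])
+{
+	uint64_t pat = 0UL;
+	uint32_t i;
+
+	for (i = 0U; i < 8U; i++) {
+		pat |= ((uint64_t)types[i]) << (i * 8U);
+	}
+	return pat;
+}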
+
+static void load_pdptrs(const struct acrn_vcpu *vcpu)
+{
+	uint64_t guest_cr3 = exec_vmread(VMX_GUEST_CR3);
+	/* TODO: check whether guest cr3 is valid */
+	uint64_t *guest_cr3_hva = (uint64_t *)gpa2hva(vcpu->vm, guest_cr3);
+
+	stac();
+	exec_vmwrite64(VMX_GUEST_PDPTE0_FULL, get_pgentry(guest_cr3_hva + 0UL));
+	exec_vmwrite64(VMX_GUEST_PDPTE1_FULL, get_pgentry(guest_cr3_hva + 1UL));
+	exec_vmwrite64(VMX_GUEST_PDPTE2_FULL, get_pgentry(guest_cr3_hva + 2UL));
+	exec_vmwrite64(VMX_GUEST_PDPTE3_FULL, get_pgentry(guest_cr3_hva + 3UL));
+	clac();
+}
+
+static bool is_cr0_write_valid(struct acrn_vcpu *vcpu, uint64_t cr0)
+{
+	/* Shouldn't set always-off bits */
+	if ((cr0 & cr0_always_off_mask) != 0UL) {
+		return false;
+	}
+
+	/* SDM 25.3 "Changes to instruction behavior in VMX non-root"
+	 *
+	 * We always require the "unrestricted guest" control to be enabled. So
+	 *
+	 * CR0.PG = 1, CR4.PAE = 0 and IA32_EFER.LME = 1 is invalid.
+	 * CR0.PE = 0 and CR0.PG = 1 is invalid.
+	 */
+	if (((cr0 & CR0_PG) != 0UL) && !is_pae(vcpu)
+			&& ((vcpu_get_efer(vcpu) & MSR_IA32_EFER_LME_BIT) != 0UL)) {
+		return false;
+	}
+
+	if (((cr0 & CR0_PE) == 0UL) && ((cr0 & CR0_PG) != 0UL)) {
+		return false;
+	}
+
+	/* SDM 6.15 "Exception and Interrupt Reference" GP Exception
+	 *
+	 * Loading the CR0 register with a set NW flag and a clear CD flag
+	 * is invalid
+	 */
+	if (((cr0 & CR0_CD) == 0UL) && ((cr0 & CR0_NW) != 0UL)) {
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Handling of CR0:
+ * Assume the "unrestricted guest" feature is supported by vmx.
+ * For mode switches, hv only needs to take care of enabling/disabling long
+ * mode, thanks to the "unrestricted guest" feature.
+ *
+ * - PE (0)  Trapped to track cpu mode.
+ *           Set the value according to the value from guest.
+ * - MP (1)  Flexible to guest
+ * - EM (2)  Flexible to guest
+ * - TS (3)  Flexible to guest
+ * - ET (4)  Flexible to guest
+ * - NE (5)  must always be 1
+ * - WP (16) Trapped to detect whether it inhibits supervisor-level
+ *           procedures from writing into read-only pages.
+ * - AM (18) Flexible to guest
+ * - NW (29) Trapped to emulate the cache-disabled situation
+ * - CD (30) Trapped to emulate the cache-disabled situation
+ * - PG (31) Trapped to track cpu/paging mode.
+ *           Set the value according to the value from guest.
+ */
+void vmx_write_cr0(struct acrn_vcpu *vcpu, uint64_t cr0)
+{
+	uint64_t cr0_vmx;
+	uint32_t entry_ctrls;
+	bool old_paging_enabled = is_paging_enabled(vcpu);
+	uint64_t cr0_changed_bits = vcpu_get_cr0(vcpu) ^ cr0;
+
+	if (!is_cr0_write_valid(vcpu, cr0)) {
+		pr_dbg("Invalid cr0 write operation from guest");
+		vcpu_inject_gp(vcpu, 0U);
+		return;
+	}
+
+	/* SDM 2.5
+	 * When loading a control register, reserved bits should always be
+	 * set to the value previously read.
+	 */
+	cr0 &= ~CR0_RESERVED_MASK;
+
+	if (!old_paging_enabled && ((cr0 & CR0_PG) != 0UL)) {
+		if ((vcpu_get_efer(vcpu) & MSR_IA32_EFER_LME_BIT) != 0UL) {
+			/* Enable long mode */
+			pr_dbg("VMM: Enable long mode");
+			entry_ctrls = exec_vmread32(VMX_ENTRY_CONTROLS);
+			entry_ctrls |= VMX_ENTRY_CTLS_IA32E_MODE;
+			exec_vmwrite32(VMX_ENTRY_CONTROLS, entry_ctrls);
+
+			vcpu_set_efer(vcpu,
+				vcpu_get_efer(vcpu) | MSR_IA32_EFER_LMA_BIT);
+		} else if (is_pae(vcpu)) {
+			/* paging is being enabled with PAE: load PDPTRs */
+			load_pdptrs(vcpu);
+		} else {
+			/* do nothing */
+		}
+	} else if (old_paging_enabled && ((cr0 & CR0_PG) == 0UL)) {
+		if ((vcpu_get_efer(vcpu) & MSR_IA32_EFER_LME_BIT) != 0UL) {
+			/* Disable long mode */
+			pr_dbg("VMM: Disable long mode");
+			entry_ctrls = exec_vmread32(VMX_ENTRY_CONTROLS);
+			entry_ctrls &= ~VMX_ENTRY_CTLS_IA32E_MODE;
+			exec_vmwrite32(VMX_ENTRY_CONTROLS, entry_ctrls);
+
+			vcpu_set_efer(vcpu,
+				vcpu_get_efer(vcpu) & ~MSR_IA32_EFER_LMA_BIT);
+		}
+	} else {
+		/* do nothing */
+	}
+
+	/* If CR0.CD or CR0.NW changed */
+	if ((cr0_changed_bits & (CR0_CD | CR0_NW)) != 0UL) {
+		/* No action needed if only CR0.NW changed */
+		if ((cr0_changed_bits & CR0_CD) != 0UL) {
+			if ((cr0 & CR0_CD) != 0UL) {
+				/*
+				 * When the guest requests to set CR0.CD, we don't allow
+				 * guest's CR0.CD to be actually set, instead, we write guest
+				 * IA32_PAT with all-UC entries to emulate the cache
+				 * disabled behavior
+				 */
+				exec_vmwrite64(VMX_GUEST_IA32_PAT_FULL, PAT_ALL_UC_VALUE);
+				if (!iommu_snoop_supported(vcpu->vm)) {
+					cache_flush_invalidate_all();
+				}
+			} else {
+				/* Restore IA32_PAT to enable cache again */
+				exec_vmwrite64(VMX_GUEST_IA32_PAT_FULL,
+					vcpu_get_guest_msr(vcpu, MSR_IA32_PAT));
+			}
+			vcpu_make_request(vcpu, ACRN_REQUEST_EPT_FLUSH);
+		}
+	}
+
+	if ((cr0_changed_bits & (CR0_PG | CR0_WP)) != 0UL) {
+		vcpu_make_request(vcpu, ACRN_REQUEST_EPT_FLUSH);
+	}
+
+	/* Except for the always-on bits and reserved bits, CR0 has no
+	 * always-off bits; those may be set according to the guest.
+	 */
+	cr0_vmx = cr0_always_on_mask | cr0;
+
+	/* Don't set CD or NW bit to guest */
+	cr0_vmx &= ~(CR0_CD | CR0_NW);
+	exec_vmwrite(VMX_GUEST_CR0, cr0_vmx & 0xFFFFFFFFUL);
+	exec_vmwrite(VMX_CR0_READ_SHADOW, cr0 & 0xFFFFFFFFUL);
+
+	/* clear the read cache; the next read should be from the VMCS */
+	bitmap_clear_lock(CPU_REG_CR0, &vcpu->reg_cached);
+
+	pr_dbg("VMM: Try to write 0x%016llx, allow to write 0x%016llx to CR0",
+		cr0, cr0_vmx);
+}
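+/*
+ * Illustration (sketch under stated assumptions, not from the original
+ * patch): under the guest/host mask programmed in init_cr0_cr4_host_mask(),
+ * a guest CR0 read returns the read-shadow bits wherever the mask is 1 and
+ * the real VMX_GUEST_CR0 bits wherever the mask is 0 (SDM Vol3 24.6.6).
+ * The same rule applies to CR4 and its read shadow.
+ */
+static inline uint64_t guest_visible_cr0(uint64_t guest_cr0,
+	uint64_t read_shadow, uint64_t cr0_mask)
+{
+	/* masked bits come from the shadow, unmasked bits from the register */
+	return (read_shadow & cr0_mask) | (guest_cr0 & ~cr0_mask);
+}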
+ */ + cr0_vmx = cr0_always_on_mask | cr0; + + /* Don't set CD or NW bit to guest */ + cr0_vmx &= ~(CR0_CD | CR0_NW); + exec_vmwrite(VMX_GUEST_CR0, cr0_vmx & 0xFFFFFFFFUL); + exec_vmwrite(VMX_CR0_READ_SHADOW, cr0 & 0xFFFFFFFFUL); + + /* clear read cache, next time read should from VMCS */ + bitmap_clear_lock(CPU_REG_CR0, &vcpu->reg_cached); + + pr_dbg("VMM: Try to write %016llx, allow to write 0x%016llx to CR0", + cr0, cr0_vmx); +} + +static bool is_cr4_write_valid(struct acrn_vcpu *vcpu, uint64_t cr4) +{ + /* Check if guest try to set fixed to 0 bits or reserved bits */ + if ((cr4 & cr4_always_off_mask) != 0U) { + return false; + } + + /* Do NOT support nested guest */ + if ((cr4 & CR4_VMXE) != 0UL) { + return false; + } + + /* Do NOT support PCID in guest */ + if ((cr4 & CR4_PCIDE) != 0UL) { + return false; + } + + if (is_long_mode(vcpu)) { + if ((cr4 & CR4_PAE) == 0UL) { + return false; + } + } + + return true; +} + +/* + * Handling of CR4: + * Assume "unrestricted guest" feature is supported by vmx. + * + * For CR4, if some feature is not supported by hardware, the corresponding bit + * will be set in cr4_always_off_mask. If guest try to set these bits after + * vmexit, will inject a #GP. + * If a bit for a feature not supported by hardware, which is flexible to guest, + * and write to it do not lead to a VM exit, a #GP should be generated inside + * guest. + * + * - VME (0) Flexible to guest + * - PVI (1) Flexible to guest + * - TSD (2) Flexible to guest + * - DE (3) Flexible to guest + * - PSE (4) Trapped to track paging mode. + * Set the value according to the value from guest. + * - PAE (5) Trapped to track paging mode. + * Set the value according to the value from guest. + * - MCE (6) Flexible to guest + * - PGE (7) Flexible to guest + * - PCE (8) Flexible to guest + * - OSFXSR (9) Flexible to guest + * - OSXMMEXCPT (10) Flexible to guest + * - VMXE (13) Trapped to hide from guest + * - SMXE (14) must always be 0 => must lead to a VM exit + * - PCIDE (17) Trapped to hide from guest + * - OSXSAVE (18) Flexible to guest + * - XSAVE (19) Flexible to guest + * We always keep align with physical cpu. 
+
+/* rip, rsp, ia32_efer and rflags are written to VMCS in start_vcpu */
+static void init_guest_vmx(struct acrn_vcpu *vcpu, uint64_t cr0, uint64_t cr3,
+	uint64_t cr4)
+{
+	struct cpu_context *ctx =
+		&vcpu->arch.contexts[vcpu->arch.cur_context];
+	struct ext_context *ectx = &ctx->ext_ctx;
+
+	vcpu_set_cr4(vcpu, cr4);
+	vcpu_set_cr0(vcpu, cr0);
+	exec_vmwrite(VMX_GUEST_CR3, cr3);
+
+	exec_vmwrite(VMX_GUEST_GDTR_BASE, ectx->gdtr.base);
+	pr_dbg("VMX_GUEST_GDTR_BASE: 0x%016llx", ectx->gdtr.base);
+	exec_vmwrite32(VMX_GUEST_GDTR_LIMIT, ectx->gdtr.limit);
+	pr_dbg("VMX_GUEST_GDTR_LIMIT: 0x%x", ectx->gdtr.limit);
+
+	exec_vmwrite(VMX_GUEST_IDTR_BASE, ectx->idtr.base);
+	pr_dbg("VMX_GUEST_IDTR_BASE: 0x%016llx", ectx->idtr.base);
+	exec_vmwrite32(VMX_GUEST_IDTR_LIMIT, ectx->idtr.limit);
+	pr_dbg("VMX_GUEST_IDTR_LIMIT: 0x%x", ectx->idtr.limit);
+
+	/* init segment selectors: es, cs, ss, ds, fs, gs, ldtr, tr */
+	load_segment(ectx->cs, VMX_GUEST_CS);
+	load_segment(ectx->ss, VMX_GUEST_SS);
+	load_segment(ectx->ds, VMX_GUEST_DS);
+	load_segment(ectx->es, VMX_GUEST_ES);
+	load_segment(ectx->fs, VMX_GUEST_FS);
+	load_segment(ectx->gs, VMX_GUEST_GS);
+	load_segment(ectx->tr, VMX_GUEST_TR);
+	load_segment(ectx->ldtr, VMX_GUEST_LDTR);
+
+	/* fixed values */
+	exec_vmwrite32(VMX_GUEST_IA32_SYSENTER_CS, 0U);
+	exec_vmwrite(VMX_GUEST_IA32_SYSENTER_ESP, 0UL);
+	exec_vmwrite(VMX_GUEST_IA32_SYSENTER_EIP, 0UL);
+	exec_vmwrite(VMX_GUEST_PENDING_DEBUG_EXCEPT, 0UL);
+	exec_vmwrite(VMX_GUEST_IA32_DEBUGCTL_FULL, 0UL);
+	exec_vmwrite32(VMX_GUEST_INTERRUPTIBILITY_INFO, 0U);
+	exec_vmwrite32(VMX_GUEST_ACTIVITY_STATE, 0U);
+	exec_vmwrite32(VMX_GUEST_SMBASE, 0U);
+	vcpu_set_guest_msr(vcpu, MSR_IA32_PAT, PAT_POWER_ON_VALUE);
+	exec_vmwrite(VMX_GUEST_IA32_PAT_FULL, PAT_POWER_ON_VALUE);
+	exec_vmwrite(VMX_GUEST_DR7, DR7_INIT_VALUE);
+}
+
+static void init_guest_state(struct acrn_vcpu *vcpu)
+{
+	struct cpu_context *ctx =
+		&vcpu->arch.contexts[vcpu->arch.cur_context];
+
+	init_guest_vmx(vcpu, ctx->run_ctx.cr0, ctx->ext_ctx.cr3,
+		ctx->run_ctx.cr4 & ~CR4_VMXE);
+}
+
+static void init_host_state(void)
+{
+	uint16_t value16;
+	uint64_t value64;
+	uint64_t value;
+	uint64_t tss_addr;
+	uint64_t gdt_base;
+	uint64_t idt_base;
+
+	pr_dbg("*********************");
+	pr_dbg("Initialize host state");
+	pr_dbg("*********************");
+
+	/***************************************************
+	 * 16-bit fields
+	 * Move the current ES, CS, SS, DS, FS, GS, TR, LDTR values to the
+	 * corresponding 16-bit host segment selection fields (ES, CS, SS,
+	 * DS, FS, GS), Task Register (TR), and Local Descriptor Table
+	 * Register (LDTR)
+	 ***************************************************/
+	CPU_SEG_READ(es, &value16);
+	exec_vmwrite16(VMX_HOST_ES_SEL, value16);
+	pr_dbg("VMX_HOST_ES_SEL: 0x%hx ", value16);
+
+	CPU_SEG_READ(cs, &value16);
+	exec_vmwrite16(VMX_HOST_CS_SEL, value16);
+	pr_dbg("VMX_HOST_CS_SEL: 0x%hx ", value16);
+
+	CPU_SEG_READ(ss, &value16);
+	exec_vmwrite16(VMX_HOST_SS_SEL, value16);
+	pr_dbg("VMX_HOST_SS_SEL: 0x%hx ", value16);
+
+	CPU_SEG_READ(ds, &value16);
+	exec_vmwrite16(VMX_HOST_DS_SEL, value16);
+	pr_dbg("VMX_HOST_DS_SEL: 0x%hx ", value16);
+
+	CPU_SEG_READ(fs, &value16);
+	exec_vmwrite16(VMX_HOST_FS_SEL, value16);
+	pr_dbg("VMX_HOST_FS_SEL: 0x%hx ", value16);
+
+	CPU_SEG_READ(gs, &value16);
+	exec_vmwrite16(VMX_HOST_GS_SEL, value16);
+	pr_dbg("VMX_HOST_GS_SEL: 0x%hx ", value16);
+
+	exec_vmwrite16(VMX_HOST_TR_SEL, HOST_GDT_RING0_CPU_TSS_SEL);
+	pr_dbg("VMX_HOST_TR_SEL: 0x%hx ", HOST_GDT_RING0_CPU_TSS_SEL);
+
+	/******************************************************
+	 * 32-bit fields
+	 * Set up the 32-bit host state fields - pg 3418 B.3.3. Set limit for
+	 * ES, CS, SS, DS, FS, GS, LDTR, Guest TR, GDTR, and IDTR
+	 ******************************************************/
+
+	/* TODO: Should guest GDTB point to host GDTB ? */
+	/* Obtain the current global descriptor table base */
+	gdt_base = sgdt();
+
+	if (((gdt_base >> 47U) & 0x1UL) != 0UL) {
+		gdt_base |= 0xffff000000000000UL;
+	}
+
+	/* Set up the guest and host GDTB base fields with current GDTB base */
+	exec_vmwrite(VMX_HOST_GDTR_BASE, gdt_base);
+	pr_dbg("VMX_HOST_GDTR_BASE: 0x%016llx ", gdt_base);
+
+	tss_addr = hva2hpa((void *)&get_cpu_var(tss));
+	/* Set up host TR base fields */
+	exec_vmwrite(VMX_HOST_TR_BASE, tss_addr);
+	pr_dbg("VMX_HOST_TR_BASE: 0x%016llx ", tss_addr);
+
+	/* Obtain the current interrupt descriptor table base */
+	idt_base = sidt();
+	/* base */
+	if (((idt_base >> 47U) & 0x1UL) != 0UL) {
+		idt_base |= 0xffff000000000000UL;
+	}
+
+	exec_vmwrite(VMX_HOST_IDTR_BASE, idt_base);
+	pr_dbg("VMX_HOST_IDTR_BASE: 0x%016llx ", idt_base);
+
+	/**************************************************/
+	/* 64-bit fields */
+	pr_dbg("64-bit********");
+
+	value64 = msr_read(MSR_IA32_PAT);
+	exec_vmwrite64(VMX_HOST_IA32_PAT_FULL, value64);
+	pr_dbg("VMX_HOST_IA32_PAT: 0x%016llx ", value64);
+
+	value64 = msr_read(MSR_IA32_EFER);
+	exec_vmwrite64(VMX_HOST_IA32_EFER_FULL, value64);
+	pr_dbg("VMX_HOST_IA32_EFER: 0x%016llx ", value64);
+
+	/**************************************************/
+	/* Natural width fields */
+	pr_dbg("Natural-width********");
+	/* Set up host CR0 field */
+	CPU_CR_READ(cr0, &value);
+	exec_vmwrite(VMX_HOST_CR0, value);
+	pr_dbg("VMX_HOST_CR0: 0x%016llx ", value);
+
+	/* Set up host CR3 field */
+	CPU_CR_READ(cr3, &value);
+	exec_vmwrite(VMX_HOST_CR3, value);
+	pr_dbg("VMX_HOST_CR3: 0x%016llx ", value);
+
+	/* Set up host CR4 field */
+	CPU_CR_READ(cr4, &value);
+	exec_vmwrite(VMX_HOST_CR4, value);
+	pr_dbg("VMX_HOST_CR4: 0x%016llx ", value);
+
+	/* Set up host and guest FS base address */
+	value = msr_read(MSR_IA32_FS_BASE);
+	exec_vmwrite(VMX_HOST_FS_BASE, value);
+	pr_dbg("VMX_HOST_FS_BASE: 0x%016llx ", value);
+	value = msr_read(MSR_IA32_GS_BASE);
+	exec_vmwrite(VMX_HOST_GS_BASE, value);
+	pr_dbg("VMX_HOST_GS_BASE: 0x%016llx ", value);
+
+	/* Set up host instruction pointer on VM Exit */
+	value64 = (uint64_t)&vm_exit;
+	pr_dbg("HOST RIP on VMExit 0x%016llx ", value64);
+	exec_vmwrite(VMX_HOST_RIP, value64);
+	pr_dbg("vm exit return address = 0x%016llx ", value64);
+
+	/* As a type I hypervisor, just init sysenter fields to 0 */
+	exec_vmwrite32(VMX_HOST_IA32_SYSENTER_CS, 0U);
+	exec_vmwrite(VMX_HOST_IA32_SYSENTER_ESP, 0UL);
+	exec_vmwrite(VMX_HOST_IA32_SYSENTER_EIP, 0UL);
+}
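+/*
+ * Illustration (sketch added for clarity, not in the original patch): the
+ * bit-47 tests on the sgdt()/sidt() results above are canonical-address
+ * sign-extension. Descriptor-table bases are linear addresses, so if bit 47
+ * is set it must be replicated into bits 63:48 before the VMWRITE.
+ */
+static inline uint64_t canonicalize_base(uint64_t base)
+{
+	/* replicate bit 47 into the upper 16 bits when it is set */
+	return (((base >> 47U) & 0x1UL) != 0UL) ?
+		(base | 0xffff000000000000UL) : base;
+}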
+
+static uint32_t check_vmx_ctrl(uint32_t msr, uint32_t ctrl_req)
+{
+	uint64_t vmx_msr;
+	uint32_t vmx_msr_low, vmx_msr_high;
+	uint32_t ctrl = ctrl_req;
+
+	vmx_msr = msr_read(msr);
+	vmx_msr_low = (uint32_t)vmx_msr;
+	vmx_msr_high = (uint32_t)(vmx_msr >> 32U);
+	pr_dbg("VMX ctrl MSR 0x%x: low=0x%x, high=0x%x\n",
+		msr, vmx_msr_low, vmx_msr_high);
+
+	/* high 32b: bits allowed to be 1 (a clear bit is a must-0 setting)
+	 * low 32b: bits that must be 1 (must-1 settings)
+	 */
+	ctrl &= vmx_msr_high;
+	ctrl |= vmx_msr_low;
+
+	if ((ctrl_req & ~ctrl) != 0U) {
+		pr_err("VMX ctrl 0x%x not fully enabled: "
+			"request 0x%x but get 0x%x\n",
+			msr, ctrl_req, ctrl);
+	}
+
+	return ctrl;
+}
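+/*
+ * Illustrative restatement (not part of the original patch) of the
+ * adjustment done in check_vmx_ctrl(): the low half of a VMX capability
+ * MSR gives the must-1 bits and the high half gives the may-be-1 bits, so
+ * a request is fully honored only when nothing is lost after masking.
+ */
+static inline bool vmx_ctrl_fully_granted(uint64_t cap_msr_val, uint32_t req)
+{
+	uint32_t must_one = (uint32_t)cap_msr_val;
+	uint32_t may_one = (uint32_t)(cap_msr_val >> 32U);
+	uint32_t granted = (req & may_one) | must_one;
+
+	/* true when every requested bit survived the capability masking */
+	return (req & ~granted) == 0U;
+}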
+
+static void init_exec_ctrl(struct acrn_vcpu *vcpu)
+{
+	uint32_t value32;
+	uint64_t value64;
+	struct acrn_vm *vm = vcpu->vm;
+
+	/* Log messages to show initializing VMX execution controls */
+	pr_dbg("*****************************");
+	pr_dbg("Initialize execution control ");
+	pr_dbg("*****************************");
+
+	/* Set up VM Execution control to enable VM exits on external
+	 * interrupts / preemption timer - pg 2899 24.6.1
+	 */
+	/* enable external interrupt VM Exit */
+	value32 = check_vmx_ctrl(MSR_IA32_VMX_PINBASED_CTLS,
+		VMX_PINBASED_CTLS_IRQ_EXIT);
+
+	if (is_apicv_posted_intr_supported()) {
+		value32 |= VMX_PINBASED_CTLS_POST_IRQ;
+	}
+
+	exec_vmwrite32(VMX_PIN_VM_EXEC_CONTROLS, value32);
+	pr_dbg("VMX_PIN_VM_EXEC_CONTROLS: 0x%x ", value32);
+
+	/* Set up primary processor-based VM execution controls - pg 2900
+	 * 24.6.2. Set up for:
+	 * Enable TSC offsetting
+	 * Enable TSC exiting
+	 * guest access to IO bit-mapped ports causes VM exit
+	 * guest access to MSR causes VM exit
+	 * Activate secondary controls
+	 */
+	/* Bits 1, 4-6, 8, 13-16, and 26: the corresponding bits of
+	 * the IA32_VMX_PROCBASED_CTLS MSR are always read as 1 --- A.3.2
+	 */
+	value32 = check_vmx_ctrl(MSR_IA32_VMX_PROCBASED_CTLS,
+		VMX_PROCBASED_CTLS_TSC_OFF |
+		/* VMX_PROCBASED_CTLS_RDTSC | */
+		VMX_PROCBASED_CTLS_TPR_SHADOW |
+		VMX_PROCBASED_CTLS_IO_BITMAP |
+		VMX_PROCBASED_CTLS_MSR_BITMAP |
+		VMX_PROCBASED_CTLS_SECONDARY);
+
+	/* Disable VM_EXIT for CR3 access */
+	value32 &= ~(VMX_PROCBASED_CTLS_CR3_LOAD |
+		VMX_PROCBASED_CTLS_CR3_STORE);
+
+	/*
+	 * Disable VM_EXIT for invlpg execution.
+	 */
+	value32 &= ~VMX_PROCBASED_CTLS_INVLPG;
+
+	exec_vmwrite32(VMX_PROC_VM_EXEC_CONTROLS, value32);
+	pr_dbg("VMX_PROC_VM_EXEC_CONTROLS: 0x%x ", value32);
+
+	/* Set up secondary processor-based VM execution controls - pg 2901
+	 * 24.6.2. Set up for:
+	 * Enable EPT
+	 * Enable RDTSCP
+	 * Unrestricted guest (optional)
+	 */
+	value32 = check_vmx_ctrl(MSR_IA32_VMX_PROCBASED_CTLS2,
+		VMX_PROCBASED_CTLS2_VAPIC |
+		VMX_PROCBASED_CTLS2_EPT |
+		VMX_PROCBASED_CTLS2_RDTSCP |
+		VMX_PROCBASED_CTLS2_UNRESTRICT |
+		VMX_PROCBASED_CTLS2_VAPIC_REGS);
+
+	if (vcpu->arch.vpid != 0U) {
+		value32 |= VMX_PROCBASED_CTLS2_VPID;
+	} else {
+		value32 &= ~VMX_PROCBASED_CTLS2_VPID;
+	}
+
+	if (is_apicv_intr_delivery_supported()) {
+		value32 |= VMX_PROCBASED_CTLS2_VIRQ;
+	} else {
+		/*
+		 * This field exists only on processors that support
+		 * the 1-setting of the "use TPR shadow"
+		 * VM-execution control.
+		 *
+		 * Set up the TPR threshold for virtual interrupt delivery
+		 * - pg 2904 24.6.8
+		 */
+		exec_vmwrite32(VMX_TPR_THRESHOLD, 0U);
+	}
+
+	if (cpu_has_cap(X86_FEATURE_OSXSAVE)) {
+		exec_vmwrite64(VMX_XSS_EXITING_BITMAP_FULL, 0UL);
+		value32 |= VMX_PROCBASED_CTLS2_XSVE_XRSTR;
+	}
+
+	value32 |= VMX_PROCBASED_CTLS2_WBINVD;
+
+	exec_vmwrite32(VMX_PROC_VM_EXEC_CONTROLS2, value32);
+	pr_dbg("VMX_PROC_VM_EXEC_CONTROLS2: 0x%x ", value32);
+
+	/* APIC-v, config APIC-access address */
+	value64 = vlapic_apicv_get_apic_access_addr();
+	exec_vmwrite64(VMX_APIC_ACCESS_ADDR_FULL, value64);
+
+	/* APIC-v, config APIC virtualized page address */
+	value64 = vlapic_apicv_get_apic_page_addr(vcpu_vlapic(vcpu));
+	exec_vmwrite64(VMX_VIRTUAL_APIC_PAGE_ADDR_FULL, value64);
+
+	if (is_apicv_intr_delivery_supported()) {
+		/* Disable all EOI VMEXIT by default and
+		 * clear RVI and SVI.
+		 */
+		exec_vmwrite64(VMX_EOI_EXIT0_FULL, 0UL);
+		exec_vmwrite64(VMX_EOI_EXIT1_FULL, 0UL);
+		exec_vmwrite64(VMX_EOI_EXIT2_FULL, 0UL);
+		exec_vmwrite64(VMX_EOI_EXIT3_FULL, 0UL);
+
+		exec_vmwrite16(VMX_GUEST_INTR_STATUS, 0U);
+		if (is_apicv_posted_intr_supported()) {
+			exec_vmwrite16(VMX_POSTED_INTR_VECTOR,
+				VECTOR_POSTED_INTR);
+			exec_vmwrite64(VMX_PIR_DESC_ADDR_FULL,
+				apicv_get_pir_desc_paddr(vcpu));
+		}
+	}
+
+	/* Load EPTP execution control
+	 * TODO: introduce API to make this data driven based
+	 * on VMX_EPT_VPID_CAP
+	 */
+	value64 = hva2hpa(vm->arch_vm.nworld_eptp) | (3UL << 3U) | 6UL;
+	exec_vmwrite64(VMX_EPT_POINTER_FULL, value64);
+	pr_dbg("VMX_EPT_POINTER: 0x%016llx ", value64);
+
+	/* Set up guest exception mask bitmap - pg 2902 24.6.3: setting a bit
+	 * causes a VM exit on the corresponding guest exception.
+	 * Enable VM exit on MC only
+	 */
+	value32 = (1U << IDT_MC);
+	exec_vmwrite32(VMX_EXCEPTION_BITMAP, value32);
+
+	/* Set up page fault error code mask - second paragraph, pg 2902
+	 * 24.6.3: a guest page fault exception causing a vmexit is governed
+	 * by both VMX_EXCEPTION_BITMAP and VMX_PF_ERROR_CODE_MASK
+	 */
+	exec_vmwrite32(VMX_PF_ERROR_CODE_MASK, 0U);
+
+	/* Set up page fault error code match - second paragraph, pg 2902
+	 * 24.6.3: a guest page fault exception causing a vmexit is governed
+	 * by both VMX_EXCEPTION_BITMAP and VMX_PF_ERROR_CODE_MATCH
+	 */
+	exec_vmwrite32(VMX_PF_ERROR_CODE_MATCH, 0U);
+
+	/* Set up CR3 target count - an execution of mov to CR3 by the guest
+	 * causes HW to evaluate an operand match with one of N CR3-Target
+	 * Value registers. The CR3-target count value tells the number of
+	 * target-value regs to evaluate
+	 */
+	exec_vmwrite32(VMX_CR3_TARGET_COUNT, 0U);
+
+	/* Set up IO bitmap register A and B - pg 2902 24.6.4 */
+	value64 = hva2hpa(vm->arch_vm.io_bitmap);
+	exec_vmwrite64(VMX_IO_BITMAP_A_FULL, value64);
+	pr_dbg("VMX_IO_BITMAP_A: 0x%016llx ", value64);
+	value64 = hva2hpa((void *)&(vm->arch_vm.io_bitmap[PAGE_SIZE]));
+	exec_vmwrite64(VMX_IO_BITMAP_B_FULL, value64);
+	pr_dbg("VMX_IO_BITMAP_B: 0x%016llx ", value64);
+
+	init_msr_emulation(vcpu);
+
+	/* Set up executive VMCS pointer - pg 2905 24.6.10 */
+	exec_vmwrite64(VMX_EXECUTIVE_VMCS_PTR_FULL, 0UL);
+
+	/* Set up the time stamp counter offset - pg 2902 24.6.5 */
+	exec_vmwrite64(VMX_TSC_OFFSET_FULL, 0UL);
+
+	/* Set up the link pointer */
+	exec_vmwrite64(VMX_VMS_LINK_PTR_FULL, 0xFFFFFFFFFFFFFFFFUL);
+
+	/* Natural-width */
+	pr_dbg("Natural-width*********");
+
+	init_cr0_cr4_host_mask();
+
+	/* The CR3 target registers work in concert with the
+	 * VMX_CR3_TARGET_COUNT field. Using these registers, guest CR3
+	 * accesses can be managed; i.e., if the operand does not match one
+	 * of these register values, a VM exit would occur
+	 */
+	exec_vmwrite(VMX_CR3_TARGET_0, 0UL);
+	exec_vmwrite(VMX_CR3_TARGET_1, 0UL);
+	exec_vmwrite(VMX_CR3_TARGET_2, 0UL);
+	exec_vmwrite(VMX_CR3_TARGET_3, 0UL);
+}
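+/*
+ * Illustrative helper (added for clarity; the patch builds this value
+ * inline above): the EPTP packs the EPT memory type WB (6) into bits 2:0,
+ * the page-walk length minus one (4 levels => 3) into bits 5:3, and the
+ * HPA of the PML4 table into the upper bits (SDM Vol3 24.6.11).
+ */
+static inline uint64_t make_eptp(uint64_t pml4_hpa)
+{
+	/* keep the page-aligned HPA, then OR in walk length and memory type */
+	return (pml4_hpa & ~0xfffUL) | (3UL << 3U) | 6UL;
+}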
+
+static void init_entry_ctrl(const struct acrn_vcpu *vcpu)
+{
+	uint32_t value32;
+
+	/* Log messages to show initializing VMX entry controls */
+	pr_dbg("*************************");
+	pr_dbg("Initialize Entry control ");
+	pr_dbg("*************************");
+
+	/* Set up VMX entry controls - pg 2908 24.8.1.
+	 * Set IA32e guest mode: on VM entry the processor is in IA32e
+	 * 64-bit mode. Start guest with host IA32_PAT and IA32_EFER
+	 */
+	value32 = (VMX_ENTRY_CTLS_LOAD_EFER |
+		VMX_ENTRY_CTLS_LOAD_PAT);
+
+	if (get_vcpu_mode(vcpu) == CPU_MODE_64BIT) {
+		value32 |= (VMX_ENTRY_CTLS_IA32E_MODE);
+	}
+
+	value32 = check_vmx_ctrl(MSR_IA32_VMX_ENTRY_CTLS, value32);
+
+	exec_vmwrite32(VMX_ENTRY_CONTROLS, value32);
+	pr_dbg("VMX_ENTRY_CONTROLS: 0x%x ", value32);
+
+	/* Set up VMX entry MSR load count - pg 2908 24.8.2. Tells the number
+	 * of MSRs to load on VM entry from the memory address provided by
+	 * the VM-entry MSR-load address field
+	 */
+	exec_vmwrite32(VMX_ENTRY_MSR_LOAD_COUNT, MSR_AREA_COUNT);
+	exec_vmwrite64(VMX_ENTRY_MSR_LOAD_ADDR_FULL, (uint64_t)vcpu->arch.msr_area.guest);
+
+	/* Set up VM entry interrupt information field - pg 2909 24.8.3 */
+	exec_vmwrite32(VMX_ENTRY_INT_INFO_FIELD, 0U);
+
+	/* Set up VM entry exception error code - pg 2910 24.8.3 */
+	exec_vmwrite32(VMX_ENTRY_EXCEPTION_ERROR_CODE, 0U);
+
+	/* Set up VM entry instruction length - pg 2910 24.8.3 */
+	exec_vmwrite32(VMX_ENTRY_INSTR_LENGTH, 0U);
+}
+
+static void init_exit_ctrl(struct acrn_vcpu *vcpu)
+{
+	uint32_t value32;
+
+	/* Log messages to show initializing VMX exit controls */
+	pr_dbg("************************");
+	pr_dbg("Initialize Exit control ");
+	pr_dbg("************************");
+
+	/* Set up VM exit controls - pg 2907 24.7.1 for:
+	 * Host address-space size is 64 bit.
+	 * Acknowledge interrupt on exit: if 1, the HW acks the interrupt in
+	 * VMX non-root and saves the interrupt vector to the relevant VM
+	 * exit field for further processing by the hypervisor.
+	 * Enable saving and loading of IA32_PAT and IA32_EFER on VMEXIT.
+	 * Enable saving of the preemption timer on VMEXIT
+	 */
+	value32 = check_vmx_ctrl(MSR_IA32_VMX_EXIT_CTLS,
+		VMX_EXIT_CTLS_ACK_IRQ |
+		VMX_EXIT_CTLS_SAVE_PAT |
+		VMX_EXIT_CTLS_LOAD_PAT |
+		VMX_EXIT_CTLS_LOAD_EFER |
+		VMX_EXIT_CTLS_SAVE_EFER |
+		VMX_EXIT_CTLS_HOST_ADDR64);
+
+	exec_vmwrite32(VMX_EXIT_CONTROLS, value32);
+	pr_dbg("VMX_EXIT_CONTROLS: 0x%x ", value32);
+
+	/* Set up VM exit MSR store and load counts - pg 2908 24.7.2. Tell
+	 * the HW the number of MSRs to store to memory and load from memory
+	 * on VM exit. The 64-bit VM-exit MSR store and load address fields
+	 * provide the corresponding addresses
+	 */
+	exec_vmwrite32(VMX_EXIT_MSR_STORE_COUNT, MSR_AREA_COUNT);
+	exec_vmwrite32(VMX_EXIT_MSR_LOAD_COUNT, MSR_AREA_COUNT);
+	exec_vmwrite64(VMX_EXIT_MSR_STORE_ADDR_FULL, (uint64_t)vcpu->arch.msr_area.guest);
+	exec_vmwrite64(VMX_EXIT_MSR_LOAD_ADDR_FULL, (uint64_t)vcpu->arch.msr_area.host);
+}
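+/*
+ * Layout note as an illustrative struct (the name is hypothetical; this
+ * patch does not define it): each slot in the VM-entry/VM-exit MSR areas
+ * referenced above is a 16-byte record per SDM Vol3 24.7.2: the MSR index,
+ * a reserved word, then the 64-bit data value.
+ */
+struct illustrative_msr_area_slot {
+	uint32_t msr_index;	/* bits 31:0 of the slot */
+	uint32_t reserved;	/* bits 63:32, must be zero */
+	uint64_t value;		/* bits 127:64, the MSR data */
+};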
+
+/**
+ * @pre vcpu != NULL
+ */
+void init_vmcs(struct acrn_vcpu *vcpu)
+{
+	uint64_t vmx_rev_id;
+	uint64_t vmcs_pa;
+
+	/* Log message */
+	pr_dbg("Initializing VMCS");
+
+	/* Obtain the VM Rev ID from HW and populate VMCS page with it */
+	vmx_rev_id = msr_read(MSR_IA32_VMX_BASIC);
+	(void)memcpy_s(vcpu->arch.vmcs, 4U, (void *)&vmx_rev_id, 4U);
+
+	/* Execute VMCLEAR on current VMCS */
+	vmcs_pa = hva2hpa(vcpu->arch.vmcs);
+	exec_vmclear((void *)&vmcs_pa);
+
+	/* Load VMCS pointer */
+	exec_vmptrld((void *)&vmcs_pa);
+
+	/* Initialize the Virtual Machine Control Structure (VMCS) */
+	init_host_state();
+	/* init exec_ctrl needs to run before init_guest_state */
+	init_exec_ctrl(vcpu);
+	init_guest_state(vcpu);
+	init_entry_ctrl(vcpu);
+	init_exit_ctrl(vcpu);
+}
+
+#ifndef CONFIG_PARTITION_MODE
+void switch_apicv_mode_x2apic(struct acrn_vcpu *vcpu)
+{
+	uint32_t value32;
+	value32 = exec_vmread32(VMX_PROC_VM_EXEC_CONTROLS2);
+	value32 &= ~VMX_PROCBASED_CTLS2_VAPIC;
+	value32 |= VMX_PROCBASED_CTLS2_VX2APIC;
+	exec_vmwrite32(VMX_PROC_VM_EXEC_CONTROLS2, value32);
+	update_msr_bitmap_x2apic_apicv(vcpu);
+}
+#else
+void switch_apicv_mode_x2apic(struct acrn_vcpu *vcpu)
+{
+	uint32_t value32;
+	if (vcpu->vm->vm_desc->lapic_pt) {
+		/*
+		 * Disable external interrupt exiting and irq ack
+		 * Disable posted interrupt processing
+		 * update x2apic msr bitmap for pass-thru
+		 * enable interception only for ICR
+		 * disable pre-emption for TSC DEADLINE MSR
+		 * Disable Register Virtualization and virtual interrupt delivery
+		 * Disable "use TPR shadow"
+		 */
+		value32 = exec_vmread32(VMX_PIN_VM_EXEC_CONTROLS);
+		value32 &= ~VMX_PINBASED_CTLS_IRQ_EXIT;
+		if (is_apicv_posted_intr_supported()) {
+			value32 &= ~VMX_PINBASED_CTLS_POST_IRQ;
+		}
+		exec_vmwrite32(VMX_PIN_VM_EXEC_CONTROLS, value32);
+
+		value32 = exec_vmread32(VMX_EXIT_CONTROLS);
+		value32 &= ~VMX_EXIT_CTLS_ACK_IRQ;
+		exec_vmwrite32(VMX_EXIT_CONTROLS, value32);
+
+		value32 = exec_vmread32(VMX_PROC_VM_EXEC_CONTROLS);
+		value32 &= ~VMX_PROCBASED_CTLS_TPR_SHADOW;
+		exec_vmwrite32(VMX_PROC_VM_EXEC_CONTROLS, value32);
+
+		exec_vmwrite32(VMX_TPR_THRESHOLD, 0U);
+
+		value32 = exec_vmread32(VMX_PROC_VM_EXEC_CONTROLS2);
+		value32 &= ~VMX_PROCBASED_CTLS2_VAPIC_REGS;
+		if (is_apicv_intr_delivery_supported()) {
+			value32 &= ~VMX_PROCBASED_CTLS2_VIRQ;
+		}
+		exec_vmwrite32(VMX_PROC_VM_EXEC_CONTROLS2, value32);
+
+		update_msr_bitmap_x2apic_passthru(vcpu);
+	} else {
+		value32 = exec_vmread32(VMX_PROC_VM_EXEC_CONTROLS2);
+		value32 &= ~VMX_PROCBASED_CTLS2_VAPIC;
+		value32 |= VMX_PROCBASED_CTLS2_VX2APIC;
+		exec_vmwrite32(VMX_PROC_VM_EXEC_CONTROLS2, value32);
+		update_msr_bitmap_x2apic_apicv(vcpu);
+	}
+}
+#endif
diff --git a/hypervisor/arch/x86/vmx.c b/hypervisor/arch/x86/vmx.c
index 39a1f316e..d7531ba07 100644
--- a/hypervisor/arch/x86/vmx.c
+++ b/hypervisor/arch/x86/vmx.c
@@ -2,45 +2,13 @@
  * Copyright (C) 2018 Intel Corporation. All rights reserved.
* * SPDX-License-Identifier: BSD-3-Clause + * + * this file contains pure vmx operations */ #include #include #include -#ifdef CONFIG_EFI_STUB -extern struct efi_context* efi_ctx; -#endif - -#define REAL_MODE_BSP_INIT_CODE_SEL (0xf000U) -#define REAL_MODE_DATA_SEG_AR (0x0093U) -#define REAL_MODE_CODE_SEG_AR (0x009fU) -#define PROTECTED_MODE_DATA_SEG_AR (0xc093U) -#define PROTECTED_MODE_CODE_SEG_AR (0xc09bU) -#define DR7_INIT_VALUE (0x400UL) -#define LDTR_AR (0x0082U) /* LDT, type must be 2, refer to SDM Vol3 26.3.1.2 */ -#define TR_AR (0x008bU) /* TSS (busy), refer to SDM Vol3 26.3.1.2 */ - -static uint64_t cr0_host_mask; -static uint64_t cr0_always_on_mask; -static uint64_t cr0_always_off_mask; -static uint64_t cr4_host_mask; -static uint64_t cr4_always_on_mask; -static uint64_t cr4_always_off_mask; - -bool is_vmx_disabled(void) -{ - uint64_t msr_val; - - /* Read Feature ControL MSR */ - msr_val = msr_read(MSR_IA32_FEATURE_CONTROL); - - /* Check if feature control is locked and vmx cannot be enabled */ - if ((msr_val & MSR_IA32_FEATURE_CONTROL_LOCK) != 0U && - (msr_val & MSR_IA32_FEATURE_CONTROL_VMX_NO_SMX) == 0U) { - return true; - } - return false; -} /** * @pre addr != NULL && addr is 4KB-aligned @@ -202,927 +170,3 @@ void exec_vmwrite16(uint32_t field, uint16_t value) { exec_vmwrite64(field, (uint64_t)value); } - -static void init_cr0_cr4_host_mask(void) -{ - static bool inited = false; - uint64_t fixed0, fixed1; - if (!inited) { - /* Read the CR0 fixed0 / fixed1 MSR registers */ - fixed0 = msr_read(MSR_IA32_VMX_CR0_FIXED0); - fixed1 = msr_read(MSR_IA32_VMX_CR0_FIXED1); - - cr0_host_mask = ~(fixed0 ^ fixed1); - /* Add the bit hv wants to trap */ - cr0_host_mask |= CR0_TRAP_MASK; - /* CR0 clear PE/PG from always on bits due to "unrestructed - * guest" feature */ - cr0_always_on_mask = fixed0 & (~(CR0_PE | CR0_PG)); - cr0_always_off_mask = ~fixed1; - /* SDM 2.5 - * bit 63:32 of CR0 and CR4 ar reserved and must be written - * zero. We could merge it with always off mask. - */ - cr0_always_off_mask |= 0xFFFFFFFF00000000UL; - - /* Read the CR4 fixed0 / fixed1 MSR registers */ - fixed0 = msr_read(MSR_IA32_VMX_CR4_FIXED0); - fixed1 = msr_read(MSR_IA32_VMX_CR4_FIXED1); - - cr4_host_mask = ~(fixed0 ^ fixed1); - /* Add the bit hv wants to trap */ - cr4_host_mask |= CR4_TRAP_MASK; - cr4_always_on_mask = fixed0; - /* Record the bit fixed to 0 for CR4, including reserved bits */ - cr4_always_off_mask = ~fixed1; - /* SDM 2.5 - * bit 63:32 of CR0 and CR4 ar reserved and must be written - * zero. We could merge it with always off mask. 
- */ - cr4_always_off_mask |= 0xFFFFFFFF00000000UL; - cr4_always_off_mask |= CR4_RESERVED_MASK; - inited = true; - } - - exec_vmwrite(VMX_CR0_MASK, cr0_host_mask); - /* Output CR0 mask value */ - pr_dbg("CR0 mask value: 0x%016llx", cr0_host_mask); - - - exec_vmwrite(VMX_CR4_MASK, cr4_host_mask); - /* Output CR4 mask value */ - pr_dbg("CR4 mask value: 0x%016llx", cr4_host_mask); -} - -uint64_t vmx_rdmsr_pat(const struct acrn_vcpu *vcpu) -{ - /* - * note: if run_ctx->cr0.CD is set, the actual value in guest's - * IA32_PAT MSR is PAT_ALL_UC_VALUE, which may be different from - * the saved value guest_msrs[MSR_IA32_PAT] - */ - return vcpu_get_guest_msr(vcpu, MSR_IA32_PAT); -} - -int32_t vmx_wrmsr_pat(struct acrn_vcpu *vcpu, uint64_t value) -{ - uint32_t i; - uint64_t field; - - for (i = 0U; i < 8U; i++) { - field = (value >> (i * 8U)) & 0xffUL; - if (pat_mem_type_invalid(field) || - ((PAT_FIELD_RSV_BITS & field) != 0UL)) { - pr_err("invalid guest IA32_PAT: 0x%016llx", value); - return -EINVAL; - } - } - - vcpu_set_guest_msr(vcpu, MSR_IA32_PAT, value); - - /* - * If context->cr0.CD is set, we defer any further requests to write - * guest's IA32_PAT, until the time when guest's CR0.CD is being cleared - */ - if ((vcpu_get_cr0(vcpu) & CR0_CD) == 0UL) { - exec_vmwrite64(VMX_GUEST_IA32_PAT_FULL, value); - } - - return 0; -} - -static void load_pdptrs(const struct acrn_vcpu *vcpu) -{ - uint64_t guest_cr3 = exec_vmread(VMX_GUEST_CR3); - /* TODO: check whether guest cr3 is valid */ - uint64_t *guest_cr3_hva = (uint64_t *)gpa2hva(vcpu->vm, guest_cr3); - - stac(); - exec_vmwrite64(VMX_GUEST_PDPTE0_FULL, get_pgentry(guest_cr3_hva + 0UL)); - exec_vmwrite64(VMX_GUEST_PDPTE1_FULL, get_pgentry(guest_cr3_hva + 1UL)); - exec_vmwrite64(VMX_GUEST_PDPTE2_FULL, get_pgentry(guest_cr3_hva + 2UL)); - exec_vmwrite64(VMX_GUEST_PDPTE3_FULL, get_pgentry(guest_cr3_hva + 3UL)); - clac(); -} - -static bool is_cr0_write_valid(struct acrn_vcpu *vcpu, uint64_t cr0) -{ - /* Shouldn't set always off bit */ - if ((cr0 & cr0_always_off_mask) != 0UL) { - return false; - } - - /* SDM 25.3 "Changes to instruction behavior in VMX non-root" - * - * We always require "unrestricted guest" control enabled. So - * - * CR0.PG = 1, CR4.PAE = 0 and IA32_EFER.LME = 1 is invalid. - * CR0.PE = 0 and CR0.PG = 1 is invalid. - */ - if (((cr0 & CR0_PG) != 0UL) && !is_pae(vcpu) - && ((vcpu_get_efer(vcpu) & MSR_IA32_EFER_LME_BIT) != 0UL)) { - return false; - } - - if (((cr0 & CR0_PE) == 0UL) && ((cr0 & CR0_PG) != 0UL)) { - return false; - } - - /* SDM 6.15 "Exception and Interrupt Refrerence" GP Exception - * - * Loading CR0 regsiter with a set NW flag and a clear CD flag - * is invalid - */ - if (((cr0 & CR0_CD) == 0UL) && ((cr0 & CR0_NW) != 0UL)) { - return false; - } - - return true; -} - -/* - * Handling of CR0: - * Assume "unrestricted guest" feature is supported by vmx. - * For mode switch, hv only needs to take care of enabling/disabling long mode, - * thanks to "unrestricted guest" feature. - * - * - PE (0) Trapped to track cpu mode. - * Set the value according to the value from guest. - * - MP (1) Flexible to guest - * - EM (2) Flexible to guest - * - TS (3) Flexible to guest - * - ET (4) Flexible to guest - * - NE (5) must always be 1 - * - WP (16) Trapped to get if it inhibits supervisor level procedures to - * write into ro-pages. - * - AM (18) Flexible to guest - * - NW (29) Trapped to emulate cache disable situation - * - CD (30) Trapped to emulate cache disable situation - * - PG (31) Trapped to track cpu/paging mode. 
- * Set the value according to the value from guest. - */ -void vmx_write_cr0(struct acrn_vcpu *vcpu, uint64_t cr0) -{ - uint64_t cr0_vmx; - uint32_t entry_ctrls; - bool old_paging_enabled = is_paging_enabled(vcpu); - uint64_t cr0_changed_bits = vcpu_get_cr0(vcpu) ^ cr0; - - if (!is_cr0_write_valid(vcpu, cr0)) { - pr_dbg("Invalid cr0 write operation from guest"); - vcpu_inject_gp(vcpu, 0U); - return; - } - - /* SDM 2.5 - * When loading a control register, reserved bit should always set - * to the value previously read. - */ - cr0 &= ~CR0_RESERVED_MASK; - - if (!old_paging_enabled && ((cr0 & CR0_PG) != 0UL)) { - if ((vcpu_get_efer(vcpu) & MSR_IA32_EFER_LME_BIT) != 0UL) { - /* Enable long mode */ - pr_dbg("VMM: Enable long mode"); - entry_ctrls = exec_vmread32(VMX_ENTRY_CONTROLS); - entry_ctrls |= VMX_ENTRY_CTLS_IA32E_MODE; - exec_vmwrite32(VMX_ENTRY_CONTROLS, entry_ctrls); - - vcpu_set_efer(vcpu, - vcpu_get_efer(vcpu) | MSR_IA32_EFER_LMA_BIT); - } else if (is_pae(vcpu)) { - /* enabled PAE from paging disabled */ - load_pdptrs(vcpu); - } else { - /* do nothing */ - } - } else if (old_paging_enabled && ((cr0 & CR0_PG) == 0UL)) { - if ((vcpu_get_efer(vcpu) & MSR_IA32_EFER_LME_BIT) != 0UL) { - /* Disable long mode */ - pr_dbg("VMM: Disable long mode"); - entry_ctrls = exec_vmread32(VMX_ENTRY_CONTROLS); - entry_ctrls &= ~VMX_ENTRY_CTLS_IA32E_MODE; - exec_vmwrite32(VMX_ENTRY_CONTROLS, entry_ctrls); - - vcpu_set_efer(vcpu, - vcpu_get_efer(vcpu) & ~MSR_IA32_EFER_LMA_BIT); - } - } else { - /* do nothing */ - } - - /* If CR0.CD or CR0.NW get cr0_changed_bits */ - if ((cr0_changed_bits & (CR0_CD | CR0_NW)) != 0UL) { - /* No action if only CR0.NW is cr0_changed_bits */ - if ((cr0_changed_bits & CR0_CD) != 0UL) { - if ((cr0 & CR0_CD) != 0UL) { - /* - * When the guest requests to set CR0.CD, we don't allow - * guest's CR0.CD to be actually set, instead, we write guest - * IA32_PAT with all-UC entries to emulate the cache - * disabled behavior - */ - exec_vmwrite64(VMX_GUEST_IA32_PAT_FULL, PAT_ALL_UC_VALUE); - if (!iommu_snoop_supported(vcpu->vm)) { - cache_flush_invalidate_all(); - } - } else { - /* Restore IA32_PAT to enable cache again */ - exec_vmwrite64(VMX_GUEST_IA32_PAT_FULL, - vcpu_get_guest_msr(vcpu, MSR_IA32_PAT)); - } - vcpu_make_request(vcpu, ACRN_REQUEST_EPT_FLUSH); - } - } - - if ((cr0_changed_bits & (CR0_PG | CR0_WP)) != 0UL) { - vcpu_make_request(vcpu, ACRN_REQUEST_EPT_FLUSH); - } - - /* CR0 has no always off bits, except the always on bits, and reserved - * bits, allow to set according to guest. 
- */ - cr0_vmx = cr0_always_on_mask | cr0; - - /* Don't set CD or NW bit to guest */ - cr0_vmx &= ~(CR0_CD | CR0_NW); - exec_vmwrite(VMX_GUEST_CR0, cr0_vmx & 0xFFFFFFFFUL); - exec_vmwrite(VMX_CR0_READ_SHADOW, cr0 & 0xFFFFFFFFUL); - - /* clear read cache, next time read should from VMCS */ - bitmap_clear_lock(CPU_REG_CR0, &vcpu->reg_cached); - - pr_dbg("VMM: Try to write %016llx, allow to write 0x%016llx to CR0", - cr0, cr0_vmx); -} - -static bool is_cr4_write_valid(struct acrn_vcpu *vcpu, uint64_t cr4) -{ - /* Check if guest try to set fixed to 0 bits or reserved bits */ - if ((cr4 & cr4_always_off_mask) != 0U) { - return false; - } - - /* Do NOT support nested guest */ - if ((cr4 & CR4_VMXE) != 0UL) { - return false; - } - - /* Do NOT support PCID in guest */ - if ((cr4 & CR4_PCIDE) != 0UL) { - return false; - } - - if (is_long_mode(vcpu)) { - if ((cr4 & CR4_PAE) == 0UL) { - return false; - } - } - - return true; -} - -/* - * Handling of CR4: - * Assume "unrestricted guest" feature is supported by vmx. - * - * For CR4, if some feature is not supported by hardware, the corresponding bit - * will be set in cr4_always_off_mask. If guest try to set these bits after - * vmexit, will inject a #GP. - * If a bit for a feature not supported by hardware, which is flexible to guest, - * and write to it do not lead to a VM exit, a #GP should be generated inside - * guest. - * - * - VME (0) Flexible to guest - * - PVI (1) Flexible to guest - * - TSD (2) Flexible to guest - * - DE (3) Flexible to guest - * - PSE (4) Trapped to track paging mode. - * Set the value according to the value from guest. - * - PAE (5) Trapped to track paging mode. - * Set the value according to the value from guest. - * - MCE (6) Flexible to guest - * - PGE (7) Flexible to guest - * - PCE (8) Flexible to guest - * - OSFXSR (9) Flexible to guest - * - OSXMMEXCPT (10) Flexible to guest - * - VMXE (13) Trapped to hide from guest - * - SMXE (14) must always be 0 => must lead to a VM exit - * - PCIDE (17) Trapped to hide from guest - * - OSXSAVE (18) Flexible to guest - * - XSAVE (19) Flexible to guest - * We always keep align with physical cpu. 
So it's flexible to - * guest - * - SMEP (20) Flexible to guest - * - SMAP (21) Flexible to guest - * - PKE (22) Flexible to guest - */ -void vmx_write_cr4(struct acrn_vcpu *vcpu, uint64_t cr4) -{ - uint64_t cr4_vmx; - uint64_t old_cr4 = vcpu_get_cr4(vcpu); - - if (!is_cr4_write_valid(vcpu, cr4)) { - pr_dbg("Invalid cr4 write operation from guest"); - vcpu_inject_gp(vcpu, 0U); - return; - } - - if (((cr4 ^ old_cr4) & (CR4_PGE | CR4_PSE | CR4_PAE | - CR4_SMEP | CR4_SMAP | CR4_PKE)) != 0UL) { - if (((cr4 & CR4_PAE) != 0UL) && is_paging_enabled(vcpu) && - (is_long_mode(vcpu))) { - load_pdptrs(vcpu); - } - - vcpu_make_request(vcpu, ACRN_REQUEST_EPT_FLUSH); - } - - /* Aways off bits and reserved bits has been filtered above */ - cr4_vmx = cr4_always_on_mask | cr4; - exec_vmwrite(VMX_GUEST_CR4, cr4_vmx & 0xFFFFFFFFUL); - exec_vmwrite(VMX_CR4_READ_SHADOW, cr4 & 0xFFFFFFFFUL); - - /* clear read cache, next time read should from VMCS */ - bitmap_clear_lock(CPU_REG_CR4, &vcpu->reg_cached); - - pr_dbg("VMM: Try to write %016llx, allow to write 0x%016llx to CR4", - cr4, cr4_vmx); -} - -/* rip, rsp, ia32_efer and rflags are written to VMCS in start_vcpu */ -static void init_guest_vmx(struct acrn_vcpu *vcpu, uint64_t cr0, uint64_t cr3, - uint64_t cr4) -{ - struct cpu_context *ctx = - &vcpu->arch.contexts[vcpu->arch.cur_context]; - struct ext_context *ectx = &ctx->ext_ctx; - - vcpu_set_cr4(vcpu, cr4); - vcpu_set_cr0(vcpu, cr0); - exec_vmwrite(VMX_GUEST_CR3, cr3); - - exec_vmwrite(VMX_GUEST_GDTR_BASE, ectx->gdtr.base); - pr_dbg("VMX_GUEST_GDTR_BASE: 0x%016llx", ectx->gdtr.base); - exec_vmwrite32(VMX_GUEST_GDTR_LIMIT, ectx->gdtr.limit); - pr_dbg("VMX_GUEST_GDTR_LIMIT: 0x%016llx", ectx->gdtr.limit); - - exec_vmwrite(VMX_GUEST_IDTR_BASE, ectx->idtr.base); - pr_dbg("VMX_GUEST_IDTR_BASE: 0x%016llx", ectx->idtr.base); - exec_vmwrite32(VMX_GUEST_IDTR_LIMIT, ectx->idtr.limit); - pr_dbg("VMX_GUEST_IDTR_LIMIT: 0x%016llx", ectx->idtr.limit); - - /* init segment selectors: es, cs, ss, ds, fs, gs, ldtr, tr */ - load_segment(ectx->cs, VMX_GUEST_CS); - load_segment(ectx->ss, VMX_GUEST_SS); - load_segment(ectx->ds, VMX_GUEST_DS); - load_segment(ectx->es, VMX_GUEST_ES); - load_segment(ectx->fs, VMX_GUEST_FS); - load_segment(ectx->gs, VMX_GUEST_GS); - load_segment(ectx->tr, VMX_GUEST_TR); - load_segment(ectx->ldtr, VMX_GUEST_LDTR); - - /* fixed values */ - exec_vmwrite32(VMX_GUEST_IA32_SYSENTER_CS, 0U); - exec_vmwrite(VMX_GUEST_IA32_SYSENTER_ESP, 0UL); - exec_vmwrite(VMX_GUEST_IA32_SYSENTER_EIP, 0UL); - exec_vmwrite(VMX_GUEST_PENDING_DEBUG_EXCEPT, 0UL); - exec_vmwrite(VMX_GUEST_IA32_DEBUGCTL_FULL, 0UL); - exec_vmwrite32(VMX_GUEST_INTERRUPTIBILITY_INFO, 0U); - exec_vmwrite32(VMX_GUEST_ACTIVITY_STATE, 0U); - exec_vmwrite32(VMX_GUEST_SMBASE, 0U); - vcpu_set_guest_msr(vcpu, MSR_IA32_PAT, PAT_POWER_ON_VALUE); - exec_vmwrite(VMX_GUEST_IA32_PAT_FULL, PAT_POWER_ON_VALUE); - exec_vmwrite(VMX_GUEST_DR7, DR7_INIT_VALUE); -} - -static void init_guest_state(struct acrn_vcpu *vcpu) -{ - struct cpu_context *ctx = - &vcpu->arch.contexts[vcpu->arch.cur_context]; - - init_guest_vmx(vcpu, ctx->run_ctx.cr0, ctx->ext_ctx.cr3, - ctx->run_ctx.cr4 & ~CR4_VMXE); -} - -static void init_host_state(void) -{ - uint16_t value16; - uint64_t value64; - uint64_t value; - uint64_t tss_addr; - uint64_t gdt_base; - uint64_t idt_base; - - pr_dbg("*********************"); - pr_dbg("Initialize host state"); - pr_dbg("*********************"); - - /*************************************************** - * 16 - Bit fields - * Move the current ES, CS, SS, DS, FS, GS, 
TR, LDTR * values to the - * corresponding 16-bit host * segment selection (ES, CS, SS, DS, FS, - * GS), * Task Register (TR), * Local Descriptor Table Register (LDTR) - * - ***************************************************/ - CPU_SEG_READ(es, &value16); - exec_vmwrite16(VMX_HOST_ES_SEL, value16); - pr_dbg("VMX_HOST_ES_SEL: 0x%hx ", value16); - - CPU_SEG_READ(cs, &value16); - exec_vmwrite16(VMX_HOST_CS_SEL, value16); - pr_dbg("VMX_HOST_CS_SEL: 0x%hx ", value16); - - CPU_SEG_READ(ss, &value16); - exec_vmwrite16(VMX_HOST_SS_SEL, value16); - pr_dbg("VMX_HOST_SS_SEL: 0x%hx ", value16); - - CPU_SEG_READ(ds, &value16); - exec_vmwrite16(VMX_HOST_DS_SEL, value16); - pr_dbg("VMX_HOST_DS_SEL: 0x%hx ", value16); - - CPU_SEG_READ(fs, &value16); - exec_vmwrite16(VMX_HOST_FS_SEL, value16); - pr_dbg("VMX_HOST_FS_SEL: 0x%hx ", value16); - - CPU_SEG_READ(gs, &value16); - exec_vmwrite16(VMX_HOST_GS_SEL, value16); - pr_dbg("VMX_HOST_GS_SEL: 0x%hx ", value16); - - exec_vmwrite16(VMX_HOST_TR_SEL, HOST_GDT_RING0_CPU_TSS_SEL); - pr_dbg("VMX_HOST_TR_SEL: 0x%hx ", HOST_GDT_RING0_CPU_TSS_SEL); - - /****************************************************** - * 32-bit fields - * Set up the 32 bit host state fields - pg 3418 B.3.3 * Set limit for - * ES, CS, DD, DS, FS, GS, LDTR, Guest TR, * GDTR, and IDTR - ******************************************************/ - - /* TODO: Should guest GDTB point to host GDTB ? */ - /* Obtain the current global descriptor table base */ - gdt_base = sgdt(); - - if (((gdt_base >> 47U) & 0x1UL) != 0UL) { - gdt_base |= 0xffff000000000000UL; - } - - /* Set up the guest and host GDTB base fields with current GDTB base */ - exec_vmwrite(VMX_HOST_GDTR_BASE, gdt_base); - pr_dbg("VMX_HOST_GDTR_BASE: 0x%x ", gdt_base); - - tss_addr = hva2hpa((void *)&get_cpu_var(tss)); - /* Set up host TR base fields */ - exec_vmwrite(VMX_HOST_TR_BASE, tss_addr); - pr_dbg("VMX_HOST_TR_BASE: 0x%016llx ", tss_addr); - - /* Obtain the current interrupt descriptor table base */ - idt_base = sidt(); - /* base */ - if (((idt_base >> 47U) & 0x1UL) != 0UL) { - idt_base |= 0xffff000000000000UL; - } - - exec_vmwrite(VMX_HOST_IDTR_BASE, idt_base); - pr_dbg("VMX_HOST_IDTR_BASE: 0x%x ", idt_base); - - /**************************************************/ - /* 64-bit fields */ - pr_dbg("64-bit********"); - - value64 = msr_read(MSR_IA32_PAT); - exec_vmwrite64(VMX_HOST_IA32_PAT_FULL, value64); - pr_dbg("VMX_HOST_IA32_PAT: 0x%016llx ", value64); - - value64 = msr_read(MSR_IA32_EFER); - exec_vmwrite64(VMX_HOST_IA32_EFER_FULL, value64); - pr_dbg("VMX_HOST_IA32_EFER: 0x%016llx ", - value64); - - /**************************************************/ - /* Natural width fields */ - pr_dbg("Natural-width********"); - /* Set up host CR0 field */ - CPU_CR_READ(cr0, &value); - exec_vmwrite(VMX_HOST_CR0, value); - pr_dbg("VMX_HOST_CR0: 0x%016llx ", value); - - /* Set up host CR3 field */ - CPU_CR_READ(cr3, &value); - exec_vmwrite(VMX_HOST_CR3, value); - pr_dbg("VMX_HOST_CR3: 0x%016llx ", value); - - /* Set up host CR4 field */ - CPU_CR_READ(cr4, &value); - exec_vmwrite(VMX_HOST_CR4, value); - pr_dbg("VMX_HOST_CR4: 0x%016llx ", value); - - /* Set up host and guest FS base address */ - value = msr_read(MSR_IA32_FS_BASE); - exec_vmwrite(VMX_HOST_FS_BASE, value); - pr_dbg("VMX_HOST_FS_BASE: 0x%016llx ", value); - value = msr_read(MSR_IA32_GS_BASE); - exec_vmwrite(VMX_HOST_GS_BASE, value); - pr_dbg("VMX_HOST_GS_BASE: 0x%016llx ", value); - - /* Set up host instruction pointer on VM Exit */ - value64 = (uint64_t)&vm_exit; - pr_dbg("HOST RIP on 
VMExit %016llx ", value64); - exec_vmwrite(VMX_HOST_RIP, value64); - pr_dbg("vm exit return address = %016llx ", value64); - - /* As a type I hypervisor, just init sysenter fields to 0 */ - exec_vmwrite32(VMX_HOST_IA32_SYSENTER_CS, 0U); - exec_vmwrite(VMX_HOST_IA32_SYSENTER_ESP, 0UL); - exec_vmwrite(VMX_HOST_IA32_SYSENTER_EIP, 0UL); -} - -static uint32_t check_vmx_ctrl(uint32_t msr, uint32_t ctrl_req) -{ - uint64_t vmx_msr; - uint32_t vmx_msr_low, vmx_msr_high; - uint32_t ctrl = ctrl_req; - - vmx_msr = msr_read(msr); - vmx_msr_low = (uint32_t)vmx_msr; - vmx_msr_high = (uint32_t)(vmx_msr >> 32U); - pr_dbg("VMX_PIN_VM_EXEC_CONTROLS:low=0x%x, high=0x%x\n", - vmx_msr_low, vmx_msr_high); - - /* high 32b: must 0 setting - * low 32b: must 1 setting - */ - ctrl &= vmx_msr_high; - ctrl |= vmx_msr_low; - - if ((ctrl_req & ~ctrl) != 0U) { - pr_err("VMX ctrl 0x%x not fully enabled: " - "request 0x%x but get 0x%x\n", - msr, ctrl_req, ctrl); - } - - return ctrl; - -} - -static void init_exec_ctrl(struct acrn_vcpu *vcpu) -{ - uint32_t value32; - uint64_t value64; - struct acrn_vm *vm = vcpu->vm; - - /* Log messages to show initializing VMX execution controls */ - pr_dbg("*****************************"); - pr_dbg("Initialize execution control "); - pr_dbg("*****************************"); - - /* Set up VM Execution control to enable Set VM-exits on external - * interrupts preemption timer - pg 2899 24.6.1 - */ - /* enable external interrupt VM Exit */ - value32 = check_vmx_ctrl(MSR_IA32_VMX_PINBASED_CTLS, - VMX_PINBASED_CTLS_IRQ_EXIT); - - if (is_apicv_posted_intr_supported()) { - value32 |= VMX_PINBASED_CTLS_POST_IRQ; - } - - exec_vmwrite32(VMX_PIN_VM_EXEC_CONTROLS, value32); - pr_dbg("VMX_PIN_VM_EXEC_CONTROLS: 0x%x ", value32); - - /* Set up primary processor based VM execution controls - pg 2900 - * 24.6.2. Set up for: - * Enable TSC offsetting - * Enable TSC exiting - * guest access to IO bit-mapped ports causes VM exit - * guest access to MSR causes VM exit - * Activate secondary controls - */ - /* These are bits 1,4-6,8,13-16, and 26, the corresponding bits of - * the IA32_VMX_PROCBASED_CTRLS MSR are always read as 1 --- A.3.2 - */ - value32 = check_vmx_ctrl(MSR_IA32_VMX_PROCBASED_CTLS, - VMX_PROCBASED_CTLS_TSC_OFF | - /* VMX_PROCBASED_CTLS_RDTSC | */ - VMX_PROCBASED_CTLS_TPR_SHADOW | - VMX_PROCBASED_CTLS_IO_BITMAP | - VMX_PROCBASED_CTLS_MSR_BITMAP | - VMX_PROCBASED_CTLS_SECONDARY); - - /*Disable VM_EXIT for CR3 access*/ - value32 &= ~(VMX_PROCBASED_CTLS_CR3_LOAD | - VMX_PROCBASED_CTLS_CR3_STORE); - - /* - * Disable VM_EXIT for invlpg execution. - */ - value32 &= ~VMX_PROCBASED_CTLS_INVLPG; - - exec_vmwrite32(VMX_PROC_VM_EXEC_CONTROLS, value32); - pr_dbg("VMX_PROC_VM_EXEC_CONTROLS: 0x%x ", value32); - - /* Set up secondary processor based VM execution controls - pg 2901 - * 24.6.2. Set up for: * Enable EPT * Enable RDTSCP * Unrestricted - * guest (optional) - */ - value32 = check_vmx_ctrl(MSR_IA32_VMX_PROCBASED_CTLS2, - VMX_PROCBASED_CTLS2_VAPIC | - VMX_PROCBASED_CTLS2_EPT | - VMX_PROCBASED_CTLS2_RDTSCP | - VMX_PROCBASED_CTLS2_UNRESTRICT| - VMX_PROCBASED_CTLS2_VAPIC_REGS); - - if (vcpu->arch.vpid != 0U) { - value32 |= VMX_PROCBASED_CTLS2_VPID; - } else { - value32 &= ~VMX_PROCBASED_CTLS2_VPID; - } - - if (is_apicv_intr_delivery_supported()) { - value32 |= VMX_PROCBASED_CTLS2_VIRQ; - } else { - /* - * This field exists only on processors that support - * the 1-setting of the "use TPR shadow" - * VM-execution control. 
- * - * Set up TPR threshold for virtual interrupt delivery - * - pg 2904 24.6.8 - */ - exec_vmwrite32(VMX_TPR_THRESHOLD, 0U); - } - - if (cpu_has_cap(X86_FEATURE_OSXSAVE)) { - exec_vmwrite64(VMX_XSS_EXITING_BITMAP_FULL, 0UL); - value32 |= VMX_PROCBASED_CTLS2_XSVE_XRSTR; - } - - value32 |= VMX_PROCBASED_CTLS2_WBINVD; - - exec_vmwrite32(VMX_PROC_VM_EXEC_CONTROLS2, value32); - pr_dbg("VMX_PROC_VM_EXEC_CONTROLS2: 0x%x ", value32); - - /*APIC-v, config APIC-access address*/ - value64 = vlapic_apicv_get_apic_access_addr(); - exec_vmwrite64(VMX_APIC_ACCESS_ADDR_FULL, value64); - - /*APIC-v, config APIC virtualized page address*/ - value64 = vlapic_apicv_get_apic_page_addr(vcpu_vlapic(vcpu)); - exec_vmwrite64(VMX_VIRTUAL_APIC_PAGE_ADDR_FULL, value64); - - if (is_apicv_intr_delivery_supported()) { - /* Disable all EOI VMEXIT by default and - * clear RVI and SVI. - */ - exec_vmwrite64(VMX_EOI_EXIT0_FULL, 0UL); - exec_vmwrite64(VMX_EOI_EXIT1_FULL, 0UL); - exec_vmwrite64(VMX_EOI_EXIT2_FULL, 0UL); - exec_vmwrite64(VMX_EOI_EXIT3_FULL, 0UL); - - exec_vmwrite16(VMX_GUEST_INTR_STATUS, 0U); - if (is_apicv_posted_intr_supported()) { - exec_vmwrite16(VMX_POSTED_INTR_VECTOR, - VECTOR_POSTED_INTR); - exec_vmwrite64(VMX_PIR_DESC_ADDR_FULL, - apicv_get_pir_desc_paddr(vcpu)); - } - } - - /* Load EPTP execution control - * TODO: introduce API to make this data driven based - * on VMX_EPT_VPID_CAP - */ - value64 = hva2hpa(vm->arch_vm.nworld_eptp) | (3UL << 3U) | 6UL; - exec_vmwrite64(VMX_EPT_POINTER_FULL, value64); - pr_dbg("VMX_EPT_POINTER: 0x%016llx ", value64); - - /* Set up guest exception mask bitmap setting a bit * causes a VM exit - * on corresponding guest * exception - pg 2902 24.6.3 - * enable VM exit on MC only - */ - value32 = (1U << IDT_MC); - exec_vmwrite32(VMX_EXCEPTION_BITMAP, value32); - - /* Set up page fault error code mask - second paragraph * pg 2902 - * 24.6.3 - guest page fault exception causing * vmexit is governed by - * both VMX_EXCEPTION_BITMAP and * VMX_PF_ERROR_CODE_MASK - */ - exec_vmwrite32(VMX_PF_ERROR_CODE_MASK, 0U); - - /* Set up page fault error code match - second paragraph * pg 2902 - * 24.6.3 - guest page fault exception causing * vmexit is governed by - * both VMX_EXCEPTION_BITMAP and * VMX_PF_ERROR_CODE_MATCH - */ - exec_vmwrite32(VMX_PF_ERROR_CODE_MATCH, 0U); - - /* Set up CR3 target count - An execution of mov to CR3 * by guest - * causes HW to evaluate operand match with * one of N CR3-Target Value - * registers. The CR3 target * count values tells the number of - * target-value regs to evaluate - */ - exec_vmwrite32(VMX_CR3_TARGET_COUNT, 0U); - - /* Set up IO bitmap register A and B - pg 2902 24.6.4 */ - value64 = hva2hpa(vm->arch_vm.io_bitmap); - exec_vmwrite64(VMX_IO_BITMAP_A_FULL, value64); - pr_dbg("VMX_IO_BITMAP_A: 0x%016llx ", value64); - value64 = hva2hpa((void *)&(vm->arch_vm.io_bitmap[PAGE_SIZE])); - exec_vmwrite64(VMX_IO_BITMAP_B_FULL, value64); - pr_dbg("VMX_IO_BITMAP_B: 0x%016llx ", value64); - - init_msr_emulation(vcpu); - - /* Set up executive VMCS pointer - pg 2905 24.6.10 */ - exec_vmwrite64(VMX_EXECUTIVE_VMCS_PTR_FULL, 0UL); - - /* Setup Time stamp counter offset - pg 2902 24.6.5 */ - exec_vmwrite64(VMX_TSC_OFFSET_FULL, 0UL); - - /* Set up the link pointer */ - exec_vmwrite64(VMX_VMS_LINK_PTR_FULL, 0xFFFFFFFFFFFFFFFFUL); - - /* Natural-width */ - pr_dbg("Natural-width*********"); - - init_cr0_cr4_host_mask(); - - /* The CR3 target registers work in concert with VMX_CR3_TARGET_COUNT - * field. Using these registers guest CR3 access can be managed. 
-	 * if operand does not match one of these register values a VM exit
-	 * would occur
-	 */
-	exec_vmwrite(VMX_CR3_TARGET_0, 0UL);
-	exec_vmwrite(VMX_CR3_TARGET_1, 0UL);
-	exec_vmwrite(VMX_CR3_TARGET_2, 0UL);
-	exec_vmwrite(VMX_CR3_TARGET_3, 0UL);
-}
-
-static void init_entry_ctrl(const struct acrn_vcpu *vcpu)
-{
-	uint32_t value32;
-
-	/* Log messages to show initializing VMX entry controls */
-	pr_dbg("*************************");
-	pr_dbg("Initialize Entry control ");
-	pr_dbg("*************************");
-
-	/* Set up VMX entry controls - pg 2908 24.8.1 * Set IA32e guest mode -
-	 * on VM entry processor is in IA32e 64 bitmode * Start guest with host
-	 * IA32_PAT and IA32_EFER
-	 */
-	value32 = (VMX_ENTRY_CTLS_LOAD_EFER |
-			VMX_ENTRY_CTLS_LOAD_PAT);
-
-	if (get_vcpu_mode(vcpu) == CPU_MODE_64BIT) {
-		value32 |= (VMX_ENTRY_CTLS_IA32E_MODE);
-	}
-
-	value32 = check_vmx_ctrl(MSR_IA32_VMX_ENTRY_CTLS, value32);
-
-	exec_vmwrite32(VMX_ENTRY_CONTROLS, value32);
-	pr_dbg("VMX_ENTRY_CONTROLS: 0x%x ", value32);
-
-	/* Set up VMX entry MSR load count - pg 2908 24.8.2 Tells the number of
-	 * MSRs on load from memory on VM entry from mem address provided by
-	 * VM-entry MSR load address field
-	 */
-	exec_vmwrite32(VMX_ENTRY_MSR_LOAD_COUNT, MSR_AREA_COUNT);
-	exec_vmwrite64(VMX_ENTRY_MSR_LOAD_ADDR_FULL, (uint64_t)vcpu->arch.msr_area.guest);
-
-	/* Set up VM entry interrupt information field pg 2909 24.8.3 */
-	exec_vmwrite32(VMX_ENTRY_INT_INFO_FIELD, 0U);
-
-	/* Set up VM entry exception error code - pg 2910 24.8.3 */
-	exec_vmwrite32(VMX_ENTRY_EXCEPTION_ERROR_CODE, 0U);
-
-	/* Set up VM entry instruction length - pg 2910 24.8.3 */
-	exec_vmwrite32(VMX_ENTRY_INSTR_LENGTH, 0U);
-}
-
-static void init_exit_ctrl(struct acrn_vcpu *vcpu)
-{
-	uint32_t value32;
-
-	/* Log messages to show initializing VMX entry controls */
-	pr_dbg("************************");
-	pr_dbg("Initialize Exit control ");
-	pr_dbg("************************");
-
-	/* Set up VM exit controls - pg 2907 24.7.1 for: Host address space
-	 * size is 64 bit Set up to acknowledge interrupt on exit, if 1 the HW
-	 * acks the interrupt in VMX non-root and saves the interrupt vector to
-	 * the relevant VM exit field for further processing by Hypervisor
-	 * Enable saving and loading of IA32_PAT and IA32_EFER on VMEXIT Enable
-	 * saving of pre-emption timer on VMEXIT
-	 */
-	value32 = check_vmx_ctrl(MSR_IA32_VMX_EXIT_CTLS,
-			VMX_EXIT_CTLS_ACK_IRQ |
-			VMX_EXIT_CTLS_SAVE_PAT |
-			VMX_EXIT_CTLS_LOAD_PAT |
-			VMX_EXIT_CTLS_LOAD_EFER |
-			VMX_EXIT_CTLS_SAVE_EFER |
-			VMX_EXIT_CTLS_HOST_ADDR64);
-
-	exec_vmwrite32(VMX_EXIT_CONTROLS, value32);
-	pr_dbg("VMX_EXIT_CONTROL: 0x%x ", value32);
-
-	/* Set up VM exit MSR store and load counts pg 2908 24.7.2 - tells the
-	 * HW number of MSRs to stored to mem and loaded from mem on VM exit.
-	 * The 64 bit VM-exit MSR store and load address fields provide the
-	 * corresponding addresses
-	 */
-	exec_vmwrite32(VMX_EXIT_MSR_STORE_COUNT, MSR_AREA_COUNT);
-	exec_vmwrite32(VMX_EXIT_MSR_LOAD_COUNT, MSR_AREA_COUNT);
-	exec_vmwrite64(VMX_EXIT_MSR_STORE_ADDR_FULL, (uint64_t)vcpu->arch.msr_area.guest);
-	exec_vmwrite64(VMX_EXIT_MSR_LOAD_ADDR_FULL, (uint64_t)vcpu->arch.msr_area.host);
-}
-
-/**
- * @pre vcpu != NULL
- */
-void init_vmcs(struct acrn_vcpu *vcpu)
-{
-	uint64_t vmx_rev_id;
-	uint64_t vmcs_pa;
-
-	/* Log message */
-	pr_dbg("Initializing VMCS");
-
-	/* Obtain the VM Rev ID from HW and populate VMCS page with it */
-	vmx_rev_id = msr_read(MSR_IA32_VMX_BASIC);
-	(void)memcpy_s(vcpu->arch.vmcs, 4U, (void *)&vmx_rev_id, 4U);
-
-	/* Execute VMCLEAR on current VMCS */
-	vmcs_pa = hva2hpa(vcpu->arch.vmcs);
-	exec_vmclear((void *)&vmcs_pa);
-
-	/* Load VMCS pointer */
-	exec_vmptrld((void *)&vmcs_pa);
-
-	/* Initialize the Virtual Machine Control Structure (VMCS) */
-	init_host_state();
-	/* init exec_ctrl needs to run before init_guest_state */
-	init_exec_ctrl(vcpu);
-	init_guest_state(vcpu);
-	init_entry_ctrl(vcpu);
-	init_exit_ctrl(vcpu);
-}
-
-#ifndef CONFIG_PARTITION_MODE
-void switch_apicv_mode_x2apic(struct acrn_vcpu *vcpu)
-{
-	uint32_t value32;
-	value32 = exec_vmread32(VMX_PROC_VM_EXEC_CONTROLS2);
-	value32 &= ~VMX_PROCBASED_CTLS2_VAPIC;
-	value32 |= VMX_PROCBASED_CTLS2_VX2APIC;
-	exec_vmwrite32(VMX_PROC_VM_EXEC_CONTROLS2, value32);
-	update_msr_bitmap_x2apic_apicv(vcpu);
-}
-#else
-void switch_apicv_mode_x2apic(struct acrn_vcpu *vcpu)
-{
-	uint32_t value32;
-	if(vcpu->vm->vm_desc->lapic_pt) {
-		/*
-		 * Disable external interrupt exiting and irq ack
-		 * Disable posted interrupt processing
-		 * update x2apic msr bitmap for pass-thru
-		 * enable inteception only for ICR
-		 * disable pre-emption for TSC DEADLINE MSR
-		 * Disable Register Virtualization and virtual interrupt delivery
-		 * Disable "use TPR shadow"
-		 */
-
-		value32 = exec_vmread32(VMX_PIN_VM_EXEC_CONTROLS);
-		value32 &= ~VMX_PINBASED_CTLS_IRQ_EXIT;
-		if (is_apicv_posted_intr_supported()) {
-			value32 &= ~VMX_PINBASED_CTLS_POST_IRQ;
-		}
-		exec_vmwrite32(VMX_PIN_VM_EXEC_CONTROLS, value32);
-
-		value32 = exec_vmread32(VMX_EXIT_CONTROLS);
-		value32 &= ~VMX_EXIT_CTLS_ACK_IRQ;
-		exec_vmwrite32(VMX_EXIT_CONTROLS, value32);
-
-		value32 = exec_vmread32(VMX_PROC_VM_EXEC_CONTROLS);
-		value32 &= ~VMX_PROCBASED_CTLS_TPR_SHADOW;
-		exec_vmwrite32(VMX_PROC_VM_EXEC_CONTROLS, value32);
-
-		exec_vmwrite32(VMX_TPR_THRESHOLD, 0U);
-
-		value32 = exec_vmread32(VMX_PROC_VM_EXEC_CONTROLS2);
-		value32 &= ~VMX_PROCBASED_CTLS2_VAPIC_REGS;
-		if (is_apicv_intr_delivery_supported()) {
-			value32 &= ~VMX_PROCBASED_CTLS2_VIRQ;
-		}
-		exec_vmwrite32(VMX_PROC_VM_EXEC_CONTROLS2, value32);
-
-		update_msr_bitmap_x2apic_passthru(vcpu);
-	} else {
-		value32 = exec_vmread32(VMX_PROC_VM_EXEC_CONTROLS2);
-		value32 &= ~VMX_PROCBASED_CTLS2_VAPIC;
-		value32 |= VMX_PROCBASED_CTLS2_VX2APIC;
-		exec_vmwrite32(VMX_PROC_VM_EXEC_CONTROLS2, value32);
-		update_msr_bitmap_x2apic_apicv(vcpu);
-	}
}
-#endif
diff --git a/hypervisor/arch/x86/vmx_asm.S b/hypervisor/arch/x86/vmx_asm.S
index 3bd085a16..40c9f07d9 100644
--- a/hypervisor/arch/x86/vmx_asm.S
+++ b/hypervisor/arch/x86/vmx_asm.S
@@ -4,7 +4,7 @@
  * SPDX-License-Identifier: BSD-3-Clause
  */
 
-#include <vmx.h>
+#include <vmcs.h>
 #include
 #include
 #include
diff --git a/hypervisor/include/arch/x86/hv_arch.h b/hypervisor/include/arch/x86/hv_arch.h
index e1874af4d..ff7f1b657 100644
--- a/hypervisor/include/arch/x86/hv_arch.h
+++ b/hypervisor/include/arch/x86/hv_arch.h
@@ -34,6 +34,7 @@
 #include
 #include
 #include
+#include <vmcs.h>
 #include
 #include
 
diff --git a/hypervisor/include/arch/x86/vmcs.h b/hypervisor/include/arch/x86/vmcs.h
new file mode 100644
index 000000000..f1ffe2b40
--- /dev/null
+++ b/hypervisor/include/arch/x86/vmcs.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (C) 2018 Intel Corporation. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#ifndef VMCS_H_
+#define VMCS_H_
+
+#define VM_SUCCESS	0
+#define VM_FAIL		-1
+
+#define VMX_VMENTRY_FAIL	0x80000000U
+
+#ifndef ASSEMBLER
+
+static inline uint32_t vmx_eoi_exit(uint32_t vector)
+{
+	return (VMX_EOI_EXIT0_FULL + ((vector >> 6U) * 2U));
+}
+
+/* VM exit qualifications for APIC-access
+ * Access type:
+ * 0 = linear access for a data read during instruction execution
+ * 1 = linear access for a data write during instruction execution
+ * 2 = linear access for an instruction fetch
+ * 3 = linear access (read or write) during event delivery
+ * 10 = guest-physical access during event delivery
+ * 15 = guest-physical access for an instruction fetch or during
+ * instruction execution
+ */
+static inline uint64_t apic_access_type(uint64_t qual)
+{
+	return ((qual >> 12U) & 0xFUL);
+}
+
+static inline uint64_t apic_access_offset(uint64_t qual)
+{
+	return (qual & 0xFFFUL);
+}
+
+#define RFLAGS_C (1U<<0U)
+#define RFLAGS_Z (1U<<6U)
+#define RFLAGS_AC (1U<<18U)
+
+/* CR0 bits hv wants to trap to track status change */
+#define CR0_TRAP_MASK (CR0_PE | CR0_PG | CR0_WP | CR0_CD | CR0_NW )
+#define CR0_RESERVED_MASK ~(CR0_PG | CR0_CD | CR0_NW | CR0_AM | CR0_WP | \
+			CR0_NE | CR0_ET | CR0_TS | CR0_EM | CR0_MP | CR0_PE)
+
+/* CR4 bits hv wants to trap to track status change */
+#define CR4_TRAP_MASK (CR4_PSE | CR4_PAE | CR4_VMXE | CR4_PCIDE)
+#define CR4_RESERVED_MASK ~(CR4_VME | CR4_PVI | CR4_TSD | CR4_DE | CR4_PSE | \
+			CR4_PAE | CR4_MCE | CR4_PGE | CR4_PCE | \
+			CR4_OSFXSR | CR4_PCIDE | CR4_OSXSAVE | \
+			CR4_SMEP | CR4_FSGSBASE | CR4_VMXE | \
+			CR4_OSXMMEXCPT | CR4_SMAP | CR4_PKE | \
+			CR4_SMXE | CR4_UMIP )
+
+#define VMX_SUPPORT_UNRESTRICTED_GUEST (1U<<5U)
+
+void init_vmcs(struct acrn_vcpu *vcpu);
+
+uint64_t vmx_rdmsr_pat(const struct acrn_vcpu *vcpu);
+int32_t vmx_wrmsr_pat(struct acrn_vcpu *vcpu, uint64_t value);
+
+void vmx_write_cr0(struct acrn_vcpu *vcpu, uint64_t cr0);
+void vmx_write_cr4(struct acrn_vcpu *vcpu, uint64_t cr4);
+bool is_vmx_disabled(void);
+void switch_apicv_mode_x2apic(struct acrn_vcpu *vcpu);
+
+static inline enum vm_cpu_mode get_vcpu_mode(const struct acrn_vcpu *vcpu)
+{
+	return vcpu->arch.cpu_mode;
+}
+
+static inline bool cpu_has_vmx_unrestricted_guest_cap(void)
+{
+	return ((msr_read(MSR_IA32_VMX_MISC) & VMX_SUPPORT_UNRESTRICTED_GUEST)
+		!= 0UL);
+}
+
+#endif /* ASSEMBLER */
+
+#endif /* VMCS_H_ */
diff --git a/hypervisor/include/arch/x86/vmx.h b/hypervisor/include/arch/x86/vmx.h
index 00b6fa1ba..c5d919646 100644
--- a/hypervisor/include/arch/x86/vmx.h
+++ b/hypervisor/include/arch/x86/vmx.h
@@ -377,61 +377,11 @@
 #define VMX_INT_TYPE_HW_EXP	3U
 #define VMX_INT_TYPE_SW_EXP	6U
 
-#define VM_SUCCESS	0
-#define VM_FAIL		-1
-
-#define VMX_VMENTRY_FAIL	0x80000000U
-
-#ifndef ASSEMBLER
-
-static inline uint32_t vmx_eoi_exit(uint32_t vector)
-{
-	return (VMX_EOI_EXIT0_FULL + ((vector >> 6U) * 2U));
-}
-
-/* VM exit qulifications for APIC-access
- * Access type:
- * 0 = linear access for a data read during instruction execution
- * 1 = linear access for a data write during instruction execution
- * 2 = linear access for an instruction fetch
- * 3 = linear access (read or write) during event delivery
- * 10 = guest-physical access during event delivery
- * 15 = guest-physical access for an instructon fetch or during
- * instruction execution
- */
-static inline uint64_t apic_access_type(uint64_t qual)
-{
-	return ((qual >> 12U) & 0xFUL);
-}
-
-static inline uint64_t apic_access_offset(uint64_t qual)
-{
-	return (qual & 0xFFFUL);
-}
-
-#define RFLAGS_C (1U<<0U)
-#define RFLAGS_Z (1U<<6U)
-#define RFLAGS_AC (1U<<18U)
-
-/* CR0 bits hv want to trap to track status change */
-#define CR0_TRAP_MASK (CR0_PE | CR0_PG | CR0_WP | CR0_CD | CR0_NW )
-#define CR0_RESERVED_MASK ~(CR0_PG | CR0_CD | CR0_NW | CR0_AM | CR0_WP | \
-			CR0_NE | CR0_ET | CR0_TS | CR0_EM | CR0_MP | CR0_PE)
-
-/* CR4 bits hv want to trap to track status change */
-#define CR4_TRAP_MASK (CR4_PSE | CR4_PAE | CR4_VMXE | CR4_PCIDE)
-#define CR4_RESERVED_MASK ~(CR4_VME | CR4_PVI | CR4_TSD | CR4_DE | CR4_PSE | \
-			CR4_PAE | CR4_MCE | CR4_PGE | CR4_PCE | \
-			CR4_OSFXSR | CR4_PCIDE | CR4_OSXSAVE | \
-			CR4_SMEP | CR4_FSGSBASE | CR4_VMXE | \
-			CR4_OSXMMEXCPT | CR4_SMAP | CR4_PKE | \
-			CR4_SMXE | CR4_UMIP )
-
-#define VMX_SUPPORT_UNRESTRICTED_GUEST (1U<<5U)
-
 /* External Interfaces */
 void exec_vmxon_instr(uint16_t pcpu_id);
 
+void vmx_off(uint16_t pcpu_id);
+
 /**
  * Read field from VMCS.
  *
@@ -451,32 +401,7 @@ void exec_vmwrite32(uint32_t field, uint32_t value);
 void exec_vmwrite64(uint32_t field_full, uint64_t value);
 #define exec_vmwrite exec_vmwrite64
 
-void init_vmcs(struct acrn_vcpu *vcpu);
-
-void vmx_off(uint16_t pcpu_id);
-
 void exec_vmclear(void *addr);
 void exec_vmptrld(void *addr);
 
-uint64_t vmx_rdmsr_pat(const struct acrn_vcpu *vcpu);
-int32_t vmx_wrmsr_pat(struct acrn_vcpu *vcpu, uint64_t value);
-
-void vmx_write_cr0(struct acrn_vcpu *vcpu, uint64_t cr0);
-void vmx_write_cr4(struct acrn_vcpu *vcpu, uint64_t cr4);
-bool is_vmx_disabled(void);
-void switch_apicv_mode_x2apic(struct acrn_vcpu *vcpu);
-
-static inline enum vm_cpu_mode get_vcpu_mode(const struct acrn_vcpu *vcpu)
-{
-	return vcpu->arch.cpu_mode;
-}
-
-static inline bool cpu_has_vmx_unrestricted_guest_cap(void)
-{
-	return ((msr_read(MSR_IA32_VMX_MISC) & VMX_SUPPORT_UNRESTRICTED_GUEST)
-		!= 0UL);
-}
-
-#endif /* ASSEMBLER */
-
 #endif /* VMX_H_ */
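
A note on the capability-MSR arithmetic that check_vmx_ctrl() (moved into vmcs.c above) relies on: per SDM Vol.3 Appendix A.3, the low 32 bits of an IA32_VMX_*_CTLS MSR report control bits that must be 1, and the high 32 bits report control bits that are allowed to be 1. The standalone sketch below mirrors that adjustment; it is not part of the patch, and adjust_vmx_ctrl plus the sample capability value are illustrative only.

#include <stdint.h>
#include <stdio.h>

/* Mirror of the adjustment in check_vmx_ctrl():
 * low 32 bits of the capability MSR  -> bits that must be 1,
 * high 32 bits of the capability MSR -> bits allowed to be 1.
 */
static uint32_t adjust_vmx_ctrl(uint64_t cap, uint32_t ctrl_req)
{
	uint32_t must_be_1 = (uint32_t)cap;
	uint32_t allowed_1 = (uint32_t)(cap >> 32U);
	uint32_t ctrl = ctrl_req;

	ctrl &= allowed_1;	/* drop requested bits the CPU cannot set */
	ctrl |= must_be_1;	/* force bits the CPU requires to be set */

	if ((ctrl_req & ~ctrl) != 0U) {
		printf("ctrl not fully enabled: request 0x%x, got 0x%x\n",
			ctrl_req, ctrl);
	}
	return ctrl;
}

int main(void)
{
	/* hypothetical capability word: bit 1 must be 1, bit 31 may not be set */
	uint64_t cap = 0x7fffffff00000002UL;

	(void)adjust_vmx_ctrl(cap, 0x80000008U);	/* bit 31 is dropped */
	return 0;
}

This also shows why check_vmx_ctrl() only warns rather than fails when a requested bit cannot be enabled: the caller still receives a value the hardware will accept.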
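Similarly, the literal ORed into the EPT pointer in init_exec_ctrl(), hva2hpa(vm->arch_vm.nworld_eptp) | (3UL << 3U) | 6UL, follows the EPTP format in SDM Vol.3 24.6.11: bits 2:0 select the EPT paging-structure memory type (6 = write-back) and bits 5:3 hold the page-walk length minus one (3 = 4-level walk). A hypothetical helper making that encoding explicit (make_eptp and the macro names are not part of the patch):

#include <stdint.h>

#define EPTP_MT_WB	6UL		/* bits 2:0: memory type, 6 = write-back */
#define EPTP_WALK_LEN_4	(3UL << 3U)	/* bits 5:3: page-walk length minus one */

/* pml4_hpa is the 4 KiB-aligned host-physical address of the EPT PML4
 * table, so its low 12 bits are free to carry the EPTP attribute bits.
 */
static inline uint64_t make_eptp(uint64_t pml4_hpa)
{
	return pml4_hpa | EPTP_WALK_LEN_4 | EPTP_MT_WB;
}

The TODO in the patch about driving this from VMX_EPT_VPID_CAP refers to checking that the hardware actually advertises write-back EPT and 4-level walks before hard-coding them.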
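Finally, the MSR-area programming in init_entry_ctrl()/init_exit_ctrl() uses one table for two purposes: vcpu->arch.msr_area.guest is written as both the VM-exit MSR-store area and the VM-entry MSR-load area, so guest MSR values saved at every exit are restored at the next entry, while msr_area.host is the VM-exit MSR-load area. Each table entry has the 128-bit format described in SDM Vol.3 24.7.2/24.8.2; a sketch of that layout (the struct name is illustrative, the field layout is architectural):

#include <stdint.h>

/* One entry of a VM-entry/VM-exit MSR area: MSR index in bits 31:0,
 * bits 63:32 reserved (must be zero), MSR data in bits 127:64.
 */
struct msr_store_entry {
	uint32_t msr_index;
	uint32_t reserved;
	uint64_t value;
};

MSR_AREA_COUNT entries of this shape are processed on each transition, which is why the same count is written to the entry-load, exit-store and exit-load count fields.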