From 442fc301171f759e340bbc81add8cbbc00da1a0e Mon Sep 17 00:00:00 2001 From: Yonghua Huang Date: Wed, 9 Dec 2020 21:45:12 +0800 Subject: [PATCH] hv: refine virtualization flow for cr0 and cr4 - The current code to virtualize CR0/CR4 is not well designed, and hard to read. This patch reshuffle the logic to make it clear and classify those bits into PASSTHRU, TRAP_AND_PASSTHRU, TRAP_AND_EMULATE & reserved bits. Tracked-On: #5586 Signed-off-by: Eddie Dong Signed-off-by: Yonghua Huang --- hypervisor/arch/x86/guest/vcpu.c | 14 + hypervisor/arch/x86/guest/vcpuid.c | 6 +- hypervisor/arch/x86/guest/virtual_cr.c | 463 ++++++++++-------- hypervisor/arch/x86/guest/vmcs.c | 7 +- hypervisor/arch/x86/init.c | 9 + hypervisor/common/hypercall.c | 6 +- hypervisor/include/arch/x86/guest/vcpu.h | 2 + .../include/arch/x86/guest/virtual_cr.h | 4 +- hypervisor/include/arch/x86/vmx.h | 3 + 9 files changed, 316 insertions(+), 198 deletions(-) diff --git a/hypervisor/arch/x86/guest/vcpu.c b/hypervisor/arch/x86/guest/vcpu.c index efc74c649..5e755b22a 100644 --- a/hypervisor/arch/x86/guest/vcpu.c +++ b/hypervisor/arch/x86/guest/vcpu.c @@ -393,6 +393,20 @@ static struct acrn_vcpu_regs protect_mode_init_vregs = { .es_sel = 0x18U, }; +bool sanitize_cr0_cr4_pattern(void) +{ + bool ret = false; + + if (is_valid_cr0_cr4(realmode_init_vregs.cr0, realmode_init_vregs.cr4) && + is_valid_cr0_cr4(protect_mode_init_vregs.cr0, protect_mode_init_vregs.cr4)) { + ret = true; + } else { + pr_err("Wrong CR0/CR4 pattern: real %lx %lx; protected %lx %lx\n", realmode_init_vregs.cr0, + realmode_init_vregs.cr4, protect_mode_init_vregs.cr0, protect_mode_init_vregs.cr4); + } + return ret; +} + void reset_vcpu_regs(struct acrn_vcpu *vcpu) { set_vcpu_regs(vcpu, &realmode_init_vregs); diff --git a/hypervisor/arch/x86/guest/vcpuid.c b/hypervisor/arch/x86/guest/vcpuid.c index d6bfa79ec..658b16f70 100644 --- a/hypervisor/arch/x86/guest/vcpuid.c +++ b/hypervisor/arch/x86/guest/vcpuid.c @@ -431,6 +431,9 @@ static void guest_cpuid_01h(struct acrn_vcpu *vcpu, uint32_t *eax, uint32_t *ebx /* set Hypervisor Present Bit */ *ecx |= CPUID_ECX_HV; + if ((get_cr4_reserved_bits() & CR4_PCIDE) != 0UL) { + *ecx &= ~CPUID_ECX_PCID; + } /* if guest disabed monitor/mwait, clear cpuid.01h[3] */ if ((guest_ia32_misc_enable & MSR_IA32_MISC_ENABLE_MONITOR_ENA) == 0UL) { @@ -441,7 +444,7 @@ static void guest_cpuid_01h(struct acrn_vcpu *vcpu, uint32_t *eax, uint32_t *ebx if ((*ecx & CPUID_ECX_XSAVE) != 0U) { uint64_t cr4; /*read guest CR4*/ - cr4 = exec_vmread(VMX_GUEST_CR4); + cr4 = vcpu_get_cr4(vcpu); if ((cr4 & CR4_OSXSAVE) != 0UL) { *ecx |= CPUID_ECX_OSXSAVE; } @@ -449,6 +452,7 @@ static void guest_cpuid_01h(struct acrn_vcpu *vcpu, uint32_t *eax, uint32_t *ebx /* mask Debug Store feature */ *edx &= ~CPUID_EDX_DTES; + *edx &= ~CPUID_EDX_MCE; } static void guest_cpuid_0bh(struct acrn_vcpu *vcpu, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) diff --git a/hypervisor/arch/x86/guest/virtual_cr.c b/hypervisor/arch/x86/guest/virtual_cr.c index 0675d8485..25c74ae1c 100644 --- a/hypervisor/arch/x86/guest/virtual_cr.c +++ b/hypervisor/arch/x86/guest/virtual_cr.c @@ -21,28 +21,89 @@ #include #include -/* CR0 bits hv want to trap to track status change */ -#define CR0_TRAP_MASK (CR0_PE | CR0_PG | CR0_WP | CR0_CD | CR0_NW) -#define CR0_RESERVED_MASK ~(CR0_PG | CR0_CD | CR0_NW | CR0_AM | CR0_WP | \ - CR0_NE | CR0_ET | CR0_TS | CR0_EM | CR0_MP | CR0_PE) +/* + * Physical CR4 bits in VMX operation may be either flexible or fixed. 
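+ * A physical bit is flexible when IA32_VMX_CR4_FIXED0 reports 0 and IA32_VMX_CR4_FIXED1
+ * reports 1 for it (i.e. the bit is set in fixed0 ^ fixed1); otherwise hardware fixes it
+ * to 0 or to 1 in VMX operation (see SDM Appendix A.8).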
+ * Guest CR4 bits may be operable or reserved.
+ *
+ * All guest-reserved bits are TRAPPED and EMULATED by the HV
+ * (a #GP is injected on any attempt to change them).
+ *
+ * A guest-operable bit falls into one of the following classes:
+ * CR4_PASSTHRU_BITS:
+ *	Bits that may be passed through to the guest. The actual passthru bits
+ *	must be masked by the flexible bits.
+ *
+ * CR4_TRAP_AND_PASSTHRU_BITS:
+ *	Bits that are trapped by the HV; HV emulation eventually writes the
+ *	guest value to the physical CR4 (GUEST_CR4) as well. The actual bits
+ *	must be masked by the flexible bits.
+ *
+ * CR4_TRAP_AND_EMULATE_BITS:
+ *	Bits that are trapped by the HV and emulated, but the HV only updates
+ *	vCR4 (no update to the physical CR4), i.e. pure software emulation.
+ *
+ * CR4_EMULATED_RESERVE_BITS:
+ *	Bits that are trapped, but emulated by injecting a #GP.
+ *
+ * NOTE: the classes above must not overlap.
+ *
+ */
+#define CR4_PASSTHRU_BITS	(CR4_VME | CR4_PVI | CR4_TSD | CR4_DE | \
+				CR4_PGE | CR4_PCE | CR4_OSFXSR | CR4_PCIDE | \
+				CR4_OSXSAVE | CR4_FSGSBASE | CR4_OSXMMEXCPT | \
+				CR4_UMIP)
+static uint64_t cr4_passthru_mask = CR4_PASSTHRU_BITS;	/* bound to flexible bits */

-/* CR4 bits hv want to trap to track status change */
-#define CR4_TRAP_MASK (CR4_PSE | CR4_PAE | CR4_VMXE | CR4_SMEP | CR4_SMAP | CR4_PKE | CR4_CET)
-#define CR4_RESERVED_MASK ~(CR4_VME | CR4_PVI | CR4_TSD | CR4_DE | CR4_PSE | \
-			CR4_PAE | CR4_MCE | CR4_PGE | CR4_PCE | \
-			CR4_OSFXSR | CR4_PCIDE | CR4_OSXSAVE | \
-			CR4_SMEP | CR4_FSGSBASE | CR4_VMXE | \
-			CR4_OSXMMEXCPT | CR4_SMAP | CR4_PKE | \
-			CR4_SMXE | CR4_UMIP | CR4_CET)
+#define CR4_TRAP_AND_PASSTHRU_BITS	(CR4_PSE | CR4_PAE | CR4_SMEP | CR4_SMAP | CR4_PKE)
+static uint64_t cr4_trap_and_passthru_mask = CR4_TRAP_AND_PASSTHRU_BITS; /* bound to flexible bits */
+
+#define CR4_TRAP_AND_EMULATE_BITS	0UL /* software-emulated bits even if the host value is fixed */
+
+/* Changing these bits requires a matching change in vcpuid too */
+#define CR4_EMULATED_RESERVE_BITS	(CR4_VMXE | CR4_MCE | CR4_CET | CR4_SMXE)
+
+/* The physical CR4 value for the bits in CR4_EMULATED_RESERVE_BITS */
+#define CR4_EMRSV_BITS_PHYS_VALUE	(CR4_VMXE | CR4_MCE)
+
+/* The CR4 value the guest is expected to see for the bits in CR4_EMULATED_RESERVE_BITS */
+#define CR4_EMRSV_BITS_VIRT_VALUE	0
+static uint64_t cr4_rsv_bits_guest_value;
+
+/*
+ * Initial (reset) value of GUEST_CR4, i.e. the physical value.
+ * Most of these bits are zero, but some reserved bits may not be.
+ */
+static uint64_t initial_guest_cr4;
+
+/*
+ * Bits not in cr4_passthru_mask/cr4_trap_and_passthru_mask/cr4_trap_and_emulate_mask
+ * are reserved bits, which include at least CR4_EMULATED_RESERVE_BITS.
+ */
+static uint64_t cr4_reserved_bits_mask;
+
+/*
+ * CR0 follows the same rules as CR4, except that no #GP is injected for a
+ * reserved-bit violation; a software write to those reserved bits is ignored instead.
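+ * The one exception is the upper 32 bits of CR0: per is_cr0_write_valid(), setting any
+ * of CR0[63:32] still injects #GP, since those bits must be written as zero.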
+ */ +#define CR0_PASSTHRU_BITS (CR0_MP | CR0_EM | CR0_TS | CR0_ET | CR0_NE | CR0_AM) +static uint64_t cr0_passthru_mask = CR0_PASSTHRU_BITS; /* bound to flexible bits */ + +#define CR0_TRAP_AND_PASSTHRU_BITS (CR0_PE | CR0_PG | CR0_WP) +static uint64_t cr0_trap_and_passthru_mask = CR0_TRAP_AND_PASSTHRU_BITS;/* bound to flexible bits */ +/* software emulated bits even if host is fixed */ +#define CR0_TRAP_AND_EMULATE_BITS (CR0_CD | CR0_NW) + +/* These bits may be part of flexible bits but reserved to guest */ +#define CR0_EMULATED_RESERVE_BITS 0UL +#define CR0_EMRSV_BITS_PHYS_VALUE 0UL +#define CR0_EMRSV_BITS_VIRT_VALUE 0UL +static uint64_t cr0_rsv_bits_guest_value; +static uint64_t initial_guest_cr0; /* Initial value of GUEST_CR0 */ +static uint64_t cr0_reserved_bits_mask; /* PAE PDPTE bits 1 ~ 2, 5 ~ 8 are always reserved */ #define PAE_PDPTE_FIXED_RESVD_BITS 0x00000000000001E6UL -static uint64_t cr0_always_on_mask; -static uint64_t cr0_always_off_mask; -static uint64_t cr4_always_on_mask; -static uint64_t cr4_always_off_mask; - static int32_t load_pdptrs(const struct acrn_vcpu *vcpu) { uint64_t guest_cr3 = exec_vmread(VMX_GUEST_CR3); @@ -84,12 +145,25 @@ static int32_t load_pdptrs(const struct acrn_vcpu *vcpu) return ret; } +/* + * Whether the value changes the reserved bits. + */ +static inline bool is_valid_cr0(uint64_t cr0) +{ + return (cr0 & cr0_reserved_bits_mask) == cr0_rsv_bits_guest_value; +} + +/* + * Certain combination of CR0 write may lead to #GP. + */ static bool is_cr0_write_valid(struct acrn_vcpu *vcpu, uint64_t cr0) { bool ret = true; - /* Shouldn't set always off bit */ - if ((cr0 & cr0_always_off_mask) != 0UL) { + /* + * Set 1 in high 32 bits (part of reserved bits) leads to #GP. + */ + if ((cr0 >> 32UL) != 0UL) { ret = false; } else { /* SDM 25.3 "Changes to instruction behavior in VMX non-root" @@ -150,64 +224,66 @@ static bool is_cr0_write_valid(struct acrn_vcpu *vcpu, uint64_t cr0) * - PG (31) Trapped to track cpu/paging mode. * Set the value according to the value from guest. */ -static void vmx_write_cr0(struct acrn_vcpu *vcpu, uint64_t cr0) +static void vmx_write_cr0(struct acrn_vcpu *vcpu, uint64_t value) { bool err_found = false; + /* + * For reserved bits of CR0, SDM states: + * attempts to set them have no impact, while set to high 32 bits lead to #GP. + */ - if (!is_cr0_write_valid(vcpu, cr0)) { - pr_dbg("Invalid cr0 write operation from guest"); + if (!is_cr0_write_valid(vcpu, value)) { + pr_err("Invalid cr0 write operation from guest"); vcpu_inject_gp(vcpu, 0U); } else { - uint64_t cr0_vmx; + uint64_t effective_cr0 = (value & ~cr0_reserved_bits_mask) | cr0_rsv_bits_guest_value; + uint64_t mask, tmp; uint32_t entry_ctrls; - bool old_paging_enabled = is_paging_enabled(vcpu); - uint64_t cr0_changed_bits = vcpu_get_cr0(vcpu) ^ cr0; - uint64_t cr0_mask = cr0; + uint64_t cr0_changed_bits = vcpu_get_cr0(vcpu) ^ effective_cr0; - /* SDM 2.5 - * When loading a control register, reserved bit should always set - * to the value previously read. 
- */ - cr0_mask &= ~CR0_RESERVED_MASK; + if ((cr0_changed_bits & CR0_PG) != 0UL) { + /* PG bit changes */ + if ((effective_cr0 & CR0_PG) != 0UL) { + /* Enable paging */ + if ((vcpu_get_efer(vcpu) & MSR_IA32_EFER_LME_BIT) != 0UL) { + /* Enable long mode */ + pr_dbg("VMM: Enable long mode"); + entry_ctrls = exec_vmread32(VMX_ENTRY_CONTROLS); + entry_ctrls |= VMX_ENTRY_CTLS_IA32E_MODE; + exec_vmwrite32(VMX_ENTRY_CONTROLS, entry_ctrls); - if (!old_paging_enabled && ((cr0_mask & CR0_PG) != 0UL)) { - if ((vcpu_get_efer(vcpu) & MSR_IA32_EFER_LME_BIT) != 0UL) { - /* Enable long mode */ - pr_dbg("VMM: Enable long mode"); - entry_ctrls = exec_vmread32(VMX_ENTRY_CONTROLS); - entry_ctrls |= VMX_ENTRY_CTLS_IA32E_MODE; - exec_vmwrite32(VMX_ENTRY_CONTROLS, entry_ctrls); - - vcpu_set_efer(vcpu, vcpu_get_efer(vcpu) | MSR_IA32_EFER_LMA_BIT); - } else if (is_pae(vcpu)) { - /* enabled PAE from paging disabled */ - if (load_pdptrs(vcpu) != 0) { - err_found = true; - vcpu_inject_gp(vcpu, 0U); + vcpu_set_efer(vcpu, vcpu_get_efer(vcpu) | MSR_IA32_EFER_LMA_BIT); + } else { + pr_dbg("VMM: NOT Enable long mode"); + if (is_pae(vcpu)) { + /* enabled PAE from paging disabled */ + if (load_pdptrs(vcpu) != 0) { + err_found = true; + vcpu_inject_gp(vcpu, 0U); + } + } } - } else { - /* do nothing */ - } - } else if (old_paging_enabled && ((cr0_mask & CR0_PG) == 0UL)) { - if ((vcpu_get_efer(vcpu) & MSR_IA32_EFER_LME_BIT) != 0UL) { - /* Disable long mode */ - pr_dbg("VMM: Disable long mode"); - entry_ctrls = exec_vmread32(VMX_ENTRY_CONTROLS); - entry_ctrls &= ~VMX_ENTRY_CTLS_IA32E_MODE; - exec_vmwrite32(VMX_ENTRY_CONTROLS, entry_ctrls); + } else { + /* Disable paging */ + pr_dbg("disable paginge"); + if ((vcpu_get_efer(vcpu) & MSR_IA32_EFER_LME_BIT) != 0UL) { + /* Disable long mode */ + pr_dbg("VMM: Disable long mode"); + entry_ctrls = exec_vmread32(VMX_ENTRY_CONTROLS); + entry_ctrls &= ~VMX_ENTRY_CTLS_IA32E_MODE; + exec_vmwrite32(VMX_ENTRY_CONTROLS, entry_ctrls); - vcpu_set_efer(vcpu, vcpu_get_efer(vcpu) & ~MSR_IA32_EFER_LMA_BIT); + vcpu_set_efer(vcpu, vcpu_get_efer(vcpu) & ~MSR_IA32_EFER_LMA_BIT); + } } - } else { - /* do nothing */ } if (!err_found) { /* If CR0.CD or CR0.NW get cr0_changed_bits */ - if ((cr0_changed_bits & (CR0_CD | CR0_NW)) != 0UL) { - /* No action if only CR0.NW is cr0_changed_bits */ + if ((cr0_changed_bits & CR0_TRAP_AND_EMULATE_BITS) != 0UL) { + /* No action if only CR0.NW is changed */ if ((cr0_changed_bits & CR0_CD) != 0UL) { - if ((cr0_mask & CR0_CD) != 0UL) { + if ((effective_cr0 & CR0_CD) != 0UL) { /* * When the guest requests to set CR0.CD, we don't allow * guest's CR0.CD to be actually set, instead, we write guest @@ -220,50 +296,46 @@ static void vmx_write_cr0(struct acrn_vcpu *vcpu, uint64_t cr0) exec_vmwrite64(VMX_GUEST_IA32_PAT_FULL, vcpu_get_guest_msr(vcpu, MSR_IA32_PAT)); } - vcpu_make_request(vcpu, ACRN_REQUEST_EPT_FLUSH); } } - if ((cr0_changed_bits & (CR0_PG | CR0_WP)) != 0UL) { + if ((cr0_changed_bits & (CR0_PG | CR0_WP | CR0_CD)) != 0UL) { vcpu_make_request(vcpu, ACRN_REQUEST_EPT_FLUSH); } - /* CR0 has no always off bits, except the always on bits, and reserved - * bits, allow to set according to guest. 
-		 */
-		cr0_vmx = cr0_always_on_mask | cr0_mask;
+			mask = cr0_trap_and_passthru_mask | cr0_passthru_mask;
+			tmp = (initial_guest_cr0 & ~mask) | (effective_cr0 & mask);

-		/* Don't set CD or NW bit to guest */
-		cr0_vmx &= ~(CR0_CD | CR0_NW);
-		exec_vmwrite(VMX_GUEST_CR0, cr0_vmx & 0xFFFFFFFFUL);
-		exec_vmwrite(VMX_CR0_READ_SHADOW, cr0_mask & 0xFFFFFFFFUL);
+			exec_vmwrite(VMX_GUEST_CR0, tmp);
+			exec_vmwrite(VMX_CR0_READ_SHADOW, effective_cr0);

 			/* clear read cache, next time read should from VMCS */
 			bitmap_clear_lock(CPU_REG_CR0, &vcpu->reg_cached);

-		pr_dbg("VMM: Try to write %016lx, allow to write 0x%016lx to CR0", cr0_mask, cr0_vmx);
+			pr_dbg("VMM: Try to write %016lx, allow to write 0x%016lx to CR0", effective_cr0, tmp);
 		}
 	}
 }

+static inline bool is_valid_cr4(uint64_t cr4)
+{
+	return (cr4 & cr4_reserved_bits_mask) == cr4_rsv_bits_guest_value;
+}
+
+/*
+ * TODO: implement a more comprehensive check here.
+ */
+bool is_valid_cr0_cr4(uint64_t cr0, uint64_t cr4)
+{
+	return is_valid_cr4(cr4) && is_valid_cr0(cr0);
+}
+
 static bool is_cr4_write_valid(struct acrn_vcpu *vcpu, uint64_t cr4)
 {
 	bool ret = true;

-	/* Check if guest try to set fixed to 0 bits or reserved bits */
-	if ((cr4 & cr4_always_off_mask) != 0U) {
+	if (!is_valid_cr4(cr4) || (is_long_mode(vcpu) && ((cr4 & CR4_PAE) == 0UL))) {
 		ret = false;
-	} else {
-		/* Do NOT support nested guest, nor SMX */
-		if (((cr4 & CR4_VMXE) != 0UL) || ((cr4 & CR4_SMXE) != 0UL) || ((cr4 & CR4_CET) != 0UL)) {
-			ret = false;
-		} else {
-			if (is_long_mode(vcpu)) {
-				if ((cr4 & CR4_PAE) == 0UL) {
-					ret = false;
-				}
-			}
-		}
 	}

 	return ret;
@@ -273,37 +345,9 @@ static bool is_cr4_write_valid(struct acrn_vcpu *vcpu, uint64_t cr4)
  * Handling of CR4:
  * Assume "unrestricted guest" feature is supported by vmx.
  *
- * For CR4, if some feature is not supported by hardware, the corresponding bit
- * will be set in cr4_always_off_mask. If guest try to set these bits after
- * vmexit, will inject a #GP.
- * If a bit for a feature not supported by hardware, which is flexible to guest,
- * and write to it do not lead to a VM exit, a #GP should be generated inside
- * guest.
- *
- * - VME (0) Flexible to guest
- * - PVI (1) Flexible to guest
- * - TSD (2) Flexible to guest
- * - DE (3) Flexible to guest
- * - PSE (4) Trapped to track paging mode.
- *   Set the value according to the value from guest.
- * - PAE (5) Trapped to track paging mode.
- *   Set the value according to the value from guest.
- * - MCE (6) Trapped to hide from guest
- * - PGE (7) Flexible to guest
- * - PCE (8) Flexible to guest
- * - OSFXSR (9) Flexible to guest
- * - OSXMMEXCPT (10) Flexible to guest
- * - VMXE (13) Trapped to hide from guest
- * - SMXE (14) must always be 0 => must lead to a VM exit
- * - PCIDE (17) Flexible to guest
- * - OSXSAVE (18) Flexible to guest
- * - XSAVE (19) Flexible to guest
- *   We always keep align with physical cpu. So it's flexible to
- *   guest
- * - SMEP (20) Flexible to guest
- * - SMAP (21) Flexible to guest
- * - PKE (22) Flexible to guest
- * - CET (23) Trapped to hide from guest
+ * For CR4, a #GP is injected if the guest attempts to change any reserved bit.
+ * The reserved bits are the bits fixed by hardware in VMX operation (i.e. not
+ * flexible) plus CR4_EMULATED_RESERVE_BITS; see cr4_reserved_bits_mask.
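+ * For example, CR4_VMXE is emulated as reserved: the guest always reads CR4.VMXE as 0
+ * (CR4_EMRSV_BITS_VIRT_VALUE) and any attempt to set it is rejected with #GP, while the
+ * physical GUEST_CR4 keeps VMXE set (CR4_EMRSV_BITS_PHYS_VALUE).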
 */
static void vmx_write_cr4(struct acrn_vcpu *vcpu, uint64_t cr4)
{
@@ -313,130 +357,159 @@ static void vmx_write_cr4(struct acrn_vcpu *vcpu, uint64_t cr4)
 		pr_dbg("Invalid cr4 write operation from guest");
 		vcpu_inject_gp(vcpu, 0U);
 	} else {
-		uint64_t cr4_vmx, cr4_shadow;
-		uint64_t old_cr4 = vcpu_get_cr4(vcpu);
+		uint64_t mask, tmp;
+		uint64_t cr4_changed_bits = vcpu_get_cr4(vcpu) ^ cr4;

-		if (((cr4 ^ old_cr4) & (CR4_PGE | CR4_PSE | CR4_PAE | CR4_SMEP | CR4_SMAP | CR4_PKE)) != 0UL) {
+		if ((cr4_changed_bits & CR4_TRAP_AND_PASSTHRU_BITS) != 0UL) {
 			if (((cr4 & CR4_PAE) != 0UL) && (is_paging_enabled(vcpu)) && (!is_long_mode(vcpu))) {
 				if (load_pdptrs(vcpu) != 0) {
 					err_found = true;
+					pr_dbg("Err found, cr4: 0x%lx, cr0: 0x%lx", cr4, vcpu_get_cr0(vcpu));
 					vcpu_inject_gp(vcpu, 0U);
 				}
 			}
-			if (!err_found) {
-				vcpu_make_request(vcpu, ACRN_REQUEST_EPT_FLUSH);
-			}
+			vcpu_make_request(vcpu, ACRN_REQUEST_EPT_FLUSH);
 		}

-		if (!err_found && (((cr4 ^ old_cr4) & CR4_PCIDE) != 0UL)) {
+		if (!err_found && ((cr4_changed_bits & CR4_PCIDE) != 0UL)) {
 			/* MOV to CR4 causes a general-protection exception (#GP) if it would change
 			 * CR4.PCIDE from 0 to 1 and either IA32_EFER.LMA = 0 or CR3[11:0] != 000H
 			 */
 			if ((cr4 & CR4_PCIDE) != 0UL) {
 				uint64_t guest_cr3 = exec_vmread(VMX_GUEST_CR3);
-				/* For now, HV passes-through PCID capability to guest, check X86_FEATURE_PCID of
-				 * pcpu capabilities directly.
-				 */
-				if ((!pcpu_has_cap(X86_FEATURE_PCID)) || (!is_long_mode(vcpu)) ||
-					((guest_cr3 & 0xFFFUL) != 0UL)) {
-					pr_dbg("Failed to enable CR4.PCIE");
+
+				if ((!is_long_mode(vcpu)) || ((guest_cr3 & 0xFFFUL) != 0UL)) {
+					pr_dbg("Failed to enable CR4.PCID, cr4:0x%lx, cr4_changed_bits:0x%lx,vcpu_cr4:0x%lx cr3:0x%lx",
+						cr4, cr4_changed_bits, vcpu_get_cr4(vcpu), guest_cr3);
+
+					err_found = true;
 					vcpu_inject_gp(vcpu, 0U);
 				}
-			} else {
-				/* The instruction invalidates all TLB entries (including global entries) and
-				 * all entries in all paging-structure caches (for all PCIDs) if it changes the
-				 * value of the CR4.PCIDE from 1 to 0
-				 */
-				vcpu_make_request(vcpu, ACRN_REQUEST_EPT_FLUSH);
 			}
 		}

 		if (!err_found) {
-			/* Clear forced off bits */
-			cr4_shadow = cr4 & ~CR4_MCE;
+			/*
+			 * Update the passthru bits.
+			 */
+			mask = cr4_trap_and_passthru_mask | cr4_passthru_mask;
+			tmp = (initial_guest_cr4 & ~mask) | (cr4 & mask);

-			cr4_vmx = cr4_always_on_mask | cr4_shadow;
-			exec_vmwrite(VMX_GUEST_CR4, cr4_vmx & 0xFFFFFFFFUL);
-			exec_vmwrite(VMX_CR4_READ_SHADOW, cr4_shadow & 0xFFFFFFFFUL);
+			/*
+			 * For all reserved bits (including CR4_EMULATED_RESERVE_BITS), we only reach
+			 * this point when the guest is not changing them.
+			 */
+			exec_vmwrite(VMX_GUEST_CR4, tmp);
+			exec_vmwrite(VMX_CR4_READ_SHADOW, cr4);

 			/* clear read cache, next time read should from VMCS */
 			bitmap_clear_lock(CPU_REG_CR4, &vcpu->reg_cached);

-			pr_dbg("VMM: Try to write %016lx, allow to write 0x%016lx to CR4", cr4, cr4_vmx);
+			pr_dbg("VMM: Try to write %016lx, allow to write 0x%016lx to CR4", cr4, tmp);
 		}
 	}
 }

-void init_cr0_cr4_host_mask(void)
+void init_cr0_cr4_flexible_bits(void)
 {
-	static bool inited = false;
-	static uint64_t cr0_host_owned_bits, cr4_host_owned_bits;
+	uint64_t cr0_flexible_bits;
+	uint64_t cr4_flexible_bits;
 	uint64_t fixed0, fixed1;

-	if (!inited) {
-		/* Read the CR0 fixed0 / fixed1 MSR registers */
-		fixed0 = msr_read(MSR_IA32_VMX_CR0_FIXED0);
-		fixed1 = msr_read(MSR_IA32_VMX_CR0_FIXED1);
+	/* make sure the following macros do not have any overlapping set bits.
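+	 * The check below relies on the fact that a bit set in exactly two of the masks is
+	 * cleared by the XOR but kept by the OR, so the two sides differ whenever such an
+	 * overlap exists.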
+ */ + ASSERT(((CR0_PASSTHRU_BITS ^ CR0_TRAP_AND_PASSTHRU_BITS) ^ CR0_TRAP_AND_EMULATE_BITS) == + (CR0_PASSTHRU_BITS | CR0_TRAP_AND_PASSTHRU_BITS | CR0_TRAP_AND_EMULATE_BITS)); - cr0_host_owned_bits = ~(fixed0 ^ fixed1); - /* Add the bit hv wants to trap */ - cr0_host_owned_bits |= CR0_TRAP_MASK; - cr0_host_owned_bits &= ~CR0_RESERVED_MASK; - /* CR0 clear PE/PG from always on bits due to "unrestructed guest" feature */ - cr0_always_on_mask = fixed0 & (~(CR0_PE | CR0_PG)); - cr0_always_off_mask = ~fixed1; - /* SDM 2.5 - * bit 63:32 of CR0 and CR4 ar reserved and must be written - * zero. We could merge it with always off mask. - */ - cr0_always_off_mask |= 0xFFFFFFFF00000000UL; + ASSERT(((CR4_PASSTHRU_BITS ^ CR4_TRAP_AND_PASSTHRU_BITS) ^ CR4_TRAP_AND_EMULATE_BITS) == + (CR4_PASSTHRU_BITS | CR4_TRAP_AND_PASSTHRU_BITS | CR4_TRAP_AND_EMULATE_BITS)); - /* Read the CR4 fixed0 / fixed1 MSR registers */ - fixed0 = msr_read(MSR_IA32_VMX_CR4_FIXED0); - fixed1 = msr_read(MSR_IA32_VMX_CR4_FIXED1); + /* Read the CR0 fixed0 / fixed1 MSR registers */ + fixed0 = msr_read(MSR_IA32_VMX_CR0_FIXED0); + fixed1 = msr_read(MSR_IA32_VMX_CR0_FIXED1); - cr4_host_owned_bits = ~(fixed0 ^ fixed1); - /* Add the bit hv wants to trap */ - cr4_host_owned_bits |= CR4_TRAP_MASK; - cr4_host_owned_bits &= ~CR4_RESERVED_MASK; - cr4_always_on_mask = fixed0; - /* Record the bit fixed to 0 for CR4, including reserved bits */ - cr4_always_off_mask = ~fixed1; - /* SDM 2.5 - * bit 63:32 of CR0 and CR4 ar reserved and must be written - * zero. We could merge it with always off mask. - */ - cr4_always_off_mask |= 0xFFFFFFFF00000000UL; - cr4_always_off_mask |= CR4_RESERVED_MASK; - inited = true; - } + pr_dbg("%s:cr0 fixed0 = 0x%016lx, fixed1 = 0x%016lx", __func__, fixed0, fixed1); + cr0_flexible_bits = (fixed0 ^ fixed1); + /* + * HW reports fixed bits for CR0_PG & CR0_PE, but do not check the violation. + * ACRN needs to set them for (unrestricted) guest, and therefore view them as + * flexible bits. + */ + cr0_flexible_bits |= (CR0_PE | CR0_PG); + cr0_passthru_mask &= cr0_flexible_bits; + cr0_trap_and_passthru_mask &= cr0_flexible_bits; + cr0_reserved_bits_mask = ~(cr0_passthru_mask | cr0_trap_and_passthru_mask | CR0_TRAP_AND_EMULATE_BITS); - exec_vmwrite(VMX_CR0_GUEST_HOST_MASK, cr0_host_owned_bits); - /* Output CR0 mask value */ - pr_dbg("CR0 guest-host mask value: 0x%016lx", cr0_host_owned_bits); + /* + * cr0_rsv_bits_guest_value should be sync with always ON bits (1 in both FIXED0/FIXED1 MSRs). 
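+	 * For example, CR0_NE is fixed to 1 in VMX operation (set in both MSRs), so it ends
+	 * up in cr0_reserved_bits_mask and the guest always reads CR0.NE as 1.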
+	 * Refer to SDM Appendix A.7
+	 */
+	cr0_rsv_bits_guest_value = (fixed0 & ~cr0_flexible_bits);
+	initial_guest_cr0 = (cr0_rsv_bits_guest_value & ~CR0_EMULATED_RESERVE_BITS) | CR0_EMRSV_BITS_PHYS_VALUE;
+	cr0_rsv_bits_guest_value = (cr0_rsv_bits_guest_value & ~CR0_EMULATED_RESERVE_BITS) | CR0_EMRSV_BITS_VIRT_VALUE;
+	pr_dbg("cr0_flexible_bits:0x%lx, cr0_passthru_mask:%lx, cr0_trap_and_passthru_mask:%lx.\n",
+		cr0_flexible_bits, cr0_passthru_mask, cr0_trap_and_passthru_mask);
+	pr_dbg("cr0_reserved_bits_mask:%lx, cr0_rsv_bits_guest_value:%lx, initial_guest_cr0:%lx.\n",
+		cr0_reserved_bits_mask, cr0_rsv_bits_guest_value, initial_guest_cr0);

-	exec_vmwrite(VMX_CR4_GUEST_HOST_MASK, cr4_host_owned_bits);
-	/* Output CR4 mask value */
-	pr_dbg("CR4 guest-host mask value: 0x%016lx", cr4_host_owned_bits);
+	/* Read the CR4 fixed0 / fixed1 MSR registers */
+	fixed0 = msr_read(MSR_IA32_VMX_CR4_FIXED0);
+	fixed1 = msr_read(MSR_IA32_VMX_CR4_FIXED1);
+
+	pr_dbg("%s:cr4 fixed0 = 0x%016lx, fixed1 = 0x%016lx", __func__, fixed0, fixed1);
+	cr4_flexible_bits = (fixed0 ^ fixed1);
+	cr4_passthru_mask &= cr4_flexible_bits;
+	cr4_trap_and_passthru_mask &= cr4_flexible_bits;
+
+	/*
+	 * vcpuid should always consult cr4_reserved_bits_mask when reporting capabilities.
+	 *
+	 * The guest value of the reserved bits is usually identical to the fixed bits, but
+	 * certain exceptions apply, i.e. for CR4_EMULATED_RESERVE_BITS.
+	 */
+	cr4_reserved_bits_mask = ~(cr4_passthru_mask | cr4_trap_and_passthru_mask | CR4_TRAP_AND_EMULATE_BITS);
+
+	/*
+	 * cr4_rsv_bits_guest_value should be in sync with the always-ON bits (1 in both FIXED0/FIXED1 MSRs).
+	 * Refer to SDM Appendix A.8
+	 */
+	cr4_rsv_bits_guest_value = (fixed0 & ~cr4_flexible_bits);
+	initial_guest_cr4 = (cr4_rsv_bits_guest_value & ~CR4_EMULATED_RESERVE_BITS) | CR4_EMRSV_BITS_PHYS_VALUE;
+	cr4_rsv_bits_guest_value = (cr4_rsv_bits_guest_value & ~CR4_EMULATED_RESERVE_BITS) | CR4_EMRSV_BITS_VIRT_VALUE;
+
+	pr_dbg("cr4_flexible_bits:0x%lx, cr4_passthru_mask:0x%lx, cr4_trap_and_passthru_mask:0x%lx.",
+		cr4_flexible_bits, cr4_passthru_mask, cr4_trap_and_passthru_mask);
+	pr_dbg("cr4_reserved_bits_mask:%lx, cr4_rsv_bits_guest_value:%lx, initial_guest_cr4:%lx.\n",
+		cr4_reserved_bits_mask, cr4_rsv_bits_guest_value, initial_guest_cr4);
+}
+
+void init_cr0_cr4_host_guest_mask(void)
+{
+	/*
+	 * "1" means the bit is trapped by the host, and "0" means it is passed through to the guest.
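+	 * e.g. CR0_TS stays guest-owned (mask bit 0), so a guest CLTS or MOV to CR0 that only
+	 * touches TS does not VM-exit, while a write that changes CR0_PG (mask bit 1) causes a
+	 * CR-access VM exit handled by vmx_write_cr0().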
+ */ + exec_vmwrite(VMX_CR0_GUEST_HOST_MASK, ~cr0_passthru_mask); /* all bits except passthrubits are trapped */ + pr_dbg("CR0 guest-host mask value: 0x%016lx", ~cr0_passthru_mask); + + exec_vmwrite(VMX_CR4_GUEST_HOST_MASK, ~cr4_passthru_mask); /* all bits except passthru bits are trapped */ + pr_dbg("CR4 guest-host mask value: 0x%016lx", ~cr4_passthru_mask); } uint64_t vcpu_get_cr0(struct acrn_vcpu *vcpu) { - uint64_t mask; struct run_context *ctx = &vcpu->arch.contexts[vcpu->arch.cur_context].run_ctx; if (bitmap_test_and_set_lock(CPU_REG_CR0, &vcpu->reg_cached) == 0) { - mask = exec_vmread(VMX_CR0_GUEST_HOST_MASK); - ctx->cr0 = (exec_vmread(VMX_CR0_READ_SHADOW) & mask) | - (exec_vmread(VMX_GUEST_CR0) & (~mask)); + ctx->cr0 = (exec_vmread(VMX_CR0_READ_SHADOW) & ~cr0_passthru_mask) | + (exec_vmread(VMX_GUEST_CR0) & cr0_passthru_mask); } return ctx->cr0; } void vcpu_set_cr0(struct acrn_vcpu *vcpu, uint64_t val) { + pr_dbg("%s, value: 0x%016lx rip:%016lx", __func__, val, vcpu_get_rip(vcpu)); vmx_write_cr0(vcpu, val); } @@ -450,21 +523,21 @@ void vcpu_set_cr2(struct acrn_vcpu *vcpu, uint64_t val) vcpu->arch.contexts[vcpu->arch.cur_context].run_ctx.cr2 = val; } +/* This API shall be called after vCPU is created. */ uint64_t vcpu_get_cr4(struct acrn_vcpu *vcpu) { - uint64_t mask; struct run_context *ctx = &vcpu->arch.contexts[vcpu->arch.cur_context].run_ctx; if (bitmap_test_and_set_lock(CPU_REG_CR4, &vcpu->reg_cached) == 0) { - mask = exec_vmread(VMX_CR4_GUEST_HOST_MASK); - ctx->cr4 = (exec_vmread(VMX_CR4_READ_SHADOW) & mask) | - (exec_vmread(VMX_GUEST_CR4) & (~mask)); + ctx->cr4 = (exec_vmread(VMX_CR4_READ_SHADOW) & ~cr4_passthru_mask) | + (exec_vmread(VMX_GUEST_CR4) & cr4_passthru_mask); } return ctx->cr4; } void vcpu_set_cr4(struct acrn_vcpu *vcpu, uint64_t val) { + pr_dbg("%s, value: 0x%016lx rip: %016lx", __func__, val, vcpu_get_rip(vcpu)); vmx_write_cr4(vcpu, val); } @@ -486,6 +559,7 @@ int32_t cr_access_vmexit_handler(struct acrn_vcpu *vcpu) /* mov to cr0 */ vcpu_set_cr0(vcpu, reg); break; + case 0x04UL: /* mov to cr4 */ vcpu_set_cr4(vcpu, reg); @@ -501,3 +575,8 @@ int32_t cr_access_vmexit_handler(struct acrn_vcpu *vcpu) return ret; } + +uint64_t get_cr4_reserved_bits(void) +{ + return cr4_reserved_bits_mask; +} diff --git a/hypervisor/arch/x86/guest/vmcs.c b/hypervisor/arch/x86/guest/vmcs.c index b57ed4a2f..3434d2701 100644 --- a/hypervisor/arch/x86/guest/vmcs.c +++ b/hypervisor/arch/x86/guest/vmcs.c @@ -26,6 +26,8 @@ static void init_guest_vmx(struct acrn_vcpu *vcpu, uint64_t cr0, uint64_t cr3, struct guest_cpu_context *ctx = &vcpu->arch.contexts[vcpu->arch.cur_context]; struct ext_context *ectx = &ctx->ext_ctx; + pr_dbg("%s,cr0:0x%lx, cr4:0x%lx.", __func__, cr0, cr4); + vcpu_set_cr4(vcpu, cr4); vcpu_set_cr0(vcpu, cr0); exec_vmwrite(VMX_GUEST_CR3, cr3); @@ -71,6 +73,9 @@ static void init_guest_state(struct acrn_vcpu *vcpu) { struct guest_cpu_context *ctx = &vcpu->arch.contexts[vcpu->arch.cur_context]; + pr_dbg("%s, cr0:0x%lx, cr4:0x%lx.\n", __func__, + ctx->run_ctx.cr0, ctx->run_ctx.cr4); + init_guest_vmx(vcpu, ctx->run_ctx.cr0, ctx->ext_ctx.cr3, ctx->run_ctx.cr4 & ~(CR4_VMXE | CR4_SMXE | CR4_MCE)); } @@ -411,7 +416,7 @@ static void init_exec_ctrl(struct acrn_vcpu *vcpu) /* Natural-width */ pr_dbg("Natural-width*********"); - init_cr0_cr4_host_mask(); + init_cr0_cr4_host_guest_mask(); /* The CR3 target registers work in concert with VMX_CR3_TARGET_COUNT * field. Using these registers guest CR3 access can be managed. 
i.e., diff --git a/hypervisor/arch/x86/init.c b/hypervisor/arch/x86/init.c index 897434dd2..6533bdb32 100644 --- a/hypervisor/arch/x86/init.c +++ b/hypervisor/arch/x86/init.c @@ -68,6 +68,14 @@ static void init_pcpu_comm_post(void) run_idle_thread(); } +static void init_misc(void) +{ + init_cr0_cr4_flexible_bits(); + if (!sanitize_cr0_cr4_pattern()) { + panic("%s Sanitize pattern of CR0 or CR4 failed.\n", __func__); + } +} + /* NOTE: this function is using temp stack, and after SWITCH_TO(runtime_sp, to) * it will switch to runtime stack. */ @@ -87,6 +95,7 @@ void init_primary_pcpu(void) init_pcpu_pre(true); init_seed(); + init_misc(); /* Switch to run-time stack */ rsp = (uint64_t)(&get_cpu_var(stack)[CONFIG_STACK_SIZE - 1]); diff --git a/hypervisor/common/hypercall.c b/hypervisor/common/hypercall.c index fe38d2cbb..408d1c0a7 100644 --- a/hypervisor/common/hypercall.c +++ b/hypervisor/common/hypercall.c @@ -366,8 +366,10 @@ int32_t hcall_set_vcpu_regs(struct acrn_vm *vm, struct acrn_vm *target_vm, __unu } else { vcpu = vcpu_from_vid(target_vm, vcpu_regs.vcpu_id); if (vcpu->state != VCPU_OFFLINE) { - set_vcpu_regs(vcpu, &(vcpu_regs.vcpu_regs)); - ret = 0; + if (is_valid_cr0_cr4(vcpu_regs.vcpu_regs.cr0, vcpu_regs.vcpu_regs.cr4)) { + set_vcpu_regs(vcpu, &(vcpu_regs.vcpu_regs)); + ret = 0; + } } } } diff --git a/hypervisor/include/arch/x86/guest/vcpu.h b/hypervisor/include/arch/x86/guest/vcpu.h index a73be7b87..1b24d9440 100644 --- a/hypervisor/include/arch/x86/guest/vcpu.h +++ b/hypervisor/include/arch/x86/guest/vcpu.h @@ -536,6 +536,8 @@ void set_vcpu_regs(struct acrn_vcpu *vcpu, struct acrn_vcpu_regs *vcpu_regs); */ void reset_vcpu_regs(struct acrn_vcpu *vcpu); +bool sanitize_cr0_cr4_pattern(void); + /** * @brief Initialize the protect mode vcpu registers * diff --git a/hypervisor/include/arch/x86/guest/virtual_cr.h b/hypervisor/include/arch/x86/guest/virtual_cr.h index b2acd56e2..0f9e73985 100644 --- a/hypervisor/include/arch/x86/guest/virtual_cr.h +++ b/hypervisor/include/arch/x86/guest/virtual_cr.h @@ -12,8 +12,8 @@ * * @brief public APIs for vCR operations */ - -void init_cr0_cr4_host_mask(void); +uint64_t get_cr4_reserved_bits(void); +void init_cr0_cr4_host_guest_mask(void); /** * @brief vCR from vcpu diff --git a/hypervisor/include/arch/x86/vmx.h b/hypervisor/include/arch/x86/vmx.h index 6017adb61..3f2636e26 100644 --- a/hypervisor/include/arch/x86/vmx.h +++ b/hypervisor/include/arch/x86/vmx.h @@ -431,5 +431,8 @@ void exec_vmwrite64(uint32_t field_full, uint64_t value); void exec_vmclear(void *addr); void exec_vmptrld(void *addr); +void init_cr0_cr4_flexible_bits(void); +bool is_valid_cr0_cr4(uint64_t cr0, uint64_t cr4); + #define POSTED_INTR_ON 0U #endif /* VMX_H_ */
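The mask arithmetic introduced above can be seen end to end in the following minimal, self-contained user-space sketch (illustrative only; the MSR values and the reduced bit set are made up, and this is not hypervisor code):

/* cr4_mask_demo.c - mirrors the mask composition of virtual_cr.c with made-up values */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CR4_PAE		(1UL << 5U)
#define CR4_VMXE	(1UL << 13U)
#define CR4_PCIDE	(1UL << 17U)
#define CR4_SMEP	(1UL << 20U)

#define PASSTHRU_BITS		(CR4_PCIDE)
#define TRAP_AND_PASSTHRU_BITS	(CR4_PAE | CR4_SMEP)
#define TRAP_AND_EMULATE_BITS	0UL
#define EMULATED_RESERVE_BITS	(CR4_VMXE)
#define EMRSV_BITS_PHYS_VALUE	(CR4_VMXE)	/* physical CR4 keeps VMXE = 1 */
#define EMRSV_BITS_VIRT_VALUE	0UL		/* guest reads VMXE as 0 */

int main(void)
{
	/* made-up stand-ins for IA32_VMX_CR4_FIXED0/FIXED1: only VMXE is fixed (to 1) */
	uint64_t fixed0 = CR4_VMXE;
	uint64_t fixed1 = CR4_VMXE | CR4_PAE | CR4_SMEP | CR4_PCIDE;

	uint64_t flexible = fixed0 ^ fixed1;
	uint64_t passthru_mask = PASSTHRU_BITS & flexible;
	uint64_t trap_and_passthru_mask = TRAP_AND_PASSTHRU_BITS & flexible;
	uint64_t reserved_mask = ~(passthru_mask | trap_and_passthru_mask | TRAP_AND_EMULATE_BITS);
	uint64_t rsv_guest_value = ((fixed0 & ~flexible) & ~EMULATED_RESERVE_BITS) | EMRSV_BITS_VIRT_VALUE;
	uint64_t initial_guest_cr4 = ((fixed0 & ~flexible) & ~EMULATED_RESERVE_BITS) | EMRSV_BITS_PHYS_VALUE;

	uint64_t guest_write = CR4_PAE | CR4_SMEP;	/* value the guest tries to load */
	bool valid = ((guest_write & reserved_mask) == rsv_guest_value);

	if (valid) {
		/* same composition as vmx_write_cr4(): passthru and trap-and-passthru bits come
		 * from the guest value, everything else from the initial (physical) value */
		uint64_t mask = trap_and_passthru_mask | passthru_mask;
		uint64_t guest_cr4 = (initial_guest_cr4 & ~mask) | (guest_write & mask);

		printf("GUEST_CR4 = 0x%llx, CR4_READ_SHADOW = 0x%llx\n",
			(unsigned long long)guest_cr4, (unsigned long long)guest_write);
	} else {
		printf("write 0x%llx would inject #GP\n", (unsigned long long)guest_write);
	}
	return 0;
}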