diff --git a/hypervisor/arch/x86/guest/vcpu.c b/hypervisor/arch/x86/guest/vcpu.c
index 7ebf64252..a0f1ee607 100644
--- a/hypervisor/arch/x86/guest/vcpu.c
+++ b/hypervisor/arch/x86/guest/vcpu.c
@@ -158,6 +158,9 @@ int start_vcpu(struct vcpu *vcpu)
         /* Save guest CR3 register */
         cur_context->cr3 = exec_vmread(VMX_GUEST_CR3);
 
+        /* Save guest IA32_EFER register */
+        cur_context->ia32_efer = exec_vmread64(VMX_GUEST_IA32_EFER_FULL);
+
         /* Obtain current VCPU instruction pointer and length */
         cur_context->rip = exec_vmread(VMX_GUEST_RIP);
         vcpu->arch_vcpu.inst_len = exec_vmread(VMX_EXIT_INSTR_LEN);
diff --git a/hypervisor/arch/x86/vmx.c b/hypervisor/arch/x86/vmx.c
index 79d2ccaed..be8e1d27b 100644
--- a/hypervisor/arch/x86/vmx.c
+++ b/hypervisor/arch/x86/vmx.c
@@ -19,6 +19,13 @@ extern struct efi_ctx* efi_ctx;
                 ((uint64_t)PAT_MEM_TYPE_UCM << 48) + \
                 ((uint64_t)PAT_MEM_TYPE_UC << 56))
 
+static uint32_t cr0_host_mask;
+static uint32_t cr0_always_on_mask;
+static uint32_t cr0_always_off_mask;
+static uint32_t cr4_host_mask;
+static uint32_t cr4_always_on_mask;
+static uint32_t cr4_always_off_mask;
+
 static inline int exec_vmxon(void *addr)
 {
         uint64_t rflags;
@@ -216,85 +223,203 @@ uint32_t get_cs_access_rights(void)
         return usable_ar;
 }
 
-int vmx_write_cr0(struct vcpu *vcpu, uint64_t value)
+static void init_cr0_cr4_host_mask(__unused struct vcpu *vcpu)
 {
-        uint32_t value32;
-        uint64_t value64;
+        static bool inited = false;
+        uint32_t fixed0, fixed1;
+        if (!inited) {
+                /* Read the CR0 fixed0 / fixed1 MSR registers */
+                fixed0 = msr_read(MSR_IA32_VMX_CR0_FIXED0);
+                fixed1 = msr_read(MSR_IA32_VMX_CR0_FIXED1);
 
-        pr_dbg("VMM: Guest trying to write 0x%08x to CR0", value);
+                cr0_host_mask = ~(fixed0 ^ fixed1);
+                /* Add the bits the hypervisor wants to trap */
+                cr0_host_mask |= CR0_TRAP_MASK;
+                /* Clear PE/PG from the CR0 always-on bits, since both may be
+                 * toggled under the "unrestricted guest" feature */
+                cr0_always_on_mask = fixed0 & (~(CR0_PE | CR0_PG));
+                cr0_always_off_mask = ~fixed1;
 
-        /* Read host mask value */
-        value64 = exec_vmread(VMX_CR0_MASK);
-        /* Clear all bits being written by guest that are owned by host */
-        value &= ~value64;
+                /* Read the CR4 fixed0 / fixed1 MSR registers */
+                fixed0 = msr_read(MSR_IA32_VMX_CR4_FIXED0);
+                fixed1 = msr_read(MSR_IA32_VMX_CR4_FIXED1);
 
-        /* Update CR0 in guest state */
-        vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context].cr0 |= value;
-        exec_vmwrite(VMX_GUEST_CR0,
-                vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context].cr0);
-        pr_dbg("VMM: Guest allowed to write 0x%08x to CR0",
-                vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context].cr0);
-
-        /* If guest is trying to transition vcpu from unpaged real mode to page
-         * protected mode make necessary changes to VMCS structure to reflect
-         * transition from real mode to paged-protected mode
-         */
-        if (!is_vcpu_bsp(vcpu) &&
-                (vcpu->arch_vcpu.cpu_mode == CPU_MODE_REAL) &&
-                (value & CR0_PG) && (value & CR0_PE)) {
-                /* Enable protected mode */
-                value32 = exec_vmread(VMX_ENTRY_CONTROLS);
-                value32 |= (VMX_ENTRY_CTLS_IA32E_MODE |
-                        VMX_ENTRY_CTLS_LOAD_PAT |
-                        VMX_ENTRY_CTLS_LOAD_EFER);
-                exec_vmwrite(VMX_ENTRY_CONTROLS, value32);
-                pr_dbg("VMX_ENTRY_CONTROLS: 0x%x ", value32);
-
-                /* Set up EFER */
-                value64 = exec_vmread64(VMX_GUEST_IA32_EFER_FULL);
-                value64 |= (MSR_IA32_EFER_SCE_BIT |
-                        MSR_IA32_EFER_LME_BIT |
-                        MSR_IA32_EFER_LMA_BIT | MSR_IA32_EFER_NXE_BIT);
-                exec_vmwrite64(VMX_GUEST_IA32_EFER_FULL, value64);
-                pr_dbg("VMX_GUEST_IA32_EFER: 0x%016llx ", value64);
+                cr4_host_mask = ~(fixed0 ^ fixed1);
+                /* Add the bits the hypervisor wants to trap */
+                cr4_host_mask |= CR4_TRAP_MASK;
+                cr4_always_on_mask = fixed0;
+                /* Record the bits fixed to 0 for CR4, including reserved bits */
+                cr4_always_off_mask = ~fixed1;
+                inited = true;
         }
+
+        exec_vmwrite(VMX_CR0_MASK, cr0_host_mask);
+        /* Output CR0 mask value */
+        pr_dbg("CR0 mask value: 0x%x", cr0_host_mask);
+
+        exec_vmwrite(VMX_CR4_MASK, cr4_host_mask);
+        /* Output CR4 mask value */
+        pr_dbg("CR4 mask value: 0x%x", cr4_host_mask);
+}
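[Reviewer note, not part of the patch] For context on init_cr0_cr4_host_mask():
under VMX, a CR bit is "flexible" only where the IA32_VMX_CR0/CR4_FIXED0 and
FIXED1 MSRs disagree; every other bit must stay host-owned. A minimal
standalone sketch of that arithmetic, with illustrative MSR readings
(0x80000021, i.e. PG|NE|PE, and 0xFFFFFFFF are typical CR0 values, but real
code must use msr_read()):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* Illustrative readings only; real values come from rdmsr */
            uint32_t fixed0 = 0x80000021u; /* bits that must be 1: PG|NE|PE */
            uint32_t fixed1 = 0xffffffffu; /* bits that may be 1 */

            /* A bit is flexible iff fixed0 and fixed1 disagree on it */
            uint32_t flexible = fixed0 ^ fixed1;   /* 0x7fffffde */
            uint32_t host_mask = ~flexible;        /* 0x80000021 */

            /* The patch additionally traps PE (bit 0), WP (bit 16) and
             * PG (bit 31), i.e. CR0_TRAP_MASK */
            host_mask |= (1u << 0) | (1u << 16) | (1u << 31);

            /* Prints: flexible=0x7fffffde host_mask=0x80010021 */
            printf("flexible=0x%08x host_mask=0x%08x\n", flexible, host_mask);
            return 0;
    }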
+
+/*
+ * Handling of CR0:
+ * Assume the "unrestricted guest" feature is supported by vmx.
+ * Thanks to the "unrestricted guest" feature, for a mode switch the
+ * hypervisor only needs to take care of enabling/disabling long mode.
+ *
+ * - PE (0)  Trapped to track cpu mode.
+ *           Set the value according to the value from guest.
+ * - MP (1)  Flexible to guest
+ * - EM (2)  Flexible to guest
+ * - TS (3)  Flexible to guest
+ * - ET (4)  Flexible to guest
+ * - NE (5)  must always be 1
+ * - WP (16) Trapped to know whether supervisor-level procedures are
+ *           inhibited from writing into read-only pages.
+ * - AM (18) Flexible to guest
+ * - NW (29) Flexible to guest
+ * - CD (30) Flexible to guest
+ * - PG (31) Trapped to track cpu/paging mode.
+ *           Set the value according to the value from guest.
+ */
+int vmx_write_cr0(struct vcpu *vcpu, uint64_t cr0)
+{
+        struct run_context *context =
+                &vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context];
+        uint64_t cr0_vmx;
+        uint32_t entry_ctrls;
+        bool paging_enabled = !!(context->cr0 & CR0_PG);
+
+        if (cr0 & (cr0_always_off_mask | CR0_RESERVED_MASK)) {
+                pr_err("Not allowed to set always-off / reserved bits for CR0");
+                vcpu_inject_gp(vcpu, 0);
+                return -EINVAL;
+        }
+
+        /* TODO: Check all guest states that become invalid with this CR0
+         * change, and inject a #GP to the guest */
+
+        if ((context->ia32_efer & MSR_IA32_EFER_LME_BIT) &&
+                        !paging_enabled && (cr0 & CR0_PG)) {
+                if (!(context->cr4 & CR4_PAE)) {
+                        pr_err("Can't enable long mode when PAE is disabled");
+                        vcpu_inject_gp(vcpu, 0);
+                        return -EINVAL;
+                }
+                /* Enable long mode */
+                pr_dbg("VMM: Enable long mode");
+                entry_ctrls = exec_vmread(VMX_ENTRY_CONTROLS);
+                entry_ctrls |= VMX_ENTRY_CTLS_IA32E_MODE;
+                exec_vmwrite(VMX_ENTRY_CONTROLS, entry_ctrls);
+
+                context->ia32_efer |= MSR_IA32_EFER_LMA_BIT;
+                exec_vmwrite64(VMX_GUEST_IA32_EFER_FULL, context->ia32_efer);
+        } else if ((context->ia32_efer & MSR_IA32_EFER_LME_BIT) &&
+                        paging_enabled && !(cr0 & CR0_PG)) {
+                /* Disable long mode */
+                pr_dbg("VMM: Disable long mode");
+                entry_ctrls = exec_vmread(VMX_ENTRY_CONTROLS);
+                entry_ctrls &= ~VMX_ENTRY_CTLS_IA32E_MODE;
+                exec_vmwrite(VMX_ENTRY_CONTROLS, entry_ctrls);
+
+                context->ia32_efer &= ~MSR_IA32_EFER_LMA_BIT;
+                exec_vmwrite64(VMX_GUEST_IA32_EFER_FULL, context->ia32_efer);
+        }
+
+        /* Apart from the always-on bits and reserved bits filtered above,
+         * CR0 has no always-off bits, so set the remaining bits according
+         * to the guest.
+         */
+        cr0_vmx = cr0_always_on_mask | cr0;
+        exec_vmwrite(VMX_GUEST_CR0, cr0_vmx & 0xFFFFFFFFUL);
+        exec_vmwrite(VMX_CR0_READ_SHADOW, cr0 & 0xFFFFFFFFUL);
+        context->cr0 = cr0;
+
+        pr_dbg("VMM: Try to write 0x%08x, allow to write 0x%08x to CR0",
+                cr0, cr0_vmx);
+
         return 0;
 }
 
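[Reviewer note, not part of the patch] The if/else-if pair above follows the
IA-32e rule from SDM Vol. 3: EFER.LMA tracks EFER.LME & CR0.PG, and turning
paging on while LME=1 additionally requires CR4.PAE=1. A condensed
restatement as a pure function, with locally defined bit constants rather
than the hypervisor's headers:

    #include <stdbool.h>
    #include <stdint.h>

    #define CR0_PG   (1ull << 31)
    #define CR4_PAE  (1ull << 5)
    #define EFER_LME (1ull << 8)

    /* Returns 1 to enter long mode, -1 to leave it, 0 for no change
     * (including the case where PAE is still clear, which injects #GP) */
    static int long_mode_transition(uint64_t efer, uint64_t old_cr0,
                                    uint64_t new_cr0, uint64_t cr4)
    {
            bool was_paged = (old_cr0 & CR0_PG) != 0;
            bool now_paged = (new_cr0 & CR0_PG) != 0;

            if (!(efer & EFER_LME))
                    return 0;       /* long mode not requested */
            if (!was_paged && now_paged)
                    return (cr4 & CR4_PAE) ? 1 : 0; /* 0 => inject #GP */
            if (was_paged && !now_paged)
                    return -1;
            return 0;
    }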
-int vmx_write_cr3(struct vcpu *vcpu, uint64_t value)
+int vmx_write_cr3(struct vcpu *vcpu, uint64_t cr3)
 {
+        struct run_context *context =
+                &vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context];
         /* Write to guest's CR3 */
-        vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context].cr3 = value;
+        context->cr3 = cr3;
 
         /* Commit new value to VMCS */
-        exec_vmwrite(VMX_GUEST_CR3,
-                vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context].cr3);
+        exec_vmwrite(VMX_GUEST_CR3, cr3);
 
         return 0;
 }
 
-int vmx_write_cr4(struct vcpu *vcpu, uint64_t value)
+/*
+ * Handling of CR4:
+ * Assume the "unrestricted guest" feature is supported by vmx.
+ *
+ * For CR4, if some feature is not supported by hardware, the corresponding
+ * bit will be set in cr4_always_off_mask. If the guest tries to set any of
+ * these bits after a VM exit, a #GP is injected.
+ * If a bit for a feature not supported by hardware is flexible to the guest
+ * and a write to it does not lead to a VM exit, the #GP should be generated
+ * inside the guest.
+ *
+ * - VME (0)         Flexible to guest
+ * - PVI (1)         Flexible to guest
+ * - TSD (2)         Flexible to guest
+ * - DE (3)          Flexible to guest
+ * - PSE (4)         Trapped to track paging mode.
+ *                   Set the value according to the value from guest.
+ * - PAE (5)         Trapped to track paging mode.
+ *                   Set the value according to the value from guest.
+ * - MCE (6)         Flexible to guest
+ * - PGE (7)         Flexible to guest
+ * - PCE (8)         Flexible to guest
+ * - OSFXSR (9)      Flexible to guest
+ * - OSXMMEXCPT (10) Flexible to guest
+ * - VMXE (13)       must always be 1 => must lead to a VM exit
+ * - SMXE (14)       must always be 0 => must lead to a VM exit
+ * - PCIDE (17)      Flexible to guest
+ * - OSXSAVE (18)    Flexible to guest
+ * - SMEP (20)       Flexible to guest
+ * - SMAP (21)       Flexible to guest
+ * - PKE (22)        Flexible to guest
+ */
+int vmx_write_cr4(struct vcpu *vcpu, uint64_t cr4)
 {
-        uint64_t temp64;
+        struct run_context *context =
+                &vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context];
+        uint64_t cr4_vmx;
 
-        pr_dbg("VMM: Guest trying to write 0x%08x to CR4", value);
+        /* TODO: Check all guest states that become invalid with this CR4
+         * change, and inject a #GP to the guest */
 
-        /* Read host mask value */
-        temp64 = exec_vmread(VMX_CR4_MASK);
+        /* Check if the guest tries to set always-off bits or reserved bits */
+        if (cr4 & cr4_always_off_mask) {
+                pr_err("Not allowed to set reserved / always-off bits for CR4");
+                vcpu_inject_gp(vcpu, 0);
+                return -EINVAL;
+        }
 
-        /* Clear all bits being written by guest that are owned by host */
-        value &= ~temp64;
+        /* Do NOT support nested guests */
+        if (cr4 & CR4_VMXE) {
+                pr_err("Nested guests not supported");
+                vcpu_inject_gp(vcpu, 0);
+                return -EINVAL;
+        }
 
-        /* Write updated CR4 (bitwise OR of allowed guest bits and CR4 host
-         * value)
-         */
-        vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context].cr4 |= value;
-        exec_vmwrite(VMX_GUEST_CR4,
-                vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context].cr4);
-        pr_dbg("VMM: Guest allowed to write 0x%08x to CR4",
-                vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context].cr4);
+        /* Always-off bits and reserved bits have been filtered above */
+        cr4_vmx = cr4_always_on_mask | cr4;
+        exec_vmwrite(VMX_GUEST_CR4, cr4_vmx & 0xFFFFFFFFUL);
+        exec_vmwrite(VMX_CR4_READ_SHADOW, cr4 & 0xFFFFFFFFUL);
+        context->cr4 = cr4;
+
+        pr_dbg("VMM: Try to write 0x%08x, allow to write 0x%08x to CR4",
+                cr4, cr4_vmx);
 
         return 0;
 }
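[Reviewer note, not part of the patch] The reason both functions write the
guest's raw value into VMX_CR*_READ_SHADOW: on a non-exiting guest CR read,
the hardware returns the shadow bit wherever the guest/host mask bit is 1,
and the live register bit wherever it is 0 (SDM Vol. 3, "Guest/Host Masks
and Read Shadows for CR0 and CR4"). So the guest reads back exactly what it
wrote, even though the real CR also carries the always-on bits. Modeled in
one expression:

    #include <stdint.h>

    /* What a guest CR read returns under CR0/CR4 masking */
    static uint64_t guest_visible_cr(uint64_t real_cr, uint64_t shadow,
                                     uint64_t mask)
    {
            return (shadow & mask) | (real_cr & ~mask);
    }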
@@ -904,7 +1029,7 @@ static void init_host_state(__unused struct vcpu *vcpu)
 
 static void init_exec_ctrl(struct vcpu *vcpu)
 {
-        uint32_t value32, fixed0, fixed1;
+        uint32_t value32;
         uint64_t value64;
         struct vm *vm = (struct vm *) vcpu->vm;
 
@@ -1081,89 +1206,7 @@ static void init_exec_ctrl(struct vcpu *vcpu)
         /* Natural-width */
         pr_dbg("Natural-width*********");
 
-        /* Read the CR0 fixed0 / fixed1 MSR registers */
-        fixed0 = msr_read(MSR_IA32_VMX_CR0_FIXED0);
-        fixed1 = msr_read(MSR_IA32_VMX_CR0_FIXED1);
-
-        if (get_vcpu_mode(vcpu) == CPU_MODE_REAL) {
-                /* Check to see if unrestricted guest support is available */
-                if (msr_read(MSR_IA32_VMX_MISC) & (1 << 5)) {
-                        /* Adjust fixed bits as they can/will reflect incorrect
-                         * settings that ARE valid in unrestricted guest mode.
-                         * Both PG and PE bits can bit changed in unrestricted
-                         * guest mode.
-                         */
-                        fixed0 &= ~(CR0_PG | CR0_PE);
-                        fixed1 |= (CR0_PG | CR0_PE);
-
-                        /* Log success for unrestricted mode being present */
-                        pr_dbg("Unrestricted support is available. ");
-                } else {
-                        /* Log failure for unrestricted mode NOT being
-                         * present
-                         */
-                        pr_err("Error: Unrestricted support is not available");
-                        /* IA32_VMX_MISC bit 5 clear */
-                }
-
-        }
-
-        /* (get_vcpu_mode(vcpu) == CPU_MODE_REAL) */
-        /* Output fixed CR0 values */
-        pr_dbg("Fixed0 CR0 value: 0x%x", fixed0);
-        pr_dbg("Fixed1 CR0 value: 0x%x", fixed1);
-
-        /* Determine which bits are "flexible" in CR0 - allowed to be changed
-         * as per arch manual in VMX operation. Any bits that are different
-         * between fixed0 and fixed1 are "flexible" and the guest can change.
-         */
-        value32 = fixed0 ^ fixed1;
-
-        /* Set the CR0 mask to the inverse of the "flexible" bits */
-        value32 = ~value32;
-        exec_vmwrite(VMX_CR0_MASK, value32);
-
-        /* Output CR0 mask value */
-        pr_dbg("CR0 mask value: 0x%x", value32);
-
-        /* Calculate the CR0 shadow register value that will be used to enforce
-         * the correct values for host owned bits
-         */
-        value32 = (fixed0 | fixed1) & value32;
-        exec_vmwrite(VMX_CR0_READ_SHADOW, value32);
-
-        /* Output CR0 shadow value */
-        pr_dbg("CR0 shadow value: 0x%x", value32);
-
-        /* Read the CR4 fixed0 / fixed1 MSR registers */
-        fixed0 = msr_read(MSR_IA32_VMX_CR4_FIXED0);
-        fixed1 = msr_read(MSR_IA32_VMX_CR4_FIXED1);
-
-        /* Output fixed CR0 values */
-        pr_dbg("Fixed0 CR4 value: 0x%x", fixed0);
-        pr_dbg("Fixed1 CR4 value: 0x%x", fixed1);
-
-        /* Determine which bits are "flexible" in CR4 - allowed to be changed
-         * as per arch manual in VMX operation. Any bits that are different
-         * between fixed0 and fixed1 are "flexible" and the guest can change.
-         */
-        value32 = fixed0 ^ fixed1;
-
-        /* Set the CR4 mask to the inverse of the "flexible" bits */
-        value32 = ~value32;
-        exec_vmwrite(VMX_CR4_MASK, value32);
-
-        /* Output CR4 mask value */
-        pr_dbg("CR4 mask value: 0x%x", value32);
-
-        /* Calculate the CR4 shadow register value that will be used to enforce
-         * the correct values for host owned bits
-         */
-        value32 = (fixed0 | fixed1) & value32;
-        exec_vmwrite(VMX_CR4_READ_SHADOW, value32);
-
-        /* Output CR4 shadow value */
-        pr_dbg("CR4 shadow value: 0x%x", value32);
+        init_cr0_cr4_host_mask(vcpu);
 
         /* The CR3 target registers work in concert with VMX_CR3_TARGET_COUNT
          * field. Using these registers guest CR3 access can be managed. i.e.,
@@ -1382,8 +1425,9 @@ int init_vmcs(struct vcpu *vcpu)
 
         /* Initialize the Virtual Machine Control Structure (VMCS) */
         init_host_state(vcpu);
-        init_guest_state(vcpu);
+        /* init_exec_ctrl needs to run before init_guest_state */
         init_exec_ctrl(vcpu);
+        init_guest_state(vcpu);
         init_entry_ctrl(vcpu);
         init_exit_ctrl(vcpu);
diff --git a/hypervisor/include/arch/x86/vmx.h b/hypervisor/include/arch/x86/vmx.h
index 9fa3cb0dd..27987f502 100644
--- a/hypervisor/include/arch/x86/vmx.h
+++ b/hypervisor/include/arch/x86/vmx.h
@@ -382,52 +382,13 @@
 #define RFLAGS_C (1<<0)
 #define RFLAGS_Z (1<<6)
 
-/*
- * Handling of CR0:
- *
- * - PE (0) Must always be 1. Attempt to write to it must lead to a VM exit.
- * - MP (1) coprocessor related => no action needed
- * - EM (2) coprocessor related => no action needed
- * - TS (3) no action needed
- * - ET (4) typically hardcoded to 1. => no action needed
- * - NE (5) coprocessor related => no action needed
- * - WP (16) inhibits supervisor level procedures to write into ro-pages
- *   => no action needed
- * - AM (18) alignment mask => no action needed
- * - NW (29) not write through => no action
- * - CD (30) cache disable => no action
- * - PG (31) paging => must always be 1. Attempt to write to it must lead to
- *   a VM exit.
- */
+/* CR0 bits the hypervisor wants to trap, to track status changes */
+#define CR0_TRAP_MASK (CR0_PE | CR0_PG | CR0_WP)
+#define CR0_RESERVED_MASK ~(CR0_PG | CR0_CD | CR0_NW | CR0_AM | CR0_WP | \
+                CR0_NE | CR0_ET | CR0_TS | CR0_EM | CR0_MP | CR0_PE)
 
-/* we must guard protected mode and paging */
-#define CR0_GUEST_HOST_MASK (CR0_PE | CR0_PG | CR0_WP)
-/* initially, the guest runs in protected mode enabled, but with no paging */
-#define CR0_READ_SHADOW CR0_PE
-
-/*
- * Handling of CR4:
- *
- * - VME (0) must always be 0 => must lead to a VM exit
- * - PVI (1) must always be 0 => must lead to a VM exit
- * - TSD (2) don't care
- * - DE (3) don't care
- * - PSE (4) must always be 1 => must lead to a VM exit
- * - PAE (5) must always be 0 => must lead to a VM exit
- * - MCE (6) don't care
- * - PGE (7) => important for TLB flush
- * - PCE (8) don't care
- * - OSFXSR (9) don't care
- * - OSXMMEXCPT (10) don't care
- * - VMXE (13) must always be 1 => must lead to a VM exit
- * - SMXE (14) must always be 0 => must lead to a VM exit
- * - PCIDE (17) => important for TLB flush
- * - OSXSAVE (18) don't care
- */
-
-#define CR4_GUEST_HOST_MASK (CR4_VME | CR4_PVI | CR4_PSE | CR4_PAE | \
-                CR4_VMXE | CR4_SMXE | CR4_PGE | CR4_PCIDE)
-#define CR4_READ_SHADOW (CR4_PGE | CR4_PSE)
+/* CR4 bits the hypervisor wants to trap, to track status changes */
+#define CR4_TRAP_MASK (CR4_PSE | CR4_PAE)
 
 /* External Interfaces */
 int exec_vmxon_instr(void);
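[Reviewer note, not part of the patch] Sanity check on the new
CR0_RESERVED_MASK: with the architectural CR0 bit positions, the defined
bits sum to 0xE005003F, so the low 32 bits of the reserved mask are
0x1FFAFFC0. A small self-test with the bit constants redefined locally:

    #include <assert.h>
    #include <stdint.h>

    #define CR0_PE (1u << 0)
    #define CR0_MP (1u << 1)
    #define CR0_EM (1u << 2)
    #define CR0_TS (1u << 3)
    #define CR0_ET (1u << 4)
    #define CR0_NE (1u << 5)
    #define CR0_WP (1u << 16)
    #define CR0_AM (1u << 18)
    #define CR0_NW (1u << 29)
    #define CR0_CD (1u << 30)
    #define CR0_PG (1u << 31)

    #define CR0_RESERVED_MASK ~(CR0_PG | CR0_CD | CR0_NW | CR0_AM | \
                    CR0_WP | CR0_NE | CR0_ET | CR0_TS | CR0_EM | \
                    CR0_MP | CR0_PE)

    int main(void)
    {
            /* Any write touching a reserved bit takes the #GP path in
             * vmx_write_cr0() */
            assert((uint32_t)CR0_RESERVED_MASK == 0x1ffaffc0u);
            assert((CR0_RESERVED_MASK & CR0_PG) == 0);
            return 0;
    }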