initial import

internal commit: 14ac2bc2299032fa6714d1fefa7cf0987b3e3085

Signed-off-by: Eddie Dong <eddie.dong@intel.com>
Author: Eddie Dong
Date: 2018-03-07 20:57:14 +08:00
Committed by: lijinxia
Parent: bd31b1c53e
Commit: 7a3a539b17
156 changed files with 41265 additions and 0 deletions

1015  hypervisor/arch/x86/assign.c  (new file; diff suppressed because it is too large)

650  hypervisor/arch/x86/cpu.c  (new file)

@@ -0,0 +1,650 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <hypervisor.h>
#include <hv_lib.h>
#include <acrn_common.h>
#include <bsp_extern.h>
#include <hv_arch.h>
#include <schedule.h>
#include <version.h>
#include <hv_debug.h>
#ifdef CONFIG_EFI_STUB
extern uint32_t efi_physical_available_ap_bitmap;
#endif
uint64_t tsc_clock_freq = 1000000000;
spinlock_t cpu_secondary_spinlock = {
.head = 0,
.tail = 0
};
spinlock_t up_count_spinlock = {
.head = 0,
.tail = 0
};
void *per_cpu_data_base_ptr;
int phy_cpu_num;
unsigned long pcpu_sync = 0;
uint32_t up_count = 0;
DEFINE_CPU_DATA(uint8_t[STACK_SIZE], stack) __aligned(16);
DEFINE_CPU_DATA(uint8_t, lapic_id);
DEFINE_CPU_DATA(void *, vcpu);
DEFINE_CPU_DATA(int, state);
/* TODO: add more capability per requirement */
struct cpu_capability {
bool tsc_adjust_supported;
bool ibrs_ibpb_supported;
bool stibp_supported;
bool apicv_supported;
bool monitor_supported;
};
static struct cpu_capability cpu_caps;
static void apicv_cap_detect(void);
static void cpu_set_logical_id(uint32_t logical_id);
static void print_hv_banner(void);
bool check_monitor_support(void);
int cpu_find_logical_id(uint32_t lapic_id);
#ifndef CONFIG_EFI_STUB
static void start_cpus();
#endif
static void pcpu_sync_sleep(unsigned long *sync, int mask_bit);
int ibrs_type;
static void check_cpu_capability(void)
{
uint32_t eax, ebx, ecx, edx;
memset(&cpu_caps, 0, sizeof(struct cpu_capability));
cpuid(CPUID_EXTEND_FEATURE, &eax, &ebx, &ecx, &edx);
cpu_caps.tsc_adjust_supported = (ebx & CPUID_EBX_TSC_ADJ) ?
(true) : (false);
cpu_caps.ibrs_ibpb_supported = (edx & CPUID_EDX_IBRS_IBPB) ?
(true) : (false);
cpu_caps.stibp_supported = (edx & CPUID_EDX_STIBP) ?
(true) : (false);
/* For speculation defence.
* The default way is to set IBRS at vmexit and then do IBPB at vcpu
* context switch (ibrs_type == IBRS_RAW).
* Now provide an optimized way (ibrs_type == IBRS_OPT) which sets STIBP
* and does IBPB at vmexit, since having STIBP always set has less
* impact than having IBRS always set. Also, since IBPB is already done
* at vmexit, it is not necessary to do it again at vcpu context switch.
*/
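/* Summary of the ibrs_type values chosen below:
* IBRS_NONE - no IBRS/IBPB based mitigation
* IBRS_RAW  - IBRS set at vmexit, IBPB at vcpu context switch
* IBRS_OPT  - STIBP set and IBPB done at vmexit
*/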
ibrs_type = IBRS_NONE;
/* Currently for APL, if we enabled retpoline, then IBRS should not
* take effect
* TODO: add IA32_ARCH_CAPABILITIES[1] check, if this bit is set, IBRS
* should be set all the time instead of relying on retpoline
*/
#ifndef CONFIG_RETPOLINE
if (cpu_caps.ibrs_ibpb_supported) {
ibrs_type = IBRS_RAW;
if (cpu_caps.stibp_supported)
ibrs_type = IBRS_OPT;
}
#endif
}
bool check_tsc_adjust_support(void)
{
return cpu_caps.tsc_adjust_supported;
}
bool check_ibrs_ibpb_support(void)
{
return cpu_caps.ibrs_ibpb_supported;
}
bool check_stibp_support(void)
{
return cpu_caps.stibp_supported;
}
static void alloc_phy_cpu_data(int pcpu_num)
{
phy_cpu_num = pcpu_num;
per_cpu_data_base_ptr = calloc(1, PER_CPU_DATA_SIZE * pcpu_num);
ASSERT(per_cpu_data_base_ptr != NULL, "");
}
int __attribute__((weak)) parse_madt(uint8_t *lapic_id_base)
{
static const uint32_t lapic_id[] = {0, 2, 4, 6};
uint32_t i;
for (i = 0; i < ARRAY_SIZE(lapic_id); i++)
*lapic_id_base++ = lapic_id[i];
return ARRAY_SIZE(lapic_id);
}
static int init_phy_cpu_storage(void)
{
int i, pcpu_num = 0;
int bsp_cpu_id;
uint8_t bsp_lapic_id = 0;
uint8_t *lapic_id_base;
/*
* allocate memory to save all lapic_ids detected in parse_madt.
* We allocate one 4KB page, which can hold lapic_id info for up
* to 4K CPUs.
*/
lapic_id_base = alloc_page(CPU_PAGE_SIZE);
ASSERT(lapic_id_base != NULL, "fail to alloc page");
pcpu_num = parse_madt(lapic_id_base);
alloc_phy_cpu_data(pcpu_num);
for (i = 0; i < pcpu_num; i++) {
per_cpu(lapic_id, i) = *lapic_id_base++;
#ifdef CONFIG_EFI_STUB
efi_physical_available_ap_bitmap |= 1 << per_cpu(lapic_id, i);
#endif
}
/* free memory after lapic_id are saved in per_cpu data */
free(lapic_id_base);
bsp_lapic_id = get_cur_lapic_id();
#ifdef CONFIG_EFI_STUB
efi_physical_available_ap_bitmap &= ~(1 << bsp_lapic_id);
#endif
bsp_cpu_id = cpu_find_logical_id(bsp_lapic_id);
ASSERT(bsp_cpu_id >= 0, "fail to get phy cpu id");
return bsp_cpu_id;
}
static void cpu_set_current_state(uint32_t logical_id, int state)
{
spinlock_obtain(&up_count_spinlock);
/* Check if state is initializing */
if (state == CPU_STATE_INITIALIZING) {
/* Increment CPU up count */
up_count++;
/* Save this CPU's logical ID to the TSC AUX MSR */
cpu_set_logical_id(logical_id);
}
/* Set state for the specified CPU */
per_cpu(state, logical_id) = state;
spinlock_release(&up_count_spinlock);
}
#ifdef STACK_PROTECTOR
struct stack_canary {
/* Gcc generates extra code, using [fs:40] to access canary */
uint8_t reserved[40];
uint64_t canary;
};
static DEFINE_CPU_DATA(struct stack_canary, stack_canary);
static uint64_t get_random_value(void)
{
uint64_t random = 0;
asm volatile ("1: rdrand %%rax\n"
"jnc 1b\n"
"mov %%rax, %0\n"
: "=r"(random) :: );
return random;
}
static void set_fs_base(void)
{
struct stack_canary *psc = &get_cpu_var(stack_canary);
psc->canary = get_random_value();
msr_write(MSR_IA32_FS_BASE, (uint64_t)psc);
}
#endif
void bsp_boot_init(void)
{
#ifdef HV_DEBUG
uint64_t start_tsc = rdtsc();
#endif
/* Clear BSS */
memset(_ld_bss_start, 0, _ld_bss_end - _ld_bss_start);
/* Build-time sanity checks to make sure the hard-coded offsets
* match the actual structure offsets!
*/
STATIC_ASSERT(offsetof(struct cpu_regs, rax) ==
VMX_MACHINE_T_GUEST_RAX_OFFSET);
STATIC_ASSERT(offsetof(struct cpu_regs, rbx) ==
VMX_MACHINE_T_GUEST_RBX_OFFSET);
STATIC_ASSERT(offsetof(struct cpu_regs, rcx) ==
VMX_MACHINE_T_GUEST_RCX_OFFSET);
STATIC_ASSERT(offsetof(struct cpu_regs, rdx) ==
VMX_MACHINE_T_GUEST_RDX_OFFSET);
STATIC_ASSERT(offsetof(struct cpu_regs, rbp) ==
VMX_MACHINE_T_GUEST_RBP_OFFSET);
STATIC_ASSERT(offsetof(struct cpu_regs, rsi) ==
VMX_MACHINE_T_GUEST_RSI_OFFSET);
STATIC_ASSERT(offsetof(struct cpu_regs, rdi) ==
VMX_MACHINE_T_GUEST_RDI_OFFSET);
STATIC_ASSERT(offsetof(struct cpu_regs, r8) ==
VMX_MACHINE_T_GUEST_R8_OFFSET);
STATIC_ASSERT(offsetof(struct cpu_regs, r9) ==
VMX_MACHINE_T_GUEST_R9_OFFSET);
STATIC_ASSERT(offsetof(struct cpu_regs, r10) ==
VMX_MACHINE_T_GUEST_R10_OFFSET);
STATIC_ASSERT(offsetof(struct cpu_regs, r11) ==
VMX_MACHINE_T_GUEST_R11_OFFSET);
STATIC_ASSERT(offsetof(struct cpu_regs, r12) ==
VMX_MACHINE_T_GUEST_R12_OFFSET);
STATIC_ASSERT(offsetof(struct cpu_regs, r13) ==
VMX_MACHINE_T_GUEST_R13_OFFSET);
STATIC_ASSERT(offsetof(struct cpu_regs, r14) ==
VMX_MACHINE_T_GUEST_R14_OFFSET);
STATIC_ASSERT(offsetof(struct cpu_regs, r15) ==
VMX_MACHINE_T_GUEST_R15_OFFSET);
STATIC_ASSERT(offsetof(struct run_context, cr2) ==
VMX_MACHINE_T_GUEST_CR2_OFFSET);
STATIC_ASSERT(offsetof(struct run_context, ia32_spec_ctrl) ==
VMX_MACHINE_T_GUEST_SPEC_CTRL_OFFSET);
/* Initialize the hypervisor paging */
init_paging();
early_init_lapic();
init_phy_cpu_storage();
load_gdtr_and_tr();
/* Switch to run-time stack */
CPU_SP_WRITE(&get_cpu_var(stack)[STACK_SIZE - 1]);
#ifdef STACK_PROTECTOR
set_fs_base();
#endif
check_cpu_capability();
apicv_cap_detect();
/* Set state for this CPU to initializing */
cpu_set_current_state(CPU_BOOT_ID, CPU_STATE_INITIALIZING);
/* Perform any necessary BSP initialization */
init_bsp();
/* Initialize Serial */
serial_init();
/* Initialize console */
console_init();
/* Print Hypervisor Banner */
print_hv_banner();
/* Make sure rdtsc is enabled */
check_tsc();
/* Calculate TSC Frequency */
tsc_clock_freq = tsc_cycles_in_period(1000) / 1000 * 1000000;
/* Enable logging */
init_logmsg(LOG_BUF_SIZE,
LOG_DESTINATION);
#ifdef HV_DEBUG
/* Log first messages */
printf("HV version %d.%d-%s-%s build by %s, start time %lluus\r\n",
HV_MAJOR_VERSION, HV_MINOR_VERSION, HV_BUILD_TIME,
HV_BUILD_VERSION, HV_BUILD_USER,
TICKS_TO_US(start_tsc));
#endif
pr_dbg("Core %d is up", CPU_BOOT_ID);
/* Warn for security feature not ready */
if (!check_ibrs_ibpb_support() && !check_stibp_support()) {
pr_fatal("SECURITY WARNING!!!!!!");
pr_fatal("Please apply the latest CPU uCode patch!");
}
/* Initialize the shell */
shell_init();
/* Initialize interrupts */
interrupt_init(CPU_BOOT_ID);
timer_init();
setup_notification();
ptdev_init();
init_scheduler();
#ifndef CONFIG_EFI_STUB
/* Start all secondary cores */
start_cpus();
/* Trigger event to allow secondary CPUs to continue */
bitmap_set(0, &pcpu_sync);
#else
memcpy_s(_ld_cpu_secondary_reset_start,
(unsigned long)&_ld_cpu_secondary_reset_size,
_ld_cpu_secondary_reset_load,
(unsigned long)&_ld_cpu_secondary_reset_size);
#endif
ASSERT(get_cpu_id() == CPU_BOOT_ID, "");
init_iommu();
console_setup_timer();
/* Start initializing the VM for this CPU */
hv_main(CPU_BOOT_ID);
/* Control should not come here */
cpu_halt(CPU_BOOT_ID);
}
void cpu_secondary_init(void)
{
/* NOTE: Use of local / stack variables in this function is problematic
* since the stack is switched in the middle of the function. For this
* reason, the logical id is only temporarily stored in a static
* variable, but this will be over-written once subsequent CPUs
* start-up. Once the spin-lock is released, the cpu_logical_id_get()
* API is used to obtain the logical ID
*/
/* Switch this CPU to use the same page tables set-up by the
* primary/boot CPU
*/
enable_paging(get_paging_pml4());
early_init_lapic();
/* Find the logical ID of this CPU given the LAPIC ID
* temp_logical_id =
* cpu_find_logical_id(get_cur_lapic_id());
*/
cpu_find_logical_id(get_cur_lapic_id());
/* Set state for this CPU to initializing */
cpu_set_current_state(cpu_find_logical_id
(get_cur_lapic_id()),
CPU_STATE_INITIALIZING);
/* Switch to run-time stack */
CPU_SP_WRITE(&get_cpu_var(stack)[STACK_SIZE - 1]);
#ifdef STACK_PROTECTOR
set_fs_base();
#endif
load_gdtr_and_tr();
/* Make sure rdtsc is enabled */
check_tsc();
pr_dbg("Core %d is up", get_cpu_id());
/* Release secondary boot spin-lock to allow one of the next CPU(s) to
* perform this common initialization
*/
spinlock_release(&cpu_secondary_spinlock);
/* Initialize secondary processor interrupts. */
interrupt_init(get_cpu_id());
timer_init();
/* Wait for boot processor to signal all secondary cores to continue */
pcpu_sync_sleep(&pcpu_sync, 0);
#ifdef CONFIG_EFI_STUB
bitmap_clr(0, &pcpu_sync);
#endif
hv_main(get_cpu_id());
/* Control will only come here for secondary CPUs not configured for
* use or if an error occurs in hv_main
*/
cpu_halt(get_cpu_id());
}
int cpu_find_logical_id(uint32_t lapic_id)
{
int i;
for (i = 0; i < phy_cpu_num; i++) {
if (per_cpu(lapic_id, i) == lapic_id)
return i;
}
return -1;
}
#ifndef CONFIG_EFI_STUB
/*
* Start all secondary CPUs.
*/
static void start_cpus()
{
uint32_t timeout;
uint32_t expected_up;
/* Copy the AP initialization code segment to below 1MB */
memcpy_s(_ld_cpu_secondary_reset_start,
(unsigned long)&_ld_cpu_secondary_reset_size,
_ld_cpu_secondary_reset_load,
(unsigned long)&_ld_cpu_secondary_reset_size);
/* Set the expected up count to the total number of
* physical CPUs
*/
expected_up = phy_cpu_num;
/* Broadcast IPIs to all other CPUs */
send_startup_ipi(INTR_CPU_STARTUP_ALL_EX_SELF,
-1U, ((paddr_t) cpu_secondary_reset));
/* Wait until global count is equal to expected CPU up count or
* configured time-out has expired
*/
timeout = CPU_UP_TIMEOUT * 1000;
while ((up_count != expected_up) && (timeout != 0)) {
/* Delay 10us */
udelay(10);
/* Decrement timeout value */
timeout -= 10;
}
/* Check to see if all expected CPUs are actually up */
if (up_count != expected_up) {
/* Print error */
pr_fatal("Secondary CPUs failed to come up");
/* Error condition - loop endlessly for now */
do {
} while (1);
}
}
#endif
void cpu_halt(uint32_t logical_id)
{
/* For debug purposes, using a stack variable in the while loop enables
* us to modify the value using a JTAG probe and resume if needed.
*/
int halt = 1;
/* Set state to show CPU is halted */
cpu_set_current_state(logical_id, CPU_STATE_HALTED);
/* Halt the CPU */
do {
asm volatile ("hlt");
} while (halt);
}
static void cpu_set_logical_id(uint32_t logical_id)
{
/* Write TSC AUX register */
msr_write(MSR_IA32_TSC_AUX, (uint64_t) logical_id);
}
static void print_hv_banner(void)
{
char *boot_msg = "ACRN Hypervisor\n\r";
/* Print the boot message */
printf(boot_msg);
}
static void pcpu_sync_sleep(unsigned long *sync, int mask_bit)
{
int wake_sync = (1 << mask_bit);
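/* Both paths below spin until *sync equals the wake value
* (1 << mask_bit). In the monitor/mwait path, MONITOR arms address
* monitoring on the cache line holding *sync (address in %eax) and
* MWAIT halts the CPU until that line is written, after which the
* loop re-checks the value; the pause path is a plain busy-wait
* fallback.
*/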
if (check_monitor_support()) {
/* Wait for the event to be set using monitor/mwait */
asm volatile ("1: cmpl %%ebx,(%%eax)\n"
" je 2f\n"
" monitor\n"
" mwait\n"
" jmp 1b\n"
"2:\n"
:
: "a" (sync), "d"(0), "c"(0),
"b"(wake_sync)
: "cc");
} else {
/* Wait for the event to be set using pause */
asm volatile ("1: cmpl %%ebx,(%%eax)\n"
" je 2f\n"
" pause\n"
" jmp 1b\n"
"2:\n"
:
: "a" (sync), "d"(0), "c"(0),
"b"(wake_sync)
: "cc");
}
}
/* Check the allowed 1-settings of a VMX control */
static bool is_ctrl_setting_allowed(uint64_t msr_val, uint32_t ctrl)
{
/*
* Intel SDM Appendix A.3
* - bitX in ctrl can be set 1
* only if bit 32+X in msr_val is 1
*/
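/* Worked example: if the high 32 bits of msr_val are 0x16 (allowed-1
* bits 1, 2 and 4), then ctrl = 0x16 is allowed, while ctrl = 0x17 is
* not, because bit 0 may not be set to 1.
*/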
return ((((uint32_t)(msr_val >> 32)) & ctrl) == ctrl);
}
static void apicv_cap_detect(void)
{
uint64_t val64;
uint32_t ctrl;
bool result;
ctrl = VMX_PROCBASED_CTLS_TPR_SHADOW;
val64 = msr_read(MSR_IA32_VMX_PROCBASED_CTLS);
result = is_ctrl_setting_allowed(val64, ctrl);
if (result) {
ctrl = VMX_PROCBASED_CTLS2_VAPIC |
VMX_PROCBASED_CTLS2_VAPIC_REGS |
VMX_PROCBASED_CTLS2_VIRQ;
val64 = msr_read(MSR_IA32_VMX_PROCBASED_CTLS2);
result = is_ctrl_setting_allowed(val64, ctrl);
}
cpu_caps.apicv_supported = result;
}
bool is_apicv_enabled(void)
{
return cpu_caps.apicv_supported;
}
static void monitor_cap_detect(void)
{
uint32_t eax, ebx, ecx, edx;
uint32_t family;
uint32_t model;
/* Run CPUID to determine if MONITOR support available */
cpuid(CPUID_FEATURES, &eax, &ebx, &ecx, &edx);
/* See if MONITOR feature bit is set in ECX */
if (ecx & CPUID_ECX_MONITOR)
cpu_caps.monitor_supported = true;
/* don't use monitor for CPU (family: 0x6 model: 0x5c)
* in hypervisor, but still expose it to the guests and
* let them handle it correctly
*/
family = (eax >> 8) & 0xff;
if (family == 0xF)
family += (eax >> 20) & 0xff;
model = (eax >> 4) & 0xf;
if (family >= 0x06)
model += ((eax >> 16) & 0xf) << 4;
if (cpu_caps.monitor_supported &&
(family == 0x06) &&
(model == 0x5c)) {
cpu_caps.monitor_supported = false;
}
}
bool check_monitor_support(void)
{
return cpu_caps.monitor_supported;
}


@@ -0,0 +1,228 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <hypervisor.h>
#include <cpu.h>
#include <mmu.h>
#include <gdt.h>
#include <idt.h>
#include <msr.h>
/* MULTIBOOT HEADER */
#define MULTIBOOT_HEADER_MAGIC 0x1badb002
#define MULTIBOOT_HEADER_FLAGS 0x00000002 /* flags bit 1: request mem_* and mmap_* info */
.section multiboot_header, "a"
.align 4
/* header magic */
.long MULTIBOOT_HEADER_MAGIC
/* header flags - bit 1: request mem_* and mmap_* info */
.long MULTIBOOT_HEADER_FLAGS
/* header checksum = -(magic + flags) */
.long -(MULTIBOOT_HEADER_MAGIC + MULTIBOOT_HEADER_FLAGS)
.section entry, "ax"
.align 8
.code32
.global cpu_primary_start_32
cpu_primary_start_32:
/* Disable interrupts */
cli
/* Clear direction flag */
cld
/* save eax and ebx */
movl %eax, %esp
movl %ebx, %ebp
/* detect whether it is in long mode */
movl $MSR_IA32_EFER, %ecx
rdmsr
test $MSR_IA32_EFER_LMA_BIT, %eax
/* jump to 64bit entry if it is already in long mode */
jne cpu_primary_start_64
/* save the MULTIBOOT magic number & MBI */
movl %esp, (boot_regs)
movl %ebp, (boot_regs+4)
/* Disable paging */
mov %cr0, %ebx
andl $~CR0_PG, %ebx
mov %ebx, %cr0
/* Set DE, PAE, MCE and OS support bits in CR4 */
movl $(CR4_DE | CR4_PAE | CR4_MCE | CR4_OSFXSR | CR4_OSXMMEXCPT), %eax
mov %eax, %cr4
/* Set CR3 to PML4 table address */
movl $cpu_boot32_page_tables_start, %edi
mov %edi, %cr3
/* Set LME bit in EFER */
movl $MSR_IA32_EFER, %ecx
rdmsr
orl $MSR_IA32_EFER_LME_BIT, %eax
wrmsr
/* Enable paging, protection, numeric error and co-processor
monitoring in CR0 to enter long mode */
mov %cr0, %ebx
orl $(CR0_PG | CR0_PE | CR0_MP | CR0_NE), %ebx
mov %ebx, %cr0
/* Load temporary GDT pointer value */
mov $cpu_primary32_gdt_ptr, %ebx
lgdt (%ebx)
/* Perform a long jump to start executing in 64-bit mode */
ljmp $HOST_GDT_RING0_CODE_SEL, $primary_start_long_mode
.code64
.org 0x200
.global cpu_primary_start_64
cpu_primary_start_64:
/* save the MULTIBOOT magic number & MBI */
movl %edi, (boot_regs)
movl %esi, (boot_regs+4)
#ifdef CONFIG_EFI_STUB
movl %edx, (boot_regs+8)
#endif
primary_start_long_mode:
/* Fix up the IDT descriptors */
movl $HOST_IDT, %edx
movl $HOST_IDT_ENTRIES, %ecx
.LFixUpIDT_Entries:
xorl %eax, %eax
xchgl %eax, 12(%edx) /* Set rsvd bits to 0; eax now has
high 32 of entry point */
xchgl %eax, 8(%edx) /* Set bits 63..32 of entry point;
eax now has low 32 of entry point */
movw %ax, (%edx) /* Set bits 0-15 of procedure entry
point */
shr $16, %eax
movw %ax, 6(%edx) /* Set bits 16-31 of entry point */
addl $X64_IDT_DESC_SIZE,%edx
loop .LFixUpIDT_Entries
/* Load IDT */
mov $HOST_IDTR, %rcx
lidtq (%rcx)
/* Load temporary GDT pointer value */
mov $cpu_primary32_gdt_ptr, %ebx
lgdt (%ebx)
/* Replace CS with the correct value should we need it */
mov $HOST_GDT_RING0_CODE_SEL, %bx
mov %bx, jcs
movabsq $jmpbuf, %rax
rex.w ljmp *(%rax)
.data
jmpbuf: .quad after
jcs: .word 0
.text
after:
/* Initialize temporary stack pointer */
movq $_ld_bss_end, %rsp
add $CPU_PAGE_SIZE,%rsp
and $(~(CPU_STACK_ALIGN - 1)),%rsp
// load all selector registers with appropriate values
xor %edx, %edx
lldt %dx
movl $HOST_GDT_RING0_DATA_SEL,%eax
mov %eax,%ss // Was 32bit POC Stack
mov %eax,%ds // Was 32bit POC Data
mov %eax,%es // Was 32bit POC Data
mov %edx,%fs // Was 32bit POC Data
mov %edx,%gs // Was 32bit POC CLS
/* Push sp magic to top of stack for call trace */
pushq $SP_BOTTOM_MAGIC
/* continue with chipset level initialization */
call bsp_boot_init
loop:
jmp loop
.align 4
.global boot_regs
boot_regs:
.long 0x00000000
.long 0x00000000
#ifdef CONFIG_EFI_STUB
.long 0x00000000
#endif
/* GDT table */
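/*
* Layout: a null descriptor, a flat 64-bit ring-0 code segment
* (0x00af9b000000ffff: base 0, limit 4G, G=1, L=1, type exec/read)
* and a flat ring-0 data segment (0x00cf93000000ffff: base 0,
* limit 4G, G=1, D/B=1, type read/write).
*/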
.align 4
cpu_primary32_gdt:
.quad 0x0000000000000000
.quad 0x00af9b000000ffff
.quad 0x00cf93000000ffff
cpu_primary32_gdt_end:
/* GDT pointer */
.align 2
cpu_primary32_gdt_ptr:
.short (cpu_primary32_gdt_end - cpu_primary32_gdt) - 1
.quad cpu_primary32_gdt
/* PML4, PDPT, and PD tables initialized to map first 4 GBytes of memory */
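/*
* One PML4 entry points to a PDPT page whose first 4 entries point to
* 4 consecutive page-directory pages; together they hold the
* 4 x 512 = 2048 2MB entries below, identity-mapping 0 - 4GB.
*/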
.align CPU_PAGE_SIZE
.global cpu_boot32_page_tables_start
cpu_boot32_page_tables_start:
.quad cpu_primary32_pdpt_addr + (IA32E_COMM_P_BIT | IA32E_COMM_RW_BIT)
.align CPU_PAGE_SIZE
cpu_primary32_pdpt_addr:
address = 0
.rept 4
.quad cpu_primary32_pdt_addr + address + \
(IA32E_COMM_P_BIT | IA32E_COMM_RW_BIT)
address = address + CPU_PAGE_SIZE
.endr
.align CPU_PAGE_SIZE
cpu_primary32_pdt_addr:
address = 0
.rept 2048
.quad address + (IA32E_PDPTE_PS_BIT | IA32E_COMM_P_BIT | IA32E_COMM_RW_BIT)
address = address + 0x200000
.endr


@@ -0,0 +1,197 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <hypervisor.h>
#include <spinlock.h>
#include <gdt.h>
#include <cpu.h>
#include <mmu.h>
#include <msr.h>
.extern cpu_secondary_init
.extern cpu_logical_id
.extern _ld_bss_end
.extern HOST_GDTR
.section .cpu_secondary_reset,"ax"
.align 4
.code16
.global cpu_secondary_reset
cpu_secondary_reset:
/* Disable local interrupts */
cli
/* Set DE, PAE, MCE and OS support bits in CR4 */
movl $(CR4_DE | CR4_PAE | CR4_MCE | CR4_OSFXSR | CR4_OSXMMEXCPT), %eax
mov %eax, %cr4
/* Set CR3 to PML4 table address */
movl $CPU_Boot_Page_Tables_Start, %edi
mov %edi, %cr3
/* Set LME bit in EFER */
movl $MSR_IA32_EFER, %ecx
rdmsr
orl $MSR_IA32_EFER_LME_BIT, %eax
wrmsr
/* Enable paging, protection, numeric error and co-processor
monitoring in CR0 to enter long mode */
mov %cr0, %ebx
orl $(CR0_PG | CR0_PE | CR0_MP | CR0_NE), %ebx
mov %ebx, %cr0
/* Load temporary GDT pointer value */
mov $cpu_secondary_gdt_ptr, %ebx
lgdt (%ebx)
/* Perform a long jump to start executing in 64-bit mode */
data32 ljmp $HOST_GDT_RING0_CODE_SEL, $cpu_secondary_long_mode
.code64
cpu_secondary_long_mode:
/* Set up all other data segment registers */
movl $HOST_GDT_RING0_DATA_SEL, %eax
mov %eax, %ss
mov %eax, %ds
mov %eax, %es
mov %eax, %fs
mov %eax, %gs
/* Obtain the secondary CPU spin-lock to serialize
booting of the secondary cores */
spinlock_obtain(cpu_secondary_spinlock)
/* Initialize temporary stack pointer
NOTE: The PML4 page is reused for the temporary stack
(cpu_secondary_pdpt_addr marks the top of that page):
only its very first entry is in use and the stack grows
down from the top of the page. The stack is only used
for a VERY short period of time, so this reuse of the
PML4 memory should be acceptable. */
movq $cpu_secondary_pdpt_addr, %rsp
/* Push sp magic to top of stack for call trace */
pushq $SP_BOTTOM_MAGIC
/* Jump to C entry for the AP */
call cpu_secondary_init
cpu_secondary_error:
/* Error condition trap */
jmp cpu_secondary_error
/* GDT table */
.align 4
cpu_secondary_gdt:
.quad 0x0000000000000000
.quad 0x00af9b000000ffff
.quad 0x00cf93000000ffff
cpu_secondary_gdt_end:
/* GDT pointer */
.align 2
cpu_secondary_gdt_ptr:
.short (cpu_secondary_gdt_end - cpu_secondary_gdt) - 1
.quad cpu_secondary_gdt
/* PML4, PDPT, and PD tables initialized to map first 4 GBytes of memory */
.align CPU_PAGE_SIZE
.global CPU_Boot_Page_Tables_Start
CPU_Boot_Page_Tables_Start:
.quad cpu_secondary_pdpt_addr + (IA32E_COMM_P_BIT | IA32E_COMM_RW_BIT)
.align CPU_PAGE_SIZE
cpu_secondary_pdpt_addr:
address = 0
.rept 4
.quad cpu_secondary_pdt_addr + address + \
(IA32E_COMM_P_BIT | IA32E_COMM_RW_BIT)
address = address + CPU_PAGE_SIZE
.endr
.align CPU_PAGE_SIZE
cpu_secondary_pdt_addr:
address = 0
.rept 2048
.quad address + (IA32E_PDPTE_PS_BIT | IA32E_COMM_P_BIT | IA32E_COMM_RW_BIT)
address = address + 0x200000
.endr
/*******************************************************************
* GUEST initial 4G page table
*
* The guest starts in long mode, so the HV needs to prepare a guest
* identity-mapped page table.
*
* The guest page tables cover 4GB using 2MB pages.
*
* The HV copies this page table (6 pages) to the guest address
* CPU_Boot_Page_Tables_Start_VM before executing any guest
* instruction.
*
******************************************************************/
.align CPU_PAGE_SIZE
.global CPU_Boot_Page_Tables_Start_VM
CPU_Boot_Page_Tables_Start_VM:
.quad vm_cpu_pdpt_addr + (IA32E_COMM_P_BIT | IA32E_COMM_RW_BIT)
.align CPU_PAGE_SIZE
vm_cpu_pdpt_addr:
address = 0
.rept 4
.quad vm_cpu_pdt_addr + address + (IA32E_COMM_P_BIT | IA32E_COMM_RW_BIT)
address = address + CPU_PAGE_SIZE
.endr
.align CPU_PAGE_SIZE
vm_cpu_pdt_addr:
address = 0
.rept 2048
.quad address + (IA32E_PDPTE_PS_BIT | IA32E_COMM_P_BIT | IA32E_COMM_RW_BIT)
address = address + 0x200000
.endr
.end

195  hypervisor/arch/x86/cpuid.c  (new file)

@@ -0,0 +1,195 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <hypervisor.h>
#include <hv_lib.h>
#include <acrn_common.h>
#include <hv_arch.h>
#include <cpu.h>
void emulate_cpuid(struct vcpu *vcpu, uint32_t src_op, uint32_t *eax_ptr,
uint32_t *ebx_ptr, uint32_t *ecx_ptr, uint32_t *edx_ptr)
{
uint32_t apicid = vlapic_get_id(vcpu->arch_vcpu.vlapic);
static const char sig[12] = "ACRNACRNACRN";
const uint32_t *sigptr = (const uint32_t *)sig;
uint32_t count = *ecx_ptr;
if ((src_op != 0x40000000) && (src_op != 0x40000010))
cpuid_count(src_op, count, eax_ptr, ebx_ptr, ecx_ptr, edx_ptr);
switch (src_op) {
/* Virtualize cpuid 0x01 */
case 0x01:
/* Patching initial APIC ID */
*ebx_ptr &= ~APIC_ID_MASK;
*ebx_ptr |= (apicid & APIC_ID_MASK);
/* mask mtrr */
*edx_ptr &= ~CPUID_EDX_MTRR;
/* Patching X2APIC, X2APIC mode is disabled by default. */
if (x2apic_enabled)
*ecx_ptr |= CPUID_ECX_x2APIC;
else
*ecx_ptr &= ~CPUID_ECX_x2APIC;
/* mask pcid */
*ecx_ptr &= ~CPUID_ECX_PCID;
/*mask vmx to guest os */
*ecx_ptr &= ~CPUID_ECX_VMX;
break;
/* Virtualize cpuid 0x07 */
case 0x07:
/* mask invpcid */
*ebx_ptr &= ~CPUID_EBX_INVPCID;
break;
case 0x0a:
/* PMU is not supported */
*eax_ptr &= ~0xff;
break;
/* Virtualize cpuid 0x0b */
case 0x0b:
/* Patching X2APIC */
if (!x2apic_enabled) {
*eax_ptr = 0;
*ebx_ptr = 0;
*ecx_ptr = 0;
*edx_ptr = 0;
}
break;
/*
* Leaf 0x40000000
* This leaf returns the CPUID leaf range supported by the
* hypervisor and the hypervisor vendor signature.
*
* EAX: The maximum input value for CPUID supported by the
* hypervisor.
* EBX, ECX, EDX: Hypervisor vendor ID signature.
*/
case 0x40000000:
*eax_ptr = 0x40000010;
*ebx_ptr = sigptr[0];
*ecx_ptr = sigptr[1];
*edx_ptr = sigptr[2];
break;
/*
* Leaf 0x40000010 - Timing Information.
* This leaf returns the current TSC frequency and
* current Bus frequency in kHz.
*
* EAX: (Virtual) TSC frequency in kHz.
* TSC frequency is calculated from PIT in ACRN
* EBX: (Virtual) Bus (local apic timer) frequency in kHz.
* Bus (local apic timer) frequency is hardcoded as
* (128 * 1024 * 1024) in ACRN
* ECX, EDX: RESERVED (reserved fields are set to zero).
*/
case 0x40000010:
*eax_ptr = (uint32_t)(tsc_clock_freq / 1000);
*ebx_ptr = (128 * 1024 * 1024) / 1000;
*ecx_ptr = 0;
*edx_ptr = 0;
break;
default:
break;
}
}
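/*
* Illustrative guest-side sketch (not part of this file): how a guest
* could consume the two synthetic leaves emulated above. The helper
* names below are hypothetical; only the raw CPUID instruction is
* assumed.
*/
#if 0
static inline void guest_cpuid(uint32_t leaf, uint32_t *a, uint32_t *b,
uint32_t *c, uint32_t *d)
{
*a = leaf;
*c = 0;
asm volatile ("cpuid"
: "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
: "0"(*a), "2"(*c));
}
static void guest_detect_acrn(void)
{
uint32_t max_leaf, sig[3];
guest_cpuid(0x40000000, &max_leaf, &sig[0], &sig[1], &sig[2]);
/* sig[] now spells "ACRNACRNACRN"; max_leaf is 0x40000010 */
if (max_leaf >= 0x40000010) {
uint32_t tsc_khz, bus_khz, ecx, edx;
guest_cpuid(0x40000010, &tsc_khz, &bus_khz, &ecx, &edx);
/* tsc_khz: virtual TSC freq; bus_khz: lapic timer freq */
}
}
#endif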
static DEFINE_CPU_DATA(struct cpuid_cache_entry[CPUID_EXTEND_FEATURE_CACHE_MAX],
cpuid_cache);
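/* The per-cpu cpuid_cache above holds a few invariant CPUID leaves
* (vendor string, feature flags, structured extended features). Only
* sub-leaf 0 is cached; any other (op, count) pair falls through to
* native_cpuid_count() in cpuid_count() below.
*/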
static inline struct cpuid_cache_entry *find_cpuid_cache_entry(uint32_t op,
uint32_t count)
{
int pcpu_id = get_cpu_id();
enum cpuid_cache_idx idx = CPUID_EXTEND_FEATURE_CACHE_MAX;
if ((count != 0))
return NULL;
switch (op) {
case CPUID_VENDORSTRING:
idx = CPUID_VENDORSTRING_CACHE_IDX;
break;
case CPUID_FEATURES:
idx = CPUID_FEATURES_CACHE_IDX;
break;
case CPUID_EXTEND_FEATURE:
idx = CPUID_EXTEND_FEATURE_CACHE_IDX;
break;
default:
break;
}
if (idx == CPUID_EXTEND_FEATURE_CACHE_MAX)
return NULL;
return &per_cpu(cpuid_cache, pcpu_id)[idx];
}
inline void cpuid_count(uint32_t op, uint32_t count,
uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
struct cpuid_cache_entry *entry;
entry = find_cpuid_cache_entry(op, count);
if (entry == NULL) {
native_cpuid_count(op, count, a, b, c, d);
} else if (entry->inited) {
*a = entry->a;
*b = entry->b;
*c = entry->c;
*d = entry->d;
} else {
native_cpuid_count(op, count, a, b, c, d);
entry->a = *a;
entry->b = *b;
entry->c = *c;
entry->d = *d;
entry->inited = 1;
}
}

569  hypervisor/arch/x86/ept.c  (new file)

@@ -0,0 +1,569 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <hypervisor.h>
#include <hv_lib.h>
#include <acrn_common.h>
#include <acrn_hv_defs.h>
#include <hv_arch.h>
#include <hypercall.h>
#include <hv_debug.h>
#include "guest/instr_emul_wrapper.h"
#include "guest/instr_emul.h"
#define ACRN_DBG_EPT 6
void *create_guest_paging(struct vm *vm)
{
void *hva_dest;
void *hva_src;
/* copy guest identity mapped 4G page table to guest */
hva_dest = GPA2HVA(vm,
(uint64_t)CPU_Boot_Page_Tables_Start_VM);
hva_src = (void *)(_ld_cpu_secondary_reset_load
+ (CPU_Boot_Page_Tables_Start_VM
- _ld_cpu_secondary_reset_start));
/* 2MB page size; 6 pages to copy (1 PML4 + 1 PDPT + 4 PD pages) */
memcpy_s(hva_dest, 6 * CPU_PAGE_SIZE, hva_src, 6 * CPU_PAGE_SIZE);
return (void *)CPU_Boot_Page_Tables_Start_VM;
}
static void *find_next_table(uint32_t table_offset,
void *table_base)
{
uint64_t table_entry;
uint64_t table_present;
void *sub_table_addr = 0;
/* Read the table entry */
table_entry = MEM_READ64(table_base
+ (table_offset * IA32E_COMM_ENTRY_SIZE));
/* If bit 7 is set, entry is not a subtable. */
if ((table_entry & IA32E_PDPTE_PS_BIT)
|| (table_entry & IA32E_PDE_PS_BIT))
return sub_table_addr;
/* Set table present bits to any of the read/write/execute bits */
table_present = (IA32E_EPT_R_BIT | IA32E_EPT_W_BIT | IA32E_EPT_X_BIT);
/* Determine if a valid entry exists */
if ((table_entry & table_present) == 0) {
/* No entry present */
return sub_table_addr;
}
/* Get address of the sub-table */
sub_table_addr = (void *)(table_entry & IA32E_REF_MASK);
/* Return the next table in the walk */
return sub_table_addr;
}
void free_ept_mem(void *pml4_addr)
{
void *pdpt_addr;
void *pde_addr;
void *pte_addr;
uint32_t pml4_index;
uint32_t pdpt_index;
uint32_t pde_index;
for (pml4_index = 0; pml4_index < IA32E_NUM_ENTRIES; pml4_index++) {
/* Walk from the PML4 table to the PDPT table */
pdpt_addr = find_next_table(pml4_index, pml4_addr);
if (pdpt_addr == NULL)
continue;
for (pdpt_index = 0; pdpt_index < IA32E_NUM_ENTRIES;
pdpt_index++) {
/* Walk from the PDPT table to the PD table */
pde_addr = find_next_table(pdpt_index, pdpt_addr);
if (pde_addr == NULL)
continue;
for (pde_index = 0; pde_index < IA32E_NUM_ENTRIES;
pde_index++) {
/* Walk from the PD table to the page table */
pte_addr = find_next_table(pde_index,
pde_addr);
/* Free page table entry table */
if (pte_addr)
free(pte_addr);
}
/* Free page directory entry table */
if (pde_addr)
free(pde_addr);
}
free(pdpt_addr);
}
free(pml4_addr);
}
void destroy_ept(struct vm *vm)
{
free_ept_mem(vm->arch_vm.ept);
free_ept_mem(vm->arch_vm.m2p);
}
uint64_t gpa2hpa_check(struct vm *vm, uint64_t gpa,
uint64_t size, int *found, bool assert)
{
uint64_t hpa = 0;
int _found = 0;
struct entry_params entry;
struct map_params map_params;
map_params.page_table_type = PT_EPT;
map_params.pml4_base = vm->arch_vm.ept;
map_params.pml4_inverted = vm->arch_vm.m2p;
obtain_last_page_table_entry(&map_params, &entry,
(void *)gpa, true);
if (entry.entry_present == PT_PRESENT
/* if the range crosses several pages, it is not
* handled for now; only print error info
*/
&& ((gpa % entry.page_size) + size) <= entry.page_size) {
_found = 1;
hpa = ((entry.entry_val & (~(entry.page_size - 1)))
| (gpa & (entry.page_size - 1)));
}
if (found != NULL)
*found = _found;
if (_found == 0 && assert) {
pr_err("VM %d GPA2HPA: failed for gpa 0x%llx",
vm->attr.boot_idx, gpa);
ASSERT(_found != 0, "GPA2HPA not found");
}
pr_dbg("GPA2HPA: 0x%llx->0x%llx", gpa, hpa);
return hpa;
}
uint64_t gpa2hpa(struct vm *vm, uint64_t gpa)
{
return gpa2hpa_check(vm, gpa, 0, NULL, true);
}
uint64_t hpa2gpa(struct vm *vm, uint64_t hpa)
{
struct entry_params entry;
struct map_params map_params;
map_params.page_table_type = PT_EPT;
map_params.pml4_base = vm->arch_vm.ept;
map_params.pml4_inverted = vm->arch_vm.m2p;
obtain_last_page_table_entry(&map_params, &entry,
(void *)hpa, false);
if (entry.entry_present == PT_NOT_PRESENT) {
pr_err("VM %d hpa2gpa: failed for hpa 0x%llx",
vm->attr.boot_idx, hpa);
ASSERT(false, "hpa2gpa not found");
}
return ((entry.entry_val & (~(entry.page_size - 1)))
| (hpa & (entry.page_size - 1)));
}
int is_ept_supported(void)
{
uint16_t status;
uint64_t tmp64;
/* Read primary processor based VM control. */
tmp64 = msr_read(MSR_IA32_VMX_PROCBASED_CTLS);
/* Check if secondary processor based VM control is available. */
if (tmp64 & MMU_MEM_ATTR_BIT_EXECUTE_DISABLE) {
/* Read primary processor based VM control. */
tmp64 = msr_read(MSR_IA32_VMX_PROCBASED_CTLS2);
/* Check if EPT is supported. */
if (tmp64 & (((uint64_t)VMX_PROCBASED_CTLS2_EPT) << 32)) {
/* EPT is present. */
status = 1;
} else {
status = 0;
}
} else {
/* Secondary processor based VM control is not present */
status = 0;
}
return status;
}
static int check_hv_mmio_range(struct vm *vm, struct mem_io *mmio)
{
int status = false;
struct list_head *pos;
struct mem_io_node *mmio_node;
list_for_each(pos, &vm->mmio_list) {
mmio_node = list_entry(pos, struct mem_io_node, list);
/* Check if this handler's range covers this memory access */
if ((mmio->paddr >= mmio_node->range_start) &&
(mmio->paddr + mmio->access_size <=
mmio_node->range_end)) {
status = true;
/* Break from loop - only 1 handler allowed to support
* a given memory range
*/
break;
}
}
/* Return success for now */
return status;
}
static int hv_emulate_mmio(struct vcpu *vcpu, struct mem_io *mmio)
{
int status = -EINVAL;
struct list_head *pos;
struct mem_io_node *mmio_node;
struct vm *vm = vcpu->vm;
list_for_each(pos, &vm->mmio_list) {
mmio_node = list_entry(pos, struct mem_io_node, list);
/* Check if this handler's range covers this memory access */
if ((mmio->paddr >= mmio_node->range_start) &&
(mmio->paddr + mmio->access_size
<= mmio_node->range_end)) {
ASSERT((mmio->paddr % mmio->access_size) == 0,
"access size not align with paddr");
/* Handle this MMIO operation */
status = mmio_node->read_write(vcpu, mmio,
mmio_node->handler_private_data);
/* Break from loop - only 1 handler allowed to support
* given memory range
*/
break;
}
}
/* Return success for now */
return status;
}
int register_mmio_emulation_handler(struct vm *vm,
hv_mem_io_handler_t read_write, uint64_t start,
uint64_t end, void *handler_private_data)
{
int status = -EINVAL;
struct mem_io_node *mmio_node;
if (vm->hw.created_vcpus > 0 && vm->hw.vcpu_array[0]->launched) {
ASSERT(0, "register mmio handler after vm launched");
return status;
}
/* Ensure both a read/write handler and range check function exist */
if ((read_write != HV_NULL) && (end > start)) {
/* Allocate memory for node */
mmio_node =
(struct mem_io_node *)calloc(1, sizeof(struct mem_io_node));
/* Ensure memory successfully allocated */
if (mmio_node) {
/* Fill in information for this node */
mmio_node->read_write = read_write;
mmio_node->handler_private_data = handler_private_data;
INIT_LIST_HEAD(&mmio_node->list);
list_add(&mmio_node->list, &vm->mmio_list);
mmio_node->range_start = start;
mmio_node->range_end = end;
ept_mmap(vm, start, start, end - start,
MAP_UNMAP, 0);
/* Return success */
status = 0;
}
}
/* Return status to caller */
return status;
}
void unregister_mmio_emulation_handler(struct vm *vm, uint64_t start,
uint64_t end)
{
struct list_head *pos, *tmp;
struct mem_io_node *mmio_node;
list_for_each_safe(pos, tmp, &vm->mmio_list) {
mmio_node = list_entry(pos, struct mem_io_node, list);
if ((mmio_node->range_start == start) &&
(mmio_node->range_end == end)) {
/* assume only one entry found in mmio_list */
list_del_init(&mmio_node->list);
free(mmio_node);
break;
}
}
}
int dm_emulate_mmio_post(struct vcpu *vcpu)
{
int ret = 0;
int cur = vcpu->vcpu_id;
struct vhm_request_buffer *req_buf =
(void *)HPA2HVA(vcpu->vm->sw.req_buf);
vcpu->req.reqs.mmio_request.value =
req_buf->req_queue[cur].reqs.mmio_request.value;
/* VHM emulation data has already been copied to req; mark the slot free now */
req_buf->req_queue[cur].valid = false;
if (req_buf->req_queue[cur].processed == REQ_STATE_SUCCESS)
vcpu->mmio.mmio_status = MMIO_TRANS_VALID;
else {
vcpu->mmio.mmio_status = MMIO_TRANS_INVALID;
goto out;
}
if (vcpu->mmio.read_write == HV_MEM_IO_READ) {
vcpu->mmio.value = vcpu->req.reqs.mmio_request.value;
/* Emulate instruction and update vcpu register set */
ret = emulate_instruction(vcpu, &vcpu->mmio);
if (ret != 0)
goto out;
}
out:
return ret;
}
static int dm_emulate_mmio_pre(struct vcpu *vcpu, uint64_t exit_qual)
{
int status;
status = analyze_instruction(vcpu, &vcpu->mmio);
if (status != 0)
return status;
if (vcpu->mmio.read_write == HV_MEM_IO_WRITE) {
status = emulate_instruction(vcpu, &vcpu->mmio);
if (status != 0)
return status;
vcpu->req.reqs.mmio_request.value = vcpu->mmio.value;
/* XXX: write access while EPT perm RX -> WP */
if ((exit_qual & 0x38) == 0x28)
vcpu->req.type = REQ_WP;
}
if (vcpu->req.type == 0)
vcpu->req.type = REQ_MMIO;
vcpu->req.reqs.mmio_request.direction = vcpu->mmio.read_write;
vcpu->req.reqs.mmio_request.address = (long)vcpu->mmio.paddr;
vcpu->req.reqs.mmio_request.size = vcpu->mmio.access_size;
return 0;
}
int ept_violation_handler(struct vcpu *vcpu)
{
int status;
uint64_t exit_qual;
uint64_t gpa;
/* Handle page fault from guest */
exit_qual = exec_vmread(VMX_EXIT_QUALIFICATION);
memset(&vcpu->req, 0, sizeof(struct vhm_request));
/* Specify if read or write operation */
if (exit_qual & 0x2) {
/* Write operation */
vcpu->mmio.read_write = HV_MEM_IO_WRITE;
/* Get write value from appropriate register in context */
/* TODO: Need to figure out how to determine value being
* written
*/
vcpu->mmio.value = 0;
} else {
/* Read operation */
vcpu->mmio.read_write = HV_MEM_IO_READ;
/* Get sign extension requirements for read */
/* TODO: Need to determine how sign extension is determined for
* reads
*/
vcpu->mmio.sign_extend_read = 0;
}
/* Get the guest physical address */
gpa = exec_vmread64(VMX_GUEST_PHYSICAL_ADDR_FULL);
TRACE_2L(TRC_VMEXIT_EPT_VIOLATION, exit_qual, gpa);
/* Record the full guest physical address of the faulting access */
vcpu->mmio.paddr = gpa;
/* Check if the MMIO access has a HV registered handler */
status = check_hv_mmio_range((struct vm *) vcpu->vm, &vcpu->mmio);
if (status == true) {
/* Fetch and decode current vcpu instruction */
status = analyze_instruction(vcpu, &vcpu->mmio);
if (status != 0)
goto out;
if (vcpu->mmio.read_write == HV_MEM_IO_WRITE) {
status = emulate_instruction(vcpu, &vcpu->mmio);
if (status != 0)
goto out;
}
/* Call the generic memory emulation handler.
* For an MMIO write, call hv_emulate_mmio after instruction
* emulation; for an MMIO read, call hv_emulate_mmio first.
*/
status = hv_emulate_mmio(vcpu, &vcpu->mmio);
if (vcpu->mmio.read_write == HV_MEM_IO_READ) {
/* Emulate instruction and update vcpu register set */
status = emulate_instruction(vcpu, &vcpu->mmio);
if (status != 0)
goto out;
}
} else {
/*
* No MMIO handler on the HV side; search the VHM handlers in Dom0.
*
* ACRN inserts a request to VHM and injects an upcall.
* For an MMIO write, ask the DM to run MMIO emulation after
* instruction emulation; for an MMIO read, ask the DM to run MMIO
* emulation first.
*/
status = dm_emulate_mmio_pre(vcpu, exit_qual);
if (status != 0)
goto out;
status = acrn_insert_request_wait(vcpu, &vcpu->req);
}
return status;
out:
pr_fatal("Guest Linear Address: 0x%016llx",
exec_vmread(VMX_GUEST_LINEAR_ADDR));
pr_fatal("Guest Physical Address address: 0x%016llx",
gpa);
ASSERT(status == true, "EPT violation");
return status;
}
int ept_misconfig_handler(__unused struct vcpu *vcpu)
{
int status;
status = -EINVAL;
/* TODO - EPT misconfiguration handler */
pr_info("%s, Guest linear address: 0x%016llx ",
__func__, exec_vmread64(VMX_GUEST_LINEAR_ADDR));
pr_info("%s, Guest physical address: 0x%016llx ",
__func__, exec_vmread64(VMX_GUEST_PHYSICAL_ADDR_FULL));
ASSERT(status == 0, "EPT Misconfiguration is not handled.\n");
TRACE_2L(TRC_VMEXIT_EPT_MISCONFIGURATION, 0, 0);
return status;
}
int ept_mmap(struct vm *vm, uint64_t hpa,
uint64_t gpa, uint64_t size, uint32_t type, uint32_t prot)
{
struct map_params map_params;
int i;
struct vcpu *vcpu;
/* Setup memory map parameters */
map_params.page_table_type = PT_EPT;
if (vm->arch_vm.ept) {
map_params.pml4_base = vm->arch_vm.ept;
map_params.pml4_inverted = vm->arch_vm.m2p;
} else {
map_params.pml4_base =
alloc_paging_struct();
vm->arch_vm.ept = map_params.pml4_base;
map_params.pml4_inverted = alloc_paging_struct();
vm->arch_vm.m2p = map_params.pml4_inverted;
}
if (type == MAP_MEM || type == MAP_MMIO) {
map_mem(&map_params, (void *)hpa,
(void *)gpa, size, prot);
} else if (type == MAP_UNMAP) {
unmap_mem(&map_params, (void *)hpa, (void *)gpa,
size, prot);
} else
ASSERT(0, "unknown map type");
foreach_vcpu(i, vm, vcpu) {
vcpu_make_request(vcpu, ACRN_REQUEST_TLB_FLUSH);
}
dev_dbg(ACRN_DBG_EPT, "ept map: %s hpa: 0x%016llx gpa: 0x%016llx ",
type == MAP_UNMAP ? "unmap" : "map", hpa, gpa);
dev_dbg(ACRN_DBG_EPT, "size: 0x%016llx prot: 0x%x\n", size, prot);
return 0;
}

84  hypervisor/arch/x86/gdt.c  (new file)

@@ -0,0 +1,84 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <hv_lib.h>
#include <cpu.h>
#include <gdt.h>
DEFINE_CPU_DATA(struct tss_64, tss);
DEFINE_CPU_DATA(struct host_gdt, gdt);
DEFINE_CPU_DATA(uint8_t[STACK_SIZE], mc_stack) __aligned(16);
DEFINE_CPU_DATA(uint8_t[STACK_SIZE], df_stack) __aligned(16);
DEFINE_CPU_DATA(uint8_t[STACK_SIZE], sf_stack) __aligned(16);
static void set_tss_desc(union tss_64_descriptor *desc,
void *tss, int tss_limit, int type)
{
uint32_t u1, u2, u3;
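/* A 64-bit TSS descriptor spans 16 bytes: low32 packs limit[15:0] and
* base[15:0]; high32 packs base[23:16], the type field, the present
* bit (0x8000) and base[31:24]; base_addr_63_32 holds base[63:32].
*/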
u1 = ((uint64_t)tss << 16) & 0xFFFFFFFF;
u2 = (uint64_t)tss & 0xFF000000;
u3 = ((uint64_t)tss & 0x00FF0000) >> 16;
desc->low32.value = u1 | (tss_limit & 0xFFFF);
desc->base_addr_63_32 = (uint32_t)((uint64_t)tss >> 32);
desc->high32.value = (u2 | ((uint32_t)type << 8) | 0x8000 | u3);
}
void load_gdtr_and_tr(void)
{
struct host_gdt *gdt = &get_cpu_var(gdt);
struct host_gdt_descriptor gdtr;
struct tss_64 *tss = &get_cpu_var(tss);
/* first entry is not used */
gdt->rsvd = 0xAAAAAAAAAAAAAAAA;
/* ring 0 code sel descriptor */
gdt->host_gdt_code_descriptor.value = 0x00Af9b000000ffff;
/* ring 0 data sel descriptor */
gdt->host_gdt_data_descriptor.value = 0x00cf93000000ffff;
tss->ist1 = (uint64_t)get_cpu_var(mc_stack) + STACK_SIZE;
tss->ist2 = (uint64_t)get_cpu_var(df_stack) + STACK_SIZE;
tss->ist3 = (uint64_t)get_cpu_var(sf_stack) + STACK_SIZE;
tss->ist4 = 0L;
/* tss descriptor */
set_tss_desc(&gdt->host_gdt_tss_descriptors,
(void *)tss, sizeof(struct tss_64), TSS_AVAIL);
gdtr.len = sizeof(struct host_gdt) - 1;
gdtr.gdt = gdt;
asm volatile ("lgdt %0" ::"m"(gdtr));
CPU_LTR_EXECUTE(HOST_GDT_RING0_CPU_TSS_SEL);
}


@@ -0,0 +1,389 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <hypervisor.h>
#include <hv_lib.h>
#include <acrn_common.h>
#include <hv_arch.h>
#include <bsp_cfg.h>
#include <bsp_extern.h>
#include <acrn_hv_defs.h>
#include <hv_debug.h>
#include <multiboot.h>
#define BOOT_ARGS_LOAD_ADDR 0x24EFC000
#define ACRN_DBG_GUEST 6
/* for VM0 e820 */
uint32_t e820_entries;
struct e820_entry e820[E820_MAX_ENTRIES];
struct e820_mem_params e820_mem;
inline bool
is_vm0(struct vm *vm)
{
return (vm->attr.boot_idx & 0x7F) == 0;
}
inline struct vcpu *vcpu_from_vid(struct vm *vm, int vcpu_id)
{
int i;
struct vcpu *vcpu;
foreach_vcpu(i, vm, vcpu) {
if (vcpu->vcpu_id == vcpu_id)
return vcpu;
}
return NULL;
}
inline struct vcpu *vcpu_from_pid(struct vm *vm, int pcpu_id)
{
int i;
struct vcpu *vcpu;
foreach_vcpu(i, vm, vcpu) {
if (vcpu->pcpu_id == pcpu_id)
return vcpu;
}
return NULL;
}
inline struct vcpu *get_primary_vcpu(struct vm *vm)
{
int i;
struct vcpu *vcpu;
foreach_vcpu(i, vm, vcpu) {
if (is_vcpu_bsp(vcpu))
return vcpu;
}
return NULL;
}
inline uint64_t vcpumask2pcpumask(struct vm *vm, uint64_t vdmask)
{
int vcpu_id;
uint64_t dmask = 0;
struct vcpu *vcpu;
while ((vcpu_id = bitmap_ffs(&vdmask)) >= 0) {
bitmap_clr(vcpu_id, &vdmask);
vcpu = vcpu_from_vid(vm, vcpu_id);
ASSERT(vcpu, "vcpu_from_vid failed");
bitmap_set(vcpu->pcpu_id, &dmask);
}
return dmask;
}
inline bool vm_lapic_disabled(struct vm *vm)
{
int i;
struct vcpu *vcpu;
foreach_vcpu(i, vm, vcpu) {
if (vlapic_enabled(vcpu->arch_vcpu.vlapic))
return false;
}
return true;
}
int init_vm0_boot_info(struct vm *vm)
{
struct multiboot_module *mods = NULL;
struct multiboot_info *mbi = NULL;
if (!is_vm0(vm)) {
pr_err("just for vm0 to get info!");
return -EINVAL;
}
if (boot_regs[0] != MULTIBOOT_INFO_MAGIC) {
ASSERT(0, "no multiboot info found");
return -EINVAL;
}
mbi = (struct multiboot_info *)((uint64_t)boot_regs[1]);
dev_dbg(ACRN_DBG_GUEST, "Multiboot detected, flag=0x%x", mbi->mi_flags);
if (!(mbi->mi_flags & MULTIBOOT_INFO_HAS_MODS)) {
ASSERT(0, "no sos kernel info found");
return -EINVAL;
}
dev_dbg(ACRN_DBG_GUEST, "mod counts=%d\n", mbi->mi_mods_count);
/* mod[0] is for the kernel & cmdline; other mods are for ramdisk/firmware info */
mods = (struct multiboot_module *)(uint64_t)mbi->mi_mods_addr;
dev_dbg(ACRN_DBG_GUEST, "mod0 start=0x%x, end=0x%x",
mods[0].mm_mod_start, mods[0].mm_mod_end);
dev_dbg(ACRN_DBG_GUEST, "cmd addr=0x%x, str=%s", mods[0].mm_string,
(char *) (uint64_t)mods[0].mm_string);
vm->sw.kernel_type = VM_LINUX_GUEST;
vm->sw.kernel_info.kernel_src_addr =
(void *)(uint64_t)mods[0].mm_mod_start;
vm->sw.kernel_info.kernel_size =
mods[0].mm_mod_end - mods[0].mm_mod_start;
vm->sw.kernel_info.kernel_load_addr =
(void *)(uint64_t)mods[0].mm_mod_start;
vm->sw.linux_info.bootargs_src_addr =
(void *)(uint64_t)mods[0].mm_string;
vm->sw.linux_info.bootargs_load_addr =
(void *)BOOT_ARGS_LOAD_ADDR;
vm->sw.linux_info.bootargs_size =
strnlen_s((char *)(uint64_t) mods[0].mm_string, MEM_2K);
return 0;
}
uint64_t gva2gpa(struct vm *vm, uint64_t cr3, uint64_t gva)
{
int level, index, shift;
uint64_t *base, addr, entry, page_size;
uint64_t gpa = 0;
addr = cr3;
for (level = 3; level >= 0; level--) {
addr = addr & IA32E_REF_MASK;
base = GPA2HVA(vm, addr);
ASSERT(base != NULL, "invalid ptp base.");
shift = level * 9 + 12;
index = (gva >> shift) & 0x1FF;
page_size = 1UL << shift;
entry = base[index];
if (level > 0 && (entry & MMU_32BIT_PDE_PS) != 0)
break;
addr = entry;
}
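/* Strip the low 'shift' bits (flags/offset within the mapping) and the
* top 12 attribute bits (e.g. XD) from the entry to get the page frame
* base, then OR in the offset of gva within that page.
*/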
entry >>= shift; entry <<= (shift + 12); entry >>= 12;
gpa = entry | (gva & (page_size - 1));
return gpa;
}
void init_e820(void)
{
unsigned int i;
if (boot_regs[0] == MULTIBOOT_INFO_MAGIC) {
struct multiboot_info *mbi =
(struct multiboot_info *)((uint64_t)boot_regs[1]);
pr_info("Multiboot info detected\n");
if (mbi->mi_flags & 0x40) {
struct multiboot_mmap *mmap =
(struct multiboot_mmap *)
((uint64_t)mbi->mi_mmap_addr);
e820_entries = mbi->mi_mmap_length/
sizeof(struct multiboot_mmap);
if (e820_entries > E820_MAX_ENTRIES) {
pr_err("Too many E820 entries %d\n",
e820_entries);
e820_entries = E820_MAX_ENTRIES;
}
dev_dbg(ACRN_DBG_GUEST,
"mmap length 0x%x addr 0x%x entries %d\n",
mbi->mi_mmap_length, mbi->mi_mmap_addr,
e820_entries);
for (i = 0; i < e820_entries; i++) {
e820[i].baseaddr = mmap[i].baseaddr;
e820[i].length = mmap[i].length;
e820[i].type = mmap[i].type;
dev_dbg(ACRN_DBG_GUEST,
"mmap table: %d type: 0x%x\n",
i, mmap[i].type);
dev_dbg(ACRN_DBG_GUEST,
"Base: 0x%016llx length: 0x%016llx",
mmap[i].baseaddr, mmap[i].length);
}
}
} else
ASSERT(0, "no multiboot info found");
}
void obtain_e820_mem_info(void)
{
unsigned int i;
struct e820_entry *entry;
e820_mem.mem_bottom = UINT64_MAX;
e820_mem.mem_top = 0x00;
e820_mem.max_ram_blk_base = 0;
e820_mem.max_ram_blk_size = 0;
for (i = 0; i < e820_entries; i++) {
entry = &e820[i];
if (e820_mem.mem_bottom > entry->baseaddr)
e820_mem.mem_bottom = entry->baseaddr;
if (entry->baseaddr + entry->length
> e820_mem.mem_top) {
e820_mem.mem_top = entry->baseaddr
+ entry->length;
}
if (entry->baseaddr == UOS_DEFAULT_START_ADDR
&& entry->type == E820_TYPE_RAM) {
e820_mem.max_ram_blk_base =
entry->baseaddr;
e820_mem.max_ram_blk_size = entry->length;
}
}
}
static void rebuild_vm0_e820(void)
{
unsigned int i;
uint64_t entry_start;
uint64_t entry_end;
uint64_t hv_start = CONFIG_RAM_START;
uint64_t hv_end = hv_start + CONFIG_RAM_SIZE;
struct e820_entry *entry, new_entry = {0};
/* Hypervisor memory needs to be filtered out of the e820 table:
* the hv itself plus other hv-reserved memory (e.g. for vGT)
*/
for (i = 0; i < e820_entries; i++) {
entry = &e820[i];
entry_start = entry->baseaddr;
entry_end = entry->baseaddr + entry->length;
/* No need to handle these cases */
if (entry->type != E820_TYPE_RAM || entry_end <= hv_start
|| entry_start >= hv_end) {
continue;
}
/* filter out hv mem and adjust the length of this entry */
if (entry_start < hv_start && entry_end <= hv_end) {
entry->length = hv_start - entry_start;
continue;
}
/* filter out hv mem; a new entry needs to be created */
if (entry_start < hv_start && entry_end > hv_end) {
entry->length = hv_start - entry_start;
new_entry.baseaddr = hv_end;
new_entry.length = entry_end - hv_end;
new_entry.type = E820_TYPE_RAM;
continue;
}
/* This entry is within the range of hv mem
* change to E820_TYPE_RESERVED
*/
if (entry_start >= hv_start && entry_end <= hv_end) {
entry->type = E820_TYPE_RESERVED;
continue;
}
if (entry_start >= hv_start && entry_start < hv_end
&& entry_end > hv_end) {
entry->baseaddr = hv_end;
entry->length = entry_end - hv_end;
continue;
}
}
if (new_entry.length > 0) {
e820_entries++;
ASSERT(e820_entries <= E820_MAX_ENTRIES,
"e820 entry overflow");
entry = &e820[e820_entries - 1];
entry->baseaddr = new_entry.baseaddr;
entry->length = new_entry.length;
entry->type = new_entry.type;
}
}
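/*
 * Build the 1:1 EPT mapping for VM0 from the (rebuilt) e820 table:
 * map the whole range uncached first, upgrade RAM entries to write-back,
 * then punch out the hypervisor's own memory so that any SOS access to it
 * faults with an EPT violation.
 */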
int prepare_vm0_memmap_and_e820(struct vm *vm)
{
unsigned int i;
uint32_t attr_wb = (MMU_MEM_ATTR_READ |
MMU_MEM_ATTR_WRITE |
MMU_MEM_ATTR_EXECUTE |
MMU_MEM_ATTR_WB_CACHE);
uint32_t attr_uc = (MMU_MEM_ATTR_READ |
MMU_MEM_ATTR_WRITE |
MMU_MEM_ATTR_EXECUTE |
MMU_MEM_ATTR_UNCACHED);
struct e820_entry *entry;
ASSERT(is_vm0(vm), "This func only for vm0");
rebuild_vm0_e820();
dev_dbg(ACRN_DBG_GUEST,
"vm0: bottom memory - 0x%llx, top memory - 0x%llx\n",
e820_mem.mem_bottom, e820_mem.mem_top);
/* create real ept map for all ranges with UC */
ept_mmap(vm, e820_mem.mem_bottom, e820_mem.mem_bottom,
(e820_mem.mem_top - e820_mem.mem_bottom),
MAP_MMIO, attr_uc);
/* update ram entries to WB attr */
for (i = 0; i < e820_entries; i++) {
entry = &e820[i];
if (entry->type == E820_TYPE_RAM)
ept_mmap(vm, entry->baseaddr, entry->baseaddr,
entry->length, MAP_MEM, attr_wb);
}
dev_dbg(ACRN_DBG_GUEST, "VM0 e820 layout:\n");
for (i = 0; i < e820_entries; i++) {
entry = &e820[i];
dev_dbg(ACRN_DBG_GUEST,
"e820 table: %d type: 0x%x", i, entry->type);
dev_dbg(ACRN_DBG_GUEST,
"BaseAddress: 0x%016llx length: 0x%016llx\n",
entry->baseaddr, entry->length);
}
/* Unmap the hypervisor itself for safety; any SOS access to
 * hypervisor memory will then trigger an EPT violation
 */
ept_mmap(vm, CONFIG_RAM_START, CONFIG_RAM_START,
CONFIG_RAM_SIZE, MAP_UNMAP, 0);
return 0;
}

File diff suppressed because it is too large


@@ -0,0 +1,95 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* Copyright (c) 2017 Intel Corporation
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMM_INSTRUCTION_EMUL_H_
#define _VMM_INSTRUCTION_EMUL_H_
/*
* Callback functions to read and write memory regions.
*/
typedef int (*mem_region_read_t)(struct vcpu *vcpu, uint64_t gpa,
uint64_t *rval, int rsize, void *arg);
typedef int (*mem_region_write_t)(struct vcpu *vcpu, uint64_t gpa,
uint64_t wval, int wsize, void *arg);
/*
* Emulate the decoded 'vie' instruction.
*
* The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region
* containing 'gpa'. 'mrarg' is an opaque argument that is passed into the
* callback functions.
*
* 'void *vm' should be 'struct vm *' when called from kernel context and
* 'struct vmctx *' when called from user context.
*/
int vmm_emulate_instruction(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
struct vm_guest_paging *paging, mem_region_read_t mrr,
mem_region_write_t mrw, void *mrarg);
int vie_update_register(struct vcpu *vcpu, enum vm_reg_name reg,
uint64_t val, int size);
/*
* Returns 1 if an alignment check exception should be injected and 0 otherwise.
*/
int vie_alignment_check(int cpl, int operand_size, uint64_t cr0,
uint64_t rflags, uint64_t gla);
/* Returns 1 if the 'gla' is not canonical and 0 otherwise. */
int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla);
uint64_t vie_size2mask(int size);
int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
struct seg_desc *desc, uint64_t off, int length, int addrsize, int prot,
uint64_t *gla);
void vie_init(struct vie *vie, const char *inst_bytes, int inst_length);
/*
* Decode the instruction fetched into 'vie' so it can be emulated.
*
* 'gla' is the guest linear address provided by the hardware assist
* that caused the nested page table fault. It is used to verify that
* the software instruction decoding is in agreement with the hardware.
*
* Some hardware assists do not provide the 'gla' to the hypervisor.
* To skip the 'gla' verification for this or any other reason pass
* in VIE_INVALID_GLA instead.
*/
#define VIE_INVALID_GLA (1UL << 63) /* a non-canonical address */
int vmm_decode_instruction(struct vcpu *vcpu, uint64_t gla,
enum vm_cpu_mode cpu_mode, int csd, struct vie *vie);
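/*
 * Typical call sequence (see instr_emul_wrapper.c): vie_init() the decode
 * context with the instruction bytes fetched from the guest RIP, run
 * vmm_decode_instruction(), then pass the decoded 'vie' to
 * vmm_emulate_instruction() together with the mmio read/write callbacks.
 */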
int emulate_instruction(struct vcpu *vcpu, struct mem_io *mmio);
int analyze_instruction(struct vcpu *vcpu, struct mem_io *mmio);
#endif /* _VMM_INSTRUCTION_EMUL_H_ */


@@ -0,0 +1,466 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <hypervisor.h>
#include <hv_lib.h>
#include <acrn_common.h>
#include <hv_arch.h>
#include <bsp_extern.h>
#include <hv_debug.h>
#include "instr_emul_wrapper.h"
#include "instr_emul.h"
struct emul_cnx {
struct vie vie;
struct vm_guest_paging paging;
struct vcpu *vcpu;
struct mem_io *mmio;
};
static DEFINE_CPU_DATA(struct emul_cnx, g_inst_ctxt);
static int
encode_vmcs_seg_desc(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc);
static int32_t
get_vmcs_field(int ident);
static bool
is_segment_register(int reg);
static bool
is_descriptor_table(int reg);
int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval)
{
struct run_context *cur_context;
if (!vcpu)
return -EINVAL;
if ((reg >= VM_REG_LAST) || (reg < VM_REG_GUEST_RAX))
return -EINVAL;
if ((reg >= VM_REG_GUEST_RAX) && (reg <= VM_REG_GUEST_RDI)) {
cur_context =
&vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context];
*retval = cur_context->guest_cpu_regs.longs[reg];
} else if ((reg > VM_REG_GUEST_RDI) && (reg < VM_REG_LAST)) {
int32_t field = get_vmcs_field(reg);
if (field != -1)
*retval = exec_vmread(field);
else
return -EINVAL;
}
return 0;
}
int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
struct run_context *cur_context;
if (!vcpu)
return -EINVAL;
if ((reg >= VM_REG_LAST) || (reg < VM_REG_GUEST_RAX))
return -EINVAL;
if ((reg >= VM_REG_GUEST_RAX) && (reg <= VM_REG_GUEST_RDI)) {
cur_context =
&vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context];
cur_context->guest_cpu_regs.longs[reg] = val;
} else if ((reg > VM_REG_GUEST_RDI) && (reg < VM_REG_LAST)) {
int32_t field = get_vmcs_field(reg);
if (field != -1)
exec_vmwrite(field, val);
else
return -EINVAL;
}
return 0;
}
int vm_set_seg_desc(struct vcpu *vcpu, int seg, struct seg_desc *ret_desc)
{
int error;
uint32_t base, limit, access;
if ((!vcpu) || (!ret_desc))
return -EINVAL;
if (!is_segment_register(seg) && !is_descriptor_table(seg))
return -EINVAL;
error = encode_vmcs_seg_desc(seg, &base, &limit, &access);
if ((error != 0) || (access == 0xffffffff))
return -EINVAL;
exec_vmwrite(base, ret_desc->base);
exec_vmwrite(limit, ret_desc->limit);
exec_vmwrite(access, ret_desc->access);
return 0;
}
int vm_get_seg_desc(struct vcpu *vcpu, int seg, struct seg_desc *desc)
{
int error;
uint32_t base, limit, access;
if ((!vcpu) || (!desc))
return -EINVAL;
if (!is_segment_register(seg) && !is_descriptor_table(seg))
return -EINVAL;
error = encode_vmcs_seg_desc(seg, &base, &limit, &access);
if ((error != 0) || (access == 0xffffffff))
return -EINVAL;
desc->base = exec_vmread(base);
desc->limit = exec_vmread(limit);
desc->access = exec_vmread(access);
return 0;
}
int vm_restart_instruction(struct vcpu *vcpu)
{
if (!vcpu)
return -EINVAL;
VCPU_RETAIN_RIP(vcpu);
return 0;
}
static bool is_descriptor_table(int reg)
{
switch (reg) {
case VM_REG_GUEST_IDTR:
case VM_REG_GUEST_GDTR:
return true;
default:
return false;
}
}
static bool is_segment_register(int reg)
{
switch (reg) {
case VM_REG_GUEST_ES:
case VM_REG_GUEST_CS:
case VM_REG_GUEST_SS:
case VM_REG_GUEST_DS:
case VM_REG_GUEST_FS:
case VM_REG_GUEST_GS:
case VM_REG_GUEST_TR:
case VM_REG_GUEST_LDTR:
return true;
default:
return false;
}
}
static int encode_vmcs_seg_desc(int seg, uint32_t *base, uint32_t *lim,
uint32_t *acc)
{
switch (seg) {
case VM_REG_GUEST_ES:
*base = VMX_GUEST_ES_BASE;
*lim = VMX_GUEST_ES_LIMIT;
*acc = VMX_GUEST_ES_ATTR;
break;
case VM_REG_GUEST_CS:
*base = VMX_GUEST_CS_BASE;
*lim = VMX_GUEST_CS_LIMIT;
*acc = VMX_GUEST_CS_ATTR;
break;
case VM_REG_GUEST_SS:
*base = VMX_GUEST_SS_BASE;
*lim = VMX_GUEST_SS_LIMIT;
*acc = VMX_GUEST_SS_ATTR;
break;
case VM_REG_GUEST_DS:
*base = VMX_GUEST_DS_BASE;
*lim = VMX_GUEST_DS_LIMIT;
*acc = VMX_GUEST_DS_ATTR;
break;
case VM_REG_GUEST_FS:
*base = VMX_GUEST_FS_BASE;
*lim = VMX_GUEST_FS_LIMIT;
*acc = VMX_GUEST_FS_ATTR;
break;
case VM_REG_GUEST_GS:
*base = VMX_GUEST_GS_BASE;
*lim = VMX_GUEST_GS_LIMIT;
*acc = VMX_GUEST_GS_ATTR;
break;
case VM_REG_GUEST_TR:
*base = VMX_GUEST_TR_BASE;
*lim = VMX_GUEST_TR_LIMIT;
*acc = VMX_GUEST_TR_ATTR;
break;
case VM_REG_GUEST_LDTR:
*base = VMX_GUEST_LDTR_BASE;
*lim = VMX_GUEST_LDTR_LIMIT;
*acc = VMX_GUEST_LDTR_ATTR;
break;
case VM_REG_GUEST_IDTR:
*base = VMX_GUEST_IDTR_BASE;
*lim = VMX_GUEST_IDTR_LIMIT;
*acc = 0xffffffff;
break;
case VM_REG_GUEST_GDTR:
*base = VMX_GUEST_GDTR_BASE;
*lim = VMX_GUEST_GDTR_LIMIT;
*acc = 0xffffffff;
break;
default:
return -EINVAL;
}
return 0;
}
static int32_t get_vmcs_field(int ident)
{
switch (ident) {
case VM_REG_GUEST_CR0:
return VMX_GUEST_CR0;
case VM_REG_GUEST_CR3:
return VMX_GUEST_CR3;
case VM_REG_GUEST_CR4:
return VMX_GUEST_CR4;
case VM_REG_GUEST_DR7:
return VMX_GUEST_DR7;
case VM_REG_GUEST_RSP:
return VMX_GUEST_RSP;
case VM_REG_GUEST_RIP:
return VMX_GUEST_RIP;
case VM_REG_GUEST_RFLAGS:
return VMX_GUEST_RFLAGS;
case VM_REG_GUEST_ES:
return VMX_GUEST_ES_SEL;
case VM_REG_GUEST_CS:
return VMX_GUEST_CS_SEL;
case VM_REG_GUEST_SS:
return VMX_GUEST_SS_SEL;
case VM_REG_GUEST_DS:
return VMX_GUEST_DS_SEL;
case VM_REG_GUEST_FS:
return VMX_GUEST_FS_SEL;
case VM_REG_GUEST_GS:
return VMX_GUEST_GS_SEL;
case VM_REG_GUEST_TR:
return VMX_GUEST_TR_SEL;
case VM_REG_GUEST_LDTR:
return VMX_GUEST_LDTR_SEL;
case VM_REG_GUEST_EFER:
return VMX_GUEST_IA32_EFER_FULL;
case VM_REG_GUEST_PDPTE0:
return VMX_GUEST_PDPTE0_FULL;
case VM_REG_GUEST_PDPTE1:
return VMX_GUEST_PDPTE1_FULL;
case VM_REG_GUEST_PDPTE2:
return VMX_GUEST_PDPTE2_FULL;
case VM_REG_GUEST_PDPTE3:
return VMX_GUEST_PDPTE3_FULL;
default:
return -1;
}
}
static enum vm_cpu_mode get_vmx_cpu_mode(void)
{
uint32_t csar;
if (exec_vmread(VMX_GUEST_IA32_EFER_FULL) & EFER_LMA) {
csar = exec_vmread(VMX_GUEST_CS_ATTR);
if (csar & 0x2000)
return CPU_MODE_64BIT; /* CS.L = 1 */
else
return CPU_MODE_COMPATIBILITY;
} else if (exec_vmread(VMX_GUEST_CR0) & CR0_PE) {
return CPU_MODE_PROTECTED;
} else {
return CPU_MODE_REAL;
}
}
static void get_guest_paging_info(struct vcpu *vcpu, struct emul_cnx *emul_cnx)
{
uint32_t cpl, csar;
ASSERT(emul_cnx != NULL && vcpu != NULL, "Error in input arguments");
csar = exec_vmread(VMX_GUEST_CS_ATTR);
cpl = (csar >> 5) & 3;
emul_cnx->paging.cr3 =
vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context].cr3;
emul_cnx->paging.cpl = cpl;
emul_cnx->paging.cpu_mode = get_vmx_cpu_mode();
emul_cnx->paging.paging_mode = PAGING_MODE_FLAT;/*maybe change later*/
}
static int mmio_read(struct vcpu *vcpu, __unused uint64_t gpa, uint64_t *rval,
__unused int size, __unused void *arg)
{
struct emul_cnx *emul_cnx;
struct mem_io *mmio;
if (!vcpu)
return -EINVAL;
emul_cnx = &per_cpu(g_inst_ctxt, vcpu->pcpu_id);
mmio = emul_cnx->mmio;
ASSERT(mmio != NULL, "invalid mmio when reading");
*rval = mmio->value;
return 0;
}
static int mmio_write(struct vcpu *vcpu, __unused uint64_t gpa, uint64_t wval,
__unused int size, __unused void *arg)
{
struct emul_cnx *emul_cnx;
struct mem_io *mmio;
if (!vcpu)
return -EINVAL;
emul_cnx = &per_cpu(g_inst_ctxt, vcpu->pcpu_id);
mmio = emul_cnx->mmio;
ASSERT(mmio != NULL, "invalid mmio when writing");
mmio->value = wval;
return 0;
}
void vm_gva2gpa(struct vcpu *vcpu, uint64_t gva, uint64_t *gpa)
{
ASSERT(gpa != NULL, "Error in input arguments");
ASSERT(vcpu != NULL,
"Invalid vcpu id when gva2gpa");
*gpa = gva2gpa(vcpu->vm,
vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context].cr3, gva);
}
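/*
 * Fetch and decode the instruction that triggered the MMIO exit: translate
 * the guest RIP (GVA -> GPA -> HVA), initialize the per-pcpu decode context
 * and hand it to the instruction decoder. The decoded context is stashed in
 * mmio->private_data for the later emulate_instruction() call.
 */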
int analyze_instruction(struct vcpu *vcpu, struct mem_io *mmio)
{
uint64_t guest_rip_gva, guest_rip_gpa;
char *guest_rip_hva;
struct emul_cnx *emul_cnx;
uint32_t csar;
int retval = 0;
enum vm_cpu_mode cpu_mode;
int i;
guest_rip_gva =
vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context].rip;
guest_rip_gpa = gva2gpa(vcpu->vm,
vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context].cr3,
guest_rip_gva);
guest_rip_hva = GPA2HVA(vcpu->vm, guest_rip_gpa);
emul_cnx = &per_cpu(g_inst_ctxt, vcpu->pcpu_id);
emul_cnx->mmio = mmio;
emul_cnx->vcpu = vcpu;
/* for now, HVA <-> HPA is a 1:1 mapping, so using the HPA is fine */
vie_init(&emul_cnx->vie, guest_rip_hva,
vcpu->arch_vcpu.inst_len);
get_guest_paging_info(vcpu, emul_cnx);
csar = exec_vmread(VMX_GUEST_CS_ATTR);
cpu_mode = get_vmx_cpu_mode();
mmio->private_data = emul_cnx;
retval = vmm_decode_instruction(vcpu, guest_rip_gva,
cpu_mode, SEG_DESC_DEF32(csar), &emul_cnx->vie);
mmio->access_size = emul_cnx->vie.opsize;
if (retval != 0) {
/* dump the instruction when decoding failed */
pr_err("decode following instruction failed @ 0x%016llx:",
exec_vmread(VMX_GUEST_RIP));
for (i = 0; i < emul_cnx->vie.num_valid; i++) {
if (i >= VIE_INST_SIZE)
break;
if (i == 0)
pr_err("\n");
pr_err("%d=%02hhx ",
i, emul_cnx->vie.inst[i]);
}
}
return retval;
}
int emulate_instruction(struct vcpu *vcpu, struct mem_io *mmio)
{
struct emul_cnx *emul_cnx = (struct emul_cnx *)(mmio->private_data);
struct vm_guest_paging *paging = &emul_cnx->paging;
int i, retval = 0;
uint64_t gpa = mmio->paddr;
mem_region_read_t mread = mmio_read;
mem_region_write_t mwrite = mmio_write;
retval = vmm_emulate_instruction(vcpu, gpa,
&emul_cnx->vie, paging, mread, mwrite, &retval);
if (retval != 0) {
/* dump the instruction when emulation failed */
pr_err("emulate following instruction failed @ 0x%016llx:",
exec_vmread(VMX_GUEST_RIP));
for (i = 0; i < emul_cnx->vie.num_valid; i++) {
if (i >= VIE_INST_SIZE)
break;
if (i == 0)
pr_err("\n");
pr_err("%d=%02hhx ",
i, emul_cnx->vie.inst[i]);
}
}
return retval;
}


@@ -0,0 +1,203 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* Copyright (c) 2017 Intel Corporation
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <cpu.h>
struct vie_op {
uint8_t op_byte; /* actual opcode byte */
uint8_t op_type; /* type of operation (e.g. MOV) */
uint16_t op_flags;
};
#define VIE_INST_SIZE 15
struct vie {
uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */
uint8_t num_valid; /* size of the instruction */
uint8_t num_processed;
uint8_t addrsize:4, opsize:4; /* address and operand sizes */
uint8_t rex_w:1, /* REX prefix */
rex_r:1,
rex_x:1,
rex_b:1,
rex_present:1,
repz_present:1, /* REP/REPE/REPZ prefix */
repnz_present:1, /* REPNE/REPNZ prefix */
opsize_override:1, /* Operand size override */
addrsize_override:1, /* Address size override */
segment_override:1; /* Segment override */
uint8_t mod:2, /* ModRM byte */
reg:4,
rm:4;
uint8_t ss:2, /* SIB byte */
index:4,
base:4;
uint8_t disp_bytes;
uint8_t imm_bytes;
uint8_t scale;
int base_register; /* VM_REG_GUEST_xyz */
int index_register; /* VM_REG_GUEST_xyz */
int segment_register; /* VM_REG_GUEST_xyz */
int64_t displacement; /* optional addr displacement */
int64_t immediate; /* optional immediate operand */
uint8_t decoded; /* set to 1 if successfully decoded */
struct vie_op op; /* opcode description */
};
#define PSL_C 0x00000001 /* carry bit */
#define PSL_PF 0x00000004 /* parity bit */
#define PSL_AF 0x00000010 /* bcd carry bit */
#define PSL_Z 0x00000040 /* zero bit */
#define PSL_N 0x00000080 /* negative bit */
#define PSL_T 0x00000100 /* trace enable bit */
#define PSL_I 0x00000200 /* interrupt enable bit */
#define PSL_D 0x00000400 /* string instruction direction bit */
#define PSL_V 0x00000800 /* overflow bit */
#define PSL_IOPL 0x00003000 /* i/o privilege level */
#define PSL_NT 0x00004000 /* nested task bit */
#define PSL_RF 0x00010000 /* resume flag bit */
#define PSL_VM 0x00020000 /* virtual 8086 mode bit */
#define PSL_AC 0x00040000 /* alignment checking */
#define PSL_VIF 0x00080000 /* virtual interrupt enable */
#define PSL_VIP 0x00100000 /* virtual interrupt pending */
#define PSL_ID 0x00200000 /* identification bit */
/*
* The 'access' field has the format specified in Table 21-2 of the Intel
* Architecture Manual vol 3b.
*
* XXX The contents of the 'access' field are architecturally defined except
* bit 16 - Segment Unusable.
*/
struct seg_desc {
uint64_t base;
uint32_t limit;
uint32_t access;
};
/*
* Protections are chosen from these bits, or-ed together
*/
#define PROT_NONE 0x00 /* no permissions */
#define PROT_READ 0x01 /* pages can be read */
#define PROT_WRITE 0x02 /* pages can be written */
#define PROT_EXEC 0x04 /* pages can be executed */
#define SEG_DESC_TYPE(access) ((access) & 0x001f)
#define SEG_DESC_DPL(access) (((access) >> 5) & 0x3)
#define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0)
#define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0)
#define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0)
#define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0)
enum vm_cpu_mode {
CPU_MODE_REAL,
CPU_MODE_PROTECTED,
CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */
CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */
};
enum vm_paging_mode {
PAGING_MODE_FLAT,
PAGING_MODE_32,
PAGING_MODE_PAE,
PAGING_MODE_64,
};
struct vm_guest_paging {
uint64_t cr3;
int cpl;
enum vm_cpu_mode cpu_mode;
enum vm_paging_mode paging_mode;
};
/*
* Identifiers for architecturally defined registers.
*/
enum vm_reg_name {
VM_REG_GUEST_RAX,
VM_REG_GUEST_RBX,
VM_REG_GUEST_RCX,
VM_REG_GUEST_RDX,
VM_REG_GUEST_RBP,
VM_REG_GUEST_RSI,
VM_REG_GUEST_R8,
VM_REG_GUEST_R9,
VM_REG_GUEST_R10,
VM_REG_GUEST_R11,
VM_REG_GUEST_R12,
VM_REG_GUEST_R13,
VM_REG_GUEST_R14,
VM_REG_GUEST_R15,
VM_REG_GUEST_RDI,
VM_REG_GUEST_CR0,
VM_REG_GUEST_CR3,
VM_REG_GUEST_CR4,
VM_REG_GUEST_DR7,
VM_REG_GUEST_RSP,
VM_REG_GUEST_RIP,
VM_REG_GUEST_RFLAGS,
VM_REG_GUEST_ES,
VM_REG_GUEST_CS,
VM_REG_GUEST_SS,
VM_REG_GUEST_DS,
VM_REG_GUEST_FS,
VM_REG_GUEST_GS,
VM_REG_GUEST_LDTR,
VM_REG_GUEST_TR,
VM_REG_GUEST_IDTR,
VM_REG_GUEST_GDTR,
VM_REG_GUEST_EFER,
VM_REG_GUEST_CR2,
VM_REG_GUEST_PDPTE0,
VM_REG_GUEST_PDPTE1,
VM_REG_GUEST_PDPTE2,
VM_REG_GUEST_PDPTE3,
VM_REG_GUEST_INTR_SHADOW,
VM_REG_LAST
};
typedef unsigned long u_long;
int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval);
int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val);
int vm_get_seg_desc(struct vcpu *vcpu, int reg,
struct seg_desc *ret_desc);
int vm_set_seg_desc(struct vcpu *vcpu, int reg,
struct seg_desc *desc);
int vm_restart_instruction(struct vcpu *vcpu);
void vm_gva2gpa(struct vcpu *vcpu, uint64_t gla, uint64_t *gpa);


@@ -0,0 +1,118 @@
/*-
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
* Copyright (c) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)time.h 8.5 (Berkeley) 5/4/95
* $FreeBSD$
*/
#ifndef _TIME_H_
#define _TIME_H_
struct callout {
void *c_arg; /* function argument */
void (*c_func)(void *); /* function to call */
short c_flags; /* User State */
};
#define CALLOUT_ACTIVE 0x0002 /* callout is currently active */
#define CALLOUT_PENDING 0x0004 /* callout is waiting for timeout */
#define callout_active(c) ((c)->c_flags & CALLOUT_ACTIVE)
#define callout_deactivate(c) ((c)->c_flags &= ~CALLOUT_ACTIVE)
#define callout_pending(c) ((c)->c_flags & CALLOUT_PENDING)
typedef int64_t time_t;
typedef int64_t sbintime_t;
struct bintime {
time_t sec;
uint64_t frac;
};
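/*
 * A bintime is a binary fixed-point time value: 'sec' whole seconds plus
 * 'frac'/2^64 of a second. For example, 0.5s is represented as
 * { .sec = 0, .frac = 1ULL << 63 }, and FREQ2BT(1000, &bt) below yields
 * the period of a 1 kHz clock (1 ms) in the same format.
 */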
static inline void
bintime_add(struct bintime *_bt, const struct bintime *_bt2)
{
uint64_t _u;
_u = _bt->frac;
_bt->frac += _bt2->frac;
if (_u > _bt->frac)
_bt->sec++;
_bt->sec += _bt2->sec;
}
static inline void
bintime_sub(struct bintime *_bt, const struct bintime *_bt2)
{
uint64_t _u;
_u = _bt->frac;
_bt->frac -= _bt2->frac;
if (_u < _bt->frac)
_bt->sec--;
_bt->sec -= _bt2->sec;
}
static inline void
bintime_mul(struct bintime *_bt, uint32_t _x)
{
uint64_t _p1, _p2;
_p1 = (_bt->frac & 0xffffffffull) * _x;
_p2 = (_bt->frac >> 32) * _x + (_p1 >> 32);
_bt->sec *= _x;
_bt->sec += (_p2 >> 32);
_bt->frac = (_p2 << 32) | (_p1 & 0xffffffffull);
}
#define bintime_cmp(a, b, cmp) \
(((a)->sec == (b)->sec) ? \
((a)->frac cmp(b)->frac) : \
((a)->sec cmp(b)->sec))
#define SBT_1S ((sbintime_t)1 << 32)
#define SBT_1US (SBT_1S / 1000000)
#define BT2FREQ(bt) \
(((uint64_t)0x8000000000000000 + ((bt)->frac >> 2)) / \
((bt)->frac >> 1))
#define FREQ2BT(freq, bt) \
{ \
(bt)->sec = 0; \
(bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1; \
}
static inline sbintime_t
bttosbt(const struct bintime _bt)
{
return (((sbintime_t)_bt.sec << 32) + (_bt.frac >> 32));
}
#endif /* !_TIME_H_ */


@@ -0,0 +1,357 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <hypervisor.h>
#include <hv_lib.h>
#include <acrn_common.h>
#include <hv_arch.h>
#include <schedule.h>
#include <hv_debug.h>
vm_sw_loader_t vm_sw_loader;
/***********************************************************************
* vcpu_id/pcpu_id mapping table:
*
* if
* VM0_CPUS[2] = {0, 2} , VM1_CPUS[2] = {3, 1};
* then
* for physical CPU 0 : vcpu->pcpu_id = 0, vcpu->vcpu_id = 0, vmid = 0;
* for physical CPU 2 : vcpu->pcpu_id = 2, vcpu->vcpu_id = 1, vmid = 0;
* for physical CPU 3 : vcpu->pcpu_id = 3, vcpu->vcpu_id = 0, vmid = 1;
* for physical CPU 1 : vcpu->pcpu_id = 1, vcpu->vcpu_id = 1, vmid = 1;
*
***********************************************************************/
int create_vcpu(int cpu_id, struct vm *vm, struct vcpu **rtn_vcpu_handle)
{
struct vcpu *vcpu;
ASSERT(vm != NULL, "");
ASSERT(rtn_vcpu_handle != NULL, "");
pr_info("Creating VCPU %d", cpu_id);
/* Allocate memory for VCPU */
vcpu = calloc(1, sizeof(struct vcpu));
ASSERT(vcpu != NULL, "");
/* Initialize the physical CPU ID for this VCPU */
vcpu->pcpu_id = cpu_id;
/* Initialize the parent VM reference */
vcpu->vm = vm;
/* Initialize the virtual ID for this VCPU */
/* FIXME:
 * We assume that vcpus are always destroyed in one shot
 * (e.g. when the vm is destroyed). If specific on-the-fly
 * vcpu destruction needs to be supported, this vcpu_id
 * assignment needs revising.
 */
/*
* vcpu->vcpu_id = vm->hw.created_vcpus;
* vm->hw.created_vcpus++;
*/
vcpu->vcpu_id = atomic_xadd_int(&vm->hw.created_vcpus, 1);
/* vm->hw.vcpu_array[vcpu->vcpu_id] = vcpu; */
atomic_store_rel_64(
(unsigned long *)&vm->hw.vcpu_array[vcpu->vcpu_id],
(unsigned long)vcpu);
ASSERT(vcpu->vcpu_id < vm->hw.num_vcpus,
"Allocated vcpu_id is out of range!");
per_cpu(vcpu, cpu_id) = vcpu;
pr_info("PCPU%d is working as VM%d VCPU%d, Role: %s",
vcpu->pcpu_id, vcpu->vm->attr.id, vcpu->vcpu_id,
is_vcpu_bsp(vcpu) ? "PRIMARY" : "SECONDARY");
/* If this VCPU is the VM BSP, create the page hierarchy for this VM */
if (is_vcpu_bsp(vcpu)) {
/* Set up temporary guest page tables */
vm->arch_vm.guest_pml4 = create_guest_paging(vm);
pr_info("VM *d VCPU %d CR3: 0x%016llx ",
vm->attr.id, vcpu->vcpu_id, vm->arch_vm.guest_pml4);
}
/* Allocate VMCS region for this VCPU */
vcpu->arch_vcpu.vmcs = alloc_page();
ASSERT(vcpu->arch_vcpu.vmcs != NULL, "");
/* Memset VMCS region for this VCPU */
memset(vcpu->arch_vcpu.vmcs, 0, CPU_PAGE_SIZE);
/* Initialize exception field in VCPU context */
vcpu->arch_vcpu.exception_info.exception = -1;
/* Initialize cur context */
vcpu->arch_vcpu.cur_context = NORMAL_WORLD;
/* Create per vcpu vlapic */
vlapic_create(vcpu);
/* Populate the return handle */
*rtn_vcpu_handle = vcpu;
vcpu->launched = false;
vcpu->paused_cnt = 0;
vcpu->running = 0;
vcpu->ioreq_pending = 0;
vcpu->arch_vcpu.nr_sipi = 0;
vcpu->pending_pre_work = 0;
vcpu->state = VCPU_INIT;
return 0;
}
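/*
 * Run the vcpu on the current physical cpu: VMLAUNCH on first entry,
 * VMRESUME afterwards (with the guest RIP advanced past the previously
 * emulated instruction). On VM exit the guest RIP/RSP/RFLAGS/CR3 and the
 * exit reason are read back from the VMCS into the run context.
 */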
int start_vcpu(struct vcpu *vcpu)
{
uint64_t rip, instlen;
struct run_context *cur_context =
&vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context];
int64_t status = 0;
ASSERT(vcpu != NULL, "Incorrect arguments");
/* If this VCPU is not already launched, launch it */
if (!vcpu->launched) {
pr_info("VM %d Starting VCPU %d",
vcpu->vm->attr.id, vcpu->vcpu_id);
/* Set vcpu launched */
vcpu->launched = true;
/* Avoid reuse of RSB contents across the VMCS switch: set IBPB.
 * NOTE: this should be done every time the VMCS is switched.
 * Currently there is no other place where a VMCS switch happens;
 * please add an IBPB write for future VMCS switch cases (like trusty).
 */
if (ibrs_type == IBRS_RAW)
msr_write(MSR_IA32_PRED_CMD, PRED_SET_IBPB);
/* Launch the VM */
status = vmx_vmrun(cur_context, VM_LAUNCH, ibrs_type);
/* See if VM launched successfully */
if (status == 0) {
if (is_vcpu_bsp(vcpu)) {
pr_info("VM %d VCPU %d successfully launched",
vcpu->vm->attr.id, vcpu->vcpu_id);
}
}
} else {
/* This VCPU was already launched, check if the last guest
* instruction needs to be repeated and resume VCPU accordingly
*/
instlen = vcpu->arch_vcpu.inst_len;
rip = cur_context->rip;
exec_vmwrite(VMX_GUEST_RIP, ((rip + instlen) &
0xFFFFFFFFFFFFFFFF));
/* Resume the VM */
status = vmx_vmrun(cur_context, VM_RESUME, ibrs_type);
}
/* Save guest CR3 register */
cur_context->cr3 = exec_vmread(VMX_GUEST_CR3);
/* Obtain current VCPU instruction pointer and length */
cur_context->rip = exec_vmread(VMX_GUEST_RIP);
vcpu->arch_vcpu.inst_len = exec_vmread(VMX_EXIT_INSTR_LEN);
cur_context->rsp = exec_vmread(VMX_GUEST_RSP);
cur_context->rflags = exec_vmread(VMX_GUEST_RFLAGS);
/* Obtain VM exit reason */
vcpu->arch_vcpu.exit_reason = exec_vmread(VMX_EXIT_REASON);
if (status != 0) {
/* refer to 64-ia32 spec section 24.9.1 volume#3 */
if (vcpu->arch_vcpu.exit_reason & VMX_VMENTRY_FAIL)
pr_fatal("vmentry fail reason=%lx", vcpu->arch_vcpu.exit_reason);
else
pr_fatal("vmexit fail err_inst=%lx", exec_vmread(VMX_INSTR_ERROR));
ASSERT(status == 0, "vm fail");
}
return status;
}
int shutdown_vcpu(__unused struct vcpu *vcpu)
{
/* TODO : Implement VCPU shutdown sequence */
return 0;
}
int destroy_vcpu(struct vcpu *vcpu)
{
ASSERT(vcpu != NULL, "Incorrect arguments");
/* vcpu->vm->hw.vcpu_array[vcpu->vcpu_id] = NULL; */
atomic_store_rel_64(
(unsigned long *)&vcpu->vm->hw.vcpu_array[vcpu->vcpu_id],
(unsigned long)NULL);
atomic_subtract_int(&vcpu->vm->hw.created_vcpus, 1);
vlapic_free(vcpu);
free(vcpu->arch_vcpu.vmcs);
free(vcpu->guest_msrs);
free_pcpu(vcpu->pcpu_id);
free(vcpu);
return 0;
}
/* NOTE:
 * the vcpu should be paused before calling this function.
 */
void reset_vcpu(struct vcpu *vcpu)
{
struct vlapic *vlapic;
pr_dbg("vcpu%d reset", vcpu->vcpu_id);
ASSERT(vcpu->state != VCPU_RUNNING,
"reset vcpu when it's running");
if (vcpu->state == VCPU_INIT)
return;
vcpu->state = VCPU_INIT;
vcpu->launched = false;
vcpu->paused_cnt = 0;
vcpu->running = 0;
vcpu->ioreq_pending = 0;
vcpu->arch_vcpu.nr_sipi = 0;
vcpu->pending_pre_work = 0;
vlapic = vcpu->arch_vcpu.vlapic;
vlapic_init(vlapic);
}
void init_vcpu(struct vcpu *vcpu)
{
if (is_vcpu_bsp(vcpu))
vcpu->arch_vcpu.cpu_mode = PAGE_PROTECTED_MODE;
else
vcpu->arch_vcpu.cpu_mode = REAL_MODE;
/* init_vmcs is delayed to vcpu vmcs launch first time */
}
void pause_vcpu(struct vcpu *vcpu, enum vcpu_state new_state)
{
int pcpu_id = get_cpu_id();
pr_dbg("vcpu%d paused, new state: %d",
vcpu->vcpu_id, new_state);
vcpu->prev_state = vcpu->state;
vcpu->state = new_state;
get_schedule_lock(pcpu_id);
if (atomic_load_acq_32(&vcpu->running) == 1) {
remove_vcpu_from_runqueue(vcpu);
make_reschedule_request(vcpu);
release_schedule_lock(pcpu_id);
if (vcpu->pcpu_id != pcpu_id) {
while (atomic_load_acq_32(&vcpu->running) == 1)
__asm__ __volatile("pause" ::: "memory");
}
} else {
remove_vcpu_from_runqueue(vcpu);
release_schedule_lock(pcpu_id);
}
}
void resume_vcpu(struct vcpu *vcpu)
{
pr_dbg("vcpu%d resumed", vcpu->vcpu_id);
vcpu->state = vcpu->prev_state;
get_schedule_lock(vcpu->pcpu_id);
if (vcpu->state == VCPU_RUNNING) {
add_vcpu_to_runqueue(vcpu);
make_reschedule_request(vcpu);
}
release_schedule_lock(vcpu->pcpu_id);
}
void schedule_vcpu(struct vcpu *vcpu)
{
vcpu->state = VCPU_RUNNING;
pr_dbg("vcpu%d scheduled", vcpu->vcpu_id);
get_schedule_lock(vcpu->pcpu_id);
add_vcpu_to_runqueue(vcpu);
make_reschedule_request(vcpu);
release_schedule_lock(vcpu->pcpu_id);
}
/* helper function for vcpu creation */
int prepare_vcpu(struct vm *vm, int pcpu_id)
{
int ret = 0;
struct vcpu *vcpu = NULL;
ret = create_vcpu(pcpu_id, vm, &vcpu);
ASSERT(ret == 0, "vcpu create failed");
if (is_vcpu_bsp(vcpu)) {
/* Load VM SW */
if (!vm_sw_loader)
vm_sw_loader = general_sw_loader;
vm_sw_loader(vm, vcpu);
vcpu->arch_vcpu.cpu_mode = PAGE_PROTECTED_MODE;
} else {
vcpu->arch_vcpu.cpu_mode = REAL_MODE;
}
/* init_vmcs is delayed to vcpu vmcs launch first time */
/* initialize the vcpu tsc aux */
vcpu->msr_tsc_aux_guest = vcpu->vcpu_id;
set_pcpu_used(pcpu_id);
INIT_LIST_HEAD(&vcpu->run_list);
return ret;
}
void request_vcpu_pre_work(struct vcpu *vcpu, int pre_work_id)
{
bitmap_set(pre_work_id, &vcpu->pending_pre_work);
}


@@ -0,0 +1,662 @@
/*-
* Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
* Copyright (c) 2013 Neel Natu <neel@freebsd.org>
* Copyright (c) 2017 Intel Corporation
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#define pr_fmt(fmt) "vioapic: " fmt
#include <hypervisor.h>
#include <hv_lib.h>
#include <acrn_common.h>
#include <hv_arch.h>
#include <hv_debug.h>
#define IOREGSEL 0x00
#define IOWIN 0x10
#define IOEOI 0x40
#define REDIR_ENTRIES_HW 120 /* SOS align with native ioapic */
#define REDIR_ENTRIES_UOS 24 /* UOS pins*/
#define RTBL_RO_BITS ((uint64_t)(IOAPIC_RTE_REM_IRR | IOAPIC_RTE_DELIVS))
#define ACRN_DBG_IOAPIC 6
struct vioapic {
struct vm *vm;
spinlock_t mtx;
uint32_t id;
uint32_t ioregsel;
struct {
uint64_t reg;
int acnt; /* sum of pin asserts (+1) and deasserts (-1) */
} rtbl[REDIR_ENTRIES_HW];
};
#define VIOAPIC_LOCK(vioapic) spinlock_obtain(&((vioapic)->mtx))
#define VIOAPIC_UNLOCK(vioapic) spinlock_release(&((vioapic)->mtx))
static inline const char *pinstate_str(bool asserted)
{
return (asserted) ? "asserted" : "deasserted";
}
struct vioapic *
vm_ioapic(struct vm *vm)
{
return (struct vioapic *)vm->arch_vm.virt_ioapic;
}
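/*
 * Deliver the interrupt programmed in a pin's redirection table entry:
 * decode vector, destination, trigger and delivery mode from the RTE,
 * set Remote IRR for level-triggered pins, and hand the interrupt to
 * vlapic_deliver_intr() for injection.
 */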
static void
vioapic_send_intr(struct vioapic *vioapic, int pin)
{
int vector, delmode;
uint32_t low, high, dest;
bool level, phys;
if (pin < 0 || pin >= vioapic_pincount(vioapic->vm))
pr_err("vioapic_send_intr: invalid pin number %d", pin);
low = vioapic->rtbl[pin].reg;
high = vioapic->rtbl[pin].reg >> 32;
if ((low & IOAPIC_RTE_INTMASK) == IOAPIC_RTE_INTMSET) {
dev_dbg(ACRN_DBG_IOAPIC, "ioapic pin%d: masked", pin);
return;
}
phys = ((low & IOAPIC_RTE_DESTMOD) == IOAPIC_RTE_DESTPHY);
delmode = low & IOAPIC_RTE_DELMOD;
level = low & IOAPIC_RTE_TRGRLVL ? true : false;
if (level)
vioapic->rtbl[pin].reg |= IOAPIC_RTE_REM_IRR;
vector = low & IOAPIC_RTE_INTVEC;
dest = high >> APIC_ID_SHIFT;
vlapic_deliver_intr(vioapic->vm, level, dest, phys, delmode, vector);
}
static void
vioapic_set_pinstate(struct vioapic *vioapic, int pin, bool newstate)
{
int oldcnt, newcnt;
bool needintr;
if (pin < 0 || pin >= vioapic_pincount(vioapic->vm))
pr_err("vioapic_set_pinstate: invalid pin number %d", pin);
oldcnt = vioapic->rtbl[pin].acnt;
if (newstate)
vioapic->rtbl[pin].acnt++;
else
vioapic->rtbl[pin].acnt--;
newcnt = vioapic->rtbl[pin].acnt;
if (newcnt < 0) {
pr_err("ioapic pin%d: bad acnt %d", pin, newcnt);
}
needintr = false;
if (oldcnt == 0 && newcnt == 1) {
needintr = true;
dev_dbg(ACRN_DBG_IOAPIC, "ioapic pin%d: asserted", pin);
} else if (oldcnt == 1 && newcnt == 0) {
dev_dbg(ACRN_DBG_IOAPIC, "ioapic pin%d: deasserted", pin);
} else {
dev_dbg(ACRN_DBG_IOAPIC, "ioapic pin%d: %s, ignored, acnt %d",
pin, pinstate_str(newstate), newcnt);
}
if (needintr)
vioapic_send_intr(vioapic, pin);
}
enum irqstate {
IRQSTATE_ASSERT,
IRQSTATE_DEASSERT,
IRQSTATE_PULSE
};
static int
vioapic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate)
{
struct vioapic *vioapic;
if (irq < 0 || irq >= vioapic_pincount(vm))
return -EINVAL;
vioapic = vm_ioapic(vm);
VIOAPIC_LOCK(vioapic);
switch (irqstate) {
case IRQSTATE_ASSERT:
vioapic_set_pinstate(vioapic, irq, true);
break;
case IRQSTATE_DEASSERT:
vioapic_set_pinstate(vioapic, irq, false);
break;
case IRQSTATE_PULSE:
vioapic_set_pinstate(vioapic, irq, true);
vioapic_set_pinstate(vioapic, irq, false);
break;
default:
panic("vioapic_set_irqstate: invalid irqstate %d", irqstate);
}
VIOAPIC_UNLOCK(vioapic);
return 0;
}
int
vioapic_assert_irq(struct vm *vm, int irq)
{
return vioapic_set_irqstate(vm, irq, IRQSTATE_ASSERT);
}
int
vioapic_deassert_irq(struct vm *vm, int irq)
{
return vioapic_set_irqstate(vm, irq, IRQSTATE_DEASSERT);
}
int
vioapic_pulse_irq(struct vm *vm, int irq)
{
return vioapic_set_irqstate(vm, irq, IRQSTATE_PULSE);
}
/*
* Reset the vlapic's trigger-mode register to reflect the ioapic pin
* configuration.
*/
void
vioapic_update_tmr(struct vcpu *vcpu)
{
struct vioapic *vioapic;
struct vlapic *vlapic;
uint32_t low;
int delmode, pin, vector;
bool level;
vlapic = vcpu->arch_vcpu.vlapic;
vioapic = vm_ioapic(vcpu->vm);
VIOAPIC_LOCK(vioapic);
for (pin = 0; pin < vioapic_pincount(vioapic->vm); pin++) {
low = vioapic->rtbl[pin].reg;
level = low & IOAPIC_RTE_TRGRLVL ? true : false;
/*
* For a level-triggered 'pin' let the vlapic figure out if
* an assertion on this 'pin' would result in an interrupt
* being delivered to it. If yes, then it will modify the
* TMR bit associated with this vector to level-triggered.
*/
delmode = low & IOAPIC_RTE_DELMOD;
vector = low & IOAPIC_RTE_INTVEC;
vlapic_set_tmr_one_vec(vlapic, delmode, vector, level);
}
vlapic_apicv_batch_set_tmr(vlapic);
VIOAPIC_UNLOCK(vioapic);
}
static uint32_t
vioapic_read(struct vioapic *vioapic, uint32_t addr)
{
int regnum, pin, rshift;
regnum = addr & 0xff;
switch (regnum) {
case IOAPIC_ID:
return vioapic->id;
case IOAPIC_VER:
return ((vioapic_pincount(vioapic->vm) - 1) << MAX_RTE_SHIFT)
| 0x11;
case IOAPIC_ARB:
return vioapic->id;
default:
break;
}
/* redirection table entries */
if (regnum >= IOAPIC_REDTBL &&
regnum < IOAPIC_REDTBL + vioapic_pincount(vioapic->vm) * 2) {
pin = (regnum - IOAPIC_REDTBL) / 2;
if ((regnum - IOAPIC_REDTBL) % 2)
rshift = 32;
else
rshift = 0;
return vioapic->rtbl[pin].reg >> rshift;
}
return 0;
}
/*
* version 0x20+ ioapic has EOI register. And cpu could write vector to this
* register to clear related IRR.
*/
static void
vioapic_write_eoi(struct vioapic *vioapic, int32_t vector)
{
struct vm *vm = vioapic->vm;
int pin;
if (vector < VECTOR_FOR_INTR_START || vector > NR_MAX_VECTOR)
pr_err("vioapic_process_eoi: invalid vector %d", vector);
VIOAPIC_LOCK(vioapic);
for (pin = 0; pin < vioapic_pincount(vm); pin++) {
if ((vioapic->rtbl[pin].reg & IOAPIC_RTE_REM_IRR) == 0)
continue;
if ((vioapic->rtbl[pin].reg & IOAPIC_RTE_INTVEC) !=
(uint64_t)vector)
continue;
vioapic->rtbl[pin].reg &= ~IOAPIC_RTE_REM_IRR;
if (vioapic->rtbl[pin].acnt > 0) {
dev_dbg(ACRN_DBG_IOAPIC,
"ioapic pin%d: asserted at eoi, acnt %d",
pin, vioapic->rtbl[pin].acnt);
vioapic_send_intr(vioapic, pin);
}
}
VIOAPIC_UNLOCK(vioapic);
}
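/*
 * Handle a write to the register selected via IOREGSEL. Writes to a
 * redirection table entry preserve the read-only bits, adjust the vpic
 * wire mode when pin 0 is masked/unmasked, request a vlapic TMR update
 * when trigger-mode relevant fields change, may immediately fire a
 * pending level interrupt, and remap any passthrough interrupt bound to
 * the pin.
 */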
static void
vioapic_write(struct vioapic *vioapic, uint32_t addr, uint32_t data)
{
uint64_t data64, mask64;
uint64_t last, new, changed;
int regnum, pin, lshift;
regnum = addr & 0xff;
switch (regnum) {
case IOAPIC_ID:
vioapic->id = data & APIC_ID_MASK;
break;
case IOAPIC_VER:
case IOAPIC_ARB:
/* readonly */
break;
default:
break;
}
/* redirection table entries */
if (regnum >= IOAPIC_REDTBL &&
regnum < IOAPIC_REDTBL + vioapic_pincount(vioapic->vm) * 2) {
pin = (regnum - IOAPIC_REDTBL) / 2;
if ((regnum - IOAPIC_REDTBL) % 2)
lshift = 32;
else
lshift = 0;
last = new = vioapic->rtbl[pin].reg;
data64 = (uint64_t)data << lshift;
mask64 = (uint64_t)0xffffffff << lshift;
new &= ~mask64 | RTBL_RO_BITS;
new |= data64 & ~RTBL_RO_BITS;
changed = last ^ new;
/* pin0 from vpic mask/unmask */
if (pin == 0 && (changed & IOAPIC_RTE_INTMASK)) {
/* mask -> unmask */
if ((last & IOAPIC_RTE_INTMASK) &&
((new & IOAPIC_RTE_INTMASK) == 0)) {
if ((vioapic->vm->vpic_wire_mode
== VPIC_WIRE_NULL) ||
(vioapic->vm->vpic_wire_mode
== VPIC_WIRE_INTR)) {
atomic_set_int(
&vioapic->vm->vpic_wire_mode,
VPIC_WIRE_IOAPIC);
dev_dbg(ACRN_DBG_IOAPIC,
"vpic wire mode -> IOAPIC");
} else {
pr_err("WARNING: invalid vpic wire mode change");
return;
}
/* unmask -> mask */
} else if (((last & IOAPIC_RTE_INTMASK) == 0) &&
(new & IOAPIC_RTE_INTMASK)) {
if (vioapic->vm->vpic_wire_mode
== VPIC_WIRE_IOAPIC) {
atomic_set_int(
&vioapic->vm->vpic_wire_mode,
VPIC_WIRE_INTR);
dev_dbg(ACRN_DBG_IOAPIC,
"vpic wire mode -> INTR");
}
}
}
vioapic->rtbl[pin].reg = new;
dev_dbg(ACRN_DBG_IOAPIC, "ioapic pin%d: redir table entry %#lx",
pin, vioapic->rtbl[pin].reg);
/*
* If any fields in the redirection table entry (except mask
* or polarity) have changed then rendezvous all the vcpus
* to update their vlapic trigger-mode registers.
*/
if (changed & ~(IOAPIC_RTE_INTMASK | IOAPIC_RTE_INTPOL)) {
int i;
struct vcpu *vcpu;
dev_dbg(ACRN_DBG_IOAPIC,
"ioapic pin%d: recalculate vlapic trigger-mode reg",
pin);
VIOAPIC_UNLOCK(vioapic);
foreach_vcpu(i, vioapic->vm, vcpu) {
vcpu_make_request(vcpu, ACRN_REQUEST_TMR_UPDATE);
}
VIOAPIC_LOCK(vioapic);
}
/*
* Generate an interrupt if the following conditions are met:
* - pin is not masked
* - previous interrupt has been EOIed
* - pin level is asserted
*/
if ((vioapic->rtbl[pin].reg & IOAPIC_RTE_INTMASK) ==
IOAPIC_RTE_INTMCLR &&
(vioapic->rtbl[pin].reg & IOAPIC_RTE_REM_IRR) == 0 &&
(vioapic->rtbl[pin].acnt > 0)) {
dev_dbg(ACRN_DBG_IOAPIC,
"ioapic pin%d: asserted at rtbl write, acnt %d",
pin, vioapic->rtbl[pin].acnt);
vioapic_send_intr(vioapic, pin);
}
/* remap on activation: interrupt mask -> unmask
 * remap on deactivation: interrupt masked & vector set to 0
 */
data64 = vioapic->rtbl[pin].reg;
if ((((data64 & IOAPIC_RTE_INTMASK) == IOAPIC_RTE_INTMCLR)
&& ((last & IOAPIC_RTE_INTMASK) == IOAPIC_RTE_INTMSET))
|| (((data64 & IOAPIC_RTE_INTMASK) == IOAPIC_RTE_INTMSET)
&& ((vioapic->rtbl[pin].reg & IOAPIC_RTE_INTVEC) == 0))) {
/* VM enable intr */
struct ptdev_intx_info intx;
/* NOTE: at most 256 pins are supported */
intx.virt_pin = (uint8_t)pin;
intx.vpin_src = PTDEV_VPIN_IOAPIC;
ptdev_intx_pin_remap(vioapic->vm, &intx);
}
}
}
static int
vioapic_mmio_rw(struct vioapic *vioapic, uint64_t gpa,
uint64_t *data, int size, bool doread)
{
uint64_t offset;
offset = gpa - VIOAPIC_BASE;
/*
* The IOAPIC specification allows 32-bit wide accesses to the
* IOREGSEL (offset 0) and IOWIN (offset 16) registers.
*/
if (size != 4 || (offset != IOREGSEL && offset != IOWIN &&
offset != IOEOI)) {
if (doread)
*data = 0;
return 0;
}
VIOAPIC_LOCK(vioapic);
if (offset == IOREGSEL) {
if (doread)
*data = vioapic->ioregsel;
else
vioapic->ioregsel = *data;
} else if (offset == IOEOI) {
/* only need to handle write operation */
if (!doread)
vioapic_write_eoi(vioapic, *data);
} else {
if (doread) {
*data = vioapic_read(vioapic, vioapic->ioregsel);
} else {
vioapic_write(vioapic, vioapic->ioregsel,
*data);
}
}
VIOAPIC_UNLOCK(vioapic);
return 0;
}
int
vioapic_mmio_read(void *vm, uint64_t gpa, uint64_t *rval,
int size)
{
int error;
struct vioapic *vioapic;
vioapic = vm_ioapic(vm);
error = vioapic_mmio_rw(vioapic, gpa, rval, size, true);
return error;
}
int
vioapic_mmio_write(void *vm, uint64_t gpa, uint64_t wval,
int size)
{
int error;
struct vioapic *vioapic;
vioapic = vm_ioapic(vm);
error = vioapic_mmio_rw(vioapic, gpa, &wval, size, false);
return error;
}
void
vioapic_process_eoi(struct vm *vm, int vector)
{
struct vioapic *vioapic;
int pin;
if (vector < VECTOR_FOR_INTR_START || vector > NR_MAX_VECTOR)
pr_err("vioapic_process_eoi: invalid vector %d", vector);
vioapic = vm_ioapic(vm);
dev_dbg(ACRN_DBG_IOAPIC, "ioapic processing eoi for vector %d", vector);
/* notify device to ack if assigned pin */
for (pin = 0; pin < vioapic_pincount(vm); pin++) {
if ((vioapic->rtbl[pin].reg & IOAPIC_RTE_REM_IRR) == 0)
continue;
if ((vioapic->rtbl[pin].reg & IOAPIC_RTE_INTVEC) !=
(uint64_t)vector)
continue;
ptdev_intx_ack(vm, pin, PTDEV_VPIN_IOAPIC);
}
/*
* XXX keep track of the pins associated with this vector instead
* of iterating on every single pin each time.
*/
VIOAPIC_LOCK(vioapic);
for (pin = 0; pin < vioapic_pincount(vm); pin++) {
if ((vioapic->rtbl[pin].reg & IOAPIC_RTE_REM_IRR) == 0)
continue;
if ((vioapic->rtbl[pin].reg & IOAPIC_RTE_INTVEC) !=
(uint64_t)vector)
continue;
vioapic->rtbl[pin].reg &= ~IOAPIC_RTE_REM_IRR;
if (vioapic->rtbl[pin].acnt > 0) {
dev_dbg(ACRN_DBG_IOAPIC,
"ioapic pin%d: asserted at eoi, acnt %d",
pin, vioapic->rtbl[pin].acnt);
vioapic_send_intr(vioapic, pin);
}
}
VIOAPIC_UNLOCK(vioapic);
}
struct vioapic *
vioapic_init(struct vm *vm)
{
int i;
struct vioapic *vioapic;
vioapic = calloc(1, sizeof(struct vioapic));
ASSERT(vioapic != NULL, "");
vioapic->vm = vm;
spinlock_init(&vioapic->mtx);
/* Initialize all redirection entries to mask all interrupts */
for (i = 0; i < vioapic_pincount(vioapic->vm); i++)
vioapic->rtbl[i].reg = 0x0001000000010000UL;
register_mmio_emulation_handler(vm,
vioapic_mmio_access_handler,
(uint64_t)VIOAPIC_BASE,
(uint64_t)VIOAPIC_BASE + VIOAPIC_SIZE,
(void *) 0);
return vioapic;
}
void
vioapic_cleanup(struct vioapic *vioapic)
{
unregister_mmio_emulation_handler(vioapic->vm,
(uint64_t)VIOAPIC_BASE,
(uint64_t)VIOAPIC_BASE + VIOAPIC_SIZE);
free(vioapic);
}
int
vioapic_pincount(struct vm *vm)
{
if (is_vm0(vm))
return REDIR_ENTRIES_HW;
else
return REDIR_ENTRIES_UOS;
}
int vioapic_mmio_access_handler(struct vcpu *vcpu, struct mem_io *mmio,
void *handler_private_data)
{
struct vm *vm = vcpu->vm;
uint64_t gpa = mmio->paddr;
int ret = 0;
(void)handler_private_data;
/* Note all RW to IOAPIC are 32-Bit in size */
ASSERT(mmio->access_size == 4,
"All RW to LAPIC must be 32-bits in size");
if (mmio->read_write == HV_MEM_IO_READ) {
ret = vioapic_mmio_read(vm,
gpa,
&mmio->value,
mmio->access_size);
mmio->mmio_status = MMIO_TRANS_VALID;
} else if (mmio->read_write == HV_MEM_IO_WRITE) {
ret = vioapic_mmio_write(vm,
gpa,
mmio->value,
mmio->access_size);
mmio->mmio_status = MMIO_TRANS_VALID;
}
return ret;
}
bool vioapic_get_rte(struct vm *vm, int pin, void *rte)
{
struct vioapic *vioapic;
vioapic = vm_ioapic(vm);
if (vioapic && rte) {
*(uint64_t *)rte = vioapic->rtbl[pin].reg;
return true;
} else
return false;
}
int get_vioapic_info(char *str, int str_max, int vmid)
{
int pin, len, size = str_max, vector, delmode;
uint64_t rte;
uint32_t low, high, dest;
bool level, phys, remote_irr, mask;
struct vm *vm = get_vm_from_vmid(vmid);
if (!vm) {
len = snprintf(str, size,
"\r\nvm is not exist for vmid %d", vmid);
size -= len;
str += len;
goto END;
}
len = snprintf(str, size,
"\r\nPIN\tVEC\tDM\tDEST\tTM\tDELM\tIRR\tMASK");
size -= len;
str += len;
for (pin = 0 ; pin < vioapic_pincount(vm); pin++) {
vioapic_get_rte(vm, pin, (void *)&rte);
low = rte;
high = rte >> 32;
mask = ((low & IOAPIC_RTE_INTMASK) == IOAPIC_RTE_INTMSET);
remote_irr = ((low & IOAPIC_RTE_REM_IRR) == IOAPIC_RTE_REM_IRR);
phys = ((low & IOAPIC_RTE_DESTMOD) == IOAPIC_RTE_DESTPHY);
delmode = low & IOAPIC_RTE_DELMOD;
level = low & IOAPIC_RTE_TRGRLVL ? true : false;
vector = low & IOAPIC_RTE_INTVEC;
dest = high >> APIC_ID_SHIFT;
len = snprintf(str, size,
"\r\n%d\t0x%X\t%s\t0x%X\t%s\t%d\t%d\t%d",
pin, vector, phys ? "phys" : "logic",
dest, level ? "level" : "edge",
delmode >> 8, remote_irr, mask);
size -= len;
str += len;
}
END:
snprintf(str, size, "\r\n");
return 0;
}

File diff suppressed because it is too large


@@ -0,0 +1,153 @@
/*-
* Copyright (c) 2013 Neel Natu <neel@freebsd.org>
* Copyright (c) 2017 Intel Corporation
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VLAPIC_PRIV_H_
#define _VLAPIC_PRIV_H_
/*
* APIC Register: Offset Description
*/
#define APIC_OFFSET_ID 0x20 /* Local APIC ID */
#define APIC_OFFSET_VER 0x30 /* Local APIC Version */
#define APIC_OFFSET_TPR 0x80 /* Task Priority Register */
#define APIC_OFFSET_APR 0x90 /* Arbitration Priority */
#define APIC_OFFSET_PPR 0xA0 /* Processor Priority Register */
#define APIC_OFFSET_EOI 0xB0 /* EOI Register */
#define APIC_OFFSET_RRR 0xC0 /* Remote read */
#define APIC_OFFSET_LDR 0xD0 /* Logical Destination */
#define APIC_OFFSET_DFR 0xE0 /* Destination Format Register */
#define APIC_OFFSET_SVR 0xF0 /* Spurious Vector Register */
#define APIC_OFFSET_ISR0 0x100 /* In Service Register */
#define APIC_OFFSET_ISR1 0x110
#define APIC_OFFSET_ISR2 0x120
#define APIC_OFFSET_ISR3 0x130
#define APIC_OFFSET_ISR4 0x140
#define APIC_OFFSET_ISR5 0x150
#define APIC_OFFSET_ISR6 0x160
#define APIC_OFFSET_ISR7 0x170
#define APIC_OFFSET_TMR0 0x180 /* Trigger Mode Register */
#define APIC_OFFSET_TMR1 0x190
#define APIC_OFFSET_TMR2 0x1A0
#define APIC_OFFSET_TMR3 0x1B0
#define APIC_OFFSET_TMR4 0x1C0
#define APIC_OFFSET_TMR5 0x1D0
#define APIC_OFFSET_TMR6 0x1E0
#define APIC_OFFSET_TMR7 0x1F0
#define APIC_OFFSET_IRR0 0x200 /* Interrupt Request Register */
#define APIC_OFFSET_IRR1 0x210
#define APIC_OFFSET_IRR2 0x220
#define APIC_OFFSET_IRR3 0x230
#define APIC_OFFSET_IRR4 0x240
#define APIC_OFFSET_IRR5 0x250
#define APIC_OFFSET_IRR6 0x260
#define APIC_OFFSET_IRR7 0x270
#define APIC_OFFSET_ESR 0x280 /* Error Status Register */
#define APIC_OFFSET_CMCI_LVT 0x2F0 /* Local Vector Table (CMCI) */
#define APIC_OFFSET_ICR_LOW 0x300 /* Interrupt Command Register */
#define APIC_OFFSET_ICR_HI 0x310
#define APIC_OFFSET_TIMER_LVT 0x320 /* Local Vector Table (Timer) */
#define APIC_OFFSET_THERM_LVT 0x330 /* Local Vector Table (Thermal) */
#define APIC_OFFSET_PERF_LVT 0x340 /* Local Vector Table (PMC) */
#define APIC_OFFSET_LINT0_LVT 0x350 /* Local Vector Table (LINT0) */
#define APIC_OFFSET_LINT1_LVT 0x360 /* Local Vector Table (LINT1) */
#define APIC_OFFSET_ERROR_LVT 0x370 /* Local Vector Table (ERROR) */
#define APIC_OFFSET_TIMER_ICR 0x380 /* Timer's Initial Count */
#define APIC_OFFSET_TIMER_CCR 0x390 /* Timer's Current Count */
#define APIC_OFFSET_TIMER_DCR 0x3E0 /* Timer's Divide Configuration */
#define APIC_OFFSET_SELF_IPI 0x3F0 /* Self IPI register */
/*
* 16 priority levels with at most one vector injected per level.
*/
#define ISRVEC_STK_SIZE (16 + 1)
#define VLAPIC_MAXLVT_INDEX APIC_LVT_CMCI
struct vlapic;
struct pir_desc {
uint64_t pir[4];
uint64_t pending;
uint64_t unused[3];
} __aligned(64);
struct vlapic_ops {
int (*apicv_set_intr_ready)
(struct vlapic *vlapic, int vector, bool level);
int (*apicv_pending_intr)(struct vlapic *vlapic, int *vecptr);
void (*apicv_intr_accepted)(struct vlapic *vlapic, int vector);
void (*apicv_post_intr)(struct vlapic *vlapic, int hostcpu);
void (*apicv_set_tmr)(struct vlapic *vlapic, int vector, bool level);
void (*apicv_batch_set_tmr)(struct vlapic *vlapic);
void (*enable_x2apic_mode)(struct vlapic *vlapic);
};
struct vlapic {
struct vm *vm;
struct vcpu *vcpu;
struct lapic *apic_page;
struct pir_desc *pir_desc;
struct vlapic_ops ops;
uint32_t esr_pending;
int esr_firing;
struct callout callout; /* vlapic timer */
struct bintime timer_fire_bt; /* callout expiry time */
struct bintime timer_freq_bt; /* timer frequency */
struct bintime timer_period_bt; /* timer period */
long last_timer; /* the last timer id */
spinlock_t timer_mtx;
/*
* The 'isrvec_stk' is a stack of vectors injected by the local apic.
* A vector is popped from the stack when the processor does an EOI.
* The vector on the top of the stack is used to compute the
* Processor Priority in conjunction with the TPR.
*/
uint8_t isrvec_stk[ISRVEC_STK_SIZE];
int isrvec_stk_top;
uint64_t msr_apicbase;
/*
* Copies of some registers in the virtual APIC page. We do this for
* a couple of different reasons:
* - to be able to detect what changed (e.g. svr_last)
* - to maintain a coherent snapshot of the register (e.g. lvt_last)
*/
uint32_t svr_last;
uint32_t lvt_last[VLAPIC_MAXLVT_INDEX + 1];
struct pir_desc pir;
};
void vlapic_cleanup(struct vlapic *vlapic);
#endif /* _VLAPIC_PRIV_H_ */


@@ -0,0 +1,324 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <hv_lib.h>
#include <acrn_common.h>
#include <hv_arch.h>
#include <bsp_extern.h>
#include <hv_debug.h>
/* Local variables */
/* VMs list */
struct list_head vm_list = {
.next = &vm_list,
.prev = &vm_list,
};
/* Lock for VMs list */
spinlock_t vm_list_lock = {
.head = 0,
.tail = 0
};
/* Used for vmid allocation; a 64-bit bitmap means the maximum number of VMs is 64 */
static unsigned long vmid_bitmap;
static void init_vm(struct vm_description *vm_desc,
struct vm *vm_handle)
{
/* Populate VM attributes from VM description */
vm_handle->hw.num_vcpus = vm_desc->vm_hw_num_cores;
vm_handle->state_info.privilege = vm_desc->vm_state_info_privilege;
vm_handle->state_info.boot_count = 0;
}
/* return a pointer to the virtual machine structure associated with
* this VM ID
*/
struct vm *get_vm_from_vmid(int vm_id)
{
struct vm *vm = NULL;
struct list_head *pos;
spinlock_obtain(&vm_list_lock);
list_for_each(pos, &vm_list) {
vm = list_entry(pos, struct vm, list);
if (vm->attr.id == vm_id) {
spinlock_release(&vm_list_lock);
return vm;
}
}
spinlock_release(&vm_list_lock);
return NULL;
}
int create_vm(struct vm_description *vm_desc, struct vm **rtn_vm)
{
unsigned int id;
struct vm *vm;
int status = 0;
if ((vm_desc == NULL) || (rtn_vm == NULL))
status = -EINVAL;
if (status == 0) {
/* Allocate memory for virtual machine */
vm = calloc(1, sizeof(struct vm));
ASSERT(vm != NULL, "vm allocation failed");
/*
* Map Virtual Machine to its VM Description
*/
init_vm(vm_desc, vm);
/* Init mmio list */
INIT_LIST_HEAD(&vm->mmio_list);
if (vm->hw.num_vcpus == 0)
vm->hw.num_vcpus = phy_cpu_num;
vm->hw.vcpu_array =
calloc(1, sizeof(struct vcpu *) * vm->hw.num_vcpus);
ASSERT(vm->hw.vcpu_array != NULL,
"vcpu_array allocation failed");
for (id = 0; id < sizeof(long) * 8; id++)
if (bitmap_test_and_set(id, &vmid_bitmap) == 0)
break;
vm->attr.id = vm->attr.boot_idx = id;
snprintf(&vm->attr.name[0], MAX_VM_NAME_LEN, "vm_%d",
vm->attr.id);
atomic_store_rel_int(&vm->hw.created_vcpus, 0);
/* gpa_lowtop are used for system start up */
vm->hw.gpa_lowtop = 0;
/* For SOS: configure the VM software information here */
/* For UOS: the VM software information is configured in the DM */
if (is_vm0(vm)) {
prepare_vm0_memmap_and_e820(vm);
#ifndef CONFIG_EFI_STUB
status = init_vm0_boot_info(vm);
#endif
} else {
/* populate UOS vm fields according to vm_desc */
vm->secure_world_enabled =
vm_desc->secure_world_enabled;
memcpy_s(&vm->GUID[0], sizeof(vm->GUID),
&vm_desc->GUID[0],
sizeof(vm_desc->GUID));
}
INIT_LIST_HEAD(&vm->list);
spinlock_obtain(&vm_list_lock);
list_add(&vm->list, &vm_list);
spinlock_release(&vm_list_lock);
/* Ensure VM software information obtained */
if (status == 0) {
/* Set up IO bit-mask such that VM exit occurs on
* selected IO ranges
*/
setup_io_bitmap(vm);
/* Create virtual uart */
if (is_vm0(vm))
vm->vuart = vuart_init(vm);
vm->vpic = vpic_init(vm);
/* vpic wire_mode default is INTR */
vm->vpic_wire_mode = VPIC_WIRE_INTR;
/* Allocate full emulated vIOAPIC instance */
vm->arch_vm.virt_ioapic = vioapic_init(vm);
/* Populate return VM handle */
*rtn_vm = vm;
ptdev_vm_init(vm);
vm->sw.req_buf = 0;
vm->state = VM_CREATED;
}
}
/* Return status to caller */
return status;
}
int shutdown_vm(struct vm *vm)
{
int i, status = 0;
struct vcpu *vcpu = NULL;
if (vm == NULL)
return -EINVAL;
pause_vm(vm);
/* Only allow shutting down a paused VM */
if (vm->state != VM_PAUSED)
return -EINVAL;
foreach_vcpu(i, vm, vcpu) {
reset_vcpu(vcpu);
destroy_vcpu(vcpu);
}
spinlock_obtain(&vm_list_lock);
list_del_init(&vm->list);
spinlock_release(&vm_list_lock);
ptdev_vm_deinit(vm);
/* cleanup and free vioapic */
vioapic_cleanup(vm->arch_vm.virt_ioapic);
/* Free EPT allocated resources assigned to VM */
destroy_ept(vm);
/* Free MSR bitmap */
free(vm->arch_vm.msr_bitmap);
/* TODO: De-initialize I/O Emulation */
free_io_emulation_resource(vm);
/* Free iommu_domain */
if (vm->iommu_domain)
destroy_iommu_domain(vm->iommu_domain);
bitmap_clr(vm->attr.id, &vmid_bitmap);
if (vm->vpic)
vpic_cleanup(vm);
free(vm->hw.vcpu_array);
/* TODO: De-Configure HV-SW */
/* Deallocate VM */
free(vm);
/* Return status to caller */
return status;
}
int start_vm(struct vm *vm)
{
struct vcpu *vcpu = NULL;
vm->state = VM_STARTED;
/* Only start BSP (vid = 0) and let BSP start other APs */
vcpu = vcpu_from_vid(vm, 0);
ASSERT(vcpu != NULL, "vm%d, vcpu0", vm->attr.id);
schedule_vcpu(vcpu);
return 0;
}
/*
 * The DM only pauses a VM for shutdown/reboot. If the DM ever needs to
 * pause a VM for other purposes, this API should be extended.
 */
int pause_vm(struct vm *vm)
{
int i;
struct vcpu *vcpu = NULL;
if (vm->state == VM_PAUSED)
return 0;
vm->state = VM_PAUSED;
foreach_vcpu(i, vm, vcpu)
pause_vcpu(vcpu, VCPU_ZOMBIE);
return 0;
}
int vm_resume(struct vm *vm)
{
int i;
struct vcpu *vcpu = NULL;
foreach_vcpu(i, vm, vcpu)
resume_vcpu(vcpu);
vm->state = VM_STARTED;
return 0;
}
/* Eventually this array will be removed and only the vm0 description maintained */
struct vm_description *get_vm_desc(int idx)
{
struct vm_description_array *vm_desc_array;
/* Obtain base of user defined VM description array data
* structure
*/
vm_desc_array = (struct vm_description_array *)get_vm_desc_base();
/* Return the requested entry only if the index is within bounds */
if (idx >= vm_desc_array->num_vm_desc)
return NULL;
else
return &vm_desc_array->vm_desc_array[idx];
}
/* Create vm/vcpu for vm0 */
int prepare_vm0(void)
{
int i, ret;
struct vm *vm = NULL;
struct vm_description *vm_desc = NULL;
vm_desc = get_vm_desc(0);
ASSERT(vm_desc, "get vm desc failed");
ret = create_vm(vm_desc, &vm);
ASSERT(ret == 0, "VM creation failed!");
prepare_vcpu(vm, vm_desc->vm_hw_logical_core_ids[0]);
/* Prepare the AP for vm0 */
for (i = 1; i < vm_desc->vm_hw_num_cores; i++)
prepare_vcpu(vm, vm_desc->vm_hw_logical_core_ids[i]);
/* start vm0 BSP automatically */
start_vm(vm);
pr_fatal("Start VM0");
return 0;
}


@@ -0,0 +1,148 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <hv_lib.h>
#include <acrn_common.h>
#include <hv_arch.h>
#include <hv_debug.h>
#include <acrn_hv_defs.h>
#include <hypercall.h>
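/*
 * Hypercall calling convention, as consumed by vmcall_handler() below
 * (derived from the register usage in this file):
 *   R8  - hypercall ID
 *   RDI - first parameter
 *   RSI - second parameter
 *   RDX - third parameter (reserved for now)
 *   RCX - fourth parameter (reserved for now)
 *   RAX - return value written back by the handler
 * A guest loads these registers and executes VMCALL; the resulting
 * VM exit is dispatched here based on the ID in R8.
 */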
int vmcall_handler(struct vcpu *vcpu)
{
int64_t ret = 0;
struct vm *vm = vcpu->vm;
struct run_context *cur_context =
&vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context];
/* hypercall ID from guest*/
uint64_t hypcall_id = cur_context->guest_cpu_regs.regs.r8;
/* hypercall param1 from guest*/
uint64_t param1 = cur_context->guest_cpu_regs.regs.rdi;
/* hypercall param2 from guest*/
uint64_t param2 = cur_context->guest_cpu_regs.regs.rsi;
/* hypercall param3 from guest, reserved*/
/* uint64_t param3 = cur_context->guest_cpu_regs.regs.rdx; */
/* hypercall param4 from guest, reserved*/
/* uint64_t param4 = cur_context->guest_cpu_regs.regs.rcx; */
/* Dispatch the hypercall handler */
switch (hypcall_id) {
case HC_GET_API_VERSION:
ret = hcall_get_api_version(vm, param1);
break;
case HC_CREATE_VM:
ret = hcall_create_vm(vm, param1);
break;
case HC_DESTROY_VM:
ret = hcall_destroy_vm(param1);
break;
case HC_START_VM:
ret = hcall_resume_vm(param1);
break;
case HC_PAUSE_VM:
ret = hcall_pause_vm(param1);
break;
case HC_CREATE_VCPU:
ret = hcall_create_vcpu(vm, param1, param2);
break;
case HC_ASSERT_IRQLINE:
ret = hcall_assert_irqline(vm, param1, param2);
break;
case HC_DEASSERT_IRQLINE:
ret = hcall_deassert_irqline(vm, param1, param2);
break;
case HC_PULSE_IRQLINE:
ret = hcall_pulse_irqline(vm, param1, param2);
break;
case HC_INJECT_MSI:
ret = hcall_inject_msi(vm, param1, param2);
break;
case HC_SET_IOREQ_BUFFER:
ret = hcall_set_ioreq_buffer(vm, param1, param2);
break;
case HC_NOTIFY_REQUEST_FINISH:
ret = hcall_notify_req_finish(param1, param2);
break;
case HC_VM_SET_MEMMAP:
ret = hcall_set_vm_memmap(vm, param1, param2);
break;
case HC_VM_PCI_MSIX_REMAP:
ret = hcall_remap_pci_msix(vm, param1, param2);
break;
case HC_VM_GPA2HPA:
ret = hcall_gpa_to_hpa(vm, param1, param2);
break;
case HC_ASSIGN_PTDEV:
ret = hcall_assign_ptdev(vm, param1, param2);
break;
case HC_DEASSIGN_PTDEV:
ret = hcall_deassign_ptdev(vm, param1, param2);
break;
case HC_SET_PTDEV_INTR_INFO:
ret = hcall_set_ptdev_intr_info(vm, param1, param2);
break;
case HC_RESET_PTDEV_INTR_INFO:
ret = hcall_reset_ptdev_intr_info(vm, param1, param2);
break;
case HC_SETUP_SBUF:
ret = hcall_setup_sbuf(vm, param1);
break;
default:
pr_err("op %d: Invalid hypercall\n", hypcall_id);
ret = -1;
break;
}
cur_context->guest_cpu_regs.regs.rax = ret;
TRACE_2L(TRC_VMEXIT_VMCALL, vm->attr.id, hypcall_id);
return 0;
}


@@ -0,0 +1,321 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <hv_lib.h>
#include <acrn_common.h>
#include <hv_arch.h>
#include <hv_debug.h>
/* MSRs that need to be emulated; keep this array roughly ordered by access frequency */
static const uint32_t emulated_msrs[] = {
MSR_IA32_TSC_DEADLINE, /* Enable TSC_DEADLINE VMEXIT */
/* following MSR not emulated now */
/*
* MSR_IA32_APIC_BASE,
* MSR_IA32_SYSENTER_CS,
* MSR_IA32_SYSENTER_ESP,
* MSR_IA32_SYSENTER_EIP,
* MSR_IA32_TSC_AUX,
* MSR_IA32_TIME_STAMP_COUNTER,
*/
};
/* These indices must match the emulated_msrs[] array above */
enum {
IDX_TSC_DEADLINE,
IDX_MAX_MSR
};
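/*
 * The 4KB MSR bitmap is split into four 1KB regions (Intel SDM 24.6.9):
 *   offset 0x000: read bitmap for low MSRs (0x00000000 - 0x00001FFF)
 *   offset 0x400: read bitmap for high MSRs (0xC0000000 - 0xC0001FFF)
 *   offset 0x800: write bitmap for low MSRs
 *   offset 0xC00: write bitmap for high MSRs
 * A bit set to 1 causes RDMSR/WRMSR of the corresponding MSR to VM exit.
 */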
static void enable_msr_interception(uint8_t *bitmap, uint32_t msr)
{
uint8_t *read_map;
uint8_t *write_map;
uint8_t value;
/* low MSR range: 0x00000000 - 0x00001FFF */
if (msr <= 0x1FFF) {
read_map = bitmap;
write_map = bitmap + 2048;
} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
read_map = bitmap + 1024;
write_map = bitmap + 3072;
} else {
pr_err("Invalid MSR");
return;
}
msr &= 0x1FFF;
value = read_map[(msr>>3)];
value |= 1<<(msr%8);
/* right now we trap for both r/w */
read_map[(msr>>3)] = value;
write_map[(msr>>3)] = value;
}
/* Not used yet; kept as an API for cases that may need to stop intercepting an MSR */
void disable_msr_interception(uint8_t *bitmap, uint32_t msr)
{
uint8_t *read_map;
uint8_t *write_map;
uint8_t value;
/* low MSR range: 0x00000000 - 0x00001FFF */
if (msr <= 0x1FFF) {
read_map = bitmap;
write_map = bitmap + 2048;
} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
read_map = bitmap + 1024;
write_map = bitmap + 3072;
} else {
pr_err("Invalid MSR");
return;
}
msr &= 0x1FFF;
value = read_map[(msr>>3)];
value &= ~(1<<(msr%8));
/* right now we trap for both r/w */
read_map[(msr>>3)] = value;
write_map[(msr>>3)] = value;
}
void init_msr_emulation(struct vcpu *vcpu)
{
uint32_t i = 0;
uint32_t msrs_count = ARRAY_SIZE(emulated_msrs);
void *msr_bitmap;
uint64_t value64;
ASSERT(msrs_count == IDX_MAX_MSR,
"MSR ID should be matched with emulated_msrs");
/* The MSR bitmap is allocated and initialized once and shared by all vCPUs of the VM */
if (is_vcpu_bsp(vcpu)) {
/* Allocate and initialize memory for MSR bitmap region*/
vcpu->vm->arch_vm.msr_bitmap = alloc_page();
ASSERT(vcpu->vm->arch_vm.msr_bitmap, "");
memset(vcpu->vm->arch_vm.msr_bitmap, 0x0, CPU_PAGE_SIZE);
msr_bitmap = vcpu->vm->arch_vm.msr_bitmap;
for (i = 0; i < msrs_count; i++)
enable_msr_interception(msr_bitmap, emulated_msrs[i]);
/* The MSRs below are protected from the guest OS; any access injects #GP */
enable_msr_interception(msr_bitmap, MSR_IA32_MTRR_CAP);
enable_msr_interception(msr_bitmap, MSR_IA32_MTRR_DEF_TYPE);
for (i = MSR_IA32_MTRR_PHYSBASE_0;
i <= MSR_IA32_MTRR_PHYSMASK_9; i++) {
enable_msr_interception(msr_bitmap, i);
}
enable_msr_interception(msr_bitmap, MSR_IA32_MTRR_FIX64K_00000);
enable_msr_interception(msr_bitmap, MSR_IA32_MTRR_FIX16K_80000);
enable_msr_interception(msr_bitmap, MSR_IA32_MTRR_FIX16K_A0000);
for (i = MSR_IA32_MTRR_FIX4K_C0000;
i <= MSR_IA32_MTRR_FIX4K_F8000; i++) {
enable_msr_interception(msr_bitmap, i);
}
}
/* Set up MSR bitmap - pg 2904 24.6.9 */
value64 = (int64_t) vcpu->vm->arch_vm.msr_bitmap;
exec_vmwrite64(VMX_MSR_BITMAP_FULL, value64);
pr_dbg("VMX_MSR_BITMAP: 0x%016llx ", value64);
vcpu->guest_msrs = (uint64_t *)calloc(msrs_count, sizeof(uint64_t));
ASSERT(vcpu->guest_msrs != NULL, "");
memset(vcpu->guest_msrs, 0, msrs_count * sizeof(uint64_t));
}
int rdmsr_handler(struct vcpu *vcpu)
{
uint32_t msr;
uint64_t v = 0;
uint32_t id;
int cur_context = vcpu->arch_vcpu.cur_context;
/* Read the msr value */
msr = vcpu->arch_vcpu.contexts[cur_context].guest_cpu_regs.regs.rcx;
/* Do the required processing for each msr case */
switch (msr) {
case MSR_IA32_TSC_DEADLINE:
{
v = vcpu->guest_msrs[IDX_TSC_DEADLINE];
break;
}
case MSR_IA32_MTRR_CAP:
case MSR_IA32_MTRR_DEF_TYPE:
case MSR_IA32_MTRR_PHYSBASE_0 ... MSR_IA32_MTRR_PHYSMASK_9:
case MSR_IA32_MTRR_FIX64K_00000 ... MSR_IA32_MTRR_FIX4K_F8000:
{
vcpu_inject_gp(vcpu);
break;
}
/* The following MSRs are not fully emulated yet; handling is left for the future */
case MSR_IA32_SYSENTER_CS:
{
v = exec_vmread(VMX_GUEST_IA32_SYSENTER_CS);
break;
}
case MSR_IA32_SYSENTER_ESP:
{
v = exec_vmread(VMX_GUEST_IA32_SYSENTER_ESP);
break;
}
case MSR_IA32_SYSENTER_EIP:
{
v = exec_vmread(VMX_GUEST_IA32_SYSENTER_EIP);
break;
}
case MSR_IA32_TSC_AUX:
{
v = vcpu->arch_vcpu.msr_tsc_aux;
break;
}
case MSR_IA32_TIME_STAMP_COUNTER:
{
/* Read the host TSC value */
CPU_RDTSCP_EXECUTE(&v, &id);
/* Add the TSC_offset to host TSC and return the value */
v += exec_vmread64(VMX_TSC_OFFSET_FULL);
break;
}
case MSR_IA32_APIC_BASE:
{
bool ret;
/* Read APIC base */
vlapic_rdmsr(vcpu, msr, &v, &ret);
break;
}
default:
{
pr_warn("rdmsr: %lx should not come here!", msr);
v = 0;
break;
}
}
/* Store the MSR contents in RAX and RDX */
vcpu->arch_vcpu.contexts[cur_context].guest_cpu_regs.regs.rax =
v & 0xffffffff;
vcpu->arch_vcpu.contexts[cur_context].guest_cpu_regs.regs.rdx = v >> 32;
TRACE_2L(TRC_VMEXIT_RDMSR, msr, v);
return 0;
}
int wrmsr_handler(struct vcpu *vcpu)
{
uint32_t msr;
uint64_t v;
struct run_context *cur_context =
&vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context];
/* Read the MSR ID */
msr = cur_context->guest_cpu_regs.regs.rcx;
/* Get the MSR contents */
v = (((uint64_t) cur_context->guest_cpu_regs.regs.rdx) << 32) |
((uint64_t) cur_context->guest_cpu_regs.regs.rax);
/* Do the required processing for each msr case */
switch (msr) {
case MSR_IA32_TSC_DEADLINE:
{
bool ret;
/* Forward the TSC deadline write to the vLAPIC timer emulation */
vlapic_wrmsr(vcpu, msr, v, &ret);
vcpu->guest_msrs[IDX_TSC_DEADLINE] = v;
break;
}
case MSR_IA32_MTRR_CAP:
case MSR_IA32_MTRR_DEF_TYPE:
case MSR_IA32_MTRR_PHYSBASE_0 ... MSR_IA32_MTRR_PHYSMASK_9:
case MSR_IA32_MTRR_FIX64K_00000 ... MSR_IA32_MTRR_FIX4K_F8000:
{
vcpu_inject_gp(vcpu);
break;
}
/* The following MSRs are not fully emulated yet; handling is left for the future */
case MSR_IA32_SYSENTER_CS:
{
exec_vmwrite(VMX_GUEST_IA32_SYSENTER_CS, v);
break;
}
case MSR_IA32_SYSENTER_ESP:
{
exec_vmwrite(VMX_GUEST_IA32_SYSENTER_ESP, v);
break;
}
case MSR_IA32_SYSENTER_EIP:
{
exec_vmwrite(VMX_GUEST_IA32_SYSENTER_EIP, v);
break;
}
case MSR_IA32_GS_BASE:
{
exec_vmwrite(VMX_GUEST_GS_BASE, v);
break;
}
case MSR_IA32_TSC_AUX:
{
vcpu->arch_vcpu.msr_tsc_aux = v;
break;
}
case MSR_IA32_APIC_BASE:
{
bool ret;
/* Write APIC base */
vlapic_wrmsr(vcpu, msr, v, &ret);
break;
}
default:
{
ASSERT(0, "wrmsr: %lx should not come here!", msr);
msr_write(msr, v);
break;
}
}
TRACE_2L(TRC_VMEXIT_WRMSR, msr, v);
return 0;
}


@@ -0,0 +1,950 @@
/*-
* Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
* Copyright (c) 2017 Intel Corporation
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#define pr_fmt(fmt) "vpic: " fmt
#include <hypervisor.h>
#include <hv_lib.h>
#include <acrn_common.h>
#include <hv_arch.h>
#include <hv_debug.h>
#define VPIC_LOCK_INIT(vpic) spinlock_init(&((vpic)->lock))
#define VPIC_LOCK(vpic) spinlock_obtain(&((vpic)->lock))
#define VPIC_UNLOCK(vpic) spinlock_release(&((vpic)->lock))
/* TODO: add spinlock_locked support? */
/*#define VPIC_LOCKED(vpic) spinlock_locked(&((vpic)->lock))*/
#define vm_pic(vm) (vm->vpic)
#define true 1
#define false 0
#define ACRN_DBG_PIC 6
enum irqstate {
IRQSTATE_ASSERT,
IRQSTATE_DEASSERT,
IRQSTATE_PULSE
};
struct pic {
bool ready;
int icw_num;
int rd_cmd_reg;
bool aeoi;
bool poll;
bool rotate;
bool sfn; /* special fully-nested mode */
int irq_base;
uint8_t request; /* Interrupt Request Register (IRR) */
uint8_t service; /* Interrupt Service (ISR) */
uint8_t mask; /* Interrupt Mask Register (IMR) */
uint8_t smm; /* special mask mode */
int acnt[8]; /* sum of pin asserts and deasserts */
int lowprio; /* lowest priority irq */
bool intr_raised;
uint8_t elc;
};
struct vpic {
struct vm *vm;
spinlock_t lock;
struct pic pic[2];
};
/*
* Loop over all the pins in priority order from highest to lowest.
*/
#define PIC_PIN_FOREACH(pinvar, pic, tmpvar) \
for (tmpvar = 0, pinvar = (pic->lowprio + 1) & 0x7; \
tmpvar < 8; \
tmpvar++, pinvar = (pinvar + 1) & 0x7)
static void vpic_set_pinstate(struct vpic *vpic, int pin, bool newstate);
static inline bool master_pic(struct vpic *vpic, struct pic *pic)
{
if (pic == &vpic->pic[0])
return true;
else
return false;
}
static inline int vpic_get_highest_isrpin(struct pic *pic)
{
int bit, pin;
int i;
PIC_PIN_FOREACH(pin, pic, i) {
bit = (1 << pin);
if (pic->service & bit) {
/*
* An IS bit that is masked by an IMR bit will not be
* cleared by a non-specific EOI in Special Mask Mode.
*/
if (pic->smm && (pic->mask & bit) != 0)
continue;
else
return pin;
}
}
return -1;
}
static inline int vpic_get_highest_irrpin(struct pic *pic)
{
int serviced;
int bit, pin, tmp;
/*
* In 'Special Fully-Nested Mode' when an interrupt request from
* a slave is in service, the slave is not locked out from the
* master's priority logic.
*/
serviced = pic->service;
if (pic->sfn)
serviced &= ~(1 << 2);
/*
* In 'Special Mask Mode', when a mask bit is set in OCW1 it inhibits
* further interrupts at that level and enables interrupts from all
* other levels that are not masked. In other words the ISR has no
* bearing on the levels that can generate interrupts.
*/
if (pic->smm)
serviced = 0;
PIC_PIN_FOREACH(pin, pic, tmp) {
bit = 1 << pin;
/*
* If there is already an interrupt in service at the same
* or higher priority then bail.
*/
if ((serviced & bit) != 0)
break;
/*
* If an interrupt is asserted and not masked then return
* the corresponding 'pin' to the caller.
*/
if ((pic->request & bit) != 0 && (pic->mask & bit) == 0)
return pin;
}
return -1;
}
static void vpic_notify_intr(struct vpic *vpic)
{
struct pic *pic;
int pin;
/*
* First check the slave.
*/
pic = &vpic->pic[1];
pin = vpic_get_highest_irrpin(pic);
if (!pic->intr_raised && pin != -1) {
dev_dbg(ACRN_DBG_PIC,
"pic slave notify pin = %d (imr 0x%x irr 0x%x isr 0x%x)\n",
pin, pic->mask, pic->request, pic->service);
/*
* Cascade the request from the slave to the master.
*/
pic->intr_raised = true;
vpic_set_pinstate(vpic, 2, true);
vpic_set_pinstate(vpic, 2, false);
} else {
dev_dbg(ACRN_DBG_PIC,
"pic slave no eligible interrupt (imr 0x%x irr 0x%x isr 0x%x)",
pic->mask, pic->request, pic->service);
}
/*
* Then check the master.
*/
pic = &vpic->pic[0];
pin = vpic_get_highest_irrpin(pic);
if (!pic->intr_raised && pin != -1) {
dev_dbg(ACRN_DBG_PIC,
"pic master notify pin = %d (imr 0x%x irr 0x%x isr 0x%x)\n",
pin, pic->mask, pic->request, pic->service);
/*
* From Section 3.6.2, "Interrupt Modes", in the
* MPtable Specification, Version 1.4
*
* PIC interrupts are routed to both the Local APIC
* and the I/O APIC to support operation in 1 of 3
* modes.
*
* 1. Legacy PIC Mode: the PIC effectively bypasses
* all APIC components. In this mode the local APIC is
* disabled and LINT0 is reconfigured as INTR to
* deliver the PIC interrupt directly to the CPU.
*
* 2. Virtual Wire Mode: the APIC is treated as a
* virtual wire which delivers interrupts from the PIC
* to the CPU. In this mode LINT0 is programmed as
* ExtINT to indicate that the PIC is the source of
* the interrupt.
*
* 3. Virtual Wire Mode via I/O APIC: PIC interrupts are
* fielded by the I/O APIC and delivered to the appropriate
* CPU. In this mode the I/O APIC input 0 is programmed
* as ExtINT to indicate that the PIC is the source of the
* interrupt.
*/
pic->intr_raised = true;
if (vpic->vm->vpic_wire_mode == VPIC_WIRE_INTR) {
struct vcpu *vcpu = vcpu_from_vid(vpic->vm, 0);
ASSERT(vcpu != NULL, "vm%d, vcpu0", vpic->vm->attr.id);
vcpu_inject_extint(vcpu);
} else {
vlapic_set_local_intr(vpic->vm, -1, APIC_LVT_LINT0);
/* notify vioapic pin0 if existing
* For vPIC + vIOAPIC mode, vpic master irq connected
* to vioapic pin0 (irq2)
* From MPSpec session 5.1
*/
vioapic_pulse_irq(vpic->vm, 0);
}
} else {
dev_dbg(ACRN_DBG_PIC,
"pic master no eligible interrupt (imr 0x%x irr 0x%x isr 0x%x)",
pic->mask, pic->request, pic->service);
}
}
static int vpic_icw1(__unused struct vpic *vpic, struct pic *pic, uint8_t val)
{
dev_dbg(ACRN_DBG_PIC, "vm 0x%x: pic icw1 0x%x\n",
vpic->vm, val);
pic->ready = false;
pic->icw_num = 1;
pic->request = 0;
pic->mask = 0;
pic->lowprio = 7;
pic->rd_cmd_reg = 0;
pic->poll = 0;
pic->smm = 0;
if ((val & ICW1_SNGL) != 0) {
dev_dbg(ACRN_DBG_PIC, "vpic cascade mode required\n");
return -1;
}
if ((val & ICW1_IC4) == 0) {
dev_dbg(ACRN_DBG_PIC, "vpic icw4 required\n");
return -1;
}
pic->icw_num++;
return 0;
}
static int vpic_icw2(__unused struct vpic *vpic, struct pic *pic, uint8_t val)
{
dev_dbg(ACRN_DBG_PIC, "vm 0x%x: pic icw2 0x%x\n",
vpic->vm, val);
pic->irq_base = val & 0xf8;
pic->icw_num++;
return 0;
}
static int vpic_icw3(__unused struct vpic *vpic, struct pic *pic,
__unused uint8_t val)
{
dev_dbg(ACRN_DBG_PIC, "vm 0x%x: pic icw3 0x%x\n",
vpic->vm, val);
pic->icw_num++;
return 0;
}
static int vpic_icw4(struct vpic *vpic, struct pic *pic, uint8_t val)
{
dev_dbg(ACRN_DBG_PIC, "vm 0x%x: pic icw4 0x%x\n",
vpic->vm, val);
if ((val & ICW4_8086) == 0) {
dev_dbg(ACRN_DBG_PIC,
"vpic microprocessor mode required\n");
return -1;
}
if ((val & ICW4_AEOI) != 0)
pic->aeoi = true;
if ((val & ICW4_SFNM) != 0) {
if (master_pic(vpic, pic)) {
pic->sfn = true;
} else {
dev_dbg(ACRN_DBG_PIC,
"Ignoring special fully nested mode on slave pic: %#x",
val);
}
}
pic->icw_num = 0;
pic->ready = true;
return 0;
}
bool vpic_is_pin_mask(struct vpic *vpic, uint8_t virt_pin)
{
struct pic *pic;
if (virt_pin < 8)
pic = &vpic->pic[0];
else if (virt_pin < 16) {
pic = &vpic->pic[1];
virt_pin -= 8;
} else
return true;
if (pic->mask & (1 << virt_pin))
return true;
else
return false;
}
static int vpic_ocw1(struct vpic *vpic, struct pic *pic, uint8_t val)
{
int pin, i, bit;
uint8_t old = pic->mask;
dev_dbg(ACRN_DBG_PIC, "vm 0x%x: pic ocw1 0x%x\n",
vpic->vm, val);
pic->mask = val & 0xff;
/* Query and set up remapping if the pin/IRQ belongs to a pass-through device */
PIC_PIN_FOREACH(pin, pic, i) {
bit = (1 << pin);
/* Remap on activation: the interrupt goes from masked to unmasked.
* Remap on deactivation: the vIOAPIC takes the pin over.
*/
if (((pic->mask & bit) == 0) && (old & bit)) {
struct ptdev_intx_info intx;
/* Master PIC pin 2 is connected to the slave PIC,
* not to a device, so no pass-through remap is needed.
*/
if ((pin == 2) && master_pic(vpic, pic))
continue;
intx.virt_pin = pin;
intx.vpin_src = PTDEV_VPIN_PIC;
if (!master_pic(vpic, pic))
intx.virt_pin += 8;
ptdev_intx_pin_remap(vpic->vm, &intx);
}
}
return 0;
}
static int vpic_ocw2(struct vpic *vpic, struct pic *pic, uint8_t val)
{
dev_dbg(ACRN_DBG_PIC, "vm 0x%x: pic ocw2 0x%x\n",
vpic->vm, val);
pic->rotate = ((val & OCW2_R) != 0);
if ((val & OCW2_EOI) != 0) {
int isr_bit;
if ((val & OCW2_SL) != 0) {
/* specific EOI */
isr_bit = val & 0x7;
} else {
/* non-specific EOI */
isr_bit = vpic_get_highest_isrpin(pic);
}
if (isr_bit != -1) {
pic->service &= ~(1 << isr_bit);
if (pic->rotate)
pic->lowprio = isr_bit;
}
/* If the pin is level-triggered, ack the pass-through device */
if (pic->elc & (1 << (isr_bit & 0x7))) {
ptdev_intx_ack(vpic->vm,
master_pic(vpic, pic) ? isr_bit : isr_bit + 8,
PTDEV_VPIN_PIC);
}
} else if ((val & OCW2_SL) != 0 && pic->rotate == true) {
/* specific priority */
pic->lowprio = val & 0x7;
}
return 0;
}
static int vpic_ocw3(__unused struct vpic *vpic, struct pic *pic, uint8_t val)
{
dev_dbg(ACRN_DBG_PIC, "vm 0x%x: pic ocw3 0x%x\n",
vpic->vm, val);
if (val & OCW3_ESMM) {
pic->smm = val & OCW3_SMM ? 1 : 0;
dev_dbg(ACRN_DBG_PIC, "%s pic special mask mode %s\n",
master_pic(vpic, pic) ? "master" : "slave",
pic->smm ? "enabled" : "disabled");
}
if (val & OCW3_RR) {
/* read register command */
pic->rd_cmd_reg = val & OCW3_RIS;
/* Polling mode */
pic->poll = ((val & OCW3_P) != 0);
}
return 0;
}
static void vpic_set_pinstate(struct vpic *vpic, int pin, bool newstate)
{
struct pic *pic;
int oldcnt, newcnt;
bool level;
ASSERT(pin >= 0 && pin < 16,
"vpic_set_pinstate: invalid pin number");
pic = &vpic->pic[pin >> 3];
oldcnt = pic->acnt[pin & 0x7];
if (newstate)
pic->acnt[pin & 0x7]++;
else
pic->acnt[pin & 0x7]--;
newcnt = pic->acnt[pin & 0x7];
if (newcnt < 0) {
pr_warn("pic pin%d: bad acnt %d\n", pin, newcnt);
}
level = ((vpic->pic[pin >> 3].elc & (1 << (pin & 0x7))) != 0);
if ((oldcnt == 0 && newcnt == 1) || (newcnt > 0 && level == true)) {
/* rising edge or level */
dev_dbg(ACRN_DBG_PIC, "pic pin%d: asserted\n", pin);
pic->request |= (1 << (pin & 0x7));
} else if (oldcnt == 1 && newcnt == 0) {
/* falling edge */
dev_dbg(ACRN_DBG_PIC, "pic pin%d: deasserted\n", pin);
if (level)
pic->request &= ~(1 << (pin & 0x7));
} else {
dev_dbg(ACRN_DBG_PIC,
"pic pin%d: %s, ignored, acnt %d\n",
pin, newstate ? "asserted" : "deasserted", newcnt);
}
vpic_notify_intr(vpic);
}
static int vpic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate)
{
struct vpic *vpic;
struct pic *pic;
if (irq < 0 || irq > 15)
return -EINVAL;
vpic = vm_pic(vm);
pic = &vpic->pic[irq >> 3];
if (pic->ready == false)
return 0;
VPIC_LOCK(vpic);
switch (irqstate) {
case IRQSTATE_ASSERT:
vpic_set_pinstate(vpic, irq, true);
break;
case IRQSTATE_DEASSERT:
vpic_set_pinstate(vpic, irq, false);
break;
case IRQSTATE_PULSE:
vpic_set_pinstate(vpic, irq, true);
vpic_set_pinstate(vpic, irq, false);
break;
default:
ASSERT(0, "vpic_set_irqstate: invalid irqstate");
}
VPIC_UNLOCK(vpic);
return 0;
}
/* hypervisor interface: assert/deassert/pulse irq */
int vpic_assert_irq(struct vm *vm, int irq)
{
return vpic_set_irqstate(vm, irq, IRQSTATE_ASSERT);
}
int vpic_deassert_irq(struct vm *vm, int irq)
{
return vpic_set_irqstate(vm, irq, IRQSTATE_DEASSERT);
}
int vpic_pulse_irq(struct vm *vm, int irq)
{
return vpic_set_irqstate(vm, irq, IRQSTATE_PULSE);
}
int vpic_set_irq_trigger(struct vm *vm, int irq, enum vpic_trigger trigger)
{
struct vpic *vpic;
if (irq < 0 || irq > 15)
return -EINVAL;
/*
* See comment in vpic_elc_handler. These IRQs must be
* edge triggered.
*/
if (trigger == LEVEL_TRIGGER) {
switch (irq) {
case 0:
case 1:
case 2:
case 8:
case 13:
return -EINVAL;
}
}
vpic = vm_pic(vm);
VPIC_LOCK(vpic);
if (trigger == LEVEL_TRIGGER)
vpic->pic[irq >> 3].elc |= 1 << (irq & 0x7);
else
vpic->pic[irq >> 3].elc &= ~(1 << (irq & 0x7));
VPIC_UNLOCK(vpic);
return 0;
}
int vpic_get_irq_trigger(struct vm *vm, int irq, enum vpic_trigger *trigger)
{
struct vpic *vpic;
if (irq < 0 || irq > 15)
return -EINVAL;
vpic = vm_pic(vm);
if (!vpic)
return -EINVAL;
if (vpic->pic[irq>>3].elc & (1 << (irq & 0x7)))
*trigger = LEVEL_TRIGGER;
else
*trigger = EDGE_TRIGGER;
return 0;
}
void vpic_pending_intr(struct vm *vm, int *vecptr)
{
struct vpic *vpic;
struct pic *pic;
int pin;
vpic = vm_pic(vm);
pic = &vpic->pic[0];
VPIC_LOCK(vpic);
pin = vpic_get_highest_irrpin(pic);
if (pin == 2) {
pic = &vpic->pic[1];
pin = vpic_get_highest_irrpin(pic);
}
/*
 * If there are no pins active at this moment then return -1
 * (no vector) instead.
 */
if (pin == -1) {
*vecptr = -1;
VPIC_UNLOCK(vpic);
return;
}
ASSERT(pin >= 0 && pin <= 7, "invalid pin");
*vecptr = pic->irq_base + pin;
dev_dbg(ACRN_DBG_PIC, "Got pending vector 0x%x\n", *vecptr);
VPIC_UNLOCK(vpic);
}
static void vpic_pin_accepted(struct pic *pic, int pin)
{
pic->intr_raised = false;
if ((pic->elc & (1 << pin)) == 0) {
/* Only clear the request bit for edge-triggered pins */
pic->request &= ~(1 << pin);
}
if (pic->aeoi == true) {
if (pic->rotate == true)
pic->lowprio = pin;
} else {
pic->service |= (1 << pin);
}
}
void vpic_intr_accepted(struct vm *vm, int vector)
{
struct vpic *vpic;
int pin;
vpic = vm_pic(vm);
VPIC_LOCK(vpic);
pin = vector & 0x7;
if ((vector & ~0x7) == vpic->pic[1].irq_base) {
vpic_pin_accepted(&vpic->pic[1], pin);
/*
* If this vector originated from the slave,
* accept the cascaded interrupt too.
*/
vpic_pin_accepted(&vpic->pic[0], 2);
} else {
vpic_pin_accepted(&vpic->pic[0], pin);
}
vpic_notify_intr(vpic);
VPIC_UNLOCK(vpic);
}
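/*
 * Bit 0 of the I/O port (ICU_IMR_OFFSET) selects the data port
 * (0x21/0xA1: IMR reads, ICW2-ICW4/OCW1 writes) versus the command
 * port (0x20/0xA0: ICW1/OCW2/OCW3 writes, IRR/ISR/poll reads).
 */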
static int vpic_read(struct vpic *vpic, struct pic *pic,
int port, uint32_t *eax)
{
int pin;
VPIC_LOCK(vpic);
if (pic->poll) {
pic->poll = 0;
pin = vpic_get_highest_irrpin(pic);
if (pin >= 0) {
vpic_pin_accepted(pic, pin);
*eax = 0x80 | pin;
} else {
*eax = 0;
}
} else {
if (port & ICU_IMR_OFFSET) {
/* read interrupt mask register */
*eax = pic->mask;
} else {
if (pic->rd_cmd_reg == OCW3_RIS) {
/* read interrupt service register */
*eax = pic->service;
} else {
/* read interrupt request register */
*eax = pic->request;
}
}
}
VPIC_UNLOCK(vpic);
return 0;
}
static int vpic_write(struct vpic *vpic, struct pic *pic,
int port, uint32_t *eax)
{
int error;
uint8_t val;
error = 0;
val = *eax;
VPIC_LOCK(vpic);
if (port & ICU_IMR_OFFSET) {
switch (pic->icw_num) {
case 2:
error = vpic_icw2(vpic, pic, val);
break;
case 3:
error = vpic_icw3(vpic, pic, val);
break;
case 4:
error = vpic_icw4(vpic, pic, val);
break;
default:
error = vpic_ocw1(vpic, pic, val);
break;
}
} else {
if (val & (1 << 4))
error = vpic_icw1(vpic, pic, val);
if (pic->ready) {
if (val & (1 << 3))
error = vpic_ocw3(vpic, pic, val);
else
error = vpic_ocw2(vpic, pic, val);
}
}
if (pic->ready)
vpic_notify_intr(vpic);
VPIC_UNLOCK(vpic);
return error;
}
static int vpic_master_handler(struct vm *vm, bool in, int port, int bytes,
uint32_t *eax)
{
struct vpic *vpic;
struct pic *pic;
vpic = vm_pic(vm);
pic = &vpic->pic[0];
if (bytes != 1)
return -1;
if (in)
return vpic_read(vpic, pic, port, eax);
return vpic_write(vpic, pic, port, eax);
}
static uint32_t vpic_master_io_read(__unused struct vm_io_handler *hdlr,
struct vm *vm, ioport_t addr, size_t width)
{
uint32_t val = 0;
if (vpic_master_handler(vm, true, (int)addr, (int)width, &val) < 0)
pr_err("pic master read port 0x%x width=%d failed\n",
addr, width);
return val;
}
static void vpic_master_io_write(__unused struct vm_io_handler *hdlr,
struct vm *vm, ioport_t addr, size_t width, uint32_t v)
{
uint32_t val = v;
if (vpic_master_handler(vm, false, (int)addr, (int)width, &val) < 0)
pr_err("%s: write port 0x%x width=%d value 0x%x failed\n",
__func__, addr, width, val);
}
static int vpic_slave_handler(struct vm *vm, bool in, int port, int bytes,
uint32_t *eax)
{
struct vpic *vpic;
struct pic *pic;
vpic = vm_pic(vm);
pic = &vpic->pic[1];
if (bytes != 1)
return -1;
if (in)
return vpic_read(vpic, pic, port, eax);
return vpic_write(vpic, pic, port, eax);
}
static uint32_t vpic_slave_io_read(__unused struct vm_io_handler *hdlr,
struct vm *vm, ioport_t addr, size_t width)
{
uint32_t val = 0;
if (vpic_slave_handler(vm, true, (int)addr, (int)width, &val) < 0)
pr_err("pic slave read port 0x%x width=%d failed\n",
addr, width);
return val;
}
static void vpic_slave_io_write(__unused struct vm_io_handler *hdlr,
struct vm *vm, ioport_t addr, size_t width, uint32_t v)
{
uint32_t val = v;
if (vpic_slave_handler(vm, false, (int)addr, (int)width, &val) < 0)
pr_err("%s: write port 0x%x width=%d value 0x%x failed\n",
__func__, addr, width, val);
}
static int vpic_elc_handler(struct vm *vm, bool in, int port, int bytes,
uint32_t *eax)
{
struct vpic *vpic;
bool is_master;
vpic = vm_pic(vm);
is_master = (port == IO_ELCR1);
if (bytes != 1)
return -1;
VPIC_LOCK(vpic);
if (in) {
if (is_master)
*eax = vpic->pic[0].elc;
else
*eax = vpic->pic[1].elc;
} else {
/*
* For the master PIC the cascade channel (IRQ2), the
* heart beat timer (IRQ0), and the keyboard
* controller (IRQ1) cannot be programmed for level
* mode.
*
* For the slave PIC the real time clock (IRQ8) and
* the floating point error interrupt (IRQ13) cannot
* be programmed for level mode.
*/
if (is_master)
vpic->pic[0].elc = (*eax & 0xf8);
else
vpic->pic[1].elc = (*eax & 0xde);
}
VPIC_UNLOCK(vpic);
return 0;
}
static uint32_t vpic_elc_io_read(__unused struct vm_io_handler *hdlr,
struct vm *vm, ioport_t addr, size_t width)
{
uint32_t val = 0;
if (vpic_elc_handler(vm, true, (int)addr, (int)width, &val) < 0)
pr_err("pic elc read port 0x%x width=%d failed", addr, width);
return val;
}
static void vpic_elc_io_write(__unused struct vm_io_handler *hdlr,
struct vm *vm, ioport_t addr, size_t width, uint32_t v)
{
uint32_t val = v;
if (vpic_elc_handler(vm, false, (int)addr, (int)width, &val) < 0)
pr_err("%s: write port 0x%x width=%d value 0x%x failed\n",
__func__, addr, width, val);
}
void vpic_register_io_handler(struct vm *vm)
{
struct vm_io_range master_range = {
.flags = IO_ATTR_RW,
.base = 0x20,
.len = 2
};
struct vm_io_range slave_range = {
.flags = IO_ATTR_RW,
.base = 0xa0,
.len = 2
};
struct vm_io_range elcr_range = {
.flags = IO_ATTR_RW,
.base = 0x4d0,
.len = 2
};
register_io_emulation_handler(vm, &master_range,
&vpic_master_io_read, &vpic_master_io_write);
register_io_emulation_handler(vm, &slave_range,
&vpic_slave_io_read, &vpic_slave_io_write);
register_io_emulation_handler(vm, &elcr_range,
&vpic_elc_io_read, &vpic_elc_io_write);
}
void *vpic_init(struct vm *vm)
{
struct vpic *vpic;
vpic_register_io_handler(vm);
vpic = malloc(sizeof(struct vpic));
ASSERT(vpic != NULL, "");
vpic->vm = vm;
vpic->pic[0].mask = 0xff;
vpic->pic[1].mask = 0xff;
VPIC_LOCK_INIT(vpic);
return vpic;
}
void vpic_cleanup(struct vm *vm)
{
if (vm->vpic) {
free(vm->vpic);
vm->vpic = NULL;
}
}

441
hypervisor/arch/x86/idt.S Normal file

@@ -0,0 +1,441 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <gdt.h>
#include <idt.h>
.altmacro
.global HOST_IDT
.global HOST_IDTR
.section .data
.align 8
.long 0
.short 0
HOST_IDTR:
.short HOST_IDT_SIZE - 1
.quad HOST_IDT
/*
* We'll rearrange and fix up the descriptors at runtime
*/
.macro interrupt_descriptor entry, dpl=0 ist=0
.long HOST_GDT_RING0_CODE_SEL << 16
.long 0x00008e00 + (dpl << 13) + ist
.quad entry
.endm
.macro trap_descriptor entry, dpl=0, ist=0
.long HOST_GDT_RING0_CODE_SEL << 16
.long 0x00008f00 + (dpl <<13) + ist
.quad entry
.endm
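/*
 * Descriptor layout: the selector dword places HOST_GDT_RING0_CODE_SEL
 * in bits 16-31; the attribute dword 0x00008e00 (interrupt gate) or
 * 0x00008f00 (trap gate) sets P=1 and the gate type, with the DPL in
 * bits 13-14 and the IST index in bits 0-2. The handler address is
 * stored as a raw quad and split into the offset fields when the table
 * is fixed up at runtime (see the comment above).
 */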
.macro _external_interrupt_descriptor vector
__external_interrupt_descriptor %vector
.endm
.macro __external_interrupt_descriptor vector
interrupt_descriptor external_interrupt_\vector
.endm
#define MACHINE_CHECK_IST (0x1)
#define DOUBLE_FAULT_IST (0x2)
#define STACK_FAULT_IST (0x3)
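/*
 * These IST indices give the machine check, double fault and stack
 * fault handlers dedicated, known-good stacks from the TSS, since the
 * faulting context's own stack may no longer be usable.
 */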
/*
* We'll use interrupt gates. Change to trap or task only as needed.
*/
.section .rodata
.align 16
HOST_IDT:
interrupt_descriptor excp_divide_error
interrupt_descriptor excp_debug, 3
interrupt_descriptor excp_nmi
interrupt_descriptor excp_breakpoint, 3
interrupt_descriptor excp_overflow, 3
interrupt_descriptor excp_bounds_check
interrupt_descriptor excp_illegal_opcode
interrupt_descriptor excp_device_not_available
interrupt_descriptor excp_double_fault, 0, DOUBLE_FAULT_IST
interrupt_descriptor excp_rsvd_09
interrupt_descriptor excp_invalid_tss
interrupt_descriptor excp_segment_not_present
interrupt_descriptor excp_stack_fault, 0, STACK_FAULT_IST
interrupt_descriptor excp_general_protection
interrupt_descriptor excp_page_fault
interrupt_descriptor excp_rsvd_0f
interrupt_descriptor excp_float_error
interrupt_descriptor excp_alignment_check
interrupt_descriptor expt_machine_check, 0, MACHINE_CHECK_IST
interrupt_descriptor excp_simd_fp_error
interrupt_descriptor excp_virtualization
interrupt_descriptor excp_rsvd_21
interrupt_descriptor excp_rsvd_22
interrupt_descriptor excp_rsvd_23
interrupt_descriptor excp_rsvd_24
interrupt_descriptor excp_rsvd_25
interrupt_descriptor excp_rsvd_26
interrupt_descriptor excp_rsvd_27
interrupt_descriptor excp_rsvd_28
interrupt_descriptor excp_rsvd_29
interrupt_descriptor excp_rsvd_30
interrupt_descriptor excp_rsvd_31
vector = 0x20
.rept (0x100 - 0x20)
_external_interrupt_descriptor vector
vector = vector + 1
.endr
.section .text
.align 16
excp_divide_error:
pushq $0x0 /* pseudo error code */
pushq $0x00
jmp excp_save_frame
.align 8
excp_debug:
pushq $0x0 /* pseudo error code */
pushq $0x01
jmp excp_save_frame
.align 8
excp_nmi:
pushq $0x0 /* pseudo error code */
pushq $0x02
jmp excp_save_frame
.align 8
excp_breakpoint:
pushq $0x0 /* pseudo error code */
pushq $0x03
jmp excp_save_frame
.align 8
excp_overflow:
pushq $0x0 /* pseudo error code */
pushq $0x04
jmp excp_save_frame
.align 8
excp_bounds_check:
pushq $0x0 /* pseudo error code */
pushq $0x05
jmp excp_save_frame
.align 8
excp_illegal_opcode:
pushq $0x0 /* pseudo error code */
pushq $0x06
jmp excp_save_frame
.align 8
excp_device_not_available:
pushq $0x0 /* pseudo error code */
pushq $0x07
jmp excp_save_frame
.align 8
excp_double_fault:
pushq $0x08
jmp excp_save_frame
.align 8
excp_invalid_tss:
pushq $0x0A
jmp excp_save_frame
.align 8
excp_segment_not_present:
pushq $0x0B
jmp excp_save_frame
.align 8
excp_stack_fault:
pushq $0x0C
jmp excp_save_frame
.align 8
excp_general_protection:
pushq $0x0D
jmp excp_save_frame
.align 8
excp_page_fault:
pushq $0x0E
jmp excp_save_frame
.align 8
excp_float_error:
pushq $0x0 /* pseudo error code */
pushq $0x10
jmp excp_save_frame
.align 8
excp_alignment_check:
pushq $0x11
jmp excp_save_frame
.align 8
expt_machine_check:
pushq $0x0 /* pseudo error code */
pushq $0x12
jmp excp_save_frame
.align 8
excp_simd_fp_error:
pushq $0x0 /* pseudo error code */
pushq $0x13
jmp excp_save_frame
.align 8
excp_virtualization:
pushq $0x0 /* pseudo error code */
pushq $0x14
jmp excp_save_frame
/*
* Macros for rsvd vectors. Vectors 0x09, 0x0F, 0x15 through 0x1F
*/
.macro _rsvd_vector vector
__rsvd_vector %vector
.endm
.macro __rsvd_vector vector
.align 8
excp_rsvd_\vector\():
pushq $0x0 /* pseudo error code */
pushq $\vector
jmp excp_rsvd
.endm
.align 8
excp_rsvd_09:
_rsvd_vector 0x09
.align 8
excp_rsvd_0f:
_rsvd_vector 0x0f
vector = 0x15
.rept (0x20 - 0x15)
_rsvd_vector vector
vector = vector + 1
.endr
/*
* Macros for external interrupts. Vectors 0x20 through 0xFF
*/
.macro _external_interrupt vector
__external_interrupt %vector
.endm
.macro __external_interrupt vector
.align 8
external_interrupt_\vector\():
pushq $0x0 /* pseudo error code */
pushq $\vector
jmp external_interrupt_save_frame
.endm
vector = 0x20
.rept (0x100 - 0x20)
_external_interrupt vector
vector = vector + 1
.endr
/*
* Common entry point for defined exceptions
*/
.align 8
excp_save_frame:
pushq %r11
pushq %r10
pushq %r9
pushq %r8
pushq %rdi
pushq %rsi
pushq %rdx
pushq %rcx
pushq %rax
pushq %rbp
pushq %rbx
pushq %r15
pushq %r14
pushq %r13
pushq %r12
/* Put current stack pointer into 1st param register (rdi) */
movq %rsp, %rdi
call dispatch_exception
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbx
popq %rbp
popq %rax
popq %rcx
popq %rdx
popq %rsi
popq %rdi
popq %r8
popq %r9
popq %r10
popq %r11
/* Skip vector and error code*/
add $16, %rsp
iretq
/*
* Common entry point for reserved exceptions.
* These should never execute.
* We put a handler on them anyway to highlight the unexpected.
*/
.align 8
excp_rsvd:
pushq %r11
pushq %r10
pushq %r9
pushq %r8
pushq %rdi
pushq %rsi
pushq %rdx
pushq %rcx
pushq %rax
pushq %rbp
pushq %rbx
pushq %r15
pushq %r14
pushq %r13
pushq %r12
/* Put current stack pointer into 1st param register (rdi) */
movq %rsp, %rdi
call dispatch_exception
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbx
popq %rbp
popq %rax
popq %rcx
popq %rdx
popq %rsi
popq %rdi
popq %r8
popq %r9
popq %r10
popq %r11
/* Skip vector and error code*/
add $16, %rsp
iretq
/*
* Common entry point for defined interrupts.
* Vectors 0x20 through 0xFF
*/
.align 8
external_interrupt_save_frame:
pushq %r11
pushq %r10
pushq %r9
pushq %r8
pushq %rdi
pushq %rsi
pushq %rdx
pushq %rcx
pushq %rax
pushq %rbp
pushq %rbx
pushq %r15
pushq %r14
pushq %r13
pushq %r12
/* Put current stack pointer into 1st param register (rdi) */
movq %rsp, %rdi
call dispatch_interrupt
/*
* We disable softirq path from interrupt IRET, since right now all IRQ
* are for Guest, and we can execute softirq in hv_main() loop
*/
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbx
popq %rbp
popq %rax
popq %rcx
popq %rdx
popq %rsi
popq %rdi
popq %r8
popq %r9
popq %r10
popq %r11
/* Skip vector and error code*/
add $16, %rsp
iretq


@@ -0,0 +1,431 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <hv_lib.h>
#include <acrn_common.h>
#include <hv_arch.h>
#include <hv_debug.h>
#define EXCEPTION_ERROR_CODE_VALID 8
#define INTERRPUT_QUEUE_BUFF_SIZE 255
#define ACRN_DBG_INTR 6
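/*
 * Per-vector VMX interruption type used when injecting exceptions into
 * the guest. Vectors that also deliver an error code (8, 10-14, 17)
 * are tagged with EXCEPTION_ERROR_CODE_VALID so the injection path
 * knows to write VMX_ENTRY_EXCEPTION_EC as well.
 */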
static const uint16_t exception_type[] = {
[0] = VMX_INT_TYPE_HW_EXP,
[1] = VMX_INT_TYPE_HW_EXP,
[2] = VMX_INT_TYPE_HW_EXP,
[3] = VMX_INT_TYPE_HW_EXP,
[4] = VMX_INT_TYPE_HW_EXP,
[5] = VMX_INT_TYPE_HW_EXP,
[6] = VMX_INT_TYPE_HW_EXP,
[7] = VMX_INT_TYPE_HW_EXP,
[8] = VMX_INT_TYPE_HW_EXP | EXCEPTION_ERROR_CODE_VALID,
[9] = VMX_INT_TYPE_HW_EXP,
[10] = VMX_INT_TYPE_HW_EXP | EXCEPTION_ERROR_CODE_VALID,
[11] = VMX_INT_TYPE_HW_EXP | EXCEPTION_ERROR_CODE_VALID,
[12] = VMX_INT_TYPE_HW_EXP | EXCEPTION_ERROR_CODE_VALID,
[13] = VMX_INT_TYPE_HW_EXP | EXCEPTION_ERROR_CODE_VALID,
[14] = VMX_INT_TYPE_HW_EXP | EXCEPTION_ERROR_CODE_VALID,
[15] = VMX_INT_TYPE_HW_EXP,
[16] = VMX_INT_TYPE_HW_EXP,
[17] = VMX_INT_TYPE_HW_EXP | EXCEPTION_ERROR_CODE_VALID,
[18] = VMX_INT_TYPE_HW_EXP,
[19] = VMX_INT_TYPE_HW_EXP,
[20] = VMX_INT_TYPE_HW_EXP,
[21] = VMX_INT_TYPE_HW_EXP,
[22] = VMX_INT_TYPE_HW_EXP,
[23] = VMX_INT_TYPE_HW_EXP,
[24] = VMX_INT_TYPE_HW_EXP,
[25] = VMX_INT_TYPE_HW_EXP,
[26] = VMX_INT_TYPE_HW_EXP,
[27] = VMX_INT_TYPE_HW_EXP,
[28] = VMX_INT_TYPE_HW_EXP,
[29] = VMX_INT_TYPE_HW_EXP,
[30] = VMX_INT_TYPE_HW_EXP,
[31] = VMX_INT_TYPE_HW_EXP
};
static int is_guest_irq_enabled(struct vcpu *vcpu)
{
struct run_context *cur_context =
&vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context];
uint32_t guest_rflags, guest_state;
int status = false;
/* Read the RFLAGS of the guest */
guest_rflags = cur_context->rflags;
/* Check the RFLAGS[IF] bit first */
if (guest_rflags & HV_ARCH_VCPU_RFLAGS_IF) {
/* Interrupts are allowed */
/* Check for temporarily disabled interrupts */
guest_state = exec_vmread(VMX_GUEST_INTERRUPTIBILITY_INFO);
if ((guest_state & (HV_ARCH_VCPU_BLOCKED_BY_STI |
HV_ARCH_VCPU_BLOCKED_BY_MOVSS)) == 0) {
status = true;
}
}
return status;
}
static bool vcpu_pending_request(struct vcpu *vcpu)
{
struct vlapic *vlapic;
int vector = 0;
int ret = 0;
/* Query vLapic to get vector to inject */
vlapic = vcpu->arch_vcpu.vlapic;
ret = vlapic_pending_intr(vlapic, &vector);
/* Check the LAPIC IRR and raise a request if an event
 * is pending there
 */
if (ret != 0) {
/* we have pending IRR */
vcpu_make_request(vcpu, ACRN_REQUEST_EVENT);
}
return vcpu->arch_vcpu.pending_intr != 0;
}
int vcpu_make_request(struct vcpu *vcpu, int eventid)
{
bitmap_set(eventid, &vcpu->arch_vcpu.pending_intr);
/*
 * If the current host CPU is not the target vCPU's host CPU, send an
 * IPI to wake up the target vCPU.
 *
 * TODO: we simply compare CPU IDs here, since the mapping is currently
 * a global 1:1 pCPU/vCPU mapping. If vCPU scheduling is enabled later,
 * this must also determine whether the target vCPU is in VMX non-root
 * or root mode.
 */
if ((int)get_cpu_id() != vcpu->pcpu_id)
send_single_ipi(vcpu->pcpu_id, VECTOR_NOTIFY_VCPU);
return 0;
}
static int vcpu_do_pending_event(struct vcpu *vcpu)
{
struct vlapic *vlapic = vcpu->arch_vcpu.vlapic;
int vector = 0;
int ret = 0;
if (is_apicv_enabled()) {
apicv_inject_pir(vlapic);
return 0;
}
/* Query vLapic to get vector to inject */
ret = vlapic_pending_intr(vlapic, &vector);
/*
* From the Intel SDM, Volume 3, 6.3.2 Section "Maskable
* Hardware Interrupts":
* - maskable interrupt vectors [16,255] can be delivered
* through the local APIC.
*/
if (ret == 0)
return -1;
if (!(vector >= 16 && vector <= 255)) {
dev_dbg(ACRN_DBG_INTR, "invalid vector %d from local APIC",
vector);
return -1;
}
exec_vmwrite(VMX_ENTRY_INT_INFO_FIELD, VMX_INT_INFO_VALID |
(vector & 0xFF));
vlapic_intr_accepted(vlapic, vector);
return 0;
}
static int vcpu_do_pending_extint(struct vcpu *vcpu)
{
struct vm *vm;
struct vcpu *primary;
int vector;
vm = vcpu->vm;
/* Check whether there is a valid interrupt from the vPIC; if so, inject it */
/* The PIC is only connected to the primary vCPU */
primary = get_primary_vcpu(vm);
if (vm->vpic && vcpu == primary) {
vpic_pending_intr(vcpu->vm, &vector);
if (vector > 0) {
dev_dbg(ACRN_DBG_INTR, "VPIC: to inject PIC vector %d\n",
vector & 0xFF);
exec_vmwrite(VMX_ENTRY_INT_INFO_FIELD,
VMX_INT_INFO_VALID |
(vector & 0xFF));
vpic_intr_accepted(vcpu->vm, vector);
}
}
return 0;
}
static int vcpu_do_pending_gp(__unused struct vcpu *vcpu)
{
/* GP vector = 13 */
exec_vmwrite(VMX_ENTRY_INT_INFO_FIELD,
VMX_INT_INFO_VALID | 13);
return 0;
}
/* please keep this for interrupt debug:
* 1. Timer alive or not
* 2. native LAPIC interrupt pending/EOI status
* 3. CPU stuck or not
*/
void dump_lapic(void)
{
dev_dbg(ACRN_DBG_INTR,
"LAPIC: TIME %08x, init=0x%x cur=0x%x ISR=0x%x IRR=0x%x",
mmio_read_long(0xFEE00000 + LAPIC_LVT_TIMER_REGISTER),
mmio_read_long(0xFEE00000 + LAPIC_INITIAL_COUNT_REGISTER),
mmio_read_long(0xFEE00000 + LAPIC_CURRENT_COUNT_REGISTER),
mmio_read_long(0xFEE00000 + LAPIC_IN_SERVICE_REGISTER_7),
mmio_read_long(0xFEE00000 + LAPIC_INT_REQUEST_REGISTER_7));
}
int vcpu_inject_extint(struct vcpu *vcpu)
{
return vcpu_make_request(vcpu, ACRN_REQUEST_EXTINT);
}
int vcpu_inject_nmi(struct vcpu *vcpu)
{
return vcpu_make_request(vcpu, ACRN_REQUEST_NMI);
}
int vcpu_inject_gp(struct vcpu *vcpu)
{
return vcpu_make_request(vcpu, ACRN_REQUEST_GP);
}
int interrupt_win_exiting_handler(struct vcpu *vcpu)
{
int value32;
TRACE_2L(TRC_VMEXIT_INTERRUPT_WINDOW, 0, 0);
if (!vcpu)
return -1;
if (vcpu_pending_request(vcpu)) {
/* Do nothing
* acrn_do_intr_process will continue for this vcpu
*/
} else {
/* No interrupts to inject.
* Disable the interrupt window exiting
*/
vcpu->arch_vcpu.irq_window_enabled = 0;
value32 = exec_vmread(VMX_PROC_VM_EXEC_CONTROLS);
value32 &= ~(VMX_PROCBASED_CTLS_IRQ_WIN);
exec_vmwrite(VMX_PROC_VM_EXEC_CONTROLS, value32);
}
VCPU_RETAIN_RIP(vcpu);
return 0;
}
int external_interrupt_handler(struct vcpu *vcpu)
{
int vector = exec_vmread(VMX_EXIT_INT_INFO) & 0xFF;
struct intr_ctx ctx;
ctx.vector = vector;
/* do not RETAIN RIP for spurious interrupt */
if (dispatch_interrupt(&ctx) == 0)
VCPU_RETAIN_RIP(vcpu);
TRACE_2L(TRC_VMEXIT_EXTERNAL_INTERRUPT, vector, 0);
return 0;
}
int acrn_do_intr_process(struct vcpu *vcpu)
{
int ret = 0;
int vector;
int tmp;
bool intr_pending = false;
uint64_t *pending_intr_bits = &vcpu->arch_vcpu.pending_intr;
if (bitmap_test_and_clear(ACRN_REQUEST_TLB_FLUSH, pending_intr_bits))
mmu_invept(vcpu);
if (bitmap_test_and_clear(ACRN_REQUEST_TMR_UPDATE, pending_intr_bits))
vioapic_update_tmr(vcpu);
/* Handle pending vector injection:
 * injection can fail for many reasons, so re-inject it here
 */
if (vcpu->arch_vcpu.exit_interrupt_info & VMX_INT_INFO_VALID) {
exec_vmwrite(VMX_ENTRY_INT_INFO_FIELD,
vcpu->arch_vcpu.exit_interrupt_info);
goto INTR_WIN;
}
/* handling exception request */
vector = vcpu->arch_vcpu.exception_info.exception;
/* If there is a valid exception, inject exception to guest */
if (vector >= 0) {
if (exception_type[vector] &
EXCEPTION_ERROR_CODE_VALID) {
exec_vmwrite(VMX_ENTRY_EXCEPTION_EC,
vcpu->arch_vcpu.exception_info.error);
}
exec_vmwrite(VMX_ENTRY_INT_INFO_FIELD,
VMX_INT_INFO_VALID |
((exception_type[vector] & 15) << 8)
| (vector & 0xFF));
vcpu->arch_vcpu.exception_info.exception = -1;
goto INTR_WIN;
}
/* Do pending interrupts process */
/* TODO: check the NMI interrupt window before injecting */
if (bitmap_test_and_clear(ACRN_REQUEST_NMI, pending_intr_bits)) {
/* Inject NMI vector = 2 */
exec_vmwrite(VMX_ENTRY_INT_INFO_FIELD,
VMX_INT_INFO_VALID | (VMX_INT_TYPE_NMI << 8) | 2);
/* Per Intel SDM 10.8.1:
 * NMI, SMI, INIT, ExtINT and SIPI are delivered directly to the CPU
 * and do not need an EOI to the LAPIC.
 * ExtINT, however, does need an EOI to the PIC.
 */
goto INTR_WIN;
}
/* Is the guest interruptible? */
if (!is_guest_irq_enabled(vcpu)) {
/* interrupt window unavailable */
goto INTR_WIN;
}
/* Inject external interrupt first */
if (bitmap_test_and_clear(ACRN_REQUEST_EXTINT, pending_intr_bits)) {
/* has pending external interrupts */
ret = vcpu_do_pending_extint(vcpu);
goto INTR_WIN;
}
/* Inject vLAPIC vectors */
if (bitmap_test_and_clear(ACRN_REQUEST_EVENT, pending_intr_bits)) {
/* has pending vLAPIC interrupts */
ret = vcpu_do_pending_event(vcpu);
goto INTR_WIN;
}
/* Inject GP event */
if (bitmap_test_and_clear(ACRN_REQUEST_GP, pending_intr_bits)) {
/* has pending GP interrupts */
ret = vcpu_do_pending_gp(vcpu);
goto INTR_WIN;
}
INTR_WIN:
/* check if we have new interrupt pending for next VMExit */
intr_pending = vcpu_pending_request(vcpu);
/* Enable interrupt window exiting if pending */
if (intr_pending && vcpu->arch_vcpu.irq_window_enabled == 0) {
vcpu->arch_vcpu.irq_window_enabled = 1;
tmp = exec_vmread(VMX_PROC_VM_EXEC_CONTROLS);
tmp |= (VMX_PROCBASED_CTLS_IRQ_WIN);
exec_vmwrite(VMX_PROC_VM_EXEC_CONTROLS, tmp);
}
return ret;
}
int exception_handler(struct vcpu *vcpu)
{
uint32_t intinfo, int_err_code;
uint32_t exception_vector;
uint32_t cpl;
int status = 0;
if (vcpu == NULL) {
TRACE_4I(TRC_VMEXIT_EXCEPTION_OR_NMI, 0, 0, 0, 0);
status = -EINVAL;
}
if (status != 0)
return status;
pr_dbg(" Handling guest exception");
/* Obtain VM-Exit information field pg 2912 */
intinfo = exec_vmread(VMX_EXIT_INT_INFO);
exception_vector = intinfo & 0xFF;
/* Check if the exception caused by the guest is a HW exception. If the
* exit occurred due to a HW exception, obtain the error code to be
* conveyed to the guest via the stack
*/
if (intinfo & VMX_INT_INFO_ERR_CODE_VALID) {
int_err_code = exec_vmread(VMX_EXIT_INT_EC);
/* get current privilege level and fault address */
cpl = exec_vmread(VMX_GUEST_CS_ATTR);
cpl = (cpl >> 5) & 3;
if (cpl < 3)
int_err_code &= ~4;
else
int_err_code |= 4;
} else {
int_err_code = 0;
}
/* Handle all other exceptions */
VCPU_RETAIN_RIP(vcpu);
vcpu->arch_vcpu.exception_info.exception = exception_vector;
vcpu->arch_vcpu.exception_info.error = int_err_code;
TRACE_4I(TRC_VMEXIT_EXCEPTION_OR_NMI,
exception_vector, int_err_code, 2, 0);
return status;
}


@@ -0,0 +1,418 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <hypervisor.h>
#include <hv_lib.h>
#include <acrn_common.h>
#include <hv_arch.h>
#include <bsp_extern.h>
#include <hv_debug.h>
/* Rate range 1 to 1000 or 1uSec to 1mSec */
#define APIC_TIMER_MAX 0xffffffff
#define HYPE_PERIOD_MAX 1000
#define APIC_DIVIDE_BY_ONE 0x0b
#define PIT_TARGET 0x3FFF
/* xAPIC/x2APIC Interrupt Command Register (ICR) structure */
union apic_icr {
uint64_t value;
struct {
uint32_t lo_32;
uint32_t hi_32;
} value_32;
struct {
uint64_t vector:8;
uint64_t delivery_mode:3;
uint64_t destination_mode:1;
uint64_t delivery_status:1;
uint64_t rsvd_1:1;
uint64_t level:1;
uint64_t trigger_mode:1;
uint64_t rsvd_2:2;
uint64_t shorthand:2;
uint64_t rsvd_3:12;
uint64_t rsvd_4:32;
} bits;
struct {
uint64_t rsvd_1:32;
uint64_t rsvd_2:24;
uint64_t dest_field:8;
} x_bits;
struct {
uint64_t rsvd_1:32;
uint64_t dest_field:32;
} x2_bits;
};
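/* Illustrative sketch (not part of the original code): composing a fixed
* IPI to vector 0xF0 for the CPU whose xAPIC ID is 1 could look like this;
* delivery mode 0 (Fixed) is written as a literal because the named
* constant is not visible in this file:
*
*	union apic_icr icr = { .value = 0 };
*	icr.bits.vector = 0xF0;
*	icr.bits.delivery_mode = 0;	/- Fixed -/
*	icr.bits.destination_mode = INTR_LAPIC_ICR_PHYSICAL;
*	icr.x_bits.dest_field = 1;	/- target xAPIC ID -/
*	write_lapic_reg32(LAPIC_INT_COMMAND_REGISTER_1, icr.value_32.hi_32);
*	write_lapic_reg32(LAPIC_INT_COMMAND_REGISTER_0, icr.value_32.lo_32);
*/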
/* xAPIC/x2APIC Interrupt Command Register (ICR) structure */
union apic_lvt {
uint32_t value;
union {
struct {
uint32_t vector:8;
uint32_t rsvd_1:4;
uint32_t delivery_status:1;
uint32_t rsvd_2:3;
uint32_t mask:1;
uint32_t mode:2;
uint32_t rsvd_3:13;
} timer;
struct {
uint32_t vector:8;
uint32_t delivery_mode:3;
uint32_t rsvd_1:1;
uint32_t delivery_status:1;
uint32_t rsvd_2:3;
uint32_t mask:1;
uint32_t rsvd_3:15;
} cmci;
struct {
uint32_t vector:8;
uint32_t delivery_mode:3;
uint32_t rsvd_1:1;
uint32_t delivery_status:1;
uint32_t polarity:1;
uint32_t remote_irr:1;
uint32_t trigger_mode:1;
uint32_t mask:1;
uint32_t rsvd_2:15;
} lint;
struct {
uint32_t vector:8;
uint32_t rsvd_1:4;
uint32_t delivery_status:1;
uint32_t rsvd_2:3;
uint32_t mask:1;
uint32_t rsvd_3:15;
} error;
struct {
uint32_t vector:8;
uint32_t delivery_mode:3;
uint32_t rsvd_1:1;
uint32_t delivery_status:1;
uint32_t rsvd_2:3;
uint32_t mask:1;
uint32_t rsvd_3:15;
} pmc;
struct {
uint32_t vector:8;
uint32_t delivery_mode:3;
uint32_t rsvd_1:1;
uint32_t delivery_status:1;
uint32_t rsvd_2:3;
uint32_t mask:1;
uint32_t rsvd_3:15;
} thermal;
struct {
uint32_t vector:8;
uint32_t rsvd_1:4;
uint32_t delivery_status:1;
uint32_t rsvd_2:3;
uint32_t mask:1;
uint32_t rsvd_3:15;
} common;
} bits;
};
union lapic_base_msr {
uint64_t value;
struct {
uint64_t rsvd_1:8;
uint64_t bsp:1;
uint64_t rsvd_2:1;
uint64_t x2APIC_enable:1;
uint64_t xAPIC_enable:1;
uint64_t lapic_paddr:24;
uint64_t rsvd_3:28;
} fields;
};
struct lapic_info {
int init_status;
struct {
paddr_t paddr;
vaddr_t vaddr;
} xapic;
};
static struct lapic_info lapic_info;
static uint32_t read_lapic_reg32(uint32_t offset)
{
ASSERT((offset >= 0x020) && (offset <= 0x3FF), "");
return mmio_read_long(lapic_info.xapic.vaddr + offset);
}
static void write_lapic_reg32(uint32_t offset, uint32_t value)
{
ASSERT((offset >= 0x020) && (offset <= 0x3FF), "");
mmio_write_long(value, lapic_info.xapic.vaddr + offset);
}
static void clear_lapic_isr(void)
{
uint64_t isr_reg = LAPIC_IN_SERVICE_REGISTER_0;
/* This is an Intel-recommended procedure that ensures the processor
* does not hang due to "in-service" interrupts left over from the
* boot loader environment. This does occur in practice, so make sure
* all the in-service bits are cleared here.
*/
do {
if (read_lapic_reg32(isr_reg)) {
write_lapic_reg32(LAPIC_EOI_REGISTER, 0);
continue;
}
isr_reg += 0x10;
} while (isr_reg <= LAPIC_IN_SERVICE_REGISTER_7);
}
static void map_lapic(void)
{
/* At some point we may need to translate this paddr to a vaddr. 1:1
* mapping for now.
*/
lapic_info.xapic.vaddr = lapic_info.xapic.paddr;
}
int early_init_lapic(void)
{
union lapic_base_msr lapic_base_msr;
/* Get local APIC base address */
lapic_base_msr.value = msr_read(MSR_IA32_APIC_BASE);
/* Initialize globals only 1 time */
if (lapic_info.init_status == false) {
/* Get Local APIC physical address. */
lapic_info.xapic.paddr = LAPIC_BASE;
/* Map in the local xAPIC */
map_lapic();
lapic_info.init_status = true;
}
/* Check if xAPIC mode enabled */
if (lapic_base_msr.fields.xAPIC_enable == 0) {
/* Ensure in xAPIC mode */
lapic_base_msr.fields.xAPIC_enable = 1;
lapic_base_msr.fields.x2APIC_enable = 0;
msr_write(MSR_IA32_APIC_BASE, lapic_base_msr.value);
} else {
/* Check if x2apic is disabled */
ASSERT(lapic_base_msr.fields.x2APIC_enable == 0,
"Disable X2APIC in BIOS");
}
return 0;
}
int init_lapic(uint32_t cpu_id)
{
/* Set the Logical Destination Register */
write_lapic_reg32(LAPIC_LOGICAL_DESTINATION_REGISTER,
(1 << cpu_id) << 24);
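/* Worked example (flat logical mode, assumes cpu_id < 8): for cpu_id 2
* this programs LDR = (1 << 2) << 24 = 0x04000000, i.e. logical APIC ID
* bit 2 for this CPU.
*/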
/* Set the Destination Format Register */
write_lapic_reg32(LAPIC_DESTINATION_FORMAT_REGISTER, 0xf << 28);
/* Mask all LAPIC LVT entries before enabling the local APIC */
write_lapic_reg32(LAPIC_LVT_CMCI_REGISTER, LAPIC_LVT_MASK);
write_lapic_reg32(LAPIC_LVT_TIMER_REGISTER, LAPIC_LVT_MASK);
write_lapic_reg32(LAPIC_LVT_THERMAL_SENSOR_REGISTER, LAPIC_LVT_MASK);
write_lapic_reg32(LAPIC_LVT_PMC_REGISTER, LAPIC_LVT_MASK);
write_lapic_reg32(LAPIC_LVT_LINT0_REGISTER, LAPIC_LVT_MASK);
write_lapic_reg32(LAPIC_LVT_LINT1_REGISTER, LAPIC_LVT_MASK);
write_lapic_reg32(LAPIC_LVT_ERROR_REGISTER, LAPIC_LVT_MASK);
/* Enable Local APIC */
/* TODO: add spurious-interrupt handler */
write_lapic_reg32(LAPIC_SPURIOUS_VECTOR_REGISTER,
LAPIC_SVR_APIC_ENABLE_MASK | LAPIC_SVR_VECTOR);
/* Ensure there are no ISR bits set. */
clear_lapic_isr();
return 0;
}
int send_lapic_eoi(void)
{
write_lapic_reg32(LAPIC_EOI_REGISTER, 0);
return 0;
}
static void wait_for_delivery(void)
{
union apic_icr tmp;
do {
tmp.value_32.lo_32 =
read_lapic_reg32(LAPIC_INT_COMMAND_REGISTER_0);
} while (tmp.bits.delivery_status);
}
uint32_t get_cur_lapic_id(void)
{
uint32_t lapic_id;
lapic_id = read_lapic_reg32(LAPIC_ID_REGISTER);
lapic_id = (lapic_id >> 24);
return lapic_id;
}
int
send_startup_ipi(enum intr_cpu_startup_shorthand cpu_startup_shorthand,
uint32_t cpu_startup_dest, paddr_t cpu_startup_start_address)
{
union apic_icr icr;
uint8_t shorthand;
int status = 0;
uint32_t eax, ebx, ecx, edx;
uint32_t family;
if (cpu_startup_shorthand >= INTR_CPU_STARTUP_UNKNOWN)
status = -EINVAL;
ASSERT(status == 0, "Incorrect arguments");
icr.value = 0;
icr.bits.destination_mode = INTR_LAPIC_ICR_PHYSICAL;
if (cpu_startup_shorthand == INTR_CPU_STARTUP_USE_DEST) {
shorthand = INTR_LAPIC_ICR_USE_DEST_ARRAY;
icr.x_bits.dest_field = per_cpu(lapic_id, cpu_startup_dest);
} else { /* Use destination shorthand */
shorthand = INTR_LAPIC_ICR_ALL_EX_SELF;
icr.value_32.hi_32 = 0;
}
/*
* family calculation per SDM Vol. 2A:
* CPUID with input EAX=01h returns Model, Family and Stepping information
* in EAX; the base Family ID is bits 11:8, and the Extended Family ID
* (bits 27:20) is added only when the base family is 0xF.
*/
cpuid(CPUID_FEATURES, &eax, &ebx, &ecx, &edx);
family = (eax >> 8) & 0xf;
if (family == 0xF)
family += (eax >> 20) & 0xff;
/* Assert INIT IPI */
write_lapic_reg32(LAPIC_INT_COMMAND_REGISTER_1, icr.value_32.hi_32);
icr.bits.shorthand = shorthand;
icr.bits.delivery_mode = INTR_LAPIC_ICR_INIT;
icr.bits.level = INTR_LAPIC_ICR_ASSERT;
icr.bits.trigger_mode = INTR_LAPIC_ICR_LEVEL;
write_lapic_reg32(LAPIC_INT_COMMAND_REGISTER_0, icr.value_32.lo_32);
wait_for_delivery();
/* Give 10ms for INIT sequence to complete for old processors.
* Modern processors (family == 6) don't need to wait here.
*/
if (family != 6)
mdelay(10);
/* De-assert INIT IPI */
write_lapic_reg32(LAPIC_INT_COMMAND_REGISTER_1, icr.value_32.hi_32);
icr.bits.level = INTR_LAPIC_ICR_DEASSERT;
write_lapic_reg32(LAPIC_INT_COMMAND_REGISTER_0, icr.value_32.lo_32);
wait_for_delivery();
/* Send Start IPI with page number of secondary reset code */
write_lapic_reg32(LAPIC_INT_COMMAND_REGISTER_1, icr.value_32.hi_32);
icr.value_32.lo_32 = 0;
icr.bits.shorthand = shorthand;
icr.bits.delivery_mode = INTR_LAPIC_ICR_STARTUP;
icr.bits.vector = ((paddr_t) cpu_startup_start_address) >> 12;
write_lapic_reg32(LAPIC_INT_COMMAND_REGISTER_0, icr.value_32.lo_32);
wait_for_delivery();
if (family == 6) /* 10us is enough for Modern processors */
udelay(10);
else /* 200us for old processors */
udelay(200);
/* Send another start IPI as per the Intel Arch specification */
write_lapic_reg32(LAPIC_INT_COMMAND_REGISTER_1, icr.value_32.hi_32);
write_lapic_reg32(LAPIC_INT_COMMAND_REGISTER_0, icr.value_32.lo_32);
wait_for_delivery();
return status;
}
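/* Usage sketch (illustrative only): to wake the AP tracked as logical cpu 1,
* the BSP points the SIPI at a 4-KByte-aligned real-mode trampoline below
* 1 MByte; the address 0x8000 below is an assumed example value:
*
*	send_startup_ipi(INTR_CPU_STARTUP_USE_DEST, 1, (paddr_t)0x8000);
*/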
void send_single_ipi(uint32_t pcpu_id, uint32_t vector)
{
uint32_t dest_lapic_id, hi_32, lo_32;
/* Get the lapic ID of the destination processor. */
dest_lapic_id = per_cpu(lapic_id, pcpu_id);
/* Set the target processor. */
hi_32 = dest_lapic_id << 24;
/* Set the vector ID. */
lo_32 = vector;
/* Set the destination field to the target processor. */
write_lapic_reg32(LAPIC_INT_COMMAND_REGISTER_1, hi_32);
/* Write the vector ID to ICR. */
write_lapic_reg32(LAPIC_INT_COMMAND_REGISTER_0, lo_32);
wait_for_delivery();
}
int send_shorthand_ipi(uint8_t vector,
enum intr_lapic_icr_shorthand shorthand,
enum intr_lapic_icr_delivery_mode delivery_mode)
{
union apic_icr icr;
int status = 0;
if ((shorthand < INTR_LAPIC_ICR_SELF)
|| (shorthand > INTR_LAPIC_ICR_ALL_EX_SELF)
|| (delivery_mode > INTR_LAPIC_ICR_NMI))
status = -EINVAL;
ASSERT(status == 0, "Incorrect arguments");
icr.value = 0;
icr.bits.shorthand = shorthand;
icr.bits.delivery_mode = delivery_mode;
icr.bits.vector = vector;
write_lapic_reg32(LAPIC_INT_COMMAND_REGISTER_1, icr.value_32.hi_32);
write_lapic_reg32(LAPIC_INT_COMMAND_REGISTER_0, icr.value_32.lo_32);
wait_for_delivery();
return status;
}
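/* Usage sketch (illustrative only): broadcast an NMI to every CPU except
* the caller; the vector argument is ignored for NMI delivery, so 0 is
* passed:
*
*	send_shorthand_ipi(0, INTR_LAPIC_ICR_ALL_EX_SELF, INTR_LAPIC_ICR_NMI);
*/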


@@ -0,0 +1,57 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <hypervisor.h>
#include <hv_lib.h>
#include <acrn_common.h>
#include <hv_arch.h>
#include <hv_debug.h>
int interrupt_init(uint32_t cpu_id)
{
struct host_idt_descriptor *idtd = &HOST_IDTR;
int status;
set_idt(idtd);
status = init_lapic(cpu_id);
ASSERT(status == 0, "lapic init failed");
if (status != 0)
return -ENODEV;
status = init_default_irqs(cpu_id);
ASSERT(status == 0, "irqs init failed");
if (status != 0)
return -ENODEV;
CPU_IRQ_ENABLE();
return status;
}

292
hypervisor/arch/x86/io.c Normal file

@@ -0,0 +1,292 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <hypervisor.h>
#include <hv_lib.h>
#include <acrn_common.h>
#include <hv_arch.h>
#include <hv_debug.h>
#include <hypercall.h>
int dm_emulate_pio_post(struct vcpu *vcpu)
{
int cur = vcpu->vcpu_id;
int cur_context = vcpu->arch_vcpu.cur_context;
struct vhm_request_buffer *req_buf =
(void *)HPA2HVA(vcpu->vm->sw.req_buf);
uint32_t mask =
0xFFFFFFFFul >> (32 - 8 * vcpu->req.reqs.pio_request.size);
uint64_t *rax;
ASSERT(cur_context == 0, "PIO emulation only happens in the normal world");
rax = &vcpu->arch_vcpu.contexts[cur_context].guest_cpu_regs.regs.rax;
vcpu->req.reqs.pio_request.value =
req_buf->req_queue[cur].reqs.pio_request.value;
/* VHM emulation data has already been copied to req; mark the slot free now */
req_buf->req_queue[cur].valid = false;
if (req_buf->req_queue[cur].processed != REQ_STATE_SUCCESS)
return -1;
if (vcpu->req.reqs.pio_request.direction == REQUEST_READ)
*rax = ((*rax) & ~mask) |
(vcpu->req.reqs.pio_request.value & mask);
return 0;
}
static void dm_emulate_pio_pre(struct vcpu *vcpu, uint64_t exit_qual,
uint32_t sz, uint64_t req_value)
{
vcpu->req.type = REQ_PORTIO;
if (VM_EXIT_IO_INSTRUCTION_ACCESS_DIRECTION(exit_qual))
vcpu->req.reqs.pio_request.direction = REQUEST_READ;
else
vcpu->req.reqs.pio_request.direction = REQUEST_WRITE;
vcpu->req.reqs.pio_request.address =
VM_EXIT_IO_INSTRUCTION_PORT_NUMBER(exit_qual);
vcpu->req.reqs.pio_request.size = sz;
vcpu->req.reqs.pio_request.value = req_value;
}
int io_instr_handler(struct vcpu *vcpu)
{
uint32_t sz;
uint32_t mask;
uint32_t port;
int8_t direction;
struct vm_io_handler *handler;
uint64_t exit_qual;
struct vm *vm = vcpu->vm;
int cur_context_idx = vcpu->arch_vcpu.cur_context;
struct run_context *cur_context;
int status = -EINVAL;
ASSERT(cur_context_idx == 0,
"PIO emulation only happens in the normal world");
cur_context = &vcpu->arch_vcpu.contexts[cur_context_idx];
exit_qual = vcpu->arch_vcpu.exit_qualification;
sz = VM_EXIT_IO_INSTRUCTION_SIZE(exit_qual) + 1;
port = VM_EXIT_IO_INSTRUCTION_PORT_NUMBER(exit_qual);
direction = VM_EXIT_IO_INSTRUCTION_ACCESS_DIRECTION(exit_qual);
mask = 0xfffffffful >> (32 - 8 * sz);
memset(&vcpu->req, 0, sizeof(struct vhm_request));
TRACE_4I(TRC_VMEXIT_IO_INSTRUCTION, port, direction, sz,
cur_context_idx);
for (handler = vm->arch_vm.io_handler;
handler; handler = handler->next) {
if ((port >= handler->desc.addr + handler->desc.len) ||
(port + sz <= handler->desc.addr))
continue;
/* Dom0 does not require IO emulation */
if (is_vm0(vm))
status = 0;
if (direction == 0) {
if (handler->desc.io_write == NULL)
continue;
handler->desc.io_write(handler, vm, port, sz,
cur_context->guest_cpu_regs.regs.rax);
pr_dbg("IO write on port %04x, data %08x", port,
cur_context->guest_cpu_regs.regs.rax & mask);
status = 0;
break;
} else if (handler->desc.io_read) {
uint32_t data = handler->desc.io_read(handler, vm,
port, sz);
cur_context->guest_cpu_regs.regs.rax &= ~mask;
cur_context->guest_cpu_regs.regs.rax |= data & mask;
pr_dbg("IO read on port %04x, data %08x", port, data);
status = 0;
break;
}
}
/* Go for VHM */
if (status != 0) {
uint64_t *rax = &cur_context->guest_cpu_regs.regs.rax;
dm_emulate_pio_pre(vcpu, exit_qual, sz, *rax);
status = acrn_insert_request_wait(vcpu, &vcpu->req);
}
if (status != 0) {
pr_fatal("IO %s access to port 0x%04x, size=%u",
direction ? "read" : "write", port, sz);
}
/* Catch any problems */
ASSERT(status == 0, "Invalid IO access");
return status;
}
static void register_io_handler(struct vm *vm, struct vm_io_handler *hdlr)
{
if (vm->arch_vm.io_handler)
hdlr->next = vm->arch_vm.io_handler;
vm->arch_vm.io_handler = hdlr;
}
static void empty_io_handler_list(struct vm *vm)
{
struct vm_io_handler *handler = vm->arch_vm.io_handler;
struct vm_io_handler *tmp;
while (handler) {
tmp = handler;
handler = tmp->next;
free(tmp);
}
vm->arch_vm.io_handler = NULL;
}
void free_io_emulation_resource(struct vm *vm)
{
empty_io_handler_list(vm);
/* Free I/O emulation bitmaps */
free(vm->arch_vm.iobitmap[0]);
free(vm->arch_vm.iobitmap[1]);
}
static void deny_guest_io_access(struct vm *vm, uint32_t address, uint32_t nbytes)
{
uint32_t *b;
uint32_t i;
uint32_t a;
for (i = 0; i < nbytes; i++) {
b = vm->arch_vm.iobitmap[0];
if (address & 0x8000)
b = vm->arch_vm.iobitmap[1];
a = address & 0x7fff;
b[a >> 5] |= (1 << (a & 0x1f));
address++;
}
}
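/* Worked example: denying guest access to port 0x64 sets bit (0x64 & 0x1f)
* = 4 of word (0x64 >> 5) = 3 in I/O bitmap A; ports 0x8000-0xFFFF land in
* bitmap B instead.
*/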
static uint32_t
default_io_read(__unused struct vm_io_handler *hdlr, __unused struct vm *vm,
ioport_t address, size_t width)
{
uint32_t v = io_read(address, width);
return v;
}
static void default_io_write(__unused struct vm_io_handler *hdlr,
__unused struct vm *vm, ioport_t addr,
size_t width, uint32_t v)
{
io_write(v, addr, width);
}
static struct vm_io_handler *create_io_handler(uint32_t port, uint32_t len,
io_read_fn_t io_read_fn_ptr,
io_write_fn_t io_write_fn_ptr)
{
struct vm_io_handler *handler;
handler = calloc(1, sizeof(struct vm_io_handler));
if (handler != NULL) {
handler->desc.addr = port;
handler->desc.len = len;
handler->desc.io_read = io_read_fn_ptr;
handler->desc.io_write = io_write_fn_ptr;
} else {
pr_err("Error: out of memory");
}
return handler;
}
void setup_io_bitmap(struct vm *vm)
{
/* Allocate VM architecture state and IO bitmaps A and B */
vm->arch_vm.iobitmap[0] = alloc_page();
vm->arch_vm.iobitmap[1] = alloc_page();
ASSERT(vm->arch_vm.iobitmap[0] && vm->arch_vm.iobitmap[1], "");
if (is_vm0(vm)) {
memset(vm->arch_vm.iobitmap[0], 0x00, CPU_PAGE_SIZE);
memset(vm->arch_vm.iobitmap[1], 0x00, CPU_PAGE_SIZE);
} else {
/* block all IO port access from Guest */
memset(vm->arch_vm.iobitmap[0], 0xFF, CPU_PAGE_SIZE);
memset(vm->arch_vm.iobitmap[1], 0xFF, CPU_PAGE_SIZE);
}
}
void register_io_emulation_handler(struct vm *vm, struct vm_io_range *range,
io_read_fn_t io_read_fn_ptr,
io_write_fn_t io_write_fn_ptr)
{
struct vm_io_handler *handler = NULL;
io_read_fn_t io_read_fn = &default_io_read;
io_write_fn_t io_write_fn = &default_io_write;
if (range->flags == IO_ATTR_RW && io_read_fn_ptr && io_write_fn_ptr) {
io_read_fn = io_read_fn_ptr;
io_write_fn = io_write_fn_ptr;
} else if (range->flags == IO_ATTR_R) {
if (io_read_fn_ptr)
io_read_fn = io_read_fn_ptr;
io_write_fn = NULL;
}
if (is_vm0(vm))
deny_guest_io_access(vm, range->base, range->len);
handler = create_io_handler(range->base,
range->len, io_read_fn, io_write_fn);
register_io_handler(vm, handler);
}
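/* Usage sketch (illustrative only; the port range and callbacks below are
* hypothetical): trapping a 4-byte register window at port 0x510 for
* read/write emulation could look like:
*
*	struct vm_io_range range = {
*		.flags = IO_ATTR_RW, .base = 0x510, .len = 4 };
*	register_io_emulation_handler(vm, &range, my_port_read, my_port_write);
*/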


@@ -0,0 +1,439 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <hypervisor.h>
#include <hv_lib.h>
#include <acrn_common.h>
#include <hv_arch.h>
#include <hv_debug.h>
/* Register offsets */
#define IOAPIC_REGSEL_OFFSET 0
#define IOAPIC_WINSWL_OFFSET 0x10
/* IOAPIC Redirection Table (RTE) Entry structure */
struct ioapic_rte {
uint32_t lo_32;
uint32_t hi_32;
} ioapic_rte;
struct gsi_table {
uint8_t ioapic_id;
uint8_t pin;
uint64_t addr;
};
static struct gsi_table gsi_table[NR_MAX_GSI];
static int nr_gsi;
static spinlock_t ioapic_lock;
/*
* the irq-to-ioapic-pin mapping should be extracted from the ACPI MADT
* table; it is hardcoded here for now
*/
uint16_t legacy_irq_to_pin[NR_LEGACY_IRQ] = {
2, /* IRQ0*/
1, /* IRQ1*/
0, /* IRQ2 connected to Pin0 (ExtInt source of PIC) if existing */
3, /* IRQ3*/
4, /* IRQ4*/
5, /* IRQ5*/
6, /* IRQ6*/
7, /* IRQ7*/
8, /* IRQ8*/
9 | IOAPIC_RTE_TRGRLVL, /* IRQ9*/
10, /* IRQ10*/
11, /* IRQ11*/
12, /* IRQ12*/
13, /* IRQ13*/
14, /* IRQ14*/
15, /* IRQ15*/
};
static uint64_t map_ioapic(
uint64_t ioapic_paddr)
{
/* At some point we may need to translate this paddr to a vaddr.
* 1:1 mapping for now.
*/
return (vaddr_t) ioapic_paddr;
}
static inline uint32_t
ioapic_read_reg32(const uint64_t ioapic_base, const uint8_t offset)
{
uint32_t v;
spinlock_rflags;
spinlock_irqsave_obtain(&ioapic_lock);
/* Write IOREGSEL */
*(uint32_t *)(ioapic_base) = offset;
/* Read IOWIN */
v = *(uint32_t *)(ioapic_base + IOAPIC_WINSWL_OFFSET);
spinlock_irqrestore_release(&ioapic_lock);
return v;
}
static inline void
ioapic_write_reg32(const uint64_t ioapic_base,
const uint8_t offset, const uint32_t value)
{
spinlock_rflags;
spinlock_irqsave_obtain(&ioapic_lock);
/* Write IOREGSEL */
*(uint32_t *)(ioapic_base) = offset;
/* Write IOWIN */
*(uint32_t *)(ioapic_base + IOAPIC_WINSWL_OFFSET) = value;
spinlock_irqrestore_release(&ioapic_lock);
}
static inline uint64_t
get_ioapic_base(int apic_id)
{
uint64_t addr = -1UL;
/* the ioapic base addresses should be extracted from the ACPI MADT table */
if (apic_id == 0)
addr = DEFAULT_IO_APIC_BASE;
else if (apic_id == 1)
addr = 0xfec3f000;
else if (apic_id == 2)
addr = 0xfec7f000;
else
ASSERT(apic_id <= 2, "ACPI MADT table missing");
return addr;
}
static inline void
ioapic_get_rte_entry(uint64_t ioapic_addr,
int pin, struct ioapic_rte *rte)
{
rte->lo_32 = ioapic_read_reg32(ioapic_addr, pin*2 + 0x10);
rte->hi_32 = ioapic_read_reg32(ioapic_addr, pin*2 + 0x11);
}
static inline void
ioapic_set_rte_entry(uint64_t ioapic_addr,
int pin, struct ioapic_rte *rte)
{
ioapic_write_reg32(ioapic_addr, pin*2 + 0x10, rte->lo_32);
ioapic_write_reg32(ioapic_addr, pin*2 + 0x11, rte->hi_32);
}
static inline struct ioapic_rte
create_rte_for_legacy_irq(int irq, int vr)
{
struct ioapic_rte rte = {0, 0};
/* Legacy IRQs 0-15 are set up masked by default.
* Their routing is actually defined in either the MP Table or the ACPI
* MADT table; until the HV can parse ACPI tables, a common hardcoded
* mapping is used.
*/
rte.lo_32 |= IOAPIC_RTE_INTMSET;
rte.lo_32 |= (legacy_irq_to_pin[irq] & IOAPIC_RTE_TRGRLVL);
rte.lo_32 |= DEFAULT_DEST_MODE;
rte.lo_32 |= DEFAULT_DELIVERY_MODE;
rte.lo_32 |= (IOAPIC_RTE_INTVEC & vr);
/* FIXME: Fixed to active Low? */
rte.lo_32 |= IOAPIC_RTE_INTALO;
/* Dest field: legacy irq fixed to CPU0 */
rte.hi_32 |= 1 << 24;
return rte;
}
static inline struct ioapic_rte
create_rte_for_gsi_irq(int irq, int vr)
{
struct ioapic_rte rte = {0, 0};
if (irq < NR_LEGACY_IRQ)
return create_rte_for_legacy_irq(irq, vr);
/* irq masked by default, level-triggered */
rte.lo_32 |= IOAPIC_RTE_INTMSET;
rte.lo_32 |= IOAPIC_RTE_TRGRLVL;
rte.lo_32 |= DEFAULT_DEST_MODE;
rte.lo_32 |= DEFAULT_DELIVERY_MODE;
rte.lo_32 |= (IOAPIC_RTE_INTVEC & vr);
/* FIXME: Fixed to active Low? */
rte.lo_32 |= IOAPIC_RTE_INTALO;
/* Dest field */
rte.hi_32 |= ALL_CPUS_MASK << 24;
return rte;
}
static void ioapic_set_routing(int gsi, int vr)
{
uint64_t addr;
struct ioapic_rte rte;
addr = gsi_table[gsi].addr;
rte = create_rte_for_gsi_irq(gsi, vr);
ioapic_set_rte_entry(addr, gsi_table[gsi].pin, &rte);
if (rte.lo_32 & IOAPIC_RTE_TRGRMOD)
update_irq_handler(gsi, handle_level_interrupt_common);
else
update_irq_handler(gsi, common_handler_edge);
dev_dbg(ACRN_DBG_IRQ, "GSI: irq:%d pin:%d rte:%x",
gsi, gsi_table[gsi].pin,
rte.lo_32);
}
void ioapic_get_rte(int irq, uint64_t *rte)
{
uint64_t addr;
struct ioapic_rte _rte;
if (!irq_is_gsi(irq))
return;
addr = gsi_table[irq].addr;
ioapic_get_rte_entry(addr, gsi_table[irq].pin, &_rte);
*rte = _rte.hi_32;
*rte = *rte << 32 | _rte.lo_32;
}
void ioapic_set_rte(int irq, uint64_t raw_rte)
{
uint64_t addr;
struct ioapic_rte rte;
if (!irq_is_gsi(irq))
return;
addr = gsi_table[irq].addr;
rte.lo_32 = raw_rte;
rte.hi_32 = raw_rte >> 32;
ioapic_set_rte_entry(addr, gsi_table[irq].pin, &rte);
dev_dbg(ACRN_DBG_IRQ, "GSI: irq:%d pin:%d rte:%x",
irq, gsi_table[irq].pin,
rte.lo_32);
}
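/* Sketch (illustrative only) of masking a GSI through the raw RTE helpers
* above; irq_gsi_mask_unmask() below does the same via struct ioapic_rte:
*
*	uint64_t rte;
*	ioapic_get_rte(irq, &rte);
*	ioapic_set_rte(irq, rte | IOAPIC_RTE_INTMSET);
*/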
int irq_gsi_num(void)
{
return nr_gsi;
}
bool irq_is_gsi(int irq)
{
return irq < nr_gsi;
}
int irq_to_pin(int irq)
{
if (irq_is_gsi(irq))
return gsi_table[irq].pin;
else
return -1;
}
int pin_to_irq(int pin)
{
int i;
if (pin < 0)
return IRQ_INVALID;
for (i = 0; i < nr_gsi; i++) {
if (gsi_table[i].pin == (uint8_t) pin)
return i;
}
return IRQ_INVALID;
}
void
irq_gsi_mask_unmask(int irq, bool mask)
{
uint64_t addr = gsi_table[irq].addr;
int pin = gsi_table[irq].pin;
struct ioapic_rte rte;
if (!irq_is_gsi(irq))
return;
ioapic_get_rte_entry(addr, pin, &rte);
if (mask)
rte.lo_32 |= IOAPIC_RTE_INTMSET;
else
rte.lo_32 &= ~IOAPIC_RTE_INTMASK;
ioapic_set_rte_entry(addr, pin, &rte);
dev_dbg(ACRN_DBG_PTIRQ, "update: irq:%d pin:%d rte:%x",
irq, pin, rte.lo_32);
}
void setup_ioapic_irq(void)
{
int ioapic_id;
int gsi;
int vr;
spinlock_init(&ioapic_lock);
for (ioapic_id = 0, gsi = 0; ioapic_id < NR_IOAPICS; ioapic_id++) {
int pin;
int max_pins;
int version;
uint64_t addr;
addr = map_ioapic(get_ioapic_base(ioapic_id));
version = ioapic_read_reg32(addr, IOAPIC_VER);
max_pins = (version & IOAPIC_MAX_RTE_MASK) >> MAX_RTE_SHIFT;
dev_dbg(ACRN_DBG_IRQ, "IOAPIC version: %x", version);
ASSERT(max_pins > NR_LEGACY_IRQ,
"Legacy IRQ num > total GSI");
for (pin = 0; pin < max_pins; pin++) {
gsi_table[gsi].ioapic_id = ioapic_id;
gsi_table[gsi].addr = addr;
if (gsi < NR_LEGACY_IRQ)
gsi_table[gsi].pin =
legacy_irq_to_pin[gsi] & 0xff;
else
gsi_table[gsi].pin = pin;
/* pin the irq before using it */
if (irq_mark_used(gsi) < 0) {
pr_err("failed to alloc IRQ[%d]", gsi);
gsi++;
continue;
}
/* assign a vector for this GSI;
* for legacy irqs the vector is reserved and never freed
*/
if (gsi < NR_LEGACY_IRQ) {
vr = irq_desc_alloc_vector(gsi, false);
if (vr < 0) {
pr_err("failed to alloc VR");
gsi++;
continue;
}
} else
vr = 0; /* do not allocate a vector right now */
ioapic_set_routing(gsi, vr);
gsi++;
}
}
/* total number of GSIs in the system */
nr_gsi = gsi;
ASSERT(nr_gsi < NR_MAX_GSI, "GSI table overflow");
}
void dump_ioapic(void)
{
int irq;
for (irq = 0; irq < nr_gsi; irq++) {
uint64_t addr = gsi_table[irq].addr;
int pin = gsi_table[irq].pin;
struct ioapic_rte rte;
ioapic_get_rte_entry(addr, pin, &rte);
dev_dbg(ACRN_DBG_IRQ, "DUMP: irq:%d pin:%d rte:%x",
irq, pin, rte.lo_32);
}
}
void get_rte_info(struct ioapic_rte *rte, bool *mask, bool *irr,
bool *phys, int *delmode, bool *level, int *vector, uint32_t *dest)
{
*mask = ((rte->lo_32 & IOAPIC_RTE_INTMASK) == IOAPIC_RTE_INTMSET);
*irr = ((rte->lo_32 & IOAPIC_RTE_REM_IRR) == IOAPIC_RTE_REM_IRR);
*phys = ((rte->lo_32 & IOAPIC_RTE_DESTMOD) == IOAPIC_RTE_DESTPHY);
*delmode = rte->lo_32 & IOAPIC_RTE_DELMOD;
*level = rte->lo_32 & IOAPIC_RTE_TRGRLVL ? true : false;
*vector = rte->lo_32 & IOAPIC_RTE_INTVEC;
*dest = rte->hi_32 >> APIC_ID_SHIFT;
}
int get_ioapic_info(char *str, int str_max_len)
{
int irq, len, size = str_max_len;
len = snprintf(str, size,
"\r\nIRQ\tPIN\tRTE.HI32\tRTE.LO32\tVEC\tDST\tDM\tTM\tDELM\tIRR\tMASK");
size -= len;
str += len;
for (irq = 0; irq < nr_gsi; irq++) {
uint64_t addr = gsi_table[irq].addr;
int pin = gsi_table[irq].pin;
struct ioapic_rte rte;
bool irr, phys, level, mask;
int delmode, vector;
uint32_t dest;
ioapic_get_rte_entry(addr, pin, &rte);
get_rte_info(&rte, &mask, &irr, &phys, &delmode, &level,
&vector, &dest);
len = snprintf(str, size, "\r\n%03d\t%03d\t0x%08X\t0x%08X\t",
irq, pin, rte.hi_32, rte.lo_32);
size -= len;
str += len;
len = snprintf(str, size, "0x%02X\t0x%02X\t%s\t%s\t%d\t%d\t%d",
vector, dest, phys ? "phys" : "logic",
level ? "level" : "edge", delmode >> 8, irr, mask);
size -= len;
str += len;
if (size < 2) {
pr_err("\r\nsmall buffer for ioapic dump");
return -1;
}
}
snprintf(str, size, "\r\n");
return 0;
}

761
hypervisor/arch/x86/irq.c Normal file

@@ -0,0 +1,761 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <hypervisor.h>
#include <hv_lib.h>
#include <acrn_common.h>
#include <hv_arch.h>
#include <hv_debug.h>
static spinlock_t exception_spinlock = { .head = 0, .tail = 0, };
struct irq_request_info {
/* vector set to 0xE0 ~ 0xFF for pri_register_handler
* and set to -1 for normal_register_handler
*/
int vector;
dev_handler_t func;
void *dev_data;
bool share;
bool lowpri;
char *name;
};
/* any field change below requires irq_lock protection with irqsave */
struct irq_desc {
int irq; /* index to irq_desc_base */
enum irq_state used; /* whether this irq has been assigned to a device */
enum irq_desc_state state; /* irq_desc status */
int vector; /* assigned vector */
void *handler_data; /* irq_handler private data */
int (*irq_handler)(struct irq_desc *irq_desc, void *handler_data);
struct dev_handler_node *dev_list;
spinlock_t irq_lock;
uint64_t *irq_cnt; /* per-CPU count of occurrences of this irq */
uint64_t irq_lost_cnt;
};
static struct irq_desc *irq_desc_base;
static int vector_to_irq[NR_MAX_VECTOR + 1];
static DEFINE_CPU_DATA(uint64_t[NR_MAX_IRQS], irq_count);
static DEFINE_CPU_DATA(uint64_t, spurious);
spurious_handler_t spurious_handler;
static void init_irq_desc(void)
{
int i, page_num = 0;
int desc_size = NR_MAX_IRQS * sizeof(struct irq_desc);
page_num = (desc_size + CPU_PAGE_SIZE-1) >> CPU_PAGE_SHIFT;
irq_desc_base = alloc_pages(page_num);
ASSERT(irq_desc_base, "page alloc failed!");
memset(irq_desc_base, 0, page_num * CPU_PAGE_SIZE);
for (i = 0; i < NR_MAX_IRQS; i++) {
irq_desc_base[i].irq = i;
irq_desc_base[i].vector = VECTOR_INVALID;
spinlock_init(&irq_desc_base[i].irq_lock);
}
for (i = 0; i <= NR_MAX_VECTOR; i++)
vector_to_irq[i] = IRQ_INVALID;
}
/*
* alloc vector 0x20-0xDF for irq
* lowpri: 0x20-0x7F
* highpri: 0x80-0xDF
*/
static int find_available_vector(bool lowpri)
{
int i, start, end;
if (lowpri) {
start = VECTOR_FOR_NOR_LOWPRI_START;
end = VECTOR_FOR_NOR_LOWPRI_END;
} else {
start = VECTOR_FOR_NOR_HIGHPRI_START;
end = VECTOR_FOR_NOR_HIGHPRI_END;
}
/* TODO: vector lock required */
for (i = start; i < end; i++) {
if (vector_to_irq[i] == IRQ_INVALID)
return i;
}
return -1;
}
/*
* check and mark an irq as assigned
* return: -1 if the irq is invalid, otherwise the irq
*/
int irq_mark_used(int irq)
{
struct irq_desc *desc;
spinlock_rflags;
if (irq < 0)
return -1;
desc = irq_desc_base + irq;
spinlock_irqsave_obtain(&desc->irq_lock);
if (desc->used == IRQ_NOT_ASSIGNED)
desc->used = IRQ_ASSIGNED_NOSHARE;
spinlock_irqrestore_release(&desc->irq_lock);
return irq;
}
/*
* find an available irq and mark it assigned
* return: the irq on success, -1 if none is found
*/
static int alloc_irq(void)
{
int i;
struct irq_desc *desc;
spinlock_rflags;
for (i = irq_gsi_num(); i < NR_MAX_IRQS; i++) {
desc = irq_desc_base + i;
spinlock_irqsave_obtain(&desc->irq_lock);
if (desc->used == IRQ_NOT_ASSIGNED) {
desc->used = IRQ_ASSIGNED_NOSHARE;
spinlock_irqrestore_release(&desc->irq_lock);
break;
}
spinlock_irqrestore_release(&desc->irq_lock);
}
return (i == NR_MAX_IRQS) ? -1:i;
}
/* need irq_lock protection before use */
static void _irq_desc_set_vector(int irq, int vr)
{
struct irq_desc *desc;
desc = irq_desc_base + irq;
vector_to_irq[vr] = irq;
desc->vector = vr;
}
/* lock version of set vector */
static void irq_desc_set_vector(int irq, int vr)
{
struct irq_desc *desc;
spinlock_rflags;
desc = irq_desc_base + irq;
spinlock_irqsave_obtain(&desc->irq_lock);
vector_to_irq[vr] = irq;
desc->vector = vr;
spinlock_irqrestore_release(&desc->irq_lock);
}
/* used with holding irq_lock outside */
static void _irq_desc_free_vector(int irq)
{
struct irq_desc *desc;
int vr;
if (irq >= NR_MAX_IRQS || irq < 0)
return;
desc = irq_desc_base + irq;
vr = desc->vector;
desc->used = IRQ_NOT_ASSIGNED;
desc->state = IRQ_DESC_PENDING;
desc->vector = VECTOR_INVALID;
vr &= NR_MAX_VECTOR;
if (vector_to_irq[vr] == irq)
vector_to_irq[vr] = IRQ_INVALID;
}
static void disable_pic_irq(void)
{
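/* Write OCW1 = 0xff to the slave (port 0xA1) and master (port 0x21)
* 8259A data ports to mask all 16 legacy PIC IRQ lines.
*/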
io_write_byte(0xff, 0xA1);
io_write_byte(0xff, 0x21);
}
static bool
irq_desc_append_dev(struct irq_desc *desc, void *node, bool share)
{
struct dev_handler_node *dev_list;
bool added = true;
spinlock_rflags;
spinlock_irqsave_obtain(&desc->irq_lock);
dev_list = desc->dev_list;
/* assign if first node */
if (dev_list == NULL) {
desc->dev_list = node;
desc->used = (share)?IRQ_ASSIGNED_SHARED:IRQ_ASSIGNED_NOSHARE;
/* Only GSIs can be level-triggered, and their handler is already
* initialized during ioapic setup.
* The caller can later change it with update_irq_handler()
*/
if (!desc->irq_handler)
desc->irq_handler = common_handler_edge;
} else if (!share || desc->used == IRQ_ASSIGNED_NOSHARE) {
/* dev node added failed */
added = false;
} else {
/* advance dev_list to the last valid node */
while (dev_list->next)
dev_list = dev_list->next;
/* add node */
dev_list->next = node;
}
spinlock_irqrestore_release(&desc->irq_lock);
return added;
}
static struct dev_handler_node*
common_register_handler(int irq,
struct irq_request_info *info)
{
struct dev_handler_node *node = NULL;
struct irq_desc *desc;
bool added = false;
/* ======================================================
* This is the low-level ISR handler registration function.
* case: irq = -1
* the caller does not know which irq to use and wants the system to
* allocate an available irq for it. These irqs are in the range:
* nr_gsi ~ NR_MAX_IRQS
* an irq will be allocated and a vector will be assigned to it
* automatically.
*
* case: irq >= 0 and irq < nr_gsi
* the caller wants to add a device ISR handler onto an ioapic pin.
* Two kinds of devices apply: legacy devices and PCI devices with INTx.
* a vector will be assigned automatically.
*
* case: irq with a special type (not from IOAPIC/MSI)
* these irq values are pre-defined for Timer, IPI, Spurious etc.
* and their vectors are pre-defined as well
*
* return value: the pinned irq and the vector assigned to this irq.
* the caller can use this irq to enable/disable/mask/unmask the interrupt,
* and if this irq is for:
* GSI legacy: nothing to do, legacy irqs are already initialized
* GSI other: the PCI INTx must be programmed to match this irq pin
* MSI: the caller must program the vector into the PCI device
*
* =====================================================
*/
ASSERT(info != NULL, "Invalid param");
/* The HV selects an irq for the device if irq < 0;
* this vector/irq matches the ACPI DSDT or PCI INTx/MSI
*/
if (irq < 0)
irq = alloc_irq();
else
irq = irq_mark_used(irq);
if (irq < 0) {
pr_err("failed to assign IRQ");
goto OUT;
}
node = calloc(1, sizeof(struct dev_handler_node));
if (node == NULL) {
pr_err("failed to alloc node");
irq_desc_try_free_vector(irq);
goto OUT;
}
desc = irq_desc_base + irq;
added = irq_desc_append_dev(desc, node, info->share);
if (!added) {
free(node);
node = NULL;
pr_err("failed to add node to non-shared irq");
}
OUT:
if (added) {
/* it is safe to call irq_desc_alloc_vector multiple times*/
if (info->vector >= VECTOR_FOR_PRI_START &&
info->vector <= VECTOR_FOR_PRI_END)
irq_desc_set_vector(irq, info->vector);
else if (info->vector < 0)
irq_desc_alloc_vector(irq, info->lowpri);
else {
pr_err("the input vector is not correct");
free(node);
return NULL;
}
node->dev_handler = info->func;
node->dev_data = info->dev_data;
node->desc = desc;
/* it is safe to use strcpy_s here even under the spinlock
* since the HV takes no #PF right now
*/
strcpy_s(node->name, 32, info->name);
dev_dbg(ACRN_DBG_IRQ, "[%s] %s irq%d vr:0x%x",
__func__, node->name, irq, desc->vector);
}
return node;
}
/* it is safe to call irq_desc_alloc_vector multiple times*/
int irq_desc_alloc_vector(int irq, bool lowpri)
{
int vr = -1;
struct irq_desc *desc;
spinlock_rflags;
/* irq should be always available at this time */
if (irq >= NR_MAX_IRQS || irq < 0)
return false;
desc = irq_desc_base + irq;
spinlock_irqsave_obtain(&desc->irq_lock);
if (desc->vector != VECTOR_INVALID) {
/* already allocated a vector */
goto OUT;
}
/* FLAT mode: an irq is connected to the same vector on every cpu */
vr = find_available_vector(lowpri);
if (vr < 0) {
pr_err("no vector found for irq[%d]", irq);
goto OUT;
}
_irq_desc_set_vector(irq, vr);
OUT:
spinlock_irqrestore_release(&desc->irq_lock);
return vr;
}
void irq_desc_try_free_vector(int irq)
{
struct irq_desc *desc;
spinlock_rflags;
/* legacy irq's vector is reserved and should not be freed */
if (irq >= NR_MAX_IRQS || irq < NR_LEGACY_IRQ)
return;
desc = irq_desc_base + irq;
spinlock_irqsave_obtain(&desc->irq_lock);
if (desc->dev_list == NULL)
_irq_desc_free_vector(irq);
spinlock_irqrestore_release(&desc->irq_lock);
}
int irq_to_vector(int irq)
{
if (irq < NR_MAX_IRQS)
return irq_desc_base[irq].vector;
else
return VECTOR_INVALID;
}
int dev_to_irq(struct dev_handler_node *node)
{
return node->desc->irq;
}
int dev_to_vector(struct dev_handler_node *node)
{
return node->desc->vector;
}
int init_default_irqs(unsigned int cpu_id)
{
if (cpu_id > 0)
return 0;
init_irq_desc();
/* we use ioapic only, disable legacy PIC */
disable_pic_irq();
setup_ioapic_irq();
init_softirq();
return 0;
}
void dispatch_exception(struct intr_ctx *ctx)
{
unsigned int cpu_id = get_cpu_id();
/* Obtain lock to ensure exception dump doesn't get corrupted */
spinlock_obtain(&exception_spinlock);
dump_exception(ctx, cpu_id);
/* Release lock to let other CPUs handle exception */
spinlock_release(&exception_spinlock);
/* Halt the CPU */
cpu_halt(cpu_id);
}
int handle_spurious_interrupt(int vector)
{
send_lapic_eoi();
get_cpu_var(spurious)++;
pr_warn("Spurious vector: 0x%x.", vector);
if (spurious_handler)
return spurious_handler(vector);
else
return 0;
}
/* do_IRQ() */
int dispatch_interrupt(struct intr_ctx *ctx)
{
int vr = ctx->vector;
int irq = vector_to_irq[vr];
struct irq_desc *desc;
if (irq == IRQ_INVALID)
goto ERR;
desc = irq_desc_base + irq;
per_cpu(irq_count, get_cpu_id())[irq]++;
if (vr != desc->vector)
goto ERR;
if (desc->used == IRQ_NOT_ASSIGNED || !desc->irq_handler) {
/* mask irq if possible */
goto ERR;
}
desc->irq_handler(desc, desc->handler_data);
return 0;
ERR:
return handle_spurious_interrupt(vr);
}
int handle_level_interrupt_common(struct irq_desc *desc,
__unused void *handler_data)
{
struct dev_handler_node *dev = desc->dev_list;
spinlock_rflags;
/*
* if another core is already processing this irq, return without
* taking irq_lock and record it in the irq_lost count
*/
if (desc->state != IRQ_DESC_PENDING) {
send_lapic_eoi();
desc->irq_lost_cnt++;
return 0;
}
spinlock_irqsave_obtain(&desc->irq_lock);
desc->state = IRQ_DESC_IN_PROCESS;
/* mask the ioapic pin */
if (irq_is_gsi(desc->irq))
GSI_MASK_IRQ(desc->irq);
/* Send EOI to LAPIC/IOAPIC IRR */
send_lapic_eoi();
while (dev) {
if (dev->dev_handler)
dev->dev_handler(desc->irq, dev->dev_data);
dev = dev->next;
}
if (irq_is_gsi(desc->irq))
GSI_UNMASK_IRQ(desc->irq);
desc->state = IRQ_DESC_PENDING;
spinlock_irqrestore_release(&desc->irq_lock);
return 0;
}
int common_handler_edge(struct irq_desc *desc, __unused void *handler_data)
{
struct dev_handler_node *dev = desc->dev_list;
spinlock_rflags;
/*
* if another core is already processing this irq, return without
* taking irq_lock and record it in the irq_lost count
*/
if (desc->state != IRQ_DESC_PENDING) {
send_lapic_eoi();
desc->irq_lost_cnt++;
return 0;
}
spinlock_irqsave_obtain(&desc->irq_lock);
desc->state = IRQ_DESC_IN_PROCESS;
/* Send EOI to LAPIC/IOAPIC IRR */
send_lapic_eoi();
while (dev) {
if (dev->dev_handler)
dev->dev_handler(desc->irq, dev->dev_data);
dev = dev->next;
}
desc->state = IRQ_DESC_PENDING;
spinlock_irqrestore_release(&desc->irq_lock);
return 0;
}
int common_dev_handler_level(struct irq_desc *desc, __unused void *handler_data)
{
struct dev_handler_node *dev = desc->dev_list;
spinlock_rflags;
/*
* if another core is already processing this irq, return without
* taking irq_lock and record it in the irq_lost count
*/
if (desc->state != IRQ_DESC_PENDING) {
send_lapic_eoi();
desc->irq_lost_cnt++;
return 0;
}
spinlock_irqsave_obtain(&desc->irq_lock);
desc->state = IRQ_DESC_IN_PROCESS;
/* mask the ioapic pin */
if (irq_is_gsi(desc->irq))
GSI_MASK_IRQ(desc->irq);
/* Send EOI to LAPIC/IOAPIC IRR */
send_lapic_eoi();
while (dev) {
if (dev->dev_handler)
dev->dev_handler(desc->irq, dev->dev_data);
dev = dev->next;
}
desc->state = IRQ_DESC_PENDING;
spinlock_irqrestore_release(&desc->irq_lock);
/* do not unmask the irq until the guest EOIs the vector */
return 0;
}
/* no desc->irq_lock for quick handling local interrupt like lapic timer */
int quick_handler_nolock(struct irq_desc *desc, __unused void *handler_data)
{
struct dev_handler_node *dev = desc->dev_list;
/* Send EOI to LAPIC/IOAPIC IRR */
send_lapic_eoi();
while (dev) {
if (dev->dev_handler)
dev->dev_handler(desc->irq, dev->dev_data);
dev = dev->next;
}
return 0;
}
void update_irq_handler(int irq, irq_handler_t func)
{
struct irq_desc *desc;
spinlock_rflags;
if (irq >= NR_MAX_IRQS)
return;
desc = irq_desc_base + irq;
spinlock_irqsave_obtain(&desc->irq_lock);
desc->irq_handler = func;
spinlock_irqrestore_release(&desc->irq_lock);
}
void unregister_handler_common(struct dev_handler_node *node)
{
struct dev_handler_node *head;
struct irq_desc *desc;
spinlock_rflags;
if (node == NULL)
return;
dev_dbg(ACRN_DBG_IRQ, "[%s] %s irq%d vr:0x%x",
__func__, node->name,
dev_to_irq(node),
dev_to_vector(node));
desc = node->desc;
spinlock_irqsave_obtain(&desc->irq_lock);
head = desc->dev_list;
if (head == node) {
desc->dev_list = NULL;
goto UNLOCK_EXIT;
}
while (head->next) {
if (head->next == node)
break;
head = head->next;
}
head->next = node->next;
UNLOCK_EXIT:
spinlock_irqrestore_release(&desc->irq_lock);
irq_desc_try_free_vector(desc->irq);
free(node);
}
/*
* Allocate IRQ with Vector from 0x20 ~ 0xDF
*/
struct dev_handler_node*
normal_register_handler(int irq,
dev_handler_t func,
void *dev_data,
bool share,
bool lowpri,
const char *name)
{
struct irq_request_info info;
info.vector = -1;
info.lowpri = lowpri;
info.func = func;
info.dev_data = dev_data;
info.share = share;
info.name = (char *)name;
return common_register_handler(irq, &info);
}
/*
* Allocate an IRQ with a vector from 0xE0 ~ 0xFF.
* Allocates an IRQ and installs the ISR on that specific cpu.
* The same irq/isr can be installed on different CPUs by calling this
* function multiple times.
*/
struct dev_handler_node*
pri_register_handler(int irq,
int vector,
dev_handler_t func,
void *dev_data,
const char *name)
{
struct irq_request_info info;
if (vector < VECTOR_FOR_PRI_START || vector > VECTOR_FOR_PRI_END)
return NULL;
info.vector = vector;
info.lowpri = false;
info.func = func;
info.dev_data = dev_data;
info.share = true;
info.name = (char *)name;
return common_register_handler(irq, &info);
}
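/* Usage sketch (illustrative only; the handler and names are hypothetical):
* a driver that lets the HV pick any free irq/vector for a shared,
* low-priority interrupt registers like this and keeps the returned node
* for a later unregister_handler_common():
*
*	struct dev_handler_node *node;
*	node = normal_register_handler(-1, my_dev_isr, my_dev, true, true,
*			"my_dev");
*/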
int get_cpu_interrupt_info(char *str, int str_max)
{
int irq, vector, pcpu_id, len, size = str_max;
struct irq_desc *desc;
len = snprintf(str, size, "\r\nIRQ\tVECTOR");
size -= len;
str += len;
for (pcpu_id = 0; pcpu_id < phy_cpu_num; pcpu_id++) {
len = snprintf(str, size, "\tCPU%d", pcpu_id);
size -= len;
str += len;
}
len = snprintf(str, size, "\tLOST\tSHARE");
size -= len;
str += len;
for (irq = 0; irq < NR_MAX_IRQS; irq++) {
desc = irq_desc_base + irq;
vector = irq_to_vector(irq);
if (desc->used != IRQ_NOT_ASSIGNED &&
vector != VECTOR_INVALID) {
len = snprintf(str, size, "\r\n%d\t0x%X", irq, vector);
size -= len;
str += len;
for (pcpu_id = 0; pcpu_id < phy_cpu_num; pcpu_id++) {
len = snprintf(str, size, "\t%d",
per_cpu(irq_count, pcpu_id)[irq]);
size -= len;
str += len;
}
len = snprintf(str, size, "\t%d\t%s",
desc->irq_lost_cnt,
desc->used == IRQ_ASSIGNED_SHARED ?
"shared" : "no-shared");
size -= len;
str += len;
}
}
snprintf(str, size, "\r\n");
return 0;
}

932
hypervisor/arch/x86/mmu.c Normal file

@@ -0,0 +1,932 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* Copyright (c) 2017 Intel Corporation
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <hypervisor.h>
#include <hv_lib.h>
#include <acrn_common.h>
#include <hv_arch.h>
#include <bsp_extern.h>
#include <hv_debug.h>
static void *mmu_pml4_addr;
enum mem_map_request_type {
PAGING_REQUEST_TYPE_MAP = 0, /* Creates a new mapping. */
PAGING_REQUEST_TYPE_UNMAP = 1, /* Removes a pre-existing entry */
PAGING_REQUEST_TYPE_MODIFY = 2,
/* Modifies a pre-existing entries attributes. */
PAGING_REQUEST_TYPE_UNKNOWN,
};
struct mm_capability {
/* EPT and MMU 1-GByte page supported flag */
bool ept_1gb_page_supported;
bool invept_supported;
bool invept_single_context_supported;
bool invept_global_context_supported;
bool invvpid_supported;
bool invvpid_single_context_supported;
bool invvpid_global_context_supported;
bool mmu_1gb_page_supported;
};
static struct mm_capability mm_caps;
#define INVEPT_TYPE_SINGLE_CONTEXT 1UL
#define INVEPT_TYPE_ALL_CONTEXTS 2UL
#define INVEPT_SET_ERROR_CODE \
" jnc 1f\n" \
" mov $1, %0\n" /* CF: error = 1 */ \
" jmp 3f\n" \
"1: jnz 2f\n" \
" mov $2, %0\n" /* ZF: error = 2 */ \
" jmp 3f\n" \
"2: mov $0, %0\n" \
"3:"
struct invept_desc {
uint64_t eptp;
uint64_t _res;
};
static inline void _invept(uint64_t type, struct invept_desc desc)
{
int error = 0;
asm volatile ("invept %1, %2\n"
INVEPT_SET_ERROR_CODE
: "=r" (error)
: "m" (desc), "r" (type)
: "memory");
ASSERT(error == 0, "invept error");
}
static void check_mmu_capability(void)
{
uint64_t val;
uint32_t eax, ebx, ecx, edx;
memset(&mm_caps, 0, sizeof(struct mm_capability));
/* Read the MSR register of EPT and VPID Capability - SDM A.10 */
val = msr_read(MSR_IA32_VMX_EPT_VPID_CAP);
mm_caps.ept_1gb_page_supported = (val & MSR_VMX_EPT_VPID_CAP_1GB)
? (true) : (false);
mm_caps.invept_supported =
(val & MSR_VMX_INVEPT) ? (true) : (false);
mm_caps.invept_single_context_supported =
(val & MSR_VMX_INVEPT_SINGLE_CONTEXT) ? (true) : (false);
mm_caps.invept_global_context_supported =
(val & MSR_VMX_INVEPT_GLOBAL_CONTEXT) ? (true) : (false);
mm_caps.invvpid_supported =
(val & MSR_VMX_INVVPID) ? (true) : (false);
mm_caps.invvpid_single_context_supported =
(val & MSR_VMX_INVVPID_SINGLE_CONTEXT) ? (true) : (false);
mm_caps.invvpid_global_context_supported =
(val & MSR_VMX_INVVPID_GLOBAL_CONTEXT) ? (true) : (false);
/* Read CPUID to check if PAGE1GB is supported
* SDM 4.1.4 If CPUID.80000001H:EDX.Page1GB[bit26]=1,
* 1-GByte pages are supported with 4-level paging
*/
cpuid(CPUID_EXTEND_FUNCTION_1, &eax, &ebx, &ecx, &edx);
mm_caps.mmu_1gb_page_supported = (edx & CPUID_EDX_PAGE1GB) ?
(true) : (false);
}
static inline bool check_invept_single_support(void)
{
return mm_caps.invept_supported &&
mm_caps.invept_single_context_supported;
}
static inline bool check_invept_global_support(void)
{
return mm_caps.invept_supported &&
mm_caps.invept_global_context_supported;
}
void mmu_invept(struct vcpu *vcpu)
{
struct invept_desc desc = {0};
if (check_invept_single_support()) {
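/* EPTP layout note: bits 2:0 hold the EPT memory type (6 = write-back)
* and bits 5:3 hold the page-walk length minus one (3 = 4-level walk).
*/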
desc.eptp = (uint64_t) vcpu->vm->arch_vm.ept | (3 << 3) | 6;
_invept(INVEPT_TYPE_SINGLE_CONTEXT, desc);
} else if (check_invept_global_support())
_invept(INVEPT_TYPE_ALL_CONTEXTS, desc);
}
static bool check_mmu_1gb_support(struct map_params *map_params)
{
bool status = false;
if (map_params->page_table_type == PT_EPT)
status = mm_caps.ept_1gb_page_supported;
else
status = mm_caps.mmu_1gb_page_supported;
return status;
}
static uint32_t map_mem_region(void *vaddr, void *paddr,
void *table_base, uint64_t attr, uint32_t table_level,
int ept_entry, enum mem_map_request_type request_type)
{
uint64_t table_entry;
uint64_t table_present;
uint32_t table_offset;
uint32_t mapped_size;
if (table_base == NULL || table_level >= IA32E_UNKNOWN
|| request_type >= PAGING_REQUEST_TYPE_UNKNOWN) {
/* Shouldn't go here */
ASSERT(false, "Incorrect Arguments. Failed to map region");
}
/* switch based on the level of the table */
switch (table_level) {
case IA32E_PDPT:
/* Get offset to the entry in the PDPT for this address */
table_offset = IA32E_PDPTE_INDEX_CALC(vaddr);
/* PS bit must be set for these entries to be mapped */
attr |= IA32E_PDPTE_PS_BIT;
/* Set mapped size to 1 GB */
mapped_size = MEM_1G;
break;
case IA32E_PD:
/* Get offset to the entry in the PD for this address */
table_offset = IA32E_PDE_INDEX_CALC(vaddr);
/* PS bit must be set for these entries to be mapped */
attr |= IA32E_PDE_PS_BIT;
/* Set mapped size to 2 MB */
mapped_size = MEM_2M;
break;
case IA32E_PT:
/* Get offset to the entry in the PT for this address */
table_offset = IA32E_PTE_INDEX_CALC(vaddr);
/* NOTE: No PS bit in page table entries */
/* Set mapped size to 4 KB */
mapped_size = MEM_4K;
/* If not an EPT entry, see if the PAT bit is set in the PDPT entry
*/
if ((!ept_entry) && (attr & IA32E_PDPTE_PAT_BIT)) {
/* The PAT bit is set; Clear it and set the page table
* PAT bit instead
*/
attr &= (uint64_t) (~((uint64_t) IA32E_PDPTE_PAT_BIT));
attr |= IA32E_PTE_PAT_BIT;
}
break;
case IA32E_PML4:
default:
/* Set mapping size to 0 - can't map memory in PML4 */
mapped_size = 0;
break;
}
/* Check to see if mapping should occur */
if (mapped_size != 0) {
/* Get current table entry */
uint64_t tmp = MEM_READ64(table_base + table_offset);
/* Check if EPT entry */
if (ept_entry) {
/* Use read/write/execute bits to determine presence of
* entry
*/
table_present = (IA32E_EPT_R_BIT |
IA32E_EPT_W_BIT | IA32E_EPT_X_BIT);
} else {
/* Use the P bit to determine if an entry is present */
table_present = IA32E_COMM_P_BIT;
}
switch (request_type) {
case PAGING_REQUEST_TYPE_MAP:
{
/* No need to confirm that the current table entry
* isn't already present;
* map-->remap is supported
*/
table_entry = (ept_entry
? attr
: (attr | IA32E_COMM_P_BIT));
table_entry |= (uint64_t)paddr;
/* Write the table entry to map this memory */
MEM_WRITE64(table_base + table_offset, table_entry);
break;
}
case PAGING_REQUEST_TYPE_UNMAP:
{
if (tmp & table_present) {
/* The entry is present;
* clear it to unmap this memory
*/
MEM_WRITE64(table_base + table_offset, 0);
}
break;
}
case PAGING_REQUEST_TYPE_MODIFY:
{
/* Allow mapping or modification as requested. */
table_entry = (ept_entry
? attr : (attr | IA32E_COMM_P_BIT));
table_entry |= (uint64_t) paddr;
/* Write the table entry to map this memory */
MEM_WRITE64(table_base + table_offset, table_entry);
break;
}
default:
ASSERT("Bad memory map request type" == 0, "");
break;
}
}
/* Return mapped size to caller */
return mapped_size;
}
static uint32_t fetch_page_table_offset(void *addr, uint32_t table_level)
{
uint32_t table_offset;
/* Switch based on level of table */
switch (table_level) {
case IA32E_PML4:
/* Get offset to the entry in the PML4
* for this address
*/
table_offset = IA32E_PML4E_INDEX_CALC(addr);
break;
case IA32E_PDPT:
/* Get offset to the entry in the PDPT
* for this address
*/
table_offset = IA32E_PDPTE_INDEX_CALC(addr);
break;
case IA32E_PD:
/* Get offset to the entry in the PD
* for this address
*/
table_offset = IA32E_PDE_INDEX_CALC(addr);
break;
case IA32E_PT:
table_offset = IA32E_PTE_INDEX_CALC(addr);
break;
default:
pr_err("Wrong page table level = 0x%lx", table_level);
ASSERT(false, "Wrong page table level");
break;
}
return table_offset;
}
static inline uint32_t check_page_table_present(struct map_params *map_params,
uint64_t table_entry)
{
if (map_params->page_table_type == PT_EPT) {
table_entry &= (IA32E_EPT_R_BIT | IA32E_EPT_W_BIT |
IA32E_EPT_X_BIT);
} else {
table_entry &= (IA32E_COMM_P_BIT);
}
return (table_entry) ? PT_PRESENT : PT_NOT_PRESENT;
}
static uint64_t get_table_entry(struct map_params *map_params, void *addr,
void *table_base, uint32_t table_level)
{
uint32_t table_offset;
uint64_t table_entry;
int status = 0;
if (table_base == NULL
|| table_level >= IA32E_UNKNOWN
|| map_params == NULL) {
status = -EINVAL;
}
ASSERT(status == 0, "Incorrect Arguments");
table_offset = fetch_page_table_offset(addr, table_level);
/* Read the table entry */
table_entry = MEM_READ64(table_base + table_offset);
/* Return the next table in the walk */
return table_entry;
}
static void *walk_paging_struct(void *addr, void *table_base,
uint32_t table_level, struct map_params *map_params)
{
uint32_t table_offset;
uint64_t table_entry;
uint64_t table_present;
/* if table_level == IA32E_PT, just return the same address;
 * can't walk down any further
 */
void *sub_table_addr = ((table_level == IA32E_PT) ? table_base:NULL);
int status = 0;
if (table_base == NULL || table_level >= IA32E_UNKNOWN
|| map_params == NULL) {
status = -EINVAL;
}
ASSERT(status == 0, "Incorrect Arguments");
table_offset = fetch_page_table_offset(addr, table_level);
/* Only walk down if not already at the lowest (PT) level */
if (sub_table_addr != table_base) {
/* Read the table entry */
table_entry = MEM_READ64(table_base + table_offset);
/* Check if EPT entry being created */
if (map_params->page_table_type == PT_EPT) {
/* Set table present bits to any of the
* read/write/execute bits
*/
table_present = (IA32E_EPT_R_BIT | IA32E_EPT_W_BIT |
IA32E_EPT_X_BIT);
} else {
/* Set table present bits to P bit or R/W bit */
table_present = (IA32E_COMM_P_BIT | IA32E_COMM_RW_BIT);
}
/* Determine if a valid entry exists */
if ((table_entry & table_present) == 0) {
/* No entry present - need to allocate a new table */
sub_table_addr =
alloc_paging_struct();
/* Check to ensure memory available for this structure*/
if (sub_table_addr == 0) {
/* Error: Unable to find table memory necessary
* to map memory
*/
ASSERT(sub_table_addr != NULL,
"Failed to find table memory "
"for mapping memory");
return sub_table_addr;
}
/* Write entry to current table to reference the new
* sub-table
*/
MEM_WRITE64(table_base + table_offset,
(uint64_t) sub_table_addr | table_present);
} else {
/* Get address of the sub-table */
sub_table_addr = (void *)(table_entry & IA32E_REF_MASK);
}
}
/* Return the next table in the walk */
return sub_table_addr;
}
void *get_paging_pml4(void)
{
/* Return address to caller */
return mmu_pml4_addr;
}
void enable_paging(void *pml4_base_addr)
{
CPU_CR_WRITE(cr3, (unsigned long)pml4_base_addr);
}
void init_paging(void)
{
struct map_params map_params;
struct e820_entry *entry;
uint32_t i;
int attr_wb = (MMU_MEM_ATTR_READ |
MMU_MEM_ATTR_WRITE |
MMU_MEM_ATTR_EXECUTE |
MMU_MEM_ATTR_WB_CACHE);
int attr_uc = (MMU_MEM_ATTR_READ |
MMU_MEM_ATTR_WRITE |
MMU_MEM_ATTR_EXECUTE |
MMU_MEM_ATTR_UNCACHED);
pr_dbg("HV MMU Initialization");
check_mmu_capability();
/* Allocate memory for Hypervisor PML4 table */
mmu_pml4_addr = alloc_paging_struct();
init_e820();
obtain_e820_mem_info();
/* Loop through all memory regions in the e820 table */
map_params.page_table_type = PT_HOST;
map_params.pml4_base = mmu_pml4_addr;
/* Map all memory regions to UC attribute */
map_mem(&map_params, (void *)e820_mem.mem_bottom,
(void *)e820_mem.mem_bottom,
(e820_mem.mem_top - e820_mem.mem_bottom),
attr_uc);
/* Modify WB attribute for E820_TYPE_RAM */
for (i = 0, entry = &e820[0];
i < e820_entries;
i++, entry = &e820[i]) {
if (entry->type == E820_TYPE_RAM) {
modify_mem(&map_params, (void *)entry->baseaddr,
(void *)entry->baseaddr,
entry->length, attr_wb);
}
}
pr_dbg("Enabling MMU ");
/* Enable paging */
enable_paging(mmu_pml4_addr);
}
void *alloc_paging_struct(void)
{
void *ptr = NULL;
/* Allocate a page from Hypervisor heap */
ptr = alloc_page();
ASSERT(ptr, "page alloc failed!");
memset(ptr, 0, CPU_PAGE_SIZE);
return ptr;
}
uint64_t config_page_table_attr(struct map_params *map_params, uint32_t flags)
{
int ept_entry = map_params->page_table_type;
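/* page_table_type is used as a boolean below: nonzero (PT_EPT)
 * selects EPT-format attribute bits, zero selects MMU-format bits
 */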
uint64_t attr = 0;
/* Convert generic memory flags to architecture specific attributes */
/* Check if read access */
if (flags & MMU_MEM_ATTR_READ) {
/* Configure for read access */
attr |=
(ept_entry ? IA32E_EPT_R_BIT : MMU_MEM_ATTR_BIT_READ_WRITE);
}
/* Check for write access */
if (flags & MMU_MEM_ATTR_WRITE) {
/* Configure for write access */
attr |=
(ept_entry ? IA32E_EPT_W_BIT : MMU_MEM_ATTR_BIT_READ_WRITE);
}
/* Check for execute access */
if (flags & MMU_MEM_ATTR_EXECUTE) {
/* Configure for execute (EPT only) */
attr |= (ept_entry ? IA32E_EPT_X_BIT : 0);
}
/* EPT & VT-d share the same page tables; set the SNOOP
 * control bit to force snooping of PCIe devices if the page
 * is cacheable
 */
if ((flags & MMU_MEM_ATTR_UNCACHED) != MMU_MEM_ATTR_UNCACHED
&& ept_entry == PT_EPT) {
attr |= IA32E_EPT_SNOOP_CTRL;
}
/* Check for cache / memory types */
if (flags & MMU_MEM_ATTR_WB_CACHE) {
/* Configure for write back cache */
attr |=
(ept_entry ? IA32E_EPT_WB : MMU_MEM_ATTR_TYPE_CACHED_WB);
} else if (flags & MMU_MEM_ATTR_WT_CACHE) {
/* Configure for write through cache */
attr |=
(ept_entry ? IA32E_EPT_WT : MMU_MEM_ATTR_TYPE_CACHED_WT);
} else if (flags & MMU_MEM_ATTR_UNCACHED) {
/* Configure for uncached */
attr |=
(ept_entry ? IA32E_EPT_UNCACHED : MMU_MEM_ATTR_TYPE_UNCACHED);
} else if (flags & MMU_MEM_ATTR_WC) {
/* Configure for write combining */
attr |=
(ept_entry ? IA32E_EPT_WC : MMU_MEM_ATTR_TYPE_WRITE_COMBINED);
} else {
/* Configure for write protected */
attr |=
(ept_entry ? IA32E_EPT_WP : MMU_MEM_ATTR_TYPE_WRITE_PROTECTED);
}
return attr;
}
void obtain_last_page_table_entry(struct map_params *map_params,
struct entry_params *entry, void *addr, bool direct)
{
uint64_t table_entry;
uint32_t table_present = 0;
/* Obtain the PML4 address */
void *table_addr = direct ? (map_params->pml4_base)
: (map_params->pml4_inverted);
/* Obtain page table entry from PML4 table*/
table_entry = get_table_entry(map_params, addr,
table_addr, IA32E_PML4);
table_present = check_page_table_present(map_params, table_entry);
if (table_present == PT_NOT_PRESENT) {
/* PML4E not present, return PML4 base address */
entry->entry_level = IA32E_PML4;
entry->entry_base = (uint64_t)table_addr;
entry->entry_present = PT_NOT_PRESENT;
entry->page_size = check_mmu_1gb_support(map_params) ?
(PAGE_SIZE_1G) : (PAGE_SIZE_2M);
entry->entry_off = fetch_page_table_offset(addr, IA32E_PML4);
entry->entry_val = table_entry;
return;
}
/* Obtain page table entry from PDPT table*/
table_addr = (void *)(table_entry & IA32E_REF_MASK);
table_entry = get_table_entry(map_params, addr,
table_addr, IA32E_PDPT);
table_present = check_page_table_present(map_params, table_entry);
if (table_present == PT_NOT_PRESENT) {
/* PDPTE not present, return PDPT base address */
entry->entry_level = IA32E_PDPT;
entry->entry_base = (uint64_t)table_addr;
entry->entry_present = PT_NOT_PRESENT;
entry->page_size = check_mmu_1gb_support(map_params) ?
(PAGE_SIZE_1G) : (PAGE_SIZE_2M);
entry->entry_off = fetch_page_table_offset(addr, IA32E_PDPT);
entry->entry_val = table_entry;
return;
}
if (table_entry & IA32E_PDPTE_PS_BIT) {
/* 1GB page size, return the base addr of the pg entry*/
entry->entry_level = IA32E_PDPT;
entry->entry_base = (uint64_t)table_addr;
entry->page_size = check_mmu_1gb_support(map_params) ?
(PAGE_SIZE_1G) : (PAGE_SIZE_2M);
entry->entry_present = PT_PRESENT;
entry->entry_off = fetch_page_table_offset(addr, IA32E_PDPT);
entry->entry_val = table_entry;
return;
}
/* Obtain page table entry from PD table*/
table_addr = (void *)(table_entry&IA32E_REF_MASK);
table_entry = get_table_entry(map_params, addr,
table_addr, IA32E_PD);
table_present = check_page_table_present(map_params, table_entry);
if (table_present == PT_NOT_PRESENT) {
/* PDE not present, return PDE base address */
entry->entry_level = IA32E_PD;
entry->entry_base = (uint64_t)table_addr;
entry->entry_present = PT_NOT_PRESENT;
entry->page_size = PAGE_SIZE_2M;
entry->entry_off = fetch_page_table_offset(addr, IA32E_PD);
entry->entry_val = table_entry;
return;
}
if (table_entry & IA32E_PDE_PS_BIT) {
/* 2MB page size, return the base addr of the pg entry*/
entry->entry_level = IA32E_PD;
entry->entry_base = (uint64_t)table_addr;
entry->entry_present = PT_PRESENT;
entry->page_size = PAGE_SIZE_2M;
entry->entry_off = fetch_page_table_offset(addr, IA32E_PD);
entry->entry_val = table_entry;
return;
}
/* Obtain page table entry from PT table*/
table_addr = (void *)(table_entry&IA32E_REF_MASK);
table_entry = get_table_entry(map_params, addr,
table_addr, IA32E_PT);
table_present = check_page_table_present(map_params, table_entry);
entry->entry_present = ((table_present == PT_PRESENT)
? (PT_PRESENT):(PT_NOT_PRESENT));
entry->entry_level = IA32E_PT;
entry->entry_base = (uint64_t)table_addr;
entry->page_size = PAGE_SIZE_4K;
entry->entry_off = fetch_page_table_offset(addr, IA32E_PT);
entry->entry_val = table_entry;
}
static uint64_t update_page_table_entry(struct map_params *map_params,
void *paddr, void *vaddr, uint64_t size, uint64_t attr,
enum mem_map_request_type request_type, bool direct)
{
uint64_t remaining_size = size;
uint32_t adjustment_size;
int ept_entry = map_params->page_table_type;
/* Obtain the PML4 address */
void *table_addr = direct ? (map_params->pml4_base)
: (map_params->pml4_inverted);
/* Walk from the PML4 table to the PDPT table */
table_addr = walk_paging_struct(vaddr, table_addr, IA32E_PML4,
map_params);
if ((remaining_size >= MEM_1G)
&& (MEM_ALIGNED_CHECK(vaddr, MEM_1G))
&& (MEM_ALIGNED_CHECK(paddr, MEM_1G))
&& check_mmu_1gb_support(map_params)) {
/* Map this 1 GByte memory region */
adjustment_size = map_mem_region(vaddr, paddr,
table_addr, attr, IA32E_PDPT,
ept_entry, request_type);
} else if ((remaining_size >= MEM_2M)
&& (MEM_ALIGNED_CHECK(vaddr, MEM_2M))
&& (MEM_ALIGNED_CHECK(paddr, MEM_2M))) {
/* Walk from the PDPT table to the PD table */
table_addr = walk_paging_struct(vaddr, table_addr,
IA32E_PDPT, map_params);
/* Map this 2 MByte memory region */
adjustment_size = map_mem_region(vaddr, paddr,
table_addr, attr, IA32E_PD, ept_entry,
request_type);
} else {
/* Walk from the PDPT table to the PD table */
table_addr = walk_paging_struct(vaddr,
table_addr, IA32E_PDPT, map_params);
/* Walk from the PD table to the page table */
table_addr = walk_paging_struct(vaddr,
table_addr, IA32E_PD, map_params);
/* Map this 4 KByte memory region */
adjustment_size = map_mem_region(vaddr, paddr,
table_addr, attr, IA32E_PT,
ept_entry, request_type);
}
return adjustment_size;
}
static uint64_t break_page_table(struct map_params *map_params, void *paddr,
void *vaddr, uint64_t page_size, bool direct)
{
uint32_t i = 0;
uint64_t pa;
uint64_t attr = 0x00;
uint64_t next_page_size = 0x00;
void *sub_tab_addr = NULL;
struct entry_params entry;
switch (page_size) {
/* Breaking 1GB page to 2MB page*/
case PAGE_SIZE_1G:
next_page_size = PAGE_SIZE_2M;
attr |= IA32E_PDE_PS_BIT;
pr_info("%s, Breaking 1GB -->2MB vaddr=0x%llx",
__func__, vaddr);
break;
/* Breaking 2MB page to 4KB page*/
case PAGE_SIZE_2M:
next_page_size = PAGE_SIZE_4K;
pr_info("%s, Breaking 2MB -->4KB vaddr=0x%llx",
__func__, vaddr);
break;
/* 4KB page, No action*/
case PAGE_SIZE_4K:
default:
next_page_size = PAGE_SIZE_4K;
pr_info("%s, Breaking 4KB no action vaddr=0x%llx",
__func__, vaddr);
break;
}
if (page_size != next_page_size) {
obtain_last_page_table_entry(map_params, &entry, vaddr, direct);
/* New entry present - need to allocate a new table */
sub_tab_addr = alloc_paging_struct();
/* Check to ensure memory available for this structure */
if (sub_tab_addr == 0) {
/* Error:
* Unable to find table memory necessary to map memory
*/
pr_err("Fail to find table memory for map memory");
ASSERT(sub_tab_addr != NULL, "");
return 0;
}
/* the physical address may not be aligned to the current
 * page size, so obtain the starting physical address aligned
 * down to the current page size
 */
pa = ((((uint64_t)paddr) / page_size) * page_size);
if (map_params->page_table_type == PT_EPT) {
/* Keep original attribute(here &0x3f)
* bit 0(R) bit1(W) bit2(X) bit3~5 MT
*/
attr |= (entry.entry_val & 0x3f);
} else {
/* Keep original attribute(here &0x7f) */
attr |= (entry.entry_val & 0x7f);
}
/* write all entries and keep original attr*/
for (i = 0; i < IA32E_NUM_ENTRIES; i++) {
MEM_WRITE64(sub_tab_addr + (i * IA32E_COMM_ENTRY_SIZE),
(attr | (pa + (i * next_page_size))));
}
if (map_params->page_table_type == PT_EPT) {
/* Write the table entry to map this memory,
* SDM chapter28 figure 28-1
* bit 0(R) bit1(W) bit2(X) bit3~5 MUST be reserved
* (here &0x07)
*/
MEM_WRITE64(entry.entry_base + entry.entry_off,
((entry.entry_val & 0x07) |
((uint64_t)sub_tab_addr)));
} else {
/* Write the table entry to map this memory,
* SDM chapter4 figure 4-11
* bit0(P) bit1(RW) bit2(U/S) bit3(PWT) bit4(PCD)
* bit5(A) bit6(D or Ignore)
*/
MEM_WRITE64(entry.entry_base + entry.entry_off,
((entry.entry_val & 0x7f) |
((uint64_t)sub_tab_addr)));
}
}
return next_page_size;
}
static void modify_paging(struct map_params *map_params, void *paddr,
void *vaddr, uint64_t size, uint32_t flags,
enum mem_map_request_type request_type, bool direct)
{
int64_t remaining_size;
uint64_t adjust_size;
uint64_t attr;
int status = 0;
struct entry_params entry;
uint64_t page_size;
uint64_t vaddr_end = ((uint64_t)vaddr) + size;
/* if the address is not page aligned, the unaligned
 * part will be dropped
 */
paddr = (void *)ROUND_PAGE_UP((uint64_t)paddr);
vaddr = (void *)ROUND_PAGE_UP((uint64_t)vaddr);
vaddr_end = ROUND_PAGE_DOWN(vaddr_end);
remaining_size = vaddr_end - (uint64_t)vaddr;
if ((request_type >= PAGING_REQUEST_TYPE_UNKNOWN)
|| (map_params == NULL)) {
pr_err("%s: vaddr=0x%llx size=0x%llx req_type=0x%lx",
__func__, vaddr, size, request_type);
status = -EINVAL;
}
ASSERT(status == 0, "Incorrect Arguments");
attr = config_page_table_attr(map_params, flags);
/* Loop until the entire block of memory is appropriately
* MAP/UNMAP/MODIFY
*/
while (remaining_size > 0) {
obtain_last_page_table_entry(map_params, &entry, vaddr, direct);
/* filter the unmap request, no action in this case*/
page_size = entry.page_size;
if ((request_type == PAGING_REQUEST_TYPE_UNMAP)
&& (entry.entry_present == PT_NOT_PRESENT)) {
adjust_size =
page_size - ((uint64_t)(vaddr) % page_size);
vaddr += adjust_size;
paddr += adjust_size;
remaining_size -= adjust_size;
continue;
}
/* if the address is NOT aligned to the current page size,
 * or the required memory size < page size,
 * the page needs to be broken down first
 */
if (entry.entry_present == PT_PRESENT) {
/* May need recursive breaking in this case,
 * e.g. 1GB->2MB->4KB
 */
while ((uint64_t)remaining_size < page_size
|| (!MEM_ALIGNED_CHECK(vaddr, page_size))
|| (!MEM_ALIGNED_CHECK(paddr, page_size))) {
/* The breaking function returns the page size
 * of the next-level page table
 */
page_size = break_page_table(map_params,
paddr, vaddr, page_size, direct);
}
} else {
page_size = ((uint64_t)remaining_size < page_size)
? ((uint64_t)remaining_size) : (page_size);
}
/* The function returns the memory size that one entry can map */
adjust_size = update_page_table_entry(map_params, paddr, vaddr,
page_size, attr, request_type, direct);
vaddr += adjust_size;
paddr += adjust_size;
remaining_size -= adjust_size;
}
}
void map_mem(struct map_params *map_params, void *paddr, void *vaddr,
uint64_t size, uint32_t flags)
{
/* used for MMU and EPT*/
modify_paging(map_params, paddr, vaddr, size, flags,
PAGING_REQUEST_TYPE_MAP, true);
/* only for EPT */
if (map_params->page_table_type == PT_EPT) {
modify_paging(map_params, vaddr, paddr, size, flags,
PAGING_REQUEST_TYPE_MAP, false);
}
}
void unmap_mem(struct map_params *map_params, void *paddr, void *vaddr,
uint64_t size, uint32_t flags)
{
/* used for MMU and EPT */
modify_paging(map_params, paddr, vaddr, size, flags,
PAGING_REQUEST_TYPE_UNMAP, true);
/* only for EPT */
if (map_params->page_table_type == PT_EPT) {
modify_paging(map_params, vaddr, paddr, size, flags,
PAGING_REQUEST_TYPE_UNMAP, false);
}
}
void modify_mem(struct map_params *map_params, void *paddr, void *vaddr,
uint64_t size, uint32_t flags)
{
/* used for MMU and EPT*/
modify_paging(map_params, paddr, vaddr, size, flags,
PAGING_REQUEST_TYPE_MODIFY, true);
/* only for EPT */
if (map_params->page_table_type == PT_EPT) {
modify_paging(map_params, vaddr, paddr, size, flags,
PAGING_REQUEST_TYPE_MODIFY, false);
}
}


@@ -0,0 +1,98 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <hypervisor.h>
#include <hv_lib.h>
#include <acrn_common.h>
#include <hv_arch.h>
#include <hv_debug.h>
#include <irq.h>
static struct dev_handler_node *notification_node;
/* run in interrupt context */
static int kick_notification(__unused int irq, __unused void *data)
{
/* Notification vector does not require handling here; it is just used
 * to kick the target cpu out of non-root mode.
 */
return 0;
}
static int request_notification_irq(dev_handler_t func, void *data,
const char *name)
{
int irq = -1; /* system allocate */
struct dev_handler_node *node = NULL;
if (notification_node != NULL) {
pr_info("%s, Notification vector already allocated on this CPU",
__func__);
return -EBUSY;
}
/* all cpus register the same notification vector */
node = pri_register_handler(irq, VECTOR_NOTIFY_VCPU, func, data, name);
if (node == NULL) {
pr_err("Failed to add notify isr");
return -1;
}
update_irq_handler(dev_to_irq(node), quick_handler_nolock);
notification_node = node;
return 0;
}
void setup_notification(void)
{
int cpu;
char name[32] = {0};
cpu = get_cpu_id();
if (cpu > 0)
return;
/* support IPI notification; VM0 will register it for all CPUs */
snprintf(name, 32, "NOTIFY_ISR%d", cpu);
if (request_notification_irq(kick_notification, NULL, name) < 0) {
pr_err("Failed to setup notification");
return;
}
dev_dbg(ACRN_DBG_PTIRQ, "NOTIFY: irq[%d] setup vector %x",
dev_to_irq(notification_node),
dev_to_vector(notification_node));
}
void cleanup_notification(void)
{
if (notification_node)
unregister_handler_common(notification_node);
notification_node = NULL;
}


@@ -0,0 +1,117 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <hypervisor.h>
#include <hv_lib.h>
#include <acrn_common.h>
#include <hv_arch.h>
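/* per-CPU pending bitmap: the SOFTIRQ_ATOMIC bit gates execution,
 * the remaining bits (SOFTIRQ_MASK) record pending softirq ids
 */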
static DEFINE_CPU_DATA(uint64_t, softirq_pending);
void disable_softirq(int cpu_id)
{
bitmap_clr(SOFTIRQ_ATOMIC, &per_cpu(softirq_pending, cpu_id));
}
void enable_softirq(int cpu_id)
{
bitmap_set(SOFTIRQ_ATOMIC, &per_cpu(softirq_pending, cpu_id));
}
void init_softirq(void)
{
int cpu_id;
for (cpu_id = 0; cpu_id < phy_cpu_num; cpu_id++) {
per_cpu(softirq_pending, cpu_id) = 0;
bitmap_set(SOFTIRQ_ATOMIC, &per_cpu(softirq_pending, cpu_id));
}
}
void raise_softirq(int softirq_id)
{
int cpu_id = get_cpu_id();
uint64_t *bitmap = &per_cpu(softirq_pending, cpu_id);
if (cpu_id >= phy_cpu_num)
return;
bitmap_set(softirq_id, bitmap);
}
void exec_softirq(void)
{
int cpu_id = get_cpu_id();
uint64_t *bitmap = &per_cpu(softirq_pending, cpu_id);
uint64_t rflag;
int softirq_id;
if (cpu_id >= phy_cpu_num)
return;
/* Disable softirq:
 * SOFTIRQ_ATOMIC bit = 0 means softirq is already in execution
 */
if (!bitmap_test_and_clear(SOFTIRQ_ATOMIC, bitmap))
return;
if (((*bitmap) & SOFTIRQ_MASK) == 0UL)
goto ENABLE_AND_EXIT;
/* check if we are in interrupt context */
CPU_RFLAGS_SAVE(&rflag);
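/* RFLAGS bit 9 is IF; if interrupts are disabled we are likely
 * in interrupt context, so defer softirq execution
 */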
if (!(rflag & (1<<9)))
goto ENABLE_AND_EXIT;
while (1) {
softirq_id = bitmap_ffs(bitmap);
if ((softirq_id < 0) || (softirq_id >= SOFTIRQ_MAX))
break;
bitmap_clr(softirq_id, bitmap);
switch (softirq_id) {
case SOFTIRQ_TIMER:
timer_softirq(cpu_id);
break;
case SOFTIRQ_DEV_ASSIGN:
ptdev_softirq(cpu_id);
break;
default:
break;
}
}
ENABLE_AND_EXIT:
enable_softirq(cpu_id);
}

561
hypervisor/arch/x86/timer.c Normal file

@@ -0,0 +1,561 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <hypervisor.h>
#include <hv_lib.h>
#include <acrn_common.h>
#include <hv_arch.h>
#include <hv_debug.h>
#define MAX_TIMER_ACTIONS 32
struct timer_statistics {
struct {
uint64_t pickup_id;
uint64_t pickup_time;
uint64_t pickup_deadline;
uint64_t added_id;
uint64_t added_time;
uint64_t added_deadline;
} last;
uint64_t total_pickup_cnt;
uint64_t total_added_cnt;
uint64_t irq_cnt;
long pending_cnt;
};
struct timer {
timer_handle_t func; /* callback if time reached */
uint64_t priv_data; /* func private data */
uint64_t deadline; /* tsc deadline to interrupt */
long handle; /* unique handle for user */
int cpu_id; /* armed on which CPU */
int id; /* timer ID, used by release */
struct list_head node; /* link all timers */
};
struct per_cpu_timers {
struct timer *timers_pool; /* it's timers pool for allocation */
uint64_t free_bitmap;
struct list_head timer_list; /* it's for runtime active timer list */
spinlock_t lock;
int cpu_id;
struct timer_statistics stat;
};
static DEFINE_CPU_DATA(struct per_cpu_timers, cpu_timers);
#define TIMER_IRQ (NR_MAX_IRQS - 1)
DEFINE_CPU_DATA(struct dev_handler_node *, timer_node);
static struct timer*
find_expired_timer(struct per_cpu_timers *cpu_timer, uint64_t tsc_now);
static struct timer *alloc_timer(int cpu_id)
{
int idx;
struct per_cpu_timers *cpu_timer;
struct timer *timer;
spinlock_rflags;
cpu_timer = &per_cpu(cpu_timers, cpu_id);
spinlock_irqsave_obtain(&cpu_timer->lock);
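/* free_bitmap acts as a free list: each set bit marks a free slot
 * in timers_pool; bitmap_ffs picks the lowest free index
 */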
idx = bitmap_ffs(&cpu_timer->free_bitmap);
if (idx < 0) {
spinlock_irqrestore_release(&cpu_timer->lock);
return NULL;
}
bitmap_clr(idx, &cpu_timer->free_bitmap);
cpu_timer->stat.total_added_cnt++;
cpu_timer->stat.pending_cnt++;
/* assign unique handle and never duplicate */
timer = cpu_timer->timers_pool + idx;
timer->handle = cpu_timer->stat.total_added_cnt;
spinlock_irqrestore_release(&cpu_timer->lock);
ASSERT((cpu_timer->timers_pool[cpu_id].cpu_id == cpu_id),
"timer cpu_id did not match");
return timer;
}
static void release_timer(struct timer *timer)
{
struct per_cpu_timers *cpu_timer;
spinlock_rflags;
cpu_timer = &per_cpu(cpu_timers, timer->cpu_id);
timer->priv_data = 0;
timer->func = NULL;
timer->deadline = 0;
spinlock_irqsave_obtain(&cpu_timer->lock);
bitmap_set(timer->id, &cpu_timer->free_bitmap);
cpu_timer->stat.pending_cnt--;
spinlock_irqrestore_release(&cpu_timer->lock);
}
static int get_target_cpu(void)
{
/* we should search idle CPU to balance timer service */
return get_cpu_id();
}
static struct timer*
find_expired_timer(struct per_cpu_timers *cpu_timer, uint64_t tsc_now)
{
struct timer *timer;
struct list_head *pos;
spinlock_rflags;
spinlock_irqsave_obtain(&cpu_timer->lock);
list_for_each(pos, &cpu_timer->timer_list) {
timer = list_entry(pos, struct timer, node);
if (timer->deadline <= tsc_now)
goto UNLOCK;
}
timer = NULL;
UNLOCK:
spinlock_irqrestore_release(&cpu_timer->lock);
return timer;
}
/* caller must hold cpu_timer->lock */
static struct timer*
_search_nearest_timer(struct per_cpu_timers *cpu_timer)
{
struct timer *timer;
struct timer *target = NULL;
struct list_head *pos;
list_for_each(pos, &cpu_timer->timer_list) {
timer = list_entry(pos, struct timer, node);
if (target == NULL)
target = timer;
else if (timer->deadline < target->deadline)
target = timer;
}
return target;
}
/* caller must hold cpu_timer->lock */
static struct timer*
_search_timer_by_handle(struct per_cpu_timers *cpu_timer, long handle)
{
struct timer *timer;
struct list_head *pos;
list_for_each(pos, &cpu_timer->timer_list) {
timer = list_entry(pos, struct timer, node);
if (timer->handle == handle)
goto FOUND;
}
timer = NULL;
FOUND:
return timer;
}
static void
run_timer(struct per_cpu_timers *cpu_timer, struct timer *timer)
{
spinlock_rflags;
/* remove from list first */
spinlock_irqsave_obtain(&cpu_timer->lock);
list_del(&timer->node);
spinlock_irqrestore_release(&cpu_timer->lock);
/* deadline = 0 means stop timer, we should skip */
if (timer->func && timer->deadline != 0UL)
timer->func(timer->priv_data);
cpu_timer->stat.last.pickup_id = timer->id;
cpu_timer->stat.last.pickup_deadline = timer->deadline;
cpu_timer->stat.last.pickup_time = rdtsc();
cpu_timer->stat.total_pickup_cnt++;
TRACE_4I(TRACE_TIMER_ACTION_PCKUP, timer->id, timer->deadline,
timer->deadline >> 32, cpu_timer->stat.total_pickup_cnt);
}
/* run in interrupt context */
static int tsc_deadline_handler(__unused int irq, __unused void *data)
{
raise_softirq(SOFTIRQ_TIMER);
return 0;
}
static inline void schedule_next_timer(int cpu)
{
struct timer *timer;
struct per_cpu_timers *cpu_timer = &per_cpu(cpu_timers, cpu);
spinlock_rflags;
spinlock_irqsave_obtain(&cpu_timer->lock);
timer = _search_nearest_timer(cpu_timer);
if (timer) {
/* it is okay to program an already-expired time */
msr_write(MSR_IA32_TSC_DEADLINE, timer->deadline);
}
spinlock_irqrestore_release(&cpu_timer->lock);
}
int request_timer_irq(int cpu, dev_handler_t func, void *data, const char *name)
{
struct dev_handler_node *node = NULL;
if (cpu >= phy_cpu_num)
return -1;
if (per_cpu(timer_node, cpu)) {
pr_err("CPU%d timer isr already added", cpu);
unregister_handler_common(per_cpu(timer_node, cpu));
}
node = pri_register_handler(TIMER_IRQ, VECTOR_TIMER, func, data, name);
if (node != NULL) {
per_cpu(timer_node, cpu) = node;
update_irq_handler(TIMER_IRQ, quick_handler_nolock);
} else {
pr_err("Failed to add timer isr");
return -1;
}
return 0;
}
/*TODO: init in separate cpu */
static void init_timer_pool(void)
{
int i, j;
struct per_cpu_timers *cpu_timer;
struct timer *timers_pool;
/* Make sure we only init once */
if (get_cpu_id() > 0)
return;
for (i = 0; i < phy_cpu_num; i++) {
cpu_timer = &per_cpu(cpu_timers, i);
cpu_timer->cpu_id = i;
timers_pool =
calloc(MAX_TIMER_ACTIONS, sizeof(struct timer));
ASSERT(timers_pool, "Create timers pool failed");
cpu_timer->timers_pool = timers_pool;
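/* all MAX_TIMER_ACTIONS slots start out free */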
cpu_timer->free_bitmap = (1UL<<MAX_TIMER_ACTIONS)-1;
INIT_LIST_HEAD(&cpu_timer->timer_list);
spinlock_init(&cpu_timer->lock);
for (j = 0; j < MAX_TIMER_ACTIONS; j++) {
timers_pool[j].id = j;
timers_pool[j].cpu_id = i;
timers_pool[j].priv_data = 0;
timers_pool[j].func = NULL;
timers_pool[j].deadline = 0;
timers_pool[j].handle = -1UL;
}
}
}
static void init_tsc_deadline_timer(void)
{
uint32_t val;
val = VECTOR_TIMER;
val |= 0x40000; /* TSC deadline and unmask */
mmio_write_long(val, LAPIC_BASE + LAPIC_LVT_TIMER_REGISTER);
asm volatile("mfence" : : : "memory");
/* disarm timer */
msr_write(MSR_IA32_TSC_DEADLINE, 0UL);
}
void timer_init(void)
{
char name[32] = {0};
int cpu = get_cpu_id();
snprintf(name, 32, "timer_tick[%d]", cpu);
if (request_timer_irq(cpu, tsc_deadline_handler, NULL, name) < 0) {
pr_err("Timer setup failed");
return;
}
init_tsc_deadline_timer();
init_timer_pool();
}
void timer_cleanup(void)
{
int cpu = get_cpu_id();
if (per_cpu(timer_node, cpu))
unregister_handler_common(per_cpu(timer_node, cpu));
per_cpu(timer_node, cpu) = NULL;
}
int timer_softirq(int cpu_id)
{
struct per_cpu_timers *cpu_timer;
struct timer *timer;
int max = MAX_TIMER_ACTIONS;
/* handle passed timer */
cpu_timer = &per_cpu(cpu_timers, cpu_id);
cpu_timer->stat.irq_cnt++;
/* This is to make sure we are not blocked due to a delay inside func():
 * force an exit from the irq handler after servicing >31 timers.
 * The caller may add_timer() again inside timer->func(); if there is a
 * delay inside func(), we would loop here forever, because the newly
 * added timer has already expired due to the previous func()'s delay.
 */
timer = find_expired_timer(cpu_timer, rdtsc());
while (timer && --max > 0) {
run_timer(cpu_timer, timer);
/* put back to timer pool */
release_timer(timer);
/* search next one */
timer = find_expired_timer(cpu_timer, rdtsc());
}
/* update nearest timer */
schedule_next_timer(cpu_id);
return 0;
}
/*
 * add_timer: an already-passed deadline is okay, but 0 is not.
 * return: handle. The handle is unique and can be used to find this
 * added timer later; it becomes invalid after the timer expires.
 */
long add_timer(timer_handle_t func, uint64_t data, uint64_t deadline)
{
struct timer *timer;
struct per_cpu_timers *cpu_timer;
int cpu_id = get_target_cpu();
spinlock_rflags;
if (deadline == 0 || func == NULL)
return -1;
/* possibly in interrupt context, avoid memory allocation here */
timer = alloc_timer(cpu_id);
if (timer == NULL)
return -1;
timer->func = func;
timer->priv_data = data;
timer->deadline = deadline;
timer->cpu_id = get_target_cpu();
cpu_timer = &per_cpu(cpu_timers, timer->cpu_id);
/* We need irqsave here even softirq enabled to protect timer_list */
spinlock_irqsave_obtain(&cpu_timer->lock);
list_add_tail(&timer->node, &cpu_timer->timer_list);
cpu_timer->stat.last.added_id = timer->id;
cpu_timer->stat.last.added_time = rdtsc();
cpu_timer->stat.last.added_deadline = timer->deadline;
spinlock_irqrestore_release(&cpu_timer->lock);
TRACE_4I(TRACE_TIMER_ACTION_ADDED, timer->id, timer->deadline,
timer->deadline >> 32, cpu_timer->stat.total_added_cnt);
schedule_next_timer(cpu_id);
return timer->handle;
}
/*
 * update_timer: update an existing timer; if not found, add a new timer
 */
long
update_timer(long handle, timer_handle_t func, uint64_t data,
uint64_t deadline)
{
struct timer *timer;
struct per_cpu_timers *cpu_timer;
int cpu_id = get_target_cpu();
spinlock_rflags;
bool ret = false;
if (deadline == 0)
return -1;
cpu_timer = &per_cpu(cpu_timers, cpu_id);
spinlock_irqsave_obtain(&cpu_timer->lock);
timer = _search_timer_by_handle(cpu_timer, handle);
if (timer) {
/* update deadline and re-sort */
timer->deadline = deadline;
timer->func = func;
timer->priv_data = data;
TRACE_4I(TRACE_TIMER_ACTION_UPDAT, timer->id,
timer->deadline, timer->deadline >> 32,
cpu_timer->stat.total_added_cnt);
ret = true;
}
spinlock_irqrestore_release(&cpu_timer->lock);
if (ret)
schedule_next_timer(cpu_id);
else {
/* if update failed, we add to new, and update handle */
/* TODO: the correct behavior should be return failure here */
handle = add_timer(func, data, deadline);
}
return handle;
}
/* NOTE: cpu_id referred to physical cpu id here */
bool cancel_timer(long handle, int cpu_id)
{
struct timer *timer;
struct per_cpu_timers *cpu_timer;
spinlock_rflags;
bool ret = false;
cpu_timer = &per_cpu(cpu_timers, cpu_id);
spinlock_irqsave_obtain(&cpu_timer->lock);
timer = _search_timer_by_handle(cpu_timer, handle);
if (timer) {
/* NOTE: we can not directly release the timer here.
 * Instead we set the deadline to expired and clear func.
 * This timer will be reclaimed by the next timer softirq.
 */
timer->deadline = 0;
timer->func = NULL;
ret = true;
}
spinlock_irqrestore_release(&cpu_timer->lock);
return ret;
}
void dump_timer_pool_info(int cpu_id)
{
struct per_cpu_timers *cpu_timer =
&per_cpu(cpu_timers, cpu_id);
struct list_head *pos;
int cn = 0;
spinlock_rflags;
if (cpu_id >= phy_cpu_num)
return;
pr_info("Timer%d statistics: Pending: %d\n\t"
"total_pickup: %lld total_added: %lld total_irq: %lld",
cpu_id,
cpu_timer->stat.pending_cnt,
cpu_timer->stat.total_pickup_cnt,
cpu_timer->stat.total_added_cnt,
cpu_timer->stat.irq_cnt);
pr_info("LAST pickup[%d] time: 0x%llx deadline: 0x%llx",
cpu_timer->stat.last.pickup_id,
cpu_timer->stat.last.pickup_time,
cpu_timer->stat.last.pickup_deadline);
pr_info("LAST added[%d] time: 0x%llx deadline: 0x%llx",
cpu_timer->stat.last.added_id,
cpu_timer->stat.last.added_time,
cpu_timer->stat.last.added_deadline);
spinlock_irqsave_obtain(&cpu_timer->lock);
list_for_each(pos, &cpu_timer->timer_list) {
cn++;
pr_info("-->pending: %d trigger: 0x%llx", cn,
list_entry(pos, struct timer, node)->deadline);
}
spinlock_irqrestore_release(&cpu_timer->lock);
}
void check_tsc(void)
{
uint64_t temp64;
/* Ensure time-stamp timer is turned on for each CPU */
CPU_CR_READ(cr4, &temp64);
CPU_CR_WRITE(cr4, (temp64 & ~CR4_TSD));
}
uint64_t tsc_cycles_in_period(uint16_t timer_period_in_us)
{
uint16_t initial_pit;
uint16_t current_pit;
uint32_t current_tsc;
#define PIT_TARGET 0x3FFF
if (timer_period_in_us < 1000)
pr_warn("Bad timer_period_in_us: %d\n",
timer_period_in_us);
/* Assume the 8254 delivers 18.2 ticks per second when 16 bits fully
* wrap. This is about 1.193MHz or a clock period of 0.8384uSec
*/
initial_pit = (uint16_t)(timer_period_in_us*1193000UL/1000000);
initial_pit += PIT_TARGET;
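/* counting down from initial_pit to PIT_TARGET spans roughly
 * timer_period_in_us at the PIT's 1.193 MHz input clock
 */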
/* Port 0x43 ==> Control word write; Data 0x30 ==> Select Counter 0,
* Read/Write least significant byte first, mode 0, 16 bits.
*/
io_write_byte(0x30, 0x43);
io_write_byte(initial_pit & 0x00ff, 0x40); /* Write LSB */
io_write_byte(initial_pit >> 8, 0x40); /* Write MSB */
current_tsc = rdtsc();
do {
/* Port 0x43 ==> Control word write; 0x00 ==> Select
* Counter 0, Counter Latch Command, Mode 0; 16 bits
*/
io_write_byte(0x00, 0x43);
current_pit = io_read_byte(0x40); /* Read LSB */
current_pit |= io_read_byte(0x40) << 8; /* Read MSB */
/* Let the counter count down to PIT_TARGET */
} while (current_pit > PIT_TARGET);
current_tsc = rdtsc() - current_tsc;
return (uint64_t) current_tsc;
}


@@ -0,0 +1,494 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <hypervisor.h>
#include <hv_lib.h>
#include <acrn_common.h>
#include <hv_arch.h>
#include <hv_debug.h>
static int rdtscp_handler(struct vcpu *vcpu);
static int unhandled_vmexit_handler(struct vcpu *vcpu);
static int rdtsc_handler(struct vcpu *vcpu);
/* VM Dispatch table for Exit condition handling */
static const struct vm_exit_dispatch dispatch_table[] = {
[VMX_EXIT_REASON_EXCEPTION_OR_NMI] = {
.handler = exception_handler},
[VMX_EXIT_REASON_EXTERNAL_INTERRUPT] = {
.handler = external_interrupt_handler},
[VMX_EXIT_REASON_TRIPLE_FAULT] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_INIT_SIGNAL] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_STARTUP_IPI] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_IO_SMI] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_OTHER_SMI] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_INTERRUPT_WINDOW] = {
.handler = interrupt_win_exiting_handler},
[VMX_EXIT_REASON_NMI_WINDOW] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_TASK_SWITCH] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_CPUID] = {
.handler = cpuid_handler},
[VMX_EXIT_REASON_GETSEC] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_HLT] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_INVD] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_INVLPG] = {
.handler = unhandled_vmexit_handler,},
[VMX_EXIT_REASON_RDPMC] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_RDTSC] = {
.handler = rdtsc_handler},
[VMX_EXIT_REASON_RSM] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_VMCALL] = {
.handler = vmcall_handler},
[VMX_EXIT_REASON_VMCLEAR] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_VMLAUNCH] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_VMPTRLD] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_VMPTRST] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_VMREAD] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_VMRESUME] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_VMWRITE] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_VMXOFF] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_VMXON] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_CR_ACCESS] = {
.handler = cr_access_handler,
.need_exit_qualification = 1},
[VMX_EXIT_REASON_DR_ACCESS] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_IO_INSTRUCTION] = {
.handler = io_instr_handler,
.need_exit_qualification = 1},
[VMX_EXIT_REASON_RDMSR] = {
.handler = rdmsr_handler},
[VMX_EXIT_REASON_WRMSR] = {
.handler = wrmsr_handler},
[VMX_EXIT_REASON_ENTRY_FAILURE_INVALID_GUEST_STATE] = {
.handler = unhandled_vmexit_handler,
.need_exit_qualification = 1},
[VMX_EXIT_REASON_ENTRY_FAILURE_MSR_LOADING] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_MWAIT] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_MONITOR_TRAP] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_MONITOR] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_PAUSE] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_ENTRY_FAILURE_MACHINE_CHECK] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_TPR_BELOW_THRESHOLD] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_APIC_ACCESS] = {
.handler = apicv_access_exit_handler},
[VMX_EXIT_REASON_VIRTUALIZED_EOI] = {
.handler = apicv_virtualized_eoi_exit_handler},
[VMX_EXIT_REASON_GDTR_IDTR_ACCESS] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_LDTR_TR_ACCESS] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_EPT_VIOLATION] = {
.handler = ept_violation_handler,
.need_exit_qualification = 1},
[VMX_EXIT_REASON_EPT_MISCONFIGURATION] = {
.handler = ept_misconfig_handler,
.need_exit_qualification = 1},
[VMX_EXIT_REASON_INVEPT] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_RDTSCP] = {
.handler = rdtscp_handler},
[VMX_EXIT_REASON_VMX_PREEMPTION_TIMER_EXPIRED] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_INVVPID] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_WBINVD] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_XSETBV] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_APIC_WRITE] = {
.handler = apicv_write_exit_handler}
};
struct vm_exit_dispatch *vmexit_handler(struct vcpu *vcpu)
{
struct vm_exit_dispatch *dispatch = HV_NULL;
uint16_t basic_exit_reason;
/* Obtain interrupt info */
vcpu->arch_vcpu.exit_interrupt_info =
exec_vmread(VMX_IDT_VEC_INFO_FIELD);
/* Calculate basic exit reason (low 16-bits) */
basic_exit_reason = vcpu->arch_vcpu.exit_reason & 0xFFFF;
/* Log details for exit */
pr_dbg("Exit Reason: 0x%016llx ", vcpu->arch_vcpu.exit_reason);
/* Ensure exit reason is within dispatch table */
if (basic_exit_reason < ARRAY_SIZE(dispatch_table)) {
/* Calculate dispatch table entry */
dispatch = (struct vm_exit_dispatch *)
(dispatch_table + basic_exit_reason);
/* See if an exit qualification is necessary for this exit
* handler
*/
if (dispatch->need_exit_qualification) {
/* Get exit qualification */
vcpu->arch_vcpu.exit_qualification =
exec_vmread(VMX_EXIT_QUALIFICATION);
}
}
/* Update current vcpu in VM that caused vm exit */
vcpu->vm->current_vcpu = vcpu;
/* Return pointer to exit dispatch entry */
return dispatch;
}
static int unhandled_vmexit_handler(__unused struct vcpu *vcpu)
{
pr_fatal("Error: Unhandled VM exit condition from guest at 0x%016llx ",
exec_vmread(VMX_GUEST_RIP));
pr_fatal("Exit Reason: 0x%016llx ", vcpu->arch_vcpu.exit_reason);
pr_err("Exit qualification: 0x%016llx ",
exec_vmread(VMX_EXIT_QUALIFICATION));
/* while(1); */
TRACE_2L(TRC_VMEXIT_UNHANDLED, vcpu->arch_vcpu.exit_reason, 0);
return 0;
}
static int write_cr0(struct vcpu *vcpu, uint64_t value)
{
uint32_t value32;
uint64_t value64;
pr_dbg("VMM: Guest trying to write 0x%08x to CR0", value);
/* Read host mask value */
value64 = exec_vmread(VMX_CR0_MASK);
/* Clear all bits being written by guest that are owned by host */
value &= ~value64;
/* Update CR0 in guest state */
vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context].cr0 |= value;
exec_vmwrite(VMX_GUEST_CR0,
vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context].cr0);
pr_dbg("VMM: Guest allowed to write 0x%08x to CR0",
vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context].cr0);
/* If guest is trying to transition vcpu from unpaged real mode to page
* protected mode make necessary changes to VMCS structure to reflect
* transition from real mode to paged-protected mode
*/
if (!is_vcpu_bsp(vcpu) &&
(vcpu->arch_vcpu.cpu_mode == REAL_MODE) &&
(value & CR0_PG) && (value & CR0_PE)) {
/* Enable protected mode */
value32 = exec_vmread(VMX_ENTRY_CONTROLS);
value32 |= (VMX_ENTRY_CTLS_IA32E_MODE |
VMX_ENTRY_CTLS_LOAD_PAT |
VMX_ENTRY_CTLS_LOAD_EFER);
exec_vmwrite(VMX_ENTRY_CONTROLS, value32);
pr_dbg("VMX_ENTRY_CONTROLS: 0x%x ", value32);
/* Disable unrestricted mode */
value32 = exec_vmread(VMX_PROC_VM_EXEC_CONTROLS2);
value32 |= (VMX_PROCBASED_CTLS2_EPT |
VMX_PROCBASED_CTLS2_RDTSCP);
exec_vmwrite(VMX_PROC_VM_EXEC_CONTROLS2, value32);
pr_dbg("VMX_PROC_VM_EXEC_CONTROLS2: 0x%x ", value32);
/* Set up EFER */
value64 = exec_vmread64(VMX_GUEST_IA32_EFER_FULL);
value64 |= (MSR_IA32_EFER_SCE_BIT |
MSR_IA32_EFER_LME_BIT |
MSR_IA32_EFER_LMA_BIT | MSR_IA32_EFER_NXE_BIT);
exec_vmwrite64(VMX_GUEST_IA32_EFER_FULL, value64);
pr_dbg("VMX_GUEST_IA32_EFER: 0x%016llx ", value64);
}
return 0;
}
static int write_cr3(struct vcpu *vcpu, uint64_t value)
{
/* Write to guest's CR3 */
vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context].cr3 = value;
/* Commit new value to VMCS */
exec_vmwrite(VMX_GUEST_CR3,
vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context].cr3);
return 0;
}
static int write_cr4(struct vcpu *vcpu, uint64_t value)
{
uint64_t temp64;
pr_dbg("VMM: Guest trying to write 0x%08x to CR4", value);
/* Read host mask value */
temp64 = exec_vmread(VMX_CR4_MASK);
/* Clear all bits being written by guest that are owned by host */
value &= ~temp64;
/* Write updated CR4 (bitwise OR of allowed guest bits and CR4 host
* value)
*/
vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context].cr4 |= value;
exec_vmwrite(VMX_GUEST_CR4,
vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context].cr4);
pr_dbg("VMM: Guest allowed to write 0x%08x to CR4",
vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context].cr4);
return 0;
}
static int read_cr3(struct vcpu *vcpu, uint64_t *value)
{
*value = vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context].cr3;
pr_dbg("VMM: reading 0x%08x from CR3", *value);
return 0;
}
int cpuid_handler(struct vcpu *vcpu)
{
struct run_context *cur_context =
&vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context];
emulate_cpuid(vcpu, (uint32_t)cur_context->guest_cpu_regs.regs.rax,
(uint32_t *)&cur_context->guest_cpu_regs.regs.rax,
(uint32_t *)&cur_context->guest_cpu_regs.regs.rbx,
(uint32_t *)&cur_context->guest_cpu_regs.regs.rcx,
(uint32_t *)&cur_context->guest_cpu_regs.regs.rdx);
TRACE_2L(TRC_VMEXIT_CPUID, vcpu->vcpu_id, 0);
return 0;
}
int cr_access_handler(struct vcpu *vcpu)
{
uint64_t *regptr;
struct run_context *cur_context =
&vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context];
static const int reg_trans_tab[] = {
[0] = VMX_MACHINE_T_GUEST_RAX_INDEX,
[1] = VMX_MACHINE_T_GUEST_RCX_INDEX,
[2] = VMX_MACHINE_T_GUEST_RDX_INDEX,
[3] = VMX_MACHINE_T_GUEST_RBX_INDEX,
[4] = 0xFF, /* for sp reg, should not be used, just for init */
[5] = VMX_MACHINE_T_GUEST_RBP_INDEX,
[6] = VMX_MACHINE_T_GUEST_RSI_INDEX,
[7] = VMX_MACHINE_T_GUEST_RDI_INDEX,
[8] = VMX_MACHINE_T_GUEST_R8_INDEX,
[9] = VMX_MACHINE_T_GUEST_R9_INDEX,
[10] = VMX_MACHINE_T_GUEST_R10_INDEX,
[11] = VMX_MACHINE_T_GUEST_R11_INDEX,
[12] = VMX_MACHINE_T_GUEST_R12_INDEX,
[13] = VMX_MACHINE_T_GUEST_R13_INDEX,
[14] = VMX_MACHINE_T_GUEST_R14_INDEX,
[15] = VMX_MACHINE_T_GUEST_R15_INDEX
};
int idx = VM_EXIT_CR_ACCESS_REG_IDX(vcpu->arch_vcpu.exit_qualification);
ASSERT(idx != 4, "index should not be 4 (target SP)");
regptr = cur_context->guest_cpu_regs.longs + reg_trans_tab[idx];
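/* switch key: (access type << 4) | CR number,
 * e.g. 0x00 = mov to CR0, 0x18 = mov from CR8
 */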
switch ((VM_EXIT_CR_ACCESS_ACCESS_TYPE
(vcpu->arch_vcpu.exit_qualification) << 4) |
VM_EXIT_CR_ACCESS_CR_NUM(vcpu->arch_vcpu.exit_qualification)) {
case 0x00:
/* mov to cr0 */
write_cr0(vcpu, *regptr);
break;
case 0x03:
/* mov to cr3 */
write_cr3(vcpu, *regptr);
break;
case 0x04:
/* mov to cr4 */
write_cr4(vcpu, *regptr);
break;
case 0x13:
/* mov from cr3 */
read_cr3(vcpu, regptr);
break;
#if 0
case 0x14:
/* mov from cr4 (this should not happen) */
case 0x10:
/* mov from cr0 (this should not happen) */
#endif
case 0x08:
/* mov to cr8 */
vlapic_set_cr8(vcpu->arch_vcpu.vlapic, *regptr);
break;
case 0x18:
/* mov from cr8 */
*regptr = vlapic_get_cr8(vcpu->arch_vcpu.vlapic);
break;
default:
panic("Unhandled CR access");
return -EINVAL;
}
TRACE_2L(TRC_VMEXIT_CR_ACCESS,
VM_EXIT_CR_ACCESS_ACCESS_TYPE
(vcpu->arch_vcpu.exit_qualification),
VM_EXIT_CR_ACCESS_CR_NUM
(vcpu->arch_vcpu.exit_qualification));
return 0;
}
#if 0
/*
* VMX_PROCBASED_CTLS_INVLPG is not enabled in the VM-execution
* control, therefore we don't need its handler.
*
* INVLPG: this instruction Invalidates any translation lookaside buffer
*/
int invlpg_handler(__unused struct vcpu *vcpu)
{
pr_fatal("INVLPG executed");
return 0;
}
/*
* The XSETBV instruction sets XCR0, which is used to tell for which
* components state can be saved on a context switch using xsave.
*
* We don't handle this right now because we are on a platform that does not
* support XSAVE/XRSTORE feature as reflected by the instruction CPUID.
*
* To make sure this never gets called until we support it, we can prevent
* the reading of this bit in the CPUID VMEXIT.
*
* Linux checks this in CPUID: cpufeature.h: #define cpu_has_xsave
*/
static int xsetbv_instr_handler(__unused struct vcpu *vcpu)
{
ASSERT("Not Supported" == 0, "XSETBV executed");
return 0;
}
#endif
static int rdtsc_handler(struct vcpu *vcpu)
{
uint64_t host_tsc, guest_tsc, tsc_offset;
uint32_t id;
struct run_context *cur_context =
&vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context];
/* Read the host TSC value */
CPU_RDTSCP_EXECUTE(&host_tsc, &id);
/* Get the guest TSC offset value from VMCS */
tsc_offset =
exec_vmread64(VMX_TSC_OFFSET_FULL);
/* Update the guest TSC value by the following:
 * TSC_guest = TSC_host + TSC_guest_Offset
 */
guest_tsc = host_tsc + tsc_offset;
/* Return the TSC_guest in rax:rdx */
cur_context->guest_cpu_regs.regs.rax = (uint32_t) guest_tsc;
cur_context->guest_cpu_regs.regs.rdx = (uint32_t) (guest_tsc >> 32);
TRACE_2L(TRC_VMEXIT_RDTSC, host_tsc, tsc_offset);
return 0;
}
static int rdtscp_handler(struct vcpu *vcpu)
{
uint64_t host_tsc, guest_tsc, tsc_offset;
uint32_t id;
struct run_context *cur_context =
&vcpu->arch_vcpu.contexts[vcpu->arch_vcpu.cur_context];
/* Read the host TSC value */
CPU_RDTSCP_EXECUTE(&host_tsc, &id);
/* Get the guest TSC offset value from VMCS */
tsc_offset =
exec_vmread64(VMX_TSC_OFFSET_FULL);
/* Update the guest TSC value by the following:
 * TSC_guest = TSC_host + TSC_guest_Offset
 */
guest_tsc = host_tsc + tsc_offset;
/* Return the TSC_guest in rax:rdx and IA32_TSC_AUX in rcx */
cur_context->guest_cpu_regs.regs.rax = (uint32_t) guest_tsc;
cur_context->guest_cpu_regs.regs.rdx = (uint32_t) (guest_tsc >> 32);
cur_context->guest_cpu_regs.regs.rcx = vcpu->arch_vcpu.msr_tsc_aux;
TRACE_2L(TRC_VMEXIT_RDTSCP, guest_tsc, vcpu->arch_vcpu.msr_tsc_aux);
return 0;
}

1346
hypervisor/arch/x86/vmx.c Normal file

File diff suppressed because it is too large


@@ -0,0 +1,245 @@
/*
* Copyright (C) 2018 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <vmx.h>
#include <msr.h>
#include <guest.h>
#include <vcpu.h>
#include <cpu.h>
#include <types.h>
.text
/*int vmx_vmrun(struct run_context *context, int launch, int ibrs_type) */
.code64
.align 8
.global vmx_vmrun
vmx_vmrun:
/* Save all host GPRs that must be preserved across function calls
per System V ABI */
push %rdx
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
/* Save RDI on top of host stack for easy access to VCPU pointer
on return from guest context */
push %rdi
/* rdx = ibrs_type */
/* if ibrs_type != IBRS_NONE, means IBRS feature is supported,
* restore MSR SPEC_CTRL to guest
*/
cmp $IBRS_NONE,%rdx
je next
movl $MSR_IA32_SPEC_CTRL,%ecx
mov VMX_MACHINE_T_GUEST_SPEC_CTRL_OFFSET(%rdi),%rax
movl $0,%edx
wrmsr
next:
/* Load VMCS_HOST_RSP_FIELD field value */
mov $VMX_HOST_RSP,%rdx
/* Write the current stack pointer to the VMCS_HOST_RSP_FIELD */
vmwrite %rsp,%rdx
/* Error occurred - handle error */
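/* CF or ZF set here means the vmwrite above failed
 * (VMfailInvalid/VMfailValid)
 */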
jbe vm_eval_error
/* Compare the launch flag to see if launching (1) or resuming (0) */
cmp $VM_LAUNCH, %rsi
mov VMX_MACHINE_T_GUEST_CR2_OFFSET(%rdi),%rax
mov %rax,%cr2
mov VMX_MACHINE_T_GUEST_RAX_OFFSET(%rdi),%rax
mov VMX_MACHINE_T_GUEST_RBX_OFFSET(%rdi),%rbx
mov VMX_MACHINE_T_GUEST_RCX_OFFSET(%rdi),%rcx
mov VMX_MACHINE_T_GUEST_RDX_OFFSET(%rdi),%rdx
mov VMX_MACHINE_T_GUEST_RBP_OFFSET(%rdi),%rbp
mov VMX_MACHINE_T_GUEST_RSI_OFFSET(%rdi),%rsi
mov VMX_MACHINE_T_GUEST_R8_OFFSET(%rdi),%r8
mov VMX_MACHINE_T_GUEST_R9_OFFSET(%rdi),%r9
mov VMX_MACHINE_T_GUEST_R10_OFFSET(%rdi),%r10
mov VMX_MACHINE_T_GUEST_R11_OFFSET(%rdi),%r11
mov VMX_MACHINE_T_GUEST_R12_OFFSET(%rdi),%r12
mov VMX_MACHINE_T_GUEST_R13_OFFSET(%rdi),%r13
mov VMX_MACHINE_T_GUEST_R14_OFFSET(%rdi),%r14
mov VMX_MACHINE_T_GUEST_R15_OFFSET(%rdi),%r15
mov VMX_MACHINE_T_GUEST_RDI_OFFSET(%rdi),%rdi
/* Execute appropriate VMX instruction */
je vm_launch
/* Execute a VM resume */
vmresume
vm_launch:
/* Execute a VM launch */
vmlaunch
.global vm_exit
vm_exit:
/* Get VCPU data structure pointer from top of host stack and
save guest RDI in its place */
xchg 0(%rsp),%rdi
/* Save current GPRs to guest state area */
mov %rax,VMX_MACHINE_T_GUEST_RAX_OFFSET(%rdi)
mov %cr2,%rax
mov %rax,VMX_MACHINE_T_GUEST_CR2_OFFSET(%rdi)
mov %rbx,VMX_MACHINE_T_GUEST_RBX_OFFSET(%rdi)
mov %rcx,VMX_MACHINE_T_GUEST_RCX_OFFSET(%rdi)
mov %rdx,VMX_MACHINE_T_GUEST_RDX_OFFSET(%rdi)
mov %rbp,VMX_MACHINE_T_GUEST_RBP_OFFSET(%rdi)
mov %rsi,VMX_MACHINE_T_GUEST_RSI_OFFSET(%rdi)
mov %r8,VMX_MACHINE_T_GUEST_R8_OFFSET(%rdi)
mov %r9,VMX_MACHINE_T_GUEST_R9_OFFSET(%rdi)
mov %r10,VMX_MACHINE_T_GUEST_R10_OFFSET(%rdi)
mov %r11,VMX_MACHINE_T_GUEST_R11_OFFSET(%rdi)
mov %r12,VMX_MACHINE_T_GUEST_R12_OFFSET(%rdi)
mov %r13,VMX_MACHINE_T_GUEST_R13_OFFSET(%rdi)
mov %r14,VMX_MACHINE_T_GUEST_R14_OFFSET(%rdi)
mov %r15,VMX_MACHINE_T_GUEST_R15_OFFSET(%rdi)
/* Load guest RDI off host stack and into RDX */
mov 0(%rsp),%rdx
/* Save guest RDI to guest state area */
mov %rdx,VMX_MACHINE_T_GUEST_RDI_OFFSET(%rdi)
/* Save RDI to RSI for later SPEC_CTRL save*/
mov %rdi,%rsi
vm_eval_error:
/* Restore host GPR System V required registers */
pop %rdi
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdx
/* Check vm fail, refer to 64-ia32 spec section 26.2 in volume#3 */
mov $VM_FAIL,%rax
jc vm_return
jz vm_return
/* Clear host registers to prevent speculative use */
xor %rcx,%rcx
xor %r8,%r8
xor %r9,%r9
xor %r10,%r10
xor %r11,%r11
/* rdx = ibrs_type */
/* IBRS_NONE: no ibrs setting, just flush rsb
* IBRS_RAW: set IBRS then flush rsb
* IBRS_OPT: set STIBP & IBPB then flush rsb
*/
cmp $IBRS_NONE,%rdx
je stuff_rsb
cmp $IBRS_OPT,%rdx
je ibrs_opt
/* Save guest MSR SPEC_CTRL, low 32 bit is enough */
movl $MSR_IA32_SPEC_CTRL,%ecx
rdmsr
mov %rax,VMX_MACHINE_T_GUEST_SPEC_CTRL_OFFSET(%rsi)
movl $SPEC_ENABLE_IBRS,%eax
movl $0,%edx
wrmsr
jmp stuff_rsb
ibrs_opt:
movl $MSR_IA32_PRED_CMD,%ecx
movl $PRED_SET_IBPB,%eax
movl $0,%edx
wrmsr
/* Save guest MSR SPEC_CTRL, low 32 bit is enough */
movl $MSR_IA32_SPEC_CTRL,%ecx
rdmsr
mov %rax,VMX_MACHINE_T_GUEST_SPEC_CTRL_OFFSET(%rsi)
movl $SPEC_ENABLE_STIBP,%eax
movl $0,%edx
wrmsr
/* Stuff the RSB with 32 CALLs. Make sure no "ret" is executed before
 * this RSB stuffing; take care if code is ever inserted above this
 * point in a future update.
 */
stuff_rsb:
/* stuff 32 RSB, rax = 32/2 */
mov $16,%rax
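/* each loop iteration below issues two calls whose return addresses
 * fill the RSB but are never returned to; the pause/jmp loops trap
 * any speculative return to those addresses
 */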
.align 16
3:
call 4f
33:
pause
jmp 33b
.align 16
4:
call 5f
44:
pause
jmp 44b
.align 16
5: dec %rax
jnz 3b
/* stuff 32 RSB, rsp += 8*32 */
add $(8*32),%rsp
mov $VM_SUCCESS,%rax
vm_return:
/* Return to caller */
ret

1162
hypervisor/arch/x86/vtd.c Normal file

File diff suppressed because it is too large