acrn-hypervisor/hypervisor/common/vm_load.c
Victor Sun 4702e743a8 HV: vm_load: change kernel type for zephyr image
Previously we only supported loading a raw format zephyr image as the
prelaunched Zephyr VM; this caused the guest F segment to be overridden,
because the zephyr raw image covers the memory space from 0x1000 up beyond
0x100000. To fix this issue, we should support loading ELF format images so
that the multiple segments can be parsed and loaded from the ELF image
directly.

Tracked-On: #6323

Signed-off-by: Victor Sun <victor.sun@intel.com>
Acked-by: Eddie Dong <eddie.dong@intel.com>
2021-08-18 13:30:48 +08:00

/*
 * Copyright (C) 2018 Intel Corporation. All rights reserved.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include <asm/guest/vm.h>
#include <asm/e820.h>
#include <asm/zeropage.h>
#include <asm/guest/ept.h>
#include <asm/mmu.h>
#include <boot.h>
#include <vacpi.h>
#include <efi_mmap.h>
#include <errno.h>
#include <logmsg.h>

#define DBG_LEVEL_VMLOAD	6U

/* Define a 32KB memory block to store LaaG VM load params in guest address space.
 * The params include:
 *	Init GDT entries                    : 1KB (must be 8-byte aligned)
 *	Linux zeropage                      : 4KB
 *	Boot cmdline                        : 2KB
 *	EFI memory map                      : 12KB
 *	Reserved region for trampoline code : 8KB
 * Each param should be kept 8-byte aligned and the total region size should be less than 32KB
 * so that it can be put below MEM_1M.
 * Please note that in a Linux VM, the last 8KB below MEM_1M is for trampoline code. The block
 * should be able to accommodate it, so that trampoline corruption is avoided.
 */
#define BZIMG_LOAD_PARAMS_SIZE			(MEM_4K * 8)
#define BZIMG_INITGDT_GPA(load_params_gpa)	(load_params_gpa + 0UL)
#define BZIMG_ZEROPAGE_GPA(load_params_gpa)	(load_params_gpa + MEM_1K)
#define BZIMG_CMDLINE_GPA(load_params_gpa)	(load_params_gpa + MEM_1K + MEM_4K)
#define BZIMG_EFIMMAP_GPA(load_params_gpa)	(load_params_gpa + MEM_1K + MEM_4K + MEM_2K)
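
/* For reference, a sketch of the layout implied by the macros above, relative
 * to load_params_gpa (assuming the block is placed right below MEM_1M so its
 * last 8KB coincide with the trampoline region):
 *	+0x0000 .. +0x0400 : init GDT (1KB)
 *	+0x0400 .. +0x1400 : Linux zeropage (4KB)
 *	+0x1400 .. +0x1C00 : boot cmdline (2KB)
 *	+0x1C00 .. +0x4C00 : EFI memory map (12KB)
 *	+0x6000 .. +0x8000 : reserved for trampoline code (8KB)
 */
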
/* TODO:
 * The value is referenced from the Linux boot protocol for old kernels,
 * but this should be configurable for different OSes. */
#define DEFAULT_RAMDISK_GPA_MAX		0x37ffffffUL

#define PRE_VM_MAX_RAM_ADDR_BELOW_4GB	(VIRT_ACPI_DATA_ADDR - 1U)

static void *get_initrd_load_addr(struct acrn_vm *vm, uint64_t kernel_start)
{
	uint64_t ramdisk_load_gpa = INVALID_GPA;
	uint64_t ramdisk_gpa_max = DEFAULT_RAMDISK_GPA_MAX;
	struct zero_page *zeropage = (struct zero_page *)vm->sw.kernel_info.kernel_src_addr;
	uint32_t kernel_init_size, kernel_align, initrd_addr_max;
	uint64_t kernel_end;

	/* Per the Linux boot protocol, the kernel needs a size of contiguous
	 * memory (i.e. the init_size field in the zeropage) from its extract address to boot,
	 * and the initrd_addr_max field specifies the maximum address of the ramdisk.
	 * Per the kernel source head_64.S, the decompressed kernel starts at the compressed
	 * kernel load address rounded up to 2M alignment.
	 */
	stac();
	kernel_init_size = zeropage->hdr.init_size;
	kernel_align = zeropage->hdr.kernel_alignment;
	initrd_addr_max = zeropage->hdr.initrd_addr_max;
	clac();
	kernel_end = roundup(kernel_start, kernel_align) + kernel_init_size;
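	/* Worked example with hypothetical values: kernel_start = 0x1000000 (16M),
	 * kernel_align = 0x200000 (2M) and kernel_init_size = 0x2800000 give
	 * kernel_end = roundup(0x1000000, 0x200000) + 0x2800000 = 0x3800000.
	 */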
	if (initrd_addr_max != 0U) {
		ramdisk_gpa_max = initrd_addr_max;
	}

	if (is_sos_vm(vm)) {
		uint64_t mods_start, mods_end;

		get_boot_mods_range(&mods_start, &mods_end);
		mods_start = sos_vm_hpa2gpa(mods_start);
		mods_end = sos_vm_hpa2gpa(mods_end);

		if (vm->sw.ramdisk_info.src_addr != NULL) {
			ramdisk_load_gpa = sos_vm_hpa2gpa((uint64_t)vm->sw.ramdisk_info.src_addr);
		}

		/* For the SOS VM, the ramdisk has been loaded by the bootloader, so in most cases
		 * there is no need to do a gpa copy again. But in the case that the ramdisk is
		 * loaded by the bootloader at an address higher than its limit, we should do a gpa
		 * copy then.
		 */
		if ((ramdisk_load_gpa + vm->sw.ramdisk_info.size) > ramdisk_gpa_max) {
			/* In this case, mods_end must be higher than ramdisk_gpa_max,
			 * so try to locate the ramdisk between MEM_1M and mods_start/kernel_start,
			 * or try the range between kernel_end and mods_start;
			 */
			ramdisk_load_gpa = find_space_from_ve820(vm, vm->sw.ramdisk_info.size,
					MEM_1M, min(min(mods_start, kernel_start), ramdisk_gpa_max));
			if ((ramdisk_load_gpa == INVALID_GPA) && (kernel_end < min(mods_start, ramdisk_gpa_max))) {
				ramdisk_load_gpa = find_space_from_ve820(vm, vm->sw.ramdisk_info.size,
						kernel_end, min(mods_start, ramdisk_gpa_max));
			}
		}
	} else {
		/* For a pre-launched VM, the ramdisk is placed by searching the ve820 table. */
		ramdisk_gpa_max = min(PRE_VM_MAX_RAM_ADDR_BELOW_4GB, ramdisk_gpa_max);

		if (kernel_end < ramdisk_gpa_max) {
			ramdisk_load_gpa = find_space_from_ve820(vm, vm->sw.ramdisk_info.size,
					kernel_end, ramdisk_gpa_max);
		}

		if (ramdisk_load_gpa == INVALID_GPA) {
			ramdisk_load_gpa = find_space_from_ve820(vm, vm->sw.ramdisk_info.size,
					MEM_1M, min(kernel_start, ramdisk_gpa_max));
		}
	}

	if (ramdisk_load_gpa == INVALID_GPA) {
		pr_err("no space in guest memory to load VM %d ramdisk", vm->vm_id);
		vm->sw.ramdisk_info.size = 0U;
	}

	dev_dbg(DBG_LEVEL_VMLOAD, "VM%d ramdisk load_addr: 0x%lx", vm->vm_id, ramdisk_load_gpa);

	return (ramdisk_load_gpa == INVALID_GPA) ? NULL : (void *)ramdisk_load_gpa;
}

/**
 * @pre vm != NULL
 */
static void *get_bzimage_kernel_load_addr(struct acrn_vm *vm)
{
	void *load_addr = NULL;
	struct vm_sw_info *sw_info = &vm->sw;
	struct zero_page *zeropage;

	/* According to the explanation of pref_address
	 * in Documentation/x86/boot.txt, a relocating
	 * bootloader should attempt to load the kernel at pref_address
	 * if possible. A non-relocatable kernel will unconditionally
	 * move itself to run at this address.
	 */
	zeropage = (struct zero_page *)sw_info->kernel_info.kernel_src_addr;

	stac();
	if ((is_sos_vm(vm)) && (zeropage->hdr.relocatable_kernel != 0U)) {
		uint64_t mods_start, mods_end;
		uint64_t kernel_load_gpa = INVALID_GPA;
		uint32_t kernel_align = zeropage->hdr.kernel_alignment;
		uint32_t kernel_init_size = zeropage->hdr.init_size;
		/* Because the kernel load address needs to be rounded up to kernel_align,
		 * whereas find_space_from_ve820() can only return a page aligned address,
		 * we enlarge the needed size to (kernel_init_size + kernel_align).
		 */
		uint32_t kernel_size = kernel_init_size + kernel_align;
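		/* Illustrative example: with a hypothetical init_size of 0x1000000 and a
		 * 2M alignment, we search for 0x1200000 bytes, so that any page aligned
		 * hit still contains a 2M aligned start for the kernel.
		 */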
		get_boot_mods_range(&mods_start, &mods_end);
		mods_start = sos_vm_hpa2gpa(mods_start);
		mods_end = sos_vm_hpa2gpa(mods_end);

		/* TODO: support loading the kernel when modules are beyond the 4GB space. */
		if (mods_end < MEM_4G) {
			kernel_load_gpa = find_space_from_ve820(vm, kernel_size, MEM_1M, mods_start);
			if (kernel_load_gpa == INVALID_GPA) {
				kernel_load_gpa = find_space_from_ve820(vm, kernel_size, mods_end, MEM_4G);
			}
		}

		if (kernel_load_gpa != INVALID_GPA) {
			load_addr = (void *)roundup((uint64_t)kernel_load_gpa, kernel_align);
		}
	} else {
		load_addr = (void *)zeropage->hdr.pref_addr;
		if (is_sos_vm(vm)) {
			/* The non-relocatable SOS kernel might overlap with boot modules. */
			pr_err("Non-relocatable kernel found, risk to boot!");
		}
	}
	clac();

	if (load_addr == NULL) {
		pr_err("Could not get kernel load addr of VM %d .", vm->vm_id);
	}

	dev_dbg(DBG_LEVEL_VMLOAD, "VM%d kernel load_addr: 0x%lx", vm->vm_id, load_addr);

	return load_addr;
}

/**
 * @pre vm != NULL && efi_mmap_desc != NULL
 */
static uint16_t create_sos_vm_efi_mmap_desc(struct acrn_vm *vm, struct efi_memory_desc *efi_mmap_desc)
{
	uint16_t i, desc_idx = 0U;
	const struct efi_memory_desc *hv_efi_mmap_desc = get_efi_mmap_entry();

	for (i = 0U; i < get_efi_mmap_entries_count(); i++) {
		/* Native efi mmap descs of the types below should be kept as-is for the SOS VM */
		if ((hv_efi_mmap_desc[i].type == EFI_RESERVED_MEMORYTYPE)
				|| (hv_efi_mmap_desc[i].type == EFI_UNUSABLE_MEMORY)
				|| (hv_efi_mmap_desc[i].type == EFI_ACPI_RECLAIM_MEMORY)
				|| (hv_efi_mmap_desc[i].type == EFI_ACPI_MEMORY_NVS)
				|| (hv_efi_mmap_desc[i].type == EFI_BOOT_SERVICES_CODE)
				|| (hv_efi_mmap_desc[i].type == EFI_BOOT_SERVICES_DATA)
				|| (hv_efi_mmap_desc[i].type == EFI_RUNTIME_SERVICES_CODE)
				|| (hv_efi_mmap_desc[i].type == EFI_RUNTIME_SERVICES_DATA)
				|| (hv_efi_mmap_desc[i].type == EFI_MEMORYMAPPED_IO)
				|| (hv_efi_mmap_desc[i].type == EFI_MEMORYMAPPED_IOPORTSPACE)
				|| (hv_efi_mmap_desc[i].type == EFI_PALCODE)
				|| (hv_efi_mmap_desc[i].type == EFI_PERSISTENT_MEMORY)) {
			efi_mmap_desc[desc_idx] = hv_efi_mmap_desc[i];
			desc_idx++;
		}
	}

	for (i = 0U; i < vm->e820_entry_num; i++) {
		/* Memory regions with an e820 type of RAM can act as EFI_CONVENTIONAL_MEMORY
		 * for the SOS VM; the regions occupied by the HV and pre-launched VMs have been
		 * filtered out already, so this is safe for the SOS VM.
		 * As the SOS VM starts to run after the efi call ExitBootServices(), the types
		 * EFI_LOADER_CODE and EFI_LOADER_DATA, which have been mapped to E820_TYPE_RAM,
		 * are not needed.
		 */
		if (vm->e820_entries[i].type == E820_TYPE_RAM) {
			efi_mmap_desc[desc_idx].type = EFI_CONVENTIONAL_MEMORY;
			efi_mmap_desc[desc_idx].phys_addr = vm->e820_entries[i].baseaddr;
			efi_mmap_desc[desc_idx].virt_addr = vm->e820_entries[i].baseaddr;
			efi_mmap_desc[desc_idx].num_pages = vm->e820_entries[i].length / PAGE_SIZE;
			efi_mmap_desc[desc_idx].attribute = EFI_MEMORY_WB;
			desc_idx++;
		}
	}

	for (i = 0U; i < desc_idx; i++) {
		pr_dbg("SOS VM efi mmap desc[%d]: addr: 0x%lx, len: 0x%lx, type: %d", i,
			efi_mmap_desc[i].phys_addr, efi_mmap_desc[i].num_pages * PAGE_SIZE, efi_mmap_desc[i].type);
	}

	return desc_idx;
}

/**
 * @pre zp != NULL && vm != NULL
 */
static uint32_t create_zeropage_e820(struct zero_page *zp, const struct acrn_vm *vm)
{
	uint32_t entry_num = vm->e820_entry_num;
	struct e820_entry *zp_e820 = zp->entries;
	const struct e820_entry *vm_e820 = vm->e820_entries;

	if ((zp_e820 == NULL) || (vm_e820 == NULL) || (entry_num == 0U) || (entry_num > E820_MAX_ENTRIES)) {
		pr_err("e820 create error");
		entry_num = 0U;
	} else {
		(void)memcpy_s((void *)zp_e820, entry_num * sizeof(struct e820_entry),
			(void *)vm_e820, entry_num * sizeof(struct e820_entry));
	}
	return entry_num;
}

/**
 * @pre vm != NULL
 * @pre (vm->min_mem_addr <= kernel_load_addr) && (kernel_load_addr < vm->max_mem_addr)
 */
static uint64_t create_zero_page(struct acrn_vm *vm, uint64_t load_params_gpa)
{
	struct zero_page *zeropage, *hva;
	struct sw_kernel_info *sw_kernel = &(vm->sw.kernel_info);
	struct sw_module_info *bootargs_info = &(vm->sw.bootargs_info);
	struct sw_module_info *ramdisk_info = &(vm->sw.ramdisk_info);
	uint64_t gpa, addr;

	gpa = BZIMG_ZEROPAGE_GPA(load_params_gpa);
	hva = (struct zero_page *)gpa2hva(vm, gpa);
	zeropage = hva;

	stac();
	/* clear the zeropage */
	(void)memset(zeropage, 0U, MEM_2K);

#ifdef CONFIG_MULTIBOOT2
	if (is_sos_vm(vm)) {
		struct acrn_boot_info *abi = get_acrn_boot_info();

		if (boot_from_uefi(abi)) {
			struct efi_info *sos_efi_info = &zeropage->boot_efi_info;
			uint64_t efi_mmap_gpa = BZIMG_EFIMMAP_GPA(load_params_gpa);
			struct efi_memory_desc *efi_mmap_desc = (struct efi_memory_desc *)gpa2hva(vm, efi_mmap_gpa);
			uint16_t efi_mmap_desc_nr = create_sos_vm_efi_mmap_desc(vm, efi_mmap_desc);

			sos_efi_info->loader_signature = 0x34364c45; /* "EL64" */
			sos_efi_info->memdesc_version = abi->uefi_info.memdesc_version;
			sos_efi_info->memdesc_size = sizeof(struct efi_memory_desc);
			sos_efi_info->memmap_size = efi_mmap_desc_nr * sizeof(struct efi_memory_desc);
			sos_efi_info->memmap = (uint32_t)efi_mmap_gpa;
			sos_efi_info->memmap_hi = (uint32_t)(efi_mmap_gpa >> 32U);
			sos_efi_info->systab = abi->uefi_info.systab;
			sos_efi_info->systab_hi = abi->uefi_info.systab_hi;
		}
	}
#endif

	/* copy part of the header into the zero page */
	hva = (struct zero_page *)sw_kernel->kernel_src_addr;
	(void)memcpy_s(&(zeropage->hdr), sizeof(zeropage->hdr),
		&(hva->hdr), sizeof(hva->hdr));

	/* See if the kernel has a RAM disk */
	if (ramdisk_info->src_addr != NULL) {
		/* Copy the ramdisk load_addr and size into the zeropage header structure */
		addr = (uint64_t)ramdisk_info->load_addr;
		zeropage->hdr.ramdisk_addr = (uint32_t)addr;
		zeropage->hdr.ramdisk_size = (uint32_t)ramdisk_info->size;
	}

	/* Copy the bootargs load_addr into the zeropage header structure */
	addr = (uint64_t)bootargs_info->load_addr;
	zeropage->hdr.bootargs_addr = (uint32_t)addr;

	/* set constant arguments in the zero page */
	zeropage->hdr.loader_type = 0xffU;
	zeropage->hdr.load_flags |= (1U << 5U); /* quiet */
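	/* Per the Linux boot protocol, type_of_loader 0xFF means "undefined"
	 * bootloader ID, and bit 5 of loadflags is QUIET_FLAG, asking the kernel
	 * to suppress early messages.
	 */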
	/* Create/add e820 table entries in zeropage */
	zeropage->e820_nentries = (uint8_t)create_zeropage_e820(zeropage, vm);
	clac();

	/* Return Physical Base Address of zeropage */
	return gpa;
}

/**
 * @pre sw_module != NULL
 */
static void load_sw_module(struct acrn_vm *vm, struct sw_module_info *sw_module)
{
	if ((sw_module->size != 0U) && (sw_module->load_addr != NULL)) {
		(void)copy_to_gpa(vm, sw_module->src_addr, (uint64_t)sw_module->load_addr, sw_module->size);
	}
}

/**
 * @pre vm != NULL
 */
static void prepare_loading_bzimage(struct acrn_vm *vm, struct acrn_vcpu *vcpu,
	uint64_t load_params_gpa, uint64_t kernel_load_gpa)
{
	uint32_t i;
	uint32_t prot_code_offset, prot_code_size, kernel_entry_offset;
	uint8_t setup_sectors;
	const struct acrn_vm_config *vm_config = get_vm_config(vm->vm_id);
	struct sw_kernel_info *sw_kernel = &(vm->sw.kernel_info);
	struct sw_module_info *bootargs_info = &(vm->sw.bootargs_info);
	struct sw_module_info *ramdisk_info = &(vm->sw.ramdisk_info);
	struct sw_module_info *acpi_info = &(vm->sw.acpi_info);
	struct zero_page *zeropage = (struct zero_page *)sw_kernel->kernel_src_addr;

	/* The bzImage file consists of three parts:
	 * boot_params (i.e. the zero page) + real mode setup code + compressed protected mode code.
	 * The compressed protected mode code starts at offset (setup_sectors + 1U) * 512U of the bzImage.
	 * Only the protected mode code needs to be loaded.
	 */
	stac();
	setup_sectors = (zeropage->hdr.setup_sects == 0U) ? 4U : zeropage->hdr.setup_sects;
	clac();
	prot_code_offset = (uint32_t)(setup_sectors + 1U) * 512U;
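	/* Per the boot protocol, setup_sects == 0 means the historical default of 4.
	 * For example, with 4 setup sectors the protected mode code starts at
	 * (4 + 1) * 512 = 2560 bytes (0xA00) into the image.
	 */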
	prot_code_size = (sw_kernel->kernel_size > prot_code_offset) ?
			(sw_kernel->kernel_size - prot_code_offset) : 0U;

	/* Copy the protected mode part of the kernel code to its run-time location */
	(void)copy_to_gpa(vm, (sw_kernel->kernel_src_addr + prot_code_offset), kernel_load_gpa, prot_code_size);

	if (vm->sw.ramdisk_info.size > 0U) {
		/* Use the customer specified ramdisk load addr if it is configured in the VM
		 * configuration, otherwise use the allocated address calculated by the HV.
		 */
		if (vm_config->os_config.kernel_ramdisk_addr != 0UL) {
			vm->sw.ramdisk_info.load_addr = (void *)vm_config->os_config.kernel_ramdisk_addr;
		} else {
			vm->sw.ramdisk_info.load_addr = (void *)get_initrd_load_addr(vm, kernel_load_gpa);
			if (vm->sw.ramdisk_info.load_addr == NULL) {
				pr_err("failed to load initrd for VM%d !", vm->vm_id);
			}
		}

		/* No need to load the ramdisk if src_addr and load_addr point to the same place. */
		if (gpa2hva(vm, (uint64_t)ramdisk_info->load_addr) != ramdisk_info->src_addr) {
			load_sw_module(vm, ramdisk_info);
		}
	}

	bootargs_info->load_addr = (void *)BZIMG_CMDLINE_GPA(load_params_gpa);
	load_sw_module(vm, bootargs_info);

	/* Copy Guest OS ACPI to its load location */
	load_sw_module(vm, acpi_info);

	/* The 32-bit kernel entry is where the protected mode code was loaded */
	kernel_entry_offset = 0U;
	if (vcpu->arch.cpu_mode == CPU_MODE_64BIT) {
		/* The 64-bit entry is 512 bytes after the start */
		kernel_entry_offset += 512U;
	}
	sw_kernel->kernel_entry_addr = (void *)(kernel_load_gpa + kernel_entry_offset);

	/* Documentation states: ebx=0, edi=0, ebp=0, esi=ptr to zeropage */
	for (i = 0U; i < NUM_GPRS; i++) {
		vcpu_set_gpreg(vcpu, i, 0UL);
	}

	/* Create the zeropage and copy its Physical Base Address into RSI */
	vcpu_set_gpreg(vcpu, CPU_REG_RSI, create_zero_page(vm, load_params_gpa));
	pr_info("%s, RSI pointing to zero page for VM %d at GPA %X",
		__func__, vm->vm_id, vcpu_get_gpreg(vcpu, CPU_REG_RSI));
}

/**
 * @pre vm != NULL
 */
static void prepare_loading_rawimage(struct acrn_vm *vm)
{
	struct sw_kernel_info *sw_kernel = &(vm->sw.kernel_info);
	struct sw_module_info *acpi_info = &(vm->sw.acpi_info);
	const struct acrn_vm_config *vm_config = get_vm_config(vm->vm_id);
	uint64_t kernel_load_gpa;

	/* TODO: GPA 0 load support */
	kernel_load_gpa = vm_config->os_config.kernel_load_addr;

	/* Copy the guest kernel image to its run-time location */
	(void)copy_to_gpa(vm, sw_kernel->kernel_src_addr, kernel_load_gpa, sw_kernel->kernel_size);

	/* Copy Guest OS ACPI to its load location */
	load_sw_module(vm, acpi_info);

	sw_kernel->kernel_entry_addr = (void *)vm_config->os_config.kernel_entry_addr;
}

static int32_t vm_bzimage_loader(struct acrn_vm *vm)
{
	int32_t ret = -ENOMEM;
	/* get primary vcpu */
	struct acrn_vcpu *vcpu = vcpu_from_vid(vm, BSP_CPU_ID);
	uint64_t load_params_gpa = find_space_from_ve820(vm, BZIMG_LOAD_PARAMS_SIZE, MEM_4K, MEM_1M);
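	/* Note: the search range [MEM_4K, MEM_1M) keeps the load params below 1MB,
	 * as the layout comment above requires; starting at MEM_4K presumably skips
	 * page 0 (real mode IVT/BDA), though this rationale is not stated upstream.
	 */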
	if (load_params_gpa != INVALID_GPA) {
		uint64_t kernel_load_gpa = (uint64_t)get_bzimage_kernel_load_addr(vm);

		if (kernel_load_gpa != 0UL) {
			/* We boot bzImage from protected mode directly */
			init_vcpu_protect_mode_regs(vcpu, BZIMG_INITGDT_GPA(load_params_gpa));
			prepare_loading_bzimage(vm, vcpu, load_params_gpa, kernel_load_gpa);
			ret = 0;
		}
	}

	return ret;
}

static int32_t vm_rawimage_loader(struct acrn_vm *vm)
{
	int32_t ret = 0;
	uint64_t vgdt_gpa = 0x800UL;

	/*
	 * TODO:
	 * - We need to initialize the guest bsp registers according to the
	 *   guest boot mode (real mode vs protected mode).
	 * - The memory layout usage is unclear; only the GDT might be needed as a boot param.
	 *   Currently we only support Zephyr, which has no need for cmdline/e820/efimmap/etc.,
	 *   so hardcode the vGDT GPA to 0x800, which is not used by Zephyr so far.
	 */
	init_vcpu_protect_mode_regs(vcpu_from_vid(vm, BSP_CPU_ID), vgdt_gpa);

	prepare_loading_rawimage(vm);

	return ret;
}

/**
 * @pre vm != NULL
 */
int32_t vm_sw_loader(struct acrn_vm *vm)
{
	int32_t ret = 0;
	/* get primary vcpu */
	struct acrn_vcpu *vcpu = vcpu_from_vid(vm, BSP_CPU_ID);

	if (vm->sw.kernel_type == KERNEL_BZIMAGE) {
		ret = vm_bzimage_loader(vm);
	} else if (vm->sw.kernel_type == KERNEL_RAWIMAGE) {
		ret = vm_rawimage_loader(vm);
	} else {
		ret = -EINVAL;
	}

	if (ret == 0) {
		/* Set VCPU entry point to kernel entry */
		vcpu_set_rip(vcpu, (uint64_t)vm->sw.kernel_info.kernel_entry_addr);
		pr_info("%s, VM %hu VCPU %hu Entry: 0x%016lx ", __func__, vm->vm_id, vcpu->vcpu_id,
			vm->sw.kernel_info.kernel_entry_addr);
	}

	return ret;
}