acrn-hypervisor/hypervisor/arch/x86/pm.c
Wu Zhou fbdc2774af hv: add ACRN CPU frequency initializer
The design of ACRN CPU performance management is to let the hardware
do autonomous frequency selection (or set the frequency to a fixed
value), and to remove the guest's ability to control CPU frequency.

This patch implements the CPU frequency initializer, which sets up the
CPU frequency based on the performance policy type.

Two performance policy types are provided for the user to choose from:
  - 'Performance': the CPU runs at its maximum frequency. Hardware
    autonomous frequency selection is enabled if HWP is present.
  - 'Nominal': the CPU runs at its guaranteed frequency.

The policy type is passed to the hypervisor through a boot parameter,
as either 'cpu_perf_policy=Nominal' or 'cpu_perf_policy=Performance'.
The default type is 'Performance'.
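
For illustration, assuming ACRN is loaded via GRUB's multiboot2
command (the binary path here is hypothetical; the exact syntax
depends on the boot chain), the parameter is appended to the
hypervisor's command line:

  multiboot2 /boot/acrn.bin cpu_perf_policy=Nominal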

Both HWP and ACPI P-states are supported. HWP is the first choice, for
it provides hardware autonomous frequency selection while keeping
frequency transition time low.
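
As background, a minimal sketch of how the HWP levels used below could
be read at runtime. The CPUID leaf 0x6 HWP bit and the
IA32_HWP_CAPABILITIES layout are per the Intel SDM; whether msr.h
defines MSR_IA32_HWP_CAPABILITIES is an assumption (ACRN instead takes
these levels from config-tools):

  /* Valid only after IA32_PM_ENABLE[0] is set, which unlocks HWP MSRs. */
  static void read_hwp_levels(uint8_t *highest, uint8_t *guaranteed, uint8_t *lowest)
  {
          /* IA32_HWP_CAPABILITIES (0x771): bits 7:0 highest level,
           * 15:8 guaranteed level, 31:24 lowest level.
           */
          uint64_t caps = msr_read(MSR_IA32_HWP_CAPABILITIES);

          *highest = (uint8_t)(caps & 0xffUL);
          *guaranteed = (uint8_t)((caps >> 8U) & 0xffUL);
          *lowest = (uint8_t)((caps >> 24U) & 0xffUL);
  }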

Two functions are added for the hypervisor to call:
  - init_frequency_policy(): called by the BSP at startup. It parses
    the boot parameters, and enables HWP if it is present.
  - apply_frequency_policy(): called after init_frequency_policy().
    It applies the initial CPU frequency policy setting on each core. It
    uses a set of per-CPU frequency-limit structures (sketched below) to
    quickly decide what the highest/nominal frequency is. The frequency
    limits are generated by config-tools.
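
For reference, a sketch of that per-CPU limits structure, with field
names taken from their uses in apply_frequency_policy(); the exact
field widths in the generated board config are an assumption:

  struct acrn_cpufreq_limits {
          uint8_t highest_hwp_lvl;     /* max HWP performance level */
          uint8_t lowest_hwp_lvl;      /* min HWP performance level */
          uint8_t guaranteed_hwp_lvl;  /* guaranteed/nominal HWP level */
          uint8_t performance_pstate;  /* _PSS index of highest P-state */
          uint8_t nominal_pstate;      /* _PSS index of nominal P-state */
  };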

The hypervisor will not be governing CPU frequency after the initial
policy is applied.

Cores running RTVMs are fixed to the nominal/guaranteed frequency, to
get more deterministic latency. This is done by setting the core's
frequency limits to highest=lowest=nominal in config-tools.
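
For example, config-tools would pin an RTVM core's entry in the
generated cpufreq_limits[] like this (the index name and the level
value 0x18 are hypothetical):

  [RTVM_PCPU_ID] = {
          .highest_hwp_lvl    = 0x18U,
          .lowest_hwp_lvl     = 0x18U,
          .guaranteed_hwp_lvl = 0x18U,
  },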

Tracked-On: #8168
Signed-off-by: Wu Zhou <wu.zhou@intel.com>
Acked-by: Eddie Dong <eddie.dong@intel.com>
2022-10-08 11:13:21 +08:00

/*
 * Copyright (C) 2018-2022 Intel Corporation.
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include <acrn_common.h>
#include <asm/default_acpi_info.h>
#include <platform_acpi_info.h>
#include <asm/per_cpu.h>
#include <asm/io.h>
#include <asm/msr.h>
#include <asm/pgtable.h>
#include <asm/host_pm.h>
#include <asm/trampoline.h>
#include <asm/vmx.h>
#include <console.h>
#include <asm/ioapic.h>
#include <asm/vtd.h>
#include <asm/lapic.h>
#include <asm/tsc.h>
#include <delay.h>
#include <asm/board.h>
#include <asm/cpuid.h>

struct cpu_context cpu_ctx;

/* The values in this structure should come from host ACPI table */
static struct pm_s_state_data host_pm_s_state = {
	.pm1a_evt = {
		.space_id = PM1A_EVT_SPACE_ID,
		.bit_width = PM1A_EVT_BIT_WIDTH,
		.bit_offset = PM1A_EVT_BIT_OFFSET,
		.access_size = PM1A_EVT_ACCESS_SIZE,
		.address = PM1A_EVT_ADDRESS
	},
	.pm1b_evt = {
		.space_id = PM1B_EVT_SPACE_ID,
		.bit_width = PM1B_EVT_BIT_WIDTH,
		.bit_offset = PM1B_EVT_BIT_OFFSET,
		.access_size = PM1B_EVT_ACCESS_SIZE,
		.address = PM1B_EVT_ADDRESS
	},
	.pm1a_cnt = {
		.space_id = PM1A_CNT_SPACE_ID,
		.bit_width = PM1A_CNT_BIT_WIDTH,
		.bit_offset = PM1A_CNT_BIT_OFFSET,
		.access_size = PM1A_CNT_ACCESS_SIZE,
		.address = PM1A_CNT_ADDRESS
	},
	.pm1b_cnt = {
		.space_id = PM1B_CNT_SPACE_ID,
		.bit_width = PM1B_CNT_BIT_WIDTH,
		.bit_offset = PM1B_CNT_BIT_OFFSET,
		.access_size = PM1B_CNT_ACCESS_SIZE,
		.address = PM1B_CNT_ADDRESS
	},
	.s3_pkg = {
		.val_pm1a = S3_PKG_VAL_PM1A,
		.val_pm1b = S3_PKG_VAL_PM1B,
		.reserved = S3_PKG_RESERVED
	},
	.s5_pkg = {
		.val_pm1a = S5_PKG_VAL_PM1A,
		.val_pm1b = S5_PKG_VAL_PM1B,
		.reserved = S5_PKG_RESERVED
	},
	.wake_vector_32 = (uint32_t *)WAKE_VECTOR_32,
	.wake_vector_64 = (uint64_t *)WAKE_VECTOR_64
};

/* host reset register defined in ACPI */
static struct acpi_reset_reg host_reset_reg = {
	.reg = {
		.space_id = RESET_REGISTER_SPACE_ID,
		.bit_width = RESET_REGISTER_BIT_WIDTH,
		.bit_offset = RESET_REGISTER_BIT_OFFSET,
		.access_size = RESET_REGISTER_ACCESS_SIZE,
		.address = RESET_REGISTER_ADDRESS,
	},
	.val = RESET_REGISTER_VALUE
};

struct pm_s_state_data *get_host_sstate_data(void)
{
	return &host_pm_s_state;
}

struct acpi_reset_reg *get_host_reset_reg_data(void)
{
	return &host_reset_reg;
}
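
/* Restore MSRs that hardware does not preserve across S3; currently only
 * the stack-protector canary base in MSR_IA32_FS_BASE needs this.
 */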
void restore_msrs(void)
{
#ifdef STACK_PROTECTOR
	struct stack_canary *psc = &get_cpu_var(stk_canary);

	msr_write(MSR_IA32_FS_BASE, (uint64_t)psc);
#endif
}
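
/* The PM1a/PM1b event and control registers are 16 bits wide per the ACPI
 * spec, so 16-bit accesses are sufficient for both helpers below.
 */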
static void acpi_gas_write(const struct acrn_acpi_generic_address *gas, uint32_t val)
{
	uint16_t val16 = (uint16_t)val;

	if (gas->space_id == SPACE_SYSTEM_MEMORY) {
		mmio_write16(val16, hpa2hva(gas->address));
	} else {
		pio_write16(val16, (uint16_t)gas->address);
	}
}

static uint32_t acpi_gas_read(const struct acrn_acpi_generic_address *gas)
{
	uint32_t ret = 0U;

	if (gas->space_id == SPACE_SYSTEM_MEMORY) {
		ret = mmio_read16(hpa2hva(gas->address));
	} else {
		ret = pio_read16((uint16_t)gas->address);
	}

	return ret;
}

/* This function supports entering S3 or S5 according to the values given in pm1a_cnt_val and pm1b_cnt_val */
void do_acpi_sx(const struct pm_s_state_data *sstate_data, uint32_t pm1a_cnt_val, uint32_t pm1b_cnt_val)
{
	uint32_t s1, s2;

	acpi_gas_write(&(sstate_data->pm1a_cnt), pm1a_cnt_val);

	if (sstate_data->pm1b_cnt.address != 0U) {
		acpi_gas_write(&(sstate_data->pm1b_cnt), pm1b_cnt_val);
	}

	do {
		/* Poll the PM1 status register to detect whether
		 * entering the Sx state was interrupted by a wakeup event.
		 */
		s1 = acpi_gas_read(&(sstate_data->pm1a_evt));
		if (sstate_data->pm1b_evt.address != 0U) {
			s2 = acpi_gas_read(&(sstate_data->pm1b_evt));
			s1 |= s2;
		}

		/* According to ACPI spec 4.8.3.1.1 PM1 state register, the
		 * WAK_STS bit (bit 15) is set if the system will transition
		 * to the working state.
		 */
	} while ((s1 & (1U << BIT_WAK_STS)) == 0U);
}

static uint32_t system_pm1a_cnt_val, system_pm1b_cnt_val;

void save_s5_reg_val(uint32_t pm1a_cnt_val, uint32_t pm1b_cnt_val)
{
	system_pm1a_cnt_val = pm1a_cnt_val;
	system_pm1b_cnt_val = pm1b_cnt_val;
}

void shutdown_system(void)
{
	struct pm_s_state_data *sx_data = get_host_sstate_data();

	do_acpi_sx(sx_data, system_pm1a_cnt_val, system_pm1b_cnt_val);
}

static void suspend_tsc(__unused void *data)
{
	per_cpu(tsc_suspend, get_pcpu_id()) = rdtsc();
}

static void resume_tsc(__unused void *data)
{
	msr_write(MSR_IA32_TIME_STAMP_COUNTER, per_cpu(tsc_suspend, get_pcpu_id()));
}

void host_enter_s3(const struct pm_s_state_data *sstate_data, uint32_t pm1a_cnt_val, uint32_t pm1b_cnt_val)
{
	uint64_t pmain_entry_saved;

	stac();
	/* set ACRN wakeup vec instead */
	*(sstate_data->wake_vector_32) = (uint32_t)get_trampoline_start16_paddr();
	clac();

	/* Save the TSC on all physical CPUs */
	smp_call_function(get_active_pcpu_bitmap(), suspend_tsc, NULL);

	/* offline all APs */
	stop_pcpus();

	stac();
	/* Save the default main entry and restore it after coming back from
	 * S3, so that APs coming online jump to the correct main entry.
	 */
	pmain_entry_saved = read_trampoline_sym(main_entry);

	/* Set the main entry for resume from the S3 state */
	write_trampoline_sym(main_entry, (uint64_t)restore_s3_context);
	clac();

	CPU_IRQ_DISABLE_ON_CONFIG();
	vmx_off();

	suspend_console();
	suspend_ioapic();
	suspend_iommu();
	suspend_lapic();

	asm_enter_s3(sstate_data, pm1a_cnt_val, pm1b_cnt_val);

	resume_lapic();
	resume_iommu();
	resume_ioapic();

	vmx_on();
	CPU_IRQ_ENABLE_ON_CONFIG();

	/* restore the default main entry */
	stac();
	write_trampoline_sym(main_entry, pmain_entry_saved);
	clac();

	/* online all APs again */
	if (!start_pcpus(AP_MASK)) {
		panic("Failed to start all APs!");
	}

	/* Restore the TSC on all physical CPUs.
	 * Caution: no timer should be set up before the TSC is resumed.
	 */
	smp_call_function(get_active_pcpu_bitmap(), resume_tsc, NULL);

	/* The console must be resumed after the TSC is restored, since it
	 * sets up its timer based on the TSC.
	 */
	resume_console();
}

void reset_host(void)
{
	struct acrn_acpi_generic_address *gas = &(host_reset_reg.reg);

	/* TODO: gracefully shut down all guests before doing host reset. */

	/*
	 * Assumption:
	 * The platform we are running on must support at least one of these
	 * reset methods:
	 * - ACPI reset
	 * - 0xcf9 reset
	 *
	 * UEFI is more likely to set the reset value as 0x6 (not 0xe) for the
	 * 0xcf9 port. This asserts PLTRST# to reset devices on the platform,
	 * but not the SLP_S3#/4#/5# signals, which power down the system.
	 * This might not be enough for us.
	 */
	if ((gas->space_id == SPACE_SYSTEM_IO) &&
		(gas->bit_width == 8U) && (gas->bit_offset == 0U) &&
		(gas->address != 0U) && (gas->address != 0xcf9U)) {
		pio_write8(host_reset_reg.val, (uint16_t)host_reset_reg.reg.address);
	} else {
		/* Make sure bit 2 (RST_CPU) is '0' when the reset command is issued. */
		pio_write8(0x2U, 0xcf9U);
		udelay(50U);
		pio_write8(0xeU, 0xcf9U);
	}

	pr_fatal("%s(): can't reset host.", __func__);
	while (1) {
		asm_pause();
	}
}

static enum acrn_cpufreq_policy_type cpufreq_policy = CPUFREQ_POLICY_PERFORMANCE;

void init_frequency_policy(void)
{
	uint32_t cpuid_06_eax, unused;
	struct acrn_boot_info *abi = get_acrn_boot_info();
	const char *cmd_src = abi->cmdline;

	/*
	 * Parse the cmdline and decide which policy type to use.
	 * The user can specify either cpu_perf_policy=Nominal or
	 * cpu_perf_policy=Performance. The default type is 'Performance'.
	 */
	if (strstr_s(cmd_src, MAX_BOOTARGS_SIZE, "cpu_perf_policy=Nominal", 24U) != NULL) {
		cpufreq_policy = CPUFREQ_POLICY_NOMINAL;
	}

	cpuid_subleaf(0x6U, 0U, &cpuid_06_eax, &unused, &unused, &unused);
	if ((cpuid_06_eax & CPUID_EAX_HWP) != 0U) {
		/* If HWP is available, enable HWP early. This unlocks the other HWP MSRs. */
		msr_write(MSR_IA32_PM_ENABLE, 1U);
	}
}

/*
 * This function is to be called by each pcpu after init_frequency_policy().
 * It applies the frequency policy, which can be specified via boot parameters:
 * - cpu_perf_policy=Performance: HWP autonomous selection, between the highest
 *   and lowest HWP levels. If HWP is not available, the frequency is fixed to
 *   the highest P-state.
 * - cpu_perf_policy=Nominal: the frequency is fixed to the guaranteed HWP
 *   level or the nominal P-state.
 * The default policy is 'Performance'.
 *
 * ACRN will not be governing the pcpu's frequency after this.
 */
void apply_frequency_policy(void)
{
	struct acrn_cpufreq_limits *limits = &cpufreq_limits[get_pcpu_id()];
	uint64_t highest_lvl_req = limits->highest_hwp_lvl, lowest_lvl_req = limits->lowest_hwp_lvl, reg;
	uint8_t pstate_req = limits->performance_pstate;
	uint32_t cpuid_06_eax, cpuid_01_ecx, unused;

	cpuid_subleaf(0x6U, 0U, &cpuid_06_eax, &unused, &unused, &unused);
	cpuid_subleaf(0x1U, 0U, &unused, &unused, &cpuid_01_ecx, &unused);

	/* Both HWP and ACPI P-states are supported. HWP is the first choice. */
	if ((cpuid_06_eax & CPUID_EAX_HWP) != 0U) {
		/*
		 * For the Performance policy (default): the CPU frequency is autonomously
		 * selected between the highest and lowest levels.
		 * For the Nominal policy: set a fixed frequency by letting
		 * highest = lowest = guaranteed.
		 */
		if (cpufreq_policy == CPUFREQ_POLICY_NOMINAL) {
			highest_lvl_req = limits->guaranteed_hwp_lvl;
			lowest_lvl_req = limits->guaranteed_hwp_lvl;
		}

		/* EPP (0x80: default) | Desired_Performance (0: HWP auto) | Maximum_Performance | Minimum_Performance */
		reg = (0x80UL << 24U) | (0x00UL << 16U) | (highest_lvl_req << 8U) | lowest_lvl_req;
		msr_write(MSR_IA32_HWP_REQUEST, reg);
	} else if ((cpuid_01_ecx & CPUID_ECX_EST) != 0U) {
		struct cpu_state_info *pm_s_state_data = get_cpu_pm_state_info();

		/*
		 * Set a fixed frequency in ACPI P-state mode.
		 * Performance policy: performance_pstate
		 * Nominal policy: nominal_pstate
		 */
		if (cpufreq_policy == CPUFREQ_POLICY_NOMINAL) {
			pstate_req = limits->nominal_pstate;
		}

		/* PX info might be missing on some platforms (px_cnt equals 0). Do nothing if so. */
		if (pm_s_state_data->px_cnt != 0) {
			if (pstate_req < pm_s_state_data->px_cnt) {
				msr_write(MSR_IA32_PERF_CTL, pm_s_state_data->px_data[pstate_req].control);
			} else {
				ASSERT(false, "invalid p-state index");
			}
		}
	} else {
		/* No frequency interface is present; let the CPU run as it is. Do nothing here. */
	}
}