diff --git a/kernel/configs/arm64_kata_kvm_4.19.x b/kernel/configs/arm64_kata_kvm_4.19.x index de2964b6af..28f4e03eda 100644 --- a/kernel/configs/arm64_kata_kvm_4.19.x +++ b/kernel/configs/arm64_kata_kvm_4.19.x @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/arm64 4.19.23 Kernel Configuration +# Linux/arm64 4.19.24 Kernel Configuration # # @@ -281,7 +281,10 @@ CONFIG_PCI_ECAM=y CONFIG_PCI_IOV=y # CONFIG_PCI_PRI is not set # CONFIG_PCI_PASID is not set +CONFIG_PCI_LABEL=y CONFIG_HOTPLUG_PCI=y +CONFIG_HOTPLUG_PCI_ACPI=y +# CONFIG_HOTPLUG_PCI_ACPI_IBM is not set CONFIG_HOTPLUG_PCI_CPCI=y CONFIG_HOTPLUG_PCI_SHPC=y @@ -398,8 +401,9 @@ CONFIG_ARM64_VHE=y # ARMv8.2 architectural features # CONFIG_ARM64_UAO=y -# CONFIG_ARM64_PMEM is not set +CONFIG_ARM64_PMEM=y CONFIG_ARM64_RAS_EXTN=y +# CONFIG_ARM64_CNP is not set CONFIG_ARM64_SVE=y CONFIG_ARM64_MODULE_PLTS=y # CONFIG_RANDOMIZE_BASE is not set @@ -407,6 +411,7 @@ CONFIG_ARM64_MODULE_PLTS=y # # Boot options # +# CONFIG_ARM64_ACPI_PARKING_PROTOCOL is not set CONFIG_CMDLINE="" # CONFIG_CMDLINE_FORCE is not set CONFIG_EFI_STUB=y @@ -475,7 +480,35 @@ CONFIG_EFI_ARMSTUB_DTB_LOADER=y # Tegra firmware driver # CONFIG_ARCH_SUPPORTS_ACPI=y -# CONFIG_ACPI is not set +CONFIG_ACPI=y +CONFIG_ACPI_GENERIC_GSI=y +CONFIG_ACPI_CCA_REQUIRED=y +# CONFIG_ACPI_DEBUGGER is not set +CONFIG_ACPI_SPCR_TABLE=y +# CONFIG_ACPI_EC_DEBUGFS is not set +CONFIG_ACPI_BUTTON=y +# CONFIG_ACPI_DOCK is not set +CONFIG_ACPI_PROCESSOR_IDLE=y +CONFIG_ACPI_MCFG=y +CONFIG_ACPI_PROCESSOR=y +CONFIG_ACPI_HOTPLUG_CPU=y +# CONFIG_ACPI_THERMAL is not set +CONFIG_ARCH_HAS_ACPI_TABLE_UPGRADE=y +CONFIG_ACPI_TABLE_UPGRADE=y +# CONFIG_ACPI_DEBUG is not set +# CONFIG_ACPI_PCI_SLOT is not set +CONFIG_ACPI_CONTAINER=y +# CONFIG_ACPI_HED is not set +# CONFIG_ACPI_BGRT is not set +CONFIG_ACPI_REDUCED_HARDWARE_ONLY=y +CONFIG_ACPI_NFIT=y +CONFIG_HAVE_ACPI_APEI=y +# CONFIG_ACPI_APEI is not set +# CONFIG_PMIC_OPREGION is not set +# CONFIG_ACPI_CONFIGFS is not set +CONFIG_ACPI_IORT=y +CONFIG_ACPI_GTDT=y +CONFIG_ACPI_PPTT=y CONFIG_VIRTUALIZATION=y # CONFIG_KVM is not set # CONFIG_VHOST_NET is not set @@ -1141,6 +1174,13 @@ CONFIG_OF_NET=y CONFIG_OF_RESERVED_MEM=y # CONFIG_OF_OVERLAY is not set # CONFIG_PARPORT is not set +CONFIG_PNP=y +# CONFIG_PNP_DEBUG_MESSAGES is not set + +# +# Protocols +# +CONFIG_PNPACPI=y CONFIG_BLK_DEV=y # CONFIG_BLK_DEV_NULL_BLK is not set # CONFIG_BLK_DEV_PCIESSD_MTIP32XX is not set @@ -1383,6 +1423,7 @@ CONFIG_VIRTIO_NET=y # CONFIG_ETHERNET is not set # CONFIG_FDDI is not set # CONFIG_HIPPI is not set +# CONFIG_NET_SB1000 is not set # CONFIG_MDIO_DEVICE is not set # CONFIG_PHYLIB is not set # CONFIG_PPP is not set @@ -1398,6 +1439,7 @@ CONFIG_VIRTIO_NET=y # # CONFIG_WAN is not set # CONFIG_VMXNET3 is not set +# CONFIG_FUJITSU_ES is not set CONFIG_NET_FAILOVER=y # CONFIG_ISDN is not set # CONFIG_NVM is not set @@ -1710,6 +1752,7 @@ CONFIG_COMMON_CLK=y # Clock Source drivers # CONFIG_TIMER_OF=y +CONFIG_TIMER_ACPI=y CONFIG_TIMER_PROBE=y CONFIG_ARM_ARCH_TIMER=y # CONFIG_ARM_ARCH_TIMER_EVTSTREAM is not set @@ -1801,7 +1844,9 @@ CONFIG_PARTITION_PERCPU=y # CONFIG_ARM_CCI_PMU is not set # CONFIG_ARM_CCN is not set CONFIG_ARM_PMU=y +CONFIG_ARM_PMU_ACPI=y # CONFIG_ARM_DSU_PMU is not set +# CONFIG_HISI_PMU is not set # CONFIG_ARM_SPE_PMU is not set # CONFIG_RAS is not set @@ -1809,7 +1854,14 @@ CONFIG_ARM_PMU=y # Android # # CONFIG_ANDROID is not set -# CONFIG_LIBNVDIMM is not set +CONFIG_LIBNVDIMM=y +CONFIG_BLK_DEV_PMEM=y +CONFIG_ND_BLK=y +CONFIG_ND_CLAIM=y 
+CONFIG_ND_BTT=y +CONFIG_BTT=y +CONFIG_OF_PMEM=y +CONFIG_DAX_DRIVER=y CONFIG_DAX=y CONFIG_NVMEM=y @@ -2170,6 +2222,8 @@ CONFIG_LIBFDT=y CONFIG_UCS2_STRING=y CONFIG_SG_POOL=y CONFIG_ARCH_HAS_SG_CHAIN=y +CONFIG_ARCH_HAS_PMEM_API=y +CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE=y CONFIG_SBITMAP=y # CONFIG_STRING_SELFTEST is not set diff --git a/kernel/kata_config_version b/kernel/kata_config_version index f04c001f3f..64bb6b746d 100644 --- a/kernel/kata_config_version +++ b/kernel/kata_config_version @@ -1 +1 @@ -29 +30 diff --git a/kernel/patches/0003-arm64-backport-Arm64-KVM-Dynamic-IPA-and-52bit-IPA-s.patch b/kernel/patches/0003-arm64-backport-Arm64-KVM-Dynamic-IPA-and-52bit-IPA-s.patch new file mode 100644 index 0000000000..17564efd2d --- /dev/null +++ b/kernel/patches/0003-arm64-backport-Arm64-KVM-Dynamic-IPA-and-52bit-IPA-s.patch @@ -0,0 +1,2277 @@ +From 60a4fed76e63c36cd327c4b404ec163e93a4805e Mon Sep 17 00:00:00 2001 +From: Penny Zheng +Date: Tue, 19 Feb 2019 16:05:44 +0800 +Subject: [PATCH] arm64: backport Arm64 KVM Dynamic IPA and 52bit IPA support + to 4.19.X + +This patch is based on Suzuki K Poulose's +[v6,00/18] kvm: arm64: Dynamic IPA and 52bit IPA +https://patchwork.kernel.org/cover/10616271/ + +Signed-off-by: Penny Zheng +--- + .../admin-guide/kernel-parameters.txt | 4 + + arch/arm/include/asm/kvm_arm.h | 4 +- + arch/arm/include/asm/kvm_host.h | 13 +- + arch/arm/include/asm/kvm_mmu.h | 30 ++- + arch/arm/include/asm/stage2_pgtable.h | 54 ++-- + arch/arm64/Kconfig | 13 + + arch/arm64/include/asm/cpucaps.h | 3 +- + arch/arm64/include/asm/cpufeature.h | 28 ++- + arch/arm64/include/asm/kvm_arm.h | 156 +++++++++--- + arch/arm64/include/asm/kvm_asm.h | 3 +- + arch/arm64/include/asm/kvm_host.h | 18 +- + arch/arm64/include/asm/kvm_hyp.h | 10 + + arch/arm64/include/asm/kvm_mmu.h | 47 +++- + arch/arm64/include/asm/mmu_context.h | 17 +- + arch/arm64/include/asm/pgtable-hwdef.h | 2 + + arch/arm64/include/asm/stage2_pgtable-nopmd.h | 42 ---- + arch/arm64/include/asm/stage2_pgtable-nopud.h | 39 --- + arch/arm64/include/asm/stage2_pgtable.h | 236 +++++++++++++----- + arch/arm64/kernel/cpufeature.c | 43 ++++ + arch/arm64/kernel/suspend.c | 4 + + arch/arm64/kvm/guest.c | 6 +- + arch/arm64/kvm/hyp-init.S | 3 + + arch/arm64/kvm/hyp/Makefile | 1 - + arch/arm64/kvm/hyp/s2-setup.c | 90 ------- + arch/arm64/kvm/hyp/switch.c | 4 +- + arch/arm64/kvm/hyp/tlb.c | 4 +- + arch/arm64/kvm/reset.c | 108 +++++++- + arch/arm64/mm/context.c | 3 + + arch/arm64/mm/proc.S | 11 +- + include/linux/irqchip/arm-gic-v3.h | 5 + + include/uapi/linux/kvm.h | 15 ++ + virt/kvm/arm/arm.c | 26 +- + virt/kvm/arm/mmu.c | 120 ++++----- + virt/kvm/arm/vgic/vgic-its.c | 36 +-- + virt/kvm/arm/vgic/vgic-kvm-device.c | 2 +- + virt/kvm/arm/vgic/vgic-mmio-v3.c | 2 - + 36 files changed, 767 insertions(+), 435 deletions(-) + delete mode 100644 arch/arm64/include/asm/stage2_pgtable-nopmd.h + delete mode 100644 arch/arm64/include/asm/stage2_pgtable-nopud.h + delete mode 100644 arch/arm64/kvm/hyp/s2-setup.c + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index f5acf35c712f..f28de4b3c5c7 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -2758,6 +2758,10 @@ + + noclflush [BUGS=X86] Don't use the CLFLUSH instruction + ++ nocnp [ARM64] ++ Disable CNP (Common not Private translations) ++ even if it is supported by processor. 
++ + nodelayacct [KNL] Disable per-task delay accounting + + nodsp [SH] Disable hardware DSP at boot time. +diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h +index 3ab8b3781bfe..b95f8d0d9f17 100644 +--- a/arch/arm/include/asm/kvm_arm.h ++++ b/arch/arm/include/asm/kvm_arm.h +@@ -133,8 +133,7 @@ + * space. + */ + #define KVM_PHYS_SHIFT (40) +-#define KVM_PHYS_SIZE (_AC(1, ULL) << KVM_PHYS_SHIFT) +-#define KVM_PHYS_MASK (KVM_PHYS_SIZE - _AC(1, ULL)) ++ + #define PTRS_PER_S2_PGD (_AC(1, ULL) << (KVM_PHYS_SHIFT - 30)) + + /* Virtualization Translation Control Register (VTCR) bits */ +@@ -161,6 +160,7 @@ + #else + #define VTTBR_X (5 - KVM_T0SZ) + #endif ++#define VTTBR_CNP_BIT _AC(1, UL) + #define VTTBR_BADDR_MASK (((_AC(1, ULL) << (40 - VTTBR_X)) - 1) << VTTBR_X) + #define VTTBR_VMID_SHIFT _AC(48, ULL) + #define VTTBR_VMID_MASK(size) (_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT) +diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h +index 3ad482d2f1eb..5ca5d9af0c26 100644 +--- a/arch/arm/include/asm/kvm_host.h ++++ b/arch/arm/include/asm/kvm_host.h +@@ -273,7 +273,7 @@ static inline void __cpu_init_stage2(void) + kvm_call_hyp(__init_stage2_translation); + } + +-static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) ++static inline int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext) + { + return 0; + } +@@ -354,4 +354,15 @@ static inline void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu) {} + struct kvm *kvm_arch_alloc_vm(void); + void kvm_arch_free_vm(struct kvm *kvm); + ++static inline int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) ++{ ++ /* ++ * On 32bit ARM, VMs get a static 40bit IPA stage2 setup, ++ * so any non-zero value used as type is illegal. ++ */ ++ if (type) ++ return -EINVAL; ++ return 0; ++} ++ + #endif /* __ARM_KVM_HOST_H__ */ +diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h +index 265ea9cf7df7..c9dcfd918c2a 100644 +--- a/arch/arm/include/asm/kvm_mmu.h ++++ b/arch/arm/include/asm/kvm_mmu.h +@@ -35,16 +35,12 @@ + addr; \ + }) + +-/* +- * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels. 
+- */ +-#define KVM_MMU_CACHE_MIN_PAGES 2 +- + #ifndef __ASSEMBLY__ + + #include + #include + #include ++#include + #include + #include + #include +@@ -52,6 +48,13 @@ + /* Ensure compatibility with arm64 */ + #define VA_BITS 32 + ++#define kvm_phys_shift(kvm) KVM_PHYS_SHIFT ++#define kvm_phys_size(kvm) (1ULL << kvm_phys_shift(kvm)) ++#define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - 1ULL) ++#define kvm_vttbr_baddr_mask(kvm) VTTBR_BADDR_MASK ++ ++#define stage2_pgd_size(kvm) (PTRS_PER_S2_PGD * sizeof(pgd_t)) ++ + int create_hyp_mappings(void *from, void *to, pgprot_t prot); + int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, + void __iomem **kaddr, +@@ -355,6 +358,13 @@ static inline int hyp_map_aux_data(void) + + #define kvm_phys_to_vttbr(addr) (addr) + ++static inline void kvm_set_ipa_limit(void) {} ++ ++static inline bool kvm_cpu_has_cnp(void) ++{ ++ return false; ++} ++ + #endif /* !__ASSEMBLY__ */ + + #endif /* __ARM_KVM_MMU_H__ */ +diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h +index 460d616bb2d6..f6a7ea805232 100644 +--- a/arch/arm/include/asm/stage2_pgtable.h ++++ b/arch/arm/include/asm/stage2_pgtable.h +@@ -19,43 +19,53 @@ + #ifndef __ARM_S2_PGTABLE_H_ + #define __ARM_S2_PGTABLE_H_ + +-#define stage2_pgd_none(pgd) pgd_none(pgd) +-#define stage2_pgd_clear(pgd) pgd_clear(pgd) +-#define stage2_pgd_present(pgd) pgd_present(pgd) +-#define stage2_pgd_populate(pgd, pud) pgd_populate(NULL, pgd, pud) +-#define stage2_pud_offset(pgd, address) pud_offset(pgd, address) +-#define stage2_pud_free(pud) pud_free(NULL, pud) +- +-#define stage2_pud_none(pud) pud_none(pud) +-#define stage2_pud_clear(pud) pud_clear(pud) +-#define stage2_pud_present(pud) pud_present(pud) +-#define stage2_pud_populate(pud, pmd) pud_populate(NULL, pud, pmd) +-#define stage2_pmd_offset(pud, address) pmd_offset(pud, address) +-#define stage2_pmd_free(pmd) pmd_free(NULL, pmd) +- +-#define stage2_pud_huge(pud) pud_huge(pud) ++/* ++ * kvm_mmu_cache_min_pages() is the number of pages required ++ * to install a stage-2 translation. We pre-allocate the entry ++ * level table at VM creation. Since we have a 3 level page-table, ++ * we need only two pages to add a new mapping. ++ */ ++#define kvm_mmu_cache_min_pages(kvm) 2 ++ ++#define stage2_pgd_none(kvm, pgd) pgd_none(pgd) ++#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd) ++#define stage2_pgd_present(kvm, pgd) pgd_present(pgd) ++#define stage2_pgd_populate(kvm, pgd, pud) pgd_populate(NULL, pgd, pud) ++#define stage2_pud_offset(kvm, pgd, address) pud_offset(pgd, address) ++#define stage2_pud_free(kvm, pud) pud_free(NULL, pud) ++ ++#define stage2_pud_none(kvm, pud) pud_none(pud) ++#define stage2_pud_clear(kvm, pud) pud_clear(pud) ++#define stage2_pud_present(kvm, pud) pud_present(pud) ++#define stage2_pud_populate(kvm, pud, pmd) pud_populate(NULL, pud, pmd) ++#define stage2_pmd_offset(kvm, pud, address) pmd_offset(pud, address) ++#define stage2_pmd_free(kvm, pmd) pmd_free(NULL, pmd) ++ ++#define stage2_pud_huge(kvm, pud) pud_huge(pud) + + /* Open coded p*d_addr_end that can deal with 64bit addresses */ +-static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end) ++static inline phys_addr_t ++stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) + { + phys_addr_t boundary = (addr + PGDIR_SIZE) & PGDIR_MASK; + + return (boundary - 1 < end - 1) ?
boundary : end; + } + +-#define stage2_pud_addr_end(addr, end) (end) ++#define stage2_pud_addr_end(kvm, addr, end) (end) + +-static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end) ++static inline phys_addr_t ++stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) + { + phys_addr_t boundary = (addr + PMD_SIZE) & PMD_MASK; + + return (boundary - 1 < end - 1) ? boundary : end; + } + +-#define stage2_pgd_index(addr) pgd_index(addr) ++#define stage2_pgd_index(kvm, addr) pgd_index(addr) + +-#define stage2_pte_table_empty(ptep) kvm_page_empty(ptep) +-#define stage2_pmd_table_empty(pmdp) kvm_page_empty(pmdp) +-#define stage2_pud_table_empty(pudp) false ++#define stage2_pte_table_empty(kvm, ptep) kvm_page_empty(ptep) ++#define stage2_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) ++#define stage2_pud_table_empty(kvm, pudp) false + + #endif /* __ARM_S2_PGTABLE_H_ */ +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index 1b1a0e95c751..f9162da575a9 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -1132,6 +1132,19 @@ config ARM64_RAS_EXTN + and access the new registers if the system supports the extension. + Platform RAS features may additionally depend on firmware support. + ++config ARM64_CNP ++ bool "Enable support for Common Not Private (CNP) translations" ++ depends on ARM64_PAN || !ARM64_SW_TTBR0_PAN ++ help ++ Common Not Private (CNP) allows translation table entries to ++ be shared between different PEs in the same inner shareable ++ domain, so the hardware can use this fact to optimise the ++ caching of such entries in the TLB. ++ ++ Selecting this option allows the CNP feature to be detected ++ at runtime, and does not affect PEs that do not implement ++ this feature. ++ + endmenu + + config ARM64_SVE +diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h +index ae1f70450fb2..0f009abdd8cf 100644 +--- a/arch/arm64/include/asm/cpucaps.h ++++ b/arch/arm64/include/asm/cpucaps.h +@@ -51,7 +51,8 @@ + #define ARM64_SSBD 30 + #define ARM64_MISMATCHED_CACHE_TYPE 31 + #define ARM64_HAS_STAGE2_FWB 32 ++#define ARM64_HAS_CNP 33 + +-#define ARM64_NCAPS 33 ++#define ARM64_NCAPS 34 + + #endif /* __ASM_CPUCAPS_H */ +diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h +index 1717ba1db35d..6dc5823d5f12 100644 +--- a/arch/arm64/include/asm/cpufeature.h ++++ b/arch/arm64/include/asm/cpufeature.h +@@ -262,7 +262,7 @@ extern struct arm64_ftr_reg arm64_ftr_reg_ctrel0; + /* + * CPU feature detected at boot time based on system-wide value of a + * feature. It is safe for a late CPU to have this feature even though +- * the system hasn't enabled it, although the featuer will not be used ++ * the system hasn't enabled it, although the feature will not be used + * by Linux in this case. If the system has enabled this feature already, + * then every late CPU must have it. 
+ */ +@@ -508,6 +508,12 @@ static inline bool system_supports_sve(void) + cpus_have_const_cap(ARM64_SVE); + } + ++static inline bool system_supports_cnp(void) ++{ ++ return IS_ENABLED(CONFIG_ARM64_CNP) && ++ cpus_have_const_cap(ARM64_HAS_CNP); ++} ++ + #define ARM64_SSBD_UNKNOWN -1 + #define ARM64_SSBD_FORCE_DISABLE 0 + #define ARM64_SSBD_KERNEL 1 +@@ -530,6 +536,26 @@ void arm64_set_ssbd_mitigation(bool state); + static inline void arm64_set_ssbd_mitigation(bool state) {} + #endif + ++static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) ++{ ++ switch (parange) { ++ case 0: return 32; ++ case 1: return 36; ++ case 2: return 40; ++ case 3: return 42; ++ case 4: return 44; ++ case 5: return 48; ++ case 6: return 52; ++ /* ++ * A future PE could use a value unknown to the kernel. ++ * However, by the "D10.1.4 Principles of the ID scheme ++ * for fields in ID registers", ARM DDI 0487C.a, any new ++ * value is guaranteed to be higher than what we know already. ++ * As a safe limit, we return the limit supported by the kernel. ++ */ ++ default: return CONFIG_ARM64_PA_BITS; ++ } ++} + #endif /* __ASSEMBLY__ */ + + #endif +diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h +index 8b284cbf8162..bc2327d4a505 100644 +--- a/arch/arm64/include/asm/kvm_arm.h ++++ b/arch/arm64/include/asm/kvm_arm.h +@@ -110,6 +110,7 @@ + #define VTCR_EL2_RES1 (1U << 31) + #define VTCR_EL2_HD (1 << 22) + #define VTCR_EL2_HA (1 << 21) ++#define VTCR_EL2_PS_SHIFT TCR_EL2_PS_SHIFT + #define VTCR_EL2_PS_MASK TCR_EL2_PS_MASK + #define VTCR_EL2_TG0_MASK TCR_TG0_MASK + #define VTCR_EL2_TG0_4K TCR_TG0_4K +@@ -123,62 +124,150 @@ + #define VTCR_EL2_IRGN0_WBWA TCR_IRGN0_WBWA + #define VTCR_EL2_SL0_SHIFT 6 + #define VTCR_EL2_SL0_MASK (3 << VTCR_EL2_SL0_SHIFT) +-#define VTCR_EL2_SL0_LVL1 (1 << VTCR_EL2_SL0_SHIFT) + #define VTCR_EL2_T0SZ_MASK 0x3f +-#define VTCR_EL2_T0SZ_40B 24 + #define VTCR_EL2_VS_SHIFT 19 + #define VTCR_EL2_VS_8BIT (0 << VTCR_EL2_VS_SHIFT) + #define VTCR_EL2_VS_16BIT (1 << VTCR_EL2_VS_SHIFT) + ++#define VTCR_EL2_T0SZ(x) TCR_T0SZ(x) ++ + /* + * We configure the Stage-2 page tables to always restrict the IPA space to be + * 40 bits wide (T0SZ = 24). Systems with a PARange smaller than 40 bits are + * not known to exist and will break with this configuration. + * +- * VTCR_EL2.PS is extracted from ID_AA64MMFR0_EL1.PARange at boot time +- * (see hyp-init.S). ++ * The VTCR_EL2 is configured per VM and is initialised in kvm_arm_setup_stage2(). + * + * Note that when using 4K pages, we concatenate two first level page tables + * together. With 16K pages, we concatenate 16 first level page tables. + * +- * The magic numbers used for VTTBR_X in this patch can be found in Tables +- * D4-23 and D4-25 in ARM DDI 0487A.b. + */ + +-#define VTCR_EL2_T0SZ_IPA VTCR_EL2_T0SZ_40B + #define VTCR_EL2_COMMON_BITS (VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \ + VTCR_EL2_IRGN0_WBWA | VTCR_EL2_RES1) + +-#ifdef CONFIG_ARM64_64K_PAGES + /* +- * Stage2 translation configuration: +- * 64kB pages (TG0 = 1) +- * 2 level page tables (SL = 1) ++ * VTCR_EL2:SL0 indicates the entry level for Stage2 translation. ++ * Interestingly, it depends on the page size. 
++ * See D.10.2.121, VTCR_EL2, in ARM DDI 0487C.a ++ * ++ * ----------------------------------------- ++ * | Entry level | 4K | 16K/64K | ++ * ------------------------------------------ ++ * | Level: 0 | 2 | - | ++ * ------------------------------------------ ++ * | Level: 1 | 1 | 2 | ++ * ------------------------------------------ ++ * | Level: 2 | 0 | 1 | ++ * ------------------------------------------ ++ * | Level: 3 | - | 0 | ++ * ------------------------------------------ ++ * ++ * The table roughly translates to : ++ * ++ * SL0(PAGE_SIZE, Entry_level) = TGRAN_SL0_BASE - Entry_Level ++ * ++ * Where TGRAN_SL0_BASE is a magic number depending on the page size: ++ * TGRAN_SL0_BASE(4K) = 2 ++ * TGRAN_SL0_BASE(16K) = 3 ++ * TGRAN_SL0_BASE(64K) = 3 ++ * provided we take care of ruling out the unsupported cases and ++ * Entry_Level = 4 - Number_of_levels. ++ * + */ +-#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SL0_LVL1) +-#define VTTBR_X_TGRAN_MAGIC 38 ++#ifdef CONFIG_ARM64_64K_PAGES ++ ++#define VTCR_EL2_TGRAN VTCR_EL2_TG0_64K ++#define VTCR_EL2_TGRAN_SL0_BASE 3UL ++ + #elif defined(CONFIG_ARM64_16K_PAGES) +-/* +- * Stage2 translation configuration: +- * 16kB pages (TG0 = 2) +- * 2 level page tables (SL = 1) +- */ +-#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_16K | VTCR_EL2_SL0_LVL1) +-#define VTTBR_X_TGRAN_MAGIC 42 ++ ++#define VTCR_EL2_TGRAN VTCR_EL2_TG0_16K ++#define VTCR_EL2_TGRAN_SL0_BASE 3UL ++ + #else /* 4K */ +-/* +- * Stage2 translation configuration: +- * 4kB pages (TG0 = 0) +- * 3 level page tables (SL = 1) +- */ +-#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SL0_LVL1) +-#define VTTBR_X_TGRAN_MAGIC 37 ++ ++#define VTCR_EL2_TGRAN VTCR_EL2_TG0_4K ++#define VTCR_EL2_TGRAN_SL0_BASE 2UL ++ + #endif + +-#define VTCR_EL2_FLAGS (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN_FLAGS) +-#define VTTBR_X (VTTBR_X_TGRAN_MAGIC - VTCR_EL2_T0SZ_IPA) ++#define VTCR_EL2_LVLS_TO_SL0(levels) \ ++ ((VTCR_EL2_TGRAN_SL0_BASE - (4 - (levels))) << VTCR_EL2_SL0_SHIFT) ++#define VTCR_EL2_SL0_TO_LVLS(sl0) \ ++ ((sl0) + 4 - VTCR_EL2_TGRAN_SL0_BASE) ++#define VTCR_EL2_LVLS(vtcr) \ ++ VTCR_EL2_SL0_TO_LVLS(((vtcr) & VTCR_EL2_SL0_MASK) >> VTCR_EL2_SL0_SHIFT) + +-#define VTTBR_BADDR_MASK (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_X) ++#define VTCR_EL2_FLAGS (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN) ++#define VTCR_EL2_IPA(vtcr) (64 - ((vtcr) & VTCR_EL2_T0SZ_MASK)) ++ ++/* ++ * ARM VMSAv8-64 defines an algorithm for finding the translation table ++ * descriptors in section D4.2.8 in ARM DDI 0487C.a. ++ * ++ * The algorithm defines the expectations on the translation table ++ * addresses for each level, based on PAGE_SIZE, entry level ++ * and the translation table size (T0SZ). The variable "x" in the ++ * algorithm determines the alignment of a table base address at a given ++ * level and thus determines the alignment of VTTBR:BADDR for stage2 ++ * page table entry level. ++ * Since the number of bits resolved at the entry level could vary ++ * depending on the T0SZ, the value of "x" is defined based on a ++ * Magic constant for a given PAGE_SIZE and Entry Level. The ++ * intermediate levels must be always aligned to the PAGE_SIZE (i.e, ++ * x = PAGE_SHIFT). 
++ * ++ * The value of "x" for entry level is calculated as : ++ * x = Magic_N - T0SZ ++ * ++ * where Magic_N is an integer depending on the page size and the entry ++ * level of the page table as below: ++ * ++ * -------------------------------------------- ++ * | Entry level | 4K 16K 64K | ++ * -------------------------------------------- ++ * | Level: 0 (4 levels) | 28 | - | - | ++ * -------------------------------------------- ++ * | Level: 1 (3 levels) | 37 | 31 | 25 | ++ * -------------------------------------------- ++ * | Level: 2 (2 levels) | 46 | 42 | 38 | ++ * -------------------------------------------- ++ * | Level: 3 (1 level) | - | 53 | 51 | ++ * -------------------------------------------- ++ * ++ * We have a magic formula for the Magic_N below: ++ * ++ * Magic_N(PAGE_SIZE, Level) = 64 - ((PAGE_SHIFT - 3) * Number_of_levels) ++ * ++ * where Number_of_levels = (4 - Level). We are only interested in the ++ * value for Entry_Level for the stage2 page table. ++ * ++ * So, given that T0SZ = (64 - IPA_SHIFT), we can compute 'x' as follows: ++ * ++ * x = (64 - ((PAGE_SHIFT - 3) * Number_of_levels)) - (64 - IPA_SHIFT) ++ * = IPA_SHIFT - ((PAGE_SHIFT - 3) * Number of levels) ++ * ++ * Here is one way to explain the Magic Formula: ++ * ++ * x = log2(Size_of_Entry_Level_Table) ++ * ++ * Since, we can resolve (PAGE_SHIFT - 3) bits at each level, and another ++ * PAGE_SHIFT bits in the PTE, we have : ++ * ++ * Bits_Entry_level = IPA_SHIFT - ((PAGE_SHIFT - 3) * (n - 1) + PAGE_SHIFT) ++ * = IPA_SHIFT - (PAGE_SHIFT - 3) * n - 3 ++ * where n = number of levels, and since each pointer is 8bytes, we have: ++ * ++ * x = Bits_Entry_Level + 3 ++ * = IPA_SHIFT - (PAGE_SHIFT - 3) * n ++ * ++ * The only constraint here is that, we have to find the number of page table ++ * levels for a given IPA size (which we do, see stage2_pt_levels()) ++ */ ++#define ARM64_VTTBR_X(ipa, levels) ((ipa) - ((levels) * (PAGE_SHIFT - 3))) ++ ++#define VTTBR_CNP_BIT (UL(1)) + #define VTTBR_VMID_SHIFT (UL(48)) + #define VTTBR_VMID_MASK(size) (_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT) + +@@ -226,6 +315,13 @@ + + /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ + #define HPFAR_MASK (~UL(0xf)) ++/* ++ * We have ++ * PAR [PA_Shift - 1 : 12] = PA [PA_Shift - 1 : 12] ++ * HPFAR [PA_Shift - 9 : 4] = FIPA [PA_Shift - 1 : 12] ++ */ ++#define PAR_TO_HPFAR(par) \ ++ (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8) + + #define kvm_arm_exception_type \ + {0, "IRQ" }, \ +diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h +index 102b5a5c47b6..aea01a09eb94 100644 +--- a/arch/arm64/include/asm/kvm_asm.h ++++ b/arch/arm64/include/asm/kvm_asm.h +@@ -30,6 +30,7 @@ + #define ARM_EXCEPTION_IRQ 0 + #define ARM_EXCEPTION_EL1_SERROR 1 + #define ARM_EXCEPTION_TRAP 2 ++#define ARM_EXCEPTION_IL 3 + /* The hyp-stub will return this for any kvm_call_hyp() call */ + #define ARM_EXCEPTION_HYP_GONE HVC_STUB_ERR + +@@ -72,8 +73,6 @@ extern void __vgic_v3_init_lrs(void); + + extern u32 __kvm_get_mdcr_el2(void); + +-extern u32 __init_stage2_translation(void); +- + /* Home-grown __this_cpu_{ptr,read} variants that always work at HYP */ + #define __hyp_this_cpu_ptr(sym) \ + ({ \ +diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h +index 3d6d7336f871..f84052f306af 100644 +--- a/arch/arm64/include/asm/kvm_host.h ++++ b/arch/arm64/include/asm/kvm_host.h +@@ -53,7 +53,7 @@ DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); + + int __attribute_const__ kvm_target_cpu(void); + int 
kvm_reset_vcpu(struct kvm_vcpu *vcpu); +-int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext); ++int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext); + void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); + + struct kvm_arch { +@@ -61,11 +61,13 @@ struct kvm_arch { + u64 vmid_gen; + u32 vmid; + +- /* 1-level 2nd stage table, protected by kvm->mmu_lock */ ++ /* stage2 entry level table */ + pgd_t *pgd; + + /* VTTBR value associated with above pgd and vmid */ + u64 vttbr; ++ /* VTCR_EL2 value for this VM */ ++ u64 vtcr; + + /* The last vcpu id that ran on each physical CPU */ + int __percpu *last_vcpu_ran; +@@ -440,13 +442,7 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, + int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr); + +-static inline void __cpu_init_stage2(void) +-{ +- u32 parange = kvm_call_hyp(__init_stage2_translation); +- +- WARN_ONCE(parange < 40, +- "PARange is %d bits, unsupported configuration!", parange); +-} ++static inline void __cpu_init_stage2(void) {} + + /* Guest/host FPSIMD coordination helpers */ + int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu); +@@ -509,8 +505,12 @@ static inline int kvm_arm_have_ssbd(void) + void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu); + void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu); + ++void kvm_set_ipa_limit(void); ++ + #define __KVM_HAVE_ARCH_VM_ALLOC + struct kvm *kvm_arch_alloc_vm(void); + void kvm_arch_free_vm(struct kvm *kvm); + ++int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type); ++ + #endif /* __ARM64_KVM_HOST_H__ */ +diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h +index 384c34397619..23aca66767f9 100644 +--- a/arch/arm64/include/asm/kvm_hyp.h ++++ b/arch/arm64/include/asm/kvm_hyp.h +@@ -155,5 +155,15 @@ void deactivate_traps_vhe_put(void); + u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt); + void __noreturn __hyp_do_panic(unsigned long, ...); + ++/* ++ * Must be called from hyp code running at EL2 with an updated VTTBR ++ * and interrupts disabled. ++ */ ++static __always_inline void __hyp_text __load_guest_stage2(struct kvm *kvm) ++{ ++ write_sysreg(kvm->arch.vtcr, vtcr_el2); ++ write_sysreg(kvm->arch.vttbr, vttbr_el2); ++} ++ + #endif /* __ARM64_KVM_HYP_H__ */ + +diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h +index d6fff7de5539..658657367f2f 100644 +--- a/arch/arm64/include/asm/kvm_mmu.h ++++ b/arch/arm64/include/asm/kvm_mmu.h +@@ -141,8 +141,16 @@ static inline unsigned long __kern_hyp_va(unsigned long v) + * We currently only support a 40bit IPA. 
+ */ + #define KVM_PHYS_SHIFT (40) +-#define KVM_PHYS_SIZE (1UL << KVM_PHYS_SHIFT) +-#define KVM_PHYS_MASK (KVM_PHYS_SIZE - 1UL) ++ ++#define kvm_phys_shift(kvm) VTCR_EL2_IPA(kvm->arch.vtcr) ++#define kvm_phys_size(kvm) (_AC(1, ULL) << kvm_phys_shift(kvm)) ++#define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - _AC(1, ULL)) ++ ++static inline bool kvm_page_empty(void *ptr) ++{ ++ struct page *ptr_page = virt_to_page(ptr); ++ return page_count(ptr_page) == 1; ++} + + #include + +@@ -238,12 +246,6 @@ static inline bool kvm_s2pmd_exec(pmd_t *pmdp) + return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN); + } + +-static inline bool kvm_page_empty(void *ptr) +-{ +- struct page *ptr_page = virt_to_page(ptr); +- return page_count(ptr_page) == 1; +-} +- + #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) + + #ifdef __PAGETABLE_PMD_FOLDED +@@ -517,5 +519,34 @@ static inline int hyp_map_aux_data(void) + + #define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) + ++/* ++ * Get the magic number 'x' for VTTBR:BADDR of this KVM instance. ++ * With v8.2 LVA extensions, 'x' should be a minimum of 6 with ++ * 52bit IPS. ++ */ ++static inline int arm64_vttbr_x(u32 ipa_shift, u32 levels) ++{ ++ int x = ARM64_VTTBR_X(ipa_shift, levels); ++ ++ return (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && x < 6) ? 6 : x; ++} ++ ++static inline u64 vttbr_baddr_mask(u32 ipa_shift, u32 levels) ++{ ++ unsigned int x = arm64_vttbr_x(ipa_shift, levels); ++ ++ return GENMASK_ULL(PHYS_MASK_SHIFT - 1, x); ++} ++ ++static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm) ++{ ++ return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)); ++} ++ ++static inline bool kvm_cpu_has_cnp(void) ++{ ++ return system_supports_cnp(); ++} ++ + #endif /* __ASSEMBLY__ */ + #endif /* __ARM64_KVM_MMU_H__ */ +diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h +index 39ec0b8a689e..1e58bf58c22b 100644 +--- a/arch/arm64/include/asm/mmu_context.h ++++ b/arch/arm64/include/asm/mmu_context.h +@@ -147,12 +147,25 @@ static inline void cpu_replace_ttbr1(pgd_t *pgdp) + extern ttbr_replace_func idmap_cpu_replace_ttbr1; + ttbr_replace_func *replace_phys; + +- phys_addr_t pgd_phys = virt_to_phys(pgdp); ++ /* phys_to_ttbr() zeros lower 2 bits of ttbr with 52-bit PA */ ++ phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp)); ++ ++ if (system_supports_cnp() && !WARN_ON(pgdp != lm_alias(swapper_pg_dir))) { ++ /* ++ * cpu_replace_ttbr1() is used when there's a boot CPU ++ * up (i.e. cpufeature framework is not up yet) and ++ * latter only when we enable CNP via cpufeature's ++ * enable() callback. ++ * Also we rely on the cpu_hwcap bit being set before ++ * calling the enable() function. ++ */ ++ ttbr1 |= TTBR_CNP_BIT; ++ } + + replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1); + + cpu_install_idmap(); +- replace_phys(pgd_phys); ++ replace_phys(ttbr1); + cpu_uninstall_idmap(); + } + +diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h +index fd208eac9f2a..1d7d8da2ef9b 100644 +--- a/arch/arm64/include/asm/pgtable-hwdef.h ++++ b/arch/arm64/include/asm/pgtable-hwdef.h +@@ -211,6 +211,8 @@ + #define PHYS_MASK_SHIFT (CONFIG_ARM64_PA_BITS) + #define PHYS_MASK ((UL(1) << PHYS_MASK_SHIFT) - 1) + ++#define TTBR_CNP_BIT (UL(1) << 0) ++ + /* + * TCR flags. 
+ */ +diff --git a/arch/arm64/include/asm/stage2_pgtable-nopmd.h b/arch/arm64/include/asm/stage2_pgtable-nopmd.h +deleted file mode 100644 +index 2656a0fd05a6..000000000000 +--- a/arch/arm64/include/asm/stage2_pgtable-nopmd.h ++++ /dev/null +@@ -1,42 +0,0 @@ +-/* +- * Copyright (C) 2016 - ARM Ltd +- * +- * This program is free software; you can redistribute it and/or modify +- * it under the terms of the GNU General Public License version 2 as +- * published by the Free Software Foundation. +- * +- * This program is distributed in the hope that it will be useful, +- * but WITHOUT ANY WARRANTY; without even the implied warranty of +- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +- * GNU General Public License for more details. +- * +- * You should have received a copy of the GNU General Public License +- * along with this program. If not, see . +- */ +- +-#ifndef __ARM64_S2_PGTABLE_NOPMD_H_ +-#define __ARM64_S2_PGTABLE_NOPMD_H_ +- +-#include +- +-#define __S2_PGTABLE_PMD_FOLDED +- +-#define S2_PMD_SHIFT S2_PUD_SHIFT +-#define S2_PTRS_PER_PMD 1 +-#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT) +-#define S2_PMD_MASK (~(S2_PMD_SIZE-1)) +- +-#define stage2_pud_none(pud) (0) +-#define stage2_pud_present(pud) (1) +-#define stage2_pud_clear(pud) do { } while (0) +-#define stage2_pud_populate(pud, pmd) do { } while (0) +-#define stage2_pmd_offset(pud, address) ((pmd_t *)(pud)) +- +-#define stage2_pmd_free(pmd) do { } while (0) +- +-#define stage2_pmd_addr_end(addr, end) (end) +- +-#define stage2_pud_huge(pud) (0) +-#define stage2_pmd_table_empty(pmdp) (0) +- +-#endif +diff --git a/arch/arm64/include/asm/stage2_pgtable-nopud.h b/arch/arm64/include/asm/stage2_pgtable-nopud.h +deleted file mode 100644 +index 5ee87b54ebf3..000000000000 +--- a/arch/arm64/include/asm/stage2_pgtable-nopud.h ++++ /dev/null +@@ -1,39 +0,0 @@ +-/* +- * Copyright (C) 2016 - ARM Ltd +- * +- * This program is free software; you can redistribute it and/or modify +- * it under the terms of the GNU General Public License version 2 as +- * published by the Free Software Foundation. +- * +- * This program is distributed in the hope that it will be useful, +- * but WITHOUT ANY WARRANTY; without even the implied warranty of +- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +- * GNU General Public License for more details. +- * +- * You should have received a copy of the GNU General Public License +- * along with this program. If not, see . 
+- */ +- +-#ifndef __ARM64_S2_PGTABLE_NOPUD_H_ +-#define __ARM64_S2_PGTABLE_NOPUD_H_ +- +-#define __S2_PGTABLE_PUD_FOLDED +- +-#define S2_PUD_SHIFT S2_PGDIR_SHIFT +-#define S2_PTRS_PER_PUD 1 +-#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT) +-#define S2_PUD_MASK (~(S2_PUD_SIZE-1)) +- +-#define stage2_pgd_none(pgd) (0) +-#define stage2_pgd_present(pgd) (1) +-#define stage2_pgd_clear(pgd) do { } while (0) +-#define stage2_pgd_populate(pgd, pud) do { } while (0) +- +-#define stage2_pud_offset(pgd, address) ((pud_t *)(pgd)) +- +-#define stage2_pud_free(x) do { } while (0) +- +-#define stage2_pud_addr_end(addr, end) (end) +-#define stage2_pud_table_empty(pmdp) (0) +- +-#endif +diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h +index 8b68099348e5..d352f6df8d2c 100644 +--- a/arch/arm64/include/asm/stage2_pgtable.h ++++ b/arch/arm64/include/asm/stage2_pgtable.h +@@ -19,8 +19,16 @@ + #ifndef __ARM64_S2_PGTABLE_H_ + #define __ARM64_S2_PGTABLE_H_ + ++#include + #include + ++/* ++ * PGDIR_SHIFT determines the size a top-level page table entry can map ++ * and depends on the number of levels in the page table. Compute the ++ * PGDIR_SHIFT for a given number of levels. ++ */ ++#define pt_levels_pgdir_shift(lvls) ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls)) ++ + /* + * The hardware supports concatenation of up to 16 tables at stage2 entry level + * and we use the feature whenever possible. +@@ -29,112 +37,208 @@ + * On arm64, the smallest PAGE_SIZE supported is 4k, which means + * (PAGE_SHIFT - 3) > 4 holds for all page sizes. + * This implies, the total number of page table levels at stage2 expected +- * by the hardware is actually the number of levels required for (KVM_PHYS_SHIFT - 4) ++ * by the hardware is actually the number of levels required for (IPA_SHIFT - 4) + * in normal translations(e.g, stage1), since we cannot have another level in +- * the range (KVM_PHYS_SHIFT, KVM_PHYS_SHIFT - 4). ++ * the range (IPA_SHIFT, IPA_SHIFT - 4). + */ +-#define STAGE2_PGTABLE_LEVELS ARM64_HW_PGTABLE_LEVELS(KVM_PHYS_SHIFT - 4) ++#define stage2_pgtable_levels(ipa) ARM64_HW_PGTABLE_LEVELS((ipa) - 4) ++#define kvm_stage2_levels(kvm) VTCR_EL2_LVLS(kvm->arch.vtcr) + +-/* +- * With all the supported VA_BITs and 40bit guest IPA, the following condition +- * is always true: +- * +- * STAGE2_PGTABLE_LEVELS <= CONFIG_PGTABLE_LEVELS +- * +- * We base our stage-2 page table walker helpers on this assumption and +- * fall back to using the host version of the helper wherever possible. +- * i.e, if a particular level is not folded (e.g, PUD) at stage2, we fall back +- * to using the host version, since it is guaranteed it is not folded at host. +- * +- * If the condition breaks in the future, we can rearrange the host level +- * definitions and reuse them for stage2. Till then... +- */ +-#if STAGE2_PGTABLE_LEVELS > CONFIG_PGTABLE_LEVELS +-#error "Unsupported combination of guest IPA and host VA_BITS." 
+-#endif +- +-/* S2_PGDIR_SHIFT is the size mapped by top-level stage2 entry */ +-#define S2_PGDIR_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - STAGE2_PGTABLE_LEVELS) +-#define S2_PGDIR_SIZE (_AC(1, UL) << S2_PGDIR_SHIFT) +-#define S2_PGDIR_MASK (~(S2_PGDIR_SIZE - 1)) ++/* stage2_pgdir_shift() is the size mapped by top-level stage2 entry for the VM */ ++#define stage2_pgdir_shift(kvm) pt_levels_pgdir_shift(kvm_stage2_levels(kvm)) ++#define stage2_pgdir_size(kvm) (1ULL << stage2_pgdir_shift(kvm)) ++#define stage2_pgdir_mask(kvm) ~(stage2_pgdir_size(kvm) - 1) + + /* + * The number of PTRS across all concatenated stage2 tables given by the + * number of bits resolved at the initial level. ++ * If we force more levels than necessary, we may have (stage2_pgdir_shift > IPA), ++ * in which case, stage2_pgd_ptrs will have one entry. + */ +-#define PTRS_PER_S2_PGD (1 << (KVM_PHYS_SHIFT - S2_PGDIR_SHIFT)) ++#define pgd_ptrs_shift(ipa, pgdir_shift) \ ++ ((ipa) > (pgdir_shift) ? ((ipa) - (pgdir_shift)) : 0) ++#define __s2_pgd_ptrs(ipa, lvls) \ ++ (1 << (pgd_ptrs_shift((ipa), pt_levels_pgdir_shift(lvls)))) ++#define __s2_pgd_size(ipa, lvls) (__s2_pgd_ptrs((ipa), (lvls)) * sizeof(pgd_t)) ++ ++#define stage2_pgd_ptrs(kvm) __s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)) ++#define stage2_pgd_size(kvm) __s2_pgd_size(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)) + + /* +- * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation +- * levels in addition to the PGD. ++ * kvm_mmmu_cache_min_pages() is the number of pages required to install ++ * a stage-2 translation. We pre-allocate the entry level page table at ++ * the VM creation. + */ +-#define KVM_MMU_CACHE_MIN_PAGES (STAGE2_PGTABLE_LEVELS - 1) ++#define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1) + +- +-#if STAGE2_PGTABLE_LEVELS > 3 ++/* Stage2 PUD definitions when the level is present */ ++static inline bool kvm_stage2_has_pud(struct kvm *kvm) ++{ ++ return (CONFIG_PGTABLE_LEVELS > 3) && (kvm_stage2_levels(kvm) > 3); ++} + + #define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1) +-#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT) ++#define S2_PUD_SIZE (1UL << S2_PUD_SHIFT) + #define S2_PUD_MASK (~(S2_PUD_SIZE - 1)) + +-#define stage2_pgd_none(pgd) pgd_none(pgd) +-#define stage2_pgd_clear(pgd) pgd_clear(pgd) +-#define stage2_pgd_present(pgd) pgd_present(pgd) +-#define stage2_pgd_populate(pgd, pud) pgd_populate(NULL, pgd, pud) +-#define stage2_pud_offset(pgd, address) pud_offset(pgd, address) +-#define stage2_pud_free(pud) pud_free(NULL, pud) ++static inline bool stage2_pgd_none(struct kvm *kvm, pgd_t pgd) ++{ ++ if (kvm_stage2_has_pud(kvm)) ++ return pgd_none(pgd); ++ else ++ return 0; ++} + +-#define stage2_pud_table_empty(pudp) kvm_page_empty(pudp) ++static inline void stage2_pgd_clear(struct kvm *kvm, pgd_t *pgdp) ++{ ++ if (kvm_stage2_has_pud(kvm)) ++ pgd_clear(pgdp); ++} + +-static inline phys_addr_t stage2_pud_addr_end(phys_addr_t addr, phys_addr_t end) ++static inline bool stage2_pgd_present(struct kvm *kvm, pgd_t pgd) + { +- phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK; ++ if (kvm_stage2_has_pud(kvm)) ++ return pgd_present(pgd); ++ else ++ return 1; ++} + +- return (boundary - 1 < end - 1) ? 
boundary : end; ++static inline void stage2_pgd_populate(struct kvm *kvm, pgd_t *pgd, pud_t *pud) ++{ ++ if (kvm_stage2_has_pud(kvm)) ++ pgd_populate(NULL, pgd, pud); ++} ++ ++static inline pud_t *stage2_pud_offset(struct kvm *kvm, ++ pgd_t *pgd, unsigned long address) ++{ ++ if (kvm_stage2_has_pud(kvm)) ++ return pud_offset(pgd, address); ++ else ++ return (pud_t *)pgd; + } + +-#endif /* STAGE2_PGTABLE_LEVELS > 3 */ ++static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud) ++{ ++ if (kvm_stage2_has_pud(kvm)) ++ pud_free(NULL, pud); ++} + ++static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp) ++{ ++ if (kvm_stage2_has_pud(kvm)) ++ return kvm_page_empty(pudp); ++ else ++ return false; ++} + +-#if STAGE2_PGTABLE_LEVELS > 2 ++static inline phys_addr_t ++stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) ++{ ++ if (kvm_stage2_has_pud(kvm)) { ++ phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK; ++ ++ return (boundary - 1 < end - 1) ? boundary : end; ++ } else { ++ return end; ++ } ++} ++ ++/* Stage2 PMD definitions when the level is present */ ++static inline bool kvm_stage2_has_pmd(struct kvm *kvm) ++{ ++ return (CONFIG_PGTABLE_LEVELS > 2) && (kvm_stage2_levels(kvm) > 2); ++} + + #define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2) +-#define S2_PMD_SIZE (_AC(1, UL) << S2_PMD_SHIFT) ++#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT) + #define S2_PMD_MASK (~(S2_PMD_SIZE - 1)) + +-#define stage2_pud_none(pud) pud_none(pud) +-#define stage2_pud_clear(pud) pud_clear(pud) +-#define stage2_pud_present(pud) pud_present(pud) +-#define stage2_pud_populate(pud, pmd) pud_populate(NULL, pud, pmd) +-#define stage2_pmd_offset(pud, address) pmd_offset(pud, address) +-#define stage2_pmd_free(pmd) pmd_free(NULL, pmd) ++static inline bool stage2_pud_none(struct kvm *kvm, pud_t pud) ++{ ++ if (kvm_stage2_has_pmd(kvm)) ++ return pud_none(pud); ++ else ++ return 0; ++} ++ ++static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pud) ++{ ++ if (kvm_stage2_has_pmd(kvm)) ++ pud_clear(pud); ++} + +-#define stage2_pud_huge(pud) pud_huge(pud) +-#define stage2_pmd_table_empty(pmdp) kvm_page_empty(pmdp) ++static inline bool stage2_pud_present(struct kvm *kvm, pud_t pud) ++{ ++ if (kvm_stage2_has_pmd(kvm)) ++ return pud_present(pud); ++ else ++ return 1; ++} + +-static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end) ++static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pud, pmd_t *pmd) + { +- phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK; ++ if (kvm_stage2_has_pmd(kvm)) ++ pud_populate(NULL, pud, pmd); ++} + +- return (boundary - 1 < end - 1) ? 
boundary : end; ++static inline pmd_t *stage2_pmd_offset(struct kvm *kvm, ++ pud_t *pud, unsigned long address) ++{ ++ if (kvm_stage2_has_pmd(kvm)) ++ return pmd_offset(pud, address); ++ else ++ return (pmd_t *)pud; + } + +-#endif /* STAGE2_PGTABLE_LEVELS > 2 */ ++static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd) ++{ ++ if (kvm_stage2_has_pmd(kvm)) ++ pmd_free(NULL, pmd); ++} ++ ++static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud) ++{ ++ if (kvm_stage2_has_pmd(kvm)) ++ return pud_huge(pud); ++ else ++ return 0; ++} ++ ++static inline bool stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp) ++{ ++ if (kvm_stage2_has_pmd(kvm)) ++ return kvm_page_empty(pmdp); ++ else ++ return 0; ++} + +-#define stage2_pte_table_empty(ptep) kvm_page_empty(ptep) ++static inline phys_addr_t ++stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) ++{ ++ if (kvm_stage2_has_pmd(kvm)) { ++ phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK; + +-#if STAGE2_PGTABLE_LEVELS == 2 +-#include +-#elif STAGE2_PGTABLE_LEVELS == 3 +-#include +-#endif ++ return (boundary - 1 < end - 1) ? boundary : end; ++ } else { ++ return end; ++ } ++} + ++static inline bool stage2_pte_table_empty(struct kvm *kvm, pte_t *ptep) ++{ ++ return kvm_page_empty(ptep); ++} + +-#define stage2_pgd_index(addr) (((addr) >> S2_PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)) ++static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr) ++{ ++ return (((addr) >> stage2_pgdir_shift(kvm)) & (stage2_pgd_ptrs(kvm) - 1)); ++} + +-static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end) ++static inline phys_addr_t ++stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) + { +- phys_addr_t boundary = (addr + S2_PGDIR_SIZE) & S2_PGDIR_MASK; ++ phys_addr_t boundary = (addr + stage2_pgdir_size(kvm)) & stage2_pgdir_mask(kvm); + + return (boundary - 1 < end - 1) ? boundary : end; + } +diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c +index 93f69d82225d..e14d600d7877 100644 +--- a/arch/arm64/kernel/cpufeature.c ++++ b/arch/arm64/kernel/cpufeature.c +@@ -20,6 +20,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -117,6 +118,7 @@ EXPORT_SYMBOL(cpu_hwcap_keys); + static bool __maybe_unused + cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused); + ++static void cpu_enable_cnp(struct arm64_cpu_capabilities const *cap); + + /* + * NOTE: Any changes to the visibility of features should be kept in +@@ -873,6 +875,29 @@ static bool has_cache_dic(const struct arm64_cpu_capabilities *entry, + return ctr & BIT(CTR_DIC_SHIFT); + } + ++static bool nocnp; ++ ++static int __init early_nocnp(char *p) ++{ ++ nocnp = true; ++ return 0; ++} ++early_param("nocnp", early_nocnp); ++ ++static bool __maybe_unused ++has_useable_cnp(const struct arm64_cpu_capabilities *entry, int scope) ++{ ++ /* ++ * Kdump isn't guaranteed to power-off all secondary CPUs, CNP ++ * may share TLB entries with a CPU stuck in the crashed ++ * kernel. 
++ */ ++ if (is_kdump_kernel()) ++ return false; ++ ++ return has_cpuid_feature(entry, scope) && !nocnp; ++} ++ + #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + static int __kpti_forced; /* 0: not forced, >0: forced on, <0: forced off */ + +@@ -1235,6 +1260,19 @@ static const struct arm64_cpu_capabilities arm64_features[] = { + .matches = has_hw_dbm, + .cpu_enable = cpu_enable_hw_dbm, + }, ++#endif ++#ifdef CONFIG_ARM64_CNP ++ { ++ .desc = "Common not Private translations", ++ .capability = ARM64_HAS_CNP, ++ .type = ARM64_CPUCAP_SYSTEM_FEATURE, ++ .matches = has_useable_cnp, ++ .sys_reg = SYS_ID_AA64MMFR2_EL1, ++ .sign = FTR_UNSIGNED, ++ .field_pos = ID_AA64MMFR2_CNP_SHIFT, ++ .min_field_value = 1, ++ .cpu_enable = cpu_enable_cnp, ++ }, + #endif + {}, + }; +@@ -1672,6 +1710,11 @@ cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused) + return (cpus_have_const_cap(ARM64_HAS_PAN) && !cpus_have_const_cap(ARM64_HAS_UAO)); + } + ++static void __maybe_unused cpu_enable_cnp(struct arm64_cpu_capabilities const *cap) ++{ ++ cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); ++} ++ + /* + * We emulate only the following system register space. + * Op0 = 0x3, CRn = 0x0, Op1 = 0x0, CRm = [0, 4 - 7] +diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c +index 70c283368b64..9405d1b7f4b0 100644 +--- a/arch/arm64/kernel/suspend.c ++++ b/arch/arm64/kernel/suspend.c +@@ -48,6 +48,10 @@ void notrace __cpu_suspend_exit(void) + */ + cpu_uninstall_idmap(); + ++ /* Restore CnP bit in TTBR1_EL1 */ ++ if (system_supports_cnp()) ++ cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); ++ + /* + * PSTATE was not saved over suspend/resume, re-enable any detected + * features that might not have been set correctly. +diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c +index a6c9fbaeaefc..dd436a50fce7 100644 +--- a/arch/arm64/kvm/guest.c ++++ b/arch/arm64/kvm/guest.c +@@ -391,15 +391,15 @@ int __attribute_const__ kvm_target_cpu(void) + return KVM_ARM_TARGET_CORTEX_A53; + case ARM_CPU_PART_CORTEX_A57: + return KVM_ARM_TARGET_CORTEX_A57; +- }; ++ } + break; + case ARM_CPU_IMP_APM: + switch (part_number) { + case APM_CPU_PART_POTENZA: + return KVM_ARM_TARGET_XGENE_POTENZA; +- }; ++ } + break; +- }; ++ } + + /* Return a default generic target */ + return KVM_ARM_TARGET_GENERIC_V8; +diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S +index ea9225160786..4576b86a5579 100644 +--- a/arch/arm64/kvm/hyp-init.S ++++ b/arch/arm64/kvm/hyp-init.S +@@ -65,6 +65,9 @@ __do_hyp_init: + b.lo __kvm_handle_stub_hvc + + phys_to_ttbr x4, x0 ++alternative_if ARM64_HAS_CNP ++ orr x4, x4, #TTBR_CNP_BIT ++alternative_else_nop_endif + msr ttbr0_el2, x4 + + mrs x4, tcr_el1 +diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile +index 2fabc2dc1966..82d1904328ad 100644 +--- a/arch/arm64/kvm/hyp/Makefile ++++ b/arch/arm64/kvm/hyp/Makefile +@@ -19,7 +19,6 @@ obj-$(CONFIG_KVM_ARM_HOST) += switch.o + obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o + obj-$(CONFIG_KVM_ARM_HOST) += tlb.o + obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o +-obj-$(CONFIG_KVM_ARM_HOST) += s2-setup.o + + # KVM code is run at a different exception code with a different map, so + # compiler instrumentation that inserts callbacks or checks into the code may +diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c +deleted file mode 100644 +index 603e1ee83e89..000000000000 +--- a/arch/arm64/kvm/hyp/s2-setup.c ++++ /dev/null +@@ -1,90 +0,0 @@ +-/* +- * Copyright (C) 2016 - ARM Ltd +- * Author: Marc Zyngier +- * +- * This 
program is free software; you can redistribute it and/or modify +- * it under the terms of the GNU General Public License version 2 as +- * published by the Free Software Foundation. +- * +- * This program is distributed in the hope that it will be useful, +- * but WITHOUT ANY WARRANTY; without even the implied warranty of +- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +- * GNU General Public License for more details. +- * +- * You should have received a copy of the GNU General Public License +- * along with this program. If not, see . +- */ +- +-#include +-#include +-#include +-#include +- +-u32 __hyp_text __init_stage2_translation(void) +-{ +- u64 val = VTCR_EL2_FLAGS; +- u64 parange; +- u64 tmp; +- +- /* +- * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS +- * bits in VTCR_EL2. Amusingly, the PARange is 4 bits, while +- * PS is only 3. Fortunately, bit 19 is RES0 in VTCR_EL2... +- */ +- parange = read_sysreg(id_aa64mmfr0_el1) & 7; +- if (parange > ID_AA64MMFR0_PARANGE_MAX) +- parange = ID_AA64MMFR0_PARANGE_MAX; +- val |= parange << 16; +- +- /* Compute the actual PARange... */ +- switch (parange) { +- case 0: +- parange = 32; +- break; +- case 1: +- parange = 36; +- break; +- case 2: +- parange = 40; +- break; +- case 3: +- parange = 42; +- break; +- case 4: +- parange = 44; +- break; +- case 5: +- default: +- parange = 48; +- break; +- } +- +- /* +- * ... and clamp it to 40 bits, unless we have some braindead +- * HW that implements less than that. In all cases, we'll +- * return that value for the rest of the kernel to decide what +- * to do. +- */ +- val |= 64 - (parange > 40 ? 40 : parange); +- +- /* +- * Check the availability of Hardware Access Flag / Dirty Bit +- * Management in ID_AA64MMFR1_EL1 and enable the feature in VTCR_EL2. +- */ +- tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_HADBS_SHIFT) & 0xf; +- if (tmp) +- val |= VTCR_EL2_HA; +- +- /* +- * Read the VMIDBits bits from ID_AA64MMFR1_EL1 and set the VS +- * bit in VTCR_EL2. +- */ +- tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_VMIDBITS_SHIFT) & 0xf; +- val |= (tmp == ID_AA64MMFR1_VMIDBITS_16) ? +- VTCR_EL2_VS_16BIT : +- VTCR_EL2_VS_8BIT; +- +- write_sysreg(val, vtcr_el2); +- +- return parange; +-} +diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c +index a1c32c1f2267..f6e02cc4d856 100644 +--- a/arch/arm64/kvm/hyp/switch.c ++++ b/arch/arm64/kvm/hyp/switch.c +@@ -198,7 +198,7 @@ void deactivate_traps_vhe_put(void) + + static void __hyp_text __activate_vm(struct kvm *kvm) + { +- write_sysreg(kvm->arch.vttbr, vttbr_el2); ++ __load_guest_stage2(kvm); + } + + static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu) +@@ -263,7 +263,7 @@ static bool __hyp_text __translate_far_to_hpfar(u64 far, u64 *hpfar) + return false; /* Translation failed, back to guest */ + + /* Convert PAR to HPFAR format */ +- *hpfar = ((tmp >> 12) & ((1UL << 36) - 1)) << 4; ++ *hpfar = PAR_TO_HPFAR(tmp); + return true; + } + +diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c +index c041eab3dce0..7fcc9c1a5f45 100644 +--- a/arch/arm64/kvm/hyp/tlb.c ++++ b/arch/arm64/kvm/hyp/tlb.c +@@ -35,7 +35,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm, + * bits. Changing E2H is impossible (goodbye TTBR1_EL2), so + * let's flip TGE before executing the TLB operation. 
+ */ +- write_sysreg(kvm->arch.vttbr, vttbr_el2); ++ __load_guest_stage2(kvm); + val = read_sysreg(hcr_el2); + val &= ~HCR_TGE; + write_sysreg(val, hcr_el2); +@@ -45,7 +45,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm, + static void __hyp_text __tlb_switch_to_guest_nvhe(struct kvm *kvm, + unsigned long *flags) + { +- write_sysreg(kvm->arch.vttbr, vttbr_el2); ++ __load_guest_stage2(kvm); + isb(); + } + +diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c +index e37c78bbe1ca..b72a3dd56204 100644 +--- a/arch/arm64/kvm/reset.c ++++ b/arch/arm64/kvm/reset.c +@@ -26,6 +26,7 @@ + + #include + ++#include + #include + #include + #include +@@ -33,6 +34,9 @@ + #include + #include + ++/* Maximum phys_shift supported for any VM on this host */ ++static u32 kvm_ipa_limit; ++ + /* + * ARMv8 Reset Values + */ +@@ -55,12 +59,12 @@ static bool cpu_has_32bit_el1(void) + } + + /** +- * kvm_arch_dev_ioctl_check_extension ++ * kvm_arch_vm_ioctl_check_extension + * + * We currently assume that the number of HW registers is uniform + * across all CPUs (see cpuinfo_sanity_check). + */ +-int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) ++int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext) + { + int r; + +@@ -82,9 +86,11 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) + break; + case KVM_CAP_SET_GUEST_DEBUG: + case KVM_CAP_VCPU_ATTRIBUTES: +- case KVM_CAP_VCPU_EVENTS: + r = 1; + break; ++ case KVM_CAP_ARM_VM_IPA_SIZE: ++ r = kvm_ipa_limit; ++ break; + default: + r = 0; + } +@@ -133,3 +139,99 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) + /* Reset timer */ + return kvm_timer_vcpu_reset(vcpu); + } ++ ++void kvm_set_ipa_limit(void) ++{ ++ unsigned int ipa_max, pa_max, va_max, parange; ++ ++ parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 0x7; ++ pa_max = id_aa64mmfr0_parange_to_phys_shift(parange); ++ ++ /* Clamp the IPA limit to the PA size supported by the kernel */ ++ ipa_max = (pa_max > PHYS_MASK_SHIFT) ? PHYS_MASK_SHIFT : pa_max; ++ /* ++ * Since our stage2 table is dependent on the stage1 page table code, ++ * we must always honor the following condition: ++ * ++ * Number of levels in Stage1 >= Number of levels in Stage2. ++ * ++ * So clamp the ipa limit further down to limit the number of levels. ++ * Since we can concatenate upto 16 tables at entry level, we could ++ * go upto 4bits above the maximum VA addressible with the current ++ * number of levels. ++ */ ++ va_max = PGDIR_SHIFT + PAGE_SHIFT - 3; ++ va_max += 4; ++ ++ if (va_max < ipa_max) ++ ipa_max = va_max; ++ ++ /* ++ * If the final limit is lower than the real physical address ++ * limit of the CPUs, report the reason. ++ */ ++ if (ipa_max < pa_max) ++ pr_info("kvm: Limiting the IPA size due to kernel %s Address limit\n", ++ (va_max < pa_max) ? "Virtual" : "Physical"); ++ ++ WARN(ipa_max < KVM_PHYS_SHIFT, ++ "KVM IPA limit (%d bit) is smaller than default size\n", ipa_max); ++ kvm_ipa_limit = ipa_max; ++ kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit); ++} ++ ++/* ++ * Configure the VTCR_EL2 for this VM. The VTCR value is common ++ * across all the physical CPUs on the system. We use system wide ++ * sanitised values to fill in different fields, except for Hardware ++ * Management of Access Flags. HA Flag is set unconditionally on ++ * all CPUs, as it is safe to run with or without the feature and ++ * the bit is RES0 on CPUs that don't support it. 
++ */ ++int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) ++{ ++ u64 vtcr = VTCR_EL2_FLAGS; ++ u32 parange, phys_shift; ++ u8 lvls; ++ ++ if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) ++ return -EINVAL; ++ ++ phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); ++ if (phys_shift) { ++ if (phys_shift > kvm_ipa_limit || ++ phys_shift < 32) ++ return -EINVAL; ++ } else { ++ phys_shift = KVM_PHYS_SHIFT; ++ } ++ ++ parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 7; ++ if (parange > ID_AA64MMFR0_PARANGE_MAX) ++ parange = ID_AA64MMFR0_PARANGE_MAX; ++ vtcr |= parange << VTCR_EL2_PS_SHIFT; ++ ++ vtcr |= VTCR_EL2_T0SZ(phys_shift); ++ /* ++ * Use a minimum 2 level page table to prevent splitting ++ * host PMD huge pages at stage2. ++ */ ++ lvls = stage2_pgtable_levels(phys_shift); ++ if (lvls < 2) ++ lvls = 2; ++ vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls); ++ ++ /* ++ * Enable the Hardware Access Flag management, unconditionally ++ * on all CPUs. The features is RES0 on CPUs without the support ++ * and must be ignored by the CPUs. ++ */ ++ vtcr |= VTCR_EL2_HA; ++ ++ /* Set the vmid bits */ ++ vtcr |= (kvm_get_vmid_bits() == 16) ? ++ VTCR_EL2_VS_16BIT : ++ VTCR_EL2_VS_8BIT; ++ kvm->arch.vtcr = vtcr; ++ return 0; ++} +diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c +index c127f94da8e2..a65af49e12e7 100644 +--- a/arch/arm64/mm/context.c ++++ b/arch/arm64/mm/context.c +@@ -196,6 +196,9 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) + unsigned long flags; + u64 asid, old_active_asid; + ++ if (system_supports_cnp()) ++ cpu_set_reserved_ttbr0(); ++ + asid = atomic64_read(&mm->context.id); + + /* +diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S +index 03646e6a2ef4..2c75b0b903ae 100644 +--- a/arch/arm64/mm/proc.S ++++ b/arch/arm64/mm/proc.S +@@ -160,6 +160,12 @@ ENTRY(cpu_do_switch_mm) + mrs x2, ttbr1_el1 + mmid x1, x1 // get mm->context.id + phys_to_ttbr x3, x0 ++ ++alternative_if ARM64_HAS_CNP ++ cbz x1, 1f // skip CNP for reserved ASID ++ orr x3, x3, #TTBR_CNP_BIT ++1: ++alternative_else_nop_endif + #ifdef CONFIG_ARM64_SW_TTBR0_PAN + bfi x3, x1, #48, #16 // set the ASID field in TTBR0 + #endif +@@ -184,7 +190,7 @@ ENDPROC(cpu_do_switch_mm) + .endm + + /* +- * void idmap_cpu_replace_ttbr1(phys_addr_t new_pgd) ++ * void idmap_cpu_replace_ttbr1(phys_addr_t ttbr1) + * + * This is the low-level counterpart to cpu_replace_ttbr1, and should not be + * called by anything else. It can only be executed from a TTBR0 mapping. 
+@@ -194,8 +200,7 @@ ENTRY(idmap_cpu_replace_ttbr1) + + __idmap_cpu_set_reserved_ttbr1 x1, x3 + +- phys_to_ttbr x3, x0 +- msr ttbr1_el1, x3 ++ msr ttbr1_el1, x0 + isb + + restore_daif x2 +diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h +index 8bdbb5f29494..74b0aa9c7499 100644 +--- a/include/linux/irqchip/arm-gic-v3.h ++++ b/include/linux/irqchip/arm-gic-v3.h +@@ -357,6 +357,8 @@ + #define GITS_CBASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWt) + #define GITS_CBASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWb) + ++#define GITS_CBASER_ADDRESS(cbaser) ((cbaser) & GENMASK_ULL(51, 12)) ++ + #define GITS_BASER_NR_REGS 8 + + #define GITS_BASER_VALID (1ULL << 63) +@@ -388,6 +390,9 @@ + #define GITS_BASER_ENTRY_SIZE_MASK GENMASK_ULL(52, 48) + #define GITS_BASER_PHYS_52_to_48(phys) \ + (((phys) & GENMASK_ULL(47, 16)) | (((phys) >> 48) & 0xf) << 12) ++#define GITS_BASER_ADDR_48_to_52(baser) \ ++ (((baser) & GENMASK_ULL(47, 16)) | (((baser) >> 12) & 0xf) << 48) ++ + #define GITS_BASER_SHAREABILITY_SHIFT (10) + #define GITS_BASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) +diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h +index 251be353f950..c6a2f49b2d2e 100644 +--- a/include/uapi/linux/kvm.h ++++ b/include/uapi/linux/kvm.h +@@ -750,6 +750,15 @@ struct kvm_ppc_resize_hpt { + + #define KVM_S390_SIE_PAGE_OFFSET 1 + ++/* ++ * On arm64, machine type can be used to request the physical ++ * address size for the VM. Bits[7-0] are reserved for the guest ++ * PA size shift (i.e, log2(PA_Size)). For backward compatibility, ++ * value 0 implies the default IPA size, 40bits. ++ */ ++#define KVM_VM_TYPE_ARM_IPA_SIZE_MASK 0xffULL ++#define KVM_VM_TYPE_ARM_IPA_SIZE(x) \ ++ ((x) & KVM_VM_TYPE_ARM_IPA_SIZE_MASK) + /* + * ioctls for /dev/kvm fds: + */ +@@ -953,6 +962,12 @@ struct kvm_ppc_resize_hpt { + #define KVM_CAP_NESTED_STATE 157 + #define KVM_CAP_ARM_INJECT_SERROR_ESR 158 + #define KVM_CAP_MSR_PLATFORM_INFO 159 ++#define KVM_CAP_PPC_NESTED_HV 160 ++#define KVM_CAP_HYPERV_SEND_IPI 161 ++#define KVM_CAP_COALESCED_PIO 162 ++#define KVM_CAP_HYPERV_ENLIGHTENED_VMCS 163 ++#define KVM_CAP_EXCEPTION_PAYLOAD 164 ++#define KVM_CAP_ARM_VM_IPA_SIZE 165 + + #ifdef KVM_CAP_IRQ_ROUTING + +diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c +index 91495045ad5a..abcd29db2d7a 100644 +--- a/virt/kvm/arm/arm.c ++++ b/virt/kvm/arm/arm.c +@@ -120,8 +120,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) + { + int ret, cpu; + +- if (type) +- return -EINVAL; ++ ret = kvm_arm_setup_stage2(kvm, type); ++ if (ret) ++ return ret; + + kvm->arch.last_vcpu_ran = alloc_percpu(typeof(*kvm->arch.last_vcpu_ran)); + if (!kvm->arch.last_vcpu_ran) +@@ -212,6 +213,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) + case KVM_CAP_READONLY_MEM: + case KVM_CAP_MP_STATE: + case KVM_CAP_IMMEDIATE_EXIT: ++ case KVM_CAP_VCPU_EVENTS: + r = 1; + break; + case KVM_CAP_ARM_SET_DEVICE_ADDR: +@@ -240,7 +242,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) + r = 1; + break; + default: +- r = kvm_arch_dev_ioctl_check_extension(kvm, ext); ++ r = kvm_arch_vm_ioctl_check_extension(kvm, ext); + break; + } + return r; +@@ -498,7 +500,7 @@ static bool need_new_vmid_gen(struct kvm *kvm) + static void update_vttbr(struct kvm *kvm) + { + phys_addr_t pgd_phys; +- u64 vmid; ++ u64 vmid, cnp = kvm_cpu_has_cnp() ? 
VTTBR_CNP_BIT : 0; + + if (!need_new_vmid_gen(kvm)) + return; +@@ -540,9 +542,9 @@ static void update_vttbr(struct kvm *kvm) + + /* update vttbr to be used with the new vmid */ + pgd_phys = virt_to_phys(kvm->arch.pgd); +- BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK); ++ BUG_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)); + vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits); +- kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid; ++ kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid | cnp; + + smp_wmb(); + WRITE_ONCE(kvm->arch.vmid_gen, atomic64_read(&kvm_vmid_gen)); +@@ -1306,16 +1308,10 @@ static void cpu_hyp_reinit(void) + { + cpu_hyp_reset(); + +- if (is_kernel_in_hyp_mode()) { +- /* +- * __cpu_init_stage2() is safe to call even if the PM +- * event was cancelled before the CPU was reset. +- */ +- __cpu_init_stage2(); ++ if (is_kernel_in_hyp_mode()) + kvm_timer_init_vhe(); +- } else { ++ else + cpu_init_hyp_mode(NULL); +- } + + kvm_arm_init_debug(); + +@@ -1411,6 +1407,8 @@ static int init_common_resources(void) + kvm_vmid_bits = kvm_get_vmid_bits(); + kvm_info("%d-bit VMID\n", kvm_vmid_bits); + ++ kvm_set_ipa_limit(); ++ + return 0; + } + +diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c +index 2f405b0be25c..c23a1b323aad 100644 +--- a/virt/kvm/arm/mmu.c ++++ b/virt/kvm/arm/mmu.c +@@ -45,7 +45,6 @@ static phys_addr_t hyp_idmap_vector; + + static unsigned long io_map_base; + +-#define S2_PGD_SIZE (PTRS_PER_S2_PGD * sizeof(pgd_t)) + #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) + + #define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0) +@@ -150,20 +149,20 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) + + static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) + { +- pud_t *pud_table __maybe_unused = stage2_pud_offset(pgd, 0UL); +- stage2_pgd_clear(pgd); ++ pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL); ++ stage2_pgd_clear(kvm, pgd); + kvm_tlb_flush_vmid_ipa(kvm, addr); +- stage2_pud_free(pud_table); ++ stage2_pud_free(kvm, pud_table); + put_page(virt_to_page(pgd)); + } + + static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) + { +- pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(pud, 0); +- VM_BUG_ON(stage2_pud_huge(*pud)); +- stage2_pud_clear(pud); ++ pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0); ++ VM_BUG_ON(stage2_pud_huge(kvm, *pud)); ++ stage2_pud_clear(kvm, pud); + kvm_tlb_flush_vmid_ipa(kvm, addr); +- stage2_pmd_free(pmd_table); ++ stage2_pmd_free(kvm, pmd_table); + put_page(virt_to_page(pud)); + } + +@@ -252,7 +251,7 @@ static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd, + } + } while (pte++, addr += PAGE_SIZE, addr != end); + +- if (stage2_pte_table_empty(start_pte)) ++ if (stage2_pte_table_empty(kvm, start_pte)) + clear_stage2_pmd_entry(kvm, pmd, start_addr); + } + +@@ -262,9 +261,9 @@ static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud, + phys_addr_t next, start_addr = addr; + pmd_t *pmd, *start_pmd; + +- start_pmd = pmd = stage2_pmd_offset(pud, addr); ++ start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr); + do { +- next = stage2_pmd_addr_end(addr, end); ++ next = stage2_pmd_addr_end(kvm, addr, end); + if (!pmd_none(*pmd)) { + if (pmd_thp_or_huge(*pmd)) { + pmd_t old_pmd = *pmd; +@@ -281,7 +280,7 @@ static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud, + } + } while (pmd++, addr = next, addr != end); + +- if (stage2_pmd_table_empty(start_pmd)) ++ if (stage2_pmd_table_empty(kvm, start_pmd)) + 
clear_stage2_pud_entry(kvm, pud, start_addr); + } + +@@ -291,14 +290,14 @@ static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, + phys_addr_t next, start_addr = addr; + pud_t *pud, *start_pud; + +- start_pud = pud = stage2_pud_offset(pgd, addr); ++ start_pud = pud = stage2_pud_offset(kvm, pgd, addr); + do { +- next = stage2_pud_addr_end(addr, end); +- if (!stage2_pud_none(*pud)) { +- if (stage2_pud_huge(*pud)) { ++ next = stage2_pud_addr_end(kvm, addr, end); ++ if (!stage2_pud_none(kvm, *pud)) { ++ if (stage2_pud_huge(kvm, *pud)) { + pud_t old_pud = *pud; + +- stage2_pud_clear(pud); ++ stage2_pud_clear(kvm, pud); + kvm_tlb_flush_vmid_ipa(kvm, addr); + kvm_flush_dcache_pud(old_pud); + put_page(virt_to_page(pud)); +@@ -308,7 +307,7 @@ static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, + } + } while (pud++, addr = next, addr != end); + +- if (stage2_pud_table_empty(start_pud)) ++ if (stage2_pud_table_empty(kvm, start_pud)) + clear_stage2_pgd_entry(kvm, pgd, start_addr); + } + +@@ -332,7 +331,7 @@ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) + assert_spin_locked(&kvm->mmu_lock); + WARN_ON(size & ~PAGE_MASK); + +- pgd = kvm->arch.pgd + stage2_pgd_index(addr); ++ pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); + do { + /* + * Make sure the page table is still active, as another thread +@@ -341,8 +340,8 @@ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) + */ + if (!READ_ONCE(kvm->arch.pgd)) + break; +- next = stage2_pgd_addr_end(addr, end); +- if (!stage2_pgd_none(*pgd)) ++ next = stage2_pgd_addr_end(kvm, addr, end); ++ if (!stage2_pgd_none(kvm, *pgd)) + unmap_stage2_puds(kvm, pgd, addr, next); + /* + * If the range is too large, release the kvm->mmu_lock +@@ -371,9 +370,9 @@ static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud, + pmd_t *pmd; + phys_addr_t next; + +- pmd = stage2_pmd_offset(pud, addr); ++ pmd = stage2_pmd_offset(kvm, pud, addr); + do { +- next = stage2_pmd_addr_end(addr, end); ++ next = stage2_pmd_addr_end(kvm, addr, end); + if (!pmd_none(*pmd)) { + if (pmd_thp_or_huge(*pmd)) + kvm_flush_dcache_pmd(*pmd); +@@ -389,11 +388,11 @@ static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, + pud_t *pud; + phys_addr_t next; + +- pud = stage2_pud_offset(pgd, addr); ++ pud = stage2_pud_offset(kvm, pgd, addr); + do { +- next = stage2_pud_addr_end(addr, end); +- if (!stage2_pud_none(*pud)) { +- if (stage2_pud_huge(*pud)) ++ next = stage2_pud_addr_end(kvm, addr, end); ++ if (!stage2_pud_none(kvm, *pud)) { ++ if (stage2_pud_huge(kvm, *pud)) + kvm_flush_dcache_pud(*pud); + else + stage2_flush_pmds(kvm, pud, addr, next); +@@ -409,10 +408,11 @@ static void stage2_flush_memslot(struct kvm *kvm, + phys_addr_t next; + pgd_t *pgd; + +- pgd = kvm->arch.pgd + stage2_pgd_index(addr); ++ pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); + do { +- next = stage2_pgd_addr_end(addr, end); +- stage2_flush_puds(kvm, pgd, addr, next); ++ next = stage2_pgd_addr_end(kvm, addr, end); ++ if (!stage2_pgd_none(kvm, *pgd)) ++ stage2_flush_puds(kvm, pgd, addr, next); + } while (pgd++, addr = next, addr != end); + } + +@@ -897,7 +897,7 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm) + } + + /* Allocate the HW PGD, making sure that each page gets its own refcount */ +- pgd = alloc_pages_exact(S2_PGD_SIZE, GFP_KERNEL | __GFP_ZERO); ++ pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO); + if (!pgd) + return -ENOMEM; + +@@ -986,7 +986,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm) + + spin_lock(&kvm->mmu_lock); + if 
(kvm->arch.pgd) { +- unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); ++ unmap_stage2_range(kvm, 0, kvm_phys_size(kvm)); + pgd = READ_ONCE(kvm->arch.pgd); + kvm->arch.pgd = NULL; + } +@@ -994,7 +994,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm) + + /* Free the HW pgd, one page at a time */ + if (pgd) +- free_pages_exact(pgd, S2_PGD_SIZE); ++ free_pages_exact(pgd, stage2_pgd_size(kvm)); + } + + static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, +@@ -1003,16 +1003,16 @@ static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache + pgd_t *pgd; + pud_t *pud; + +- pgd = kvm->arch.pgd + stage2_pgd_index(addr); +- if (WARN_ON(stage2_pgd_none(*pgd))) { ++ pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); ++ if (stage2_pgd_none(kvm, *pgd)) { + if (!cache) + return NULL; + pud = mmu_memory_cache_alloc(cache); +- stage2_pgd_populate(pgd, pud); ++ stage2_pgd_populate(kvm, pgd, pud); + get_page(virt_to_page(pgd)); + } + +- return stage2_pud_offset(pgd, addr); ++ return stage2_pud_offset(kvm, pgd, addr); + } + + static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, +@@ -1025,15 +1025,15 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache + if (!pud) + return NULL; + +- if (stage2_pud_none(*pud)) { ++ if (stage2_pud_none(kvm, *pud)) { + if (!cache) + return NULL; + pmd = mmu_memory_cache_alloc(cache); +- stage2_pud_populate(pud, pmd); ++ stage2_pud_populate(kvm, pud, pmd); + get_page(virt_to_page(pud)); + } + +- return stage2_pmd_offset(pud, addr); ++ return stage2_pmd_offset(kvm, pud, addr); + } + + static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache +@@ -1207,8 +1207,9 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, + if (writable) + pte = kvm_s2pte_mkwrite(pte); + +- ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES, +- KVM_NR_MEM_OBJS); ++ ret = mmu_topup_memory_cache(&cache, ++ kvm_mmu_cache_min_pages(kvm), ++ KVM_NR_MEM_OBJS); + if (ret) + goto out; + spin_lock(&kvm->mmu_lock); +@@ -1302,19 +1303,21 @@ static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) + + /** + * stage2_wp_pmds - write protect PUD range ++ * kvm: kvm instance for the VM + * @pud: pointer to pud entry + * @addr: range start address + * @end: range end address + */ +-static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) ++static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud, ++ phys_addr_t addr, phys_addr_t end) + { + pmd_t *pmd; + phys_addr_t next; + +- pmd = stage2_pmd_offset(pud, addr); ++ pmd = stage2_pmd_offset(kvm, pud, addr); + + do { +- next = stage2_pmd_addr_end(addr, end); ++ next = stage2_pmd_addr_end(kvm, addr, end); + if (!pmd_none(*pmd)) { + if (pmd_thp_or_huge(*pmd)) { + if (!kvm_s2pmd_readonly(pmd)) +@@ -1334,18 +1337,19 @@ static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) + * + * Process PUD entries, for a huge PUD we cause a panic. 
+ */ +-static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) ++static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, ++ phys_addr_t addr, phys_addr_t end) + { + pud_t *pud; + phys_addr_t next; + +- pud = stage2_pud_offset(pgd, addr); ++ pud = stage2_pud_offset(kvm, pgd, addr); + do { +- next = stage2_pud_addr_end(addr, end); +- if (!stage2_pud_none(*pud)) { ++ next = stage2_pud_addr_end(kvm, addr, end); ++ if (!stage2_pud_none(kvm, *pud)) { + /* TODO:PUD not supported, revisit later if supported */ +- BUG_ON(stage2_pud_huge(*pud)); +- stage2_wp_pmds(pud, addr, next); ++ BUG_ON(stage2_pud_huge(kvm, *pud)); ++ stage2_wp_pmds(kvm, pud, addr, next); + } + } while (pud++, addr = next, addr != end); + } +@@ -1361,7 +1365,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) + pgd_t *pgd; + phys_addr_t next; + +- pgd = kvm->arch.pgd + stage2_pgd_index(addr); ++ pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); + do { + /* + * Release kvm_mmu_lock periodically if the memory region is +@@ -1375,9 +1379,9 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) + cond_resched_lock(&kvm->mmu_lock); + if (!READ_ONCE(kvm->arch.pgd)) + break; +- next = stage2_pgd_addr_end(addr, end); +- if (stage2_pgd_present(*pgd)) +- stage2_wp_puds(pgd, addr, next); ++ next = stage2_pgd_addr_end(kvm, addr, end); ++ if (stage2_pgd_present(kvm, *pgd)) ++ stage2_wp_puds(kvm, pgd, addr, next); + } while (pgd++, addr = next, addr != end); + } + +@@ -1526,7 +1530,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + up_read(¤t->mm->mmap_sem); + + /* We need minimum second+third level pages */ +- ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES, ++ ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm), + KVM_NR_MEM_OBJS); + if (ret) + return ret; +@@ -1769,7 +1773,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) + } + + /* Userspace should not be able to register out-of-bounds IPAs */ +- VM_BUG_ON(fault_ipa >= KVM_PHYS_SIZE); ++ VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm)); + + if (fault_status == FSC_ACCESS) { + handle_access_fault(vcpu, fault_ipa); +@@ -2068,7 +2072,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, + * space addressable by the KVM guest IPA space. + */ + if (memslot->base_gfn + memslot->npages >= +- (KVM_PHYS_SIZE >> PAGE_SHIFT)) ++ (kvm_phys_size(kvm) >> PAGE_SHIFT)) + return -EFAULT; + + down_read(¤t->mm->mmap_sem); +diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c +index 12502251727e..eb2a390a6c86 100644 +--- a/virt/kvm/arm/vgic/vgic-its.c ++++ b/virt/kvm/arm/vgic/vgic-its.c +@@ -241,13 +241,6 @@ static struct its_ite *find_ite(struct vgic_its *its, u32 device_id, + list_for_each_entry(dev, &(its)->device_list, dev_list) \ + list_for_each_entry(ite, &(dev)->itt_head, ite_list) + +-/* +- * We only implement 48 bits of PA at the moment, although the ITS +- * supports more. Let's be restrictive here. 
+- */ +-#define BASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 16)) +-#define CBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 12)) +- + #define GIC_LPI_OFFSET 8192 + + #define VITS_TYPER_IDBITS 16 +@@ -759,6 +752,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, + { + int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; + u64 indirect_ptr, type = GITS_BASER_TYPE(baser); ++ phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser); + int esz = GITS_BASER_ENTRY_SIZE(baser); + int index; + gfn_t gfn; +@@ -783,7 +777,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, + if (id >= (l1_tbl_size / esz)) + return false; + +- addr = BASER_ADDRESS(baser) + id * esz; ++ addr = base + id * esz; + gfn = addr >> PAGE_SHIFT; + + if (eaddr) +@@ -798,7 +792,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, + + /* Each 1st level entry is represented by a 64-bit value. */ + if (kvm_read_guest_lock(its->dev->kvm, +- BASER_ADDRESS(baser) + index * sizeof(indirect_ptr), ++ base + index * sizeof(indirect_ptr), + &indirect_ptr, sizeof(indirect_ptr))) + return false; + +@@ -808,11 +802,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, + if (!(indirect_ptr & BIT_ULL(63))) + return false; + +- /* +- * Mask the guest physical address and calculate the frame number. +- * Any address beyond our supported 48 bits of PA will be caught +- * by the actual check in the final step. +- */ ++ /* Mask the guest physical address and calculate the frame number. */ + indirect_ptr &= GENMASK_ULL(51, 16); + + /* Find the address of the actual entry */ +@@ -1304,9 +1294,6 @@ static u64 vgic_sanitise_its_baser(u64 reg) + GITS_BASER_OUTER_CACHEABILITY_SHIFT, + vgic_sanitise_outer_cacheability); + +- /* Bits 15:12 contain bits 51:48 of the PA, which we don't support. */ +- reg &= ~GENMASK_ULL(15, 12); +- + /* We support only one (ITS) page size: 64K */ + reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K; + +@@ -1325,11 +1312,8 @@ static u64 vgic_sanitise_its_cbaser(u64 reg) + GITS_CBASER_OUTER_CACHEABILITY_SHIFT, + vgic_sanitise_outer_cacheability); + +- /* +- * Sanitise the physical address to be 64k aligned. +- * Also limit the physical addresses to 48 bits. +- */ +- reg &= ~(GENMASK_ULL(51, 48) | GENMASK_ULL(15, 12)); ++ /* Sanitise the physical address to be 64k aligned. 
*/ ++ reg &= ~GENMASK_ULL(15, 12); + + return reg; + } +@@ -1375,7 +1359,7 @@ static void vgic_its_process_commands(struct kvm *kvm, struct vgic_its *its) + if (!its->enabled) + return; + +- cbaser = CBASER_ADDRESS(its->cbaser); ++ cbaser = GITS_CBASER_ADDRESS(its->cbaser); + + while (its->cwriter != its->creadr) { + int ret = kvm_read_guest_lock(kvm, cbaser + its->creadr, +@@ -2233,7 +2217,7 @@ static int vgic_its_restore_device_tables(struct vgic_its *its) + if (!(baser & GITS_BASER_VALID)) + return 0; + +- l1_gpa = BASER_ADDRESS(baser); ++ l1_gpa = GITS_BASER_ADDR_48_to_52(baser); + + if (baser & GITS_BASER_INDIRECT) { + l1_esz = GITS_LVL1_ENTRY_SIZE; +@@ -2305,7 +2289,7 @@ static int vgic_its_save_collection_table(struct vgic_its *its) + { + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + u64 baser = its->baser_coll_table; +- gpa_t gpa = BASER_ADDRESS(baser); ++ gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser); + struct its_collection *collection; + u64 val; + size_t max_size, filled = 0; +@@ -2354,7 +2338,7 @@ static int vgic_its_restore_collection_table(struct vgic_its *its) + if (!(baser & GITS_BASER_VALID)) + return 0; + +- gpa = BASER_ADDRESS(baser); ++ gpa = GITS_BASER_ADDR_48_to_52(baser); + + max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; + +diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c +index 6ada2432e37c..5099723c38c6 100644 +--- a/virt/kvm/arm/vgic/vgic-kvm-device.c ++++ b/virt/kvm/arm/vgic/vgic-kvm-device.c +@@ -25,7 +25,7 @@ + int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, + phys_addr_t addr, phys_addr_t alignment) + { +- if (addr & ~KVM_PHYS_MASK) ++ if (addr & ~kvm_phys_mask(kvm)) + return -E2BIG; + + if (!IS_ALIGNED(addr, alignment)) +diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c +index a2a175b08b17..b3d1f0985117 100644 +--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c ++++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c +@@ -364,7 +364,6 @@ static u64 vgic_sanitise_pendbaser(u64 reg) + vgic_sanitise_outer_cacheability); + + reg &= ~PENDBASER_RES0_MASK; +- reg &= ~GENMASK_ULL(51, 48); + + return reg; + } +@@ -382,7 +381,6 @@ static u64 vgic_sanitise_propbaser(u64 reg) + vgic_sanitise_outer_cacheability); + + reg &= ~PROPBASER_RES0_MASK; +- reg &= ~GENMASK_ULL(51, 48); + return reg; + } + +-- +2.17.1 +
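
The machine-type bits added to include/uapi/linux/kvm.h above are the userspace-facing half of this series: a VMM first asks the host for its IPA limit via KVM_CAP_ARM_VM_IPA_SIZE, then encodes log2(IPA size) in bits [7:0] of the KVM_CREATE_VM type argument, with 0 keeping the 40-bit default. The sketch below shows that flow under two assumptions: the patched uapi header is what gets installed (otherwise KVM_CAP_ARM_VM_IPA_SIZE and KVM_VM_TYPE_ARM_IPA_SIZE would need local definitions), and the 44-bit request is purely an example value.

    /* Hedged userspace sketch; not part of the patch itself. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
        int sys_fd = open("/dev/kvm", O_RDWR);
        if (sys_fd < 0)
            return 1;

        /* Returns 0 when the capability (and a configurable IPA) is absent. */
        int ipa_limit = ioctl(sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_ARM_VM_IPA_SIZE);
        printf("host IPA limit: %d bits\n", ipa_limit);

        /* Bits [7:0] of the type carry log2(IPA size); 0 keeps the 40-bit default. */
        unsigned long type = 0;
        if (ipa_limit >= 44)
            type = KVM_VM_TYPE_ARM_IPA_SIZE(44);    /* example size only */

        int vm_fd = ioctl(sys_fd, KVM_CREATE_VM, type);
        if (vm_fd < 0)
            perror("KVM_CREATE_VM");

        close(sys_fd);
        return vm_fd < 0;
    }

Since kvm_arm_setup_stage2() rejects anything above the host limit or below 32 bits, probing the capability before KVM_CREATE_VM keeps the same VMM binary working on hosts with different PARange values.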
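
Inside kvm_arm_setup_stage2(), the requested phys_shift becomes a VTCR_EL2.T0SZ of 64 - phys_shift plus a starting level chosen from the page size, and the entry level may be built from up to 16 concatenated tables, which is why a given IPA size can need fewer stage 2 levels than a stage 1 table of the same width. The standalone model below reproduces that arithmetic for 4K pages; pgtable_levels() is a local stand-in for the kernel's ARM64_HW_PGTABLE_LEVELS()/stage2_pgtable_levels() macros rather than a kernel symbol, and PAGE_SHIFT is assumed to be 12.

    /* Userspace-only model of the T0SZ/level arithmetic; assumptions noted above. */
    #include <stdio.h>

    #define PAGE_SHIFT 12

    /* Levels needed to resolve va_bits, with (PAGE_SHIFT - 3) bits per level. */
    static int pgtable_levels(int va_bits)
    {
        return (va_bits - 4) / (PAGE_SHIFT - 3);
    }

    int main(void)
    {
        for (int ipa = 32; ipa <= 48; ipa += 4) {
            /*
             * Stage 2 can concatenate up to 16 tables at the entry
             * level, resolving 4 extra bits there, hence "ipa - 4".
             */
            int lvls = pgtable_levels(ipa - 4);
            if (lvls < 2)   /* the patch enforces a 2-level minimum */
                lvls = 2;
            printf("IPA %2d bits -> T0SZ %2d, %d stage2 levels\n",
                   ipa, 64 - ipa, lvls);
        }
        return 0;
    }

For a common 4K-page, 48-bit-VA kernel, the clamp in kvm_set_ipa_limit() works out to va_max = PGDIR_SHIFT + PAGE_SHIFT - 3 + 4 = 39 + 12 - 3 + 4 = 52, so in practice the reported limit is usually bounded by the CPU's PARange (or PHYS_MASK_SHIFT) rather than by the stage 1 configuration.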
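
The GIC changes follow from the wider IPA: once a guest may place its ITS tables above 48 bits, the vITS can no longer mask BASER/CBASER addresses down to 48 bits, so the patch adds GITS_BASER_ADDR_48_to_52() as the inverse of the existing GITS_BASER_PHYS_52_to_48() packing, in which physical address bits [51:48] are stored in register bits [15:12]. The round-trip below only illustrates that bit shuffling; GENMASK_ULL() is re-defined locally so the snippet builds outside the kernel, and the sample address is arbitrary (just 64K-aligned with bits above 48 set).

    /* Standalone illustration of the BASER address packing; not kernel code. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define GENMASK_ULL(h, l) \
        (((~0ULL) << (l)) & (~0ULL >> (63 - (h))))

    /* Same bit layout as the macros added to arm-gic-v3.h above. */
    #define GITS_BASER_PHYS_52_to_48(phys) \
        (((phys) & GENMASK_ULL(47, 16)) | (((phys) >> 48) & 0xf) << 12)
    #define GITS_BASER_ADDR_48_to_52(baser) \
        (((baser) & GENMASK_ULL(47, 16)) | (((baser) >> 12) & 0xf) << 48)

    int main(void)
    {
        uint64_t phys = 0x000a123456780000ULL;  /* 64K-aligned, above 48 bits */

        uint64_t reg  = GITS_BASER_PHYS_52_to_48(phys);
        uint64_t back = GITS_BASER_ADDR_48_to_52(reg);

        printf("phys  %#018llx\nfield %#018llx\nback  %#018llx\n",
               (unsigned long long)phys, (unsigned long long)reg,
               (unsigned long long)back);
        assert(back == phys);
        return 0;
    }

The same reasoning is behind dropping the GENMASK_ULL(51, 48) clearing from vgic_sanitise_pendbaser() and vgic_sanitise_propbaser(): with a 52-bit capable guest those upper address bits are now legitimate.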