package: enable cloud-hypervisor for arm64

Now, cloud-hypervisor is capable to work on arm64. it's time to
enable it in kata for arm64.
as cloud-hypervisor can only use virtio-fs, a new patch should be
applied to kernel for virtiofs and some config should be removed
temporarily.

Fixes: #446
Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
This commit is contained in:
Jianyong Wu 2020-07-23 15:34:12 +08:00
parent c5c3f5c31d
commit c0720179d2
6 changed files with 530 additions and 7 deletions

View File

@ -29,8 +29,8 @@ download_kernel=false
# The repository where kernel configuration lives # The repository where kernel configuration lives
runtime_repository="github.com/${project_name}/runtime" runtime_repository="github.com/${project_name}/runtime"
# The repository where kernel configuration lives # The repository where kernel configuration lives
readonly kernel_config_repo="github.com/${project_name}/packaging" readonly kernel_config_repo="github.com/${project_name}/kata-containers/tools/packaging"
readonly patches_repo="github.com/${project_name}/packaging" readonly patches_repo="github.com/${project_name}/kata-containers/tools/packaging"
readonly patches_repo_dir="${GOPATH}/src/${patches_repo}" readonly patches_repo_dir="${GOPATH}/src/${patches_repo}"
# Default path to search patches to apply to kernel # Default path to search patches to apply to kernel
readonly default_patches_dir="${patches_repo_dir}/kernel/patches/" readonly default_patches_dir="${patches_repo_dir}/kernel/patches/"
@ -391,7 +391,7 @@ build_kernel() {
make -j $(nproc) ARCH="${arch_target}" make -j $(nproc) ARCH="${arch_target}"
[ "$arch_target" != "powerpc" ] && ([ -e "arch/${arch_target}/boot/bzImage" ] || [ -e "arch/${arch_target}/boot/Image.gz" ]) [ "$arch_target" != "powerpc" ] && ([ -e "arch/${arch_target}/boot/bzImage" ] || [ -e "arch/${arch_target}/boot/Image.gz" ])
[ -e "vmlinux" ] [ -e "vmlinux" ]
[ "${hypervisor_target}" == "firecracker" ] && [ "${arch_target}" == "arm64" ] && [ -e "arch/${arch_target}/boot/Image" ] ([ "${hypervisor_target}" == "firecracker" ] || [ "${hypervisor_target}" == "cloud-hypervisor" ]) && [ "${arch_target}" == "arm64" ] && [ -e "arch/${arch_target}/boot/Image" ]
popd >>/dev/null popd >>/dev/null
} }

View File

@ -22,7 +22,8 @@ CONFIG_PERF_EVENTS=y
CONFIG_ARM64_PSEUDO_NMI=y CONFIG_ARM64_PSEUDO_NMI=y
CONFIG_ARM64_SVE=y CONFIG_ARM64_SVE=y
# Arm64 prefers to use REFCOUNT_FULL by default. # Arm64 prefers to use REFCOUNT_FULL by default, disable it as the
# latest kernel has discarded it, add it to whitelist.
CONFIG_REFCOUNT_FULL=y CONFIG_REFCOUNT_FULL=y
# #

View File

@ -3,5 +3,10 @@
# The implementation of ptp_kvm on arm is one experimental feature, # The implementation of ptp_kvm on arm is one experimental feature,
# you need to apply private patches to enable it on your host machine. # you need to apply private patches to enable it on your host machine.
# See https://github.com/kata-containers/packaging/pull/998 for detailed info. # See https://github.com/kata-containers/packaging/pull/998 for detailed info.
CONFIG_PTP_1588_CLOCK=y #
CONFIG_PTP_1588_CLOCK_KVM=y # as ptp_kvm is under review in upstream and linux kernel in virtio-fs branch
# will change with time, we need update the patch again and again. For now, we
# disable ptp_kvm to wait for the stable kernel version of virtio-fs or ptp_kvm
# accept by upstream.
#CONFIG_PTP_1588_CLOCK=y
#CONFIG_PTP_1588_CLOCK_KVM=y

View File

@ -6,3 +6,4 @@ CONFIG_NF_NAT_PROTO_DCCP
CONFIG_NF_NAT_PROTO_GRE CONFIG_NF_NAT_PROTO_GRE
CONFIG_NF_NAT_PROTO_SCTP CONFIG_NF_NAT_PROTO_SCTP
CONFIG_NF_NAT_PROTO_UDPLITE CONFIG_NF_NAT_PROTO_UDPLITE
CONFIG_REFCOUNT_FULL

View File

@ -1 +1 @@
80 81

View File

@ -0,0 +1,516 @@
From e9ba07d0e1a602df39cc6c9c984c1b838b67f9c4 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Wed, 4 Mar 2020 09:58:43 +0530
Subject: [PATCH] arm64/mm: Enable memory hot remove
The arch code for hot-remove must tear down portions of the linear map and
vmemmap corresponding to memory being removed. In both cases the page
tables mapping these regions must be freed, and when sparse vmemmap is in
use the memory backing the vmemmap must also be freed.
This patch adds unmap_hotplug_range() and free_empty_tables() helpers which
can be used to tear down either region and calls it from vmemmap_free() and
___remove_pgd_mapping(). The free_mapped argument determines whether the
backing memory will be freed.
It makes two distinct passes over the kernel page table. In the first pass
with unmap_hotplug_range() it unmaps, invalidates applicable TLB cache and
frees backing memory if required (vmemmap) for each mapped leaf entry. In
the second pass with free_empty_tables() it looks for empty page table
sections whose page table page can be unmapped, TLB invalidated and freed.
While freeing intermediate level page table pages bail out if any of its
entries are still valid. This can happen for partially filled kernel page
table either from a previously attempted failed memory hot add or while
removing an address range which does not span the entire page table page
range.
The vmemmap region may share levels of table with the vmalloc region.
There can be conflicts between hot remove freeing page table pages with
a concurrent vmalloc() walking the kernel page table. This conflict can
not just be solved by taking the init_mm ptl because of existing locking
scheme in vmalloc(). So free_empty_tables() implements a floor and ceiling
method which is borrowed from user page table tear with free_pgd_range()
which skips freeing page table pages if intermediate address range is not
aligned or maximum floor-ceiling might not own the entire page table page.
Boot memory on arm64 cannot be removed. Hence this registers a new memory
hotplug notifier which prevents boot memory offlining and it's removal.
While here update arch_add_memory() to handle __add_pages() failures by
just unmapping recently added kernel linear mapping. Now enable memory hot
remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.
This implementation is overall inspired from kernel page table tear down
procedure on X86 architecture and user page table tear down method.
[Mike and Catalin added P4D page table level support]
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
arch/arm64/Kconfig | 3 +
arch/arm64/include/asm/memory.h | 1 +
arch/arm64/mm/mmu.c | 379 +++++++++++++++++++++++++++++++-
3 files changed, 374 insertions(+), 9 deletions(-)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 0b30e884e088..8fb0ba221a26 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -281,6 +281,9 @@ config ZONE_DMA32
config ARCH_ENABLE_MEMORY_HOTPLUG
def_bool y
+config ARCH_ENABLE_MEMORY_HOTREMOVE
+ def_bool y
+
config SMP
def_bool y
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 4d94676e5a8b..2be67b232499 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -54,6 +54,7 @@
#define MODULES_VADDR (BPF_JIT_REGION_END)
#define MODULES_VSIZE (SZ_128M)
#define VMEMMAP_START (-VMEMMAP_SIZE - SZ_2M)
+#define VMEMMAP_END (VMEMMAP_START + VMEMMAP_SIZE)
#define PCI_IO_END (VMEMMAP_START - SZ_2M)
#define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE)
#define FIXADDR_TOP (PCI_IO_START - SZ_2M)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 128f70852bf3..9b08f7c7e6f0 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -17,6 +17,7 @@
#include <linux/mman.h>
#include <linux/nodemask.h>
#include <linux/memblock.h>
+#include <linux/memory.h>
#include <linux/fs.h>
#include <linux/io.h>
#include <linux/mm.h>
@@ -724,6 +725,312 @@ int kern_addr_valid(unsigned long addr)
return pfn_valid(pte_pfn(pte));
}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void free_hotplug_page_range(struct page *page, size_t size)
+{
+ WARN_ON(PageReserved(page));
+ free_pages((unsigned long)page_address(page), get_order(size));
+}
+
+static void free_hotplug_pgtable_page(struct page *page)
+{
+ free_hotplug_page_range(page, PAGE_SIZE);
+}
+
+static bool pgtable_range_aligned(unsigned long start, unsigned long end,
+ unsigned long floor, unsigned long ceiling,
+ unsigned long mask)
+{
+ start &= mask;
+ if (start < floor)
+ return false;
+
+ if (ceiling) {
+ ceiling &= mask;
+ if (!ceiling)
+ return false;
+ }
+
+ if (end - 1 > ceiling - 1)
+ return false;
+ return true;
+}
+
+static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
+ unsigned long end, bool free_mapped)
+{
+ pte_t *ptep, pte;
+
+ do {
+ ptep = pte_offset_kernel(pmdp, addr);
+ pte = READ_ONCE(*ptep);
+ if (pte_none(pte))
+ continue;
+
+ WARN_ON(!pte_present(pte));
+ pte_clear(&init_mm, addr, ptep);
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+ if (free_mapped)
+ free_hotplug_page_range(pte_page(pte), PAGE_SIZE);
+ } while (addr += PAGE_SIZE, addr < end);
+}
+
+static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
+ unsigned long end, bool free_mapped)
+{
+ unsigned long next;
+ pmd_t *pmdp, pmd;
+
+ do {
+ next = pmd_addr_end(addr, end);
+ pmdp = pmd_offset(pudp, addr);
+ pmd = READ_ONCE(*pmdp);
+ if (pmd_none(pmd))
+ continue;
+
+ WARN_ON(!pmd_present(pmd));
+ if (pmd_sect(pmd)) {
+ pmd_clear(pmdp);
+
+ /*
+ * One TLBI should be sufficient here as the PMD_SIZE
+ * range is mapped with a single block entry.
+ */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+ if (free_mapped)
+ free_hotplug_page_range(pmd_page(pmd),
+ PMD_SIZE);
+ continue;
+ }
+ WARN_ON(!pmd_table(pmd));
+ unmap_hotplug_pte_range(pmdp, addr, next, free_mapped);
+ } while (addr = next, addr < end);
+}
+
+static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
+ unsigned long end, bool free_mapped)
+{
+ unsigned long next;
+ pud_t *pudp, pud;
+
+ do {
+ next = pud_addr_end(addr, end);
+ pudp = pud_offset(p4dp, addr);
+ pud = READ_ONCE(*pudp);
+ if (pud_none(pud))
+ continue;
+
+ WARN_ON(!pud_present(pud));
+ if (pud_sect(pud)) {
+ pud_clear(pudp);
+
+ /*
+ * One TLBI should be sufficient here as the PUD_SIZE
+ * range is mapped with a single block entry.
+ */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+ if (free_mapped)
+ free_hotplug_page_range(pud_page(pud),
+ PUD_SIZE);
+ continue;
+ }
+ WARN_ON(!pud_table(pud));
+ unmap_hotplug_pmd_range(pudp, addr, next, free_mapped);
+ } while (addr = next, addr < end);
+}
+
+static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
+ unsigned long end, bool free_mapped)
+{
+ unsigned long next;
+ p4d_t *p4dp, p4d;
+
+ do {
+ next = p4d_addr_end(addr, end);
+ p4dp = p4d_offset(pgdp, addr);
+ p4d = READ_ONCE(*p4dp);
+ if (p4d_none(p4d))
+ continue;
+
+ WARN_ON(!p4d_present(p4d));
+ unmap_hotplug_pud_range(p4dp, addr, next, free_mapped);
+ } while (addr = next, addr < end);
+}
+
+static void unmap_hotplug_range(unsigned long addr, unsigned long end,
+ bool free_mapped)
+{
+ unsigned long next;
+ pgd_t *pgdp, pgd;
+
+ do {
+ next = pgd_addr_end(addr, end);
+ pgdp = pgd_offset_k(addr);
+ pgd = READ_ONCE(*pgdp);
+ if (pgd_none(pgd))
+ continue;
+
+ WARN_ON(!pgd_present(pgd));
+ unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped);
+ } while (addr = next, addr < end);
+}
+
+static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned long floor,
+ unsigned long ceiling)
+{
+ pte_t *ptep, pte;
+ unsigned long i, start = addr;
+
+ do {
+ ptep = pte_offset_kernel(pmdp, addr);
+ pte = READ_ONCE(*ptep);
+
+ /*
+ * This is just a sanity check here which verifies that
+ * pte clearing has been done by earlier unmap loops.
+ */
+ WARN_ON(!pte_none(pte));
+ } while (addr += PAGE_SIZE, addr < end);
+
+ if (!pgtable_range_aligned(start, end, floor, ceiling, PMD_MASK))
+ return;
+
+ /*
+ * Check whether we can free the pte page if the rest of the
+ * entries are empty. Overlap with other regions have been
+ * handled by the floor/ceiling check.
+ */
+ ptep = pte_offset_kernel(pmdp, 0UL);
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ if (!pte_none(READ_ONCE(ptep[i])))
+ return;
+ }
+
+ pmd_clear(pmdp);
+ __flush_tlb_kernel_pgtable(start);
+ free_hotplug_pgtable_page(virt_to_page(ptep));
+}
+
+static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
+ unsigned long end, unsigned long floor,
+ unsigned long ceiling)
+{
+ pmd_t *pmdp, pmd;
+ unsigned long i, next, start = addr;
+
+ do {
+ next = pmd_addr_end(addr, end);
+ pmdp = pmd_offset(pudp, addr);
+ pmd = READ_ONCE(*pmdp);
+ if (pmd_none(pmd))
+ continue;
+
+ WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
+ free_empty_pte_table(pmdp, addr, next, floor, ceiling);
+ } while (addr = next, addr < end);
+
+ if (CONFIG_PGTABLE_LEVELS <= 2)
+ return;
+
+ if (!pgtable_range_aligned(start, end, floor, ceiling, PUD_MASK))
+ return;
+
+ /*
+ * Check whether we can free the pmd page if the rest of the
+ * entries are empty. Overlap with other regions have been
+ * handled by the floor/ceiling check.
+ */
+ pmdp = pmd_offset(pudp, 0UL);
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (!pmd_none(READ_ONCE(pmdp[i])))
+ return;
+ }
+
+ pud_clear(pudp);
+ __flush_tlb_kernel_pgtable(start);
+ free_hotplug_pgtable_page(virt_to_page(pmdp));
+}
+
+static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
+ unsigned long end, unsigned long floor,
+ unsigned long ceiling)
+{
+ pud_t *pudp, pud;
+ unsigned long i, next, start = addr;
+
+ do {
+ next = pud_addr_end(addr, end);
+ pudp = pud_offset(p4dp, addr);
+ pud = READ_ONCE(*pudp);
+ if (pud_none(pud))
+ continue;
+
+ WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud));
+ free_empty_pmd_table(pudp, addr, next, floor, ceiling);
+ } while (addr = next, addr < end);
+
+ if (CONFIG_PGTABLE_LEVELS <= 3)
+ return;
+
+ if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK))
+ return;
+
+ /*
+ * Check whether we can free the pud page if the rest of the
+ * entries are empty. Overlap with other regions have been
+ * handled by the floor/ceiling check.
+ */
+ pudp = pud_offset(p4dp, 0UL);
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ if (!pud_none(READ_ONCE(pudp[i])))
+ return;
+ }
+
+ p4d_clear(p4dp);
+ __flush_tlb_kernel_pgtable(start);
+ free_hotplug_pgtable_page(virt_to_page(pudp));
+}
+
+static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
+ unsigned long end, unsigned long floor,
+ unsigned long ceiling)
+{
+ unsigned long next;
+ p4d_t *p4dp, p4d;
+
+ do {
+ next = p4d_addr_end(addr, end);
+ p4dp = p4d_offset(pgdp, addr);
+ p4d = READ_ONCE(*p4dp);
+ if (p4d_none(p4d))
+ continue;
+
+ WARN_ON(!p4d_present(p4d));
+ free_empty_pud_table(p4dp, addr, next, floor, ceiling);
+ } while (addr = next, addr < end);
+}
+
+static void free_empty_tables(unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
+{
+ unsigned long next;
+ pgd_t *pgdp, pgd;
+
+ do {
+ next = pgd_addr_end(addr, end);
+ pgdp = pgd_offset_k(addr);
+ pgd = READ_ONCE(*pgdp);
+ if (pgd_none(pgd))
+ continue;
+
+ WARN_ON(!pgd_present(pgd));
+ free_empty_p4d_table(pgdp, addr, next, floor, ceiling);
+ } while (addr = next, addr < end);
+}
+#endif
+
#ifdef CONFIG_SPARSEMEM_VMEMMAP
#if !ARM64_SWAPPER_USES_SECTION_MAPS
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
@@ -771,6 +1078,12 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
void vmemmap_free(unsigned long start, unsigned long end,
struct vmem_altmap *altmap)
{
+#ifdef CONFIG_MEMORY_HOTPLUG
+ WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
+
+ unmap_hotplug_range(start, end, true);
+ free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
+#endif
}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
@@ -1049,10 +1362,21 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
}
#ifdef CONFIG_MEMORY_HOTPLUG
+static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
+{
+ unsigned long end = start + size;
+
+ WARN_ON(pgdir != init_mm.pgd);
+ WARN_ON((start < PAGE_OFFSET) || (end > PAGE_END));
+
+ unmap_hotplug_range(start, end, false);
+ free_empty_tables(start, end, PAGE_OFFSET, PAGE_END);
+}
+
int arch_add_memory(int nid, u64 start, u64 size,
struct mhp_restrictions *restrictions)
{
- int flags = 0;
+ int ret, flags = 0;
if (rodata_full || debug_pagealloc_enabled())
flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
@@ -1062,22 +1386,59 @@ int arch_add_memory(int nid, u64 start, u64 size,
memblock_clear_nomap(start, size);
- return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
+ ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
restrictions);
+ if (ret)
+ __remove_pgd_mapping(swapper_pg_dir,
+ __phys_to_virt(start), size);
+ return ret;
}
+
void arch_remove_memory(int nid, u64 start, u64 size,
struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- /*
- * FIXME: Cleanup page tables (also in arch_add_memory() in case
- * adding fails). Until then, this function should only be used
- * during memory hotplug (adding memory), not for memory
- * unplug. ARCH_ENABLE_MEMORY_HOTREMOVE must not be
- * unlocked yet.
- */
__remove_pages(start_pfn, nr_pages, altmap);
+ __remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
+}
+
+/*
+ * This memory hotplug notifier helps prevent boot memory from being
+ * inadvertently removed as it blocks pfn range offlining process in
+ * __offline_pages(). Hence this prevents both offlining as well as
+ * removal process for boot memory which is initially always online.
+ * In future if and when boot memory could be removed, this notifier
+ * should be dropped and free_hotplug_page_range() should handle any
+ * reserved pages allocated during boot.
+ */
+static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct mem_section *ms;
+ struct memory_notify *arg = data;
+ unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
+ unsigned long pfn = arg->start_pfn;
+
+ if (action != MEM_GOING_OFFLINE)
+ return NOTIFY_OK;
+
+ for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ ms = __pfn_to_section(pfn);
+ if (early_section(ms))
+ return NOTIFY_BAD;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block prevent_bootmem_remove_nb = {
+ .notifier_call = prevent_bootmem_remove_notifier,
+};
+
+static int __init prevent_bootmem_remove_init(void)
+{
+ return register_memory_notifier(&prevent_bootmem_remove_nb);
}
+device_initcall(prevent_bootmem_remove_init);
#endif
--
2.17.1