Mirror of https://github.com/kata-containers/kata-containers.git
package: enable cloud-hypervisor for arm64

Cloud Hypervisor is now capable of running on arm64, so it is time to
enable it in Kata for arm64. Since Cloud Hypervisor can only use
virtio-fs, a new patch has to be applied to the kernel for virtio-fs,
and some config options have to be removed temporarily.

Fixes: #446
Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
This commit is contained in:
parent c5c3f5c31d
commit c0720179d2
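
As a usage sketch (not taken from this commit), the build script would now
be driven for this target roughly as follows; the -a and -t flags are
assumptions inferred from the arch_target and hypervisor_target variables
in the hunks below, not confirmed options:

    # Hedged sketch: build a Kata guest kernel for Cloud Hypervisor on arm64.
    ./build-kernel.sh -a arm64 -t cloud-hypervisor setup
    ./build-kernel.sh -a arm64 -t cloud-hypervisor build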
@@ -29,8 +29,8 @@ download_kernel=false
 # The repository where kernel configuration lives
 runtime_repository="github.com/${project_name}/runtime"
 # The repository where kernel configuration lives
-readonly kernel_config_repo="github.com/${project_name}/packaging"
-readonly patches_repo="github.com/${project_name}/packaging"
+readonly kernel_config_repo="github.com/${project_name}/kata-containers/tools/packaging"
+readonly patches_repo="github.com/${project_name}/kata-containers/tools/packaging"
 readonly patches_repo_dir="${GOPATH}/src/${patches_repo}"
 # Default path to search patches to apply to kernel
 readonly default_patches_dir="${patches_repo_dir}/kernel/patches/"
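
The repository rename above means kernel patches now resolve inside the
unified kata-containers tree. A quick sketch of the resulting default
patches path (the GOPATH value is illustrative):

    GOPATH=/home/user/go                 # illustrative value
    project_name=kata-containers
    patches_repo="github.com/${project_name}/kata-containers/tools/packaging"
    echo "${GOPATH}/src/${patches_repo}/kernel/patches/"
    # -> /home/user/go/src/github.com/kata-containers/kata-containers/tools/packaging/kernel/patches/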
@@ -391,7 +391,7 @@ build_kernel() {
 	make -j $(nproc) ARCH="${arch_target}"
 	[ "$arch_target" != "powerpc" ] && ([ -e "arch/${arch_target}/boot/bzImage" ] || [ -e "arch/${arch_target}/boot/Image.gz" ])
 	[ -e "vmlinux" ]
-	[ "${hypervisor_target}" == "firecracker" ] && [ "${arch_target}" == "arm64" ] && [ -e "arch/${arch_target}/boot/Image" ]
+	([ "${hypervisor_target}" == "firecracker" ] || [ "${hypervisor_target}" == "cloud-hypervisor" ]) && [ "${arch_target}" == "arm64" ] && [ -e "arch/${arch_target}/boot/Image" ]
 	popd >>/dev/null
 }
 
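
A note on the changed assertion: in POSIX shell, && and || share the same
precedence and associate left to right, so the explicit subshell grouping
mainly documents the intent that the arm64 Image check applies to both
hypervisors. A standalone sketch of the same pattern (values illustrative):

    hypervisor_target="cloud-hypervisor"
    arch_target="arm64"
    # The subshell evaluates the OR as one unit before the && chain.
    ([ "${hypervisor_target}" == "firecracker" ] || [ "${hypervisor_target}" == "cloud-hypervisor" ]) \
        && [ "${arch_target}" == "arm64" ] \
        && echo "expect arch/${arch_target}/boot/Image"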
@@ -22,7 +22,8 @@ CONFIG_PERF_EVENTS=y
 CONFIG_ARM64_PSEUDO_NMI=y
 CONFIG_ARM64_SVE=y
 
-# Arm64 prefers to use REFCOUNT_FULL by default.
+# Arm64 prefers to use REFCOUNT_FULL by default, disable it as the
+# latest kernel has discarded it, add it to whitelist.
 CONFIG_REFCOUNT_FULL=y
 
 #
@@ -3,5 +3,10 @@
 # The implementation of ptp_kvm on arm is one experimental feature,
 # you need to apply private patches to enable it on your host machine.
 # See https://github.com/kata-containers/packaging/pull/998 for detailed info.
-CONFIG_PTP_1588_CLOCK=y
-CONFIG_PTP_1588_CLOCK_KVM=y
+#
+# as ptp_kvm is under review in upstream and linux kernel in virtio-fs branch
+# will change with time, we need update the patch again and again. For now, we
+# disable ptp_kvm to wait for the stable kernel version of virtio-fs or ptp_kvm
+# accept by upstream.
+#CONFIG_PTP_1588_CLOCK=y
+#CONFIG_PTP_1588_CLOCK_KVM=y
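
With the fragment now carrying only commented-out options, one way to
confirm that ptp_kvm really stays out of the generated config (assuming
nothing else selects these options) is:

    # Run from the built kernel tree; should print nothing if ptp_kvm is
    # disabled as intended.
    grep -E '^CONFIG_PTP_1588_CLOCK(_KVM)?=' .config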
@@ -6,3 +6,4 @@ CONFIG_NF_NAT_PROTO_DCCP
 CONFIG_NF_NAT_PROTO_GRE
 CONFIG_NF_NAT_PROTO_SCTP
 CONFIG_NF_NAT_PROTO_UDPLITE
+CONFIG_REFCOUNT_FULL
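
CONFIG_REFCOUNT_FULL lands in the whitelist because recent kernels dropped
the option, so the config check must tolerate its absence from the final
.config. A rough sketch of the kind of check such a whitelist feeds (file
names and logic here are assumptions, not the actual checker):

    # Complain only about options that are both missing from .config and
    # not whitelisted.
    while read -r opt; do
        grep -q "^${opt}=" .config && continue
        grep -qx "${opt}" whitelist.conf || echo "missing: ${opt}"
    done < fragment-options.txt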
@@ -1 +1 @@
-80
+81
@@ -0,0 +1,516 @@
From e9ba07d0e1a602df39cc6c9c984c1b838b67f9c4 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Wed, 4 Mar 2020 09:58:43 +0530
Subject: [PATCH] arm64/mm: Enable memory hot remove

The arch code for hot-remove must tear down portions of the linear map and
vmemmap corresponding to memory being removed. In both cases the page
tables mapping these regions must be freed, and when sparse vmemmap is in
use the memory backing the vmemmap must also be freed.

This patch adds unmap_hotplug_range() and free_empty_tables() helpers which
can be used to tear down either region and calls it from vmemmap_free() and
___remove_pgd_mapping(). The free_mapped argument determines whether the
backing memory will be freed.

It makes two distinct passes over the kernel page table. In the first pass
with unmap_hotplug_range() it unmaps, invalidates applicable TLB cache and
frees backing memory if required (vmemmap) for each mapped leaf entry. In
the second pass with free_empty_tables() it looks for empty page table
sections whose page table page can be unmapped, TLB invalidated and freed.

While freeing intermediate level page table pages bail out if any of its
entries are still valid. This can happen for partially filled kernel page
table either from a previously attempted failed memory hot add or while
removing an address range which does not span the entire page table page
range.

The vmemmap region may share levels of table with the vmalloc region.
There can be conflicts between hot remove freeing page table pages with
a concurrent vmalloc() walking the kernel page table. This conflict can
not just be solved by taking the init_mm ptl because of existing locking
scheme in vmalloc(). So free_empty_tables() implements a floor and ceiling
method which is borrowed from user page table tear with free_pgd_range()
which skips freeing page table pages if intermediate address range is not
aligned or maximum floor-ceiling might not own the entire page table page.

Boot memory on arm64 cannot be removed. Hence this registers a new memory
hotplug notifier which prevents boot memory offlining and it's removal.

While here update arch_add_memory() to handle __add_pages() failures by
just unmapping recently added kernel linear mapping. Now enable memory hot
remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.

This implementation is overall inspired from kernel page table tear down
procedure on X86 architecture and user page table tear down method.

[Mike and Catalin added P4D page table level support]

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 arch/arm64/Kconfig              |   3 +
 arch/arm64/include/asm/memory.h |   1 +
 arch/arm64/mm/mmu.c             | 379 +++++++++++++++++++++++++++++++-
 3 files changed, 374 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 0b30e884e088..8fb0ba221a26 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -281,6 +281,9 @@ config ZONE_DMA32
 config ARCH_ENABLE_MEMORY_HOTPLUG
 	def_bool y
 
+config ARCH_ENABLE_MEMORY_HOTREMOVE
+	def_bool y
+
 config SMP
 	def_bool y
 
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 4d94676e5a8b..2be67b232499 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -54,6 +54,7 @@
 #define MODULES_VADDR		(BPF_JIT_REGION_END)
 #define MODULES_VSIZE		(SZ_128M)
 #define VMEMMAP_START		(-VMEMMAP_SIZE - SZ_2M)
+#define VMEMMAP_END		(VMEMMAP_START + VMEMMAP_SIZE)
 #define PCI_IO_END		(VMEMMAP_START - SZ_2M)
 #define PCI_IO_START		(PCI_IO_END - PCI_IO_SIZE)
 #define FIXADDR_TOP		(PCI_IO_START - SZ_2M)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 128f70852bf3..9b08f7c7e6f0 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -17,6 +17,7 @@
 #include <linux/mman.h>
 #include <linux/nodemask.h>
 #include <linux/memblock.h>
+#include <linux/memory.h>
 #include <linux/fs.h>
 #include <linux/io.h>
 #include <linux/mm.h>
@@ -724,6 +725,312 @@ int kern_addr_valid(unsigned long addr)
 
 	return pfn_valid(pte_pfn(pte));
 }
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void free_hotplug_page_range(struct page *page, size_t size)
+{
+	WARN_ON(PageReserved(page));
+	free_pages((unsigned long)page_address(page), get_order(size));
+}
+
+static void free_hotplug_pgtable_page(struct page *page)
+{
+	free_hotplug_page_range(page, PAGE_SIZE);
+}
+
+static bool pgtable_range_aligned(unsigned long start, unsigned long end,
+				  unsigned long floor, unsigned long ceiling,
+				  unsigned long mask)
+{
+	start &= mask;
+	if (start < floor)
+		return false;
+
+	if (ceiling) {
+		ceiling &= mask;
+		if (!ceiling)
+			return false;
+	}
+
+	if (end - 1 > ceiling - 1)
+		return false;
+	return true;
+}
+
+static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
+				    unsigned long end, bool free_mapped)
+{
+	pte_t *ptep, pte;
+
+	do {
+		ptep = pte_offset_kernel(pmdp, addr);
+		pte = READ_ONCE(*ptep);
+		if (pte_none(pte))
+			continue;
+
+		WARN_ON(!pte_present(pte));
+		pte_clear(&init_mm, addr, ptep);
+		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+		if (free_mapped)
+			free_hotplug_page_range(pte_page(pte), PAGE_SIZE);
+	} while (addr += PAGE_SIZE, addr < end);
+}
+
+static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
+				    unsigned long end, bool free_mapped)
+{
+	unsigned long next;
+	pmd_t *pmdp, pmd;
+
+	do {
+		next = pmd_addr_end(addr, end);
+		pmdp = pmd_offset(pudp, addr);
+		pmd = READ_ONCE(*pmdp);
+		if (pmd_none(pmd))
+			continue;
+
+		WARN_ON(!pmd_present(pmd));
+		if (pmd_sect(pmd)) {
+			pmd_clear(pmdp);
+
+			/*
+			 * One TLBI should be sufficient here as the PMD_SIZE
+			 * range is mapped with a single block entry.
+			 */
+			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+			if (free_mapped)
+				free_hotplug_page_range(pmd_page(pmd),
+							PMD_SIZE);
+			continue;
+		}
+		WARN_ON(!pmd_table(pmd));
+		unmap_hotplug_pte_range(pmdp, addr, next, free_mapped);
+	} while (addr = next, addr < end);
+}
+
+static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
+				    unsigned long end, bool free_mapped)
+{
+	unsigned long next;
+	pud_t *pudp, pud;
+
+	do {
+		next = pud_addr_end(addr, end);
+		pudp = pud_offset(p4dp, addr);
+		pud = READ_ONCE(*pudp);
+		if (pud_none(pud))
+			continue;
+
+		WARN_ON(!pud_present(pud));
+		if (pud_sect(pud)) {
+			pud_clear(pudp);
+
+			/*
+			 * One TLBI should be sufficient here as the PUD_SIZE
+			 * range is mapped with a single block entry.
+			 */
+			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+			if (free_mapped)
+				free_hotplug_page_range(pud_page(pud),
+							PUD_SIZE);
+			continue;
+		}
+		WARN_ON(!pud_table(pud));
+		unmap_hotplug_pmd_range(pudp, addr, next, free_mapped);
+	} while (addr = next, addr < end);
+}
+
+static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
+				    unsigned long end, bool free_mapped)
+{
+	unsigned long next;
+	p4d_t *p4dp, p4d;
+
+	do {
+		next = p4d_addr_end(addr, end);
+		p4dp = p4d_offset(pgdp, addr);
+		p4d = READ_ONCE(*p4dp);
+		if (p4d_none(p4d))
+			continue;
+
+		WARN_ON(!p4d_present(p4d));
+		unmap_hotplug_pud_range(p4dp, addr, next, free_mapped);
+	} while (addr = next, addr < end);
+}
+
+static void unmap_hotplug_range(unsigned long addr, unsigned long end,
+				bool free_mapped)
+{
+	unsigned long next;
+	pgd_t *pgdp, pgd;
+
+	do {
+		next = pgd_addr_end(addr, end);
+		pgdp = pgd_offset_k(addr);
+		pgd = READ_ONCE(*pgdp);
+		if (pgd_none(pgd))
+			continue;
+
+		WARN_ON(!pgd_present(pgd));
+		unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped);
+	} while (addr = next, addr < end);
+}
+
+static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
+				 unsigned long end, unsigned long floor,
+				 unsigned long ceiling)
+{
+	pte_t *ptep, pte;
+	unsigned long i, start = addr;
+
+	do {
+		ptep = pte_offset_kernel(pmdp, addr);
+		pte = READ_ONCE(*ptep);
+
+		/*
+		 * This is just a sanity check here which verifies that
+		 * pte clearing has been done by earlier unmap loops.
+		 */
+		WARN_ON(!pte_none(pte));
+	} while (addr += PAGE_SIZE, addr < end);
+
+	if (!pgtable_range_aligned(start, end, floor, ceiling, PMD_MASK))
+		return;
+
+	/*
+	 * Check whether we can free the pte page if the rest of the
+	 * entries are empty. Overlap with other regions have been
+	 * handled by the floor/ceiling check.
+	 */
+	ptep = pte_offset_kernel(pmdp, 0UL);
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		if (!pte_none(READ_ONCE(ptep[i])))
+			return;
+	}
+
+	pmd_clear(pmdp);
+	__flush_tlb_kernel_pgtable(start);
+	free_hotplug_pgtable_page(virt_to_page(ptep));
+}
+
+static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
+				 unsigned long end, unsigned long floor,
+				 unsigned long ceiling)
+{
+	pmd_t *pmdp, pmd;
+	unsigned long i, next, start = addr;
+
+	do {
+		next = pmd_addr_end(addr, end);
+		pmdp = pmd_offset(pudp, addr);
+		pmd = READ_ONCE(*pmdp);
+		if (pmd_none(pmd))
+			continue;
+
+		WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
+		free_empty_pte_table(pmdp, addr, next, floor, ceiling);
+	} while (addr = next, addr < end);
+
+	if (CONFIG_PGTABLE_LEVELS <= 2)
+		return;
+
+	if (!pgtable_range_aligned(start, end, floor, ceiling, PUD_MASK))
+		return;
+
+	/*
+	 * Check whether we can free the pmd page if the rest of the
+	 * entries are empty. Overlap with other regions have been
+	 * handled by the floor/ceiling check.
+	 */
+	pmdp = pmd_offset(pudp, 0UL);
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		if (!pmd_none(READ_ONCE(pmdp[i])))
+			return;
+	}
+
+	pud_clear(pudp);
+	__flush_tlb_kernel_pgtable(start);
+	free_hotplug_pgtable_page(virt_to_page(pmdp));
+}
+
+static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
+				 unsigned long end, unsigned long floor,
+				 unsigned long ceiling)
+{
+	pud_t *pudp, pud;
+	unsigned long i, next, start = addr;
+
+	do {
+		next = pud_addr_end(addr, end);
+		pudp = pud_offset(p4dp, addr);
+		pud = READ_ONCE(*pudp);
+		if (pud_none(pud))
+			continue;
+
+		WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud));
+		free_empty_pmd_table(pudp, addr, next, floor, ceiling);
+	} while (addr = next, addr < end);
+
+	if (CONFIG_PGTABLE_LEVELS <= 3)
+		return;
+
+	if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK))
+		return;
+
+	/*
+	 * Check whether we can free the pud page if the rest of the
+	 * entries are empty. Overlap with other regions have been
+	 * handled by the floor/ceiling check.
+	 */
+	pudp = pud_offset(p4dp, 0UL);
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		if (!pud_none(READ_ONCE(pudp[i])))
+			return;
+	}
+
+	p4d_clear(p4dp);
+	__flush_tlb_kernel_pgtable(start);
+	free_hotplug_pgtable_page(virt_to_page(pudp));
+}
+
+static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
+				 unsigned long end, unsigned long floor,
+				 unsigned long ceiling)
+{
+	unsigned long next;
+	p4d_t *p4dp, p4d;
+
+	do {
+		next = p4d_addr_end(addr, end);
+		p4dp = p4d_offset(pgdp, addr);
+		p4d = READ_ONCE(*p4dp);
+		if (p4d_none(p4d))
+			continue;
+
+		WARN_ON(!p4d_present(p4d));
+		free_empty_pud_table(p4dp, addr, next, floor, ceiling);
+	} while (addr = next, addr < end);
+}
+
+static void free_empty_tables(unsigned long addr, unsigned long end,
+			      unsigned long floor, unsigned long ceiling)
+{
+	unsigned long next;
+	pgd_t *pgdp, pgd;
+
+	do {
+		next = pgd_addr_end(addr, end);
+		pgdp = pgd_offset_k(addr);
+		pgd = READ_ONCE(*pgdp);
+		if (pgd_none(pgd))
+			continue;
+
+		WARN_ON(!pgd_present(pgd));
+		free_empty_p4d_table(pgdp, addr, next, floor, ceiling);
+	} while (addr = next, addr < end);
+}
+#endif
+
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 #if !ARM64_SWAPPER_USES_SECTION_MAPS
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
@@ -771,6 +1078,12 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 void vmemmap_free(unsigned long start, unsigned long end,
 		struct vmem_altmap *altmap)
 {
+#ifdef CONFIG_MEMORY_HOTPLUG
+	WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
+
+	unmap_hotplug_range(start, end, true);
+	free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
+#endif
 }
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
@@ -1049,10 +1362,21 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
+static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
+{
+	unsigned long end = start + size;
+
+	WARN_ON(pgdir != init_mm.pgd);
+	WARN_ON((start < PAGE_OFFSET) || (end > PAGE_END));
+
+	unmap_hotplug_range(start, end, false);
+	free_empty_tables(start, end, PAGE_OFFSET, PAGE_END);
+}
+
 int arch_add_memory(int nid, u64 start, u64 size,
 		struct mhp_restrictions *restrictions)
 {
-	int flags = 0;
+	int ret, flags = 0;
 
 	if (rodata_full || debug_pagealloc_enabled())
 		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
@@ -1062,22 +1386,59 @@ int arch_add_memory(int nid, u64 start, u64 size,
 
 	memblock_clear_nomap(start, size);
 
-	return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
+	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
 			   restrictions);
+	if (ret)
+		__remove_pgd_mapping(swapper_pg_dir,
+				     __phys_to_virt(start), size);
+	return ret;
 }
+
 void arch_remove_memory(int nid, u64 start, u64 size,
 			struct vmem_altmap *altmap)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 
-	/*
-	 * FIXME: Cleanup page tables (also in arch_add_memory() in case
-	 * adding fails). Until then, this function should only be used
-	 * during memory hotplug (adding memory), not for memory
-	 * unplug. ARCH_ENABLE_MEMORY_HOTREMOVE must not be
-	 * unlocked yet.
-	 */
 	__remove_pages(start_pfn, nr_pages, altmap);
+	__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
+}
+
+/*
+ * This memory hotplug notifier helps prevent boot memory from being
+ * inadvertently removed as it blocks pfn range offlining process in
+ * __offline_pages(). Hence this prevents both offlining as well as
+ * removal process for boot memory which is initially always online.
+ * In future if and when boot memory could be removed, this notifier
+ * should be dropped and free_hotplug_page_range() should handle any
+ * reserved pages allocated during boot.
+ */
+static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
+					   unsigned long action, void *data)
+{
+	struct mem_section *ms;
+	struct memory_notify *arg = data;
+	unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
+	unsigned long pfn = arg->start_pfn;
+
+	if (action != MEM_GOING_OFFLINE)
+		return NOTIFY_OK;
+
+	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+		ms = __pfn_to_section(pfn);
+		if (early_section(ms))
+			return NOTIFY_BAD;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block prevent_bootmem_remove_nb = {
+	.notifier_call = prevent_bootmem_remove_notifier,
+};
+
+static int __init prevent_bootmem_remove_init(void)
+{
+	return register_memory_notifier(&prevent_bootmem_remove_nb);
 }
+device_initcall(prevent_bootmem_remove_init);
 #endif
--
2.17.1
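
Once a guest kernel carries this patch, hot remove can be exercised through
the standard sysfs memory hotplug interface (the memory block id below is
illustrative; boot memory stays online because the new notifier returns
NOTIFY_BAD for it):

    # Offline a hot-added memory block so it can subsequently be removed.
    echo offline > /sys/devices/system/memory/memory32/online
    cat /sys/devices/system/memory/memory32/online   # -> 0 when offline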