kernel: enable virtio-fs for arm64.

Add a patch to the virtio-fs v0.3 kernel that enables memory hot remove, making virtio-fs available on arm64. Also add an arm64 kernel config file for the virtio-fs v0.3.x kernel.

Fixes: #973

Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
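For context, a minimal sketch of how these two additions would typically be consumed when building the guest kernel by hand. The tree path and patch file name below are illustrative assumptions, not taken from this commit:

    # hypothetical checkout of the virtio-fs v0.3.x development kernel
    cd linux-virtio-fs-v0.3
    # apply the memory hot remove backport added by this commit (file name assumed)
    patch -p1 < arm64-mm-enable-memory-hot-remove.patch
    # use the arm64 config added by this commit
    cp arm64_kata_kvm_virtio-fs-v0.3.x .config
    make olddefconfig
    make -j"$(nproc)" Image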
parent 60de5bc2f8
commit 9b8f20cac8

kernel/configs/arm64_kata_kvm_virtio-fs-v0.3.x (new file, 2763 lines)
File diff suppressed because it is too large
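Although the config diff is suppressed, the options most relevant to this commit can be checked directly in the added file. A hedged sketch; the expected values are assumptions based on the commit message, not read from the suppressed file:

    grep -E 'CONFIG_(MEMORY_HOTPLUG|MEMORY_HOTREMOVE|VIRTIO_FS|FS_DAX)=' \
        kernel/configs/arm64_kata_kvm_virtio-fs-v0.3.x
    # expected output (assumption):
    # CONFIG_MEMORY_HOTPLUG=y
    # CONFIG_MEMORY_HOTREMOVE=y
    # CONFIG_VIRTIO_FS=y
    # CONFIG_FS_DAX=y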
@@ -1 +1 @@
-69
+70
@@ -0,0 +1,453 @@
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Mon, 15 Jul 2019 11:47:50 +0530
Subject: [PATCH] arm64/mm: Enable memory hot remove

The arch code for hot-remove must tear down portions of the linear map and
vmemmap corresponding to memory being removed. In both cases the page
tables mapping these regions must be freed, and when sparse vmemmap is in
use the memory backing the vmemmap must also be freed.

This patch adds a new remove_pagetable() helper which can be used to tear
down either region, and calls it from vmemmap_free() and
___remove_pgd_mapping(). The sparse_vmap argument determines whether the
backing memory will be freed.

remove_pagetable() makes two distinct passes over the kernel page table.
In the first pass it unmaps, invalidates applicable TLB cache and frees
backing memory if required (vmemmap) for each mapped leaf entry. In the
second pass it looks for empty page table sections whose page table page
can be unmapped, TLB invalidated and freed.

While freeing intermediate level page table pages bail out if any of its
entries are still valid. This can happen for partially filled kernel page
table either from a previously attempted failed memory hot add or while
removing an address range which does not span the entire page table page
range.

The vmemmap region may share levels of table with the vmalloc region.
There can be conflicts between hot remove freeing page table pages with
a concurrent vmalloc() walking the kernel page table. This conflict can
not just be solved by taking the init_mm ptl because of existing locking
scheme in vmalloc(). Hence unlike linear mapping, skip freeing page table
pages while tearing down vmemmap mapping.

While here update arch_add_memory() to handle __add_pages() failures by
just unmapping recently added kernel linear mapping. Now enable memory hot
remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.

This implementation is overall inspired from kernel page table tear down
procedure on X86 architecture.

Acked-by: Steve Capper <steve.capper@arm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 arch/arm64/Kconfig               |   3 +
 arch/arm64/include/asm/pgtable.h |   7 +-
 arch/arm64/mm/mmu.c              | 290 ++++++++++++++++++++++++++++++-
 include/linux/mmzone.h           |   1 +
 mm/Kconfig                       |   2 +-
 5 files changed, 291 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 3adcec05b1f6..5a1231b8b8cf 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -273,6 +273,9 @@ config ZONE_DMA32
 config ARCH_ENABLE_MEMORY_HOTPLUG
 	def_bool y
 
+config ARCH_ENABLE_MEMORY_HOTREMOVE
+	def_bool y
+
 config SMP
 	def_bool y
 
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 5fdcfe237338..e09760ece844 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -209,7 +209,7 @@ static inline pmd_t pmd_mkcont(pmd_t pmd)
 
 static inline pte_t pte_mkdevmap(pte_t pte)
 {
-	return set_pte_bit(pte, __pgprot(PTE_DEVMAP));
+	return set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL));
 }
 
 static inline void set_pte(pte_t *ptep, pte_t pte)
@@ -396,7 +396,10 @@ static inline int pmd_protnone(pmd_t pmd)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define pmd_devmap(pmd)		pte_devmap(pmd_pte(pmd))
 #endif
-#define pmd_mkdevmap(pmd)	pte_pmd(pte_mkdevmap(pmd_pte(pmd)))
+static inline pmd_t pmd_mkdevmap(pmd_t pmd)
+{
+	return pte_pmd(set_pte_bit(pmd_pte(pmd), __pgprot(PTE_DEVMAP)));
+}
 
 #define __pmd_to_phys(pmd)	__pte_to_phys(pmd_pte(pmd))
 #define __phys_to_pmd_val(phys)	__phys_to_pte_val(phys)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 750a69dde39b..282a4b26218c 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -722,6 +722,250 @@ int kern_addr_valid(unsigned long addr)
 
 	return pfn_valid(pte_pfn(pte));
 }
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void free_hotplug_page_range(struct page *page, size_t size)
+{
+	WARN_ON(!page || PageReserved(page));
+	free_pages((unsigned long)page_address(page), get_order(size));
+}
+
+static void free_hotplug_pgtable_page(struct page *page)
+{
+	free_hotplug_page_range(page, PAGE_SIZE);
+}
+
+static void free_pte_table(pmd_t *pmdp, unsigned long addr)
+{
+	struct page *page;
+	pte_t *ptep;
+	int i;
+
+	ptep = pte_offset_kernel(pmdp, 0UL);
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		if (!pte_none(READ_ONCE(ptep[i])))
+			return;
+	}
+
+	page = pmd_page(READ_ONCE(*pmdp));
+	pmd_clear(pmdp);
+	__flush_tlb_kernel_pgtable(addr);
+	free_hotplug_pgtable_page(page);
+}
+
+static void free_pmd_table(pud_t *pudp, unsigned long addr)
+{
+	struct page *page;
+	pmd_t *pmdp;
+	int i;
+
+	if (CONFIG_PGTABLE_LEVELS <= 2)
+		return;
+
+	pmdp = pmd_offset(pudp, 0UL);
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		if (!pmd_none(READ_ONCE(pmdp[i])))
+			return;
+	}
+
+	page = pud_page(READ_ONCE(*pudp));
+	pud_clear(pudp);
+	__flush_tlb_kernel_pgtable(addr);
+	free_hotplug_pgtable_page(page);
+}
+
+static void free_pud_table(pgd_t *pgdp, unsigned long addr)
+{
+	struct page *page;
+	pud_t *pudp;
+	int i;
+
+	if (CONFIG_PGTABLE_LEVELS <= 3)
+		return;
+
+	pudp = pud_offset(pgdp, 0UL);
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		if (!pud_none(READ_ONCE(pudp[i])))
+			return;
+	}
+
+	page = pgd_page(READ_ONCE(*pgdp));
+	pgd_clear(pgdp);
+	__flush_tlb_kernel_pgtable(addr);
+	free_hotplug_pgtable_page(page);
+}
+
+static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
+				    unsigned long end, bool sparse_vmap)
+{
+	struct page *page;
+	pte_t *ptep, pte;
+
+	do {
+		ptep = pte_offset_kernel(pmdp, addr);
+		pte = READ_ONCE(*ptep);
+		if (pte_none(pte))
+			continue;
+
+		WARN_ON(!pte_present(pte));
+		page = sparse_vmap ? pte_page(pte) : NULL;
+		pte_clear(&init_mm, addr, ptep);
+		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+		if (sparse_vmap)
+			free_hotplug_page_range(page, PAGE_SIZE);
+	} while (addr += PAGE_SIZE, addr < end);
+}
+
+static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
+				    unsigned long end, bool sparse_vmap)
+{
+	unsigned long next;
+	struct page *page;
+	pmd_t *pmdp, pmd;
+
+	do {
+		next = pmd_addr_end(addr, end);
+		pmdp = pmd_offset(pudp, addr);
+		pmd = READ_ONCE(*pmdp);
+		if (pmd_none(pmd))
+			continue;
+
+		WARN_ON(!pmd_present(pmd));
+		if (pmd_sect(pmd)) {
+			page = sparse_vmap ? pmd_page(pmd) : NULL;
+			pmd_clear(pmdp);
+			flush_tlb_kernel_range(addr, next);
+			if (sparse_vmap)
+				free_hotplug_page_range(page, PMD_SIZE);
+			continue;
+		}
+		WARN_ON(!pmd_table(pmd));
+		unmap_hotplug_pte_range(pmdp, addr, next, sparse_vmap);
+	} while (addr = next, addr < end);
+}
+
+static void unmap_hotplug_pud_range(pgd_t *pgdp, unsigned long addr,
+				    unsigned long end, bool sparse_vmap)
+{
+	unsigned long next;
+	struct page *page;
+	pud_t *pudp, pud;
+
+	do {
+		next = pud_addr_end(addr, end);
+		pudp = pud_offset(pgdp, addr);
+		pud = READ_ONCE(*pudp);
+		if (pud_none(pud))
+			continue;
+
+		WARN_ON(!pud_present(pud));
+		if (pud_sect(pud)) {
+			page = sparse_vmap ? pud_page(pud) : NULL;
+			pud_clear(pudp);
+			flush_tlb_kernel_range(addr, next);
+			if (sparse_vmap)
+				free_hotplug_page_range(page, PUD_SIZE);
+			continue;
+		}
+		WARN_ON(!pud_table(pud));
+		unmap_hotplug_pmd_range(pudp, addr, next, sparse_vmap);
+	} while (addr = next, addr < end);
+}
+
+static void unmap_hotplug_range(unsigned long addr, unsigned long end,
+				bool sparse_vmap)
+{
+	unsigned long next;
+	pgd_t *pgdp, pgd;
+
+	do {
+		next = pgd_addr_end(addr, end);
+		pgdp = pgd_offset_k(addr);
+		pgd = READ_ONCE(*pgdp);
+		if (pgd_none(pgd))
+			continue;
+
+		WARN_ON(!pgd_present(pgd));
+		unmap_hotplug_pud_range(pgdp, addr, next, sparse_vmap);
+	} while (addr = next, addr < end);
+}
+
+static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
+				 unsigned long end)
+{
+	pte_t *ptep, pte;
+
+	do {
+		ptep = pte_offset_kernel(pmdp, addr);
+		pte = READ_ONCE(*ptep);
+		WARN_ON(!pte_none(pte));
+	} while (addr += PAGE_SIZE, addr < end);
+}
+
+static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
+				 unsigned long end)
+{
+	unsigned long next;
+	pmd_t *pmdp, pmd;
+
+	do {
+		next = pmd_addr_end(addr, end);
+		pmdp = pmd_offset(pudp, addr);
+		pmd = READ_ONCE(*pmdp);
+		if (pmd_none(pmd))
+			continue;
+
+		WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
+		free_empty_pte_table(pmdp, addr, next);
+		free_pte_table(pmdp, addr);
+	} while (addr = next, addr < end);
+}
+
+static void free_empty_pud_table(pgd_t *pgdp, unsigned long addr,
+				 unsigned long end)
+{
+	unsigned long next;
+	pud_t *pudp, pud;
+
+	do {
+		next = pud_addr_end(addr, end);
+		pudp = pud_offset(pgdp, addr);
+		pud = READ_ONCE(*pudp);
+		if (pud_none(pud))
+			continue;
+
+		WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud));
+		free_empty_pmd_table(pudp, addr, next);
+		free_pmd_table(pudp, addr);
+	} while (addr = next, addr < end);
+}
+
+static void free_empty_tables(unsigned long addr, unsigned long end)
+{
+	unsigned long next;
+	pgd_t *pgdp, pgd;
+
+	do {
+		next = pgd_addr_end(addr, end);
+		pgdp = pgd_offset_k(addr);
+		pgd = READ_ONCE(*pgdp);
+		if (pgd_none(pgd))
+			continue;
+
+		WARN_ON(!pgd_present(pgd));
+		free_empty_pud_table(pgdp, addr, next);
+		free_pud_table(pgdp, addr);
+	} while (addr = next, addr < end);
+}
+
+static void remove_pagetable(unsigned long start, unsigned long end,
+			     bool sparse_vmap)
+{
+	unmap_hotplug_range(start, end, sparse_vmap);
+	free_empty_tables(start, end);
+}
+#endif
+
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 #if !ARM64_SWAPPER_USES_SECTION_MAPS
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
@@ -769,6 +1013,27 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 void vmemmap_free(unsigned long start, unsigned long end,
 		struct vmem_altmap *altmap)
 {
+#ifdef CONFIG_MEMORY_HOTPLUG
+	/*
+	 * FIXME: We should have called remove_pagetable(start, end, true).
+	 * vmemmap and vmalloc virtual range might share intermediate kernel
+	 * page table entries. Removing vmemmap range page table pages here
+	 * can potentially conflict with a concurrent vmalloc() allocation.
+	 *
+	 * This is primarily because vmalloc() does not take init_mm ptl for
+	 * the entire page table walk and it's modification. Instead it just
+	 * takes the lock while allocating and installing page table pages
+	 * via [p4d|pud|pmd|pte]_alloc(). A concurrently vanishing page table
+	 * entry via memory hot remove can cause vmalloc() kernel page table
+	 * walk pointers to be invalid on the fly which can cause corruption
+	 * or worst, a crash.
+	 *
+	 * To avoid this problem, lets not free empty page table pages for
+	 * given vmemmap range being hot-removed. Just unmap and free the
+	 * range instead.
+	 */
+	unmap_hotplug_range(start, end, true);
+#endif
 }
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
@@ -1060,10 +1325,18 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
+static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
+{
+	unsigned long end = start + size;
+
+	WARN_ON(pgdir != init_mm.pgd);
+	remove_pagetable(start, end, false);
+}
+
 int arch_add_memory(int nid, u64 start, u64 size,
 			struct mhp_restrictions *restrictions)
 {
-	int flags = 0;
+	int ret, flags = 0;
 
 	if (rodata_full || debug_pagealloc_enabled())
 		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
@@ -1071,9 +1344,14 @@ int arch_add_memory(int nid, u64 start, u64 size,
 	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
 			     size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
 
-	return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
+	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
 			   restrictions);
+	if (ret)
+		__remove_pgd_mapping(swapper_pg_dir,
+				     __phys_to_virt(start), size);
+	return ret;
 }
+
 void arch_remove_memory(int nid, u64 start, u64 size,
 			struct vmem_altmap *altmap)
 {
@@ -1081,14 +1359,8 @@ void arch_remove_memory(int nid, u64 start, u64 size,
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	struct zone *zone;
 
-	/*
-	 * FIXME: Cleanup page tables (also in arch_add_memory() in case
-	 * adding fails). Until then, this function should only be used
-	 * during memory hotplug (adding memory), not for memory
-	 * unplug. ARCH_ENABLE_MEMORY_HOTREMOVE must not be
-	 * unlocked yet.
-	 */
 	zone = page_zone(pfn_to_page(start_pfn));
 	__remove_pages(zone, start_pfn, nr_pages, altmap);
+	__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
 }
 #endif
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d77d717c620c..47230ebdcb01 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1122,6 +1122,7 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn)
  * PFN_SECTION_SHIFT	pfn to/from section number
  */
 #define PA_SECTION_SHIFT	(SECTION_SIZE_BITS)
+#define PA_SECTION_SIZE		(1UL << PA_SECTION_SHIFT)
 #define PFN_SECTION_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)
 
 #define NR_MEM_SECTIONS		(1UL << SECTIONS_SHIFT)
diff --git a/mm/Kconfig b/mm/Kconfig
index 56cec636a1fc..7c980f483a7d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -677,7 +677,7 @@ config DEV_PAGEMAP_OPS
 
 config HMM_MIRROR
 	bool "HMM mirror CPU page table into a device page table"
-	depends on (X86_64 || PPC64)
+	depends on (X86_64 || PPC64 || ARM64)
 	depends on MMU && 64BIT
 	select MMU_NOTIFIER
 	help
--
2.17.1
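With a guest kernel carrying this patch, hot remove is driven through the standard Linux sysfs memory-block interface, which is what the VMM exercises when shrinking the guest. A minimal sketch from inside the guest; the block number is illustrative:

    # offline a memory block as the first step of hot remove; the arch
    # teardown added above (arch_remove_memory()) runs when the VMM/ACPI
    # subsequently removes the backing device
    echo offline > /sys/devices/system/memory/memory42/state
    cat /sys/devices/system/memory/memory42/state    # -> offline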