-rw-r--r--	arch/arm64/Kconfig	4
-rw-r--r--	arch/arm64/include/asm/kexec.h	5
-rw-r--r--	arch/arm64/include/asm/mmu_context.h	7
-rw-r--r--	arch/arm64/include/asm/trans_pgd.h	39
-rw-r--r--	arch/arm64/kernel/hibernate.c	271
-rw-r--r--	arch/arm64/kernel/machine_kexec.c	57
-rw-r--r--	arch/arm64/kernel/relocate_kernel.S	48
-rw-r--r--	arch/arm64/mm/Makefile	1
-rw-r--r--	arch/arm64/mm/trans_pgd.c	324
9 files changed, 434 insertions, 322 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 3dfb25afa616..43a9867e0dee 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1132,6 +1132,10 @@ config CRASH_DUMP
For more details see Documentation/admin-guide/kdump/kdump.rst
+config TRANS_TABLE
+ def_bool y
+ depends on HIBERNATION
+
config XEN_DOM0
def_bool y
depends on XEN
diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h
index d24b527e8c00..9befcd87e9a8 100644
--- a/arch/arm64/include/asm/kexec.h
+++ b/arch/arm64/include/asm/kexec.h
@@ -90,18 +90,19 @@ static inline void crash_prepare_suspend(void) {}
static inline void crash_post_resume(void) {}
#endif
-#ifdef CONFIG_KEXEC_FILE
#define ARCH_HAS_KIMAGE_ARCH
struct kimage_arch {
void *dtb;
- unsigned long dtb_mem;
+ phys_addr_t dtb_mem;
+ phys_addr_t kern_reloc;
/* Core ELF header buffer */
void *elf_headers;
unsigned long elf_headers_mem;
unsigned long elf_headers_sz;
};
+#ifdef CONFIG_KEXEC_FILE
extern const struct kexec_file_ops kexec_image_ops;
struct kimage;
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 0b3079fd28eb..70ce8c1d2b07 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -81,16 +81,15 @@ static inline bool __cpu_uses_extended_idmap_level(void)
}
/*
- * Set TCR.T0SZ to its default value (based on VA_BITS)
+ * Ensure TCR.T0SZ is set to the provided value.
*/
static inline void __cpu_set_tcr_t0sz(unsigned long t0sz)
{
- unsigned long tcr;
+ unsigned long tcr = read_sysreg(tcr_el1);
- if (!__cpu_uses_extended_idmap())
+ if ((tcr & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET == t0sz)
return;
- tcr = read_sysreg(tcr_el1);
tcr &= ~TCR_T0SZ_MASK;
tcr |= t0sz << TCR_T0SZ_OFFSET;
write_sysreg(tcr, tcr_el1);
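With the t0sz parameter explicit, __cpu_set_tcr_t0sz() is safe to call unconditionally: it reads TCR_EL1, returns early if T0SZ already holds the requested value, and otherwise rewrites just that field. As a rough sketch of how the header's existing wrappers feed it (their exact definitions are an assumption based on the mainline layout, not part of this diff):

	static inline void cpu_set_default_tcr_t0sz(void)
	{
		__cpu_set_tcr_t0sz(TCR_T0SZ(vabits_actual));	/* default VA span */
	}

	static inline void cpu_set_idmap_tcr_t0sz(void)
	{
		__cpu_set_tcr_t0sz(idmap_t0sz);	/* idmap may need a larger span */
	}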
diff --git a/arch/arm64/include/asm/trans_pgd.h b/arch/arm64/include/asm/trans_pgd.h
new file mode 100644
index 000000000000..5d08e5adf3d5
--- /dev/null
+++ b/arch/arm64/include/asm/trans_pgd.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2020, Microsoft Corporation.
+ * Pavel Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#ifndef _ASM_TRANS_TABLE_H
+#define _ASM_TRANS_TABLE_H
+
+#include <linux/bits.h>
+#include <linux/types.h>
+#include <asm/pgtable-types.h>
+
+/*
+ * trans_alloc_page
+ * - Allocator that should return exactly one zeroed page. If this
+ * allocator fails, trans_pgd_create_copy() and trans_pgd_map_page()
+ * return -ENOMEM.
+ *
+ * trans_alloc_arg
+ * - Passed to trans_alloc_page as an argument
+ */
+
+struct trans_pgd_info {
+ void * (*trans_alloc_page)(void *arg);
+ void *trans_alloc_arg;
+};
+
+int trans_pgd_create_copy(struct trans_pgd_info *info, pgd_t **trans_pgd,
+ unsigned long start, unsigned long end);
+
+int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd,
+ void *page, unsigned long dst_addr, pgprot_t pgprot);
+
+int trans_pgd_idmap_page(struct trans_pgd_info *info, phys_addr_t *trans_ttbr0,
+ unsigned long *t0sz, void *page);
+
+#endif /* _ASM_TRANS_TABLE_H */
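The interface deliberately takes an allocator callback rather than calling an allocator itself: hibernate must use get_safe_page(), while a future kexec user can supply its own pool. A minimal hypothetical caller, just to illustrate the contract (example_page_alloc and example_info are illustrative names, not part of the patch):

	static void *example_page_alloc(void *arg)
	{
		/* get_zeroed_page() satisfies the "exactly one zeroed page" rule */
		return (void *)get_zeroed_page((__force gfp_t)(unsigned long)arg);
	}

	static struct trans_pgd_info example_info = {
		.trans_alloc_page	= example_page_alloc,
		.trans_alloc_arg	= (__force void *)GFP_KERNEL,
	};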
diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index 9c9f47e9f7f4..b1cef371df2b 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -16,7 +16,6 @@
#define pr_fmt(x) "hibernate: " x
#include <linux/cpu.h>
#include <linux/kvm_host.h>
-#include <linux/mm.h>
#include <linux/pm.h>
#include <linux/sched.h>
#include <linux/suspend.h>
@@ -31,13 +30,12 @@
#include <asm/memory.h>
#include <asm/mmu_context.h>
#include <asm/mte.h>
-#include <asm/pgalloc.h>
-#include <asm/pgtable-hwdef.h>
#include <asm/sections.h>
#include <asm/smp.h>
#include <asm/smp_plat.h>
#include <asm/suspend.h>
#include <asm/sysreg.h>
+#include <asm/trans_pgd.h>
#include <asm/virt.h>
/*
@@ -178,52 +176,9 @@ int arch_hibernation_header_restore(void *addr)
}
EXPORT_SYMBOL(arch_hibernation_header_restore);
-static int trans_pgd_map_page(pgd_t *trans_pgd, void *page,
- unsigned long dst_addr,
- pgprot_t pgprot)
+static void *hibernate_page_alloc(void *arg)
{
- pgd_t *pgdp;
- p4d_t *p4dp;
- pud_t *pudp;
- pmd_t *pmdp;
- pte_t *ptep;
-
- pgdp = pgd_offset_pgd(trans_pgd, dst_addr);
- if (pgd_none(READ_ONCE(*pgdp))) {
- pudp = (void *)get_safe_page(GFP_ATOMIC);
- if (!pudp)
- return -ENOMEM;
- pgd_populate(&init_mm, pgdp, pudp);
- }
-
- p4dp = p4d_offset(pgdp, dst_addr);
- if (p4d_none(READ_ONCE(*p4dp))) {
- pudp = (void *)get_safe_page(GFP_ATOMIC);
- if (!pudp)
- return -ENOMEM;
- p4d_populate(&init_mm, p4dp, pudp);
- }
-
- pudp = pud_offset(p4dp, dst_addr);
- if (pud_none(READ_ONCE(*pudp))) {
- pmdp = (void *)get_safe_page(GFP_ATOMIC);
- if (!pmdp)
- return -ENOMEM;
- pud_populate(&init_mm, pudp, pmdp);
- }
-
- pmdp = pmd_offset(pudp, dst_addr);
- if (pmd_none(READ_ONCE(*pmdp))) {
- ptep = (void *)get_safe_page(GFP_ATOMIC);
- if (!ptep)
- return -ENOMEM;
- pmd_populate_kernel(&init_mm, pmdp, ptep);
- }
-
- ptep = pte_offset_kernel(pmdp, dst_addr);
- set_pte(ptep, pfn_pte(virt_to_pfn(page), PAGE_KERNEL_EXEC));
-
- return 0;
+ return (void *)get_safe_page((__force gfp_t)(unsigned long)arg);
}
/*
@@ -239,11 +194,16 @@ static int trans_pgd_map_page(pgd_t *trans_pgd, void *page,
* page system.
*/
static int create_safe_exec_page(void *src_start, size_t length,
- unsigned long dst_addr,
phys_addr_t *phys_dst_addr)
{
+ struct trans_pgd_info trans_info = {
+ .trans_alloc_page = hibernate_page_alloc,
+ .trans_alloc_arg = (__force void *)GFP_ATOMIC,
+ };
+
void *page = (void *)get_safe_page(GFP_ATOMIC);
- pgd_t *trans_pgd;
+ phys_addr_t trans_ttbr0;
+ unsigned long t0sz;
int rc;
if (!page)
@@ -251,13 +211,7 @@ static int create_safe_exec_page(void *src_start, size_t length,
memcpy(page, src_start, length);
__flush_icache_range((unsigned long)page, (unsigned long)page + length);
-
- trans_pgd = (void *)get_safe_page(GFP_ATOMIC);
- if (!trans_pgd)
- return -ENOMEM;
-
- rc = trans_pgd_map_page(trans_pgd, page, dst_addr,
- PAGE_KERNEL_EXEC);
+ rc = trans_pgd_idmap_page(&trans_info, &trans_ttbr0, &t0sz, page);
if (rc)
return rc;
@@ -270,12 +224,15 @@ static int create_safe_exec_page(void *src_start, size_t length,
* page, but TLBs may contain stale ASID-tagged entries (e.g. for EFI
* runtime services), while for a userspace-driven test_resume cycle it
* points to userspace page tables (and we must point it at a zero page
- * ourselves). Elsewhere we only (un)install the idmap with preemption
- * disabled, so T0SZ should be as required regardless.
+ * ourselves).
+ *
+ * We change T0SZ as part of installing the idmap. This is undone by
+ * cpu_uninstall_idmap() in __cpu_suspend_exit().
*/
cpu_set_reserved_ttbr0();
local_flush_tlb_all();
- write_sysreg(phys_to_ttbr(virt_to_phys(trans_pgd)), ttbr0_el1);
+ __cpu_set_tcr_t0sz(t0sz);
+ write_sysreg(trans_ttbr0, ttbr0_el1);
isb();
*phys_dst_addr = virt_to_phys(page);
@@ -462,182 +419,6 @@ int swsusp_arch_suspend(void)
return ret;
}
-static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr)
-{
- pte_t pte = READ_ONCE(*src_ptep);
-
- if (pte_valid(pte)) {
- /*
- * Resume will overwrite areas that may be marked
- * read only (code, rodata). Clear the RDONLY bit from
- * the temporary mappings we use during restore.
- */
- set_pte(dst_ptep, pte_mkwrite(pte));
- } else if (debug_pagealloc_enabled() && !pte_none(pte)) {
- /*
- * debug_pagealloc will removed the PTE_VALID bit if
- * the page isn't in use by the resume kernel. It may have
- * been in use by the original kernel, in which case we need
- * to put it back in our copy to do the restore.
- *
- * Before marking this entry valid, check the pfn should
- * be mapped.
- */
- BUG_ON(!pfn_valid(pte_pfn(pte)));
-
- set_pte(dst_ptep, pte_mkpresent(pte_mkwrite(pte)));
- }
-}
-
-static int copy_pte(pmd_t *dst_pmdp, pmd_t *src_pmdp, unsigned long start,
- unsigned long end)
-{
- pte_t *src_ptep;
- pte_t *dst_ptep;
- unsigned long addr = start;
-
- dst_ptep = (pte_t *)get_safe_page(GFP_ATOMIC);
- if (!dst_ptep)
- return -ENOMEM;
- pmd_populate_kernel(&init_mm, dst_pmdp, dst_ptep);
- dst_ptep = pte_offset_kernel(dst_pmdp, start);
-
- src_ptep = pte_offset_kernel(src_pmdp, start);
- do {
- _copy_pte(dst_ptep, src_ptep, addr);
- } while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end);
-
- return 0;
-}
-
-static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start,
- unsigned long end)
-{
- pmd_t *src_pmdp;
- pmd_t *dst_pmdp;
- unsigned long next;
- unsigned long addr = start;
-
- if (pud_none(READ_ONCE(*dst_pudp))) {
- dst_pmdp = (pmd_t *)get_safe_page(GFP_ATOMIC);
- if (!dst_pmdp)
- return -ENOMEM;
- pud_populate(&init_mm, dst_pudp, dst_pmdp);
- }
- dst_pmdp = pmd_offset(dst_pudp, start);
-
- src_pmdp = pmd_offset(src_pudp, start);
- do {
- pmd_t pmd = READ_ONCE(*src_pmdp);
-
- next = pmd_addr_end(addr, end);
- if (pmd_none(pmd))
- continue;
- if (pmd_table(pmd)) {
- if (copy_pte(dst_pmdp, src_pmdp, addr, next))
- return -ENOMEM;
- } else {
- set_pmd(dst_pmdp,
- __pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY));
- }
- } while (dst_pmdp++, src_pmdp++, addr = next, addr != end);
-
- return 0;
-}
-
-static int copy_pud(p4d_t *dst_p4dp, p4d_t *src_p4dp, unsigned long start,
- unsigned long end)
-{
- pud_t *dst_pudp;
- pud_t *src_pudp;
- unsigned long next;
- unsigned long addr = start;
-
- if (p4d_none(READ_ONCE(*dst_p4dp))) {
- dst_pudp = (pud_t *)get_safe_page(GFP_ATOMIC);
- if (!dst_pudp)
- return -ENOMEM;
- p4d_populate(&init_mm, dst_p4dp, dst_pudp);
- }
- dst_pudp = pud_offset(dst_p4dp, start);
-
- src_pudp = pud_offset(src_p4dp, start);
- do {
- pud_t pud = READ_ONCE(*src_pudp);
-
- next = pud_addr_end(addr, end);
- if (pud_none(pud))
- continue;
- if (pud_table(pud)) {
- if (copy_pmd(dst_pudp, src_pudp, addr, next))
- return -ENOMEM;
- } else {
- set_pud(dst_pudp,
- __pud(pud_val(pud) & ~PUD_SECT_RDONLY));
- }
- } while (dst_pudp++, src_pudp++, addr = next, addr != end);
-
- return 0;
-}
-
-static int copy_p4d(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start,
- unsigned long end)
-{
- p4d_t *dst_p4dp;
- p4d_t *src_p4dp;
- unsigned long next;
- unsigned long addr = start;
-
- dst_p4dp = p4d_offset(dst_pgdp, start);
- src_p4dp = p4d_offset(src_pgdp, start);
- do {
- next = p4d_addr_end(addr, end);
- if (p4d_none(READ_ONCE(*src_p4dp)))
- continue;
- if (copy_pud(dst_p4dp, src_p4dp, addr, next))
- return -ENOMEM;
- } while (dst_p4dp++, src_p4dp++, addr = next, addr != end);
-
- return 0;
-}
-
-static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start,
- unsigned long end)
-{
- unsigned long next;
- unsigned long addr = start;
- pgd_t *src_pgdp = pgd_offset_k(start);
-
- dst_pgdp = pgd_offset_pgd(dst_pgdp, start);
- do {
- next = pgd_addr_end(addr, end);
- if (pgd_none(READ_ONCE(*src_pgdp)))
- continue;
- if (copy_p4d(dst_pgdp, src_pgdp, addr, next))
- return -ENOMEM;
- } while (dst_pgdp++, src_pgdp++, addr = next, addr != end);
-
- return 0;
-}
-
-static int trans_pgd_create_copy(pgd_t **dst_pgdp, unsigned long start,
- unsigned long end)
-{
- int rc;
- pgd_t *trans_pgd = (pgd_t *)get_safe_page(GFP_ATOMIC);
-
- if (!trans_pgd) {
- pr_err("Failed to allocate memory for temporary page tables.\n");
- return -ENOMEM;
- }
-
- rc = copy_page_tables(trans_pgd, start, end);
- if (!rc)
- *dst_pgdp = trans_pgd;
-
- return rc;
-}
-
/*
* Setup then Resume from the hibernate image using swsusp_arch_suspend_exit().
*
@@ -650,16 +431,20 @@ int swsusp_arch_resume(void)
void *zero_page;
size_t exit_size;
pgd_t *tmp_pg_dir;
- phys_addr_t phys_hibernate_exit;
void __noreturn (*hibernate_exit)(phys_addr_t, phys_addr_t, void *,
void *, phys_addr_t, phys_addr_t);
+ struct trans_pgd_info trans_info = {
+ .trans_alloc_page = hibernate_page_alloc,
+ .trans_alloc_arg = (__force void *)GFP_ATOMIC,
+ };
/*
* Restoring the memory image will overwrite the ttbr1 page tables.
* Create a second copy of just the linear map, and use this when
* restoring.
*/
- rc = trans_pgd_create_copy(&tmp_pg_dir, PAGE_OFFSET, PAGE_END);
+ rc = trans_pgd_create_copy(&trans_info, &tmp_pg_dir, PAGE_OFFSET,
+ PAGE_END);
if (rc)
return rc;
@@ -673,19 +458,13 @@ int swsusp_arch_resume(void)
return -ENOMEM;
}
- /*
- * Locate the exit code in the bottom-but-one page, so that *NULL
- * still has disastrous affects.
- */
- hibernate_exit = (void *)PAGE_SIZE;
exit_size = __hibernate_exit_text_end - __hibernate_exit_text_start;
/*
* Copy swsusp_arch_suspend_exit() to a safe page. This will generate
* a new set of ttbr0 page tables and load them.
*/
rc = create_safe_exec_page(__hibernate_exit_text_start, exit_size,
- (unsigned long)hibernate_exit,
- &phys_hibernate_exit);
+ (phys_addr_t *)&hibernate_exit);
if (rc) {
pr_err("Failed to create safe executable page for hibernate_exit code.\n");
return rc;
@@ -704,7 +483,7 @@ int swsusp_arch_resume(void)
* We can skip this step if we booted at EL1, or are running with VHE.
*/
if (el2_reset_needed()) {
- phys_addr_t el2_vectors = phys_hibernate_exit; /* base */
+ phys_addr_t el2_vectors = (phys_addr_t)hibernate_exit;
el2_vectors += hibernate_el2_vectors -
__hibernate_exit_text_start; /* offset */
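The casts above work because the exec page is now idmapped: create_safe_exec_page() returns the physical address of the copied code, and under the idmap installed by trans_pgd_idmap_page() that physical address is also a valid virtual address. Spelled out as a sketch (equivalent to the cast in the diff):

	phys_addr_t phys_exit;

	rc = create_safe_exec_page(__hibernate_exit_text_start, exit_size,
				   &phys_exit);
	/* VA == PA for this page while the idmap is live, so the physical
	 * address can be used directly as the call target. */
	hibernate_exit = (void *)phys_exit;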
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index a0b144cfaea7..90a335c74442 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -42,6 +42,7 @@ static void _kexec_image_info(const char *func, int line,
pr_debug(" start: %lx\n", kimage->start);
pr_debug(" head: %lx\n", kimage->head);
pr_debug(" nr_segments: %lu\n", kimage->nr_segments);
+ pr_debug(" kern_reloc: %pa\n", &kimage->arch.kern_reloc);
for (i = 0; i < kimage->nr_segments; i++) {
pr_debug(" segment[%lu]: %016lx - %016lx, 0x%lx bytes, %lu pages\n",
@@ -58,6 +59,23 @@ void machine_kexec_cleanup(struct kimage *kimage)
/* Empty routine needed to avoid build errors. */
}
+int machine_kexec_post_load(struct kimage *kimage)
+{
+ void *reloc_code = page_to_virt(kimage->control_code_page);
+
+ memcpy(reloc_code, arm64_relocate_new_kernel,
+ arm64_relocate_new_kernel_size);
+ kimage->arch.kern_reloc = __pa(reloc_code);
+ kexec_image_info(kimage);
+
+ /* Flush the reloc_code in preparation for its execution. */
+ __flush_dcache_area(reloc_code, arm64_relocate_new_kernel_size);
+ flush_icache_range((uintptr_t)reloc_code, (uintptr_t)reloc_code +
+ arm64_relocate_new_kernel_size);
+
+ return 0;
+}
+
/**
* machine_kexec_prepare - Prepare for a kexec reboot.
*
@@ -67,8 +85,6 @@ void machine_kexec_cleanup(struct kimage *kimage)
*/
int machine_kexec_prepare(struct kimage *kimage)
{
- kexec_image_info(kimage);
-
if (kimage->type != KEXEC_TYPE_CRASH && cpus_are_stuck_in_kernel()) {
pr_err("Can't kexec: CPUs are stuck in the kernel.\n");
return -EBUSY;
@@ -143,8 +159,6 @@ static void kexec_segment_flush(const struct kimage *kimage)
*/
void machine_kexec(struct kimage *kimage)
{
- phys_addr_t reboot_code_buffer_phys;
- void *reboot_code_buffer;
bool in_kexec_crash = (kimage == kexec_crash_image);
bool stuck_cpus = cpus_are_stuck_in_kernel();
@@ -155,31 +169,6 @@ void machine_kexec(struct kimage *kimage)
WARN(in_kexec_crash && (stuck_cpus || smp_crash_stop_failed()),
"Some CPUs may be stale, kdump will be unreliable.\n");
- reboot_code_buffer_phys = page_to_phys(kimage->control_code_page);
- reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys);
-
- kexec_image_info(kimage);
-
- /*
- * Copy arm64_relocate_new_kernel to the reboot_code_buffer for use
- * after the kernel is shut down.
- */
- memcpy(reboot_code_buffer, arm64_relocate_new_kernel,
- arm64_relocate_new_kernel_size);
-
- /* Flush the reboot_code_buffer in preparation for its execution. */
- __flush_dcache_area(reboot_code_buffer, arm64_relocate_new_kernel_size);
-
- /*
- * Although we've killed off the secondary CPUs, we don't update
- * the online mask if we're handling a crash kernel and consequently
- * need to avoid flush_icache_range(), which will attempt to IPI
- * the offline CPUs. Therefore, we must use the __* variant here.
- */
- __flush_icache_range((uintptr_t)reboot_code_buffer,
- (uintptr_t)reboot_code_buffer +
- arm64_relocate_new_kernel_size);
-
/* Flush the kimage list and its buffers. */
kexec_list_flush(kimage);
@@ -193,7 +182,7 @@ void machine_kexec(struct kimage *kimage)
/*
* cpu_soft_restart will shutdown the MMU, disable data caches, then
- * transfer control to the reboot_code_buffer which contains a copy of
+ * transfer control to kern_reloc, which contains a copy of
* the arm64_relocate_new_kernel routine. arm64_relocate_new_kernel
* uses physical addressing to relocate the new image to its final
* position and transfers control to the image entry point when the
@@ -203,12 +192,8 @@ void machine_kexec(struct kimage *kimage)
* userspace (kexec-tools).
* In kexec_file case, the kernel starts directly without purgatory.
*/
- cpu_soft_restart(reboot_code_buffer_phys, kimage->head, kimage->start,
-#ifdef CONFIG_KEXEC_FILE
- kimage->arch.dtb_mem);
-#else
- 0);
-#endif
+ cpu_soft_restart(kimage->arch.kern_reloc, kimage->head, kimage->start,
+ kimage->arch.dtb_mem);
BUG(); /* Should never get here. */
}
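Moving the copy and cache maintenance into machine_kexec_post_load() does the work at image-load time rather than on the reboot path; it also allows plain flush_icache_range(), since all CPUs are still online at that point (the old reboot-path code had to use the __* variant to avoid IPIs to offlined CPUs). The hook itself is provided by the generic kexec core as a weak no-op that this file now overrides; a sketch of the assumed core-side definition (paraphrasing kernel/kexec_core.c):

	/* Generic default; arch code such as the arm64 version above
	 * overrides this weak symbol. */
	int __weak machine_kexec_post_load(struct kimage *image)
	{
		return 0;
	}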
diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S
index 84eec95ec06c..b78ea5de97a4 100644
--- a/arch/arm64/kernel/relocate_kernel.S
+++ b/arch/arm64/kernel/relocate_kernel.S
@@ -17,28 +17,24 @@
/*
* arm64_relocate_new_kernel - Put a 2nd stage image in place and boot it.
*
- * The memory that the old kernel occupies may be overwritten when coping the
+ * The memory that the old kernel occupies may be overwritten when copying the
* new image to its final location. To assure that the
* arm64_relocate_new_kernel routine which does that copy is not overwritten,
* all code and data needed by arm64_relocate_new_kernel must be between the
* symbols arm64_relocate_new_kernel and arm64_relocate_new_kernel_end. The
* machine_kexec() routine will copy arm64_relocate_new_kernel to the kexec
- * control_code_page, a special page which has been set up to be preserved
- * during the copy operation.
+ * safe memory that has been set up to be preserved during the copy operation.
*/
SYM_CODE_START(arm64_relocate_new_kernel)
-
/* Setup the list loop variables. */
mov x18, x2 /* x18 = dtb address */
mov x17, x1 /* x17 = kimage_start */
mov x16, x0 /* x16 = kimage_head */
- raw_dcache_line_size x15, x0 /* x15 = dcache line size */
mov x14, xzr /* x14 = entry ptr */
mov x13, xzr /* x13 = copy dest */
-
/* Check if the new image needs relocation. */
tbnz x16, IND_DONE_BIT, .Ldone
-
+ raw_dcache_line_size x15, x1 /* x15 = dcache line size */
.Lloop:
and x12, x16, PAGE_MASK /* x12 = addr */
@@ -47,44 +43,28 @@ SYM_CODE_START(arm64_relocate_new_kernel)
tbz x16, IND_SOURCE_BIT, .Ltest_indirection
/* Invalidate dest page to PoC. */
- mov x0, x13
- add x20, x0, #PAGE_SIZE
+ mov x2, x13
+ add x20, x2, #PAGE_SIZE
sub x1, x15, #1
- bic x0, x0, x1
-2: dc ivac, x0
- add x0, x0, x15
- cmp x0, x20
+ bic x2, x2, x1
+2: dc ivac, x2
+ add x2, x2, x15
+ cmp x2, x20
b.lo 2b
dsb sy
- mov x20, x13
- mov x21, x12
- copy_page x20, x21, x0, x1, x2, x3, x4, x5, x6, x7
-
- /* dest += PAGE_SIZE */
- add x13, x13, PAGE_SIZE
+ copy_page x13, x12, x1, x2, x3, x4, x5, x6, x7, x8
b .Lnext
-
.Ltest_indirection:
tbz x16, IND_INDIRECTION_BIT, .Ltest_destination
-
- /* ptr = addr */
- mov x14, x12
+ mov x14, x12 /* ptr = addr */
b .Lnext
-
.Ltest_destination:
tbz x16, IND_DESTINATION_BIT, .Lnext
-
- /* dest = addr */
- mov x13, x12
-
+ mov x13, x12 /* dest = addr */
.Lnext:
- /* entry = *ptr++ */
- ldr x16, [x14], #8
-
- /* while (!(entry & DONE)) */
- tbz x16, IND_DONE_BIT, .Lloop
-
+ ldr x16, [x14], #8 /* entry = *ptr++ */
+ tbz x16, IND_DONE_BIT, .Lloop /* while (!(entry & DONE)) */
.Ldone:
/* wait for writes from copy_page to finish */
dsb nsh
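The list walk is easier to follow in C. An illustrative pseudo-C rendering of the loop above (invalidate_to_poc() is a stand-in for the dc ivac loop; the IND_* macros are the real kexec flags):

	unsigned long entry = kimage_head, *ptr = NULL;
	unsigned long dest = 0, addr;

	if (!(entry & IND_DONE)) {
		do {
			addr = entry & PAGE_MASK;
			if (entry & IND_SOURCE) {
				invalidate_to_poc(dest, PAGE_SIZE);
				copy_page(dest, addr);	/* advances dest */
			} else if (entry & IND_INDIRECTION) {
				ptr = (unsigned long *)addr;
			} else if (entry & IND_DESTINATION) {
				dest = addr;
			}
			entry = *ptr++;
		} while (!(entry & IND_DONE));
	}

Passing x13 (dest) straight into the copy_page macro works because the macro post-increments its dest register, which is why the explicit "dest += PAGE_SIZE" could be dropped.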
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile
index 5ead3c3de3b6..77222d92667a 100644
--- a/arch/arm64/mm/Makefile
+++ b/arch/arm64/mm/Makefile
@@ -6,6 +6,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o
+obj-$(CONFIG_TRANS_TABLE) += trans_pgd.o
obj-$(CONFIG_NUMA) += numa.o
obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o
obj-$(CONFIG_ARM64_MTE) += mteswap.o
diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c
new file mode 100644
index 000000000000..527f0a39c3da
--- /dev/null
+++ b/arch/arm64/mm/trans_pgd.c
@@ -0,0 +1,324 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Transitional page tables for kexec and hibernate
+ *
+ * This file is derived from arch/arm64/kernel/hibernate.c.
+ *
+ * Copyright (c) 2020, Microsoft Corporation.
+ * Pavel Tatashin <pasha.tatashin@soleen.com>
+ *
+ */
+
+/*
+ * Transitional page tables are used while the system transfers from one
+ * world to another, such as during hibernate restore or a kexec reboot.
+ * During these phases one cannot rely on the current page tables not being
+ * overwritten, because hibernate and kexec may overwrite them mid-transition.
+ */
+
+#include <linux/bug.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/suspend.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/trans_pgd.h>
+
+static void *trans_alloc(struct trans_pgd_info *info)
+{
+ return info->trans_alloc_page(info->trans_alloc_arg);
+}
+
+static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr)
+{
+ pte_t pte = READ_ONCE(*src_ptep);
+
+ if (pte_valid(pte)) {
+ /*
+ * Resume will overwrite areas that may be marked
+ * read only (code, rodata). Clear the RDONLY bit from
+ * the temporary mappings we use during restore.
+ */
+ set_pte(dst_ptep, pte_mkwrite(pte));
+ } else if (debug_pagealloc_enabled() && !pte_none(pte)) {
+ /*
+ * debug_pagealloc will have removed the PTE_VALID bit if
+ * the page isn't in use by the resume kernel. It may have
+ * been in use by the original kernel, in which case we need
+ * to put it back in our copy to do the restore.
+ *
+ * Before marking this entry valid, check that the pfn is
+ * mapped.
+ */
+ BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+ set_pte(dst_ptep, pte_mkpresent(pte_mkwrite(pte)));
+ }
+}
+
+static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp,
+ pmd_t *src_pmdp, unsigned long start, unsigned long end)
+{
+ pte_t *src_ptep;
+ pte_t *dst_ptep;
+ unsigned long addr = start;
+
+ dst_ptep = trans_alloc(info);
+ if (!dst_ptep)
+ return -ENOMEM;
+ pmd_populate_kernel(NULL, dst_pmdp, dst_ptep);
+ dst_ptep = pte_offset_kernel(dst_pmdp, start);
+
+ src_ptep = pte_offset_kernel(src_pmdp, start);
+ do {
+ _copy_pte(dst_ptep, src_ptep, addr);
+ } while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end);
+
+ return 0;
+}
+
+static int copy_pmd(struct trans_pgd_info *info, pud_t *dst_pudp,
+ pud_t *src_pudp, unsigned long start, unsigned long end)
+{
+ pmd_t *src_pmdp;
+ pmd_t *dst_pmdp;
+ unsigned long next;
+ unsigned long addr = start;
+
+ if (pud_none(READ_ONCE(*dst_pudp))) {
+ dst_pmdp = trans_alloc(info);
+ if (!dst_pmdp)
+ return -ENOMEM;
+ pud_populate(NULL, dst_pudp, dst_pmdp);
+ }
+ dst_pmdp = pmd_offset(dst_pudp, start);
+
+ src_pmdp = pmd_offset(src_pudp, start);
+ do {
+ pmd_t pmd = READ_ONCE(*src_pmdp);
+
+ next = pmd_addr_end(addr, end);
+ if (pmd_none(pmd))
+ continue;
+ if (pmd_table(pmd)) {
+ if (copy_pte(info, dst_pmdp, src_pmdp, addr, next))
+ return -ENOMEM;
+ } else {
+ set_pmd(dst_pmdp,
+ __pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY));
+ }
+ } while (dst_pmdp++, src_pmdp++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int copy_pud(struct trans_pgd_info *info, p4d_t *dst_p4dp,
+ p4d_t *src_p4dp, unsigned long start,
+ unsigned long end)
+{
+ pud_t *dst_pudp;
+ pud_t *src_pudp;
+ unsigned long next;
+ unsigned long addr = start;
+
+ if (p4d_none(READ_ONCE(*dst_p4dp))) {
+ dst_pudp = trans_alloc(info);
+ if (!dst_pudp)
+ return -ENOMEM;
+ p4d_populate(NULL, dst_p4dp, dst_pudp);
+ }
+ dst_pudp = pud_offset(dst_p4dp, start);
+
+ src_pudp = pud_offset(src_p4dp, start);
+ do {
+ pud_t pud = READ_ONCE(*src_pudp);
+
+ next = pud_addr_end(addr, end);
+ if (pud_none(pud))
+ continue;
+ if (pud_table(pud)) {
+ if (copy_pmd(info, dst_pudp, src_pudp, addr, next))
+ return -ENOMEM;
+ } else {
+ set_pud(dst_pudp,
+ __pud(pud_val(pud) & ~PUD_SECT_RDONLY));
+ }
+ } while (dst_pudp++, src_pudp++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int copy_p4d(struct trans_pgd_info *info, pgd_t *dst_pgdp,
+ pgd_t *src_pgdp, unsigned long start,
+ unsigned long end)
+{
+ p4d_t *dst_p4dp;
+ p4d_t *src_p4dp;
+ unsigned long next;
+ unsigned long addr = start;
+
+ dst_p4dp = p4d_offset(dst_pgdp, start);
+ src_p4dp = p4d_offset(src_pgdp, start);
+ do {
+ next = p4d_addr_end(addr, end);
+ if (p4d_none(READ_ONCE(*src_p4dp)))
+ continue;
+ if (copy_pud(info, dst_p4dp, src_p4dp, addr, next))
+ return -ENOMEM;
+ } while (dst_p4dp++, src_p4dp++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int copy_page_tables(struct trans_pgd_info *info, pgd_t *dst_pgdp,
+ unsigned long start, unsigned long end)
+{
+ unsigned long next;
+ unsigned long addr = start;
+ pgd_t *src_pgdp = pgd_offset_k(start);
+
+ dst_pgdp = pgd_offset_pgd(dst_pgdp, start);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none(READ_ONCE(*src_pgdp)))
+ continue;
+ if (copy_p4d(info, dst_pgdp, src_pgdp, addr, next))
+ return -ENOMEM;
+ } while (dst_pgdp++, src_pgdp++, addr = next, addr != end);
+
+ return 0;
+}
+
+/*
+ * Create trans_pgd and copy linear map.
+ * info: contains allocator and its argument
+ * dst_pgdp: new page table that is created, and to which the map is copied.
+ * start: Start of the interval (inclusive).
+ * end: End of the interval (exclusive).
+ *
+ * Returns 0 on success, and -ENOMEM on failure.
+ */
+int trans_pgd_create_copy(struct trans_pgd_info *info, pgd_t **dst_pgdp,
+ unsigned long start, unsigned long end)
+{
+ int rc;
+ pgd_t *trans_pgd = trans_alloc(info);
+
+ if (!trans_pgd) {
+ pr_err("Failed to allocate memory for temporary page tables.\n");
+ return -ENOMEM;
+ }
+
+ rc = copy_page_tables(info, trans_pgd, start, end);
+ if (!rc)
+ *dst_pgdp = trans_pgd;
+
+ return rc;
+}
+
+/*
+ * Add map entry to trans_pgd for a base-size page at PTE level.
+ * info: contains allocator and its argument
+ * trans_pgd: page table in which new map is added.
+ * page: page to be mapped.
+ * dst_addr: new VA for the page.
+ * pgprot: protection for the page.
+ *
+ * Returns 0 on success, and -ENOMEM on failure.
+ */
+int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd,
+ void *page, unsigned long dst_addr, pgprot_t pgprot)
+{
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ pgdp = pgd_offset_pgd(trans_pgd, dst_addr);
+ if (pgd_none(READ_ONCE(*pgdp))) {
+ p4dp = trans_alloc(info);
+ if (!p4dp)
+ return -ENOMEM;
+ pgd_populate(NULL, pgdp, p4dp);
+ }
+
+ p4dp = p4d_offset(pgdp, dst_addr);
+ if (p4d_none(READ_ONCE(*p4dp))) {
+ pudp = trans_alloc(info);
+ if (!pudp)
+ return -ENOMEM;
+ p4d_populate(NULL, p4dp, pudp);
+ }
+
+ pudp = pud_offset(p4dp, dst_addr);
+ if (pud_none(READ_ONCE(*pudp))) {
+ pmdp = trans_alloc(info);
+ if (!pmdp)
+ return -ENOMEM;
+ pud_populate(NULL, pudp, pmdp);
+ }
+
+ pmdp = pmd_offset(pudp, dst_addr);
+ if (pmd_none(READ_ONCE(*pmdp))) {
+ ptep = trans_alloc(info);
+ if (!ptep)
+ return -ENOMEM;
+ pmd_populate_kernel(NULL, pmdp, ptep);
+ }
+
+ ptep = pte_offset_kernel(pmdp, dst_addr);
+ set_pte(ptep, pfn_pte(virt_to_pfn(page), pgprot));
+
+ return 0;
+}
+
+/*
+ * The page we want to idmap may be outside the range covered by VA_BITS that
+ * can be built using the kernel's p?d_populate() helpers. As a one off, for a
+ * single page, we build these page tables bottom up and just assume that we
+ * will need the maximum T0SZ.
+ *
+ * Returns 0 on success, and -ENOMEM on failure.
+ * On success, trans_ttbr0 contains the page table with the idmapped page and
+ * t0sz is set to the maximum T0SZ needed to map this page.
+ */
+int trans_pgd_idmap_page(struct trans_pgd_info *info, phys_addr_t *trans_ttbr0,
+ unsigned long *t0sz, void *page)
+{
+ phys_addr_t dst_addr = virt_to_phys(page);
+ unsigned long pfn = __phys_to_pfn(dst_addr);
+ int max_msb = (dst_addr & GENMASK(52, 48)) ? 51 : 47;
+ int bits_mapped = PAGE_SHIFT - 4;
+ unsigned long level_mask, prev_level_entry, *levels[4];
+ int this_level, index, level_lsb, level_msb;
+
+ dst_addr &= PAGE_MASK;
+ prev_level_entry = pte_val(pfn_pte(pfn, PAGE_KERNEL_EXEC));
+
+ for (this_level = 3; this_level >= 0; this_level--) {
+ levels[this_level] = trans_alloc(info);
+ if (!levels[this_level])
+ return -ENOMEM;
+
+ level_lsb = ARM64_HW_PGTABLE_LEVEL_SHIFT(this_level);
+ level_msb = min(level_lsb + bits_mapped, max_msb);
+ level_mask = GENMASK_ULL(level_msb, level_lsb);
+
+ index = (dst_addr & level_mask) >> level_lsb;
+ *(levels[this_level] + index) = prev_level_entry;
+
+ pfn = virt_to_pfn(levels[this_level]);
+ prev_level_entry = pte_val(pfn_pte(pfn,
+ __pgprot(PMD_TYPE_TABLE)));
+
+ if (level_msb == max_msb)
+ break;
+ }
+
+ *trans_ttbr0 = phys_to_ttbr(__pfn_to_phys(pfn));
+ *t0sz = TCR_T0SZ(max_msb + 1);
+
+ return 0;
+}
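To make the level arithmetic concrete, here is the walk for 4K pages (PAGE_SHIFT = 12) with a destination below the 48-bit boundary; a worked example, not code from the patch:

	/* max_msb = 47, bits_mapped = PAGE_SHIFT - 4 = 8
	 *
	 * this_level  level_lsb  level_msb  index taken from
	 *      3          12         20      dst_addr[20:12]
	 *      2          21         29      dst_addr[29:21]
	 *      1          30         38      dst_addr[38:30]
	 *      0          39         47      dst_addr[47:39]
	 *
	 * level_msb reaches max_msb at level 0, so the loop stops after
	 * four levels and *t0sz = TCR_T0SZ(48): a 48-bit idmapped range.
	 */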