From 4363287178a85e41cd59f9f1d423fbe1f9048ec8 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 19 Aug 2020 17:10:11 +0300 Subject: [PATCH 01/22] riscv/mm: Simplify retry logic in do_page_fault() Let's combine the two retry logic if statements in do_page_fault() to simplify the code. Signed-off-by: Pekka Enberg Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/fault.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 716d64e36f83..f5c2e4a249eb 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -127,17 +127,15 @@ good_area: BUG(); } - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - flags |= FAULT_FLAG_TRIED; + if (unlikely((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY))) { + flags |= FAULT_FLAG_TRIED; - /* - * No need to mmap_read_unlock(mm) as we would - * have already released it in __lock_page_or_retry - * in mm/filemap.c. - */ - goto retry; - } + /* + * No need to mmap_read_unlock(mm) as we would + * have already released it in __lock_page_or_retry + * in mm/filemap.c. + */ + goto retry; } mmap_read_unlock(mm); From cac4d1dc85be2996ea19aea4f6ac6525f4c97171 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 25 Aug 2020 18:38:58 +0300 Subject: [PATCH 02/22] riscv/mm/fault: Move no context handling to no_context() This patch moves the no context handling in do_page_fault() to no_context() function and converts gotos to calls to the new function. 
Signed-off-by: Pekka Enberg Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/fault.c | 83 +++++++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 31 deletions(-) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index f5c2e4a249eb..1612552478c5 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -19,6 +19,24 @@ #include "../kernel/head.h" +static inline void no_context(struct pt_regs *regs, unsigned long addr) +{ + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs)) + return; + + /* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ + bust_spinlocks(1); + pr_alert("Unable to handle kernel %s at virtual address " REG_FMT "\n", + (addr < PAGE_SIZE) ? "NULL pointer dereference" : + "paging request", addr); + die(regs, "Oops"); + do_exit(SIGKILL); +} + /* * This routine handles page faults. It determines the address and the * problem, and then passes it off to one of the appropriate routines. @@ -59,8 +77,10 @@ asmlinkage void do_page_fault(struct pt_regs *regs) * If we're in an interrupt, have no user context, or are running * in an atomic region, then we must not take the fault. */ - if (unlikely(faulthandler_disabled() || !mm)) - goto no_context; + if (unlikely(faulthandler_disabled() || !mm)) { + no_context(regs, addr); + return; + } if (user_mode(regs)) flags |= FAULT_FLAG_USER; @@ -153,21 +173,8 @@ bad_area: return; } -no_context: - /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs)) - return; - - /* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - bust_spinlocks(1); - pr_alert("Unable to handle kernel %s at virtual address " REG_FMT "\n", - (addr < PAGE_SIZE) ? 
"NULL pointer dereference" : - "paging request", addr); - die(regs, "Oops"); - do_exit(SIGKILL); + no_context(regs, addr); + return; /* * We ran out of memory, call the OOM killer, and return the userspace @@ -175,16 +182,20 @@ no_context: */ out_of_memory: mmap_read_unlock(mm); - if (!user_mode(regs)) - goto no_context; + if (!user_mode(regs)) { + no_context(regs, addr); + return; + } pagefault_out_of_memory(); return; do_sigbus: mmap_read_unlock(mm); /* Kernel mode? Handle exceptions or die */ - if (!user_mode(regs)) - goto no_context; + if (!user_mode(regs)) { + no_context(regs, addr); + return; + } do_trap(regs, SIGBUS, BUS_ADRERR, addr); return; @@ -213,19 +224,25 @@ vmalloc_fault: pgd = (pgd_t *)pfn_to_virt(csr_read(CSR_SATP)) + index; pgd_k = init_mm.pgd + index; - if (!pgd_present(*pgd_k)) - goto no_context; + if (!pgd_present(*pgd_k)) { + no_context(regs, addr); + return; + } set_pgd(pgd, *pgd_k); p4d = p4d_offset(pgd, addr); p4d_k = p4d_offset(pgd_k, addr); - if (!p4d_present(*p4d_k)) - goto no_context; + if (!p4d_present(*p4d_k)) { + no_context(regs, addr); + return; + } pud = pud_offset(p4d, addr); pud_k = pud_offset(p4d_k, addr); - if (!pud_present(*pud_k)) - goto no_context; + if (!pud_present(*pud_k)) { + no_context(regs, addr); + return; + } /* * Since the vmalloc area is global, it is unnecessary @@ -233,8 +250,10 @@ vmalloc_fault: */ pmd = pmd_offset(pud, addr); pmd_k = pmd_offset(pud_k, addr); - if (!pmd_present(*pmd_k)) - goto no_context; + if (!pmd_present(*pmd_k)) { + no_context(regs, addr); + return; + } set_pmd(pmd, *pmd_k); /* @@ -244,8 +263,10 @@ vmalloc_fault: * silently loop forever. 
*/ pte_k = pte_offset_kernel(pmd_k, addr); - if (!pte_present(*pte_k)) - goto no_context; + if (!pte_present(*pte_k)) { + no_context(regs, addr); + return; + } /* * The kernel assumes that TLBs don't cache invalid From a51271d99cdd04910227060936d0598ba49fb1cc Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 25 Aug 2020 18:48:01 +0300 Subject: [PATCH 03/22] riscv/mm/fault: Move bad area handling to bad_area() This patch moves the bad area handling in do_page_fault() to bad_area() function and converts gotos to calls to the new function. Signed-off-by: Pekka Enberg Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/fault.c | 67 ++++++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 1612552478c5..ac9a99255365 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -37,6 +37,22 @@ static inline void no_context(struct pt_regs *regs, unsigned long addr) do_exit(SIGKILL); } +static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr) +{ + /* + * Something tried to access memory that isn't in our memory map. + * Fix it, but check if it's kernel or user first. + */ + mmap_read_unlock(mm); + /* User mode accesses just cause a SIGSEGV */ + if (user_mode(regs)) { + do_trap(regs, SIGSEGV, code, addr); + return; + } + + no_context(regs, addr); +} + /* * This routine handles page faults. It determines the address and the * problem, and then passes it off to one of the appropriate routines. 
@@ -90,14 +106,20 @@ asmlinkage void do_page_fault(struct pt_regs *regs) retry: mmap_read_lock(mm); vma = find_vma(mm, addr); - if (unlikely(!vma)) - goto bad_area; + if (unlikely(!vma)) { + bad_area(regs, mm, code, addr); + return; + } if (likely(vma->vm_start <= addr)) goto good_area; - if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) - goto bad_area; - if (unlikely(expand_stack(vma, addr))) - goto bad_area; + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { + bad_area(regs, mm, code, addr); + return; + } + if (unlikely(expand_stack(vma, addr))) { + bad_area(regs, mm, code, addr); + return; + } /* * Ok, we have a good vm_area for this memory access, so @@ -108,16 +130,22 @@ good_area: switch (cause) { case EXC_INST_PAGE_FAULT: - if (!(vma->vm_flags & VM_EXEC)) - goto bad_area; + if (!(vma->vm_flags & VM_EXEC)) { + bad_area(regs, mm, code, addr); + return; + } break; case EXC_LOAD_PAGE_FAULT: - if (!(vma->vm_flags & VM_READ)) - goto bad_area; + if (!(vma->vm_flags & VM_READ)) { + bad_area(regs, mm, code, addr); + return; + } break; case EXC_STORE_PAGE_FAULT: - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; + if (!(vma->vm_flags & VM_WRITE)) { + bad_area(regs, mm, code, addr); + return; + } flags |= FAULT_FLAG_WRITE; break; default: @@ -161,21 +189,6 @@ good_area: mmap_read_unlock(mm); return; - /* - * Something tried to access memory that isn't in our memory map. - * Fix it, but check if it's kernel or user first. - */ -bad_area: - mmap_read_unlock(mm); - /* User mode accesses just cause a SIGSEGV */ - if (user_mode(regs)) { - do_trap(regs, SIGSEGV, code, addr); - return; - } - - no_context(regs, addr); - return; - /* * We ran out of memory, call the OOM killer, and return the userspace * (which will retry the fault, or kill us if we got oom-killed). 
From ac416a724f113207407848ef58a0270cd03d94d1 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 25 Aug 2020 18:54:26 +0300 Subject: [PATCH 04/22] riscv/mm/fault: Move vmalloc fault handling to vmalloc_fault() This patch moves the vmalloc fault handling in do_page_fault() to vmalloc_fault() function and converts gotos to calls to the new function. Signed-off-by: Pekka Enberg Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/fault.c | 164 +++++++++++++++++++++--------------------- 1 file changed, 82 insertions(+), 82 deletions(-) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index ac9a99255365..460ea1d6c24e 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -53,6 +53,84 @@ static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code no_context(regs, addr); } +static void inline vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr) +{ + pgd_t *pgd, *pgd_k; + pud_t *pud, *pud_k; + p4d_t *p4d, *p4d_k; + pmd_t *pmd, *pmd_k; + pte_t *pte_k; + int index; + + /* User mode accesses just cause a SIGSEGV */ + if (user_mode(regs)) + return do_trap(regs, SIGSEGV, code, addr); + + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "tsk->active_mm->pgd" here. + * We might be inside an interrupt in the middle + * of a task switch. 
+ */ + index = pgd_index(addr); + pgd = (pgd_t *)pfn_to_virt(csr_read(CSR_SATP)) + index; + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) { + no_context(regs, addr); + return; + } + set_pgd(pgd, *pgd_k); + + p4d = p4d_offset(pgd, addr); + p4d_k = p4d_offset(pgd_k, addr); + if (!p4d_present(*p4d_k)) { + no_context(regs, addr); + return; + } + + pud = pud_offset(p4d, addr); + pud_k = pud_offset(p4d_k, addr); + if (!pud_present(*pud_k)) { + no_context(regs, addr); + return; + } + + /* + * Since the vmalloc area is global, it is unnecessary + * to copy individual PTEs + */ + pmd = pmd_offset(pud, addr); + pmd_k = pmd_offset(pud_k, addr); + if (!pmd_present(*pmd_k)) { + no_context(regs, addr); + return; + } + set_pmd(pmd, *pmd_k); + + /* + * Make sure the actual PTE exists as well to + * catch kernel vmalloc-area accesses to non-mapped + * addresses. If we don't do this, this will just + * silently loop forever. + */ + pte_k = pte_offset_kernel(pmd_k, addr); + if (!pte_present(*pte_k)) { + no_context(regs, addr); + return; + } + + /* + * The kernel assumes that TLBs don't cache invalid + * entries, but in RISC-V, SFENCE.VMA specifies an + * ordering constraint, not a cache flush; it is + * necessary even after writing invalid entries. + */ + local_flush_tlb_page(addr); +} + /* * This routine handles page faults. It determines the address and the * problem, and then passes it off to one of the appropriate routines. @@ -82,8 +160,10 @@ asmlinkage void do_page_fault(struct pt_regs *regs) * only copy the information from the master page table, * nothing more. */ - if (unlikely((addr >= VMALLOC_START) && (addr <= VMALLOC_END))) - goto vmalloc_fault; + if (unlikely((addr >= VMALLOC_START) && (addr <= VMALLOC_END))) { + vmalloc_fault(regs, code, addr); + return; + } /* Enable interrupts if they were enabled in the parent context. 
*/ if (likely(regs->status & SR_PIE)) @@ -211,84 +291,4 @@ do_sigbus: } do_trap(regs, SIGBUS, BUS_ADRERR, addr); return; - -vmalloc_fault: - { - pgd_t *pgd, *pgd_k; - pud_t *pud, *pud_k; - p4d_t *p4d, *p4d_k; - pmd_t *pmd, *pmd_k; - pte_t *pte_k; - int index; - - /* User mode accesses just cause a SIGSEGV */ - if (user_mode(regs)) - return do_trap(regs, SIGSEGV, code, addr); - - /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "tsk->active_mm->pgd" here. - * We might be inside an interrupt in the middle - * of a task switch. - */ - index = pgd_index(addr); - pgd = (pgd_t *)pfn_to_virt(csr_read(CSR_SATP)) + index; - pgd_k = init_mm.pgd + index; - - if (!pgd_present(*pgd_k)) { - no_context(regs, addr); - return; - } - set_pgd(pgd, *pgd_k); - - p4d = p4d_offset(pgd, addr); - p4d_k = p4d_offset(pgd_k, addr); - if (!p4d_present(*p4d_k)) { - no_context(regs, addr); - return; - } - - pud = pud_offset(p4d, addr); - pud_k = pud_offset(p4d_k, addr); - if (!pud_present(*pud_k)) { - no_context(regs, addr); - return; - } - - /* - * Since the vmalloc area is global, it is unnecessary - * to copy individual PTEs - */ - pmd = pmd_offset(pud, addr); - pmd_k = pmd_offset(pud_k, addr); - if (!pmd_present(*pmd_k)) { - no_context(regs, addr); - return; - } - set_pmd(pmd, *pmd_k); - - /* - * Make sure the actual PTE exists as well to - * catch kernel vmalloc-area accesses to non-mapped - * addresses. If we don't do this, this will just - * silently loop forever. - */ - pte_k = pte_offset_kernel(pmd_k, addr); - if (!pte_present(*pte_k)) { - no_context(regs, addr); - return; - } - - /* - * The kernel assumes that TLBs don't cache invalid - * entries, but in RISC-V, SFENCE.VMA specifies an - * ordering constraint, not a cache flush; it is - * necessary even after writing invalid entries. 
- */ - local_flush_tlb_page(addr); - - return; - } } From bda281d5bfb70f895880ebfb94a7f20d0604437f Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 25 Aug 2020 19:01:43 +0300 Subject: [PATCH 05/22] riscv/mm/fault: Simplify fault error handling Move fault error handling after retry logic. This simplifies the code flow and makes it easier to move fault error handling to its own function. Signed-off-by: Pekka Enberg Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/fault.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 460ea1d6c24e..bfb40927cb7a 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -247,14 +247,6 @@ good_area: if (fault_signal_pending(fault, regs)) return; - if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) - goto out_of_memory; - else if (fault & VM_FAULT_SIGBUS) - goto do_sigbus; - BUG(); - } - if (unlikely((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY))) { flags |= FAULT_FLAG_TRIED; @@ -267,6 +259,14 @@ good_area: } mmap_read_unlock(mm); + + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; + BUG(); + } return; /* @@ -274,7 +274,6 @@ good_area: * (which will retry the fault, or kill us if we got oom-killed). */ out_of_memory: - mmap_read_unlock(mm); if (!user_mode(regs)) { no_context(regs, addr); return; @@ -283,7 +282,6 @@ out_of_memory: return; do_sigbus: - mmap_read_unlock(mm); /* Kernel mode? Handle exceptions or die */ if (!user_mode(regs)) { no_context(regs, addr); From 6c11ffbfd849830e8cede5fb0699828e74a7d26b Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 25 Aug 2020 19:04:09 +0300 Subject: [PATCH 06/22] riscv/mm/fault: Move fault error handling to mm_fault_error() This patch moves the fault error handling to mm_fault_error() function and converts gotos to calls to the new function. 
Signed-off-by: Pekka Enberg Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/fault.c | 56 ++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index bfb40927cb7a..49b190d0c088 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -37,6 +37,36 @@ static inline void no_context(struct pt_regs *regs, unsigned long addr) do_exit(SIGKILL); } +static inline void mm_fault_error(struct pt_regs *regs, unsigned long addr, vm_fault_t fault) +{ + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; + BUG(); + + /* + * We ran out of memory, call the OOM killer, and return the userspace + * (which will retry the fault, or kill us if we got oom-killed). + */ +out_of_memory: + if (!user_mode(regs)) { + no_context(regs, addr); + return; + } + pagefault_out_of_memory(); + return; + +do_sigbus: + /* Kernel mode? Handle exceptions or die */ + if (!user_mode(regs)) { + no_context(regs, addr); + return; + } + do_trap(regs, SIGBUS, BUS_ADRERR, addr); + return; +} + static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr) { /* @@ -261,32 +291,8 @@ good_area: mmap_read_unlock(mm); if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) - goto out_of_memory; - else if (fault & VM_FAULT_SIGBUS) - goto do_sigbus; - BUG(); - } - return; - - /* - * We ran out of memory, call the OOM killer, and return the userspace - * (which will retry the fault, or kill us if we got oom-killed). - */ -out_of_memory: - if (!user_mode(regs)) { - no_context(regs, addr); + mm_fault_error(regs, addr, fault); return; } - pagefault_out_of_memory(); - return; - -do_sigbus: - /* Kernel mode? 
Handle exceptions or die */ - if (!user_mode(regs)) { - no_context(regs, addr); - return; - } - do_trap(regs, SIGBUS, BUS_ADRERR, addr); return; } From 7a75f3d47a0b1be6eeb67d14e4003b2b91f8aa59 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 25 Aug 2020 19:05:44 +0300 Subject: [PATCH 07/22] riscv/mm/fault: Simplify mm_fault_error() Simplify the mm_fault_error() handling function by eliminating the unnecessary gotos. Signed-off-by: Pekka Enberg Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/fault.c | 45 +++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 49b190d0c088..3b430fb18de3 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -39,32 +39,27 @@ static inline void no_context(struct pt_regs *regs, unsigned long addr) static inline void mm_fault_error(struct pt_regs *regs, unsigned long addr, vm_fault_t fault) { - if (fault & VM_FAULT_OOM) - goto out_of_memory; - else if (fault & VM_FAULT_SIGBUS) - goto do_sigbus; + if (fault & VM_FAULT_OOM) { + /* + * We ran out of memory, call the OOM killer, and return the userspace + * (which will retry the fault, or kill us if we got oom-killed). + */ + if (!user_mode(regs)) { + no_context(regs, addr); + return; + } + pagefault_out_of_memory(); + return; + } else if (fault & VM_FAULT_SIGBUS) { + /* Kernel mode? Handle exceptions or die */ + if (!user_mode(regs)) { + no_context(regs, addr); + return; + } + do_trap(regs, SIGBUS, BUS_ADRERR, addr); + return; + } BUG(); - - /* - * We ran out of memory, call the OOM killer, and return the userspace - * (which will retry the fault, or kill us if we got oom-killed). - */ -out_of_memory: - if (!user_mode(regs)) { - no_context(regs, addr); - return; - } - pagefault_out_of_memory(); - return; - -do_sigbus: - /* Kernel mode? 
Handle exceptions or die */ - if (!user_mode(regs)) { - no_context(regs, addr); - return; - } - do_trap(regs, SIGBUS, BUS_ADRERR, addr); - return; } static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr) From 6747430197ed414be37e843064a7f365f4d1fd57 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 25 Aug 2020 19:42:54 +0300 Subject: [PATCH 08/22] riscv/mm/fault: Move FAULT_FLAG_WRITE handling in do_page_fault() Let's handle the translation of EXC_STORE_PAGE_FAULT to FAULT_FLAG_WRITE once before looking up the VMA. This makes it easier to extract access error logic in the next patch. Signed-off-by: Pekka Enberg Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/fault.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 3b430fb18de3..bdc70d3d507f 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -208,6 +208,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs) perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); + if (cause == EXC_STORE_PAGE_FAULT) + flags |= FAULT_FLAG_WRITE; + retry: mmap_read_lock(mm); vma = find_vma(mm, addr); @@ -251,7 +254,6 @@ good_area: bad_area(regs, mm, code, addr); return; } - flags |= FAULT_FLAG_WRITE; break; default: panic("%s: unhandled cause %lu", __func__, cause); From afb8c6fee8ce9b54b9c810eea0597ef6a876abb7 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 25 Aug 2020 19:41:17 +0300 Subject: [PATCH 09/22] riscv/mm/fault: Move access error check to function Move the access error check into an access_error() function to simplify the control flow in do_page_fault().
Signed-off-by: Pekka Enberg Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/fault.c | 48 ++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index bdc70d3d507f..a23eaf5ce95c 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -156,6 +156,30 @@ static void inline vmalloc_fault(struct pt_regs *regs, int code, unsigned long a local_flush_tlb_page(addr); } +static inline bool access_error(unsigned long cause, struct vm_area_struct *vma) +{ + switch (cause) { + case EXC_INST_PAGE_FAULT: + if (!(vma->vm_flags & VM_EXEC)) { + return true; + } + break; + case EXC_LOAD_PAGE_FAULT: + if (!(vma->vm_flags & VM_READ)) { + return true; + } + break; + case EXC_STORE_PAGE_FAULT: + if (!(vma->vm_flags & VM_WRITE)) { + return true; + } + break; + default: + panic("%s: unhandled cause %lu", __func__, cause); + } + return false; +} + /* * This routine handles page faults. It determines the address and the * problem, and then passes it off to one of the appropriate routines. @@ -236,27 +260,9 @@ retry: good_area: code = SEGV_ACCERR; - switch (cause) { - case EXC_INST_PAGE_FAULT: - if (!(vma->vm_flags & VM_EXEC)) { - bad_area(regs, mm, code, addr); - return; - } - break; - case EXC_LOAD_PAGE_FAULT: - if (!(vma->vm_flags & VM_READ)) { - bad_area(regs, mm, code, addr); - return; - } - break; - case EXC_STORE_PAGE_FAULT: - if (!(vma->vm_flags & VM_WRITE)) { - bad_area(regs, mm, code, addr); - return; - } - break; - default: - panic("%s: unhandled cause %lu", __func__, cause); + if (unlikely(access_error(cause, vma))) { + bad_area(regs, mm, code, addr); + return; } /* From baf7cbd94b5688f167443a2cc3dcea3300132099 Mon Sep 17 00:00:00 2001 From: Zong Li Date: Mon, 31 Aug 2020 15:33:48 +0800 Subject: [PATCH 10/22] riscv: Set more data to cacheinfo Set cacheinfo.{size,sets,line_size} for each cache node, so that this information can be obtained from userland through the auxiliary vector.
Signed-off-by: Zong Li Reviewed-by: Pekka Enberg Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cacheinfo.c | 66 +++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 15 deletions(-) diff --git a/arch/riscv/kernel/cacheinfo.c b/arch/riscv/kernel/cacheinfo.c index bd0f122965c3..291d7d8f748b 100644 --- a/arch/riscv/kernel/cacheinfo.c +++ b/arch/riscv/kernel/cacheinfo.c @@ -25,12 +25,53 @@ cache_get_priv_group(struct cacheinfo *this_leaf) return NULL; } -static void ci_leaf_init(struct cacheinfo *this_leaf, - struct device_node *node, - enum cache_type type, unsigned int level) +static void ci_leaf_init(struct cacheinfo *this_leaf, enum cache_type type, + unsigned int level, unsigned int size, + unsigned int sets, unsigned int line_size) { this_leaf->level = level; this_leaf->type = type; + this_leaf->size = size; + this_leaf->number_of_sets = sets; + this_leaf->coherency_line_size = line_size; + + /* + * If the cache is fully associative, there is no need to + * check the other properties. + */ + if (sets == 1) + return; + + /* + * Set the ways number for n-ways associative, make sure + * all properties are big than zero. 
+ */ + if (sets > 0 && size > 0 && line_size > 0) + this_leaf->ways_of_associativity = (size / sets) / line_size; +} + +static void fill_cacheinfo(struct cacheinfo **this_leaf, + struct device_node *node, unsigned int level) +{ + unsigned int size, sets, line_size; + + if (!of_property_read_u32(node, "cache-size", &size) && + !of_property_read_u32(node, "cache-block-size", &line_size) && + !of_property_read_u32(node, "cache-sets", &sets)) { + ci_leaf_init((*this_leaf)++, CACHE_TYPE_UNIFIED, level, size, sets, line_size); + } + + if (!of_property_read_u32(node, "i-cache-size", &size) && + !of_property_read_u32(node, "i-cache-sets", &sets) && + !of_property_read_u32(node, "i-cache-block-size", &line_size)) { + ci_leaf_init((*this_leaf)++, CACHE_TYPE_INST, level, size, sets, line_size); + } + + if (!of_property_read_u32(node, "d-cache-size", &size) && + !of_property_read_u32(node, "d-cache-sets", &sets) && + !of_property_read_u32(node, "d-cache-block-size", &line_size)) { + ci_leaf_init((*this_leaf)++, CACHE_TYPE_DATA, level, size, sets, line_size); + } } static int __init_cache_level(unsigned int cpu) @@ -83,29 +124,24 @@ static int __populate_cache_leaves(unsigned int cpu) struct device_node *prev = NULL; int levels = 1, level = 1; - if (of_property_read_bool(np, "cache-size")) - ci_leaf_init(this_leaf++, np, CACHE_TYPE_UNIFIED, level); - if (of_property_read_bool(np, "i-cache-size")) - ci_leaf_init(this_leaf++, np, CACHE_TYPE_INST, level); - if (of_property_read_bool(np, "d-cache-size")) - ci_leaf_init(this_leaf++, np, CACHE_TYPE_DATA, level); + /* Level 1 caches in cpu node */ + fill_cacheinfo(&this_leaf, np, level); + /* Next level caches in cache nodes */ prev = np; while ((np = of_find_next_cache_node(np))) { of_node_put(prev); prev = np; + if (!of_device_is_compatible(np, "cache")) break; if (of_property_read_u32(np, "cache-level", &level)) break; if (level <= levels) break; - if (of_property_read_bool(np, "cache-size")) - ci_leaf_init(this_leaf++, np, 
CACHE_TYPE_UNIFIED, level); - if (of_property_read_bool(np, "i-cache-size")) - ci_leaf_init(this_leaf++, np, CACHE_TYPE_INST, level); - if (of_property_read_bool(np, "d-cache-size")) - ci_leaf_init(this_leaf++, np, CACHE_TYPE_DATA, level); + + fill_cacheinfo(&this_leaf, np, level); + levels = level; } of_node_put(np); From b5fca7c55f9fbab5ad732c3bce00f31af6ba5cfa Mon Sep 17 00:00:00 2001 From: Zong Li Date: Mon, 31 Aug 2020 15:33:49 +0800 Subject: [PATCH 11/22] riscv: Define AT_VECTOR_SIZE_ARCH for ARCH_DLINFO AT_VECTOR_SIZE_ARCH should be defined with the maximum number of NEW_AUX_ENT entries that ARCH_DLINFO can contain, but it wasn't defined for RISC-V at all even though ARCH_DLINFO will contain one NEW_AUX_ENT for the VDSO address. Signed-off-by: Zong Li Reviewed-by: Palmer Dabbelt Reviewed-by: Pekka Enberg Signed-off-by: Palmer Dabbelt --- arch/riscv/include/uapi/asm/auxvec.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/riscv/include/uapi/asm/auxvec.h b/arch/riscv/include/uapi/asm/auxvec.h index d86cb17bbabe..22e0ae888406 100644 --- a/arch/riscv/include/uapi/asm/auxvec.h +++ b/arch/riscv/include/uapi/asm/auxvec.h @@ -10,4 +10,7 @@ /* vDSO location */ #define AT_SYSINFO_EHDR 33 +/* entries in ARCH_DLINFO */ +#define AT_VECTOR_SIZE_ARCH 1 + #endif /* _UAPI_ASM_RISCV_AUXVEC_H */ From 38f5bd23deae24c8fa67a2c574b6d43df27a8aa8 Mon Sep 17 00:00:00 2001 From: Zong Li Date: Mon, 31 Aug 2020 15:33:50 +0800 Subject: [PATCH 12/22] riscv: Add cache information in AUX vector There are no standard CSR registers to provide cache information, the way for RISC-V is to get this information from DT. Currently, AT_L1I_X, AT_L1D_X and AT_L2_X are present in glibc header, and sysconf syscall could use them to get information of cache through AUX vector. 
The result of 'getconf -a' as follows: LEVEL1_ICACHE_SIZE 32768 LEVEL1_ICACHE_ASSOC 8 LEVEL1_ICACHE_LINESIZE 64 LEVEL1_DCACHE_SIZE 32768 LEVEL1_DCACHE_ASSOC 8 LEVEL1_DCACHE_LINESIZE 64 LEVEL2_CACHE_SIZE 2097152 LEVEL2_CACHE_ASSOC 32 LEVEL2_CACHE_LINESIZE 64 Signed-off-by: Zong Li Reviewed-by: Palmer Dabbelt Reviewed-by: Pekka Enberg Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cacheinfo.h | 5 +++++ arch/riscv/include/asm/elf.h | 13 +++++++++++ arch/riscv/include/uapi/asm/auxvec.h | 23 +++++++++++++++++++- arch/riscv/kernel/cacheinfo.c | 32 +++++++++++++++++++++++++++- 4 files changed, 71 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/cacheinfo.h b/arch/riscv/include/asm/cacheinfo.h index 5d9662e9aba8..d1a365215ec0 100644 --- a/arch/riscv/include/asm/cacheinfo.h +++ b/arch/riscv/include/asm/cacheinfo.h @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 SiFive + */ #ifndef _ASM_RISCV_CACHEINFO_H #define _ASM_RISCV_CACHEINFO_H @@ -11,5 +14,7 @@ struct riscv_cacheinfo_ops { }; void riscv_set_cacheinfo_ops(struct riscv_cacheinfo_ops *ops); +uintptr_t get_cache_size(u32 level, enum cache_type type); +uintptr_t get_cache_geometry(u32 level, enum cache_type type); #endif /* _ASM_RISCV_CACHEINFO_H */ diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h index d83a4efd052b..5c725e1df58b 100644 --- a/arch/riscv/include/asm/elf.h +++ b/arch/riscv/include/asm/elf.h @@ -11,6 +11,7 @@ #include #include #include +#include /* * These are used to set parameters in the core dumps. 
@@ -61,6 +62,18 @@ extern unsigned long elf_hwcap; do { \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (elf_addr_t)current->mm->context.vdso); \ + NEW_AUX_ENT(AT_L1I_CACHESIZE, \ + get_cache_size(1, CACHE_TYPE_INST)); \ + NEW_AUX_ENT(AT_L1I_CACHEGEOMETRY, \ + get_cache_geometry(1, CACHE_TYPE_INST)); \ + NEW_AUX_ENT(AT_L1D_CACHESIZE, \ + get_cache_size(1, CACHE_TYPE_DATA)); \ + NEW_AUX_ENT(AT_L1D_CACHEGEOMETRY, \ + get_cache_geometry(1, CACHE_TYPE_DATA)); \ + NEW_AUX_ENT(AT_L2_CACHESIZE, \ + get_cache_size(2, CACHE_TYPE_UNIFIED)); \ + NEW_AUX_ENT(AT_L2_CACHEGEOMETRY, \ + get_cache_geometry(2, CACHE_TYPE_UNIFIED)); \ } while (0) #define ARCH_HAS_SETUP_ADDITIONAL_PAGES struct linux_binprm; diff --git a/arch/riscv/include/uapi/asm/auxvec.h b/arch/riscv/include/uapi/asm/auxvec.h index 22e0ae888406..32c73ba1d531 100644 --- a/arch/riscv/include/uapi/asm/auxvec.h +++ b/arch/riscv/include/uapi/asm/auxvec.h @@ -10,7 +10,28 @@ /* vDSO location */ #define AT_SYSINFO_EHDR 33 +/* + * The set of entries below represent more extensive information + * about the caches, in the form of two entry per cache type, + * one entry containing the cache size in bytes, and the other + * containing the cache line size in bytes in the bottom 16 bits + * and the cache associativity in the next 16 bits. + * + * The associativity is such that if N is the 16-bit value, the + * cache is N way set associative. A value if 0xffff means fully + * associative, a value of 1 means directly mapped. + * + * For all these fields, a value of 0 means that the information + * is not known. 
+ */ +#define AT_L1I_CACHESIZE 40 +#define AT_L1I_CACHEGEOMETRY 41 +#define AT_L1D_CACHESIZE 42 +#define AT_L1D_CACHEGEOMETRY 43 +#define AT_L2_CACHESIZE 44 +#define AT_L2_CACHEGEOMETRY 45 + /* entries in ARCH_DLINFO */ -#define AT_VECTOR_SIZE_ARCH 1 +#define AT_VECTOR_SIZE_ARCH 7 #endif /* _UAPI_ASM_RISCV_AUXVEC_H */ diff --git a/arch/riscv/kernel/cacheinfo.c b/arch/riscv/kernel/cacheinfo.c index 291d7d8f748b..de59dd457b41 100644 --- a/arch/riscv/kernel/cacheinfo.c +++ b/arch/riscv/kernel/cacheinfo.c @@ -3,7 +3,6 @@ * Copyright (C) 2017 SiFive */ -#include #include #include #include @@ -25,6 +24,37 @@ cache_get_priv_group(struct cacheinfo *this_leaf) return NULL; } +static struct cacheinfo *get_cacheinfo(u32 level, enum cache_type type) +{ + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(smp_processor_id()); + struct cacheinfo *this_leaf; + int index; + + for (index = 0; index < this_cpu_ci->num_leaves; index++) { + this_leaf = this_cpu_ci->info_list + index; + if (this_leaf->level == level && this_leaf->type == type) + return this_leaf; + } + + return NULL; +} + +uintptr_t get_cache_size(u32 level, enum cache_type type) +{ + struct cacheinfo *this_leaf = get_cacheinfo(level, type); + + return this_leaf ? this_leaf->size : 0; +} + +uintptr_t get_cache_geometry(u32 level, enum cache_type type) +{ + struct cacheinfo *this_leaf = get_cacheinfo(level, type); + + return this_leaf ? 
(this_leaf->ways_of_associativity << 16 | + this_leaf->coherency_line_size) : + 0; +} + static void ci_leaf_init(struct cacheinfo *this_leaf, enum cache_type type, unsigned int level, unsigned int size, unsigned int sets, unsigned int line_size) From 2baa6d9506f24d52f53317f60ccbcdbb2c4f4c40 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sat, 5 Sep 2020 08:52:52 +0300 Subject: [PATCH 13/22] riscv/mm/fault: Fix inline placement in vmalloc_fault() declaration The "inline" keyword is in the wrong place in vmalloc_fault() declaration: >> arch/riscv/mm/fault.c:56:1: warning: 'inline' is not at beginning of declaration [-Wold-style-declaration] 56 | static void inline vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr) | ^~~~~~ Fix that up. Reported-by: kernel test robot Signed-off-by: Pekka Enberg Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index a23eaf5ce95c..a173432ccf82 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -78,7 +78,7 @@ static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code no_context(regs, addr); } -static void inline vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr) +static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr) { pgd_t *pgd, *pgd_k; pud_t *pud, *pud_k; From a960c1323749383e62b67f5d49cdfbdcccde0ef6 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sat, 5 Sep 2020 09:07:26 +0300 Subject: [PATCH 14/22] riscv/mm/fault: Set FAULT_FLAG_INSTRUCTION flag in do_page_fault() If the page fault "cause" is EXC_INST_PAGE_FAULT, set the FAULT_FLAG_INSTRUCTION flag to let handle_mm_fault() and friends know about it. This has no functional changes because RISC-V uses the default arch_vma_access_permitted() implementation, which always returns true. 
However, dax_pmd_fault(), for example, has a tracepoint that uses FAULT_FLAG_INSTRUCTION, so we might as well set it. Signed-off-by: Pekka Enberg Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/fault.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index a173432ccf82..1359e21c0c62 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -234,7 +234,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs) if (cause == EXC_STORE_PAGE_FAULT) flags |= FAULT_FLAG_WRITE; - + else if (cause == EXC_INST_PAGE_FAULT) + flags |= FAULT_FLAG_INSTRUCTION; retry: mmap_read_lock(mm); vma = find_vma(mm, addr); From 54701a0d12e2e2c9f0814572b42bdd3067ffcf15 Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Mon, 14 Sep 2020 08:52:02 +0800 Subject: [PATCH 15/22] RISC-V: Fix duplicate included thread_info.h asm/thread_info.h is included more than once, Remove the one that isn't necessary. Signed-off-by: Tian Tao Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/head.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 0a4e81b8dc79..9ade707b8ed8 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -3,7 +3,6 @@ * Copyright (C) 2012 Regents of the University of California */ -#include #include #include #include From 8f3a2b4a96dc014e99e1df327db1450fdbbd5e15 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 17 Sep 2020 15:37:10 -0700 Subject: [PATCH 16/22] RISC-V: Move DT mapping outof fixmap Currently, RISC-V reserves 1MB of fixmap memory for device tree. However, it maps only single PMD (2MB) space for fixmap which leaves only < 1MB space left for other kernel features such as early ioremap which requires fixmap as well. The fixmap size can be increased by another 2MB but it brings additional complexity and changes the virtual memory layout as well. If we require some additional feature requiring fixmap again, it has to be moved again. 
Technically, DT doesn't need a fixmap as the memory occupied by the DT is only used during boot. That's why we map the device tree in the early page table using two consecutive PGD mappings at lower addresses (< PAGE_OFFSET). This frees a lot of space in fixmap and also makes the maximum supported device tree size PGDIR_SIZE. Thus, init memory section can be used for the same purpose as well. This simplifies fixmap implementation. Signed-off-by: Anup Patel Signed-off-by: Atish Patra Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/fixmap.h | 3 --- arch/riscv/include/asm/pgtable.h | 1 + arch/riscv/kernel/head.S | 1 - arch/riscv/kernel/head.h | 2 -- arch/riscv/kernel/setup.c | 9 +++++++-- arch/riscv/mm/init.c | 26 ++++++++++++-------------- 6 files changed, 20 insertions(+), 22 deletions(-) diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h index 1ff075a8dfc7..11613f38228a 100644 --- a/arch/riscv/include/asm/fixmap.h +++ b/arch/riscv/include/asm/fixmap.h @@ -22,9 +22,6 @@ */ enum fixed_addresses { FIX_HOLE, -#define FIX_FDT_SIZE SZ_1M - FIX_FDT_END, - FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1, FIX_PTE, FIX_PMD, FIX_TEXT_POKE1, diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index eaea1f717010..815f8c959dd4 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -464,6 +464,7 @@ static inline void __kernel_map_pages(struct page *page, int numpages, int enabl #define kern_addr_valid(addr) (1) /* FIXME */ extern void *dtb_early_va; +extern uintptr_t dtb_early_pa; void setup_bootmem(void); void paging_init(void); diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 9ade707b8ed8..703e59d717f0 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -258,7 +258,6 @@ clear_bss_done: #endif /* Start the kernel */ call soc_early_init - call parse_dtb tail start_kernel .Lsecondary_start: diff --git
a/arch/riscv/kernel/head.h b/arch/riscv/kernel/head.h index 105fb0496b24..b48dda3d04f6 100644 --- a/arch/riscv/kernel/head.h +++ b/arch/riscv/kernel/head.h @@ -16,6 +16,4 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa); extern void *__cpu_up_stack_pointer[]; extern void *__cpu_up_task_pointer[]; -void __init parse_dtb(void); - #endif /* __ASM_HEAD_H */ diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 2c6dd329312b..edea7ef88402 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -48,8 +48,9 @@ atomic_t hart_lottery __section(.sdata); unsigned long boot_cpu_hartid; static DEFINE_PER_CPU(struct cpu, cpu_devices); -void __init parse_dtb(void) +static void __init parse_dtb(void) { + /* Early scan of device tree from init memory */ if (early_init_dt_scan(dtb_early_va)) return; @@ -62,6 +63,7 @@ void __init parse_dtb(void) void __init setup_arch(char **cmdline_p) { + parse_dtb(); init_mm.start_code = (unsigned long) _stext; init_mm.end_code = (unsigned long) _etext; init_mm.end_data = (unsigned long) _edata; @@ -76,7 +78,10 @@ void __init setup_arch(char **cmdline_p) #if IS_ENABLED(CONFIG_BUILTIN_DTB) unflatten_and_copy_device_tree(); #else - unflatten_device_tree(); + if (early_init_dt_verify(__va(dtb_early_pa))) + unflatten_device_tree(); + else + pr_err("No DTB found in kernel mappings\n"); #endif #ifdef CONFIG_SWIOTLB diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 787c75f751a5..2b651f63f5c4 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -28,7 +28,9 @@ unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] EXPORT_SYMBOL(empty_zero_page); extern char _start[]; -void *dtb_early_va; +#define DTB_EARLY_BASE_VA PGDIR_SIZE +void *dtb_early_va __initdata; +uintptr_t dtb_early_pa __initdata; static void __init zone_sizes_init(void) { @@ -141,8 +143,6 @@ disable: } #endif /* CONFIG_BLK_DEV_INITRD */ -static phys_addr_t dtb_early_pa __initdata; - void __init setup_bootmem(void) { struct 
memblock_region *reg; @@ -399,7 +399,7 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) asmlinkage void __init setup_vm(uintptr_t dtb_pa) { - uintptr_t va, end_va; + uintptr_t va, pa, end_va; uintptr_t load_pa = (uintptr_t)(&_start); uintptr_t load_sz = (uintptr_t)(&_end) - load_pa; uintptr_t map_size = best_map_size(load_pa, MAX_EARLY_MAPPING_SIZE); @@ -448,16 +448,13 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) load_pa + (va - PAGE_OFFSET), map_size, PAGE_KERNEL_EXEC); - /* Create fixed mapping for early FDT parsing */ - end_va = __fix_to_virt(FIX_FDT) + FIX_FDT_SIZE; - for (va = __fix_to_virt(FIX_FDT); va < end_va; va += PAGE_SIZE) - create_pte_mapping(fixmap_pte, va, - dtb_pa + (va - __fix_to_virt(FIX_FDT)), - PAGE_SIZE, PAGE_KERNEL); - - /* Save pointer to DTB for early FDT parsing */ - dtb_early_va = (void *)fix_to_virt(FIX_FDT) + (dtb_pa & ~PAGE_MASK); - /* Save physical address for memblock reservation */ + /* Create two consecutive PGD mappings for FDT early scan */ + pa = dtb_pa & ~(PGDIR_SIZE - 1); + create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, + pa, PGDIR_SIZE, PAGE_KERNEL); + create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA + PGDIR_SIZE, + pa + PGDIR_SIZE, PGDIR_SIZE, PAGE_KERNEL); + dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PGDIR_SIZE - 1)); dtb_early_pa = dtb_pa; } @@ -516,6 +513,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) #else dtb_early_va = (void *)dtb_pa; #endif + dtb_early_pa = dtb_pa; } static inline void setup_vm_final(void) From 6262f661ff5d7d6a2613b95d0b7820c60b46b0b5 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 17 Sep 2020 15:37:11 -0700 Subject: [PATCH 17/22] RISC-V: Add early ioremap support UEFI uses early IO or memory mappings for runtime services before normal ioremap() is usable. Add the necessary fixmap bindings and pmd mappings for generic ioremap support to work. 
Signed-off-by: Atish Patra Reviewed-by: Anup Patel Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/Kbuild | 1 + arch/riscv/include/asm/fixmap.h | 13 +++++++++++++ arch/riscv/include/asm/io.h | 1 + arch/riscv/kernel/setup.c | 2 ++ arch/riscv/mm/init.c | 33 +++++++++++++++++++++++++++++++++ 6 files changed, 51 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index df18372861d8..5e4ace64acbc 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -37,6 +37,7 @@ config RISCV select GENERIC_ARCH_TOPOLOGY if SMP select GENERIC_ATOMIC64 if !64BIT select GENERIC_CLOCKEVENTS + select GENERIC_EARLY_IOREMAP select GENERIC_GETTIMEOFDAY if HAVE_GENERIC_VDSO select GENERIC_IOREMAP select GENERIC_IRQ_MULTI_HANDLER diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild index 3d9410bb4de0..59dd7be55005 100644 --- a/arch/riscv/include/asm/Kbuild +++ b/arch/riscv/include/asm/Kbuild @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 +generic-y += early_ioremap.h generic-y += extable.h generic-y += flat.h generic-y += kvm_para.h diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h index 11613f38228a..54cbf07fb4e9 100644 --- a/arch/riscv/include/asm/fixmap.h +++ b/arch/riscv/include/asm/fixmap.h @@ -27,6 +27,19 @@ enum fixed_addresses { FIX_TEXT_POKE1, FIX_TEXT_POKE0, FIX_EARLYCON_MEM_BASE, + + __end_of_permanent_fixed_addresses, + /* + * Temporary boot-time mappings, used by early_ioremap(), + * before ioremap() is functional. 
+ */ +#define NR_FIX_BTMAPS (SZ_256K / PAGE_SIZE) +#define FIX_BTMAPS_SLOTS 7 +#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS) + + FIX_BTMAP_END = __end_of_permanent_fixed_addresses, + FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1, + __end_of_fixed_addresses }; diff --git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h index 3835c3295dc5..c025a746a148 100644 --- a/arch/riscv/include/asm/io.h +++ b/arch/riscv/include/asm/io.h @@ -14,6 +14,7 @@ #include #include #include +#include /* * MMIO access functions are separated out to break dependency cycles diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index edea7ef88402..41ef96d0d97a 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -71,6 +72,7 @@ void __init setup_arch(char **cmdline_p) *cmdline_p = boot_command_line; + early_ioremap_setup(); parse_early_param(); setup_bootmem(); diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 2b651f63f5c4..b75ebe8e7a92 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -403,6 +403,9 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) uintptr_t load_pa = (uintptr_t)(&_start); uintptr_t load_sz = (uintptr_t)(&_end) - load_pa; uintptr_t map_size = best_map_size(load_pa, MAX_EARLY_MAPPING_SIZE); +#ifndef __PAGETABLE_PMD_FOLDED + pmd_t fix_bmap_spmd, fix_bmap_epmd; +#endif va_pa_offset = PAGE_OFFSET - load_pa; pfn_base = PFN_DOWN(load_pa); @@ -456,6 +459,36 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) pa + PGDIR_SIZE, PGDIR_SIZE, PAGE_KERNEL); dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PGDIR_SIZE - 1)); dtb_early_pa = dtb_pa; + + /* + * Bootime fixmap only can handle PMD_SIZE mapping. Thus, boot-ioremap + * range can not span multiple pmds. 
+ */ + BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) + != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); + +#ifndef __PAGETABLE_PMD_FOLDED + /* + * Early ioremap fixmap is already created as it lies within first 2MB + * of fixmap region. We always map PMD_SIZE. Thus, both FIX_BTMAP_END + * FIX_BTMAP_BEGIN should lie in the same pmd. Verify that and warn + * the user if not. + */ + fix_bmap_spmd = fixmap_pmd[pmd_index(__fix_to_virt(FIX_BTMAP_BEGIN))]; + fix_bmap_epmd = fixmap_pmd[pmd_index(__fix_to_virt(FIX_BTMAP_END))]; + if (pmd_val(fix_bmap_spmd) != pmd_val(fix_bmap_epmd)) { + WARN_ON(1); + pr_warn("fixmap btmap start [%08lx] != end [%08lx]\n", + pmd_val(fix_bmap_spmd), pmd_val(fix_bmap_epmd)); + pr_warn("fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n", + fix_to_virt(FIX_BTMAP_BEGIN)); + pr_warn("fix_to_virt(FIX_BTMAP_END): %08lx\n", + fix_to_virt(FIX_BTMAP_END)); + + pr_warn("FIX_BTMAP_END: %d\n", FIX_BTMAP_END); + pr_warn("FIX_BTMAP_BEGIN: %d\n", FIX_BTMAP_BEGIN); + } +#endif } static void __init setup_vm_final(void) From e8dcb61f2ade040a372d66907d220dd3fdee2505 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 17 Sep 2020 15:37:12 -0700 Subject: [PATCH 18/22] RISC-V: Implement late mapping page table allocation functions Currently, page table setup is done during setup_vm_final where fixmap can be used to create the temporary mappings. The physical frame is allocated from memblock_alloc_* functions. However, this won't work if page table mapping needs to be created for a different mm context (i.e. efi mm) at a later point of time. Use generic kernel page allocation function & macros for any mapping after setup_vm_final.
Signed-off-by: Atish Patra Reviewed-by: Anup Patel Acked-by: Mike Rapoport Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 130 ++++++++++++++++++++++++++++++++----------- 1 file changed, 99 insertions(+), 31 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index b75ebe8e7a92..63acc8185bfa 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -32,6 +32,15 @@ extern char _start[]; void *dtb_early_va __initdata; uintptr_t dtb_early_pa __initdata; +struct pt_alloc_ops { + pte_t *(*get_pte_virt)(phys_addr_t pa); + phys_addr_t (*alloc_pte)(uintptr_t va); +#ifndef __PAGETABLE_PMD_FOLDED + pmd_t *(*get_pmd_virt)(phys_addr_t pa); + phys_addr_t (*alloc_pmd)(uintptr_t va); +#endif +}; + static void __init zone_sizes_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0, }; @@ -203,6 +212,8 @@ void __init setup_bootmem(void) } #ifdef CONFIG_MMU +static struct pt_alloc_ops pt_ops; + unsigned long va_pa_offset; EXPORT_SYMBOL(va_pa_offset); unsigned long pfn_base; @@ -211,7 +222,6 @@ EXPORT_SYMBOL(pfn_base); pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss; pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss; -static bool mmu_enabled; #define MAX_EARLY_MAPPING_SIZE SZ_128M @@ -234,27 +244,46 @@ void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot) } } -static pte_t *__init get_pte_virt(phys_addr_t pa) +static inline pte_t *__init get_pte_virt_early(phys_addr_t pa) { - if (mmu_enabled) { - clear_fixmap(FIX_PTE); - return (pte_t *)set_fixmap_offset(FIX_PTE, pa); - } else { - return (pte_t *)((uintptr_t)pa); - } + return (pte_t *)((uintptr_t)pa); } -static phys_addr_t __init alloc_pte(uintptr_t va) +static inline pte_t *__init get_pte_virt_fixmap(phys_addr_t pa) +{ + clear_fixmap(FIX_PTE); + return (pte_t *)set_fixmap_offset(FIX_PTE, pa); +} + +static inline pte_t *get_pte_virt_late(phys_addr_t pa) +{ + return (pte_t *) __va(pa); +} + +static inline 
phys_addr_t __init alloc_pte_early(uintptr_t va) { /* * We only create PMD or PGD early mappings so we * should never reach here with MMU disabled. */ - BUG_ON(!mmu_enabled); + BUG(); +} +static inline phys_addr_t __init alloc_pte_fixmap(uintptr_t va) +{ return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); } +static phys_addr_t alloc_pte_late(uintptr_t va) +{ + unsigned long vaddr; + + vaddr = __get_free_page(GFP_KERNEL); + if (!vaddr || !pgtable_pte_page_ctor(virt_to_page(vaddr))) + BUG(); + return __pa(vaddr); +} + static void __init create_pte_mapping(pte_t *ptep, uintptr_t va, phys_addr_t pa, phys_addr_t sz, pgprot_t prot) @@ -279,28 +308,46 @@ pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss; #endif pmd_t early_pmd[PTRS_PER_PMD * NUM_EARLY_PMDS] __initdata __aligned(PAGE_SIZE); -static pmd_t *__init get_pmd_virt(phys_addr_t pa) +static pmd_t *__init get_pmd_virt_early(phys_addr_t pa) { - if (mmu_enabled) { - clear_fixmap(FIX_PMD); - return (pmd_t *)set_fixmap_offset(FIX_PMD, pa); - } else { - return (pmd_t *)((uintptr_t)pa); - } + /* Before MMU is enabled */ + return (pmd_t *)((uintptr_t)pa); } -static phys_addr_t __init alloc_pmd(uintptr_t va) +static pmd_t *__init get_pmd_virt_fixmap(phys_addr_t pa) +{ + clear_fixmap(FIX_PMD); + return (pmd_t *)set_fixmap_offset(FIX_PMD, pa); +} + +static pmd_t *get_pmd_virt_late(phys_addr_t pa) +{ + return (pmd_t *) __va(pa); +} + +static phys_addr_t __init alloc_pmd_early(uintptr_t va) { uintptr_t pmd_num; - if (mmu_enabled) - return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); - pmd_num = (va - PAGE_OFFSET) >> PGDIR_SHIFT; BUG_ON(pmd_num >= NUM_EARLY_PMDS); return (uintptr_t)&early_pmd[pmd_num * PTRS_PER_PMD]; } +static phys_addr_t __init alloc_pmd_fixmap(uintptr_t va) +{ + return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); +} + +static phys_addr_t alloc_pmd_late(uintptr_t va) +{ + unsigned long vaddr; + + vaddr = __get_free_page(GFP_KERNEL); + BUG_ON(!vaddr); + return __pa(vaddr); +} + static void __init 
create_pmd_mapping(pmd_t *pmdp, uintptr_t va, phys_addr_t pa, phys_addr_t sz, pgprot_t prot) @@ -316,28 +363,28 @@ static void __init create_pmd_mapping(pmd_t *pmdp, } if (pmd_none(pmdp[pmd_idx])) { - pte_phys = alloc_pte(va); + pte_phys = pt_ops.alloc_pte(va); pmdp[pmd_idx] = pfn_pmd(PFN_DOWN(pte_phys), PAGE_TABLE); - ptep = get_pte_virt(pte_phys); + ptep = pt_ops.get_pte_virt(pte_phys); memset(ptep, 0, PAGE_SIZE); } else { pte_phys = PFN_PHYS(_pmd_pfn(pmdp[pmd_idx])); - ptep = get_pte_virt(pte_phys); + ptep = pt_ops.get_pte_virt(pte_phys); } create_pte_mapping(ptep, va, pa, sz, prot); } #define pgd_next_t pmd_t -#define alloc_pgd_next(__va) alloc_pmd(__va) -#define get_pgd_next_virt(__pa) get_pmd_virt(__pa) +#define alloc_pgd_next(__va) pt_ops.alloc_pmd(__va) +#define get_pgd_next_virt(__pa) pt_ops.get_pmd_virt(__pa) #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ create_pmd_mapping(__nextp, __va, __pa, __sz, __prot) #define fixmap_pgd_next fixmap_pmd #else #define pgd_next_t pte_t -#define alloc_pgd_next(__va) alloc_pte(__va) -#define get_pgd_next_virt(__pa) get_pte_virt(__pa) +#define alloc_pgd_next(__va) pt_ops.alloc_pte(__va) +#define get_pgd_next_virt(__pa) pt_ops.get_pte_virt(__pa) #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ create_pte_mapping(__nextp, __va, __pa, __sz, __prot) #define fixmap_pgd_next fixmap_pte @@ -421,6 +468,12 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) BUG_ON((load_pa % map_size) != 0); BUG_ON(load_sz > MAX_EARLY_MAPPING_SIZE); + pt_ops.alloc_pte = alloc_pte_early; + pt_ops.get_pte_virt = get_pte_virt_early; +#ifndef __PAGETABLE_PMD_FOLDED + pt_ops.alloc_pmd = alloc_pmd_early; + pt_ops.get_pmd_virt = get_pmd_virt_early; +#endif /* Setup early PGD for fixmap */ create_pgd_mapping(early_pg_dir, FIXADDR_START, (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE); @@ -497,9 +550,16 @@ static void __init setup_vm_final(void) phys_addr_t pa, start, end; struct memblock_region *reg; - /* 
Set mmu_enabled flag */ - mmu_enabled = true; - + /** + * MMU is enabled at this point. But page table setup is not complete yet. + * fixmap page table alloc functions should be used at this point + */ + pt_ops.alloc_pte = alloc_pte_fixmap; + pt_ops.get_pte_virt = get_pte_virt_fixmap; +#ifndef __PAGETABLE_PMD_FOLDED + pt_ops.alloc_pmd = alloc_pmd_fixmap; + pt_ops.get_pmd_virt = get_pmd_virt_fixmap; +#endif /* Setup swapper PGD for fixmap */ create_pgd_mapping(swapper_pg_dir, FIXADDR_START, __pa_symbol(fixmap_pgd_next), @@ -533,6 +593,14 @@ static void __init setup_vm_final(void) /* Move to swapper page table */ csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE); local_flush_tlb_all(); + + /* generic page allocation functions must be used to setup page table */ + pt_ops.alloc_pte = alloc_pte_late; + pt_ops.get_pte_virt = get_pte_virt_late; +#ifndef __PAGETABLE_PMD_FOLDED + pt_ops.alloc_pmd = alloc_pmd_late; + pt_ops.get_pmd_virt = get_pmd_virt_late; +#endif } #else asmlinkage void __init setup_vm(uintptr_t dtb_pa) From cb7d2dd5612a77a2597c00fce770a52c921e2ea5 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 17 Sep 2020 15:37:13 -0700 Subject: [PATCH 19/22] RISC-V: Add PE/COFF header for EFI stub Linux kernel Image can appear as an EFI application with appropriate PE/COFF header fields in the beginning of the Image header. An EFI application loader can directly load a Linux kernel Image and an EFI stub residing in the kernel can boot the Linux kernel directly. Add the necessary PE/COFF header.
Signed-off-by: Atish Patra Link: https://lore.kernel.org/r/20200421033336.9663-3-atish.patra@wdc.com [ardb: - use C prefix for c.li to ensure the expected opcode is emitted - align all image sections according to PE/COFF section alignment ] Signed-off-by: Ard Biesheuvel Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/sections.h | 13 ++++ arch/riscv/kernel/efi-header.S | 111 ++++++++++++++++++++++++++++++ arch/riscv/kernel/head.S | 16 +++++ arch/riscv/kernel/image-vars.h | 51 ++++++++++++++ arch/riscv/kernel/vmlinux.lds.S | 23 ++++++- 5 files changed, 212 insertions(+), 2 deletions(-) create mode 100644 arch/riscv/include/asm/sections.h create mode 100644 arch/riscv/kernel/efi-header.S create mode 100644 arch/riscv/kernel/image-vars.h diff --git a/arch/riscv/include/asm/sections.h b/arch/riscv/include/asm/sections.h new file mode 100644 index 000000000000..3a9971b1210f --- /dev/null +++ b/arch/riscv/include/asm/sections.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ +#ifndef __ASM_SECTIONS_H +#define __ASM_SECTIONS_H + +#include + +extern char _start[]; +extern char _start_kernel[]; + +#endif /* __ASM_SECTIONS_H */ diff --git a/arch/riscv/kernel/efi-header.S b/arch/riscv/kernel/efi-header.S new file mode 100644 index 000000000000..8e733aa48ba6 --- /dev/null +++ b/arch/riscv/kernel/efi-header.S @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 Western Digital Corporation or its affiliates. 
+ * Adapted from arch/arm64/kernel/efi-header.S + */ + +#include +#include + + .macro __EFI_PE_HEADER + .long PE_MAGIC +coff_header: +#ifdef CONFIG_64BIT + .short IMAGE_FILE_MACHINE_RISCV64 // Machine +#else + .short IMAGE_FILE_MACHINE_RISCV32 // Machine +#endif + .short section_count // NumberOfSections + .long 0 // TimeDateStamp + .long 0 // PointerToSymbolTable + .long 0 // NumberOfSymbols + .short section_table - optional_header // SizeOfOptionalHeader + .short IMAGE_FILE_DEBUG_STRIPPED | \ + IMAGE_FILE_EXECUTABLE_IMAGE | \ + IMAGE_FILE_LINE_NUMS_STRIPPED // Characteristics + +optional_header: +#ifdef CONFIG_64BIT + .short PE_OPT_MAGIC_PE32PLUS // PE32+ format +#else + .short PE_OPT_MAGIC_PE32 // PE32 format +#endif + .byte 0x02 // MajorLinkerVersion + .byte 0x14 // MinorLinkerVersion + .long __pecoff_text_end - efi_header_end // SizeOfCode + .long __pecoff_data_virt_size // SizeOfInitializedData + .long 0 // SizeOfUninitializedData + .long __efistub_efi_pe_entry - _start // AddressOfEntryPoint + .long efi_header_end - _start // BaseOfCode +#ifdef CONFIG_32BIT + .long __pecoff_text_end - _start // BaseOfData +#endif + +extra_header_fields: + .quad 0 // ImageBase + .long PECOFF_SECTION_ALIGNMENT // SectionAlignment + .long PECOFF_FILE_ALIGNMENT // FileAlignment + .short 0 // MajorOperatingSystemVersion + .short 0 // MinorOperatingSystemVersion + .short LINUX_EFISTUB_MAJOR_VERSION // MajorImageVersion + .short LINUX_EFISTUB_MINOR_VERSION // MinorImageVersion + .short 0 // MajorSubsystemVersion + .short 0 // MinorSubsystemVersion + .long 0 // Win32VersionValue + + .long _end - _start // SizeOfImage + + // Everything before the kernel image is considered part of the header + .long efi_header_end - _start // SizeOfHeaders + .long 0 // CheckSum + .short IMAGE_SUBSYSTEM_EFI_APPLICATION // Subsystem + .short 0 // DllCharacteristics + .quad 0 // SizeOfStackReserve + .quad 0 // SizeOfStackCommit + .quad 0 // SizeOfHeapReserve + .quad 0 // SizeOfHeapCommit + .long 0 // 
LoaderFlags + .long (section_table - .) / 8 // NumberOfRvaAndSizes + + .quad 0 // ExportTable + .quad 0 // ImportTable + .quad 0 // ResourceTable + .quad 0 // ExceptionTable + .quad 0 // CertificationTable + .quad 0 // BaseRelocationTable + + // Section table +section_table: + .ascii ".text\0\0\0" + .long __pecoff_text_end - efi_header_end // VirtualSize + .long efi_header_end - _start // VirtualAddress + .long __pecoff_text_end - efi_header_end // SizeOfRawData + .long efi_header_end - _start // PointerToRawData + + .long 0 // PointerToRelocations + .long 0 // PointerToLineNumbers + .short 0 // NumberOfRelocations + .short 0 // NumberOfLineNumbers + .long IMAGE_SCN_CNT_CODE | \ + IMAGE_SCN_MEM_READ | \ + IMAGE_SCN_MEM_EXECUTE // Characteristics + + .ascii ".data\0\0\0" + .long __pecoff_data_virt_size // VirtualSize + .long __pecoff_text_end - _start // VirtualAddress + .long __pecoff_data_raw_size // SizeOfRawData + .long __pecoff_text_end - _start // PointerToRawData + + .long 0 // PointerToRelocations + .long 0 // PointerToLineNumbers + .short 0 // NumberOfRelocations + .short 0 // NumberOfLineNumbers + .long IMAGE_SCN_CNT_INITIALIZED_DATA | \ + IMAGE_SCN_MEM_READ | \ + IMAGE_SCN_MEM_WRITE // Characteristics + + .set section_count, (. - section_table) / 40 + + .balign 0x1000 +efi_header_end: + .endm diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 703e59d717f0..11e2a4fe66e0 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -12,6 +12,7 @@ #include #include #include +#include "efi-header.S" __HEAD ENTRY(_start) @@ -21,10 +22,18 @@ ENTRY(_start) * Do not modify it without modifying the structure and all bootloaders * that expects this header format!! */ +#ifdef CONFIG_EFI + /* + * This instruction decodes to "MZ" ASCII required by UEFI. 
+ */ + c.li s4,-13 + j _start_kernel +#else /* jump to start kernel */ j _start_kernel /* reserved */ .word 0 +#endif .balign 8 #if __riscv_xlen == 64 /* Image load offset(2MB) from start of RAM */ @@ -42,7 +51,14 @@ ENTRY(_start) .ascii RISCV_IMAGE_MAGIC .balign 4 .ascii RISCV_IMAGE_MAGIC2 +#ifdef CONFIG_EFI + .word pe_head_start - _start +pe_head_start: + + __EFI_PE_HEADER +#else .word 0 +#endif .align 2 #ifdef CONFIG_MMU diff --git a/arch/riscv/kernel/image-vars.h b/arch/riscv/kernel/image-vars.h new file mode 100644 index 000000000000..8c212efb37a6 --- /dev/null +++ b/arch/riscv/kernel/image-vars.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + * Linker script variables to be set after section resolution, as + * ld.lld does not like variables assigned before SECTIONS is processed. + * Based on arch/arm64/kernel/image-vars.h + */ +#ifndef __RISCV_KERNEL_IMAGE_VARS_H +#define __RISCV_KERNEL_IMAGE_VARS_H + +#ifndef LINKER_SCRIPT +#error This file should only be included in vmlinux.lds.S +#endif + +#ifdef CONFIG_EFI + +/* + * The EFI stub has its own symbol namespace prefixed by __efistub_, to + * isolate it from the kernel proper. The following symbols are legally + * accessed by the stub, so provide some aliases to make them accessible. + * Only include data symbols here, or text symbols of functions that are + * guaranteed to be safe when executed at another offset than they were + * linked at.
The routines below are all implemented in assembler in a + * position independent manner + */ +__efistub_memcmp = memcmp; +__efistub_memchr = memchr; +__efistub_memcpy = memcpy; +__efistub_memmove = memmove; +__efistub_memset = memset; +__efistub_strlen = strlen; +__efistub_strnlen = strnlen; +__efistub_strcmp = strcmp; +__efistub_strncmp = strncmp; +__efistub_strrchr = strrchr; + +#ifdef CONFIG_KASAN +__efistub___memcpy = memcpy; +__efistub___memmove = memmove; +__efistub___memset = memset; +#endif + +__efistub__start = _start; +__efistub__start_kernel = _start_kernel; +__efistub__end = _end; +__efistub__edata = _edata; +__efistub_screen_info = screen_info; + +#endif + +#endif /* __RISCV_KERNEL_IMAGE_VARS_H */ diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S index f3586e31ed1e..9795359cb9da 100644 --- a/arch/riscv/kernel/vmlinux.lds.S +++ b/arch/riscv/kernel/vmlinux.lds.S @@ -10,6 +10,7 @@ #include #include #include +#include "image-vars.h" #include OUTPUT_ARCH(riscv) @@ -17,6 +18,9 @@ ENTRY(_start) jiffies = jiffies_64; +PECOFF_SECTION_ALIGNMENT = 0x1000; +PECOFF_FILE_ALIGNMENT = 0x200; + SECTIONS { /* Beginning of code and text segment */ @@ -67,6 +71,11 @@ SECTIONS _etext = .; } +#ifdef CONFIG_EFI + . = ALIGN(PECOFF_SECTION_ALIGNMENT); + __pecoff_text_end = .; +#endif + /* Start of data section */ _sdata = .; RO_DATA(SECTION_ALIGN) @@ -83,16 +92,26 @@ SECTIONS .sdata : { __global_pointer$ = . + 0x800; *(.sdata*) - /* End of data section */ - _edata = .; } +#ifdef CONFIG_EFI + .pecoff_edata_padding : { BYTE(0); . = ALIGN(PECOFF_FILE_ALIGNMENT); } + __pecoff_data_raw_size = ABSOLUTE(. - __pecoff_text_end); +#endif + + /* End of data section */ + _edata = .; + BSS_SECTION(PAGE_SIZE, PAGE_SIZE, 0) .rel.dyn : { *(.rel.dyn*) } +#ifdef CONFIG_EFI + . = ALIGN(PECOFF_SECTION_ALIGNMENT); + __pecoff_data_virt_size = ABSOLUTE(. 
- __pecoff_text_end); +#endif _end = .; STABS_DEBUG From d7071743db31b4f6898b1c742e4b451bb4bc4b02 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 17 Sep 2020 15:37:14 -0700 Subject: [PATCH 20/22] RISC-V: Add EFI stub support. Add a RISC-V architecture specific stub code that actually copies the actual kernel image to a valid address and jump to it after boot services are terminated. Enable UEFI related kernel configs as well for RISC-V. Signed-off-by: Atish Patra Link: https://lore.kernel.org/r/20200421033336.9663-4-atish.patra@wdc.com [ardb: - move hartid fetch into check_platform_features() - use image_size not reserve_size - select ISA_C - do not use dram_base] Signed-off-by: Ard Biesheuvel Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 22 +++++ arch/riscv/Makefile | 1 + arch/riscv/configs/defconfig | 1 + arch/riscv/include/asm/efi.h | 35 +++++++ drivers/firmware/efi/Kconfig | 3 +- drivers/firmware/efi/libstub/Makefile | 10 ++ drivers/firmware/efi/libstub/riscv-stub.c | 109 ++++++++++++++++++++++ 7 files changed, 180 insertions(+), 1 deletion(-) create mode 100644 arch/riscv/include/asm/efi.h create mode 100644 drivers/firmware/efi/libstub/riscv-stub.c diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 5e4ace64acbc..6ef2394be857 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -401,6 +401,26 @@ config CMDLINE_FORCE endchoice +config EFI_STUB + bool + +config EFI + bool "UEFI runtime support" + depends on OF + select LIBFDT + select UCS2_STRING + select EFI_PARAMS_FROM_FDT + select EFI_STUB + select EFI_GENERIC_STUB + select RISCV_ISA_C + default y + help + This option provides support for runtime services provided + by UEFI firmware (such as non-volatile variables, realtime + clock, and platform reset). A UEFI stub is also provided to + allow the kernel to be booted as an EFI application. This + is only useful on systems that have UEFI firmware. 
+ endmenu config BUILTIN_DTB @@ -413,3 +433,5 @@ menu "Power management options" source "kernel/power/Kconfig" endmenu + +source "drivers/firmware/Kconfig" diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index fb6e37db836d..10df59f28add 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -80,6 +80,7 @@ head-y := arch/riscv/kernel/head.o core-y += arch/riscv/ libs-y += arch/riscv/lib/ +libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a PHONY += vdso_install vdso_install: diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index d58c93efb603..d222d353d86d 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -130,3 +130,4 @@ CONFIG_DEBUG_BLOCK_EXT_DEVT=y # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_MEMTEST=y # CONFIG_SYSFS_SYSCALL is not set +CONFIG_EFI=y diff --git a/arch/riscv/include/asm/efi.h b/arch/riscv/include/asm/efi.h new file mode 100644 index 000000000000..bf6a40e20a8e --- /dev/null +++ b/arch/riscv/include/asm/efi.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ +#ifndef _ASM_EFI_H +#define _ASM_EFI_H + +#include +#include +#include +#include + +/* on RISC-V, the FDT may be located anywhere in system RAM */ +static inline unsigned long efi_get_max_fdt_addr(unsigned long image_addr) +{ + return ULONG_MAX; +} + +/* Load initrd at enough distance from DRAM start */ +static inline unsigned long efi_get_max_initrd_addr(unsigned long image_addr) +{ + return image_addr + SZ_256M; +} + +#define alloc_screen_info(x...) 
(&screen_info) + +static inline void free_screen_info(struct screen_info *si) +{ +} + +static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) +{ +} + +#endif /* _ASM_EFI_H */ diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index 3939699e62fe..a29fbd6e657e 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -111,7 +111,7 @@ config EFI_GENERIC_STUB config EFI_ARMSTUB_DTB_LOADER bool "Enable the DTB loader" - depends on EFI_GENERIC_STUB + depends on EFI_GENERIC_STUB && !RISCV default y help Select this config option to add support for the dtb= command @@ -128,6 +128,7 @@ config EFI_GENERIC_STUB_INITRD_CMDLINE_LOADER bool "Enable the command line initrd loader" if !X86 depends on EFI_STUB && (EFI_GENERIC_STUB || X86) default y + depends on !RISCV help Select this config option to add support for the initrd= command line parameter, allowing an initrd that resides on the same volume diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 296b18fbd7a2..e9fc2ddabd5f 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -22,6 +22,8 @@ cflags-$(CONFIG_ARM64) := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) \ cflags-$(CONFIG_ARM) := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) \ -fno-builtin -fpic \ $(call cc-option,-mno-single-pic-base) +cflags-$(CONFIG_RISCV) := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) \ + -fpic cflags-$(CONFIG_EFI_GENERIC_STUB) += -I$(srctree)/scripts/dtc/libfdt @@ -63,6 +65,7 @@ lib-$(CONFIG_EFI_GENERIC_STUB) += efi-stub.o fdt.o string.o \ lib-$(CONFIG_ARM) += arm32-stub.o lib-$(CONFIG_ARM64) += arm64-stub.o lib-$(CONFIG_X86) += x86-stub.o +lib-$(CONFIG_RISCV) += riscv-stub.o CFLAGS_arm32-stub.o := -DTEXT_OFFSET=$(TEXT_OFFSET) CFLAGS_arm64-stub.o := -DTEXT_OFFSET=$(TEXT_OFFSET) @@ -106,6 +109,13 @@ STUBCOPY_FLAGS-$(CONFIG_ARM64) += --prefix-alloc-sections=.init \ 
--prefix-symbols=__efistub_ STUBCOPY_RELOC-$(CONFIG_ARM64) := R_AARCH64_ABS +# For RISC-V, we don't need anything special other than arm64. Keep all the +# symbols in .init section and make sure that no absolute symbols references +# doesn't exist. +STUBCOPY_FLAGS-$(CONFIG_RISCV) += --prefix-alloc-sections=.init \ + --prefix-symbols=__efistub_ +STUBCOPY_RELOC-$(CONFIG_RISCV) := R_RISCV_HI20 + $(obj)/%.stub.o: $(obj)/%.o FORCE $(call if_changed,stubcopy) diff --git a/drivers/firmware/efi/libstub/riscv-stub.c b/drivers/firmware/efi/libstub/riscv-stub.c new file mode 100644 index 000000000000..380e4e251399 --- /dev/null +++ b/drivers/firmware/efi/libstub/riscv-stub.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ + +#include +#include + +#include +#include + +#include "efistub.h" + +/* + * RISC-V requires the kernel image to placed 2 MB aligned base for 64 bit and + * 4MB for 32 bit. + */ +#ifdef CONFIG_64BIT +#define MIN_KIMG_ALIGN SZ_2M +#else +#define MIN_KIMG_ALIGN SZ_4M +#endif + +typedef void __noreturn (*jump_kernel_func)(unsigned int, unsigned long); + +static u32 hartid; + +static u32 get_boot_hartid_from_fdt(void) +{ + const void *fdt; + int chosen_node, len; + const fdt32_t *prop; + + fdt = get_efi_config_table(DEVICE_TREE_GUID); + if (!fdt) + return U32_MAX; + + chosen_node = fdt_path_offset(fdt, "/chosen"); + if (chosen_node < 0) + return U32_MAX; + + prop = fdt_getprop((void *)fdt, chosen_node, "boot-hartid", &len); + if (!prop || len != sizeof(u32)) + return U32_MAX; + + return fdt32_to_cpu(*prop); +} + +efi_status_t check_platform_features(void) +{ + hartid = get_boot_hartid_from_fdt(); + if (hartid == U32_MAX) { + efi_err("/chosen/boot-hartid missing or invalid!\n"); + return EFI_UNSUPPORTED; + } + return EFI_SUCCESS; +} + +void __noreturn efi_enter_kernel(unsigned long entrypoint, unsigned long fdt, + unsigned long fdt_size) +{ + unsigned long stext_offset = 
_start_kernel - _start; + unsigned long kernel_entry = entrypoint + stext_offset; + jump_kernel_func jump_kernel = (jump_kernel_func)kernel_entry; + + /* + * Jump to real kernel here with following constraints. + * 1. MMU should be disabled. + * 2. a0 should contain hartid + * 3. a1 should DT address + */ + csr_write(CSR_SATP, 0); + jump_kernel(hartid, fdt); +} + +efi_status_t handle_kernel_image(unsigned long *image_addr, + unsigned long *image_size, + unsigned long *reserve_addr, + unsigned long *reserve_size, + efi_loaded_image_t *image) +{ + unsigned long kernel_size = 0; + unsigned long preferred_addr; + efi_status_t status; + + kernel_size = _edata - _start; + *image_addr = (unsigned long)_start; + *image_size = kernel_size + (_end - _edata); + + /* + * RISC-V kernel maps PAGE_OFFSET virtual address to the same physical + * address where kernel is booted. That's why kernel should boot from + * as low as possible to avoid wastage of memory. Currently, dram_base + * is occupied by the firmware. So the preferred address for kernel to + * boot is next aligned address. If preferred address is not available, + * relocate_kernel will fall back to efi_low_alloc_above to allocate + * lowest possible memory region as long as the address and size meets + * the alignment constraints. + */ + preferred_addr = MIN_KIMG_ALIGN; + status = efi_relocate_kernel(image_addr, kernel_size, *image_size, + preferred_addr, MIN_KIMG_ALIGN, 0x0); + + if (status != EFI_SUCCESS) { + efi_err("Failed to relocate kernel\n"); + *image_size = 0; + } + return status; +} From b91540d52a08b65eb6a2b09132e1bd54fa82754c Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 17 Sep 2020 15:37:15 -0700 Subject: [PATCH 21/22] RISC-V: Add EFI runtime services This patch adds EFI runtime service support for RISC-V. 
Signed-off-by: Atish Patra [ardb: - Remove the page check] Signed-off-by: Ard Biesheuvel Acked-by: Ard Biesheuvel Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 2 + arch/riscv/include/asm/efi.h | 20 ++++ arch/riscv/include/asm/mmu.h | 2 + arch/riscv/include/asm/pgtable.h | 4 + arch/riscv/kernel/Makefile | 2 + arch/riscv/kernel/efi.c | 96 ++++++++++++++++ arch/riscv/kernel/setup.c | 7 +- arch/riscv/mm/init.c | 2 +- drivers/firmware/efi/Makefile | 2 + drivers/firmware/efi/libstub/efi-stub.c | 11 +- drivers/firmware/efi/riscv-runtime.c | 143 ++++++++++++++++++++++++ 11 files changed, 287 insertions(+), 4 deletions(-) create mode 100644 arch/riscv/kernel/efi.c create mode 100644 drivers/firmware/efi/riscv-runtime.c diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 6ef2394be857..c8e57d77c144 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -412,7 +412,9 @@ config EFI select EFI_PARAMS_FROM_FDT select EFI_STUB select EFI_GENERIC_STUB + select EFI_RUNTIME_WRAPPERS select RISCV_ISA_C + depends on MMU default y help This option provides support for runtime services provided diff --git a/arch/riscv/include/asm/efi.h b/arch/riscv/include/asm/efi.h index bf6a40e20a8e..7542282f1141 100644 --- a/arch/riscv/include/asm/efi.h +++ b/arch/riscv/include/asm/efi.h @@ -5,11 +5,28 @@ #ifndef _ASM_EFI_H #define _ASM_EFI_H +#include #include #include #include #include +#ifdef CONFIG_EFI +extern void efi_init(void); +#else +#define efi_init() +#endif + +int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); +int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md); + +#define arch_efi_call_virt_setup() efi_virtmap_load() +#define arch_efi_call_virt_teardown() efi_virtmap_unload() + +#define arch_efi_call_virt(p, f, args...) 
p->f(args) + +#define ARCH_EFI_IRQ_FLAGS_MASK (SR_IE | SR_SPIE) + /* on RISC-V, the FDT may be located anywhere in system RAM */ static inline unsigned long efi_get_max_fdt_addr(unsigned long image_addr) { @@ -32,4 +49,7 @@ static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) { } +void efi_virtmap_load(void); +void efi_virtmap_unload(void); + #endif /* _ASM_EFI_H */ diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h index 967eacb01ab5..dabcf2cfb3dc 100644 --- a/arch/riscv/include/asm/mmu.h +++ b/arch/riscv/include/asm/mmu.h @@ -20,6 +20,8 @@ typedef struct { #endif } mm_context_t; +void __init create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, + phys_addr_t sz, pgprot_t prot); #endif /* __ASSEMBLY__ */ #endif /* _ASM_RISCV_MMU_H */ diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 815f8c959dd4..183f1f4b2ae6 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -100,6 +100,10 @@ #define PAGE_KERNEL __pgprot(_PAGE_KERNEL) #define PAGE_KERNEL_EXEC __pgprot(_PAGE_KERNEL | _PAGE_EXEC) +#define PAGE_KERNEL_READ __pgprot(_PAGE_KERNEL & ~_PAGE_WRITE) +#define PAGE_KERNEL_EXEC __pgprot(_PAGE_KERNEL | _PAGE_EXEC) +#define PAGE_KERNEL_READ_EXEC __pgprot((_PAGE_KERNEL & ~_PAGE_WRITE) \ + | _PAGE_EXEC) #define PAGE_TABLE __pgprot(_PAGE_TABLE) diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index dc93710f0b2f..fa896c5f7ccb 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -55,4 +55,6 @@ obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o +obj-$(CONFIG_EFI) += efi.o + clean: diff --git a/arch/riscv/kernel/efi.c b/arch/riscv/kernel/efi.c new file mode 100644 index 000000000000..024159298231 --- /dev/null +++ b/arch/riscv/kernel/efi.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Western Digital Corporation or its affiliates. 
+ * Adapted from arch/arm64/kernel/efi.c + */ + +#include +#include + +#include +#include +#include + +/* + * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be + * executable, everything else can be mapped with the XN bits + * set. Also take the new (optional) RO/XP bits into account. + */ +static __init pgprot_t efimem_to_pgprot_map(efi_memory_desc_t *md) +{ + u64 attr = md->attribute; + u32 type = md->type; + + if (type == EFI_MEMORY_MAPPED_IO) + return PAGE_KERNEL; + + /* R-- */ + if ((attr & (EFI_MEMORY_XP | EFI_MEMORY_RO)) == + (EFI_MEMORY_XP | EFI_MEMORY_RO)) + return PAGE_KERNEL_READ; + + /* R-X */ + if (attr & EFI_MEMORY_RO) + return PAGE_KERNEL_READ_EXEC; + + /* RW- */ + if (((attr & (EFI_MEMORY_RP | EFI_MEMORY_WP | EFI_MEMORY_XP)) == + EFI_MEMORY_XP) || + type != EFI_RUNTIME_SERVICES_CODE) + return PAGE_KERNEL; + + /* RWX */ + return PAGE_KERNEL_EXEC; +} + +int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) +{ + pgprot_t prot = __pgprot(pgprot_val(efimem_to_pgprot_map(md)) & + ~(_PAGE_GLOBAL)); + int i; + + /* RISC-V maps one page at a time */ + for (i = 0; i < md->num_pages; i++) + create_pgd_mapping(mm->pgd, md->virt_addr + i * PAGE_SIZE, + md->phys_addr + i * PAGE_SIZE, + PAGE_SIZE, prot); + return 0; +} + +static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) +{ + efi_memory_desc_t *md = data; + pte_t pte = READ_ONCE(*ptep); + unsigned long val; + + if (md->attribute & EFI_MEMORY_RO) { + val = pte_val(pte) & ~_PAGE_WRITE; + val = pte_val(pte) | _PAGE_READ; + pte = __pte(val); + } + if (md->attribute & EFI_MEMORY_XP) { + val = pte_val(pte) & ~_PAGE_EXEC; + pte = __pte(val); + } + set_pte(ptep, pte); + + return 0; +} + +int __init efi_set_mapping_permissions(struct mm_struct *mm, + efi_memory_desc_t *md) +{ + BUG_ON(md->type != EFI_RUNTIME_SERVICES_CODE && + md->type != EFI_RUNTIME_SERVICES_DATA); + + /* + * Calling apply_to_page_range() is only safe on regions that are + * guaranteed to be mapped 
down to pages. Since we are only called + * for regions that have been mapped using efi_create_mapping() above + * (and this is checked by the generic Memory Attributes table parsing + * routines), there is no need to check that again here. + */ + return apply_to_page_range(mm, md->virt_addr, + md->num_pages << EFI_PAGE_SHIFT, + set_permissions, md); +} diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 41ef96d0d97a..4c96ac198e14 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -26,11 +27,12 @@ #include #include #include +#include #include "head.h" -#ifdef CONFIG_DUMMY_CONSOLE -struct screen_info screen_info = { +#if defined(CONFIG_DUMMY_CONSOLE) || defined(CONFIG_EFI) +struct screen_info screen_info __section(.data) = { .orig_video_lines = 30, .orig_video_cols = 80, .orig_video_mode = 0, @@ -75,6 +77,7 @@ void __init setup_arch(char **cmdline_p) early_ioremap_setup(); parse_early_param(); + efi_init(); setup_bootmem(); paging_init(); #if IS_ENABLED(CONFIG_BUILTIN_DTB) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 63acc8185bfa..c888c4470b34 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -390,7 +390,7 @@ static void __init create_pmd_mapping(pmd_t *pmdp, #define fixmap_pgd_next fixmap_pte #endif -static void __init create_pgd_mapping(pgd_t *pgdp, +void __init create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, phys_addr_t sz, pgprot_t prot) { diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index 61fd1e8b26fb..4d628081bb2f 100644 --- a/drivers/firmware/efi/Makefile +++ b/drivers/firmware/efi/Makefile @@ -35,6 +35,8 @@ fake_map-$(CONFIG_X86) += x86_fake_mem.o arm-obj-$(CONFIG_EFI) := efi-init.o arm-runtime.o obj-$(CONFIG_ARM) += $(arm-obj-y) obj-$(CONFIG_ARM64) += $(arm-obj-y) +riscv-obj-$(CONFIG_EFI) := efi-init.o riscv-runtime.o +obj-$(CONFIG_RISCV) += $(riscv-obj-y) 
obj-$(CONFIG_EFI_CAPSULE_LOADER) += capsule-loader.o obj-$(CONFIG_EFI_EARLYCON) += earlycon.o obj-$(CONFIG_UEFI_CPER_ARM) += cper-arm.o diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c index 311a16802dd6..914a343c7785 100644 --- a/drivers/firmware/efi/libstub/efi-stub.c +++ b/drivers/firmware/efi/libstub/efi-stub.c @@ -17,7 +17,10 @@ /* * This is the base address at which to start allocating virtual memory ranges - * for UEFI Runtime Services. This is in the low TTBR0 range so that we can use + * for UEFI Runtime Services. + * + * For ARM/ARM64: + * This is in the low TTBR0 range so that we can use * any allocation we choose, and eliminate the risk of a conflict after kexec. * The value chosen is the largest non-zero power of 2 suitable for this purpose * both on 32-bit and 64-bit ARM CPUs, to maximize the likelihood that it can @@ -25,6 +28,12 @@ * Since 32-bit ARM could potentially execute with a 1G/3G user/kernel split, * map everything below 1 GB. (512 MB is a reasonable upper bound for the * entire footprint of the UEFI runtime services memory regions) + * + * For RISC-V: + * There is no specific reason for which, this address (512MB) can't be used + * EFI runtime virtual address for RISC-V. It also helps to use EFI runtime + * services on both RV32/RV64. Keep the same runtime virtual address for RISC-V + * as well to minimize the code churn. */ #define EFI_RT_VIRTUAL_BASE SZ_512M #define EFI_RT_VIRTUAL_SIZE SZ_512M diff --git a/drivers/firmware/efi/riscv-runtime.c b/drivers/firmware/efi/riscv-runtime.c new file mode 100644 index 000000000000..d28e715d2bcc --- /dev/null +++ b/drivers/firmware/efi/riscv-runtime.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Extensible Firmware Interface + * + * Copyright (C) 2020 Western Digital Corporation or its affiliates. 
+ * + * Based on Extensible Firmware Interface Specification version 2.4 + * Adapted from drivers/firmware/efi/arm-runtime.c + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static bool __init efi_virtmap_init(void) +{ + efi_memory_desc_t *md; + + efi_mm.pgd = pgd_alloc(&efi_mm); + mm_init_cpumask(&efi_mm); + init_new_context(NULL, &efi_mm); + + for_each_efi_memory_desc(md) { + phys_addr_t phys = md->phys_addr; + int ret; + + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; + if (md->virt_addr == 0) + return false; + + ret = efi_create_mapping(&efi_mm, md); + if (ret) { + pr_warn(" EFI remap %pa: failed to create mapping (%d)\n", + &phys, ret); + return false; + } + } + + if (efi_memattr_apply_permissions(&efi_mm, efi_set_mapping_permissions)) + return false; + + return true; +} + +/* + * Enable the UEFI Runtime Services if all prerequisites are in place, i.e., + * non-early mapping of the UEFI system table and virtual mappings for all + * EFI_MEMORY_RUNTIME regions. 
+ */ +static int __init riscv_enable_runtime_services(void) +{ + u64 mapsize; + + if (!efi_enabled(EFI_BOOT)) { + pr_info("EFI services will not be available.\n"); + return 0; + } + + efi_memmap_unmap(); + + mapsize = efi.memmap.desc_size * efi.memmap.nr_map; + + if (efi_memmap_init_late(efi.memmap.phys_map, mapsize)) { + pr_err("Failed to remap EFI memory map\n"); + return 0; + } + + if (efi_soft_reserve_enabled()) { + efi_memory_desc_t *md; + + for_each_efi_memory_desc(md) { + int md_size = md->num_pages << EFI_PAGE_SHIFT; + struct resource *res; + + if (!(md->attribute & EFI_MEMORY_SP)) + continue; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (WARN_ON(!res)) + break; + + res->start = md->phys_addr; + res->end = md->phys_addr + md_size - 1; + res->name = "Soft Reserved"; + res->flags = IORESOURCE_MEM; + res->desc = IORES_DESC_SOFT_RESERVED; + + insert_resource(&iomem_resource, res); + } + } + + if (efi_runtime_disabled()) { + pr_info("EFI runtime services will be disabled.\n"); + return 0; + } + + if (efi_enabled(EFI_RUNTIME_SERVICES)) { + pr_info("EFI runtime services access via paravirt.\n"); + return 0; + } + + pr_info("Remapping and enabling EFI services.\n"); + + if (!efi_virtmap_init()) { + pr_err("UEFI virtual mapping missing or invalid -- runtime services will not be available\n"); + return -ENOMEM; + } + + /* Set up runtime services function pointers */ + efi_native_runtime_setup(); + set_bit(EFI_RUNTIME_SERVICES, &efi.flags); + + return 0; +} +early_initcall(riscv_enable_runtime_services); + +void efi_virtmap_load(void) +{ + preempt_disable(); + switch_mm(current->active_mm, &efi_mm, NULL); +} + +void efi_virtmap_unload(void) +{ + switch_mm(&efi_mm, current->active_mm, NULL); + preempt_enable(); +} From de22d2107ced3cc5355cc9dbbd85e44183546bd5 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 17 Sep 2020 15:37:16 -0700 Subject: [PATCH 22/22] RISC-V: Add page table dump support for uefi Extend the current page table dump support in RISC-V to 
include efi pages as well. Here is the output of efi runtime page table mappings. ---[ UEFI runtime start ]--- 0x0000000020002000-0x0000000020003000 0x00000000be732000 4K PTE D A . . . W R V 0x0000000020018000-0x0000000020019000 0x00000000be738000 4K PTE D A . . . W R V 0x000000002002c000-0x000000002002d000 0x00000000be73c000 4K PTE D A . . . W R V 0x0000000020031000-0x0000000020032000 0x00000000bff61000 4K PTE D A . . X W R V ---[ UEFI runtime end ]--- Signed-off-by: Atish Patra Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/ptdump.c | 48 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c index 0831c2e61a8f..ace74dec7492 100644 --- a/arch/riscv/mm/ptdump.c +++ b/arch/riscv/mm/ptdump.c @@ -3,6 +3,7 @@ * Copyright (C) 2019 SiFive */ +#include #include #include #include @@ -49,6 +50,14 @@ struct addr_marker { const char *name; }; +/* Private information for debugfs */ +struct ptd_mm_info { + struct mm_struct *mm; + const struct addr_marker *markers; + unsigned long base_addr; + unsigned long end; +}; + static struct addr_marker address_markers[] = { #ifdef CONFIG_KASAN {KASAN_SHADOW_START, "Kasan shadow start"}, @@ -68,6 +77,28 @@ static struct addr_marker address_markers[] = { {-1, NULL}, }; +static struct ptd_mm_info kernel_ptd_info = { + .mm = &init_mm, + .markers = address_markers, + .base_addr = KERN_VIRT_START, + .end = ULONG_MAX, +}; + +#ifdef CONFIG_EFI +static struct addr_marker efi_addr_markers[] = { + { 0, "UEFI runtime start" }, + { SZ_1G, "UEFI runtime end" }, + { -1, NULL } +}; + +static struct ptd_mm_info efi_ptd_info = { + .mm = &efi_mm, + .markers = efi_addr_markers, + .base_addr = 0, + .end = SZ_2G, +}; +#endif + /* Page Table Entry */ struct prot_bits { u64 mask; @@ -245,22 +276,22 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, } } -static void ptdump_walk(struct seq_file *s) +static void 
ptdump_walk(struct seq_file *s, struct ptd_mm_info *pinfo) { struct pg_state st = { .seq = s, - .marker = address_markers, + .marker = pinfo->markers, .level = -1, .ptdump = { .note_page = note_page, .range = (struct ptdump_range[]) { - {KERN_VIRT_START, ULONG_MAX}, + {pinfo->base_addr, pinfo->end}, {0, 0} } } }; - ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + ptdump_walk_pgd(&st.ptdump, pinfo->mm, NULL); } void ptdump_check_wx(void) @@ -293,7 +324,7 @@ void ptdump_check_wx(void) static int ptdump_show(struct seq_file *m, void *v) { - ptdump_walk(m); + ptdump_walk(m, m->private); return 0; } @@ -308,8 +339,13 @@ static int ptdump_init(void) for (j = 0; j < ARRAY_SIZE(pte_bits); j++) pg_level[i].mask |= pte_bits[j].mask; - debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, + debugfs_create_file("kernel_page_tables", 0400, NULL, &kernel_ptd_info, &ptdump_fops); +#ifdef CONFIG_EFI + if (efi_enabled(EFI_RUNTIME_SERVICES)) + debugfs_create_file("efi_page_tables", 0400, NULL, &efi_ptd_info, + &ptdump_fops); +#endif return 0; }