alistair23-linux/arch/openrisc/mm/fault.c

/*
 * OpenRISC fault.c
 *
 * Linux architectural port borrowing liberally from similar works of
 * others.  All original copyrights apply as per the original source
 * declaration.
 *
 * Modifications for the OpenRISC architecture:
 * Copyright (C) 2003 Matjaz Breskvar <phoenix@bsemi.com>
 * Copyright (C) 2010-2011 Jonas Bonn <jonas@southpole.se>
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/sched.h>

#include <asm/uaccess.h>
#include <asm/siginfo.h>
#include <asm/signal.h>

/*
 * TLB indexing helpers.  TLB_OFFSET() takes the page number of 'add'
 * (add >> PAGE_SHIFT) and masks it with NUM_TLB_ENTRIES-1; the mask form
 * relies on NUM_TLB_ENTRIES being a power of two.  Neither macro is
 * referenced by do_page_fault() in this file.
 */
#define NUM_TLB_ENTRIES 64
#define TLB_OFFSET(add) (((add) >> PAGE_SHIFT) & (NUM_TLB_ENTRIES-1))

/*
 * Global counters with external linkage.
 * NOTE(review): historically described as "updated by do_page_fault()",
 * but do_page_fault() in this file never touches them - presumably they
 * are maintained elsewhere (or are stale); confirm before relying on them.
 */
unsigned long pte_misses;
unsigned long pte_errors;

/*
 * Page directory used by the vmalloc_fault path of do_page_fault(), which
 * reads it (after casting away the volatile qualifier) instead of
 * tsk->active_mm->pgd.  It is not assigned anywhere in this file.
 *
 * __PHX__ :: - check the vmalloc_fault in do_page_fault()
 *            - also look into include/asm-or32/mmu_context.h
 */
volatile pgd_t *current_pgd;

/*
 * Defined outside this file; do_page_fault() calls it as
 * die("Oops", regs, write_acc) when a kernel fault cannot be fixed up.
 */
extern void die(char *, struct pt_regs *, long);

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * If this routine detects a bad access, it returns 1, otherwise it
 * returns 0.
 */

asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address,
			      unsigned long vector, int write_acc)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	siginfo_t info;
	int fault;
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

	tsk = current;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * NOTE2: This is done so that, when updating the vmalloc
	 * mappings we don't have to walk all processes pgdirs and
	 * add the high mappings all at once. Instead we do it as they
	 * are used. However vmalloc'ed page entries have the PAGE_GLOBAL
	 * bit set so sometimes the TLB can use a lingering entry.
	 *
	 * This verifies that the fault happens in kernel space
	 * and that the fault was not a protection error.
	 */

	if (address >= VMALLOC_START &&
	    (vector != 0x300 && vector != 0x400) &&
	    !user_mode(regs))
		goto vmalloc_fault;

	/* If exceptions were enabled, we can reenable them here */
	if (user_mode(regs)) {
		/* Exception was in userspace: reenable interrupts */
		local_irq_enable();
		flags |= FAULT_FLAG_USER;
	} else {
		/* If exception was in a syscall, then IRQ's may have
		 * been enabled or disabled.  If they were enabled,
		 * reenable them.
		 */
		if (regs->sr && (SPR_SR_IEE | SPR_SR_TEE))
			local_irq_enable();
	}

	mm = tsk->mm;
	info.si_code = SEGV_MAPERR;

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */

	if (in_interrupt() || !mm)
		goto no_context;

retry:
	down_read(&mm->mmap_sem);
	vma = find_vma(mm, address);

	if (!vma)
		goto bad_area;

	if (vma->vm_start <= address)
		goto good_area;

	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;

	if (user_mode(regs)) {
		/*
		 * accessing the stack below usp is always a bug.
		 * we get page-aligned addresses so we can only check
		 * if we're within a page from usp, but that might be
		 * enough to catch brutal errors at least.
		 */
		if (address + PAGE_SIZE < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */

good_area:
	info.si_code = SEGV_ACCERR;

	/* first do some preliminary protection checks */

	if (write_acc) {
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		flags |= FAULT_FLAG_WRITE;
	} else {
		/* not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
			goto bad_area;
	}

	/* are we trying to execute nonexecutable area */
	if ((vector == 0x400) && !(vma->vm_page_prot.pgprot & _PAGE_EXEC))
		goto bad_area;

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */

	fault = handle_mm_fault(vma, address, flags);

	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
		return;

	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGSEGV)
			goto bad_area;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}

	if (flags & FAULT_FLAG_ALLOW_RETRY) {
		/*RGD modeled on Cris */
		if (fault & VM_FAULT_MAJOR)
			tsk->maj_flt++;
		else
			tsk->min_flt++;
		if (fault & VM_FAULT_RETRY) {
			flags &= ~FAULT_FLAG_ALLOW_RETRY;
			flags |= FAULT_FLAG_TRIED;

			 /* No need to up_read(&mm->mmap_sem) as we would
			 * have already released it in __lock_page_or_retry
			 * in mm/filemap.c.
			 */

			goto retry;
		}
	}

	up_read(&mm->mmap_sem);
	return;

	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */

bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:

	/* User mode accesses just cause a SIGSEGV */

	if (user_mode(regs)) {
		info.si_signo = SIGSEGV;
		info.si_errno = 0;
		/* info.si_code has been set above */
		info.si_addr = (void *)address;
		force_sig_info(SIGSEGV, &info, tsk);
		return;
	}

no_context:

	/* Are we prepared to handle this kernel fault?
	 *
	 * (The kernel has valid exception-points in the source
	 *  when it acesses user-memory. When it fails in one
	 *  of those points, we find it in a table and do a jump
	 *  to some fixup code that loads an appropriate error
	 *  code)
	 */

	{
		const struct exception_table_entry *entry;

		__asm__ __volatile__("l.nop 42");

		if ((entry = search_exception_tables(regs->pc)) != NULL) {
			/* Adjust the instruction pointer in the stackframe */
			regs->pc = entry->fixup;
			return;
		}
	}

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */

	if ((unsigned long)(address) < PAGE_SIZE)
		printk(KERN_ALERT
		       "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel access");
	printk(" at virtual address 0x%08lx\n", address);

	die("Oops", regs, write_acc);

	do_exit(SIGKILL);

	/*
	 * We ran out of memory, or some other thing happened to us that made
	 * us unable to handle the page fault gracefully.
	 */

out_of_memory:
	__asm__ __volatile__("l.nop 42");
	__asm__ __volatile__("l.nop 1");

	up_read(&mm->mmap_sem);
	if (!user_mode(regs))
		goto no_context;
	pagefault_out_of_memory();
	return;

do_sigbus:
	up_read(&mm->mmap_sem);

	/*
	 * Send a sigbus, regardless of whether we were in kernel
	 * or user mode.
	 */
	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_ADRERR;
	info.si_addr = (void *)address;
	force_sig_info(SIGBUS, &info, tsk);

	/* Kernel mode? Handle exceptions or die */
	if (!user_mode(regs))
		goto no_context;
	return;

vmalloc_fault:
	{
		/*
		 * Synchronize this task's top level page-table
		 * with the 'reference' page table.
		 *
		 * Use current_pgd instead of tsk->active_mm->pgd
		 * since the latter might be unavailable if this
		 * code is executed in a misfortunately run irq
		 * (like inside schedule() between switch_mm and
		 *  switch_to...).
		 */

		int offset = pgd_index(address);
		pgd_t *pgd, *pgd_k;
		pud_t *pud, *pud_k;
		pmd_t *pmd, *pmd_k;
		pte_t *pte_k;

/*
		phx_warn("do_page_fault(): vmalloc_fault will not work, "
			 "since current_pgd assign a proper value somewhere\n"
			 "anyhow we don't need this at the moment\n");

		phx_mmu("vmalloc_fault");
*/
		pgd = (pgd_t *)current_pgd + offset;
		pgd_k = init_mm.pgd + offset;

		/* Since we're two-level, we don't need to do both
		 * set_pgd and set_pmd (they do the same thing). If
		 * we go three-level at some point, do the right thing
		 * with pgd_present and set_pgd here.
		 *
		 * Also, since the vmalloc area is global, we don't
		 * need to copy individual PTE's, it is enough to
		 * copy the pgd pointer into the pte page of the
		 * root task. If that is there, we'll find our pte if
		 * it exists.
		 */

		pud = pud_offset(pgd, address);
		pud_k = pud_offset(pgd_k, address);
		if (!pud_present(*pud_k))
			goto no_context;

		pmd = pmd_offset(pud, address);
		pmd_k = pmd_offset(pud_k, address);

		if (!pmd_present(*pmd_k))
			goto bad_area_nosemaphore;

		set_pmd(pmd, *pmd_k);

		/* Make sure the actual PTE exists as well to
		 * catch kernel vmalloc-area accesses to non-mapped
		 * addresses. If we don't do this, this will just
		 * silently loop forever.
		 */

		pte_k = pte_offset_kernel(pmd_k, address);
		if (!pte_present(*pte_k))
			goto no_context;

		return;
	}
}
OpenRISC: Memory management Signed-off-by: Jonas Bonn <jonas@southpole.se> Reviewed-by: Arnd Bergmann <arnd@arndb.de> 2011-06-04 02:06:11 -06:00			`/*`
			`* OpenRISC fault.c`
			`*`
			`* Linux architectural port borrowing liberally from similar works of`
			`* others. All original copyrights apply as per the original source`
			`* declaration.`
			`*`
			`* Modifications for the OpenRISC architecture:`
			`* Copyright (C) 2003 Matjaz Breskvar <phoenix@bsemi.com>`
			`* Copyright (C) 2010-2011 Jonas Bonn <jonas@southpole.se>`
			`*`
			`* This program is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU General Public License`
			`* as published by the Free Software Foundation; either version`
			`* 2 of the License, or (at your option) any later version.`
			`*/`

			`#include <linux/mm.h>`
			`#include <linux/interrupt.h>`
			`#include <linux/module.h>`
			`#include <linux/sched.h>`

			`#include <asm/uaccess.h>`
			`#include <asm/siginfo.h>`
			`#include <asm/signal.h>`

			`#define NUM_TLB_ENTRIES 64`
			`#define TLB_OFFSET(add) (((add) >> PAGE_SHIFT) & (NUM_TLB_ENTRIES-1))`

			`unsigned long pte_misses; /* updated by do_page_fault() */`
			`unsigned long pte_errors; /* updated by do_page_fault() */`

			`/* __PHX__ :: - check the vmalloc_fault in do_page_fault()`
			`* - also look into include/asm-or32/mmu_context.h`
			`*/`
			`volatile pgd_t *current_pgd;`

			`extern void die(char *, struct pt_regs *, long);`

			`/*`
			`* This routine handles page faults. It determines the address,`
			`* and the problem, and then passes it off to one of the appropriate`
			`* routines.`
			`*`
			`* If this routine detects a bad access, it returns 1, otherwise it`
			`* returns 0.`
			`*/`

			`asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address,`
			`unsigned long vector, int write_acc)`
			`{`
			`struct task_struct *tsk;`
			`struct mm_struct *mm;`
			`struct vm_area_struct *vma;`
			`siginfo_t info;`
			`int fault;`
openrisc/mm/fault.c: Port OOM changes to do_page_fault Commit d065bd810b6deb67d4897a14bfe21f8eb526ba99 (mm: retry page fault when blocking on disk transfer) and commit 37b23e0525d393d48a7d59f870b3bc061a30ccdb (x86,mm: make pagefault killable) The above commits introduced changes into the x86 pagefault handler for making the page fault handler retryable as well as killable. These changes reduce the mmap_sem hold time, which is crucial during OOM killer invocation. Port these changes to openrisc. Signed-off-by: Mohd. Faris <mohdfarisq2010@gmail.com> Signed-off-by: Kautuk Consul <consul.kautuk@gmail.com> Signed-off-by: Jonas Bonn <jonas@southpole.se> 2012-03-31 06:00:51 -06:00			`unsigned int flags = FAULT_FLAG_ALLOW_RETRY \| FAULT_FLAG_KILLABLE;`
OpenRISC: Memory management Signed-off-by: Jonas Bonn <jonas@southpole.se> Reviewed-by: Arnd Bergmann <arnd@arndb.de> 2011-06-04 02:06:11 -06:00
			`tsk = current;`

			`/*`
			`* We fault-in kernel-space virtual memory on-demand. The`
			`* 'reference' page table is init_mm.pgd.`
			`*`
			`* NOTE! We MUST NOT take any locks for this case. We may`
			`* be in an interrupt or a critical region, and should`
			`* only copy the information from the master page table,`
			`* nothing more.`
			`*`
			`* NOTE2: This is done so that, when updating the vmalloc`
			`* mappings we don't have to walk all processes pgdirs and`
			`* add the high mappings all at once. Instead we do it as they`
			`* are used. However vmalloc'ed page entries have the PAGE_GLOBAL`
			`* bit set so sometimes the TLB can use a lingering entry.`
			`*`
			`* This verifies that the fault happens in kernel space`
			`* and that the fault was not a protection error.`
			`*/`

			`if (address >= VMALLOC_START &&`
			`(vector != 0x300 && vector != 0x400) &&`
			`!user_mode(regs))`
			`goto vmalloc_fault;`

			`/* If exceptions were enabled, we can reenable them here */`
			`if (user_mode(regs)) {`
			`/* Exception was in userspace: reenable interrupts */`
			`local_irq_enable();`
arch: mm: pass userspace fault flag to generic fault handler Unlike global OOM handling, memory cgroup code will invoke the OOM killer in any OOM situation because it has no way of telling faults occuring in kernel context - which could be handled more gracefully - from user-triggered faults. Pass a flag that identifies faults originating in user space from the architecture-specific fault handlers to generic code so that memcg OOM handling can be improved. Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: Michal Hocko <mhocko@suse.cz> Cc: David Rientjes <rientjes@google.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: azurIt <azurit@pobox.sk> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2013-09-12 16:13:39 -06:00			`flags \|= FAULT_FLAG_USER;`
OpenRISC: Memory management Signed-off-by: Jonas Bonn <jonas@southpole.se> Reviewed-by: Arnd Bergmann <arnd@arndb.de> 2011-06-04 02:06:11 -06:00			`} else {`
			`/* If exception was in a syscall, then IRQ's may have`
			`* been enabled or disabled. If they were enabled,`
			`* reenable them.`
			`*/`
			`if (regs->sr && (SPR_SR_IEE \| SPR_SR_TEE))`
			`local_irq_enable();`
			`}`

			`mm = tsk->mm;`
			`info.si_code = SEGV_MAPERR;`

			`/*`
			`* If we're in an interrupt or have no user`
			`* context, we must not take the fault..`
			`*/`

			`if (in_interrupt() \|\| !mm)`
			`goto no_context;`

openrisc/mm/fault.c: Port OOM changes to do_page_fault Commit d065bd810b6deb67d4897a14bfe21f8eb526ba99 (mm: retry page fault when blocking on disk transfer) and commit 37b23e0525d393d48a7d59f870b3bc061a30ccdb (x86,mm: make pagefault killable) The above commits introduced changes into the x86 pagefault handler for making the page fault handler retryable as well as killable. These changes reduce the mmap_sem hold time, which is crucial during OOM killer invocation. Port these changes to openrisc. Signed-off-by: Mohd. Faris <mohdfarisq2010@gmail.com> Signed-off-by: Kautuk Consul <consul.kautuk@gmail.com> Signed-off-by: Jonas Bonn <jonas@southpole.se> 2012-03-31 06:00:51 -06:00			`retry:`
OpenRISC: Memory management Signed-off-by: Jonas Bonn <jonas@southpole.se> Reviewed-by: Arnd Bergmann <arnd@arndb.de> 2011-06-04 02:06:11 -06:00			`down_read(&mm->mmap_sem);`
			`vma = find_vma(mm, address);`

			`if (!vma)`
			`goto bad_area;`

			`if (vma->vm_start <= address)`
			`goto good_area;`

			`if (!(vma->vm_flags & VM_GROWSDOWN))`
			`goto bad_area;`

			`if (user_mode(regs)) {`
			`/*`
			`* accessing the stack below usp is always a bug.`
			`* we get page-aligned addresses so we can only check`
			`* if we're within a page from usp, but that might be`
			`* enough to catch brutal errors at least.`
			`*/`
			`if (address + PAGE_SIZE < regs->sp)`
			`goto bad_area;`
			`}`
			`if (expand_stack(vma, address))`
			`goto bad_area;`

			`/*`
			`* Ok, we have a good vm_area for this memory access, so`
			`* we can handle it..`
			`*/`

			`good_area:`
			`info.si_code = SEGV_ACCERR;`

			`/* first do some preliminary protection checks */`

			`if (write_acc) {`
			`if (!(vma->vm_flags & VM_WRITE))`
			`goto bad_area;`
openrisc/mm/fault.c: Port OOM changes to do_page_fault Commit d065bd810b6deb67d4897a14bfe21f8eb526ba99 (mm: retry page fault when blocking on disk transfer) and commit 37b23e0525d393d48a7d59f870b3bc061a30ccdb (x86,mm: make pagefault killable) The above commits introduced changes into the x86 pagefault handler for making the page fault handler retryable as well as killable. These changes reduce the mmap_sem hold time, which is crucial during OOM killer invocation. Port these changes to openrisc. Signed-off-by: Mohd. Faris <mohdfarisq2010@gmail.com> Signed-off-by: Kautuk Consul <consul.kautuk@gmail.com> Signed-off-by: Jonas Bonn <jonas@southpole.se> 2012-03-31 06:00:51 -06:00			`flags \|= FAULT_FLAG_WRITE;`
OpenRISC: Memory management Signed-off-by: Jonas Bonn <jonas@southpole.se> Reviewed-by: Arnd Bergmann <arnd@arndb.de> 2011-06-04 02:06:11 -06:00			`} else {`
			`/* not present */`
			`if (!(vma->vm_flags & (VM_READ \| VM_EXEC)))`
			`goto bad_area;`
			`}`

			`/* are we trying to execute nonexecutable area */`
			`if ((vector == 0x400) && !(vma->vm_page_prot.pgprot & _PAGE_EXEC))`
			`goto bad_area;`

			`/*`
			`* If for any reason at all we couldn't handle the fault,`
			`* make sure we exit gracefully rather than endlessly redo`
			`* the fault.`
			`*/`

mm: do not pass mm_struct into handle_mm_fault We always have vma->vm_mm around. Link: http://lkml.kernel.org/r/1466021202-61880-8-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2016-07-26 16:25:18 -06:00			`fault = handle_mm_fault(vma, address, flags);`
openrisc/mm/fault.c: Port OOM changes to do_page_fault Commit d065bd810b6deb67d4897a14bfe21f8eb526ba99 (mm: retry page fault when blocking on disk transfer) and commit 37b23e0525d393d48a7d59f870b3bc061a30ccdb (x86,mm: make pagefault killable) The above commits introduced changes into the x86 pagefault handler for making the page fault handler retryable as well as killable. These changes reduce the mmap_sem hold time, which is crucial during OOM killer invocation. Port these changes to openrisc. Signed-off-by: Mohd. Faris <mohdfarisq2010@gmail.com> Signed-off-by: Kautuk Consul <consul.kautuk@gmail.com> Signed-off-by: Jonas Bonn <jonas@southpole.se> 2012-03-31 06:00:51 -06:00
			`if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))`
			`return;`

OpenRISC: Memory management Signed-off-by: Jonas Bonn <jonas@southpole.se> Reviewed-by: Arnd Bergmann <arnd@arndb.de> 2011-06-04 02:06:11 -06:00			`if (unlikely(fault & VM_FAULT_ERROR)) {`
			`if (fault & VM_FAULT_OOM)`
			`goto out_of_memory;`
vm: add VM_FAULT_SIGSEGV handling support The core VM already knows about VM_FAULT_SIGBUS, but cannot return a "you should SIGSEGV" error, because the SIGSEGV case was generally handled by the caller - usually the architecture fault handler. That results in lots of duplication - all the architecture fault handlers end up doing very similar "look up vma, check permissions, do retries etc" - but it generally works. However, there are cases where the VM actually wants to SIGSEGV, and applications _expect_ SIGSEGV. In particular, when accessing the stack guard page, libsigsegv expects a SIGSEGV. And it usually got one, because the stack growth is handled by that duplicated architecture fault handler. However, when the generic VM layer started propagating the error return from the stack expansion in commit fee7e49d4514 ("mm: propagate error from stack expansion even for guard page"), that now exposed the existing VM_FAULT_SIGBUS result to user space. And user space really expected SIGSEGV, not SIGBUS. To fix that case, we need to add a VM_FAULT_SIGSEGV, and teach all those duplicate architecture fault handlers about it. They all already have the code to handle SIGSEGV, so it's about just tying that new return value to the existing code, but it's all a bit annoying. This is the mindless minimal patch to do this. A more extensive patch would be to try to gather up the mostly shared fault handling logic into one generic helper routine, and long-term we really should do that cleanup. Just from this patch, you can generally see that most architectures just copied (directly or indirectly) the old x86 way of doing things, but in the meantime that original x86 model has been improved to hold the VM semaphore for shorter times etc and to handle VM_FAULT_RETRY and other "newer" things, so it would be a good idea to bring all those improvements to the generic case and teach other architectures about them too. 
Reported-and-tested-by: Takashi Iwai <tiwai@suse.de> Tested-by: Jan Engelhardt <jengelh@inai.de> Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com> # "s390 still compiles and boots" Cc: linux-arch@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2015-01-29 11:51:32 -07:00			`else if (fault & VM_FAULT_SIGSEGV)`
			`goto bad_area;`
OpenRISC: Memory management Signed-off-by: Jonas Bonn <jonas@southpole.se> Reviewed-by: Arnd Bergmann <arnd@arndb.de> 2011-06-04 02:06:11 -06:00			`else if (fault & VM_FAULT_SIGBUS)`
			`goto do_sigbus;`
			`BUG();`
			`}`
openrisc/mm/fault.c: Port OOM changes to do_page_fault Commit d065bd810b6deb67d4897a14bfe21f8eb526ba99 (mm: retry page fault when blocking on disk transfer) and commit 37b23e0525d393d48a7d59f870b3bc061a30ccdb (x86,mm: make pagefault killable) The above commits introduced changes into the x86 pagefault handler for making the page fault handler retryable as well as killable. These changes reduce the mmap_sem hold time, which is crucial during OOM killer invocation. Port these changes to openrisc. Signed-off-by: Mohd. Faris <mohdfarisq2010@gmail.com> Signed-off-by: Kautuk Consul <consul.kautuk@gmail.com> Signed-off-by: Jonas Bonn <jonas@southpole.se> 2012-03-31 06:00:51 -06:00
			`if (flags & FAULT_FLAG_ALLOW_RETRY) {`
			`/*RGD modeled on Cris */`
			`if (fault & VM_FAULT_MAJOR)`
			`tsk->maj_flt++;`
			`else`
			`tsk->min_flt++;`
			`if (fault & VM_FAULT_RETRY) {`
			`flags &= ~FAULT_FLAG_ALLOW_RETRY;`
readahead: fault retry breaks mmap file read random detection .fault now can retry. The retry can break state machine of .fault. In filemap_fault, if page is miss, ra->mmap_miss is increased. In the second try, since the page is in page cache now, ra->mmap_miss is decreased. And these are done in one fault, so we can't detect random mmap file access. Add a new flag to indicate .fault is tried once. In the second try, skip ra->mmap_miss decreasing. The filemap_fault state machine is ok with it. I only tested x86, didn't test other archs, but looks the change for other archs is obvious, but who knows :) Signed-off-by: Shaohua Li <shaohua.li@fusionio.com> Cc: Rik van Riel <riel@redhat.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2012-10-08 17:32:19 -06:00			`flags \|= FAULT_FLAG_TRIED;`
openrisc/mm/fault.c: Port OOM changes to do_page_fault Commit d065bd810b6deb67d4897a14bfe21f8eb526ba99 (mm: retry page fault when blocking on disk transfer) and commit 37b23e0525d393d48a7d59f870b3bc061a30ccdb (x86,mm: make pagefault killable) The above commits introduced changes into the x86 pagefault handler for making the page fault handler retryable as well as killable. These changes reduce the mmap_sem hold time, which is crucial during OOM killer invocation. Port these changes to openrisc. Signed-off-by: Mohd. Faris <mohdfarisq2010@gmail.com> Signed-off-by: Kautuk Consul <consul.kautuk@gmail.com> Signed-off-by: Jonas Bonn <jonas@southpole.se> 2012-03-31 06:00:51 -06:00
			`/* No need to up_read(&mm->mmap_sem) as we would`
			`* have already released it in __lock_page_or_retry`
			`* in mm/filemap.c.`
			`*/`

			`goto retry;`
			`}`
			`}`
OpenRISC: Memory management Signed-off-by: Jonas Bonn <jonas@southpole.se> Reviewed-by: Arnd Bergmann <arnd@arndb.de> 2011-06-04 02:06:11 -06:00
			`up_read(&mm->mmap_sem);`
			`return;`

			`/*`
			`* Something tried to access memory that isn't in our memory map..`
			`* Fix it, but check if it's kernel or user first..`
			`*/`

			`bad_area:`
			`up_read(&mm->mmap_sem);`

			`bad_area_nosemaphore:`

			`/* User mode accesses just cause a SIGSEGV */`

			`if (user_mode(regs)) {`
			`info.si_signo = SIGSEGV;`
			`info.si_errno = 0;`
			`/* info.si_code has been set above */`
			`info.si_addr = (void *)address;`
			`force_sig_info(SIGSEGV, &info, tsk);`
			`return;`
			`}`

			`no_context:`

			`/* Are we prepared to handle this kernel fault?`
			`*`
			`* (The kernel has valid exception-points in the source`
			`* when it acesses user-memory. When it fails in one`
			`* of those points, we find it in a table and do a jump`
			`* to some fixup code that loads an appropriate error`
			`* code)`
			`*/`

			`{`
			`const struct exception_table_entry *entry;`

			`__asm__ __volatile__("l.nop 42");`

			`if ((entry = search_exception_tables(regs->pc)) != NULL) {`
			`/* Adjust the instruction pointer in the stackframe */`
			`regs->pc = entry->fixup;`
			`return;`
			`}`
			`}`

			`/*`
			`* Oops. The kernel tried to access some bad page. We'll have to`
			`* terminate things with extreme prejudice.`
			`*/`

			`if ((unsigned long)(address) < PAGE_SIZE)`
			`printk(KERN_ALERT`
			`"Unable to handle kernel NULL pointer dereference");`
			`else`
			`printk(KERN_ALERT "Unable to handle kernel access");`
			`printk(" at virtual address 0x%08lx\n", address);`

			`die("Oops", regs, write_acc);`

			`do_exit(SIGKILL);`

			`/*`
			`* We ran out of memory, or some other thing happened to us that made`
			`* us unable to handle the page fault gracefully.`
			`*/`

			`out_of_memory:`
			`__asm__ __volatile__("l.nop 42");`
			`__asm__ __volatile__("l.nop 1");`

			`up_read(&mm->mmap_sem);`
mm: invoke oom-killer from remaining unconverted page fault handlers A few remaining architectures directly kill the page faulting task in an out of memory situation. This is usually not a good idea since that task might not even use a significant amount of memory and so may not be the optimal victim to resolve the situation. Since 2.6.29's 1c0fe6e ("mm: invoke oom-killer from page fault") there is a hook that architecture page fault handlers are supposed to call to invoke the OOM killer and let it pick the right task to kill. Convert the remaining architectures over to this hook. To have the previous behavior of simply taking out the faulting task the vm.oom_kill_allocating_task sysctl can be set to 1. Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: Michal Hocko <mhocko@suse.cz> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Acked-by: David Rientjes <rientjes@google.com> Acked-by: Vineet Gupta <vgupta@synopsys.com> [arch/arc bits] Cc: James Hogan <james.hogan@imgtec.com> Cc: David Howells <dhowells@redhat.com> Cc: Jonas Bonn <jonas@southpole.se> Cc: Chen Liqin <liqin.chen@sunplusct.com> Cc: Lennox Wu <lennox.wu@gmail.com> Cc: Chris Metcalf <cmetcalf@tilera.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2013-07-08 16:59:50 -06:00			`if (!user_mode(regs))`
			`goto no_context;`
			`pagefault_out_of_memory();`
			`return;`
OpenRISC: Memory management Signed-off-by: Jonas Bonn <jonas@southpole.se> Reviewed-by: Arnd Bergmann <arnd@arndb.de> 2011-06-04 02:06:11 -06:00
			`do_sigbus:`
			`up_read(&mm->mmap_sem);`

			`/*`
			`* Send a sigbus, regardless of whether we were in kernel`
			`* or user mode.`
			`*/`
			`info.si_signo = SIGBUS;`
			`info.si_errno = 0;`
			`info.si_code = BUS_ADRERR;`
			`info.si_addr = (void *)address;`
			`force_sig_info(SIGBUS, &info, tsk);`

			`/* Kernel mode? Handle exceptions or die */`
			`if (!user_mode(regs))`
			`goto no_context;`
			`return;`

			`vmalloc_fault:`
			`{`
			`/*`
			`* Synchronize this task's top level page-table`
			`* with the 'reference' page table.`
			`*`
			`* Use current_pgd instead of tsk->active_mm->pgd`
			`* since the latter might be unavailable if this`
			`* code is executed in a misfortunately run irq`
			`* (like inside schedule() between switch_mm and`
			`* switch_to...).`
			`*/`

			`int offset = pgd_index(address);`
			`pgd_t *pgd, *pgd_k;`
			`pud_t *pud, *pud_k;`
			`pmd_t *pmd, *pmd_k;`
			`pte_t *pte_k;`

			`/*`
			`phx_warn("do_page_fault(): vmalloc_fault will not work, "`
			`"since current_pgd assign a proper value somewhere\n"`
			`"anyhow we don't need this at the moment\n");`

			`phx_mmu("vmalloc_fault");`
			`*/`
			`pgd = (pgd_t *)current_pgd + offset;`
			`pgd_k = init_mm.pgd + offset;`

			`/* Since we're two-level, we don't need to do both`
			`* set_pgd and set_pmd (they do the same thing). If`
			`* we go three-level at some point, do the right thing`
			`* with pgd_present and set_pgd here.`
			`*`
			`* Also, since the vmalloc area is global, we don't`
			`* need to copy individual PTE's, it is enough to`
			`* copy the pgd pointer into the pte page of the`
			`* root task. If that is there, we'll find our pte if`
			`* it exists.`
			`*/`

			`pud = pud_offset(pgd, address);`
			`pud_k = pud_offset(pgd_k, address);`
			`if (!pud_present(*pud_k))`
			`goto no_context;`

			`pmd = pmd_offset(pud, address);`
			`pmd_k = pmd_offset(pud_k, address);`

			`if (!pmd_present(*pmd_k))`
			`goto bad_area_nosemaphore;`

			`set_pmd(pmd, *pmd_k);`

			`/* Make sure the actual PTE exists as well to`
			`* catch kernel vmalloc-area accesses to non-mapped`
			`* addresses. If we don't do this, this will just`
			`* silently loop forever.`
			`*/`

			`pte_k = pte_offset_kernel(pmd_k, address);`
			`if (!pte_present(*pte_k))`
			`goto no_context;`

			`return;`
			`}`
			`}`