arch/ppc64/mm/fault.c - linux - Git at Google

 /*
  *  arch/ppc/mm/fault.c
  *
  *  PowerPC version
  *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
  *
  *  Derived from "arch/i386/mm/fault.c"
  *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
  *
  *  Modified by Cort Dougan and Paul Mackerras.
  *
  *  Modified for PPC64 by Dave Engebretsen (engebret@ibm.com)
  *
  *  This program is free software; you can redistribute it and/or
  *  modify it under the terms of the GNU General Public License
  *  as published by the Free Software Foundation; either version
  *  2 of the License, or (at your option) any later version.
  */

 #include <linux/config.h>
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/mman.h>
 #include <linux/mm.h>
 #include <linux/interrupt.h>
 #include <linux/smp_lock.h>
 #include <linux/module.h>
 #include <linux/kprobes.h>

 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
 #include <asm/mmu_context.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/kdebug.h>

 /*
  * Decode the instruction at regs->nip and report whether it is a
  * store using an update addressing form whose base register (rA)
  * is r1, i.e. a store that also writes r1.
  *
  * Returns 1 for such an instruction and 0 otherwise, including the
  * case where the instruction word cannot be fetched with get_user().
  */
 static int store_updates_sp(struct pt_regs *regs)
 {
 	unsigned int insn;
 	unsigned int primary;
 	unsigned int extended;

 	if (get_user(insn, (unsigned int __user *)regs->nip))
 		return 0;

 	/* the rA field must name r1 */
 	if (((insn >> 16) & 0x1f) != 1)
 		return 0;

 	primary = insn >> 26;

 	/* D-form stores with update */
 	if (primary == 37 ||	/* stwu */
 	    primary == 39 ||	/* stbu */
 	    primary == 45 ||	/* sthu */
 	    primary == 53 ||	/* stfsu */
 	    primary == 55)	/* stfdu */
 		return 1;

 	/* std and stdu share this opcode; the low two bits pick stdu */
 	if (primary == 62)
 		return (insn & 3) == 1;

 	/* everything else of interest lives under major opcode 31 */
 	if (primary != 31)
 		return 0;

 	/* indexed stores with update are told apart by the minor opcode */
 	extended = (insn >> 1) & 0x3ff;
 	switch (extended) {
 	case 181:	/* stdux */
 	case 183:	/* stwux */
 	case 247:	/* stbux */
 	case 439:	/* sthux */
 	case 695:	/* stfsux */
 	case 759:	/* stfdux */
 		return 1;
 	default:
 		return 0;
 	}
 }

 /*
  * Handle a page fault taken at 'address'.
  *
  * The error_code parameter is
  *  - DSISR for a non-SLB data access fault,
  *  - SRR1 & 0x08000000 for a non-SLB instruction access fault
  *  - 0 any SLB fault.
  * The return value is 0 if the fault was handled (this includes the
  * cases where a signal was queued to a user-mode task or a notifier /
  * debugger hook claimed the fault), or the signal number if this is a
  * kernel fault that can't be handled here.
  *
  * mm->mmap_sem is taken for reading before the VMA lookup and is
  * dropped again on every path that leaves the function afterwards.
  */
 int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 			    unsigned long error_code)
 {
 	struct vm_area_struct *vma;
 	struct mm_struct *mm = current->mm;
 	siginfo_t info;
 	/* si_code for a SIGSEGV; switched to SEGV_ACCERR once a VMA is found */
 	unsigned long code = SEGV_MAPERR;
 	unsigned long is_write = error_code & DSISR_ISSTORE;
 	unsigned long trap = TRAP(regs);
 	/* trap 0x400 is treated as an instruction fetch fault */
 	unsigned long is_exec = trap == 0x400;

 	/* traps 0x380 and 0x480 must never be routed to this handler */
 	BUG_ON((trap == 0x380) || (trap == 0x480));

 	/* give registered die notifiers first refusal */
 	if (notify_die(DIE_PAGE_FAULT, "page_fault", regs, error_code,
 				11, SIGSEGV) == NOTIFY_STOP)
 		return 0;

 	if (trap == 0x300) {
 		if (debugger_fault_handler(regs))
 			return 0;
 	}

 	/* On a kernel SLB miss we can only check for a valid exception entry */
 	if (!user_mode(regs) && (address >= TASK_SIZE))
 		return SIGSEGV;

 	if (error_code & DSISR_DABRMATCH) {
 		if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
 					11, SIGSEGV) == NOTIFY_STOP)
 			return 0;
 		if (debugger_dabr_match(regs))
 			return 0;
 	}

 	if (in_atomic() || mm == NULL) {
 		if (!user_mode(regs))
 			return SIGSEGV;
 		/* in_atomic() in user mode is really bad,
 		   as is current->mm == NULL. */
 		/*
 		 * The two literals below are concatenated by the compiler,
 		 * so the first one needs its trailing space; without it the
 		 * message reads "...user mode within_atomic() = ...".
 		 */
 		printk(KERN_EMERG "Page fault in user mode with "
 		       "in_atomic() = %d mm = %p\n", in_atomic(), mm);
 		printk(KERN_EMERG "NIP = %lx  MSR = %lx\n",
 		       regs->nip, regs->msr);
 		die("Weird page fault", regs, SIGSEGV);
 	}

 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
 	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
 	 * erroneous fault occurring in a code path which already holds mmap_sem
 	 * we will deadlock attempting to validate the fault against the
 	 * address space.  Luckily the kernel only validly references user
 	 * space from well defined areas of code, which are listed in the
 	 * exceptions table.
 	 *
 	 * As the vast majority of faults will be valid we will only perform
 	 * the source reference check when there is a possibility of a deadlock.
 	 * Attempt to lock the address space, if we cannot we then validate the
 	 * source.  If this is invalid we can skip the address space check,
 	 * thus avoiding the deadlock.
 	 */
 	if (!down_read_trylock(&mm->mmap_sem)) {
 		if (!user_mode(regs) && !search_exception_tables(regs->nip))
 			goto bad_area_nosemaphore;

 		down_read(&mm->mmap_sem);
 	}

 	vma = find_vma(mm, address);
 	if (!vma)
 		goto bad_area;

 	if (vma->vm_start <= address) {
 		goto good_area;
 	}
 	/* address lies below the VMA: only a grows-down VMA may be extended */
 	if (!(vma->vm_flags & VM_GROWSDOWN))
 		goto bad_area;

 	/*
 	 * N.B. The POWER/Open ABI allows programs to access up to
 	 * 288 bytes below the stack pointer.
 	 * The kernel signal delivery code writes up to about 1.5kB
 	 * below the stack pointer (r1) before decrementing it.
 	 * The exec code can write slightly over 640kB to the stack
 	 * before setting the user r1.  Thus we allow the stack to
 	 * expand to 1MB without further checks.
 	 */
 	if (address + 0x100000 < vma->vm_end) {
 		/* get user regs even if this fault is in kernel mode */
 		struct pt_regs *uregs = current->thread.regs;
 		if (uregs == NULL)
 			goto bad_area;

 		/*
 		 * A user-mode access to an address a long way below
 		 * the stack pointer is only valid if the instruction
 		 * is one which would update the stack pointer to the
 		 * address accessed if the instruction completed,
 		 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
 		 * (or the byte, halfword, float or double forms).
 		 *
 		 * If we don't check this then any write to the area
 		 * between the last mapped region and the stack will
 		 * expand the stack rather than segfaulting.
 		 */
 		if (address + 2048 < uregs->gpr[1]
 		    && (!user_mode(regs) || !store_updates_sp(regs)))
 			goto bad_area;
 	}

 	if (expand_stack(vma, address))
 		goto bad_area;

 good_area:
 	/* a mapping exists, so any failure from here on is an access error */
 	code = SEGV_ACCERR;

 	if (is_exec) {
 		/* protection fault */
 		if (error_code & DSISR_PROTFAULT)
 			goto bad_area;
 		if (!(vma->vm_flags & VM_EXEC))
 			goto bad_area;
 	/* a write */
 	} else if (is_write) {
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
 	/* a read */
 	} else {
 		if (!(vma->vm_flags & VM_READ))
 			goto bad_area;
 	}

 survive:
 	/*
 	 * If for any reason at all we couldn't handle the fault,
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
 	switch (handle_mm_fault(mm, vma, address, is_write)) {

 	case VM_FAULT_MINOR:
 		current->min_flt++;
 		break;
 	case VM_FAULT_MAJOR:
 		current->maj_flt++;
 		break;
 	case VM_FAULT_SIGBUS:
 		goto do_sigbus;
 	case VM_FAULT_OOM:
 		goto out_of_memory;
 	default:
 		BUG();
 	}

 	up_read(&mm->mmap_sem);
 	return 0;

 bad_area:
 	up_read(&mm->mmap_sem);

 bad_area_nosemaphore:
 	/* User mode accesses cause a SIGSEGV */
 	if (user_mode(regs)) {
 		info.si_signo = SIGSEGV;
 		info.si_errno = 0;
 		info.si_code = code;
 		info.si_addr = (void __user *) address;
 		force_sig_info(SIGSEGV, &info, current);
 		return 0;
 	}

 	if (trap == 0x400 && (error_code & DSISR_PROTFAULT)
 	    && printk_ratelimit())
 		printk(KERN_CRIT "kernel tried to execute NX-protected"
 		       " page (%lx) - exploit attempt? (uid: %d)\n",
 		       address, current->uid);

 	/* kernel-mode fault: hand the signal number back to the caller */
 	return SIGSEGV;

 /*
  * We ran out of memory, or some other thing happened to us that made
  * us unable to handle the page fault gracefully.
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
 	/* pid 1 is never killed here: yield, retake the lock and retry */
 	if (current->pid == 1) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
 	}
 	printk("VM: killing process %s\n", current->comm);
 	if (user_mode(regs))
 		do_exit(SIGKILL);
 	return SIGKILL;

 do_sigbus:
 	up_read(&mm->mmap_sem);
 	if (user_mode(regs)) {
 		info.si_signo = SIGBUS;
 		info.si_errno = 0;
 		info.si_code = BUS_ADRERR;
 		info.si_addr = (void __user *)address;
 		force_sig_info(SIGBUS, &info, current);
 		return 0;
 	}
 	return SIGBUS;
 }

 /*
  * bad_page_fault deals with a bad access made by the kernel.
  *
  * If the faulting instruction (regs->nip) has an entry in the
  * exception tables, execution is redirected to that entry's fixup
  * address.  Otherwise the fault is reported through die() with the
  * supplied signal number.  The 'address' argument is not used here.
  *
  * NOTE(review): do_page_fault above does not call this directly; it
  * returns a signal number, which its caller presumably passes in as
  * 'sig'.  Other callers are said to live in traps.c -- not visible
  * from this file.
  */
 void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
 {
 	const struct exception_table_entry *fixup_entry =
 		search_exception_tables(regs->nip);

 	if (fixup_entry == NULL) {
 		/* no fixup registered: kernel has accessed a bad area */
 		die("Kernel access of bad area", regs, sig);
 		return;
 	}

 	/* resume at the registered fixup instead of the faulting insn */
 	regs->nip = fixup_entry->fixup;
 }
	/*
	* arch/ppc/mm/fault.c
	*
	* PowerPC version
	* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
	*
	* Derived from "arch/i386/mm/fault.c"
	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
	*
	* Modified by Cort Dougan and Paul Mackerras.
	*
	* Modified for PPC64 by Dave Engebretsen (engebret@ibm.com)
	*
	* This program is free software; you can redistribute it and/or
	* modify it under the terms of the GNU General Public License
	* as published by the Free Software Foundation; either version
	* 2 of the License, or (at your option) any later version.
	*/

	#include <linux/config.h>
	#include <linux/signal.h>
	#include <linux/sched.h>
	#include <linux/kernel.h>
	#include <linux/errno.h>
	#include <linux/string.h>
	#include <linux/types.h>
	#include <linux/mman.h>
	#include <linux/mm.h>
	#include <linux/interrupt.h>
	#include <linux/smp_lock.h>
	#include <linux/module.h>
	#include <linux/kprobes.h>

	#include <asm/page.h>
	#include <asm/pgtable.h>
	#include <asm/mmu.h>
	#include <asm/mmu_context.h>
	#include <asm/system.h>
	#include <asm/uaccess.h>
	#include <asm/kdebug.h>

	/*
	 * Decode the instruction at regs->nip and report whether it is a
	 * store using an update addressing form whose base register (rA)
	 * is r1, i.e. a store that also writes r1.
	 *
	 * Returns 1 for such an instruction and 0 otherwise, including the
	 * case where the instruction word cannot be fetched with get_user().
	 */
	static int store_updates_sp(struct pt_regs *regs)
	{
		unsigned int insn;
		unsigned int primary;
		unsigned int extended;

		if (get_user(insn, (unsigned int __user *)regs->nip))
			return 0;

		/* the rA field must name r1 */
		if (((insn >> 16) & 0x1f) != 1)
			return 0;

		primary = insn >> 26;

		/* D-form stores with update */
		if (primary == 37 ||	/* stwu */
		    primary == 39 ||	/* stbu */
		    primary == 45 ||	/* sthu */
		    primary == 53 ||	/* stfsu */
		    primary == 55)	/* stfdu */
			return 1;

		/* std and stdu share this opcode; the low two bits pick stdu */
		if (primary == 62)
			return (insn & 3) == 1;

		/* everything else of interest lives under major opcode 31 */
		if (primary != 31)
			return 0;

		/* indexed stores with update are told apart by the minor opcode */
		extended = (insn >> 1) & 0x3ff;
		switch (extended) {
		case 181:	/* stdux */
		case 183:	/* stwux */
		case 247:	/* stbux */
		case 439:	/* sthux */
		case 695:	/* stfsux */
		case 759:	/* stfdux */
			return 1;
		default:
			return 0;
		}
	}

	/*
	 * Handle a page fault taken at 'address'.
	 *
	 * The error_code parameter is
	 *  - DSISR for a non-SLB data access fault,
	 *  - SRR1 & 0x08000000 for a non-SLB instruction access fault
	 *  - 0 any SLB fault.
	 * The return value is 0 if the fault was handled (this includes the
	 * cases where a signal was queued to a user-mode task or a notifier /
	 * debugger hook claimed the fault), or the signal number if this is a
	 * kernel fault that can't be handled here.
	 *
	 * mm->mmap_sem is taken for reading before the VMA lookup and is
	 * dropped again on every path that leaves the function afterwards.
	 */
	int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
				    unsigned long error_code)
	{
		struct vm_area_struct *vma;
		struct mm_struct *mm = current->mm;
		siginfo_t info;
		/* si_code for a SIGSEGV; switched to SEGV_ACCERR once a VMA is found */
		unsigned long code = SEGV_MAPERR;
		unsigned long is_write = error_code & DSISR_ISSTORE;
		unsigned long trap = TRAP(regs);
		/* trap 0x400 is treated as an instruction fetch fault */
		unsigned long is_exec = trap == 0x400;

		/* traps 0x380 and 0x480 must never be routed to this handler */
		BUG_ON((trap == 0x380) || (trap == 0x480));

		/* give registered die notifiers first refusal */
		if (notify_die(DIE_PAGE_FAULT, "page_fault", regs, error_code,
					11, SIGSEGV) == NOTIFY_STOP)
			return 0;

		if (trap == 0x300) {
			if (debugger_fault_handler(regs))
				return 0;
		}

		/* On a kernel SLB miss we can only check for a valid exception entry */
		if (!user_mode(regs) && (address >= TASK_SIZE))
			return SIGSEGV;

		if (error_code & DSISR_DABRMATCH) {
			if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
						11, SIGSEGV) == NOTIFY_STOP)
				return 0;
			if (debugger_dabr_match(regs))
				return 0;
		}

		if (in_atomic() || mm == NULL) {
			if (!user_mode(regs))
				return SIGSEGV;
			/* in_atomic() in user mode is really bad,
			   as is current->mm == NULL. */
			/*
			 * The two literals below are concatenated by the compiler,
			 * so the first one needs its trailing space; without it the
			 * message reads "...user mode within_atomic() = ...".
			 */
			printk(KERN_EMERG "Page fault in user mode with "
			       "in_atomic() = %d mm = %p\n", in_atomic(), mm);
			printk(KERN_EMERG "NIP = %lx MSR = %lx\n",
			       regs->nip, regs->msr);
			die("Weird page fault", regs, SIGSEGV);
		}

		/* When running in the kernel we expect faults to occur only to
		 * addresses in user space. All other faults represent errors in the
		 * kernel and should generate an OOPS. Unfortunately, in the case of an
		 * erroneous fault occurring in a code path which already holds mmap_sem
		 * we will deadlock attempting to validate the fault against the
		 * address space. Luckily the kernel only validly references user
		 * space from well defined areas of code, which are listed in the
		 * exceptions table.
		 *
		 * As the vast majority of faults will be valid we will only perform
		 * the source reference check when there is a possibility of a deadlock.
		 * Attempt to lock the address space, if we cannot we then validate the
		 * source. If this is invalid we can skip the address space check,
		 * thus avoiding the deadlock.
		 */
		if (!down_read_trylock(&mm->mmap_sem)) {
			if (!user_mode(regs) && !search_exception_tables(regs->nip))
				goto bad_area_nosemaphore;

			down_read(&mm->mmap_sem);
		}

		vma = find_vma(mm, address);
		if (!vma)
			goto bad_area;

		if (vma->vm_start <= address) {
			goto good_area;
		}
		/* address lies below the VMA: only a grows-down VMA may be extended */
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto bad_area;

		/*
		 * N.B. The POWER/Open ABI allows programs to access up to
		 * 288 bytes below the stack pointer.
		 * The kernel signal delivery code writes up to about 1.5kB
		 * below the stack pointer (r1) before decrementing it.
		 * The exec code can write slightly over 640kB to the stack
		 * before setting the user r1. Thus we allow the stack to
		 * expand to 1MB without further checks.
		 */
		if (address + 0x100000 < vma->vm_end) {
			/* get user regs even if this fault is in kernel mode */
			struct pt_regs *uregs = current->thread.regs;
			if (uregs == NULL)
				goto bad_area;

			/*
			 * A user-mode access to an address a long way below
			 * the stack pointer is only valid if the instruction
			 * is one which would update the stack pointer to the
			 * address accessed if the instruction completed,
			 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
			 * (or the byte, halfword, float or double forms).
			 *
			 * If we don't check this then any write to the area
			 * between the last mapped region and the stack will
			 * expand the stack rather than segfaulting.
			 */
			if (address + 2048 < uregs->gpr[1]
			    && (!user_mode(regs) || !store_updates_sp(regs)))
				goto bad_area;
		}

		if (expand_stack(vma, address))
			goto bad_area;

	good_area:
		/* a mapping exists, so any failure from here on is an access error */
		code = SEGV_ACCERR;

		if (is_exec) {
			/* protection fault */
			if (error_code & DSISR_PROTFAULT)
				goto bad_area;
			if (!(vma->vm_flags & VM_EXEC))
				goto bad_area;
		/* a write */
		} else if (is_write) {
			if (!(vma->vm_flags & VM_WRITE))
				goto bad_area;
		/* a read */
		} else {
			if (!(vma->vm_flags & VM_READ))
				goto bad_area;
		}

	survive:
		/*
		 * If for any reason at all we couldn't handle the fault,
		 * make sure we exit gracefully rather than endlessly redo
		 * the fault.
		 */
		switch (handle_mm_fault(mm, vma, address, is_write)) {

		case VM_FAULT_MINOR:
			current->min_flt++;
			break;
		case VM_FAULT_MAJOR:
			current->maj_flt++;
			break;
		case VM_FAULT_SIGBUS:
			goto do_sigbus;
		case VM_FAULT_OOM:
			goto out_of_memory;
		default:
			BUG();
		}

		up_read(&mm->mmap_sem);
		return 0;

	bad_area:
		up_read(&mm->mmap_sem);

	bad_area_nosemaphore:
		/* User mode accesses cause a SIGSEGV */
		if (user_mode(regs)) {
			info.si_signo = SIGSEGV;
			info.si_errno = 0;
			info.si_code = code;
			info.si_addr = (void __user *) address;
			force_sig_info(SIGSEGV, &info, current);
			return 0;
		}

		if (trap == 0x400 && (error_code & DSISR_PROTFAULT)
		    && printk_ratelimit())
			printk(KERN_CRIT "kernel tried to execute NX-protected"
			       " page (%lx) - exploit attempt? (uid: %d)\n",
			       address, current->uid);

		/* kernel-mode fault: hand the signal number back to the caller */
		return SIGSEGV;

	/*
	 * We ran out of memory, or some other thing happened to us that made
	 * us unable to handle the page fault gracefully.
	 */
	out_of_memory:
		up_read(&mm->mmap_sem);
		/* pid 1 is never killed here: yield, retake the lock and retry */
		if (current->pid == 1) {
			yield();
			down_read(&mm->mmap_sem);
			goto survive;
		}
		printk("VM: killing process %s\n", current->comm);
		if (user_mode(regs))
			do_exit(SIGKILL);
		return SIGKILL;

	do_sigbus:
		up_read(&mm->mmap_sem);
		if (user_mode(regs)) {
			info.si_signo = SIGBUS;
			info.si_errno = 0;
			info.si_code = BUS_ADRERR;
			info.si_addr = (void __user *)address;
			force_sig_info(SIGBUS, &info, current);
			return 0;
		}
		return SIGBUS;
	}

	/*
	 * bad_page_fault deals with a bad access made by the kernel.
	 *
	 * If the faulting instruction (regs->nip) has an entry in the
	 * exception tables, execution is redirected to that entry's fixup
	 * address.  Otherwise the fault is reported through die() with the
	 * supplied signal number.  The 'address' argument is not used here.
	 *
	 * NOTE(review): do_page_fault above does not call this directly; it
	 * returns a signal number, which its caller presumably passes in as
	 * 'sig'.  Other callers are said to live in traps.c -- not visible
	 * from this file.
	 */
	void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
	{
		const struct exception_table_entry *fixup_entry =
			search_exception_tables(regs->nip);

		if (fixup_entry == NULL) {
			/* no fixup registered: kernel has accessed a bad area */
			die("Kernel access of bad area", regs, sig);
			return;
		}

		/* resume at the registered fixup instead of the faulting insn */
		regs->nip = fixup_entry->fixup;
	}