summaryrefslogtreecommitdiff
path: root/arch/powerpc/mm
diff options
context:
space:
mode:
authorPaul Mackerras <paulus@samba.org>2005-09-26 16:04:21 +1000
committerPaul Mackerras <paulus@samba.org>2005-09-26 16:04:21 +1000
commit14cf11af6cf608eb8c23e989ddb17a715ddce109 (patch)
tree271a97ce73e265f39c569cb159c195c5b4bb3f8c /arch/powerpc/mm
parente5baa396af7560382d2cf3f0871d616b61fc284c (diff)
powerpc: Merge enough to start building in arch/powerpc.
This creates the directory structure under arch/powerpc and a bunch of Kconfig files. It does a first-cut merge of arch/powerpc/mm, arch/powerpc/lib and arch/powerpc/platforms/powermac. This is enough to build a 32-bit powermac kernel with ARCH=powerpc. For now we are getting some unmerged files from arch/ppc/kernel and arch/ppc/syslib, or arch/ppc64/kernel. This makes some minor changes to files in those directories and files outside arch/powerpc. The boot directory is still not merged. That's going to be interesting. Signed-off-by: Paul Mackerras <paulus@samba.org>
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--arch/powerpc/mm/44x_mmu.c120
-rw-r--r--arch/powerpc/mm/4xx_mmu.c141
-rw-r--r--arch/powerpc/mm/Makefile12
-rw-r--r--arch/powerpc/mm/fault.c391
-rw-r--r--arch/powerpc/mm/fsl_booke_mmu.c237
-rw-r--r--arch/powerpc/mm/hash_32.S618
-rw-r--r--arch/powerpc/mm/init.c581
-rw-r--r--arch/powerpc/mm/init64.c385
-rw-r--r--arch/powerpc/mm/mem.c299
-rw-r--r--arch/powerpc/mm/mem64.c259
-rw-r--r--arch/powerpc/mm/mem_pieces.c163
-rw-r--r--arch/powerpc/mm/mem_pieces.h48
-rw-r--r--arch/powerpc/mm/mmu_context.c86
-rw-r--r--arch/powerpc/mm/mmu_context64.c63
-rw-r--r--arch/powerpc/mm/mmu_decl.h85
-rw-r--r--arch/powerpc/mm/pgtable.c470
-rw-r--r--arch/powerpc/mm/pgtable64.c357
-rw-r--r--arch/powerpc/mm/ppc_mmu.c296
-rw-r--r--arch/powerpc/mm/tlb.c183
19 files changed, 4794 insertions, 0 deletions
diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c
new file mode 100644
index 000000000000..3d79ce281b67
--- /dev/null
+++ b/arch/powerpc/mm/44x_mmu.c
@@ -0,0 +1,120 @@
+/*
+ * Modifications by Matt Porter (mporter@mvista.com) to support
+ * PPC44x Book E processors.
+ *
+ * This file contains the routines for initializing the MMU
+ * on the 4xx series of chips.
+ * -- paulus
+ *
+ * Derived from arch/ppc/mm/init.c:
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/stddef.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/highmem.h>
+
+#include <asm/pgalloc.h>
+#include <asm/prom.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/uaccess.h>
+#include <asm/smp.h>
+#include <asm/bootx.h>
+#include <asm/machdep.h>
+#include <asm/setup.h>
+
+#include "mmu_decl.h"
+
+extern char etext[], _stext[];
+
+/* Used by the 44x TLB replacement exception handler.
+ * Just needed it declared someplace.
+ */
+unsigned int tlb_44x_index = 0;
+unsigned int tlb_44x_hwater = 62;
+
+/*
+ * "Pins" a 256MB TLB entry in AS0 for kernel lowmem
+ */
+static void __init
+ppc44x_pin_tlb(int slot, unsigned int virt, unsigned int phys)
+{
+ unsigned long attrib = 0;
+
+ __asm__ __volatile__("\
+ clrrwi %2,%2,10\n\
+ ori %2,%2,%4\n\
+ clrrwi %1,%1,10\n\
+ li %0,0\n\
+ ori %0,%0,%5\n\
+ tlbwe %2,%3,%6\n\
+ tlbwe %1,%3,%7\n\
+ tlbwe %0,%3,%8"
+ :
+ : "r" (attrib), "r" (phys), "r" (virt), "r" (slot),
+ "i" (PPC44x_TLB_VALID | PPC44x_TLB_256M),
+ "i" (PPC44x_TLB_SW | PPC44x_TLB_SR | PPC44x_TLB_SX | PPC44x_TLB_G),
+ "i" (PPC44x_TLB_PAGEID),
+ "i" (PPC44x_TLB_XLAT),
+ "i" (PPC44x_TLB_ATTRIB));
+}
+
+/*
+ * MMU_init_hw does the chip-specific initialization of the MMU hardware.
+ */
+void __init MMU_init_hw(void)
+{
+ flush_instruction_cache();
+}
+
+unsigned long __init mmu_mapin_ram(void)
+{
+ unsigned int pinned_tlbs = 1;
+ int i;
+
+ /* Determine number of entries necessary to cover lowmem */
+ pinned_tlbs = (unsigned int)
+ (_ALIGN(total_lowmem, PPC44x_PIN_SIZE) >> PPC44x_PIN_SHIFT);
+
+ /* Write upper watermark to save location */
+ tlb_44x_hwater = PPC44x_LOW_SLOT - pinned_tlbs;
+
+ /* If necessary, set additional pinned TLBs */
+ if (pinned_tlbs > 1)
+ for (i = (PPC44x_LOW_SLOT-(pinned_tlbs-1)); i < PPC44x_LOW_SLOT; i++) {
+ unsigned int phys_addr = (PPC44x_LOW_SLOT-i) * PPC44x_PIN_SIZE;
+ ppc44x_pin_tlb(i, phys_addr+PAGE_OFFSET, phys_addr);
+ }
+
+ return total_lowmem;
+}
diff --git a/arch/powerpc/mm/4xx_mmu.c b/arch/powerpc/mm/4xx_mmu.c
new file mode 100644
index 000000000000..b7bcbc232f39
--- /dev/null
+++ b/arch/powerpc/mm/4xx_mmu.c
@@ -0,0 +1,141 @@
+/*
+ * This file contains the routines for initializing the MMU
+ * on the 4xx series of chips.
+ * -- paulus
+ *
+ * Derived from arch/ppc/mm/init.c:
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/stddef.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/highmem.h>
+
+#include <asm/pgalloc.h>
+#include <asm/prom.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/uaccess.h>
+#include <asm/smp.h>
+#include <asm/bootx.h>
+#include <asm/machdep.h>
+#include <asm/setup.h>
+#include "mmu_decl.h"
+
+extern int __map_without_ltlbs;
+/*
+ * MMU_init_hw does the chip-specific initialization of the MMU hardware.
+ */
+void __init MMU_init_hw(void)
+{
+ /*
+ * The Zone Protection Register (ZPR) defines how protection will
+ * be applied to every page which is a member of a given zone. At
+ * present, we utilize only two of the 4xx's zones.
+ * The zone index bits (of ZSEL) in the PTE are used for software
+ * indicators, except the LSB. For user access, zone 1 is used,
+ * for kernel access, zone 0 is used. We set all but zone 1
+ * to zero, allowing only kernel access as indicated in the PTE.
+ * For zone 1, we set a 01 binary (a value of 10 will not work)
+ * to allow user access as indicated in the PTE. This also allows
+ * kernel access as indicated in the PTE.
+ */
+
+ mtspr(SPRN_ZPR, 0x10000000);
+
+ flush_instruction_cache();
+
+ /*
+ * Set up the real-mode cache parameters for the exception vector
+ * handlers (which are run in real-mode).
+ */
+
+ mtspr(SPRN_DCWR, 0x00000000); /* All caching is write-back */
+
+ /*
+ * Cache instruction and data space where the exception
+ * vectors and the kernel live in real-mode.
+ */
+
+ mtspr(SPRN_DCCR, 0xF0000000); /* 512 MB of data space at 0x0. */
+ mtspr(SPRN_ICCR, 0xF0000000); /* 512 MB of instr. space at 0x0. */
+}
+
+#define LARGE_PAGE_SIZE_16M (1<<24)
+#define LARGE_PAGE_SIZE_4M (1<<22)
+
+unsigned long __init mmu_mapin_ram(void)
+{
+ unsigned long v, s;
+ phys_addr_t p;
+
+ v = KERNELBASE;
+ p = PPC_MEMSTART;
+ s = 0;
+
+ if (__map_without_ltlbs) {
+ return s;
+ }
+
+ while (s <= (total_lowmem - LARGE_PAGE_SIZE_16M)) {
+ pmd_t *pmdp;
+ unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE;
+
+ spin_lock(&init_mm.page_table_lock);
+ pmdp = pmd_offset(pgd_offset_k(v), v);
+ pmd_val(*pmdp++) = val;
+ pmd_val(*pmdp++) = val;
+ pmd_val(*pmdp++) = val;
+ pmd_val(*pmdp++) = val;
+ spin_unlock(&init_mm.page_table_lock);
+
+ v += LARGE_PAGE_SIZE_16M;
+ p += LARGE_PAGE_SIZE_16M;
+ s += LARGE_PAGE_SIZE_16M;
+ }
+
+ while (s <= (total_lowmem - LARGE_PAGE_SIZE_4M)) {
+ pmd_t *pmdp;
+ unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE;
+
+ spin_lock(&init_mm.page_table_lock);
+ pmdp = pmd_offset(pgd_offset_k(v), v);
+ pmd_val(*pmdp) = val;
+ spin_unlock(&init_mm.page_table_lock);
+
+ v += LARGE_PAGE_SIZE_4M;
+ p += LARGE_PAGE_SIZE_4M;
+ s += LARGE_PAGE_SIZE_4M;
+ }
+
+ return s;
+}
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
new file mode 100644
index 000000000000..9f52c26acd86
--- /dev/null
+++ b/arch/powerpc/mm/Makefile
@@ -0,0 +1,12 @@
+#
+# Makefile for the linux ppc-specific parts of the memory manager.
+#
+
+obj-y := fault.o mem.o
+obj-$(CONFIG_PPC32) += init.o pgtable.o mmu_context.o \
+ mem_pieces.o tlb.o
+obj-$(CONFIG_PPC64) += init64.o pgtable64.o mmu_context64.o
+obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu.o hash_32.o
+obj-$(CONFIG_40x) += 4xx_mmu.o
+obj-$(CONFIG_44x) += 44x_mmu.o
+obj-$(CONFIG_FSL_BOOKE) += fsl_booke_mmu.o
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
new file mode 100644
index 000000000000..3df641fa789d
--- /dev/null
+++ b/arch/powerpc/mm/fault.c
@@ -0,0 +1,391 @@
+/*
+ * arch/ppc/mm/fault.c
+ *
+ * PowerPC version
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Derived from "arch/i386/mm/fault.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * Modified by Cort Dougan and Paul Mackerras.
+ *
+ * Modified for PPC64 by Dave Engebretsen (engebret@ibm.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/kprobes.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/tlbflush.h>
+#include <asm/kdebug.h>
+#include <asm/siginfo.h>
+
+/*
+ * Check whether the instruction at regs->nip is a store using
+ * an update addressing form which will update r1.
+ */
+static int store_updates_sp(struct pt_regs *regs)
+{
+ unsigned int inst;
+
+ if (get_user(inst, (unsigned int __user *)regs->nip))
+ return 0;
+ /* check for 1 in the rA field */
+ if (((inst >> 16) & 0x1f) != 1)
+ return 0;
+ /* check major opcode */
+ switch (inst >> 26) {
+ case 37: /* stwu */
+ case 39: /* stbu */
+ case 45: /* sthu */
+ case 53: /* stfsu */
+ case 55: /* stfdu */
+ return 1;
+ case 62: /* std or stdu */
+ return (inst & 3) == 1;
+ case 31:
+ /* check minor opcode */
+ switch ((inst >> 1) & 0x3ff) {
+ case 181: /* stdux */
+ case 183: /* stwux */
+ case 247: /* stbux */
+ case 439: /* sthux */
+ case 695: /* stfsux */
+ case 759: /* stfdux */
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static void do_dabr(struct pt_regs *regs, unsigned long error_code)
+{
+ siginfo_t info;
+
+ if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
+ 11, SIGSEGV) == NOTIFY_STOP)
+ return;
+
+ if (debugger_dabr_match(regs))
+ return;
+
+ /* Clear the DABR */
+ set_dabr(0);
+
+ /* Deliver the signal to userspace */
+ info.si_signo = SIGTRAP;
+ info.si_errno = 0;
+ info.si_code = TRAP_HWBKPT;
+ info.si_addr = (void __user *)regs->nip;
+ force_sig_info(SIGTRAP, &info, current);
+}
+
+/*
+ * For 600- and 800-family processors, the error_code parameter is DSISR
+ * for a data fault, SRR1 for an instruction fault. For 400-family processors
+ * the error_code parameter is ESR for a data fault, 0 for an instruction
+ * fault.
+ * For 64-bit processors, the error_code parameter is
+ * - DSISR for a non-SLB data access fault,
+ * - SRR1 & 0x08000000 for a non-SLB instruction access fault
+ * - 0 any SLB fault.
+ *
+ * The return value is 0 if the fault was handled, or the signal
+ * number if this is a kernel fault that can't be handled here.
+ */
+int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
+ unsigned long error_code)
+{
+ struct vm_area_struct * vma;
+ struct mm_struct *mm = current->mm;
+ siginfo_t info;
+ int code = SEGV_MAPERR;
+ int is_write = 0;
+ int trap = TRAP(regs);
+ int is_exec = trap == 0x400;
+
+#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
+ /*
+ * Fortunately the bit assignments in SRR1 for an instruction
+ * fault and DSISR for a data fault are mostly the same for the
+ * bits we are interested in. But there are some bits which
+ * indicate errors in DSISR but can validly be set in SRR1.
+ */
+ if (trap == 0x400)
+ error_code &= 0x48200000;
+ else
+ is_write = error_code & DSISR_ISSTORE;
+#else
+ is_write = error_code & ESR_DST;
+#endif /* CONFIG_4xx || CONFIG_BOOKE */
+
+ if (notify_die(DIE_PAGE_FAULT, "page_fault", regs, error_code,
+ 11, SIGSEGV) == NOTIFY_STOP)
+ return 0;
+
+ if (trap == 0x300) {
+ if (debugger_fault_handler(regs))
+ return 0;
+ }
+
+ /* On a kernel SLB miss we can only check for a valid exception entry */
+ if (!user_mode(regs) && (address >= TASK_SIZE))
+ return SIGSEGV;
+
+#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
+ if (error_code & DSISR_DABRMATCH) {
+ /* DABR match */
+ do_dabr(regs, error_code);
+ return 0;
+ }
+#endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/
+
+ if (in_atomic() || mm == NULL) {
+ if (!user_mode(regs))
+ return SIGSEGV;
+ /* in_atomic() in user mode is really bad,
+ as is current->mm == NULL. */
+ printk(KERN_EMERG "Page fault in user mode with"
+ "in_atomic() = %d mm = %p\n", in_atomic(), mm);
+ printk(KERN_EMERG "NIP = %lx MSR = %lx\n",
+ regs->nip, regs->msr);
+ die("Weird page fault", regs, SIGSEGV);
+ }
+
+ /* When running in the kernel we expect faults to occur only to
+ * addresses in user space. All other faults represent errors in the
+ * kernel and should generate an OOPS. Unfortunatly, in the case of an
+ * erroneous fault occuring in a code path which already holds mmap_sem
+ * we will deadlock attempting to validate the fault against the
+ * address space. Luckily the kernel only validly references user
+ * space from well defined areas of code, which are listed in the
+ * exceptions table.
+ *
+ * As the vast majority of faults will be valid we will only perform
+ * the source reference check when there is a possibilty of a deadlock.
+ * Attempt to lock the address space, if we cannot we then validate the
+ * source. If this is invalid we can skip the address space check,
+ * thus avoiding the deadlock.
+ */
+ if (!down_read_trylock(&mm->mmap_sem)) {
+ if (!user_mode(regs) && !search_exception_tables(regs->nip))
+ goto bad_area_nosemaphore;
+
+ down_read(&mm->mmap_sem);
+ }
+
+ vma = find_vma(mm, address);
+ if (!vma)
+ goto bad_area;
+ if (vma->vm_start <= address)
+ goto good_area;
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto bad_area;
+
+ /*
+ * N.B. The POWER/Open ABI allows programs to access up to
+ * 288 bytes below the stack pointer.
+ * The kernel signal delivery code writes up to about 1.5kB
+ * below the stack pointer (r1) before decrementing it.
+ * The exec code can write slightly over 640kB to the stack
+ * before setting the user r1. Thus we allow the stack to
+ * expand to 1MB without further checks.
+ */
+ if (address + 0x100000 < vma->vm_end) {
+ /* get user regs even if this fault is in kernel mode */
+ struct pt_regs *uregs = current->thread.regs;
+ if (uregs == NULL)
+ goto bad_area;
+
+ /*
+ * A user-mode access to an address a long way below
+ * the stack pointer is only valid if the instruction
+ * is one which would update the stack pointer to the
+ * address accessed if the instruction completed,
+ * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
+ * (or the byte, halfword, float or double forms).
+ *
+ * If we don't check this then any write to the area
+ * between the last mapped region and the stack will
+ * expand the stack rather than segfaulting.
+ */
+ if (address + 2048 < uregs->gpr[1]
+ && (!user_mode(regs) || !store_updates_sp(regs)))
+ goto bad_area;
+ }
+ if (expand_stack(vma, address))
+ goto bad_area;
+
+good_area:
+ code = SEGV_ACCERR;
+#if defined(CONFIG_6xx)
+ if (error_code & 0x95700000)
+ /* an error such as lwarx to I/O controller space,
+ address matching DABR, eciwx, etc. */
+ goto bad_area;
+#endif /* CONFIG_6xx */
+#if defined(CONFIG_8xx)
+ /* The MPC8xx seems to always set 0x80000000, which is
+ * "undefined". Of those that can be set, this is the only
+ * one which seems bad.
+ */
+ if (error_code & 0x10000000)
+ /* Guarded storage error. */
+ goto bad_area;
+#endif /* CONFIG_8xx */
+
+ if (is_exec) {
+#ifdef CONFIG_PPC64
+ /* protection fault */
+ if (error_code & DSISR_PROTFAULT)
+ goto bad_area;
+ if (!(vma->vm_flags & VM_EXEC))
+ goto bad_area;
+#endif
+#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
+ pte_t *ptep;
+
+ /* Since 4xx/Book-E supports per-page execute permission,
+ * we lazily flush dcache to icache. */
+ ptep = NULL;
+ if (get_pteptr(mm, address, &ptep) && pte_present(*ptep)) {
+ struct page *page = pte_page(*ptep);
+
+ if (! test_bit(PG_arch_1, &page->flags)) {
+ flush_dcache_icache_page(page);
+ set_bit(PG_arch_1, &page->flags);
+ }
+ pte_update(ptep, 0, _PAGE_HWEXEC);
+ _tlbie(address);
+ pte_unmap(ptep);
+ up_read(&mm->mmap_sem);
+ return 0;
+ }
+ if (ptep != NULL)
+ pte_unmap(ptep);
+#endif
+ /* a write */
+ } else if (is_write) {
+ if (!(vma->vm_flags & VM_WRITE))
+ goto bad_area;
+ /* a read */
+ } else {
+ /* protection fault */
+ if (error_code & 0x08000000)
+ goto bad_area;
+ if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+ goto bad_area;
+ }
+
+ /*
+ * If for any reason at all we couldn't handle the fault,
+ * make sure we exit gracefully rather than endlessly redo
+ * the fault.
+ */
+ survive:
+ switch (handle_mm_fault(mm, vma, address, is_write)) {
+
+ case VM_FAULT_MINOR:
+ current->min_flt++;
+ break;
+ case VM_FAULT_MAJOR:
+ current->maj_flt++;
+ break;
+ case VM_FAULT_SIGBUS:
+ goto do_sigbus;
+ case VM_FAULT_OOM:
+ goto out_of_memory;
+ default:
+ BUG();
+ }
+
+ up_read(&mm->mmap_sem);
+ return 0;
+
+bad_area:
+ up_read(&mm->mmap_sem);
+
+bad_area_nosemaphore:
+ /* User mode accesses cause a SIGSEGV */
+ if (user_mode(regs)) {
+ _exception(SIGSEGV, regs, code, address);
+ return 0;
+ }
+
+ if (is_exec && (error_code & DSISR_PROTFAULT)
+ && printk_ratelimit())
+ printk(KERN_CRIT "kernel tried to execute NX-protected"
+ " page (%lx) - exploit attempt? (uid: %d)\n",
+ address, current->uid);
+
+ return SIGSEGV;
+
+/*
+ * We ran out of memory, or some other thing happened to us that made
+ * us unable to handle the page fault gracefully.
+ */
+out_of_memory:
+ up_read(&mm->mmap_sem);
+ if (current->pid == 1) {
+ yield();
+ down_read(&mm->mmap_sem);
+ goto survive;
+ }
+ printk("VM: killing process %s\n", current->comm);
+ if (user_mode(regs))
+ do_exit(SIGKILL);
+ return SIGKILL;
+
+do_sigbus:
+ up_read(&mm->mmap_sem);
+ if (user_mode(regs)) {
+ info.si_signo = SIGBUS;
+ info.si_errno = 0;
+ info.si_code = BUS_ADRERR;
+ info.si_addr = (void __user *)address;
+ force_sig_info(SIGBUS, &info, current);
+ return 0;
+ }
+ return SIGBUS;
+}
+
+/*
+ * bad_page_fault is called when we have a bad access from the kernel.
+ * It is called from the DSI and ISI handlers in head.S and from some
+ * of the procedures in traps.c.
+ */
+void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
+{
+ const struct exception_table_entry *entry;
+
+ /* Are we prepared to handle this fault? */
+ if ((entry = search_exception_tables(regs->nip)) != NULL) {
+ regs->nip = entry->fixup;
+ return;
+ }
+
+ /* kernel has accessed a bad area */
+ die("Kernel access of bad area", regs, sig);
+}
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c
new file mode 100644
index 000000000000..af9ca0eb6d55
--- /dev/null
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -0,0 +1,237 @@
+/*
+ * Modifications by Kumar Gala (kumar.gala@freescale.com) to support
+ * E500 Book E processors.
+ *
+ * Copyright 2004 Freescale Semiconductor, Inc
+ *
+ * This file contains the routines for initializing the MMU
+ * on the 4xx series of chips.
+ * -- paulus
+ *
+ * Derived from arch/ppc/mm/init.c:
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/stddef.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/highmem.h>
+
+#include <asm/pgalloc.h>
+#include <asm/prom.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/uaccess.h>
+#include <asm/smp.h>
+#include <asm/bootx.h>
+#include <asm/machdep.h>
+#include <asm/setup.h>
+
+extern void loadcam_entry(unsigned int index);
+unsigned int tlbcam_index;
+unsigned int num_tlbcam_entries;
+static unsigned long __cam0, __cam1, __cam2;
+extern unsigned long total_lowmem;
+extern unsigned long __max_low_memory;
+#define MAX_LOW_MEM CONFIG_LOWMEM_SIZE
+
+#define NUM_TLBCAMS (16)
+
+struct tlbcam {
+ u32 MAS0;
+ u32 MAS1;
+ u32 MAS2;
+ u32 MAS3;
+ u32 MAS7;
+} TLBCAM[NUM_TLBCAMS];
+
+struct tlbcamrange {
+ unsigned long start;
+ unsigned long limit;
+ phys_addr_t phys;
+} tlbcam_addrs[NUM_TLBCAMS];
+
+extern unsigned int tlbcam_index;
+
+/*
+ * Return PA for this VA if it is mapped by a CAM, or 0
+ */
+unsigned long v_mapped_by_tlbcam(unsigned long va)
+{
+ int b;
+ for (b = 0; b < tlbcam_index; ++b)
+ if (va >= tlbcam_addrs[b].start && va < tlbcam_addrs[b].limit)
+ return tlbcam_addrs[b].phys + (va - tlbcam_addrs[b].start);
+ return 0;
+}
+
+/*
+ * Return VA for a given PA or 0 if not mapped
+ */
+unsigned long p_mapped_by_tlbcam(unsigned long pa)
+{
+ int b;
+ for (b = 0; b < tlbcam_index; ++b)
+ if (pa >= tlbcam_addrs[b].phys
+ && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start)
+ +tlbcam_addrs[b].phys)
+ return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys);
+ return 0;
+}
+
+/*
+ * Set up one of the I/D BAT (block address translation) register pairs.
+ * The parameters are not checked; in particular size must be a power
+ * of 4 between 4k and 256M.
+ */
+void settlbcam(int index, unsigned long virt, phys_addr_t phys,
+ unsigned int size, int flags, unsigned int pid)
+{
+ unsigned int tsize, lz;
+
+ asm ("cntlzw %0,%1" : "=r" (lz) : "r" (size));
+ tsize = (21 - lz) / 2;
+
+#ifdef CONFIG_SMP
+ if ((flags & _PAGE_NO_CACHE) == 0)
+ flags |= _PAGE_COHERENT;
+#endif
+
+ TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index) | MAS0_NV(index+1);
+ TLBCAM[index].MAS1 = MAS1_VALID | MAS1_IPROT | MAS1_TSIZE(tsize) | MAS1_TID(pid);
+ TLBCAM[index].MAS2 = virt & PAGE_MASK;
+
+ TLBCAM[index].MAS2 |= (flags & _PAGE_WRITETHRU) ? MAS2_W : 0;
+ TLBCAM[index].MAS2 |= (flags & _PAGE_NO_CACHE) ? MAS2_I : 0;
+ TLBCAM[index].MAS2 |= (flags & _PAGE_COHERENT) ? MAS2_M : 0;
+ TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0;
+ TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0;
+
+ TLBCAM[index].MAS3 = (phys & PAGE_MASK) | MAS3_SX | MAS3_SR;
+ TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0);
+
+#ifndef CONFIG_KGDB /* want user access for breakpoints */
+ if (flags & _PAGE_USER) {
+ TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR;
+ TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0);
+ }
+#else
+ TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR;
+ TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0);
+#endif
+
+ tlbcam_addrs[index].start = virt;
+ tlbcam_addrs[index].limit = virt + size - 1;
+ tlbcam_addrs[index].phys = phys;
+
+ loadcam_entry(index);
+}
+
+void invalidate_tlbcam_entry(int index)
+{
+ TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index);
+ TLBCAM[index].MAS1 = ~MAS1_VALID;
+
+ loadcam_entry(index);
+}
+
+void __init cam_mapin_ram(unsigned long cam0, unsigned long cam1,
+ unsigned long cam2)
+{
+ settlbcam(0, KERNELBASE, PPC_MEMSTART, cam0, _PAGE_KERNEL, 0);
+ tlbcam_index++;
+ if (cam1) {
+ tlbcam_index++;
+ settlbcam(1, KERNELBASE+cam0, PPC_MEMSTART+cam0, cam1, _PAGE_KERNEL, 0);
+ }
+ if (cam2) {
+ tlbcam_index++;
+ settlbcam(2, KERNELBASE+cam0+cam1, PPC_MEMSTART+cam0+cam1, cam2, _PAGE_KERNEL, 0);
+ }
+}
+
+/*
+ * MMU_init_hw does the chip-specific initialization of the MMU hardware.
+ */
+void __init MMU_init_hw(void)
+{
+ flush_instruction_cache();
+}
+
+unsigned long __init mmu_mapin_ram(void)
+{
+ cam_mapin_ram(__cam0, __cam1, __cam2);
+
+ return __cam0 + __cam1 + __cam2;
+}
+
+
+void __init
+adjust_total_lowmem(void)
+{
+ unsigned long max_low_mem = MAX_LOW_MEM;
+ unsigned long cam_max = 0x10000000;
+ unsigned long ram;
+
+ /* adjust CAM size to max_low_mem */
+ if (max_low_mem < cam_max)
+ cam_max = max_low_mem;
+
+ /* adjust lowmem size to max_low_mem */
+ if (max_low_mem < total_lowmem)
+ ram = max_low_mem;
+ else
+ ram = total_lowmem;
+
+ /* Calculate CAM values */
+ __cam0 = 1UL << 2 * (__ilog2(ram) / 2);
+ if (__cam0 > cam_max)
+ __cam0 = cam_max;
+ ram -= __cam0;
+ if (ram) {
+ __cam1 = 1UL << 2 * (__ilog2(ram) / 2);
+ if (__cam1 > cam_max)
+ __cam1 = cam_max;
+ ram -= __cam1;
+ }
+ if (ram) {
+ __cam2 = 1UL << 2 * (__ilog2(ram) / 2);
+ if (__cam2 > cam_max)
+ __cam2 = cam_max;
+ ram -= __cam2;
+ }
+
+ printk(KERN_INFO "Memory CAM mapping: CAM0=%ldMb, CAM1=%ldMb,"
+ " CAM2=%ldMb residual: %ldMb\n",
+ __cam0 >> 20, __cam1 >> 20, __cam2 >> 20,
+ (total_lowmem - __cam0 - __cam1 - __cam2) >> 20);
+ __max_low_memory = max_low_mem = __cam0 + __cam1 + __cam2;
+}
diff --git a/arch/powerpc/mm/hash_32.S b/arch/powerpc/mm/hash_32.S
new file mode 100644
index 000000000000..57278a8dd132
--- /dev/null
+++ b/arch/powerpc/mm/hash_32.S
@@ -0,0 +1,618 @@
+/*
+ * arch/ppc/kernel/hashtable.S
+ *
+ * $Id: hashtable.S,v 1.6 1999/10/08 01:56:15 paulus Exp $
+ *
+ * PowerPC version
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ * Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP
+ * Copyright (C) 1996 Cort Dougan <cort@cs.nmt.edu>
+ * Adapted for Power Macintosh by Paul Mackerras.
+ * Low-level exception handlers and MMU support
+ * rewritten by Paul Mackerras.
+ * Copyright (C) 1996 Paul Mackerras.
+ *
+ * This file contains low-level assembler routines for managing
+ * the PowerPC MMU hash table. (PPC 8xx processors don't use a
+ * hash table, so this file is not used on them.)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/cputable.h>
+#include <asm/ppc_asm.h>
+#include <asm/thread_info.h>
+#include <asm/asm-offsets.h>
+
+#ifdef CONFIG_SMP
+ .comm mmu_hash_lock,4
+#endif /* CONFIG_SMP */
+
+/*
+ * Sync CPUs with hash_page taking & releasing the hash
+ * table lock
+ */
+#ifdef CONFIG_SMP
+ .text
+_GLOBAL(hash_page_sync)
+ lis r8,mmu_hash_lock@h
+ ori r8,r8,mmu_hash_lock@l
+ lis r0,0x0fff
+ b 10f
+11: lwz r6,0(r8)
+ cmpwi 0,r6,0
+ bne 11b
+10: lwarx r6,0,r8
+ cmpwi 0,r6,0
+ bne- 11b
+ stwcx. r0,0,r8
+ bne- 10b
+ isync
+ eieio
+ li r0,0
+ stw r0,0(r8)
+ blr
+#endif
+
+/*
+ * Load a PTE into the hash table, if possible.
+ * The address is in r4, and r3 contains an access flag:
+ * _PAGE_RW (0x400) if a write.
+ * r9 contains the SRR1 value, from which we use the MSR_PR bit.
+ * SPRG3 contains the physical address of the current task's thread.
+ *
+ * Returns to the caller if the access is illegal or there is no
+ * mapping for the address. Otherwise it places an appropriate PTE
+ * in the hash table and returns from the exception.
+ * Uses r0, r3 - r8, ctr, lr.
+ */
+ .text
+_GLOBAL(hash_page)
+#ifdef CONFIG_PPC64BRIDGE
+ mfmsr r0
+ clrldi r0,r0,1 /* make sure it's in 32-bit mode */
+ MTMSRD(r0)
+ isync
+#endif
+ tophys(r7,0) /* gets -KERNELBASE into r7 */
+#ifdef CONFIG_SMP
+ addis r8,r7,mmu_hash_lock@h
+ ori r8,r8,mmu_hash_lock@l
+ lis r0,0x0fff
+ b 10f
+11: lwz r6,0(r8)
+ cmpwi 0,r6,0
+ bne 11b
+10: lwarx r6,0,r8
+ cmpwi 0,r6,0
+ bne- 11b
+ stwcx. r0,0,r8
+ bne- 10b
+ isync
+#endif
+ /* Get PTE (linux-style) and check access */
+ lis r0,KERNELBASE@h /* check if kernel address */
+ cmplw 0,r4,r0
+ mfspr r8,SPRN_SPRG3 /* current task's THREAD (phys) */
+ ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */
+ lwz r5,PGDIR(r8) /* virt page-table root */
+ blt+ 112f /* assume user more likely */
+ lis r5,swapper_pg_dir@ha /* if kernel address, use */
+ addi r5,r5,swapper_pg_dir@l /* kernel page table */
+ rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */
+112: add r5,r5,r7 /* convert to phys addr */
+ rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */
+ lwz r8,0(r5) /* get pmd entry */
+ rlwinm. r8,r8,0,0,19 /* extract address of pte page */
+#ifdef CONFIG_SMP
+ beq- hash_page_out /* return if no mapping */
+#else
+ /* XXX it seems like the 601 will give a machine fault on the
+ rfi if its alignment is wrong (bottom 4 bits of address are
+ 8 or 0xc) and we have had a not-taken conditional branch
+ to the address following the rfi. */
+ beqlr-
+#endif
+ rlwimi r8,r4,22,20,29 /* insert next 10 bits of address */
+ rlwinm r0,r3,32-3,24,24 /* _PAGE_RW access -> _PAGE_DIRTY */
+ ori r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE
+
+ /*
+ * Update the linux PTE atomically. We do the lwarx up-front
+ * because almost always, there won't be a permission violation
+ * and there won't already be an HPTE, and thus we will have
+ * to update the PTE to set _PAGE_HASHPTE. -- paulus.
+ */
+retry:
+ lwarx r6,0,r8 /* get linux-style pte */
+ andc. r5,r3,r6 /* check access & ~permission */
+#ifdef CONFIG_SMP
+ bne- hash_page_out /* return if access not permitted */
+#else
+ bnelr-
+#endif
+ or r5,r0,r6 /* set accessed/dirty bits */
+ stwcx. r5,0,r8 /* attempt to update PTE */
+ bne- retry /* retry if someone got there first */
+
+ mfsrin r3,r4 /* get segment reg for segment */
+ mfctr r0
+ stw r0,_CTR(r11)
+ bl create_hpte /* add the hash table entry */
+
+#ifdef CONFIG_SMP
+ eieio
+ addis r8,r7,mmu_hash_lock@ha
+ li r0,0
+ stw r0,mmu_hash_lock@l(r8)
+#endif
+
+ /* Return from the exception */
+ lwz r5,_CTR(r11)
+ mtctr r5
+ lwz r0,GPR0(r11)
+ lwz r7,GPR7(r11)
+ lwz r8,GPR8(r11)
+ b fast_exception_return
+
+#ifdef CONFIG_SMP
+hash_page_out:
+ eieio
+ addis r8,r7,mmu_hash_lock@ha
+ li r0,0
+ stw r0,mmu_hash_lock@l(r8)
+ blr
+#endif /* CONFIG_SMP */
+
+/*
+ * Add an entry for a particular page to the hash table.
+ *
+ * add_hash_page(unsigned context, unsigned long va, unsigned long pmdval)
+ *
+ * We assume any necessary modifications to the pte (e.g. setting
+ * the accessed bit) have already been done and that there is actually
+ * a hash table in use (i.e. we're not on a 603).
+ */
+_GLOBAL(add_hash_page)
+ mflr r0
+ stw r0,4(r1)
+
+ /* Convert context and va to VSID */
+ mulli r3,r3,897*16 /* multiply context by context skew */
+ rlwinm r0,r4,4,28,31 /* get ESID (top 4 bits of va) */
+ mulli r0,r0,0x111 /* multiply by ESID skew */
+ add r3,r3,r0 /* note create_hpte trims to 24 bits */
+
+#ifdef CONFIG_SMP
+ rlwinm r8,r1,0,0,18 /* use cpu number to make tag */
+ lwz r8,TI_CPU(r8) /* to go in mmu_hash_lock */
+ oris r8,r8,12
+#endif /* CONFIG_SMP */
+
+ /*
+ * We disable interrupts here, even on UP, because we don't
+ * want to race with hash_page, and because we want the
+ * _PAGE_HASHPTE bit to be a reliable indication of whether
+ * the HPTE exists (or at least whether one did once).
+ * We also turn off the MMU for data accesses so that we
+ * we can't take a hash table miss (assuming the code is
+ * covered by a BAT). -- paulus
+ */
+ mfmsr r10
+ SYNC
+ rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */
+ rlwinm r0,r0,0,28,26 /* clear MSR_DR */
+ mtmsr r0
+ SYNC_601
+ isync
+
+ tophys(r7,0)
+
+#ifdef CONFIG_SMP
+ addis r9,r7,mmu_hash_lock@ha
+ addi r9,r9,mmu_hash_lock@l
+10: lwarx r0,0,r9 /* take the mmu_hash_lock */
+ cmpi 0,r0,0
+ bne- 11f
+ stwcx. r8,0,r9
+ beq+ 12f
+11: lwz r0,0(r9)
+ cmpi 0,r0,0
+ beq 10b
+ b 11b
+12: isync
+#endif
+
+ /*
+ * Fetch the linux pte and test and set _PAGE_HASHPTE atomically.
+ * If _PAGE_HASHPTE was already set, we don't replace the existing
+ * HPTE, so we just unlock and return.
+ */
+ mr r8,r5
+ rlwimi r8,r4,22,20,29
+1: lwarx r6,0,r8
+ andi. r0,r6,_PAGE_HASHPTE
+ bne 9f /* if HASHPTE already set, done */
+ ori r5,r6,_PAGE_HASHPTE
+ stwcx. r5,0,r8
+ bne- 1b
+
+ bl create_hpte
+
+9:
+#ifdef CONFIG_SMP
+ eieio
+ li r0,0
+ stw r0,0(r9) /* clear mmu_hash_lock */
+#endif
+
+ /* reenable interrupts and DR */
+ mtmsr r10
+ SYNC_601
+ isync
+
+ lwz r0,4(r1)
+ mtlr r0
+ blr
+
+/*
+ * This routine adds a hardware PTE to the hash table.
+ * It is designed to be called with the MMU either on or off.
+ * r3 contains the VSID, r4 contains the virtual address,
+ * r5 contains the linux PTE, r6 contains the old value of the
+ * linux PTE (before setting _PAGE_HASHPTE) and r7 contains the
+ * offset to be added to addresses (0 if the MMU is on,
+ * -KERNELBASE if it is off).
+ * On SMP, the caller should have the mmu_hash_lock held.
+ * We assume that the caller has (or will) set the _PAGE_HASHPTE
+ * bit in the linux PTE in memory. The value passed in r6 should
+ * be the old linux PTE value; if it doesn't have _PAGE_HASHPTE set
+ * this routine will skip the search for an existing HPTE.
+ * This procedure modifies r0, r3 - r6, r8, cr0.
+ * -- paulus.
+ *
+ * For speed, 4 of the instructions get patched once the size and
+ * physical address of the hash table are known. These definitions
+ * of Hash_base and Hash_bits below are just an example.
+ */
+Hash_base = 0xc0180000
+Hash_bits = 12 /* e.g. 256kB hash table */
+Hash_msk = (((1 << Hash_bits) - 1) * 64)
+
+#ifndef CONFIG_PPC64BRIDGE
+/* defines for the PTE format for 32-bit PPCs */
+#define PTE_SIZE 8
+#define PTEG_SIZE 64
+#define LG_PTEG_SIZE 6
+#define LDPTEu lwzu
+#define STPTE stw
+#define CMPPTE cmpw
+#define PTE_H 0x40
+#define PTE_V 0x80000000
+#define TST_V(r) rlwinm. r,r,0,0,0
+#define SET_V(r) oris r,r,PTE_V@h
+#define CLR_V(r,t) rlwinm r,r,0,1,31
+
+#else
+/* defines for the PTE format for 64-bit PPCs */
+#define PTE_SIZE 16
+#define PTEG_SIZE 128
+#define LG_PTEG_SIZE 7
+#define LDPTEu ldu
+#define STPTE std
+#define CMPPTE cmpd
+#define PTE_H 2
+#define PTE_V 1
+#define TST_V(r) andi. r,r,PTE_V
+#define SET_V(r) ori r,r,PTE_V
+#define CLR_V(r,t) li t,PTE_V; andc r,r,t
+#endif /* CONFIG_PPC64BRIDGE */
+
+#define HASH_LEFT 31-(LG_PTEG_SIZE+Hash_bits-1)
+#define HASH_RIGHT 31-LG_PTEG_SIZE
+
+_GLOBAL(create_hpte)
+ /* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */
+ rlwinm r8,r5,32-10,31,31 /* _PAGE_RW -> PP lsb */
+ rlwinm r0,r5,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */
+ and r8,r8,r0 /* writable if _RW & _DIRTY */
+ rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */
+ rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */
+ ori r8,r8,0xe14 /* clear out reserved bits and M */
+ andc r8,r5,r8 /* PP = user? (rw&dirty? 2: 3): 0 */
+BEGIN_FTR_SECTION
+ ori r8,r8,_PAGE_COHERENT /* set M (coherence required) */
+END_FTR_SECTION_IFSET(CPU_FTR_NEED_COHERENT)
+
+ /* Construct the high word of the PPC-style PTE (r5) */
+#ifndef CONFIG_PPC64BRIDGE
+ rlwinm r5,r3,7,1,24 /* put VSID in 0x7fffff80 bits */
+ rlwimi r5,r4,10,26,31 /* put in API (abbrev page index) */
+#else /* CONFIG_PPC64BRIDGE */
+ clrlwi r3,r3,8 /* reduce vsid to 24 bits */
+ sldi r5,r3,12 /* shift vsid into position */
+ rlwimi r5,r4,16,20,24 /* put in API (abbrev page index) */
+#endif /* CONFIG_PPC64BRIDGE */
+ SET_V(r5) /* set V (valid) bit */
+
+ /* Get the address of the primary PTE group in the hash table (r3) */
+_GLOBAL(hash_page_patch_A)
+ addis r0,r7,Hash_base@h /* base address of hash table */
+ rlwimi r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */
+ rlwinm r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */
+ xor r3,r3,r0 /* make primary hash */
+ li r0,8 /* PTEs/group */
+
+ /*
+ * Test the _PAGE_HASHPTE bit in the old linux PTE, and skip the search
+ * if it is clear, meaning that the HPTE isn't there already...
+ */
+ andi. r6,r6,_PAGE_HASHPTE
+ beq+ 10f /* no PTE: go look for an empty slot */
+ tlbie r4
+
+ addis r4,r7,htab_hash_searches@ha
+ lwz r6,htab_hash_searches@l(r4)
+ addi r6,r6,1 /* count how many searches we do */
+ stw r6,htab_hash_searches@l(r4)
+
+ /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */
+ mtctr r0
+ addi r4,r3,-PTE_SIZE
+1: LDPTEu r6,PTE_SIZE(r4) /* get next PTE */
+ CMPPTE 0,r6,r5
+ bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */
+ beq+ found_slot
+
+ /* Search the secondary PTEG for a matching PTE */
+ ori r5,r5,PTE_H /* set H (secondary hash) bit */
+_GLOBAL(hash_page_patch_B)
+ xoris r4,r3,Hash_msk>>16 /* compute secondary hash */
+ xori r4,r4,(-PTEG_SIZE & 0xffff)
+ addi r4,r4,-PTE_SIZE
+ mtctr r0
+2: LDPTEu r6,PTE_SIZE(r4)
+ CMPPTE 0,r6,r5
+ bdnzf 2,2b
+ beq+ found_slot
+ xori r5,r5,PTE_H /* clear H bit again */
+
+ /* Search the primary PTEG for an empty slot */
+10: mtctr r0
+ addi r4,r3,-PTE_SIZE /* search primary PTEG */
+1: LDPTEu r6,PTE_SIZE(r4) /* get next PTE */
+ TST_V(r6) /* test valid bit */
+ bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */
+ beq+ found_empty
+
+ /* update counter of times that the primary PTEG is full */
+ addis r4,r7,primary_pteg_full@ha
+ lwz r6,primary_pteg_full@l(r4)
+ addi r6,r6,1
+ stw r6,primary_pteg_full@l(r4)
+
+ /* Search the secondary PTEG for an empty slot */
+ ori r5,r5,PTE_H /* set H (secondary hash) bit */
+_GLOBAL(hash_page_patch_C)
+ xoris r4,r3,Hash_msk>>16 /* compute secondary hash */
+ xori r4,r4,(-PTEG_SIZE & 0xffff)
+ addi r4,r4,-PTE_SIZE
+ mtctr r0
+2: LDPTEu r6,PTE_SIZE(r4)
+ TST_V(r6)
+ bdnzf 2,2b
+ beq+ found_empty
+ xori r5,r5,PTE_H /* clear H bit again */
+
+ /*
+ * Choose an arbitrary slot in the primary PTEG to overwrite.
+ * Since both the primary and secondary PTEGs are full, and we
+ * have no information that the PTEs in the primary PTEG are
+ * more important or useful than those in the secondary PTEG,
+ * and we know there is a definite (although small) speed
+ * advantage to putting the PTE in the primary PTEG, we always
+ * put the PTE in the primary PTEG.
+ */
+ addis r4,r7,next_slot@ha
+ lwz r6,next_slot@l(r4)
+ addi r6,r6,PTE_SIZE
+ andi. r6,r6,7*PTE_SIZE
+ stw r6,next_slot@l(r4)
+ add r4,r3,r6
+
+#ifndef CONFIG_SMP
+ /* Store PTE in PTEG */
+found_empty:
+ STPTE r5,0(r4)
+found_slot:
+ STPTE r8,PTE_SIZE/2(r4)
+
+#else /* CONFIG_SMP */
+/*
+ * Between the tlbie above and updating the hash table entry below,
+ * another CPU could read the hash table entry and put it in its TLB.
+ * There are 3 cases:
+ * 1. using an empty slot
+ * 2. updating an earlier entry to change permissions (i.e. enable write)
+ * 3. taking over the PTE for an unrelated address
+ *
+ * In each case it doesn't really matter if the other CPUs have the old
+ * PTE in their TLB. So we don't need to bother with another tlbie here,
+ * which is convenient as we've overwritten the register that had the
+ * address. :-) The tlbie above is mainly to make sure that this CPU comes
+ * and gets the new PTE from the hash table.
+ *
+ * We do however have to make sure that the PTE is never in an invalid
+ * state with the V bit set.
+ */
+found_empty:
+found_slot:
+ CLR_V(r5,r0) /* clear V (valid) bit in PTE */
+ STPTE r5,0(r4)
+ sync
+ TLBSYNC
+ STPTE r8,PTE_SIZE/2(r4) /* put in correct RPN, WIMG, PP bits */
+ sync
+ SET_V(r5)
+ STPTE r5,0(r4) /* finally set V bit in PTE */
+#endif /* CONFIG_SMP */
+
+ sync /* make sure pte updates get to memory */
+ blr
+
+ .comm next_slot,4
+ .comm primary_pteg_full,4
+ .comm htab_hash_searches,4
+
+/*
+ * Flush the entry for a particular page from the hash table.
+ *
+ * flush_hash_pages(unsigned context, unsigned long va, unsigned long pmdval,
+ * int count)
+ *
+ * We assume that there is a hash table in use (Hash != 0).
+ */
+_GLOBAL(flush_hash_pages)
+ tophys(r7,0)
+
+ /*
+ * We disable interrupts here, even on UP, because we want
+ * the _PAGE_HASHPTE bit to be a reliable indication of
+ * whether the HPTE exists (or at least whether one did once).
+ * We also turn off the MMU for data accesses so that we
+ * we can't take a hash table miss (assuming the code is
+ * covered by a BAT). -- paulus
+ */
+ mfmsr r10
+ SYNC
+ rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */
+ rlwinm r0,r0,0,28,26 /* clear MSR_DR */
+ mtmsr r0
+ SYNC_601
+ isync
+
+ /* First find a PTE in the range that has _PAGE_HASHPTE set */
+ rlwimi r5,r4,22,20,29
+1: lwz r0,0(r5)
+ cmpwi cr1,r6,1
+ andi. r0,r0,_PAGE_HASHPTE
+ bne 2f
+ ble cr1,19f
+ addi r4,r4,0x1000
+ addi r5,r5,4
+ addi r6,r6,-1
+ b 1b
+
+ /* Convert context and va to VSID */
+2: mulli r3,r3,897*16 /* multiply context by context skew */
+ rlwinm r0,r4,4,28,31 /* get ESID (top 4 bits of va) */
+ mulli r0,r0,0x111 /* multiply by ESID skew */
+ add r3,r3,r0 /* note code below trims to 24 bits */
+
+ /* Construct the high word of the PPC-style PTE (r11) */
+#ifndef CONFIG_PPC64BRIDGE
+ rlwinm r11,r3,7,1,24 /* put VSID in 0x7fffff80 bits */
+ rlwimi r11,r4,10,26,31 /* put in API (abbrev page index) */
+#else /* CONFIG_PPC64BRIDGE */
+ clrlwi r3,r3,8 /* reduce vsid to 24 bits */
+ sldi r11,r3,12 /* shift vsid into position */
+ rlwimi r11,r4,16,20,24 /* put in API (abbrev page index) */
+#endif /* CONFIG_PPC64BRIDGE */
+ SET_V(r11) /* set V (valid) bit */
+
+#ifdef CONFIG_SMP
+ addis r9,r7,mmu_hash_lock@ha
+ addi r9,r9,mmu_hash_lock@l
+ rlwinm r8,r1,0,0,18
+ add r8,r8,r7
+ lwz r8,TI_CPU(r8)
+ oris r8,r8,9
+10: lwarx r0,0,r9
+ cmpi 0,r0,0
+ bne- 11f
+ stwcx. r8,0,r9
+ beq+ 12f
+11: lwz r0,0(r9)
+ cmpi 0,r0,0
+ beq 10b
+ b 11b
+12: isync
+#endif
+
+ /*
+ * Check the _PAGE_HASHPTE bit in the linux PTE. If it is
+ * already clear, we're done (for this pte). If not,
+ * clear it (atomically) and proceed. -- paulus.
+ */
+33: lwarx r8,0,r5 /* fetch the pte */
+ andi. r0,r8,_PAGE_HASHPTE
+ beq 8f /* done if HASHPTE is already clear */
+ rlwinm r8,r8,0,31,29 /* clear HASHPTE bit */
+ stwcx. r8,0,r5 /* update the pte */
+ bne- 33b
+
+ /* Get the address of the primary PTE group in the hash table (r3) */
+_GLOBAL(flush_hash_patch_A)
+ addis r8,r7,Hash_base@h /* base address of hash table */
+ rlwimi r8,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */
+ rlwinm r0,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */
+ xor r8,r0,r8 /* make primary hash */
+
+ /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */
+ li r0,8 /* PTEs/group */
+ mtctr r0
+ addi r12,r8,-PTE_SIZE
+1: LDPTEu r0,PTE_SIZE(r12) /* get next PTE */
+ CMPPTE 0,r0,r11
+ bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */
+ beq+ 3f
+
+ /* Search the secondary PTEG for a matching PTE */
+ ori r11,r11,PTE_H /* set H (secondary hash) bit */
+ li r0,8 /* PTEs/group */
+_GLOBAL(flush_hash_patch_B)
+ xoris r12,r8,Hash_msk>>16 /* compute secondary hash */
+ xori r12,r12,(-PTEG_SIZE & 0xffff)
+ addi r12,r12,-PTE_SIZE
+ mtctr r0
+2: LDPTEu r0,PTE_SIZE(r12)
+ CMPPTE 0,r0,r11
+ bdnzf 2,2b
+ xori r11,r11,PTE_H /* clear H again */
+ bne- 4f /* should rarely fail to find it */
+
+3: li r0,0
+ STPTE r0,0(r12) /* invalidate entry */
+4: sync
+ tlbie r4 /* in hw tlb too */
+ sync
+
+8: ble cr1,9f /* if all ptes checked */
+81: addi r6,r6,-1
+ addi r5,r5,4 /* advance to next pte */
+ addi r4,r4,0x1000
+ lwz r0,0(r5) /* check next pte */
+ cmpwi cr1,r6,1
+ andi. r0,r0,_PAGE_HASHPTE
+ bne 33b
+ bgt cr1,81b
+
+9:
+#ifdef CONFIG_SMP
+ TLBSYNC
+ li r0,0
+ stw r0,0(r9) /* clear mmu_hash_lock */
+#endif
+
+19: mtmsr r10
+ SYNC_601
+ isync
+ blr
diff --git a/arch/powerpc/mm/init.c b/arch/powerpc/mm/init.c
new file mode 100644
index 000000000000..f4d983a6e521
--- /dev/null
+++ b/arch/powerpc/mm/init.c
@@ -0,0 +1,581 @@
+/*
+ * PowerPC version
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
+ * PPC44x/36-bit changes by Matt Porter (mporter@mvista.com)
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/stddef.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/initrd.h>
+#include <linux/pagemap.h>
+
+#include <asm/pgalloc.h>
+#include <asm/prom.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/smp.h>
+#include <asm/machdep.h>
+#include <asm/btext.h>
+#include <asm/tlb.h>
+#include <asm/bootinfo.h>
+#include <asm/prom.h>
+
+#include "mem_pieces.h"
+#include "mmu_decl.h"
+
+#if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL)
+/* The ammount of lowmem must be within 0xF0000000 - KERNELBASE. */
+#if (CONFIG_LOWMEM_SIZE > (0xF0000000 - KERNELBASE))
+#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_START_KERNEL"
+#endif
+#endif
+#define MAX_LOW_MEM CONFIG_LOWMEM_SIZE
+
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+
+unsigned long total_memory;
+unsigned long total_lowmem;
+
+unsigned long ppc_memstart;
+unsigned long ppc_memoffset = PAGE_OFFSET;
+
+int mem_init_done;
+int init_bootmem_done;
+int boot_mapsize;
+#ifdef CONFIG_PPC_PMAC
+unsigned long agp_special_page;
+#endif
+
+extern char _end[];
+extern char etext[], _stext[];
+extern char __init_begin, __init_end;
+
+#ifdef CONFIG_HIGHMEM
+pte_t *kmap_pte;
+pgprot_t kmap_prot;
+
+EXPORT_SYMBOL(kmap_prot);
+EXPORT_SYMBOL(kmap_pte);
+#endif
+
+void MMU_init(void);
+void set_phys_avail(unsigned long total_ram);
+
+/* XXX should be in current.h -- paulus */
+extern struct task_struct *current_set[NR_CPUS];
+
+char *klimit = _end;
+struct mem_pieces phys_avail;
+struct device_node *memory_node;
+
+/*
+ * this tells the system to map all of ram with the segregs
+ * (i.e. page tables) instead of the bats.
+ * -- Cort
+ */
+int __map_without_bats;
+int __map_without_ltlbs;
+
+/* max amount of RAM to use */
+unsigned long __max_memory;
+/* max amount of low RAM to map in */
+unsigned long __max_low_memory = MAX_LOW_MEM;
+
+/*
+ * Read in a property describing some pieces of memory.
+ */
+static int __init get_mem_prop(char *name, struct mem_pieces *mp)
+{
+ struct reg_property *rp;
+ int i, s;
+ unsigned int *ip;
+ int nac = prom_n_addr_cells(memory_node);
+ int nsc = prom_n_size_cells(memory_node);
+
+ ip = (unsigned int *) get_property(memory_node, name, &s);
+ if (ip == NULL) {
+ printk(KERN_ERR "error: couldn't get %s property on /memory\n",
+ name);
+ return 0;
+ }
+ s /= (nsc + nac) * 4;
+ rp = mp->regions;
+ for (i = 0; i < s; ++i, ip += nac+nsc) {
+ if (nac >= 2 && ip[nac-2] != 0)
+ continue;
+ rp->address = ip[nac-1];
+ if (nsc >= 2 && ip[nac+nsc-2] != 0)
+ rp->size = ~0U;
+ else
+ rp->size = ip[nac+nsc-1];
+ ++rp;
+ }
+ mp->n_regions = rp - mp->regions;
+
+ /* Make sure the pieces are sorted. */
+ mem_pieces_sort(mp);
+ mem_pieces_coalesce(mp);
+ return 1;
+}
+
+/*
+ * Collect information about physical RAM and which pieces are
+ * already in use from the device tree.
+ */
+unsigned long __init find_end_of_memory(void)
+{
+ unsigned long a, total;
+ struct mem_pieces phys_mem;
+
+ /*
+ * Find out where physical memory is, and check that it
+ * starts at 0 and is contiguous. It seems that RAM is
+ * always physically contiguous on Power Macintoshes.
+ *
+ * Supporting discontiguous physical memory isn't hard,
+ * it just makes the virtual <-> physical mapping functions
+ * more complicated (or else you end up wasting space
+ * in mem_map).
+ */
+ memory_node = find_devices("memory");
+ if (memory_node == NULL || !get_mem_prop("reg", &phys_mem)
+ || phys_mem.n_regions == 0)
+ panic("No RAM??");
+ a = phys_mem.regions[0].address;
+ if (a != 0)
+ panic("RAM doesn't start at physical address 0");
+ total = phys_mem.regions[0].size;
+
+ if (phys_mem.n_regions > 1) {
+ printk("RAM starting at 0x%x is not contiguous\n",
+ phys_mem.regions[1].address);
+ printk("Using RAM from 0 to 0x%lx\n", total-1);
+ }
+
+ return total;
+}
+
+/*
+ * Check for command-line options that affect what MMU_init will do.
+ */
+void MMU_setup(void)
+{
+ /* Check for nobats option (used in mapin_ram). */
+ if (strstr(cmd_line, "nobats")) {
+ __map_without_bats = 1;
+ }
+
+ if (strstr(cmd_line, "noltlbs")) {
+ __map_without_ltlbs = 1;
+ }
+
+ /* Look for mem= option on command line */
+ if (strstr(cmd_line, "mem=")) {
+ char *p, *q;
+ unsigned long maxmem = 0;
+
+ for (q = cmd_line; (p = strstr(q, "mem=")) != 0; ) {
+ q = p + 4;
+ if (p > cmd_line && p[-1] != ' ')
+ continue;
+ maxmem = simple_strtoul(q, &q, 0);
+ if (*q == 'k' || *q == 'K') {
+ maxmem <<= 10;
+ ++q;
+ } else if (*q == 'm' || *q == 'M') {
+ maxmem <<= 20;
+ ++q;
+ }
+ }
+ __max_memory = maxmem;
+ }
+}
+
+/*
+ * MMU_init sets up the basic memory mappings for the kernel,
+ * including both RAM and possibly some I/O regions,
+ * and sets up the page tables and the MMU hardware ready to go.
+ */
+void __init MMU_init(void)
+{
+ if (ppc_md.progress)
+ ppc_md.progress("MMU:enter", 0x111);
+
+ /* parse args from command line */
+ MMU_setup();
+
+ /*
+ * Figure out how much memory we have, how much
+ * is lowmem, and how much is highmem. If we were
+ * passed the total memory size from the bootloader,
+ * just use it.
+ */
+ if (boot_mem_size)
+ total_memory = boot_mem_size;
+ else
+ total_memory = ppc_md.find_end_of_memory();
+
+ if (__max_memory && total_memory > __max_memory)
+ total_memory = __max_memory;
+ total_lowmem = total_memory;
+#ifdef CONFIG_FSL_BOOKE
+ /* Freescale Book-E parts expect lowmem to be mapped by fixed TLB
+ * entries, so we need to adjust lowmem to match the amount we can map
+ * in the fixed entries */
+ adjust_total_lowmem();
+#endif /* CONFIG_FSL_BOOKE */
+ if (total_lowmem > __max_low_memory) {
+ total_lowmem = __max_low_memory;
+#ifndef CONFIG_HIGHMEM
+ total_memory = total_lowmem;
+#endif /* CONFIG_HIGHMEM */
+ }
+ set_phys_avail(total_lowmem);
+
+ /* Initialize the MMU hardware */
+ if (ppc_md.progress)
+ ppc_md.progress("MMU:hw init", 0x300);
+ MMU_init_hw();
+
+ /* Map in all of RAM starting at KERNELBASE */
+ if (ppc_md.progress)
+ ppc_md.progress("MMU:mapin", 0x301);
+ mapin_ram();
+
+#ifdef CONFIG_HIGHMEM
+ ioremap_base = PKMAP_BASE;
+#else
+ ioremap_base = 0xfe000000UL; /* for now, could be 0xfffff000 */
+#endif /* CONFIG_HIGHMEM */
+ ioremap_bot = ioremap_base;
+
+ /* Map in I/O resources */
+ if (ppc_md.progress)
+ ppc_md.progress("MMU:setio", 0x302);
+ if (ppc_md.setup_io_mappings)
+ ppc_md.setup_io_mappings();
+
+ /* Initialize the context management stuff */
+ mmu_context_init();
+
+ if (ppc_md.progress)
+ ppc_md.progress("MMU:exit", 0x211);
+
+#ifdef CONFIG_BOOTX_TEXT
+ /* By default, we are no longer mapped */
+ boot_text_mapped = 0;
+ /* Must be done last, or ppc_md.progress will die. */
+ map_boot_text();
+#endif
+}
+
+/* This is only called until mem_init is done. */
+void __init *early_get_page(void)
+{
+ void *p;
+
+ if (init_bootmem_done) {
+ p = alloc_bootmem_pages(PAGE_SIZE);
+ } else {
+ p = mem_pieces_find(PAGE_SIZE, PAGE_SIZE);
+ }
+ return p;
+}
+
+/* Free up now-unused memory */
+static void free_sec(unsigned long start, unsigned long end, const char *name)
+{
+ unsigned long cnt = 0;
+
+ while (start < end) {
+ ClearPageReserved(virt_to_page(start));
+ set_page_count(virt_to_page(start), 1);
+ free_page(start);
+ cnt++;
+ start += PAGE_SIZE;
+ }
+ if (cnt) {
+ printk(" %ldk %s", cnt << (PAGE_SHIFT - 10), name);
+ totalram_pages += cnt;
+ }
+}
+
+void free_initmem(void)
+{
+#define FREESEC(TYPE) \
+ free_sec((unsigned long)(&__ ## TYPE ## _begin), \
+ (unsigned long)(&__ ## TYPE ## _end), \
+ #TYPE);
+
+ printk ("Freeing unused kernel memory:");
+ FREESEC(init);
+ printk("\n");
+ ppc_md.progress = NULL;
+#undef FREESEC
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+ if (start < end)
+ printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
+ for (; start < end; start += PAGE_SIZE) {
+ ClearPageReserved(virt_to_page(start));
+ set_page_count(virt_to_page(start), 1);
+ free_page(start);
+ totalram_pages++;
+ }
+}
+#endif
+
+/*
+ * Initialize the bootmem system and give it all the memory we
+ * have available.
+ */
+void __init do_init_bootmem(void)
+{
+ unsigned long start, size;
+ int i;
+
+ /*
+ * Find an area to use for the bootmem bitmap.
+ * We look for the first area which is at least
+ * 128kB in length (128kB is enough for a bitmap
+ * for 4GB of memory, using 4kB pages), plus 1 page
+ * (in case the address isn't page-aligned).
+ */
+ start = 0;
+ size = 0;
+ for (i = 0; i < phys_avail.n_regions; ++i) {
+ unsigned long a = phys_avail.regions[i].address;
+ unsigned long s = phys_avail.regions[i].size;
+ if (s <= size)
+ continue;
+ start = a;
+ size = s;
+ if (s >= 33 * PAGE_SIZE)
+ break;
+ }
+ start = PAGE_ALIGN(start);
+
+ min_low_pfn = start >> PAGE_SHIFT;
+ max_low_pfn = (PPC_MEMSTART + total_lowmem) >> PAGE_SHIFT;
+ max_pfn = (PPC_MEMSTART + total_memory) >> PAGE_SHIFT;
+ boot_mapsize = init_bootmem_node(&contig_page_data, min_low_pfn,
+ PPC_MEMSTART >> PAGE_SHIFT,
+ max_low_pfn);
+
+ /* remove the bootmem bitmap from the available memory */
+ mem_pieces_remove(&phys_avail, start, boot_mapsize, 1);
+
+ /* add everything in phys_avail into the bootmem map */
+ for (i = 0; i < phys_avail.n_regions; ++i)
+ free_bootmem(phys_avail.regions[i].address,
+ phys_avail.regions[i].size);
+
+ init_bootmem_done = 1;
+}
+
+/*
+ * paging_init() sets up the page tables - in fact we've already done this.
+ */
+void __init paging_init(void)
+{
+ unsigned long zones_size[MAX_NR_ZONES], i;
+
+#ifdef CONFIG_HIGHMEM
+ map_page(PKMAP_BASE, 0, 0); /* XXX gross */
+ pkmap_page_table = pte_offset_kernel(pmd_offset(pgd_offset_k
+ (PKMAP_BASE), PKMAP_BASE), PKMAP_BASE);
+ map_page(KMAP_FIX_BEGIN, 0, 0); /* XXX gross */
+ kmap_pte = pte_offset_kernel(pmd_offset(pgd_offset_k
+ (KMAP_FIX_BEGIN), KMAP_FIX_BEGIN), KMAP_FIX_BEGIN);
+ kmap_prot = PAGE_KERNEL;
+#endif /* CONFIG_HIGHMEM */
+
+ /*
+ * All pages are DMA-able so we put them all in the DMA zone.
+ */
+ zones_size[ZONE_DMA] = total_lowmem >> PAGE_SHIFT;
+ for (i = 1; i < MAX_NR_ZONES; i++)
+ zones_size[i] = 0;
+
+#ifdef CONFIG_HIGHMEM
+ zones_size[ZONE_HIGHMEM] = (total_memory - total_lowmem) >> PAGE_SHIFT;
+#endif /* CONFIG_HIGHMEM */
+
+ free_area_init(zones_size);
+}
+
+void __init mem_init(void)
+{
+ unsigned long addr;
+ int codepages = 0;
+ int datapages = 0;
+ int initpages = 0;
+#ifdef CONFIG_HIGHMEM
+ unsigned long highmem_mapnr;
+
+ highmem_mapnr = total_lowmem >> PAGE_SHIFT;
+#endif /* CONFIG_HIGHMEM */
+ max_mapnr = total_memory >> PAGE_SHIFT;
+
+ high_memory = (void *) __va(PPC_MEMSTART + total_lowmem);
+ num_physpages = max_mapnr; /* RAM is assumed contiguous */
+
+ totalram_pages += free_all_bootmem();
+
+#ifdef CONFIG_BLK_DEV_INITRD
+ /* if we are booted from BootX with an initial ramdisk,
+ make sure the ramdisk pages aren't reserved. */
+ if (initrd_start) {
+ for (addr = initrd_start; addr < initrd_end; addr += PAGE_SIZE)
+ ClearPageReserved(virt_to_page(addr));
+ }
+#endif /* CONFIG_BLK_DEV_INITRD */
+
+#ifdef CONFIG_PPC_OF
+ /* mark the RTAS pages as reserved */
+ if ( rtas_data )
+ for (addr = (ulong)__va(rtas_data);
+ addr < PAGE_ALIGN((ulong)__va(rtas_data)+rtas_size) ;
+ addr += PAGE_SIZE)
+ SetPageReserved(virt_to_page(addr));
+#endif
+#ifdef CONFIG_PPC_PMAC
+ if (agp_special_page)
+ SetPageReserved(virt_to_page(agp_special_page));
+#endif
+ for (addr = PAGE_OFFSET; addr < (unsigned long)high_memory;
+ addr += PAGE_SIZE) {
+ if (!PageReserved(virt_to_page(addr)))
+ continue;
+ if (addr < (ulong) etext)
+ codepages++;
+ else if (addr >= (unsigned long)&__init_begin
+ && addr < (unsigned long)&__init_end)
+ initpages++;
+ else if (addr < (ulong) klimit)
+ datapages++;
+ }
+
+#ifdef CONFIG_HIGHMEM
+ {
+ unsigned long pfn;
+
+ for (pfn = highmem_mapnr; pfn < max_mapnr; ++pfn) {
+ struct page *page = mem_map + pfn;
+
+ ClearPageReserved(page);
+ set_page_count(page, 1);
+ __free_page(page);
+ totalhigh_pages++;
+ }
+ totalram_pages += totalhigh_pages;
+ }
+#endif /* CONFIG_HIGHMEM */
+
+ printk("Memory: %luk available (%dk kernel code, %dk data, %dk init, %ldk highmem)\n",
+ (unsigned long)nr_free_pages()<< (PAGE_SHIFT-10),
+ codepages<< (PAGE_SHIFT-10), datapages<< (PAGE_SHIFT-10),
+ initpages<< (PAGE_SHIFT-10),
+ (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)));
+
+#ifdef CONFIG_PPC_PMAC
+ if (agp_special_page)
+ printk(KERN_INFO "AGP special page: 0x%08lx\n", agp_special_page);
+#endif
+
+ mem_init_done = 1;
+}
+
+/*
+ * Set phys_avail to the amount of physical memory,
+ * less the kernel text/data/bss.
+ */
+void __init
+set_phys_avail(unsigned long total_memory)
+{
+ unsigned long kstart, ksize;
+
+ /*
+ * Initially, available physical memory is equivalent to all
+ * physical memory.
+ */
+
+ phys_avail.regions[0].address = PPC_MEMSTART;
+ phys_avail.regions[0].size = total_memory;
+ phys_avail.n_regions = 1;
+
+ /*
+ * Map out the kernel text/data/bss from the available physical
+ * memory.
+ */
+
+ kstart = __pa(_stext); /* should be 0 */
+ ksize = PAGE_ALIGN(klimit - _stext);
+
+ mem_pieces_remove(&phys_avail, kstart, ksize, 0);
+ mem_pieces_remove(&phys_avail, 0, 0x4000, 0);
+
+#if defined(CONFIG_BLK_DEV_INITRD)
+ /* Remove the init RAM disk from the available memory. */
+ if (initrd_start) {
+ mem_pieces_remove(&phys_avail, __pa(initrd_start),
+ initrd_end - initrd_start, 1);
+ }
+#endif /* CONFIG_BLK_DEV_INITRD */
+#ifdef CONFIG_PPC_OF
+ /* remove the RTAS pages from the available memory */
+ if (rtas_data)
+ mem_pieces_remove(&phys_avail, rtas_data, rtas_size, 1);
+#endif
+#ifdef CONFIG_PPC_PMAC
+ /* Because of some uninorth weirdness, we need a page of
+ * memory as high as possible (it must be outside of the
+ * bus address seen as the AGP aperture). It will be used
+ * by the r128 DRM driver
+ *
+ * FIXME: We need to make sure that page doesn't overlap any of the\
+ * above. This could be done by improving mem_pieces_find to be able
+ * to do a backward search from the end of the list.
+ */
+ if (_machine == _MACH_Pmac && find_devices("uni-north-agp")) {
+ agp_special_page = (total_memory - PAGE_SIZE);
+ mem_pieces_remove(&phys_avail, agp_special_page, PAGE_SIZE, 0);
+ agp_special_page = (unsigned long)__va(agp_special_page);
+ }
+#endif /* CONFIG_PPC_PMAC */
+}
+
+/* Mark some memory as reserved by removing it from phys_avail. */
+void __init reserve_phys_mem(unsigned long start, unsigned long size)
+{
+ mem_pieces_remove(&phys_avail, start, size, 1);
+}
diff --git a/arch/powerpc/mm/init64.c b/arch/powerpc/mm/init64.c
new file mode 100644
index 000000000000..81f6745b31ef
--- /dev/null
+++ b/arch/powerpc/mm/init64.c
@@ -0,0 +1,385 @@
+/*
+ * PowerPC version
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * Dave Engebretsen <engebret@us.ibm.com>
+ * Rework for PPC64 port.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/stddef.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/idr.h>
+#include <linux/nodemask.h>
+#include <linux/module.h>
+
+#include <asm/pgalloc.h>
+#include <asm/page.h>
+#include <asm/prom.h>
+#include <asm/lmb.h>
+#include <asm/rtas.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/uaccess.h>
+#include <asm/smp.h>
+#include <asm/machdep.h>
+#include <asm/tlb.h>
+#include <asm/eeh.h>
+#include <asm/processor.h>
+#include <asm/mmzone.h>
+#include <asm/cputable.h>
+#include <asm/ppcdebug.h>
+#include <asm/sections.h>
+#include <asm/system.h>
+#include <asm/iommu.h>
+#include <asm/abs_addr.h>
+#include <asm/vdso.h>
+#include <asm/imalloc.h>
+
+#if PGTABLE_RANGE > USER_VSID_RANGE
+#warning Limited user VSID range means pagetable space is wasted
+#endif
+
+#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
+#warning TASK_SIZE is smaller than it needs to be.
+#endif
+
+int mem_init_done;
+unsigned long ioremap_bot = IMALLOC_BASE;
+static unsigned long phbs_io_bot = PHBS_IO_BASE;
+
+extern pgd_t swapper_pg_dir[];
+extern struct task_struct *current_set[NR_CPUS];
+
+unsigned long klimit = (unsigned long)_end;
+
+unsigned long _SDR1=0;
+unsigned long _ASR=0;
+
+/* max amount of RAM to use */
+unsigned long __max_memory;
+
+/* info on what we think the IO hole is */
+unsigned long io_hole_start;
+unsigned long io_hole_size;
+
+/*
+ * Do very early mm setup.
+ */
+void __init mm_init_ppc64(void)
+{
+#ifndef CONFIG_PPC_ISERIES
+ unsigned long i;
+#endif
+
+ ppc64_boot_msg(0x100, "MM Init");
+
+ /* This is the story of the IO hole... please, keep seated,
+ * unfortunately, we are out of oxygen masks at the moment.
+ * So we need some rough way to tell where your big IO hole
+ * is. On pmac, it's between 2G and 4G, on POWER3, it's around
+ * that area as well, on POWER4 we don't have one, etc...
+ * We need that as a "hint" when sizing the TCE table on POWER3
+ * So far, the simplest way that seem work well enough for us it
+ * to just assume that the first discontinuity in our physical
+ * RAM layout is the IO hole. That may not be correct in the future
+ * (and isn't on iSeries but then we don't care ;)
+ */
+
+#ifndef CONFIG_PPC_ISERIES
+ for (i = 1; i < lmb.memory.cnt; i++) {
+ unsigned long base, prevbase, prevsize;
+
+ prevbase = lmb.memory.region[i-1].base;
+ prevsize = lmb.memory.region[i-1].size;
+ base = lmb.memory.region[i].base;
+ if (base > (prevbase + prevsize)) {
+ io_hole_start = prevbase + prevsize;
+ io_hole_size = base - (prevbase + prevsize);
+ break;
+ }
+ }
+#endif /* CONFIG_PPC_ISERIES */
+ if (io_hole_start)
+ printk("IO Hole assumed to be %lx -> %lx\n",
+ io_hole_start, io_hole_start + io_hole_size - 1);
+
+ ppc64_boot_msg(0x100, "MM Init Done");
+}
+
+void free_initmem(void)
+{
+ unsigned long addr;
+
+ addr = (unsigned long)__init_begin;
+ for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
+ memset((void *)addr, 0xcc, PAGE_SIZE);
+ ClearPageReserved(virt_to_page(addr));
+ set_page_count(virt_to_page(addr), 1);
+ free_page(addr);
+ totalram_pages++;
+ }
+ printk ("Freeing unused kernel memory: %luk freed\n",
+ ((unsigned long)__init_end - (unsigned long)__init_begin) >> 10);
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+ if (start < end)
+ printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
+ for (; start < end; start += PAGE_SIZE) {
+ ClearPageReserved(virt_to_page(start));
+ set_page_count(virt_to_page(start), 1);
+ free_page(start);
+ totalram_pages++;
+ }
+}
+#endif
+
+/*
+ * Initialize the bootmem system and give it all the memory we
+ * have available.
+ */
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+void __init do_init_bootmem(void)
+{
+ unsigned long i;
+ unsigned long start, bootmap_pages;
+ unsigned long total_pages = lmb_end_of_DRAM() >> PAGE_SHIFT;
+ int boot_mapsize;
+
+ /*
+ * Find an area to use for the bootmem bitmap. Calculate the size of
+ * bitmap required as (Total Memory) / PAGE_SIZE / BITS_PER_BYTE.
+ * Add 1 additional page in case the address isn't page-aligned.
+ */
+ bootmap_pages = bootmem_bootmap_pages(total_pages);
+
+ start = lmb_alloc(bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
+ BUG_ON(!start);
+
+ boot_mapsize = init_bootmem(start >> PAGE_SHIFT, total_pages);
+
+ max_pfn = max_low_pfn;
+
+ /* Add all physical memory to the bootmem map, mark each area
+ * present.
+ */
+ for (i=0; i < lmb.memory.cnt; i++)
+ free_bootmem(lmb.memory.region[i].base,
+ lmb_size_bytes(&lmb.memory, i));
+
+ /* reserve the sections we're already using */
+ for (i=0; i < lmb.reserved.cnt; i++)
+ reserve_bootmem(lmb.reserved.region[i].base,
+ lmb_size_bytes(&lmb.reserved, i));
+
+ for (i=0; i < lmb.memory.cnt; i++)
+ memory_present(0, lmb_start_pfn(&lmb.memory, i),
+ lmb_end_pfn(&lmb.memory, i));
+}
+
+/*
+ * paging_init() sets up the page tables - in fact we've already done this.
+ */
+void __init paging_init(void)
+{
+ unsigned long zones_size[MAX_NR_ZONES];
+ unsigned long zholes_size[MAX_NR_ZONES];
+ unsigned long total_ram = lmb_phys_mem_size();
+ unsigned long top_of_ram = lmb_end_of_DRAM();
+
+ printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
+ top_of_ram, total_ram);
+ printk(KERN_INFO "Memory hole size: %ldMB\n",
+ (top_of_ram - total_ram) >> 20);
+ /*
+ * All pages are DMA-able so we put them all in the DMA zone.
+ */
+ memset(zones_size, 0, sizeof(zones_size));
+ memset(zholes_size, 0, sizeof(zholes_size));
+
+ zones_size[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
+ zholes_size[ZONE_DMA] = (top_of_ram - total_ram) >> PAGE_SHIFT;
+
+ free_area_init_node(0, NODE_DATA(0), zones_size,
+ __pa(PAGE_OFFSET) >> PAGE_SHIFT, zholes_size);
+}
+#endif /* ! CONFIG_NEED_MULTIPLE_NODES */
+
+static struct kcore_list kcore_vmem;
+
+static int __init setup_kcore(void)
+{
+ int i;
+
+ for (i=0; i < lmb.memory.cnt; i++) {
+ unsigned long base, size;
+ struct kcore_list *kcore_mem;
+
+ base = lmb.memory.region[i].base;
+ size = lmb.memory.region[i].size;
+
+ /* GFP_ATOMIC to avoid might_sleep warnings during boot */
+ kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC);
+ if (!kcore_mem)
+ panic("mem_init: kmalloc failed\n");
+
+ kclist_add(kcore_mem, __va(base), size);
+ }
+
+ kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
+
+ return 0;
+}
+module_init(setup_kcore);
+
+void __init mem_init(void)
+{
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+ int nid;
+#endif
+ pg_data_t *pgdat;
+ unsigned long i;
+ struct page *page;
+ unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize;
+
+ num_physpages = max_low_pfn; /* RAM is assumed contiguous */
+ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
+
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+ for_each_online_node(nid) {
+ if (NODE_DATA(nid)->node_spanned_pages != 0) {
+ printk("freeing bootmem node %x\n", nid);
+ totalram_pages +=
+ free_all_bootmem_node(NODE_DATA(nid));
+ }
+ }
+#else
+ max_mapnr = num_physpages;
+ totalram_pages += free_all_bootmem();
+#endif
+
+ for_each_pgdat(pgdat) {
+ for (i = 0; i < pgdat->node_spanned_pages; i++) {
+ page = pgdat_page_nr(pgdat, i);
+ if (PageReserved(page))
+ reservedpages++;
+ }
+ }
+
+ codesize = (unsigned long)&_etext - (unsigned long)&_stext;
+ initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin;
+ datasize = (unsigned long)&_edata - (unsigned long)&__init_end;
+ bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start;
+
+ printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, "
+ "%luk reserved, %luk data, %luk bss, %luk init)\n",
+ (unsigned long)nr_free_pages() << (PAGE_SHIFT-10),
+ num_physpages << (PAGE_SHIFT-10),
+ codesize >> 10,
+ reservedpages << (PAGE_SHIFT-10),
+ datasize >> 10,
+ bsssize >> 10,
+ initsize >> 10);
+
+ mem_init_done = 1;
+
+ /* Initialize the vDSO */
+ vdso_init();
+}
+
+void __iomem * reserve_phb_iospace(unsigned long size)
+{
+ void __iomem *virt_addr;
+
+ if (phbs_io_bot >= IMALLOC_BASE)
+ panic("reserve_phb_iospace(): phb io space overflow\n");
+
+ virt_addr = (void __iomem *) phbs_io_bot;
+ phbs_io_bot += size;
+
+ return virt_addr;
+}
+
+static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
+{
+ memset(addr, 0, kmem_cache_size(cache));
+}
+
+static const int pgtable_cache_size[2] = {
+ PTE_TABLE_SIZE, PMD_TABLE_SIZE
+};
+static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
+ "pgd_pte_cache", "pud_pmd_cache",
+};
+
+kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
+
+void pgtable_cache_init(void)
+{
+ int i;
+
+ BUILD_BUG_ON(PTE_TABLE_SIZE != pgtable_cache_size[PTE_CACHE_NUM]);
+ BUILD_BUG_ON(PMD_TABLE_SIZE != pgtable_cache_size[PMD_CACHE_NUM]);
+ BUILD_BUG_ON(PUD_TABLE_SIZE != pgtable_cache_size[PUD_CACHE_NUM]);
+ BUILD_BUG_ON(PGD_TABLE_SIZE != pgtable_cache_size[PGD_CACHE_NUM]);
+
+ for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) {
+ int size = pgtable_cache_size[i];
+ const char *name = pgtable_cache_name[i];
+
+ pgtable_cache[i] = kmem_cache_create(name,
+ size, size,
+ SLAB_HWCACHE_ALIGN
+ | SLAB_MUST_HWCACHE_ALIGN,
+ zero_ctor,
+ NULL);
+ if (! pgtable_cache[i])
+ panic("pgtable_cache_init(): could not create %s!\n",
+ name);
+ }
+}
+
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
+ unsigned long size, pgprot_t vma_prot)
+{
+ if (ppc_md.phys_mem_access_prot)
+ return ppc_md.phys_mem_access_prot(file, addr, size, vma_prot);
+
+ if (!page_is_ram(addr >> PAGE_SHIFT))
+ vma_prot = __pgprot(pgprot_val(vma_prot)
+ | _PAGE_GUARDED | _PAGE_NO_CACHE);
+ return vma_prot;
+}
+EXPORT_SYMBOL(phys_mem_access_prot);
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
new file mode 100644
index 000000000000..345db08e5d20
--- /dev/null
+++ b/arch/powerpc/mm/mem.c
@@ -0,0 +1,299 @@
+/*
+ * PowerPC version
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
+ * PPC44x/36-bit changes by Matt Porter (mporter@mvista.com)
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/stddef.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/initrd.h>
+#include <linux/pagemap.h>
+
+#include <asm/pgalloc.h>
+#include <asm/prom.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/smp.h>
+#include <asm/machdep.h>
+#include <asm/btext.h>
+#include <asm/tlb.h>
+#include <asm/bootinfo.h>
+#include <asm/prom.h>
+
+#include "mem_pieces.h"
+#include "mmu_decl.h"
+
+#ifndef CPU_FTR_COHERENT_ICACHE
+#define CPU_FTR_COHERENT_ICACHE 0 /* XXX for now */
+#define CPU_FTR_NOEXECUTE 0
+#endif
+
+/*
+ * This is called by /dev/mem to know if a given address has to
+ * be mapped non-cacheable or not
+ */
+int page_is_ram(unsigned long pfn)
+{
+ unsigned long paddr = (pfn << PAGE_SHIFT);
+
+#ifndef CONFIG_PPC64 /* XXX for now */
+ return paddr < __pa(high_memory);
+#else
+ int i;
+ for (i=0; i < lmb.memory.cnt; i++) {
+ unsigned long base;
+
+ base = lmb.memory.region[i].base;
+
+ if ((paddr >= base) &&
+ (paddr < (base + lmb.memory.region[i].size))) {
+ return 1;
+ }
+ }
+
+ return 0;
+#endif
+}
+EXPORT_SYMBOL(page_is_ram);
+
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
+ unsigned long size, pgprot_t vma_prot)
+{
+ if (ppc_md.phys_mem_access_prot)
+ return ppc_md.phys_mem_access_prot(file, addr, size, vma_prot);
+
+ if (!page_is_ram(addr >> PAGE_SHIFT))
+ vma_prot = __pgprot(pgprot_val(vma_prot)
+ | _PAGE_GUARDED | _PAGE_NO_CACHE);
+ return vma_prot;
+}
+EXPORT_SYMBOL(phys_mem_access_prot);
+
+void show_mem(void)
+{
+ unsigned long total = 0, reserved = 0;
+ unsigned long shared = 0, cached = 0;
+ unsigned long highmem = 0;
+ struct page *page;
+ pg_data_t *pgdat;
+ unsigned long i;
+
+ printk("Mem-info:\n");
+ show_free_areas();
+ printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
+ for_each_pgdat(pgdat) {
+ for (i = 0; i < pgdat->node_spanned_pages; i++) {
+ page = pgdat_page_nr(pgdat, i);
+ total++;
+ if (PageHighMem(page))
+ highmem++;
+ if (PageReserved(page))
+ reserved++;
+ else if (PageSwapCache(page))
+ cached++;
+ else if (page_count(page))
+ shared += page_count(page) - 1;
+ }
+ }
+ printk("%ld pages of RAM\n", total);
+#ifdef CONFIG_HIGHMEM
+ printk("%ld pages of HIGHMEM\n", highmem);
+#endif
+ printk("%ld reserved pages\n", reserved);
+ printk("%ld pages shared\n", shared);
+ printk("%ld pages swap cached\n", cached);
+}
+
+/*
+ * This is called when a page has been modified by the kernel.
+ * It just marks the page as not i-cache clean. We do the i-cache
+ * flush later when the page is given to a user process, if necessary.
+ */
+void flush_dcache_page(struct page *page)
+{
+ if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+ return;
+ /* avoid an atomic op if possible */
+ if (test_bit(PG_arch_1, &page->flags))
+ clear_bit(PG_arch_1, &page->flags);
+}
+EXPORT_SYMBOL(flush_dcache_page);
+
+void flush_dcache_icache_page(struct page *page)
+{
+#ifdef CONFIG_BOOKE
+ void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE);
+ __flush_dcache_icache(start);
+ kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
+#elif defined(CONFIG_8xx)
+ /* On 8xx there is no need to kmap since highmem is not supported */
+ __flush_dcache_icache(page_address(page));
+#else
+ __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT);
+#endif
+
+}
+void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
+{
+ clear_page(page);
+
+ if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+ return;
+ /*
+ * We shouldnt have to do this, but some versions of glibc
+ * require it (ld.so assumes zero filled pages are icache clean)
+ * - Anton
+ */
+
+ /* avoid an atomic op if possible */
+ if (test_bit(PG_arch_1, &pg->flags))
+ clear_bit(PG_arch_1, &pg->flags);
+}
+EXPORT_SYMBOL(clear_user_page);
+
+void copy_user_page(void *vto, void *vfrom, unsigned long vaddr,
+ struct page *pg)
+{
+ copy_page(vto, vfrom);
+
+ /*
+ * We should be able to use the following optimisation, however
+ * there are two problems.
+ * Firstly a bug in some versions of binutils meant PLT sections
+ * were not marked executable.
+ * Secondly the first word in the GOT section is blrl, used
+ * to establish the GOT address. Until recently the GOT was
+ * not marked executable.
+ * - Anton
+ */
+#if 0
+ if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0))
+ return;
+#endif
+
+ if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+ return;
+
+ /* avoid an atomic op if possible */
+ if (test_bit(PG_arch_1, &pg->flags))
+ clear_bit(PG_arch_1, &pg->flags);
+}
+
+void flush_icache_user_range(struct vm_area_struct *vma, struct page *page,
+ unsigned long addr, int len)
+{
+ unsigned long maddr;
+
+ maddr = (unsigned long) kmap(page) + (addr & ~PAGE_MASK);
+ flush_icache_range(maddr, maddr + len);
+ kunmap(page);
+}
+EXPORT_SYMBOL(flush_icache_user_range);
+
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a PTE in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux PTE.
+ *
+ * This must always be called with the mm->page_table_lock held
+ */
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
+ pte_t pte)
+{
+ /* handle i-cache coherency */
+ unsigned long pfn = pte_pfn(pte);
+#ifdef CONFIG_PPC32
+ pmd_t *pmd;
+#else
+ unsigned long vsid;
+ void *pgdir;
+ pte_t *ptep;
+ int local = 0;
+ cpumask_t tmp;
+ unsigned long flags;
+#endif
+
+ /* handle i-cache coherency */
+ if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) &&
+ !cpu_has_feature(CPU_FTR_NOEXECUTE) &&
+ pfn_valid(pfn)) {
+ struct page *page = pfn_to_page(pfn);
+ if (!PageReserved(page)
+ && !test_bit(PG_arch_1, &page->flags)) {
+ if (vma->vm_mm == current->active_mm) {
+#ifdef CONFIG_8xx
+ /* On 8xx, cache control instructions (particularly
+ * "dcbst" from flush_dcache_icache) fault as write
+ * operation if there is an unpopulated TLB entry
+ * for the address in question. To workaround that,
+ * we invalidate the TLB here, thus avoiding dcbst
+ * misbehaviour.
+ */
+ _tlbie(address);
+#endif
+ __flush_dcache_icache((void *) address);
+ } else
+ flush_dcache_icache_page(page);
+ set_bit(PG_arch_1, &page->flags);
+ }
+ }
+
+#ifdef CONFIG_PPC_STD_MMU
+ /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
+ if (!pte_young(pte) || address >= TASK_SIZE)
+ return;
+#ifdef CONFIG_PPC32
+ if (Hash == 0)
+ return;
+ pmd = pmd_offset(pgd_offset(vma->vm_mm, address), address);
+ if (!pmd_none(*pmd))
+ add_hash_page(vma->vm_mm->context, address, pmd_val(*pmd));
+#else
+ pgdir = vma->vm_mm->pgd;
+ if (pgdir == NULL)
+ return;
+
+ ptep = find_linux_pte(pgdir, ea);
+ if (!ptep)
+ return;
+
+ vsid = get_vsid(vma->vm_mm->context.id, ea);
+
+ local_irq_save(flags);
+ tmp = cpumask_of_cpu(smp_processor_id());
+ if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp))
+ local = 1;
+
+ __hash_page(ea, pte_val(pte) & (_PAGE_USER|_PAGE_RW), vsid, ptep,
+ 0x300, local);
+ local_irq_restore(flags);
+#endif
+#endif
+}
diff --git a/arch/powerpc/mm/mem64.c b/arch/powerpc/mm/mem64.c
new file mode 100644
index 000000000000..ef765a84433f
--- /dev/null
+++ b/arch/powerpc/mm/mem64.c
@@ -0,0 +1,259 @@
+/*
+ * PowerPC version
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * Dave Engebretsen <engebret@us.ibm.com>
+ * Rework for PPC64 port.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/stddef.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/idr.h>
+#include <linux/nodemask.h>
+#include <linux/module.h>
+
+#include <asm/pgalloc.h>
+#include <asm/page.h>
+#include <asm/prom.h>
+#include <asm/lmb.h>
+#include <asm/rtas.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/uaccess.h>
+#include <asm/smp.h>
+#include <asm/machdep.h>
+#include <asm/tlb.h>
+#include <asm/eeh.h>
+#include <asm/processor.h>
+#include <asm/mmzone.h>
+#include <asm/cputable.h>
+#include <asm/ppcdebug.h>
+#include <asm/sections.h>
+#include <asm/system.h>
+#include <asm/iommu.h>
+#include <asm/abs_addr.h>
+#include <asm/vdso.h>
+#include <asm/imalloc.h>
+
+/*
+ * This is called by /dev/mem to know if a given address has to
+ * be mapped non-cacheable or not
+ */
+int page_is_ram(unsigned long pfn)
+{
+ int i;
+ unsigned long paddr = (pfn << PAGE_SHIFT);
+
+ for (i=0; i < lmb.memory.cnt; i++) {
+ unsigned long base;
+
+ base = lmb.memory.region[i].base;
+
+ if ((paddr >= base) &&
+ (paddr < (base + lmb.memory.region[i].size))) {
+ return 1;
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(page_is_ram);
+
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
+ unsigned long size, pgprot_t vma_prot)
+{
+ if (ppc_md.phys_mem_access_prot)
+ return ppc_md.phys_mem_access_prot(file, addr, size, vma_prot);
+
+ if (!page_is_ram(addr >> PAGE_SHIFT))
+ vma_prot = __pgprot(pgprot_val(vma_prot)
+ | _PAGE_GUARDED | _PAGE_NO_CACHE);
+ return vma_prot;
+}
+EXPORT_SYMBOL(phys_mem_access_prot);
+
+void show_mem(void)
+{
+ unsigned long total = 0, reserved = 0;
+ unsigned long shared = 0, cached = 0;
+ struct page *page;
+ pg_data_t *pgdat;
+ unsigned long i;
+
+ printk("Mem-info:\n");
+ show_free_areas();
+ printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
+ for_each_pgdat(pgdat) {
+ for (i = 0; i < pgdat->node_spanned_pages; i++) {
+ page = pgdat_page_nr(pgdat, i);
+ total++;
+ if (PageReserved(page))
+ reserved++;
+ else if (PageSwapCache(page))
+ cached++;
+ else if (page_count(page))
+ shared += page_count(page) - 1;
+ }
+ }
+ printk("%ld pages of RAM\n", total);
+ printk("%ld reserved pages\n", reserved);
+ printk("%ld pages shared\n", shared);
+ printk("%ld pages swap cached\n", cached);
+}
+
+/*
+ * This is called when a page has been modified by the kernel.
+ * It just marks the page as not i-cache clean. We do the i-cache
+ * flush later when the page is given to a user process, if necessary.
+ */
+void flush_dcache_page(struct page *page)
+{
+ if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+ return;
+ /* avoid an atomic op if possible */
+ if (test_bit(PG_arch_1, &page->flags))
+ clear_bit(PG_arch_1, &page->flags);
+}
+EXPORT_SYMBOL(flush_dcache_page);
+
+void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
+{
+ clear_page(page);
+
+ if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+ return;
+ /*
+ * We shouldnt have to do this, but some versions of glibc
+ * require it (ld.so assumes zero filled pages are icache clean)
+ * - Anton
+ */
+
+ /* avoid an atomic op if possible */
+ if (test_bit(PG_arch_1, &pg->flags))
+ clear_bit(PG_arch_1, &pg->flags);
+}
+EXPORT_SYMBOL(clear_user_page);
+
+void copy_user_page(void *vto, void *vfrom, unsigned long vaddr,
+ struct page *pg)
+{
+ copy_page(vto, vfrom);
+
+ /*
+ * We should be able to use the following optimisation, however
+ * there are two problems.
+ * Firstly a bug in some versions of binutils meant PLT sections
+ * were not marked executable.
+ * Secondly the first word in the GOT section is blrl, used
+ * to establish the GOT address. Until recently the GOT was
+ * not marked executable.
+ * - Anton
+ */
+#if 0
+ if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0))
+ return;
+#endif
+
+ if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+ return;
+
+ /* avoid an atomic op if possible */
+ if (test_bit(PG_arch_1, &pg->flags))
+ clear_bit(PG_arch_1, &pg->flags);
+}
+
+void flush_icache_user_range(struct vm_area_struct *vma, struct page *page,
+ unsigned long addr, int len)
+{
+ unsigned long maddr;
+
+ maddr = (unsigned long)page_address(page) + (addr & ~PAGE_MASK);
+ flush_icache_range(maddr, maddr + len);
+}
+EXPORT_SYMBOL(flush_icache_user_range);
+
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a PTE in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux PTE.
+ *
+ * This must always be called with the mm->page_table_lock held
+ */
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long ea,
+ pte_t pte)
+{
+ unsigned long vsid;
+ void *pgdir;
+ pte_t *ptep;
+ int local = 0;
+ cpumask_t tmp;
+ unsigned long flags;
+
+ /* handle i-cache coherency */
+ if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) &&
+ !cpu_has_feature(CPU_FTR_NOEXECUTE)) {
+ unsigned long pfn = pte_pfn(pte);
+ if (pfn_valid(pfn)) {
+ struct page *page = pfn_to_page(pfn);
+ if (!PageReserved(page)
+ && !test_bit(PG_arch_1, &page->flags)) {
+ __flush_dcache_icache(page_address(page));
+ set_bit(PG_arch_1, &page->flags);
+ }
+ }
+ }
+
+ /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
+ if (!pte_young(pte))
+ return;
+
+ pgdir = vma->vm_mm->pgd;
+ if (pgdir == NULL)
+ return;
+
+ ptep = find_linux_pte(pgdir, ea);
+ if (!ptep)
+ return;
+
+ vsid = get_vsid(vma->vm_mm->context.id, ea);
+
+ local_irq_save(flags);
+ tmp = cpumask_of_cpu(smp_processor_id());
+ if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp))
+ local = 1;
+
+ __hash_page(ea, pte_val(pte) & (_PAGE_USER|_PAGE_RW), vsid, ptep,
+ 0x300, local);
+ local_irq_restore(flags);
+}
diff --git a/arch/powerpc/mm/mem_pieces.c b/arch/powerpc/mm/mem_pieces.c
new file mode 100644
index 000000000000..3d639052017e
--- /dev/null
+++ b/arch/powerpc/mm/mem_pieces.c
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 1996 Paul Mackerras <paulus@cs.anu.edu.au>
+ * Changes to accommodate Power Macintoshes.
+ * Cort Dougan <cort@cs.nmt.edu>
+ * Rewrites.
+ * Grant Erickson <grant@lcse.umn.edu>
+ * General rework and split from mm/init.c.
+ *
+ * Module name: mem_pieces.c
+ *
+ * Description:
+ * Routines and data structures for manipulating and representing
+ * phyiscal memory extents (i.e. address/length pairs).
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/init.h>
+#include <asm/page.h>
+
+#include "mem_pieces.h"
+
+extern struct mem_pieces phys_avail;
+
+static void mem_pieces_print(struct mem_pieces *);
+
+/*
+ * Scan a region for a piece of a given size with the required alignment.
+ */
+void __init *
+mem_pieces_find(unsigned int size, unsigned int align)
+{
+ int i;
+ unsigned a, e;
+ struct mem_pieces *mp = &phys_avail;
+
+ for (i = 0; i < mp->n_regions; ++i) {
+ a = mp->regions[i].address;
+ e = a + mp->regions[i].size;
+ a = (a + align - 1) & -align;
+ if (a + size <= e) {
+ mem_pieces_remove(mp, a, size, 1);
+ return (void *) __va(a);
+ }
+ }
+ panic("Couldn't find %u bytes at %u alignment\n", size, align);
+
+ return NULL;
+}
+
+/*
+ * Remove some memory from an array of pieces
+ */
+void __init
+mem_pieces_remove(struct mem_pieces *mp, unsigned int start, unsigned int size,
+ int must_exist)
+{
+ int i, j;
+ unsigned int end, rs, re;
+ struct reg_property *rp;
+
+ end = start + size;
+ for (i = 0, rp = mp->regions; i < mp->n_regions; ++i, ++rp) {
+ if (end > rp->address && start < rp->address + rp->size)
+ break;
+ }
+ if (i >= mp->n_regions) {
+ if (must_exist)
+ printk("mem_pieces_remove: [%x,%x) not in any region\n",
+ start, end);
+ return;
+ }
+ for (; i < mp->n_regions && end > rp->address; ++i, ++rp) {
+ rs = rp->address;
+ re = rs + rp->size;
+ if (must_exist && (start < rs || end > re)) {
+ printk("mem_pieces_remove: bad overlap [%x,%x) with",
+ start, end);
+ mem_pieces_print(mp);
+ must_exist = 0;
+ }
+ if (start > rs) {
+ rp->size = start - rs;
+ if (end < re) {
+ /* need to split this entry */
+ if (mp->n_regions >= MEM_PIECES_MAX)
+ panic("eek... mem_pieces overflow");
+ for (j = mp->n_regions; j > i + 1; --j)
+ mp->regions[j] = mp->regions[j-1];
+ ++mp->n_regions;
+ rp[1].address = end;
+ rp[1].size = re - end;
+ }
+ } else {
+ if (end < re) {
+ rp->address = end;
+ rp->size = re - end;
+ } else {
+ /* need to delete this entry */
+ for (j = i; j < mp->n_regions - 1; ++j)
+ mp->regions[j] = mp->regions[j+1];
+ --mp->n_regions;
+ --i;
+ --rp;
+ }
+ }
+ }
+}
+
+static void __init
+mem_pieces_print(struct mem_pieces *mp)
+{
+ int i;
+
+ for (i = 0; i < mp->n_regions; ++i)
+ printk(" [%x, %x)", mp->regions[i].address,
+ mp->regions[i].address + mp->regions[i].size);
+ printk("\n");
+}
+
+void __init
+mem_pieces_sort(struct mem_pieces *mp)
+{
+ unsigned long a, s;
+ int i, j;
+
+ for (i = 1; i < mp->n_regions; ++i) {
+ a = mp->regions[i].address;
+ s = mp->regions[i].size;
+ for (j = i - 1; j >= 0; --j) {
+ if (a >= mp->regions[j].address)
+ break;
+ mp->regions[j+1] = mp->regions[j];
+ }
+ mp->regions[j+1].address = a;
+ mp->regions[j+1].size = s;
+ }
+}
+
+void __init
+mem_pieces_coalesce(struct mem_pieces *mp)
+{
+ unsigned long a, s, ns;
+ int i, j, d;
+
+ d = 0;
+ for (i = 0; i < mp->n_regions; i = j) {
+ a = mp->regions[i].address;
+ s = mp->regions[i].size;
+ for (j = i + 1; j < mp->n_regions
+ && mp->regions[j].address - a <= s; ++j) {
+ ns = mp->regions[j].address + mp->regions[j].size - a;
+ if (ns > s)
+ s = ns;
+ }
+ mp->regions[d].address = a;
+ mp->regions[d].size = s;
+ ++d;
+ }
+ mp->n_regions = d;
+}
diff --git a/arch/powerpc/mm/mem_pieces.h b/arch/powerpc/mm/mem_pieces.h
new file mode 100644
index 000000000000..e2b700dc7f18
--- /dev/null
+++ b/arch/powerpc/mm/mem_pieces.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 1996 Paul Mackerras <paulus@cs.anu.edu.au>
+ * Changes to accommodate Power Macintoshes.
+ * Cort Dougan <cort@cs.nmt.edu>
+ * Rewrites.
+ * Grant Erickson <grant@lcse.umn.edu>
+ * General rework and split from mm/init.c.
+ *
+ * Module name: mem_pieces.h
+ *
+ * Description:
+ * Routines and data structures for manipulating and representing
+ * phyiscal memory extents (i.e. address/length pairs).
+ *
+ */
+
+#ifndef __MEM_PIECES_H__
+#define __MEM_PIECES_H__
+
+#include <asm/prom.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Type Definitions */
+
+#define MEM_PIECES_MAX 32
+
+struct mem_pieces {
+ int n_regions;
+ struct reg_property regions[MEM_PIECES_MAX];
+};
+
+/* Function Prototypes */
+
+extern void *mem_pieces_find(unsigned int size, unsigned int align);
+extern void mem_pieces_remove(struct mem_pieces *mp, unsigned int start,
+ unsigned int size, int must_exist);
+extern void mem_pieces_coalesce(struct mem_pieces *mp);
+extern void mem_pieces_sort(struct mem_pieces *mp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __MEM_PIECES_H__ */
diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
new file mode 100644
index 000000000000..a8816e0f6a86
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context.c
@@ -0,0 +1,86 @@
+/*
+ * This file contains the routines for handling the MMU on those
+ * PowerPC implementations where the MMU substantially follows the
+ * architecture specification. This includes the 6xx, 7xx, 7xxx,
+ * 8260, and POWER3 implementations but excludes the 8xx and 4xx.
+ * -- paulus
+ *
+ * Derived from arch/ppc/mm/init.c:
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
+
+mm_context_t next_mmu_context;
+unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1];
+#ifdef FEW_CONTEXTS
+atomic_t nr_free_contexts;
+struct mm_struct *context_mm[LAST_CONTEXT+1];
+void steal_context(void);
+#endif /* FEW_CONTEXTS */
+
+/*
+ * Initialize the context management stuff.
+ */
+void __init
+mmu_context_init(void)
+{
+ /*
+ * Some processors have too few contexts to reserve one for
+ * init_mm, and require using context 0 for a normal task.
+ * Other processors reserve the use of context zero for the kernel.
+ * This code assumes FIRST_CONTEXT < 32.
+ */
+ context_map[0] = (1 << FIRST_CONTEXT) - 1;
+ next_mmu_context = FIRST_CONTEXT;
+#ifdef FEW_CONTEXTS
+ atomic_set(&nr_free_contexts, LAST_CONTEXT - FIRST_CONTEXT + 1);
+#endif /* FEW_CONTEXTS */
+}
+
+#ifdef FEW_CONTEXTS
+/*
+ * Steal a context from a task that has one at the moment.
+ * This is only used on 8xx and 4xx and we presently assume that
+ * they don't do SMP. If they do then this will have to check
+ * whether the MM we steal is in use.
+ * We also assume that this is only used on systems that don't
+ * use an MMU hash table - this is true for 8xx and 4xx.
+ * This isn't an LRU system, it just frees up each context in
+ * turn (sort-of pseudo-random replacement :). This would be the
+ * place to implement an LRU scheme if anyone was motivated to do it.
+ * -- paulus
+ */
+void
+steal_context(void)
+{
+ struct mm_struct *mm;
+
+ /* free up context `next_mmu_context' */
+ /* if we shouldn't free context 0, don't... */
+ if (next_mmu_context < FIRST_CONTEXT)
+ next_mmu_context = FIRST_CONTEXT;
+ mm = context_mm[next_mmu_context];
+ flush_tlb_mm(mm);
+ destroy_context(mm);
+}
+#endif /* FEW_CONTEXTS */
diff --git a/arch/powerpc/mm/mmu_context64.c b/arch/powerpc/mm/mmu_context64.c
new file mode 100644
index 000000000000..714a84dd8d5d
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context64.c
@@ -0,0 +1,63 @@
+/*
+ * MMU context allocation for 64-bit kernels.
+ *
+ * Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+
+#include <asm/mmu_context.h>
+
+static DEFINE_SPINLOCK(mmu_context_lock);
+static DEFINE_IDR(mmu_context_idr);
+
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+ int index;
+ int err;
+
+again:
+ if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
+ return -ENOMEM;
+
+ spin_lock(&mmu_context_lock);
+ err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index);
+ spin_unlock(&mmu_context_lock);
+
+ if (err == -EAGAIN)
+ goto again;
+ else if (err)
+ return err;
+
+ if (index > MAX_CONTEXT) {
+ idr_remove(&mmu_context_idr, index);
+ return -ENOMEM;
+ }
+
+ mm->context.id = index;
+
+ return 0;
+}
+
+void destroy_context(struct mm_struct *mm)
+{
+ spin_lock(&mmu_context_lock);
+ idr_remove(&mmu_context_idr, mm->context.id);
+ spin_unlock(&mmu_context_lock);
+
+ mm->context.id = NO_CONTEXT;
+}
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
new file mode 100644
index 000000000000..540f3292b229
--- /dev/null
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -0,0 +1,85 @@
+/*
+ * Declarations of procedures and variables shared between files
+ * in arch/ppc/mm/.
+ *
+ * Derived from arch/ppc/mm/init.c:
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+#include <asm/tlbflush.h>
+#include <asm/mmu.h>
+
+extern void mapin_ram(void);
+extern int map_page(unsigned long va, phys_addr_t pa, int flags);
+extern void setbat(int index, unsigned long virt, unsigned long phys,
+ unsigned int size, int flags);
+extern void reserve_phys_mem(unsigned long start, unsigned long size);
+extern void settlbcam(int index, unsigned long virt, phys_addr_t phys,
+ unsigned int size, int flags, unsigned int pid);
+extern void invalidate_tlbcam_entry(int index);
+
+extern int __map_without_bats;
+extern unsigned long ioremap_base;
+extern unsigned long ioremap_bot;
+extern unsigned int rtas_data, rtas_size;
+
+extern unsigned long total_memory;
+extern unsigned long total_lowmem;
+extern int mem_init_done;
+
+extern PTE *Hash, *Hash_end;
+extern unsigned long Hash_size, Hash_mask;
+
+extern unsigned int num_tlbcam_entries;
+
+/* ...and now those things that may be slightly different between processor
+ * architectures. -- Dan
+ */
+#if defined(CONFIG_8xx)
+#define flush_HPTE(X, va, pg) _tlbie(va)
+#define MMU_init_hw() do { } while(0)
+#define mmu_mapin_ram() (0UL)
+
+#elif defined(CONFIG_4xx)
+#define flush_HPTE(X, va, pg) _tlbie(va)
+extern void MMU_init_hw(void);
+extern unsigned long mmu_mapin_ram(void);
+
+#elif defined(CONFIG_FSL_BOOKE)
+#define flush_HPTE(X, va, pg) _tlbie(va)
+extern void MMU_init_hw(void);
+extern unsigned long mmu_mapin_ram(void);
+extern void adjust_total_lowmem(void);
+
+#else
+/* anything except 4xx or 8xx */
+extern void MMU_init_hw(void);
+extern unsigned long mmu_mapin_ram(void);
+
+/* Be careful....this needs to be updated if we ever encounter 603 SMPs,
+ * which includes all new 82xx processors. We need tlbie/tlbsync here
+ * in that case (I think). -- Dan.
+ */
+static inline void flush_HPTE(unsigned context, unsigned long va,
+ unsigned long pdval)
+{
+ if ((Hash != 0) &&
+ cpu_has_feature(CPU_FTR_HPTE_TABLE))
+ flush_hash_pages(0, va, pdval, 1);
+ else
+ _tlbie(va);
+}
+#endif
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
new file mode 100644
index 000000000000..81a3d7446d37
--- /dev/null
+++ b/arch/powerpc/mm/pgtable.c
@@ -0,0 +1,470 @@
+/*
+ * This file contains the routines setting up the linux page tables.
+ * -- paulus
+ *
+ * Derived from arch/ppc/mm/init.c:
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/io.h>
+
+#include "mmu_decl.h"
+
+unsigned long ioremap_base;
+unsigned long ioremap_bot;
+int io_bat_index;
+
+#if defined(CONFIG_6xx) || defined(CONFIG_POWER3)
+#define HAVE_BATS 1
+#endif
+
+#if defined(CONFIG_FSL_BOOKE)
+#define HAVE_TLBCAM 1
+#endif
+
+extern char etext[], _stext[];
+
+#ifdef CONFIG_SMP
+extern void hash_page_sync(void);
+#endif
+
+#ifdef HAVE_BATS
+extern unsigned long v_mapped_by_bats(unsigned long va);
+extern unsigned long p_mapped_by_bats(unsigned long pa);
+void setbat(int index, unsigned long virt, unsigned long phys,
+ unsigned int size, int flags);
+
+#else /* !HAVE_BATS */
+#define v_mapped_by_bats(x) (0UL)
+#define p_mapped_by_bats(x) (0UL)
+#endif /* HAVE_BATS */
+
+#ifdef HAVE_TLBCAM
+extern unsigned int tlbcam_index;
+extern unsigned long v_mapped_by_tlbcam(unsigned long va);
+extern unsigned long p_mapped_by_tlbcam(unsigned long pa);
+#else /* !HAVE_TLBCAM */
+#define v_mapped_by_tlbcam(x) (0UL)
+#define p_mapped_by_tlbcam(x) (0UL)
+#endif /* HAVE_TLBCAM */
+
+#ifdef CONFIG_PTE_64BIT
+/* 44x uses an 8kB pgdir because it has 8-byte Linux PTEs. */
+#define PGDIR_ORDER 1
+#else
+#define PGDIR_ORDER 0
+#endif
+
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+ pgd_t *ret;
+
+ ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER);
+ return ret;
+}
+
+void pgd_free(pgd_t *pgd)
+{
+ free_pages((unsigned long)pgd, PGDIR_ORDER);
+}
+
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+{
+ pte_t *pte;
+ extern int mem_init_done;
+ extern void *early_get_page(void);
+
+ if (mem_init_done) {
+ pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+ } else {
+ pte = (pte_t *)early_get_page();
+ if (pte)
+ clear_page(pte);
+ }
+ return pte;
+}
+
+struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+ struct page *ptepage;
+
+#ifdef CONFIG_HIGHPTE
+ int flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_REPEAT;
+#else
+ int flags = GFP_KERNEL | __GFP_REPEAT;
+#endif
+
+ ptepage = alloc_pages(flags, 0);
+ if (ptepage)
+ clear_highpage(ptepage);
+ return ptepage;
+}
+
+void pte_free_kernel(pte_t *pte)
+{
+#ifdef CONFIG_SMP
+ hash_page_sync();
+#endif
+ free_page((unsigned long)pte);
+}
+
+void pte_free(struct page *ptepage)
+{
+#ifdef CONFIG_SMP
+ hash_page_sync();
+#endif
+ __free_page(ptepage);
+}
+
+#ifndef CONFIG_PHYS_64BIT
+void __iomem *
+ioremap(phys_addr_t addr, unsigned long size)
+{
+ return __ioremap(addr, size, _PAGE_NO_CACHE);
+}
+#else /* CONFIG_PHYS_64BIT */
+void __iomem *
+ioremap64(unsigned long long addr, unsigned long size)
+{
+ return __ioremap(addr, size, _PAGE_NO_CACHE);
+}
+
+void __iomem *
+ioremap(phys_addr_t addr, unsigned long size)
+{
+ phys_addr_t addr64 = fixup_bigphys_addr(addr, size);
+
+ return ioremap64(addr64, size);
+}
+#endif /* CONFIG_PHYS_64BIT */
+
+void __iomem *
+__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
+{
+ unsigned long v, i;
+ phys_addr_t p;
+ int err;
+
+ /*
+ * Choose an address to map it to.
+ * Once the vmalloc system is running, we use it.
+ * Before then, we use space going down from ioremap_base
+ * (ioremap_bot records where we're up to).
+ */
+ p = addr & PAGE_MASK;
+ size = PAGE_ALIGN(addr + size) - p;
+
+ /*
+ * If the address lies within the first 16 MB, assume it's in ISA
+ * memory space
+ */
+ if (p < 16*1024*1024)
+ p += _ISA_MEM_BASE;
+
+ /*
+ * Don't allow anybody to remap normal RAM that we're using.
+ * mem_init() sets high_memory so only do the check after that.
+ */
+ if ( mem_init_done && (p < virt_to_phys(high_memory)) )
+ {
+ printk("__ioremap(): phys addr "PHYS_FMT" is RAM lr %p\n", p,
+ __builtin_return_address(0));
+ return NULL;
+ }
+
+ if (size == 0)
+ return NULL;
+
+ /*
+ * Is it already mapped? Perhaps overlapped by a previous
+ * BAT mapping. If the whole area is mapped then we're done,
+ * otherwise remap it since we want to keep the virt addrs for
+ * each request contiguous.
+ *
+ * We make the assumption here that if the bottom and top
+ * of the range we want are mapped then it's mapped to the
+ * same virt address (and this is contiguous).
+ * -- Cort
+ */
+ if ((v = p_mapped_by_bats(p)) /*&& p_mapped_by_bats(p+size-1)*/ )
+ goto out;
+
+ if ((v = p_mapped_by_tlbcam(p)))
+ goto out;
+
+ if (mem_init_done) {
+ struct vm_struct *area;
+ area = get_vm_area(size, VM_IOREMAP);
+ if (area == 0)
+ return NULL;
+ v = (unsigned long) area->addr;
+ } else {
+ v = (ioremap_bot -= size);
+ }
+
+ if ((flags & _PAGE_PRESENT) == 0)
+ flags |= _PAGE_KERNEL;
+ if (flags & _PAGE_NO_CACHE)
+ flags |= _PAGE_GUARDED;
+
+ /*
+ * Should check if it is a candidate for a BAT mapping
+ */
+
+ err = 0;
+ for (i = 0; i < size && err == 0; i += PAGE_SIZE)
+ err = map_page(v+i, p+i, flags);
+ if (err) {
+ if (mem_init_done)
+ vunmap((void *)v);
+ return NULL;
+ }
+
+out:
+ return (void __iomem *) (v + ((unsigned long)addr & ~PAGE_MASK));
+}
+
+void iounmap(volatile void __iomem *addr)
+{
+ /*
+ * If mapped by BATs then there is nothing to do.
+ * Calling vfree() generates a benign warning.
+ */
+ if (v_mapped_by_bats((unsigned long)addr)) return;
+
+ if (addr > high_memory && (unsigned long) addr < ioremap_bot)
+ vunmap((void *) (PAGE_MASK & (unsigned long)addr));
+}
+
+void __iomem *ioport_map(unsigned long port, unsigned int len)
+{
+ return (void __iomem *) (port + _IO_BASE);
+}
+
+void ioport_unmap(void __iomem *addr)
+{
+ /* Nothing to do */
+}
+EXPORT_SYMBOL(ioport_map);
+EXPORT_SYMBOL(ioport_unmap);
+
+int
+map_page(unsigned long va, phys_addr_t pa, int flags)
+{
+ pmd_t *pd;
+ pte_t *pg;
+ int err = -ENOMEM;
+
+ spin_lock(&init_mm.page_table_lock);
+ /* Use upper 10 bits of VA to index the first level map */
+ pd = pmd_offset(pgd_offset_k(va), va);
+ /* Use middle 10 bits of VA to index the second-level map */
+ pg = pte_alloc_kernel(&init_mm, pd, va);
+ if (pg != 0) {
+ err = 0;
+ set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags)));
+ if (mem_init_done)
+ flush_HPTE(0, va, pmd_val(*pd));
+ }
+ spin_unlock(&init_mm.page_table_lock);
+ return err;
+}
+
+/*
+ * Map in all of physical memory starting at KERNELBASE.
+ */
+void __init mapin_ram(void)
+{
+ unsigned long v, p, s, f;
+
+ s = mmu_mapin_ram();
+ v = KERNELBASE + s;
+ p = PPC_MEMSTART + s;
+ for (; s < total_lowmem; s += PAGE_SIZE) {
+ if ((char *) v >= _stext && (char *) v < etext)
+ f = _PAGE_RAM_TEXT;
+ else
+ f = _PAGE_RAM;
+ map_page(v, p, f);
+ v += PAGE_SIZE;
+ p += PAGE_SIZE;
+ }
+}
+
+/* is x a power of 2? */
+#define is_power_of_2(x) ((x) != 0 && (((x) & ((x) - 1)) == 0))
+
+/* is x a power of 4? */
+#define is_power_of_4(x) ((x) != 0 && (((x) & (x-1)) == 0) && (ffs(x) & 1))
+
+/*
+ * Set up a mapping for a block of I/O.
+ * virt, phys, size must all be page-aligned.
+ * This should only be called before ioremap is called.
+ */
+void __init io_block_mapping(unsigned long virt, phys_addr_t phys,
+ unsigned int size, int flags)
+{
+ int i;
+
+ if (virt > KERNELBASE && virt < ioremap_bot)
+ ioremap_bot = ioremap_base = virt;
+
+#ifdef HAVE_BATS
+ /*
+ * Use a BAT for this if possible...
+ */
+ if (io_bat_index < 2 && is_power_of_2(size)
+ && (virt & (size - 1)) == 0 && (phys & (size - 1)) == 0) {
+ setbat(io_bat_index, virt, phys, size, flags);
+ ++io_bat_index;
+ return;
+ }
+#endif /* HAVE_BATS */
+
+#ifdef HAVE_TLBCAM
+ /*
+ * Use a CAM for this if possible...
+ */
+ if (tlbcam_index < num_tlbcam_entries && is_power_of_4(size)
+ && (virt & (size - 1)) == 0 && (phys & (size - 1)) == 0) {
+ settlbcam(tlbcam_index, virt, phys, size, flags, 0);
+ ++tlbcam_index;
+ return;
+ }
+#endif /* HAVE_TLBCAM */
+
+ /* No BATs available, put it in the page tables. */
+ for (i = 0; i < size; i += PAGE_SIZE)
+ map_page(virt + i, phys + i, flags);
+}
+
+/* Scan the real Linux page tables and return a PTE pointer for
+ * a virtual address in a context.
+ * Returns true (1) if PTE was found, zero otherwise. The pointer to
+ * the PTE pointer is unmodified if PTE is not found.
+ */
+int
+get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep)
+{
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *pte;
+ int retval = 0;
+
+ pgd = pgd_offset(mm, addr & PAGE_MASK);
+ if (pgd) {
+ pmd = pmd_offset(pgd, addr & PAGE_MASK);
+ if (pmd_present(*pmd)) {
+ pte = pte_offset_map(pmd, addr & PAGE_MASK);
+ if (pte) {
+ retval = 1;
+ *ptep = pte;
+ /* XXX caller needs to do pte_unmap, yuck */
+ }
+ }
+ }
+ return(retval);
+}
+
+/* Find physical address for this virtual address. Normally used by
+ * I/O functions, but anyone can call it.
+ */
+unsigned long iopa(unsigned long addr)
+{
+ unsigned long pa;
+
+ /* I don't know why this won't work on PMacs or CHRP. It
+ * appears there is some bug, or there is some implicit
+ * mapping done not properly represented by BATs or in page
+ * tables.......I am actively working on resolving this, but
+ * can't hold up other stuff. -- Dan
+ */
+ pte_t *pte;
+ struct mm_struct *mm;
+
+ /* Check the BATs */
+ pa = v_mapped_by_bats(addr);
+ if (pa)
+ return pa;
+
+ /* Allow mapping of user addresses (within the thread)
+ * for DMA if necessary.
+ */
+ if (addr < TASK_SIZE)
+ mm = current->mm;
+ else
+ mm = &init_mm;
+
+ pa = 0;
+ if (get_pteptr(mm, addr, &pte)) {
+ pa = (pte_val(*pte) & PAGE_MASK) | (addr & ~PAGE_MASK);
+ pte_unmap(pte);
+ }
+
+ return(pa);
+}
+
+/* This is will find the virtual address for a physical one....
+ * Swiped from APUS, could be dangerous :-).
+ * This is only a placeholder until I really find a way to make this
+ * work. -- Dan
+ */
+unsigned long
+mm_ptov (unsigned long paddr)
+{
+ unsigned long ret;
+#if 0
+ if (paddr < 16*1024*1024)
+ ret = ZTWO_VADDR(paddr);
+ else {
+ int i;
+
+ for (i = 0; i < kmap_chunk_count;){
+ unsigned long phys = kmap_chunks[i++];
+ unsigned long size = kmap_chunks[i++];
+ unsigned long virt = kmap_chunks[i++];
+ if (paddr >= phys
+ && paddr < (phys + size)){
+ ret = virt + paddr - phys;
+ goto exit;
+ }
+ }
+
+ ret = (unsigned long) __va(paddr);
+ }
+exit:
+#ifdef DEBUGPV
+ printk ("PTOV(%lx)=%lx\n", paddr, ret);
+#endif
+#else
+ ret = (unsigned long)paddr + KERNELBASE;
+#endif
+ return ret;
+}
+
diff --git a/arch/powerpc/mm/pgtable64.c b/arch/powerpc/mm/pgtable64.c
new file mode 100644
index 000000000000..724f97e5dee5
--- /dev/null
+++ b/arch/powerpc/mm/pgtable64.c
@@ -0,0 +1,357 @@
+/*
+ * This file contains ioremap and related functions for 64-bit machines.
+ *
+ * Derived from arch/ppc64/mm/init.c
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@samba.org)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * Dave Engebretsen <engebret@us.ibm.com>
+ * Rework for PPC64 port.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/stddef.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/idr.h>
+#include <linux/nodemask.h>
+#include <linux/module.h>
+
+#include <asm/pgalloc.h>
+#include <asm/page.h>
+#include <asm/prom.h>
+#include <asm/lmb.h>
+#include <asm/rtas.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/uaccess.h>
+#include <asm/smp.h>
+#include <asm/machdep.h>
+#include <asm/tlb.h>
+#include <asm/eeh.h>
+#include <asm/processor.h>
+#include <asm/mmzone.h>
+#include <asm/cputable.h>
+#include <asm/ppcdebug.h>
+#include <asm/sections.h>
+#include <asm/system.h>
+#include <asm/iommu.h>
+#include <asm/abs_addr.h>
+#include <asm/vdso.h>
+#include <asm/imalloc.h>
+
+#if PGTABLE_RANGE > USER_VSID_RANGE
+#warning Limited user VSID range means pagetable space is wasted
+#endif
+
+#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
+#warning TASK_SIZE is smaller than it needs to be.
+#endif
+
+int mem_init_done;
+unsigned long ioremap_bot = IMALLOC_BASE;
+static unsigned long phbs_io_bot = PHBS_IO_BASE;
+
+extern pgd_t swapper_pg_dir[];
+extern struct task_struct *current_set[NR_CPUS];
+
+unsigned long klimit = (unsigned long)_end;
+
+/* max amount of RAM to use */
+unsigned long __max_memory;
+
+/* info on what we think the IO hole is */
+unsigned long io_hole_start;
+unsigned long io_hole_size;
+
+#ifdef CONFIG_PPC_ISERIES
+
+void __iomem *ioremap(unsigned long addr, unsigned long size)
+{
+ return (void __iomem *)addr;
+}
+
+extern void __iomem *__ioremap(unsigned long addr, unsigned long size,
+ unsigned long flags)
+{
+ return (void __iomem *)addr;
+}
+
+void iounmap(volatile void __iomem *addr)
+{
+ return;
+}
+
+#else
+
+/*
+ * map_io_page currently only called by __ioremap
+ * map_io_page adds an entry to the ioremap page table
+ * and adds an entry to the HPT, possibly bolting it
+ */
+static int map_io_page(unsigned long ea, unsigned long pa, int flags)
+{
+ pgd_t *pgdp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+ unsigned long vsid;
+
+ if (mem_init_done) {
+ spin_lock(&init_mm.page_table_lock);
+ pgdp = pgd_offset_k(ea);
+ pudp = pud_alloc(&init_mm, pgdp, ea);
+ if (!pudp)
+ return -ENOMEM;
+ pmdp = pmd_alloc(&init_mm, pudp, ea);
+ if (!pmdp)
+ return -ENOMEM;
+ ptep = pte_alloc_kernel(&init_mm, pmdp, ea);
+ if (!ptep)
+ return -ENOMEM;
+ set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
+ __pgprot(flags)));
+ spin_unlock(&init_mm.page_table_lock);
+ } else {
+ unsigned long va, vpn, hash, hpteg;
+
+ /*
+ * If the mm subsystem is not fully up, we cannot create a
+ * linux page table entry for this mapping. Simply bolt an
+ * entry in the hardware page table.
+ */
+ vsid = get_kernel_vsid(ea);
+ va = (vsid << 28) | (ea & 0xFFFFFFF);
+ vpn = va >> PAGE_SHIFT;
+
+ hash = hpt_hash(vpn, 0);
+
+ hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
+
+ /* Panic if a pte grpup is full */
+ if (ppc_md.hpte_insert(hpteg, va, pa >> PAGE_SHIFT,
+ HPTE_V_BOLTED,
+ _PAGE_NO_CACHE|_PAGE_GUARDED|PP_RWXX)
+ == -1) {
+ panic("map_io_page: could not insert mapping");
+ }
+ }
+ return 0;
+}
+
+
+static void __iomem * __ioremap_com(unsigned long addr, unsigned long pa,
+ unsigned long ea, unsigned long size,
+ unsigned long flags)
+{
+ unsigned long i;
+
+ if ((flags & _PAGE_PRESENT) == 0)
+ flags |= pgprot_val(PAGE_KERNEL);
+
+ for (i = 0; i < size; i += PAGE_SIZE)
+ if (map_io_page(ea+i, pa+i, flags))
+ return NULL;
+
+ return (void __iomem *) (ea + (addr & ~PAGE_MASK));
+}
+
+
+void __iomem *
+ioremap(unsigned long addr, unsigned long size)
+{
+ return __ioremap(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED);
+}
+
+void __iomem * __ioremap(unsigned long addr, unsigned long size,
+ unsigned long flags)
+{
+ unsigned long pa, ea;
+ void __iomem *ret;
+
+ /*
+ * Choose an address to map it to.
+ * Once the imalloc system is running, we use it.
+ * Before that, we map using addresses going
+ * up from ioremap_bot. imalloc will use
+ * the addresses from ioremap_bot through
+ * IMALLOC_END
+ *
+ */
+ pa = addr & PAGE_MASK;
+ size = PAGE_ALIGN(addr + size) - pa;
+
+ if (size == 0)
+ return NULL;
+
+ if (mem_init_done) {
+ struct vm_struct *area;
+ area = im_get_free_area(size);
+ if (area == NULL)
+ return NULL;
+ ea = (unsigned long)(area->addr);
+ ret = __ioremap_com(addr, pa, ea, size, flags);
+ if (!ret)
+ im_free(area->addr);
+ } else {
+ ea = ioremap_bot;
+ ret = __ioremap_com(addr, pa, ea, size, flags);
+ if (ret)
+ ioremap_bot += size;
+ }
+ return ret;
+}
+
+#define IS_PAGE_ALIGNED(_val) ((_val) == ((_val) & PAGE_MASK))
+
+int __ioremap_explicit(unsigned long pa, unsigned long ea,
+ unsigned long size, unsigned long flags)
+{
+ struct vm_struct *area;
+ void __iomem *ret;
+
+ /* For now, require page-aligned values for pa, ea, and size */
+ if (!IS_PAGE_ALIGNED(pa) || !IS_PAGE_ALIGNED(ea) ||
+ !IS_PAGE_ALIGNED(size)) {
+ printk(KERN_ERR "unaligned value in %s\n", __FUNCTION__);
+ return 1;
+ }
+
+ if (!mem_init_done) {
+ /* Two things to consider in this case:
+ * 1) No records will be kept (imalloc, etc) that the region
+ * has been remapped
+ * 2) It won't be easy to iounmap() the region later (because
+ * of 1)
+ */
+ ;
+ } else {
+ area = im_get_area(ea, size,
+ IM_REGION_UNUSED|IM_REGION_SUBSET|IM_REGION_EXISTS);
+ if (area == NULL) {
+ /* Expected when PHB-dlpar is in play */
+ return 1;
+ }
+ if (ea != (unsigned long) area->addr) {
+ printk(KERN_ERR "unexpected addr return from "
+ "im_get_area\n");
+ return 1;
+ }
+ }
+
+ ret = __ioremap_com(pa, pa, ea, size, flags);
+ if (ret == NULL) {
+ printk(KERN_ERR "ioremap_explicit() allocation failure !\n");
+ return 1;
+ }
+ if (ret != (void *) ea) {
+ printk(KERN_ERR "__ioremap_com() returned unexpected addr\n");
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Unmap an IO region and remove it from imalloc'd list.
+ * Access to IO memory should be serialized by driver.
+ * This code is modeled after vmalloc code - unmap_vm_area()
+ *
+ * XXX what about calls before mem_init_done (ie python_countermeasures())
+ */
+void iounmap(volatile void __iomem *token)
+{
+ void *addr;
+
+ if (!mem_init_done)
+ return;
+
+ addr = (void *) ((unsigned long __force) token & PAGE_MASK);
+
+ im_free(addr);
+}
+
+static int iounmap_subset_regions(unsigned long addr, unsigned long size)
+{
+ struct vm_struct *area;
+
+ /* Check whether subsets of this region exist */
+ area = im_get_area(addr, size, IM_REGION_SUPERSET);
+ if (area == NULL)
+ return 1;
+
+ while (area) {
+ iounmap((void __iomem *) area->addr);
+ area = im_get_area(addr, size,
+ IM_REGION_SUPERSET);
+ }
+
+ return 0;
+}
+
+int iounmap_explicit(volatile void __iomem *start, unsigned long size)
+{
+ struct vm_struct *area;
+ unsigned long addr;
+ int rc;
+
+ addr = (unsigned long __force) start & PAGE_MASK;
+
+ /* Verify that the region either exists or is a subset of an existing
+ * region. In the latter case, split the parent region to create
+ * the exact region
+ */
+ area = im_get_area(addr, size,
+ IM_REGION_EXISTS | IM_REGION_SUBSET);
+ if (area == NULL) {
+ /* Determine whether subset regions exist. If so, unmap */
+ rc = iounmap_subset_regions(addr, size);
+ if (rc) {
+ printk(KERN_ERR
+ "%s() cannot unmap nonexistent range 0x%lx\n",
+ __FUNCTION__, addr);
+ return 1;
+ }
+ } else {
+ iounmap((void __iomem *) area->addr);
+ }
+ /*
+ * FIXME! This can't be right:
+ iounmap(area->addr);
+ * Maybe it should be "iounmap(area);"
+ */
+ return 0;
+}
+
+#endif
+
+EXPORT_SYMBOL(ioremap);
+EXPORT_SYMBOL(__ioremap);
+EXPORT_SYMBOL(iounmap);
diff --git a/arch/powerpc/mm/ppc_mmu.c b/arch/powerpc/mm/ppc_mmu.c
new file mode 100644
index 000000000000..9a381ed5eb21
--- /dev/null
+++ b/arch/powerpc/mm/ppc_mmu.c
@@ -0,0 +1,296 @@
+/*
+ * This file contains the routines for handling the MMU on those
+ * PowerPC implementations where the MMU substantially follows the
+ * architecture specification. This includes the 6xx, 7xx, 7xxx,
+ * 8260, and POWER3 implementations but excludes the 8xx and 4xx.
+ * -- paulus
+ *
+ * Derived from arch/ppc/mm/init.c:
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+
+#include <asm/prom.h>
+#include <asm/mmu.h>
+#include <asm/machdep.h>
+
+#include "mmu_decl.h"
+#include "mem_pieces.h"
+
+PTE *Hash, *Hash_end;
+unsigned long Hash_size, Hash_mask;
+unsigned long _SDR1;
+
+union ubat { /* BAT register values to be loaded */
+ BAT bat;
+#ifdef CONFIG_PPC64BRIDGE
+ u64 word[2];
+#else
+ u32 word[2];
+#endif
+} BATS[4][2]; /* 4 pairs of IBAT, DBAT */
+
+struct batrange { /* stores address ranges mapped by BATs */
+ unsigned long start;
+ unsigned long limit;
+ unsigned long phys;
+} bat_addrs[4];
+
+/*
+ * Return PA for this VA if it is mapped by a BAT, or 0
+ */
+unsigned long v_mapped_by_bats(unsigned long va)
+{
+ int b;
+ for (b = 0; b < 4; ++b)
+ if (va >= bat_addrs[b].start && va < bat_addrs[b].limit)
+ return bat_addrs[b].phys + (va - bat_addrs[b].start);
+ return 0;
+}
+
+/*
+ * Return VA for a given PA or 0 if not mapped
+ */
+unsigned long p_mapped_by_bats(unsigned long pa)
+{
+ int b;
+ for (b = 0; b < 4; ++b)
+ if (pa >= bat_addrs[b].phys
+ && pa < (bat_addrs[b].limit-bat_addrs[b].start)
+ +bat_addrs[b].phys)
+ return bat_addrs[b].start+(pa-bat_addrs[b].phys);
+ return 0;
+}
+
+unsigned long __init mmu_mapin_ram(void)
+{
+#ifdef CONFIG_POWER4
+ return 0;
+#else
+ unsigned long tot, bl, done;
+ unsigned long max_size = (256<<20);
+ unsigned long align;
+
+ if (__map_without_bats)
+ return 0;
+
+ /* Set up BAT2 and if necessary BAT3 to cover RAM. */
+
+ /* Make sure we don't map a block larger than the
+ smallest alignment of the physical address. */
+ /* alignment of PPC_MEMSTART */
+ align = ~(PPC_MEMSTART-1) & PPC_MEMSTART;
+ /* set BAT block size to MIN(max_size, align) */
+ if (align && align < max_size)
+ max_size = align;
+
+ tot = total_lowmem;
+ for (bl = 128<<10; bl < max_size; bl <<= 1) {
+ if (bl * 2 > tot)
+ break;
+ }
+
+ setbat(2, KERNELBASE, PPC_MEMSTART, bl, _PAGE_RAM);
+ done = (unsigned long)bat_addrs[2].limit - KERNELBASE + 1;
+ if ((done < tot) && !bat_addrs[3].limit) {
+ /* use BAT3 to cover a bit more */
+ tot -= done;
+ for (bl = 128<<10; bl < max_size; bl <<= 1)
+ if (bl * 2 > tot)
+ break;
+ setbat(3, KERNELBASE+done, PPC_MEMSTART+done, bl, _PAGE_RAM);
+ done = (unsigned long)bat_addrs[3].limit - KERNELBASE + 1;
+ }
+
+ return done;
+#endif
+}
+
+/*
+ * Set up one of the I/D BAT (block address translation) register pairs.
+ * The parameters are not checked; in particular size must be a power
+ * of 2 between 128k and 256M.
+ */
+void __init setbat(int index, unsigned long virt, unsigned long phys,
+ unsigned int size, int flags)
+{
+ unsigned int bl;
+ int wimgxpp;
+ union ubat *bat = BATS[index];
+
+ if (((flags & _PAGE_NO_CACHE) == 0) &&
+ cpu_has_feature(CPU_FTR_NEED_COHERENT))
+ flags |= _PAGE_COHERENT;
+
+ bl = (size >> 17) - 1;
+ if (PVR_VER(mfspr(SPRN_PVR)) != 1) {
+ /* 603, 604, etc. */
+ /* Do DBAT first */
+ wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE
+ | _PAGE_COHERENT | _PAGE_GUARDED);
+ wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX;
+ bat[1].word[0] = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */
+ bat[1].word[1] = phys | wimgxpp;
+#ifndef CONFIG_KGDB /* want user access for breakpoints */
+ if (flags & _PAGE_USER)
+#endif
+ bat[1].bat.batu.vp = 1;
+ if (flags & _PAGE_GUARDED) {
+ /* G bit must be zero in IBATs */
+ bat[0].word[0] = bat[0].word[1] = 0;
+ } else {
+ /* make IBAT same as DBAT */
+ bat[0] = bat[1];
+ }
+ } else {
+ /* 601 cpu */
+ if (bl > BL_8M)
+ bl = BL_8M;
+ wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE
+ | _PAGE_COHERENT);
+ wimgxpp |= (flags & _PAGE_RW)?
+ ((flags & _PAGE_USER)? PP_RWRW: PP_RWXX): PP_RXRX;
+ bat->word[0] = virt | wimgxpp | 4; /* Ks=0, Ku=1 */
+ bat->word[1] = phys | bl | 0x40; /* V=1 */
+ }
+
+ bat_addrs[index].start = virt;
+ bat_addrs[index].limit = virt + ((bl + 1) << 17) - 1;
+ bat_addrs[index].phys = phys;
+}
+
+/*
+ * Initialize the hash table and patch the instructions in hashtable.S.
+ */
+void __init MMU_init_hw(void)
+{
+ unsigned int hmask, mb, mb2;
+ unsigned int n_hpteg, lg_n_hpteg;
+
+ extern unsigned int hash_page_patch_A[];
+ extern unsigned int hash_page_patch_B[], hash_page_patch_C[];
+ extern unsigned int hash_page[];
+ extern unsigned int flush_hash_patch_A[], flush_hash_patch_B[];
+
+ if (!cpu_has_feature(CPU_FTR_HPTE_TABLE)) {
+ /*
+ * Put a blr (procedure return) instruction at the
+ * start of hash_page, since we can still get DSI
+ * exceptions on a 603.
+ */
+ hash_page[0] = 0x4e800020;
+ flush_icache_range((unsigned long) &hash_page[0],
+ (unsigned long) &hash_page[1]);
+ return;
+ }
+
+ if ( ppc_md.progress ) ppc_md.progress("hash:enter", 0x105);
+
+#ifdef CONFIG_PPC64BRIDGE
+#define LG_HPTEG_SIZE 7 /* 128 bytes per HPTEG */
+#define SDR1_LOW_BITS (lg_n_hpteg - 11)
+#define MIN_N_HPTEG 2048 /* min 256kB hash table */
+#else
+#define LG_HPTEG_SIZE 6 /* 64 bytes per HPTEG */
+#define SDR1_LOW_BITS ((n_hpteg - 1) >> 10)
+#define MIN_N_HPTEG 1024 /* min 64kB hash table */
+#endif
+
+#ifdef CONFIG_POWER4
+ /* The hash table has already been allocated and initialized
+ in prom.c */
+ n_hpteg = Hash_size >> LG_HPTEG_SIZE;
+ lg_n_hpteg = __ilog2(n_hpteg);
+
+ /* Remove the hash table from the available memory */
+ if (Hash)
+ reserve_phys_mem(__pa(Hash), Hash_size);
+
+#else /* CONFIG_POWER4 */
+ /*
+ * Allow 1 HPTE (1/8 HPTEG) for each page of memory.
+ * This is less than the recommended amount, but then
+ * Linux ain't AIX.
+ */
+ n_hpteg = total_memory / (PAGE_SIZE * 8);
+ if (n_hpteg < MIN_N_HPTEG)
+ n_hpteg = MIN_N_HPTEG;
+ lg_n_hpteg = __ilog2(n_hpteg);
+ if (n_hpteg & (n_hpteg - 1)) {
+ ++lg_n_hpteg; /* round up if not power of 2 */
+ n_hpteg = 1 << lg_n_hpteg;
+ }
+ Hash_size = n_hpteg << LG_HPTEG_SIZE;
+
+ /*
+ * Find some memory for the hash table.
+ */
+ if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322);
+ Hash = mem_pieces_find(Hash_size, Hash_size);
+ cacheable_memzero(Hash, Hash_size);
+ _SDR1 = __pa(Hash) | SDR1_LOW_BITS;
+#endif /* CONFIG_POWER4 */
+
+ Hash_end = (PTE *) ((unsigned long)Hash + Hash_size);
+
+ printk("Total memory = %ldMB; using %ldkB for hash table (at %p)\n",
+ total_memory >> 20, Hash_size >> 10, Hash);
+
+
+ /*
+ * Patch up the instructions in hashtable.S:create_hpte
+ */
+ if ( ppc_md.progress ) ppc_md.progress("hash:patch", 0x345);
+ Hash_mask = n_hpteg - 1;
+ hmask = Hash_mask >> (16 - LG_HPTEG_SIZE);
+ mb2 = mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg;
+ if (lg_n_hpteg > 16)
+ mb2 = 16 - LG_HPTEG_SIZE;
+
+ hash_page_patch_A[0] = (hash_page_patch_A[0] & ~0xffff)
+ | ((unsigned int)(Hash) >> 16);
+ hash_page_patch_A[1] = (hash_page_patch_A[1] & ~0x7c0) | (mb << 6);
+ hash_page_patch_A[2] = (hash_page_patch_A[2] & ~0x7c0) | (mb2 << 6);
+ hash_page_patch_B[0] = (hash_page_patch_B[0] & ~0xffff) | hmask;
+ hash_page_patch_C[0] = (hash_page_patch_C[0] & ~0xffff) | hmask;
+
+ /*
+ * Ensure that the locations we've patched have been written
+ * out from the data cache and invalidated in the instruction
+ * cache, on those machines with split caches.
+ */
+ flush_icache_range((unsigned long) &hash_page_patch_A[0],
+ (unsigned long) &hash_page_patch_C[1]);
+
+ /*
+ * Patch up the instructions in hashtable.S:flush_hash_page
+ */
+ flush_hash_patch_A[0] = (flush_hash_patch_A[0] & ~0xffff)
+ | ((unsigned int)(Hash) >> 16);
+ flush_hash_patch_A[1] = (flush_hash_patch_A[1] & ~0x7c0) | (mb << 6);
+ flush_hash_patch_A[2] = (flush_hash_patch_A[2] & ~0x7c0) | (mb2 << 6);
+ flush_hash_patch_B[0] = (flush_hash_patch_B[0] & ~0xffff) | hmask;
+ flush_icache_range((unsigned long) &flush_hash_patch_A[0],
+ (unsigned long) &flush_hash_patch_B[1]);
+
+ if ( ppc_md.progress ) ppc_md.progress("hash:done", 0x205);
+}
diff --git a/arch/powerpc/mm/tlb.c b/arch/powerpc/mm/tlb.c
new file mode 100644
index 000000000000..6c3dc3c44c86
--- /dev/null
+++ b/arch/powerpc/mm/tlb.c
@@ -0,0 +1,183 @@
+/*
+ * This file contains the routines for TLB flushing.
+ * On machines where the MMU uses a hash table to store virtual to
+ * physical translations, these routines flush entries from the
+ * hash table also.
+ * -- paulus
+ *
+ * Derived from arch/ppc/mm/init.c:
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+
+#include "mmu_decl.h"
+
+/*
+ * Called when unmapping pages to flush entries from the TLB/hash table.
+ */
+void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr)
+{
+ unsigned long ptephys;
+
+ if (Hash != 0) {
+ ptephys = __pa(ptep) & PAGE_MASK;
+ flush_hash_pages(mm->context, addr, ptephys, 1);
+ }
+}
+
+/*
+ * Called by ptep_set_access_flags, must flush on CPUs for which the
+ * DSI handler can't just "fixup" the TLB on a write fault
+ */
+void flush_tlb_page_nohash(struct vm_area_struct *vma, unsigned long addr)
+{
+ if (Hash != 0)
+ return;
+ _tlbie(addr);
+}
+
+/*
+ * Called at the end of a mmu_gather operation to make sure the
+ * TLB flush is completely done.
+ */
+void tlb_flush(struct mmu_gather *tlb)
+{
+ if (Hash == 0) {
+ /*
+ * 603 needs to flush the whole TLB here since
+ * it doesn't use a hash table.
+ */
+ _tlbia();
+ }
+}
+
+/*
+ * TLB flushing:
+ *
+ * - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ * - flush_tlb_page(vma, vmaddr) flushes one page
+ * - flush_tlb_range(vma, start, end) flushes a range of pages
+ * - flush_tlb_kernel_range(start, end) flushes kernel pages
+ *
+ * since the hardware hash table functions as an extension of the
+ * tlb as far as the linux tables are concerned, flush it too.
+ * -- Cort
+ */
+
+/*
+ * 750 SMP is a Bad Idea because the 750 doesn't broadcast all
+ * the cache operations on the bus. Hence we need to use an IPI
+ * to get the other CPU(s) to invalidate their TLBs.
+ */
+#ifdef CONFIG_SMP_750
+#define FINISH_FLUSH smp_send_tlb_invalidate(0)
+#else
+#define FINISH_FLUSH do { } while (0)
+#endif
+
+static void flush_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end)
+{
+ pmd_t *pmd;
+ unsigned long pmd_end;
+ int count;
+ unsigned int ctx = mm->context;
+
+ if (Hash == 0) {
+ _tlbia();
+ return;
+ }
+ start &= PAGE_MASK;
+ if (start >= end)
+ return;
+ end = (end - 1) | ~PAGE_MASK;
+ pmd = pmd_offset(pgd_offset(mm, start), start);
+ for (;;) {
+ pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1;
+ if (pmd_end > end)
+ pmd_end = end;
+ if (!pmd_none(*pmd)) {
+ count = ((pmd_end - start) >> PAGE_SHIFT) + 1;
+ flush_hash_pages(ctx, start, pmd_val(*pmd), count);
+ }
+ if (pmd_end == end)
+ break;
+ start = pmd_end + 1;
+ ++pmd;
+ }
+}
+
+/*
+ * Flush kernel TLB entries in the given range
+ */
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+ flush_range(&init_mm, start, end);
+ FINISH_FLUSH;
+}
+
+/*
+ * Flush all the (user) entries for the address space described by mm.
+ */
+void flush_tlb_mm(struct mm_struct *mm)
+{
+ struct vm_area_struct *mp;
+
+ if (Hash == 0) {
+ _tlbia();
+ return;
+ }
+
+ for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
+ flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
+ FINISH_FLUSH;
+}
+
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+ struct mm_struct *mm;
+ pmd_t *pmd;
+
+ if (Hash == 0) {
+ _tlbie(vmaddr);
+ return;
+ }
+ mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm;
+ pmd = pmd_offset(pgd_offset(mm, vmaddr), vmaddr);
+ if (!pmd_none(*pmd))
+ flush_hash_pages(mm->context, vmaddr, pmd_val(*pmd), 1);
+ FINISH_FLUSH;
+}
+
+/*
+ * For each address in the range, find the pte for the address
+ * and check _PAGE_HASHPTE bit; if it is set, find and destroy
+ * the corresponding HPTE.
+ */
+void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end)
+{
+ flush_range(vma->vm_mm, start, end);
+ FINISH_FLUSH;
+}