18 files changed, 1486 insertions, 157 deletions
diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c
index 29954dc28942..f5e7b9ce63dd 100644
--- a/arch/powerpc/mm/40x_mmu.c
+++ b/arch/powerpc/mm/40x_mmu.c
@@ -105,7 +105,7 @@ unsigned long __init mmu_mapin_ram(void)
 
 	while (s >= LARGE_PAGE_SIZE_16M) {
 		pmd_t *pmdp;
-		unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE;
+		unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE;
 
 		pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
 		pmd_val(*pmdp++) = val;
@@ -120,7 +120,7 @@ unsigned long __init mmu_mapin_ram(void)
 
 	while (s >= LARGE_PAGE_SIZE_4M) {
 		pmd_t *pmdp;
-		unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE;
+		unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE;
 
 		pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
 		pmd_val(*pmdp) = val;
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 3e68363405b7..6fb8fc8d2fea 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -13,6 +13,7 @@ obj-y				:= fault.o mem.o pgtable.o gup.o \
 				   pgtable_$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o \
 				   tlb_nohash_low.o
+obj-$(CONFIG_PPC_BOOK3E)	+= tlb_low_$(CONFIG_WORD_SIZE)e.o
 obj-$(CONFIG_PPC64)		+= mmap_64.o
 hash64-$(CONFIG_PPC_NATIVE)	:= hash_native_64.o
 obj-$(CONFIG_PPC_STD_MMU_64)	+= hash_utils_64.o \
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c
index bb3d65998e6b..dc93e95b256e 100644
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -161,7 +161,7 @@ unsigned long __init mmu_mapin_ram(void)
 	unsigned long virt = PAGE_OFFSET;
 	phys_addr_t phys = memstart_addr;
 
-	while (cam[tlbcam_index] && tlbcam_index < ARRAY_SIZE(cam)) {
+	while (tlbcam_index < ARRAY_SIZE(cam) && cam[tlbcam_index]) {
 		settlbcam(tlbcam_index, virt, phys, cam[tlbcam_index], PAGE_KERNEL_X, 0);
 		virt += cam[tlbcam_index];
 		phys += cam[tlbcam_index];
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index 14af8cedab70..b13d58932bf6 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -40,7 +40,7 @@ mmu_hash_lock:
  * The address is in r4, and r3 contains an access flag:
  * _PAGE_RW (0x400) if a write.
  * r9 contains the SRR1 value, from which we use the MSR_PR bit.
- * SPRG3 contains the physical address of the current task's thread.
+ * SPRG_THREAD contains the physical address of the current task's thread.
  *
  * Returns to the caller if the access is illegal or there is no
  * mapping for the address.  Otherwise it places an appropriate PTE
@@ -68,7 +68,7 @@ _GLOBAL(hash_page)
 	/* Get PTE (linux-style) and check access */
 	lis	r0,KERNELBASE@h		/* check if kernel address */
 	cmplw	0,r4,r0
-	mfspr	r8,SPRN_SPRG3		/* current task's THREAD (phys) */
+	mfspr	r8,SPRN_SPRG_THREAD	/* current task's THREAD (phys) */
 	ori	r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */
 	lwz	r5,PGDIR(r8)		/* virt page-table root */
 	blt+	112f			/* assume user more likely */
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index c46ef2ffa3d9..90df6ffe3a43 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -57,8 +57,10 @@ unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
 #define HUGEPTE_CACHE_NAME(psize)	(huge_pgtable_cache_name[psize])
 
 static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
-	"unused_4K", "hugepte_cache_64K", "unused_64K_AP",
-	"hugepte_cache_1M", "hugepte_cache_16M", "hugepte_cache_16G"
+	[MMU_PAGE_64K]	= "hugepte_cache_64K",
+	[MMU_PAGE_1M]	= "hugepte_cache_1M",
+	[MMU_PAGE_16M]	= "hugepte_cache_16M",
+	[MMU_PAGE_16G]	= "hugepte_cache_16G",
 };
 
 /* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
@@ -700,6 +702,8 @@ static void __init set_huge_psize(int psize)
 		if (mmu_huge_psizes[psize] ||
 		   mmu_psize_defs[psize].shift == PAGE_SHIFT)
 			return;
+		if (WARN_ON(HUGEPTE_CACHE_NAME(psize) == NULL))
+			return;
 		hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
 
 		switch (mmu_psize_defs[psize].shift) {
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 3de6a0d93824..3ef5084b90ca 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -54,8 +54,6 @@
 #endif
 #define MAX_LOW_MEM	CONFIG_LOWMEM_SIZE
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
 phys_addr_t total_memory;
 phys_addr_t total_lowmem;
 
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 68a821add28d..31582329cd67 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -205,6 +205,47 @@ static int __meminit vmemmap_populated(unsigned long start, int page_size)
 	return 0;
 }
 
+/* On hash-based CPUs, the vmemmap is bolted in the hash table.
+ *
+ * On Book3E CPUs, the vmemmap is currently mapped in the top half of
+ * the vmalloc space using normal page tables, though the size of
+ * pages encoded in the PTEs can be different
+ */
+
+#ifdef CONFIG_PPC_BOOK3E
+static void __meminit vmemmap_create_mapping(unsigned long start,
+					     unsigned long page_size,
+					     unsigned long phys)
+{
+	/* Create a PTE encoding without page size */
+	unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED |
+		_PAGE_KERNEL_RW;
+
+	/* PTEs only contain page size encodings up to 32M */
+	BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf);
+
+	/* Encode the size in the PTE */
+	flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8;
+
+	/* For each PTE for that area, map things. Note that we don't
+	 * increment phys because all PTEs are of the large size and
+	 * thus must have the low bits clear
+	 */
+	for (i = 0; i < page_size; i += PAGE_SIZE)
+		BUG_ON(map_kernel_page(start + i, phys, flags));
+}
+#else /* CONFIG_PPC_BOOK3E */
+static void __meminit vmemmap_create_mapping(unsigned long start,
+					     unsigned long page_size,
+					     unsigned long phys)
+{
+	int  mapped = htab_bolt_mapping(start, start + page_size, phys,
+					PAGE_KERNEL, mmu_vmemmap_psize,
+					mmu_kernel_ssize);
+	BUG_ON(mapped < 0);
+}
+#endif /* CONFIG_PPC_BOOK3E */
+
 int __meminit vmemmap_populate(struct page *start_page,
 			       unsigned long nr_pages, int node)
 {
@@ -215,8 +256,11 @@ int __meminit vmemmap_populate(struct page *start_page,
 	/* Align to the page size of the linear mapping. */
 	start = _ALIGN_DOWN(start, page_size);
 
+	pr_debug("vmemmap_populate page %p, %ld pages, node %d\n",
+		 start_page, nr_pages, node);
+	pr_debug(" -> map %lx..%lx\n", start, end);
+
 	for (; start < end; start += page_size) {
-		int mapped;
 		void *p;
 
 		if (vmemmap_populated(start, page_size))
@@ -226,13 +270,10 @@ int __meminit vmemmap_populate(struct page *start_page,
 		if (!p)
 			return -ENOMEM;
 
-		pr_debug("vmemmap %08lx allocated at %p, physical %08lx.\n",
-			start, p, __pa(p));
+		pr_debug("      * %016lx..%016lx allocated at %p\n",
+			 start, start + page_size, p);
 
-		mapped = htab_bolt_mapping(start, start + page_size, __pa(p),
-					   pgprot_val(PAGE_KERNEL),
-					   mmu_vmemmap_psize, mmu_kernel_ssize);
-		BUG_ON(mapped < 0);
+		vmemmap_create_mapping(start, page_size, __pa(p));
 	}
 
 	return 0;
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index b1a727def15b..c2f93dc470e6 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -25,10 +25,20 @@
  *     also clear mm->cpu_vm_mask bits when processes are migrated
  */
 
-#undef DEBUG
-#define DEBUG_STEAL_ONLY
-#undef DEBUG_MAP_CONSISTENCY
-/*#define DEBUG_CLAMP_LAST_CONTEXT   15 */
+#define DEBUG_MAP_CONSISTENCY
+#define DEBUG_CLAMP_LAST_CONTEXT   31
+//#define DEBUG_HARDER
+
+/* We don't use DEBUG because it tends to be compiled in always nowadays
+ * and this would generate way too much output
+ */
+#ifdef DEBUG_HARDER
+#define pr_hard(args...)	printk(KERN_DEBUG args)
+#define pr_hardcont(args...)	printk(KERN_CONT args)
+#else
+#define pr_hard(args...)	do { } while(0)
+#define pr_hardcont(args...)	do { } while(0)
+#endif
 
 #include <linux/kernel.h>
 #include <linux/mm.h>
@@ -71,7 +81,7 @@ static DEFINE_SPINLOCK(context_lock);
 static unsigned int steal_context_smp(unsigned int id)
 {
 	struct mm_struct *mm;
-	unsigned int cpu, max;
+	unsigned int cpu, max, i;
 
 	max = last_context - first_context;
 
@@ -89,15 +99,22 @@ static unsigned int steal_context_smp(unsigned int id)
 				id = first_context;
 			continue;
 		}
-		pr_devel("[%d] steal context %d from mm @%p\n",
-			 smp_processor_id(), id, mm);
+		pr_hardcont(" | steal %d from 0x%p", id, mm);
 
 		/* Mark this mm has having no context anymore */
 		mm->context.id = MMU_NO_CONTEXT;
 
-		/* Mark it stale on all CPUs that used this mm */
-		for_each_cpu(cpu, mm_cpumask(mm))
-			__set_bit(id, stale_map[cpu]);
+		/* Mark it stale on all CPUs that used this mm. For threaded
+		 * implementations, we set it on all threads on each core
+		 * represented in the mask. A future implementation will use
+		 * a core map instead but this will do for now.
+		 */
+		for_each_cpu(cpu, mm_cpumask(mm)) {
+			for (i = cpu_first_thread_in_core(cpu);
+			     i <= cpu_last_thread_in_core(cpu); i++)
+				__set_bit(id, stale_map[i]);
+			cpu = i - 1;
+		}
 		return id;
 	}
 
@@ -126,7 +143,7 @@ static unsigned int steal_context_up(unsigned int id)
 	/* Pick up the victim mm */
 	mm = context_mm[id];
 
-	pr_devel("[%d] steal context %d from mm @%p\n", cpu, id, mm);
+	pr_hardcont(" | steal %d from 0x%p", id, mm);
 
 	/* Flush the TLB for that context */
 	local_flush_tlb_mm(mm);
@@ -173,25 +190,20 @@ static void context_check_map(void) { }
 
 void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 {
-	unsigned int id, cpu = smp_processor_id();
+	unsigned int i, id, cpu = smp_processor_id();
 	unsigned long *map;
 
 	/* No lockless fast path .. yet */
 	spin_lock(&context_lock);
 
-#ifndef DEBUG_STEAL_ONLY
-	pr_devel("[%d] activating context for mm @%p, active=%d, id=%d\n",
-		 cpu, next, next->context.active, next->context.id);
-#endif
+	pr_hard("[%d] activating context for mm @%p, active=%d, id=%d",
+		cpu, next, next->context.active, next->context.id);
 
 #ifdef CONFIG_SMP
 	/* Mark us active and the previous one not anymore */
 	next->context.active++;
 	if (prev) {
-#ifndef DEBUG_STEAL_ONLY
-		pr_devel(" old context %p active was: %d\n",
-			 prev, prev->context.active);
-#endif
+		pr_hardcont(" (old=0x%p a=%d)", prev, prev->context.active);
 		WARN_ON(prev->context.active < 1);
 		prev->context.active--;
 	}
@@ -201,8 +213,14 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 
 	/* If we already have a valid assigned context, skip all that */
 	id = next->context.id;
-	if (likely(id != MMU_NO_CONTEXT))
+	if (likely(id != MMU_NO_CONTEXT)) {
+#ifdef DEBUG_MAP_CONSISTENCY
+		if (context_mm[id] != next)
+			pr_err("MMU: mm 0x%p has id %d but context_mm[%d] says 0x%p\n",
+			       next, id, id, context_mm[id]);
+#endif
 		goto ctxt_ok;
+	}
 
 	/* We really don't have a context, let's try to acquire one */
 	id = next_context;
@@ -235,11 +253,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 	next_context = id + 1;
 	context_mm[id] = next;
 	next->context.id = id;
-
-#ifndef DEBUG_STEAL_ONLY
-	pr_devel("[%d] picked up new id %d, nrf is now %d\n",
-		 cpu, id, nr_free_contexts);
-#endif
+	pr_hardcont(" | new id=%d,nrf=%d", id, nr_free_contexts);
 
 	context_check_map();
  ctxt_ok:
@@ -248,15 +262,21 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 	 * local TLB for it and unmark it before we use it
 	 */
 	if (test_bit(id, stale_map[cpu])) {
-		pr_devel("[%d] flushing stale context %d for mm @%p !\n",
-			 cpu, id, next);
+		pr_hardcont(" | stale flush %d [%d..%d]",
+			    id, cpu_first_thread_in_core(cpu),
+			    cpu_last_thread_in_core(cpu));
+
 		local_flush_tlb_mm(next);
 
 		/* XXX This clear should ultimately be part of local_flush_tlb_mm */
-		__clear_bit(id, stale_map[cpu]);
+		for (i = cpu_first_thread_in_core(cpu);
+		     i <= cpu_last_thread_in_core(cpu); i++) {
+			__clear_bit(id, stale_map[i]);
+		}
 	}
 
 	/* Flick the MMU and release lock */
+	pr_hardcont(" -> %d\n", id);
 	set_context(id, next->pgd);
 	spin_unlock(&context_lock);
 }
@@ -266,6 +286,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
  */
 int init_new_context(struct task_struct *t, struct mm_struct *mm)
 {
+	pr_hard("initing context for mm @%p\n", mm);
+
 	mm->context.id = MMU_NO_CONTEXT;
 	mm->context.active = 0;
 
@@ -305,7 +327,9 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
 					    unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned int)(long)hcpu;
-
+#ifdef CONFIG_HOTPLUG_CPU
+	struct task_struct *p;
+#endif
 	/* We don't touch CPU 0 map, it's allocated at aboot and kept
 	 * around forever
 	 */
@@ -324,8 +348,16 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
 		pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu);
 		kfree(stale_map[cpu]);
 		stale_map[cpu] = NULL;
-		break;
-#endif
+
+		/* We also clear the cpu_vm_mask bits of CPUs going away */
+		read_lock(&tasklist_lock);
+		for_each_process(p) {
+			if (p->mm)
+				cpu_mask_clear_cpu(cpu, mm_cpumask(p->mm));
+		}
+		read_unlock(&tasklist_lock);
+	break;
+#endif /* CONFIG_HOTPLUG_CPU */
 	}
 	return NOTIFY_OK;
 }
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index d1f9c62dc177..d2e5321d5ea6 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -36,21 +36,37 @@ static inline void _tlbil_pid(unsigned int pid)
 {
 	asm volatile ("sync; tlbia; isync" : : : "memory");
 }
+#define _tlbil_pid_noind(pid)	_tlbil_pid(pid)
+
 #else /* CONFIG_40x || CONFIG_8xx */
 extern void _tlbil_all(void);
 extern void _tlbil_pid(unsigned int pid);
+#ifdef CONFIG_PPC_BOOK3E
+extern void _tlbil_pid_noind(unsigned int pid);
+#else
+#define _tlbil_pid_noind(pid)	_tlbil_pid(pid)
+#endif
 #endif /* !(CONFIG_40x || CONFIG_8xx) */
 
 /*
  * On 8xx, we directly inline tlbie, on others, it's extern
  */
 #ifdef CONFIG_8xx
-static inline void _tlbil_va(unsigned long address, unsigned int pid)
+static inline void _tlbil_va(unsigned long address, unsigned int pid,
+			     unsigned int tsize, unsigned int ind)
 {
 	asm volatile ("tlbie %0; sync" : : "r" (address) : "memory");
 }
-#else /* CONFIG_8xx */
-extern void _tlbil_va(unsigned long address, unsigned int pid);
+#elif defined(CONFIG_PPC_BOOK3E)
+extern void _tlbil_va(unsigned long address, unsigned int pid,
+		      unsigned int tsize, unsigned int ind);
+#else
+extern void __tlbil_va(unsigned long address, unsigned int pid);
+static inline void _tlbil_va(unsigned long address, unsigned int pid,
+			     unsigned int tsize, unsigned int ind)
+{
+	__tlbil_va(address, pid);
+}
 #endif /* CONIFG_8xx */
 
 /*
@@ -58,10 +74,16 @@ extern void _tlbil_va(unsigned long address, unsigned int pid);
  * implementation. When that becomes the case, this will be
  * an extern.
  */
-static inline void _tlbivax_bcast(unsigned long address, unsigned int pid)
+#ifdef CONFIG_PPC_BOOK3E
+extern void _tlbivax_bcast(unsigned long address, unsigned int pid,
+			   unsigned int tsize, unsigned int ind);
+#else
+static inline void _tlbivax_bcast(unsigned long address, unsigned int pid,
+				   unsigned int tsize, unsigned int ind)
 {
 	BUG();
 }
+#endif
 
 #else /* CONFIG_PPC_MMU_NOHASH */
 
@@ -99,7 +121,12 @@ extern unsigned int rtas_data, rtas_size;
 struct hash_pte;
 extern struct hash_pte *Hash, *Hash_end;
 extern unsigned long Hash_size, Hash_mask;
-#endif
+
+#endif /* CONFIG_PPC32 */
+
+#ifdef CONFIG_PPC64
+extern int map_kernel_page(unsigned long ea, unsigned long pa, int flags);
+#endif /* CONFIG_PPC64 */
 
 extern unsigned long ioremap_bot;
 extern unsigned long __max_low_memory;
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 627767d6169b..83f1551ec2c9 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -30,6 +30,16 @@
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+
+#ifdef CONFIG_SMP
+
+/*
+ * Handle batching of page table freeing on SMP. Page tables are
+ * queued up and send to be freed later by RCU in order to avoid
+ * freeing a page table page that is being walked without locks
+ */
+
 static DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
 static unsigned long pte_freelist_forced_free;
 
@@ -116,27 +126,7 @@ void pte_free_finish(void)
 	*batchp = NULL;
 }
 
-/*
- * Handle i/d cache flushing, called from set_pte_at() or ptep_set_access_flags()
- */
-static pte_t do_dcache_icache_coherency(pte_t pte)
-{
-	unsigned long pfn = pte_pfn(pte);
-	struct page *page;
-
-	if (unlikely(!pfn_valid(pfn)))
-		return pte;
-	page = pfn_to_page(pfn);
-
-	if (!PageReserved(page) && !test_bit(PG_arch_1, &page->flags)) {
-		pr_devel("do_dcache_icache_coherency... flushing\n");
-		flush_dcache_icache_page(page);
-		set_bit(PG_arch_1, &page->flags);
-	}
-	else
-		pr_devel("do_dcache_icache_coherency... already clean\n");
-	return __pte(pte_val(pte) | _PAGE_HWEXEC);
-}
+#endif /* CONFIG_SMP */
 
 static inline int is_exec_fault(void)
 {
@@ -145,49 +135,139 @@ static inline int is_exec_fault(void)
 
 /* We only try to do i/d cache coherency on stuff that looks like
  * reasonably "normal" PTEs. We currently require a PTE to be present
- * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE
+ * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE. We also only do that
+ * on userspace PTEs
  */
 static inline int pte_looks_normal(pte_t pte)
 {
 	return (pte_val(pte) &
-		(_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE)) ==
-		(_PAGE_PRESENT);
+	    (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) ==
+	    (_PAGE_PRESENT | _PAGE_USER);
 }
 
-#if defined(CONFIG_PPC_STD_MMU)
+struct page * maybe_pte_to_page(pte_t pte)
+{
+	unsigned long pfn = pte_pfn(pte);
+	struct page *page;
+
+	if (unlikely(!pfn_valid(pfn)))
+		return NULL;
+	page = pfn_to_page(pfn);
+	if (PageReserved(page))
+		return NULL;
+	return page;
+}
+
+#if defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0
+
 /* Server-style MMU handles coherency when hashing if HW exec permission
- * is supposed per page (currently 64-bit only). Else, we always flush
- * valid PTEs in set_pte.
+ * is supposed per page (currently 64-bit only). If not, then, we always
+ * flush the cache for valid PTEs in set_pte. Embedded CPU without HW exec
+ * support falls into the same category.
  */
-static inline int pte_need_exec_flush(pte_t pte, int set_pte)
+
+static pte_t set_pte_filter(pte_t pte)
 {
-	return set_pte && pte_looks_normal(pte) &&
-		!(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
-		  cpu_has_feature(CPU_FTR_NOEXECUTE));
+	pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
+	if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
+				       cpu_has_feature(CPU_FTR_NOEXECUTE))) {
+		struct page *pg = maybe_pte_to_page(pte);
+		if (!pg)
+			return pte;
+		if (!test_bit(PG_arch_1, &pg->flags)) {
+			flush_dcache_icache_page(pg);
+			set_bit(PG_arch_1, &pg->flags);
+		}
+	}
+	return pte;
 }
-#elif _PAGE_HWEXEC == 0
-/* Embedded type MMU without HW exec support (8xx only so far), we flush
- * the cache for any present PTE
- */
-static inline int pte_need_exec_flush(pte_t pte, int set_pte)
+
+static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
+				     int dirty)
 {
-	return set_pte && pte_looks_normal(pte);
+	return pte;
 }
-#else
-/* Other embedded CPUs with HW exec support per-page, we flush on exec
- * fault if HWEXEC is not set
+
+#else /* defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 */
+
+/* Embedded type MMU with HW exec support. This is a bit more complicated
+ * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so
+ * instead we "filter out" the exec permission for non clean pages.
  */
-static inline int pte_need_exec_flush(pte_t pte, int set_pte)
+static pte_t set_pte_filter(pte_t pte)
 {
-	return pte_looks_normal(pte) && is_exec_fault() &&
-		!(pte_val(pte) & _PAGE_HWEXEC);
+	struct page *pg;
+
+	/* No exec permission in the first place, move on */
+	if (!(pte_val(pte) & _PAGE_EXEC) || !pte_looks_normal(pte))
+		return pte;
+
+	/* If you set _PAGE_EXEC on weird pages you're on your own */
+	pg = maybe_pte_to_page(pte);
+	if (unlikely(!pg))
+		return pte;
+
+	/* If the page clean, we move on */
+	if (test_bit(PG_arch_1, &pg->flags))
+		return pte;
+
+	/* If it's an exec fault, we flush the cache and make it clean */
+	if (is_exec_fault()) {
+		flush_dcache_icache_page(pg);
+		set_bit(PG_arch_1, &pg->flags);
+		return pte;
+	}
+
+	/* Else, we filter out _PAGE_EXEC */
+	return __pte(pte_val(pte) & ~_PAGE_EXEC);
 }
-#endif
+
+static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
+				     int dirty)
+{
+	struct page *pg;
+
+	/* So here, we only care about exec faults, as we use them
+	 * to recover lost _PAGE_EXEC and perform I$/D$ coherency
+	 * if necessary. Also if _PAGE_EXEC is already set, same deal,
+	 * we just bail out
+	 */
+	if (dirty || (pte_val(pte) & _PAGE_EXEC) || !is_exec_fault())
+		return pte;
+
+#ifdef CONFIG_DEBUG_VM
+	/* So this is an exec fault, _PAGE_EXEC is not set. If it was
+	 * an error we would have bailed out earlier in do_page_fault()
+	 * but let's make sure of it
+	 */
+	if (WARN_ON(!(vma->vm_flags & VM_EXEC)))
+		return pte;
+#endif /* CONFIG_DEBUG_VM */
+
+	/* If you set _PAGE_EXEC on weird pages you're on your own */
+	pg = maybe_pte_to_page(pte);
+	if (unlikely(!pg))
+		goto bail;
+
+	/* If the page is already clean, we move on */
+	if (test_bit(PG_arch_1, &pg->flags))
+		goto bail;
+
+	/* Clean the page and set PG_arch_1 */
+	flush_dcache_icache_page(pg);
+	set_bit(PG_arch_1, &pg->flags);
+
+ bail:
+	return __pte(pte_val(pte) | _PAGE_EXEC);
+}
+
+#endif /* !(defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0) */
 
 /*
  * set_pte stores a linux PTE into the linux page table.
  */
-void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+		pte_t pte)
 {
 #ifdef CONFIG_DEBUG_VM
 	WARN_ON(pte_present(*ptep));
@@ -196,9 +276,7 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte
 	 * this context might not have been activated yet when this
 	 * is called.
 	 */
-	pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
-	if (pte_need_exec_flush(pte, 1))
-		pte = do_dcache_icache_coherency(pte);
+	pte = set_pte_filter(pte);
 
 	/* Perform the setting of the PTE */
 	__set_pte_at(mm, addr, ptep, pte, 0);
@@ -215,8 +293,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
 			  pte_t *ptep, pte_t entry, int dirty)
 {
 	int changed;
-	if (!dirty && pte_need_exec_flush(entry, 0))
-		entry = do_dcache_icache_coherency(entry);
+	entry = set_access_flags_filter(entry, vma, dirty);
 	changed = !pte_same(*(ptep), entry);
 	if (changed) {
 		if (!(vma->vm_flags & VM_HUGETLB))
@@ -242,7 +319,7 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
 	BUG_ON(pud_none(*pud));
 	pmd = pmd_offset(pud, addr);
 	BUG_ON(!pmd_present(*pmd));
-	BUG_ON(!spin_is_locked(pte_lockptr(mm, pmd)));
+	assert_spin_locked(pte_lockptr(mm, pmd));
 }
 #endif /* CONFIG_DEBUG_VM */
 
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 5422169626ba..cb96cb2e17cc 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -142,7 +142,7 @@ ioremap_flags(phys_addr_t addr, unsigned long size, unsigned long flags)
 		flags |= _PAGE_DIRTY | _PAGE_HWWRITE;
 
 	/* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
-	flags &= ~(_PAGE_USER | _PAGE_EXEC | _PAGE_HWEXEC);
+	flags &= ~(_PAGE_USER | _PAGE_EXEC);
 
 	return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
 }
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index bfa7db6b2fd5..853d5565eed5 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -33,6 +33,8 @@
 #include <linux/stddef.h>
 #include <linux/vmalloc.h>
 #include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/lmb.h>
 
 #include <asm/pgalloc.h>
 #include <asm/page.h>
@@ -55,19 +57,36 @@
 
 unsigned long ioremap_bot = IOREMAP_BASE;
 
+
+#ifdef CONFIG_PPC_MMU_NOHASH
+static void *early_alloc_pgtable(unsigned long size)
+{
+	void *pt;
+
+	if (init_bootmem_done)
+		pt = __alloc_bootmem(size, size, __pa(MAX_DMA_ADDRESS));
+	else
+		pt = __va(lmb_alloc_base(size, size,
+					 __pa(MAX_DMA_ADDRESS)));
+	memset(pt, 0, size);
+
+	return pt;
+}
+#endif /* CONFIG_PPC_MMU_NOHASH */
+
 /*
- * map_io_page currently only called by __ioremap
- * map_io_page adds an entry to the ioremap page table
+ * map_kernel_page currently only called by __ioremap
+ * map_kernel_page adds an entry to the ioremap page table
  * and adds an entry to the HPT, possibly bolting it
  */
-static int map_io_page(unsigned long ea, unsigned long pa, int flags)
+int map_kernel_page(unsigned long ea, unsigned long pa, int flags)
 {
 	pgd_t *pgdp;
 	pud_t *pudp;
 	pmd_t *pmdp;
 	pte_t *ptep;
 
-	if (mem_init_done) {
+	if (slab_is_available()) {
 		pgdp = pgd_offset_k(ea);
 		pudp = pud_alloc(&init_mm, pgdp, ea);
 		if (!pudp)
@@ -81,6 +100,35 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
 		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
 							  __pgprot(flags)));
 	} else {
+#ifdef CONFIG_PPC_MMU_NOHASH
+		/* Warning ! This will blow up if bootmem is not initialized
+		 * which our ppc64 code is keen to do that, we'll need to
+		 * fix it and/or be more careful
+		 */
+		pgdp = pgd_offset_k(ea);
+#ifdef PUD_TABLE_SIZE
+		if (pgd_none(*pgdp)) {
+			pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
+			BUG_ON(pudp == NULL);
+			pgd_populate(&init_mm, pgdp, pudp);
+		}
+#endif /* PUD_TABLE_SIZE */
+		pudp = pud_offset(pgdp, ea);
+		if (pud_none(*pudp)) {
+			pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
+			BUG_ON(pmdp == NULL);
+			pud_populate(&init_mm, pudp, pmdp);
+		}
+		pmdp = pmd_offset(pudp, ea);
+		if (!pmd_present(*pmdp)) {
+			ptep = early_alloc_pgtable(PAGE_SIZE);
+			BUG_ON(ptep == NULL);
+			pmd_populate_kernel(&init_mm, pmdp, ptep);
+		}
+		ptep = pte_offset_kernel(pmdp, ea);
+		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
+							  __pgprot(flags)));
+#else /* CONFIG_PPC_MMU_NOHASH */
 		/*
 		 * If the mm subsystem is not fully up, we cannot create a
 		 * linux page table entry for this mapping.  Simply bolt an
@@ -93,6 +141,7 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
 			       "memory at %016lx !\n", pa);
 			return -ENOMEM;
 		}
+#endif /* !CONFIG_PPC_MMU_NOHASH */
 	}
 	return 0;
 }
@@ -124,7 +173,7 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
 	WARN_ON(size & ~PAGE_MASK);
 
 	for (i = 0; i < size; i += PAGE_SIZE)
-		if (map_io_page((unsigned long)ea+i, pa+i, flags))
+		if (map_kernel_page((unsigned long)ea+i, pa+i, flags))
 			return NULL;
 
 	return (void __iomem *)ea;
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index a685652effeb..1d98ecc8eecd 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -191,7 +191,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 	unsigned long slbie_data = 0;
 	unsigned long pc = KSTK_EIP(tsk);
 	unsigned long stack = KSTK_ESP(tsk);
-	unsigned long unmapped_base;
+	unsigned long exec_base;
 
 	/*
 	 * We need interrupts hard-disabled here, not just soft-disabled,
@@ -227,42 +227,44 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 
 	/*
 	 * preload some userspace segments into the SLB.
+	 * Almost all 32 and 64bit PowerPC executables are linked at
+	 * 0x10000000 so it makes sense to preload this segment.
 	 */
-	if (test_tsk_thread_flag(tsk, TIF_32BIT))
-		unmapped_base = TASK_UNMAPPED_BASE_USER32;
-	else
-		unmapped_base = TASK_UNMAPPED_BASE_USER64;
+	exec_base = 0x10000000;
 
-	if (is_kernel_addr(pc))
-		return;
-	slb_allocate(pc);
-
-	if (esids_match(pc,stack))
+	if (is_kernel_addr(pc) || is_kernel_addr(stack) ||
+	    is_kernel_addr(exec_base))
 		return;
 
-	if (is_kernel_addr(stack))
-		return;
-	slb_allocate(stack);
+	slb_allocate(pc);
 
-	if (esids_match(pc,unmapped_base) || esids_match(stack,unmapped_base))
-		return;
+	if (!esids_match(pc, stack))
+		slb_allocate(stack);
 
-	if (is_kernel_addr(unmapped_base))
-		return;
-	slb_allocate(unmapped_base);
+	if (!esids_match(pc, exec_base) &&
+	    !esids_match(stack, exec_base))
+		slb_allocate(exec_base);
 }
 
 static inline void patch_slb_encoding(unsigned int *insn_addr,
 				      unsigned int immed)
 {
-	/* Assume the instruction had a "0" immediate value, just
-	 * "or" in the new value
-	 */
-	*insn_addr |= immed;
+	*insn_addr = (*insn_addr & 0xffff0000) | immed;
 	flush_icache_range((unsigned long)insn_addr, 4+
 			   (unsigned long)insn_addr);
 }
 
+void slb_set_size(u16 size)
+{
+	extern unsigned int *slb_compare_rr_to_size;
+
+	if (mmu_slb_size == size)
+		return;
+
+	mmu_slb_size = size;
+	patch_slb_encoding(slb_compare_rr_to_size, mmu_slb_size);
+}
+
 void slb_initialize(void)
 {
 	unsigned long linear_llp, vmalloc_llp, io_llp;
diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c
index 65190587a365..8aaa8b7eb324 100644
--- a/arch/powerpc/mm/tlb_hash32.c
+++ b/arch/powerpc/mm/tlb_hash32.c
@@ -71,6 +71,9 @@ void tlb_flush(struct mmu_gather *tlb)
 		 */
 		_tlbia();
 	}
+
+	/* Push out batch of freed page tables */
+	pte_free_finish();
 }
 
 /*
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 937eb90677d9..2b2f35f6985e 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -33,11 +33,6 @@
 
 DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
 
-/* This is declared as we are using the more or less generic
- * arch/powerpc/include/asm/tlb.h file -- tgall
- */
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
 /*
  * A linux PTE was changed and the corresponding hash table entry
  * neesd to be flushed. This function will either perform the flush
@@ -154,6 +149,21 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
 	batch->index = 0;
 }
 
+void tlb_flush(struct mmu_gather *tlb)
+{
+	struct ppc64_tlb_batch *tlbbatch = &__get_cpu_var(ppc64_tlb_batch);
+
+	/* If there's a TLB batch pending, then we must flush it because the
+	 * pages are going to be freed and we really don't want to have a CPU
+	 * access a freed page because it has a stale TLB
+	 */
+	if (tlbbatch->index)
+		__flush_tlb_pending(tlbbatch);
+
+	/* Push out batch of freed page tables */
+	pte_free_finish();
+}
+
 /**
  * __flush_hash_table_range - Flush all HPTEs for a given address range
  *                            from the hash table (and the TLB). But keeps
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
new file mode 100644
index 000000000000..ef1cccf71173
--- /dev/null
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -0,0 +1,770 @@
+/*
+ *  Low leve TLB miss handlers for Book3E
+ *
+ *  Copyright (C) 2008-2009
+ *      Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/processor.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/cputable.h>
+#include <asm/pgtable.h>
+#include <asm/reg.h>
+#include <asm/exception-64e.h>
+#include <asm/ppc-opcode.h>
+
+#ifdef CONFIG_PPC_64K_PAGES
+#define VPTE_PMD_SHIFT	(PTE_INDEX_SIZE+1)
+#else
+#define VPTE_PMD_SHIFT	(PTE_INDEX_SIZE)
+#endif
+#define VPTE_PUD_SHIFT	(VPTE_PMD_SHIFT + PMD_INDEX_SIZE)
+#define VPTE_PGD_SHIFT	(VPTE_PUD_SHIFT + PUD_INDEX_SIZE)
+#define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE)
+
+
+/**********************************************************************
+ *                                                                    *
+ * TLB miss handling for Book3E with TLB reservation and HES support  *
+ *                                                                    *
+ **********************************************************************/
+
+
+/* Data TLB miss */
+	START_EXCEPTION(data_tlb_miss)
+	TLB_MISS_PROLOG
+
+	/* Now we handle the fault proper. We only save DEAR in normal
+	 * fault case since that's the only interesting values here.
+	 * We could probably also optimize by not saving SRR0/1 in the
+	 * linear mapping case but I'll leave that for later
+	 */
+	mfspr	r14,SPRN_ESR
+	mfspr	r16,SPRN_DEAR		/* get faulting address */
+	srdi	r15,r16,60		/* get region */
+	cmpldi	cr0,r15,0xc		/* linear mapping ? */
+	TLB_MISS_STATS_SAVE_INFO
+	beq	tlb_load_linear		/* yes -> go to linear map load */
+
+	/* The page tables are mapped virtually linear. At this point, though,
+	 * we don't know whether we are trying to fault in a first level
+	 * virtual address or a virtual page table address. We can get that
+	 * from bit 0x1 of the region ID which we have set for a page table
+	 */
+	andi.	r10,r15,0x1
+	bne-	virt_page_table_tlb_miss
+
+	std	r14,EX_TLB_ESR(r12);	/* save ESR */
+	std	r16,EX_TLB_DEAR(r12);	/* save DEAR */
+
+	 /* We need _PAGE_PRESENT and  _PAGE_ACCESSED set */
+	li	r11,_PAGE_PRESENT
+	oris	r11,r11,_PAGE_ACCESSED@h
+
+	/* We do the user/kernel test for the PID here along with the RW test
+	 */
+	cmpldi	cr0,r15,0		/* Check for user region */
+
+	/* We pre-test some combination of permissions to avoid double
+	 * faults:
+	 *
+	 * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE
+	 * ESR_ST   is 0x00800000
+	 * _PAGE_BAP_SW is 0x00000010
+	 * So the shift is >> 19. This tests for supervisor writeability.
+	 * If the page happens to be supervisor writeable and not user
+	 * writeable, we will take a new fault later, but that should be
+	 * a rare enough case.
+	 *
+	 * We also move ESR_ST in _PAGE_DIRTY position
+	 * _PAGE_DIRTY is 0x00001000 so the shift is >> 11
+	 *
+	 * MAS1 is preset for all we need except for TID that needs to
+	 * be cleared for kernel translations
+	 */
+	rlwimi	r11,r14,32-19,27,27
+	rlwimi	r11,r14,32-16,19,19
+	beq	normal_tlb_miss
+	/* XXX replace the RMW cycles with immediate loads + writes */
+1:	mfspr	r10,SPRN_MAS1
+	cmpldi	cr0,r15,8		/* Check for vmalloc region */
+	rlwinm	r10,r10,0,16,1		/* Clear TID */
+	mtspr	SPRN_MAS1,r10
+	beq+	normal_tlb_miss
+
+	/* We got a crappy address, just fault with whatever DEAR and ESR
+	 * are here
+	 */
+	TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
+	TLB_MISS_EPILOG_ERROR
+	b	exc_data_storage_book3e
+
+/* Instruction TLB miss */
+	START_EXCEPTION(instruction_tlb_miss)
+	TLB_MISS_PROLOG
+
+	/* If we take a recursive fault, the second level handler may need
+	 * to know whether we are handling a data or instruction fault in
+	 * order to get to the right store fault handler. We provide that
+	 * info by writing a crazy value in ESR in our exception frame
+	 */
+	li	r14,-1	/* store to exception frame is done later */
+
+	/* Now we handle the fault proper. We only save DEAR in the non
+	 * linear mapping case since we know the linear mapping case will
+	 * not re-enter. We could indeed optimize and also not save SRR0/1
+	 * in the linear mapping case but I'll leave that for later
+	 *
+	 * Faulting address is SRR0 which is already in r16
+	 */
+	srdi	r15,r16,60		/* get region */
+	cmpldi	cr0,r15,0xc		/* linear mapping ? */
+	TLB_MISS_STATS_SAVE_INFO
+	beq	tlb_load_linear		/* yes -> go to linear map load */
+
+	/* We do the user/kernel test for the PID here along with the RW test
+	 */
+	li	r11,_PAGE_PRESENT|_PAGE_EXEC	/* Base perm */
+	oris	r11,r11,_PAGE_ACCESSED@h
+
+	cmpldi	cr0,r15,0			/* Check for user region */
+	std	r14,EX_TLB_ESR(r12)		/* write crazy -1 to frame */
+	beq	normal_tlb_miss
+	/* XXX replace the RMW cycles with immediate loads + writes */
+1:	mfspr	r10,SPRN_MAS1
+	cmpldi	cr0,r15,8			/* Check for vmalloc region */
+	rlwinm	r10,r10,0,16,1			/* Clear TID */
+	mtspr	SPRN_MAS1,r10
+	beq+	normal_tlb_miss
+
+	/* We got a crappy address, just fault */
+	TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
+	TLB_MISS_EPILOG_ERROR
+	b	exc_instruction_storage_book3e
+
+/*
+ * This is the guts of the first-level TLB miss handler for direct
+ * misses. We are entered with:
+ *
+ * r16 = faulting address
+ * r15 = region ID
+ * r14 = crap (free to use)
+ * r13 = PACA
+ * r12 = TLB exception frame in PACA
+ * r11 = PTE permission mask
+ * r10 = crap (free to use)
+ */
+normal_tlb_miss:
+	/* So we first construct the page table address. We do that by
+	 * shifting the bottom of the address (not the region ID) by
+	 * PAGE_SHIFT-3, clearing the bottom 3 bits (get a PTE ptr) and
+	 * or'ing the fourth high bit.
+	 *
+	 * NOTE: For 64K pages, we do things slightly differently in
+	 * order to handle the weird page table format used by linux
+	 */
+	ori	r10,r15,0x1
+#ifdef CONFIG_PPC_64K_PAGES
+	/* For the top bits, 16 bytes per PTE */
+	rldicl	r14,r16,64-(PAGE_SHIFT-4),PAGE_SHIFT-4+4
+	/* Now create the bottom bits as 0 in position 0x8000 and
+	 * the rest calculated for 8 bytes per PTE
+	 */
+	rldicl	r15,r16,64-(PAGE_SHIFT-3),64-15
+	/* Insert the bottom bits in */
+	rlwimi	r14,r15,0,16,31
+#else
+	rldicl	r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4
+#endif
+	sldi	r15,r10,60
+	clrrdi	r14,r14,3
+	or	r10,r15,r14
+
+BEGIN_MMU_FTR_SECTION
+	/* Set the TLB reservation and seach for existing entry. Then load
+	 * the entry.
+	 */
+	PPC_TLBSRX_DOT(0,r16)
+	ld	r14,0(r10)
+	beq	normal_tlb_miss_done
+MMU_FTR_SECTION_ELSE
+	ld	r14,0(r10)
+ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV)
+
+finish_normal_tlb_miss:
+	/* Check if required permissions are met */
+	andc.	r15,r11,r14
+	bne-	normal_tlb_miss_access_fault
+
+	/* Now we build the MAS:
+	 *
+	 * MAS 0   :	Fully setup with defaults in MAS4 and TLBnCFG
+	 * MAS 1   :	Almost fully setup
+	 *               - PID already updated by caller if necessary
+	 *               - TSIZE need change if !base page size, not
+	 *                 yet implemented for now
+	 * MAS 2   :	Defaults not useful, need to be redone
+	 * MAS 3+7 :	Needs to be done
+	 *
+	 * TODO: mix up code below for better scheduling
+	 */
+	clrrdi	r11,r16,12		/* Clear low crap in EA */
+	rlwimi	r11,r14,32-19,27,31	/* Insert WIMGE */
+	mtspr	SPRN_MAS2,r11
+
+	/* Check page size, if not standard, update MAS1 */
+	rldicl	r11,r14,64-8,64-8
+#ifdef CONFIG_PPC_64K_PAGES
+	cmpldi	cr0,r11,BOOK3E_PAGESZ_64K
+#else
+	cmpldi	cr0,r11,BOOK3E_PAGESZ_4K
+#endif
+	beq-	1f
+	mfspr	r11,SPRN_MAS1
+	rlwimi	r11,r14,31,21,24
+	rlwinm	r11,r11,0,21,19
+	mtspr	SPRN_MAS1,r11
+1:
+	/* Move RPN in position */
+	rldicr	r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT
+	clrldi	r15,r11,12		/* Clear crap at the top */
+	rlwimi	r15,r14,32-8,22,25	/* Move in U bits */
+	rlwimi	r15,r14,32-2,26,31	/* Move in BAP bits */
+
+	/* Mask out SW and UW if !DIRTY (XXX optimize this !) */
+	andi.	r11,r14,_PAGE_DIRTY
+	bne	1f
+	li	r11,MAS3_SW|MAS3_UW
+	andc	r15,r15,r11
+1:
+BEGIN_MMU_FTR_SECTION
+	srdi	r16,r15,32
+	mtspr	SPRN_MAS3,r15
+	mtspr	SPRN_MAS7,r16
+MMU_FTR_SECTION_ELSE
+	mtspr	SPRN_MAS7_MAS3,r15
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
+
+	tlbwe
+
+normal_tlb_miss_done:
+	/* We don't bother with restoring DEAR or ESR since we know we are
+	 * level 0 and just going back to userland. They are only needed
+	 * if you are going to take an access fault
+	 */
+	TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK)
+	TLB_MISS_EPILOG_SUCCESS
+	rfi
+
+normal_tlb_miss_access_fault:
+	/* We need to check if it was an instruction miss */
+	andi.	r10,r11,_PAGE_EXEC
+	bne	1f
+	ld	r14,EX_TLB_DEAR(r12)
+	ld	r15,EX_TLB_ESR(r12)
+	mtspr	SPRN_DEAR,r14
+	mtspr	SPRN_ESR,r15
+	TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
+	TLB_MISS_EPILOG_ERROR
+	b	exc_data_storage_book3e
+1:	TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
+	TLB_MISS_EPILOG_ERROR
+	b	exc_instruction_storage_book3e
+
+
+/*
+ * This is the guts of the second-level TLB miss handler for direct
+ * misses. We are entered with:
+ *
+ * r16 = virtual page table faulting address
+ * r15 = region (top 4 bits of address)
+ * r14 = crap (free to use)
+ * r13 = PACA
+ * r12 = TLB exception frame in PACA
+ * r11 = crap (free to use)
+ * r10 = crap (free to use)
+ *
+ * Note that this should only ever be called as a second level handler
+ * with the current scheme when using SW load.
+ * That means we can always get the original fault DEAR at
+ * EX_TLB_DEAR-EX_TLB_SIZE(r12)
+ *
+ * It can be re-entered by the linear mapping miss handler. However, to
+ * avoid too much complication, it will restart the whole fault at level
+ * 0 so we don't care too much about clobbers
+ *
+ * XXX That code was written back when we couldn't clobber r14. We can now,
+ * so we could probably optimize things a bit
+ */
+virt_page_table_tlb_miss:
+	/* Are we hitting a kernel page table ? */
+	andi.	r10,r15,0x8
+
+	/* The cool thing now is that r10 contains 0 for user and 8 for kernel,
+	 * and we happen to have the swapper_pg_dir at offset 8 from the user
+	 * pgdir in the PACA :-).
+	 */
+	add	r11,r10,r13
+
+	/* If kernel, we need to clear MAS1 TID */
+	beq	1f
+	/* XXX replace the RMW cycles with immediate loads + writes */
+	mfspr	r10,SPRN_MAS1
+	rlwinm	r10,r10,0,16,1			/* Clear TID */
+	mtspr	SPRN_MAS1,r10
+1:
+BEGIN_MMU_FTR_SECTION
+	/* Search if we already have a TLB entry for that virtual address, and
+	 * if we do, bail out.
+	 */
+	PPC_TLBSRX_DOT(0,r16)
+	beq	virt_page_table_tlb_miss_done
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV)
+
+	/* Now, we need to walk the page tables. First check if we are in
+	 * range.
+	 */
+	rldicl.	r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4
+	bne-	virt_page_table_tlb_miss_fault
+
+	/* Get the PGD pointer */
+	ld	r15,PACAPGD(r11)
+	cmpldi	cr0,r15,0
+	beq-	virt_page_table_tlb_miss_fault
+
+	/* Get to PGD entry */
+	rldicl	r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3
+	clrrdi	r10,r11,3
+	ldx	r15,r10,r15
+	cmpldi	cr0,r15,0
+	beq	virt_page_table_tlb_miss_fault
+
+#ifndef CONFIG_PPC_64K_PAGES
+	/* Get to PUD entry */
+	rldicl	r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3
+	clrrdi	r10,r11,3
+	ldx	r15,r10,r15
+	cmpldi	cr0,r15,0
+	beq	virt_page_table_tlb_miss_fault
+#endif /* CONFIG_PPC_64K_PAGES */
+
+	/* Get to PMD entry */
+	rldicl	r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3
+	clrrdi	r10,r11,3
+	ldx	r15,r10,r15
+	cmpldi	cr0,r15,0
+	beq	virt_page_table_tlb_miss_fault
+
+	/* Ok, we're all right, we can now create a kernel translation for
+	 * a 4K or 64K page from r16 -> r15.
+	 */
+	/* Now we build the MAS:
+	 *
+	 * MAS 0   :	Fully setup with defaults in MAS4 and TLBnCFG
+	 * MAS 1   :	Almost fully setup
+	 *               - PID already updated by caller if necessary
+	 *               - TSIZE for now is base page size always
+	 * MAS 2   :	Use defaults
+	 * MAS 3+7 :	Needs to be done
+	 *
+	 * So we only do MAS 2 and 3 for now...
+	 */
+	clrldi	r11,r15,4		/* remove region ID from RPN */
+	ori	r10,r11,1		/* Or-in SR */
+
+BEGIN_MMU_FTR_SECTION
+	srdi	r16,r10,32
+	mtspr	SPRN_MAS3,r10
+	mtspr	SPRN_MAS7,r16
+MMU_FTR_SECTION_ELSE
+	mtspr	SPRN_MAS7_MAS3,r10
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
+
+	tlbwe
+
+BEGIN_MMU_FTR_SECTION
+virt_page_table_tlb_miss_done:
+
+	/* We have overriden MAS2:EPN but currently our primary TLB miss
+	 * handler will always restore it so that should not be an issue,
+	 * if we ever optimize the primary handler to not write MAS2 on
+	 * some cases, we'll have to restore MAS2:EPN here based on the
+	 * original fault's DEAR. If we do that we have to modify the
+	 * ITLB miss handler to also store SRR0 in the exception frame
+	 * as DEAR.
+	 *
+	 * However, one nasty thing we did is we cleared the reservation
+	 * (well, potentially we did). We do a trick here thus if we
+	 * are not a level 0 exception (we interrupted the TLB miss) we
+	 * offset the return address by -4 in order to replay the tlbsrx
+	 * instruction there
+	 */
+	subf	r10,r13,r12
+	cmpldi	cr0,r10,PACA_EXTLB+EX_TLB_SIZE
+	bne-	1f
+	ld	r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13)
+	addi	r10,r11,-4
+	std	r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13)
+1:
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV)
+	/* Return to caller, normal case */
+	TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK);
+	TLB_MISS_EPILOG_SUCCESS
+	rfi
+
+virt_page_table_tlb_miss_fault:
+	/* If we fault here, things are a little bit tricky. We need to call
+	 * either data or instruction store fault, and we need to retreive
+	 * the original fault address and ESR (for data).
+	 *
+	 * The thing is, we know that in normal circumstances, this is
+	 * always called as a second level tlb miss for SW load or as a first
+	 * level TLB miss for HW load, so we should be able to peek at the
+	 * relevant informations in the first exception frame in the PACA.
+	 *
+	 * However, we do need to double check that, because we may just hit
+	 * a stray kernel pointer or a userland attack trying to hit those
+	 * areas. If that is the case, we do a data fault. (We can't get here
+	 * from an instruction tlb miss anyway).
+	 *
+	 * Note also that when going to a fault, we must unwind the previous
+	 * level as well. Since we are doing that, we don't need to clear or
+	 * restore the TLB reservation neither.
+	 */
+	subf	r10,r13,r12
+	cmpldi	cr0,r10,PACA_EXTLB+EX_TLB_SIZE
+	bne-	virt_page_table_tlb_miss_whacko_fault
+
+	/* We dig the original DEAR and ESR from slot 0 */
+	ld	r15,EX_TLB_DEAR+PACA_EXTLB(r13)
+	ld	r16,EX_TLB_ESR+PACA_EXTLB(r13)
+
+	/* We check for the "special" ESR value for instruction faults */
+	cmpdi	cr0,r16,-1
+	beq	1f
+	mtspr	SPRN_DEAR,r15
+	mtspr	SPRN_ESR,r16
+	TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT);
+	TLB_MISS_EPILOG_ERROR
+	b	exc_data_storage_book3e
+1:	TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT);
+	TLB_MISS_EPILOG_ERROR
+	b	exc_instruction_storage_book3e
+
+virt_page_table_tlb_miss_whacko_fault:
+	/* The linear fault will restart everything so ESR and DEAR will
+	 * not have been clobbered, let's just fault with what we have
+	 */
+	TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_FAULT);
+	TLB_MISS_EPILOG_ERROR
+	b	exc_data_storage_book3e
+
+
+/**************************************************************
+ *                                                            *
+ * TLB miss handling for Book3E with hw page table support    *
+ *                                                            *
+ **************************************************************/
+
+
+/* Data TLB miss */
+	START_EXCEPTION(data_tlb_miss_htw)
+	TLB_MISS_PROLOG
+
+	/* Now we handle the fault proper. We only save DEAR in normal
+	 * fault case since that's the only interesting values here.
+	 * We could probably also optimize by not saving SRR0/1 in the
+	 * linear mapping case but I'll leave that for later
+	 */
+	mfspr	r14,SPRN_ESR
+	mfspr	r16,SPRN_DEAR		/* get faulting address */
+	srdi	r11,r16,60		/* get region */
+	cmpldi	cr0,r11,0xc		/* linear mapping ? */
+	TLB_MISS_STATS_SAVE_INFO
+	beq	tlb_load_linear		/* yes -> go to linear map load */
+
+	/* We do the user/kernel test for the PID here along with the RW test
+	 */
+	cmpldi	cr0,r11,0		/* Check for user region */
+	ld	r15,PACAPGD(r13)	/* Load user pgdir */
+	beq	htw_tlb_miss
+
+	/* XXX replace the RMW cycles with immediate loads + writes */
+1:	mfspr	r10,SPRN_MAS1
+	cmpldi	cr0,r11,8		/* Check for vmalloc region */
+	rlwinm	r10,r10,0,16,1		/* Clear TID */
+	mtspr	SPRN_MAS1,r10
+	ld	r15,PACA_KERNELPGD(r13)	/* Load kernel pgdir */
+	beq+	htw_tlb_miss
+
+	/* We got a crappy address, just fault with whatever DEAR and ESR
+	 * are here
+	 */
+	TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
+	TLB_MISS_EPILOG_ERROR
+	b	exc_data_storage_book3e
+
+/* Instruction TLB miss */
+	START_EXCEPTION(instruction_tlb_miss_htw)
+	TLB_MISS_PROLOG
+
+	/* If we take a recursive fault, the second level handler may need
+	 * to know whether we are handling a data or instruction fault in
+	 * order to get to the right store fault handler. We provide that
+	 * info by keeping a crazy value for ESR in r14
+	 */
+	li	r14,-1	/* store to exception frame is done later */
+
+	/* Now we handle the fault proper. We only save DEAR in the non
+	 * linear mapping case since we know the linear mapping case will
+	 * not re-enter. We could indeed optimize and also not save SRR0/1
+	 * in the linear mapping case but I'll leave that for later
+	 *
+	 * Faulting address is SRR0 which is already in r16
+	 */
+	srdi	r11,r16,60		/* get region */
+	cmpldi	cr0,r11,0xc		/* linear mapping ? */
+	TLB_MISS_STATS_SAVE_INFO
+	beq	tlb_load_linear		/* yes -> go to linear map load */
+
+	/* We do the user/kernel test for the PID here along with the RW test
+	 */
+	cmpldi	cr0,r11,0			/* Check for user region */
+	ld	r15,PACAPGD(r13)		/* Load user pgdir */
+	beq	htw_tlb_miss
+
+	/* XXX replace the RMW cycles with immediate loads + writes */
+1:	mfspr	r10,SPRN_MAS1
+	cmpldi	cr0,r11,8			/* Check for vmalloc region */
+	rlwinm	r10,r10,0,16,1			/* Clear TID */
+	mtspr	SPRN_MAS1,r10
+	ld	r15,PACA_KERNELPGD(r13)		/* Load kernel pgdir */
+	beq+	htw_tlb_miss
+
+	/* We got a crappy address, just fault */
+	TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
+	TLB_MISS_EPILOG_ERROR
+	b	exc_instruction_storage_book3e
+
+
+/*
+ * This is the guts of the second-level TLB miss handler for direct
+ * misses. We are entered with:
+ *
+ * r16 = virtual page table faulting address
+ * r15 = PGD pointer
+ * r14 = ESR
+ * r13 = PACA
+ * r12 = TLB exception frame in PACA
+ * r11 = crap (free to use)
+ * r10 = crap (free to use)
+ *
+ * It can be re-entered by the linear mapping miss handler. However, to
+ * avoid too much complication, it will save/restore things for us
+ */
+htw_tlb_miss:
+	/* Search if we already have a TLB entry for that virtual address, and
+	 * if we do, bail out.
+	 *
+	 * MAS1:IND should be already set based on MAS4
+	 */
+	PPC_TLBSRX_DOT(0,r16)
+	beq	htw_tlb_miss_done
+
+	/* Now, we need to walk the page tables. First check if we are in
+	 * range.
+	 */
+	rldicl.	r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
+	bne-	htw_tlb_miss_fault
+
+	/* Get the PGD pointer */
+	cmpldi	cr0,r15,0
+	beq-	htw_tlb_miss_fault
+
+	/* Get to PGD entry */
+	rldicl	r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3
+	clrrdi	r10,r11,3
+	ldx	r15,r10,r15
+	cmpldi	cr0,r15,0
+	beq	htw_tlb_miss_fault
+
+#ifndef CONFIG_PPC_64K_PAGES
+	/* Get to PUD entry */
+	rldicl	r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3
+	clrrdi	r10,r11,3
+	ldx	r15,r10,r15
+	cmpldi	cr0,r15,0
+	beq	htw_tlb_miss_fault
+#endif /* CONFIG_PPC_64K_PAGES */
+
+	/* Get to PMD entry */
+	rldicl	r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3
+	clrrdi	r10,r11,3
+	ldx	r15,r10,r15
+	cmpldi	cr0,r15,0
+	beq	htw_tlb_miss_fault
+
+	/* Ok, we're all right, we can now create an indirect entry for
+	 * a 1M or 256M page.
+	 *
+	 * The last trick is now that because we use "half" pages for
+	 * the HTW (1M IND is 2K and 256M IND is 32K) we need to account
+	 * for an added LSB bit to the RPN. For 64K pages, there is no
+	 * problem as we already use 32K arrays (half PTE pages), but for
+	 * 4K page we need to extract a bit from the virtual address and
+	 * insert it into the "PA52" bit of the RPN.
+	 */
+#ifndef CONFIG_PPC_64K_PAGES
+	rlwimi	r15,r16,32-9,20,20
+#endif
+	/* Now we build the MAS:
+	 *
+	 * MAS 0   :	Fully setup with defaults in MAS4 and TLBnCFG
+	 * MAS 1   :	Almost fully setup
+	 *               - PID already updated by caller if necessary
+	 *               - TSIZE for now is base ind page size always
+	 * MAS 2   :	Use defaults
+	 * MAS 3+7 :	Needs to be done
+	 */
+#ifdef CONFIG_PPC_64K_PAGES
+	ori	r10,r15,(BOOK3E_PAGESZ_64K << MAS3_SPSIZE_SHIFT)
+#else
+	ori	r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT)
+#endif
+
+BEGIN_MMU_FTR_SECTION
+	srdi	r16,r10,32
+	mtspr	SPRN_MAS3,r10
+	mtspr	SPRN_MAS7,r16
+MMU_FTR_SECTION_ELSE
+	mtspr	SPRN_MAS7_MAS3,r10
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
+
+	tlbwe
+
+htw_tlb_miss_done:
+	/* We don't bother with restoring DEAR or ESR since we know we are
+	 * level 0 and just going back to userland. They are only needed
+	 * if you are going to take an access fault
+	 */
+	TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK)
+	TLB_MISS_EPILOG_SUCCESS
+	rfi
+
+htw_tlb_miss_fault:
+	/* We need to check if it was an instruction miss. We know this
+	 * though because r14 would contain -1
+	 */
+	cmpdi	cr0,r14,-1
+	beq	1f
+	mtspr	SPRN_DEAR,r16
+	mtspr	SPRN_ESR,r14
+	TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT)
+	TLB_MISS_EPILOG_ERROR
+	b	exc_data_storage_book3e
+1:	TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT)
+	TLB_MISS_EPILOG_ERROR
+	b	exc_instruction_storage_book3e
+
+/*
+ * This is the guts of "any" level TLB miss handler for kernel linear
+ * mapping misses. We are entered with:
+ *
+ *
+ * r16 = faulting address
+ * r15 = crap (free to use)
+ * r14 = ESR (data) or -1 (instruction)
+ * r13 = PACA
+ * r12 = TLB exception frame in PACA
+ * r11 = crap (free to use)
+ * r10 = crap (free to use)
+ *
+ * In addition we know that we will not re-enter, so in theory, we could
+ * use a simpler epilog not restoring SRR0/1 etc.. but we'll do that later.
+ *
+ * We also need to be careful about MAS registers here & TLB reservation,
+ * as we know we'll have clobbered them if we interrupt the main TLB miss
+ * handlers in which case we probably want to do a full restart at level
+ * 0 rather than saving / restoring the MAS.
+ *
+ * Note: If we care about performance of that core, we can easily shuffle
+ *       a few things around
+ */
+tlb_load_linear:
+	/* For now, we assume the linear mapping is contiguous and stops at
+	 * linear_map_top. We also assume the size is a multiple of 1G, thus
+	 * we only use 1G pages for now. That might have to be changed in a
+	 * final implementation, especially when dealing with hypervisors
+	 */
+	ld	r11,PACATOC(r13)
+	ld	r11,linear_map_top@got(r11)
+	ld	r10,0(r11)
+	cmpld	cr0,r10,r16
+	bge	tlb_load_linear_fault
+
+	/* MAS1 need whole new setup. */
+	li	r15,(BOOK3E_PAGESZ_1GB<<MAS1_TSIZE_SHIFT)
+	oris	r15,r15,MAS1_VALID@h	/* MAS1 needs V and TSIZE */
+	mtspr	SPRN_MAS1,r15
+
+	/* Already somebody there ? */
+	PPC_TLBSRX_DOT(0,r16)
+	beq	tlb_load_linear_done
+
+	/* Now we build the remaining MAS. MAS0 and 2 should be fine
+	 * with their defaults, which leaves us with MAS 3 and 7. The
+	 * mapping is linear, so we just take the address, clear the
+	 * region bits, and or in the permission bits which are currently
+	 * hard wired
+	 */
+	clrrdi	r10,r16,30		/* 1G page index */
+	clrldi	r10,r10,4		/* clear region bits */
+	ori	r10,r10,MAS3_SR|MAS3_SW|MAS3_SX
+
+BEGIN_MMU_FTR_SECTION
+	srdi	r16,r10,32
+	mtspr	SPRN_MAS3,r10
+	mtspr	SPRN_MAS7,r16
+MMU_FTR_SECTION_ELSE
+	mtspr	SPRN_MAS7_MAS3,r10
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
+
+	tlbwe
+
+tlb_load_linear_done:
+	/* We use the "error" epilog for success as we do want to
+	 * restore to the initial faulting context, whatever it was.
+	 * We do that because we can't resume a fault within a TLB
+	 * miss handler, due to MAS and TLB reservation being clobbered.
+	 */
+	TLB_MISS_STATS_X(MMSTAT_TLB_MISS_LINEAR)
+	TLB_MISS_EPILOG_ERROR
+	rfi
+
+tlb_load_linear_fault:
+	/* We keep the DEAR and ESR around, this shouldn't have happened */
+	cmpdi	cr0,r14,-1
+	beq	1f
+	TLB_MISS_EPILOG_ERROR_SPECIAL
+	b	exc_data_storage_book3e
+1:	TLB_MISS_EPILOG_ERROR_SPECIAL
+	b	exc_instruction_storage_book3e
+
+
+#ifdef CONFIG_BOOK3E_MMU_TLB_STATS
+.tlb_stat_inc:
+1:	ldarx	r8,0,r9
+	addi	r8,r8,1
+	stdcx.	r8,0,r9
+	bne-	1b
+	blr
+#endif
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index ad2eb4d34dd4..2fbc680c2c71 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -7,8 +7,8 @@
  *
  *  -- BenH
  *
- * Copyright 2008 Ben Herrenschmidt <benh@kernel.crashing.org>
- *                IBM Corp.
+ * Copyright 2008,2009 Ben Herrenschmidt <benh@kernel.crashing.org>
+ *                     IBM Corp.
  *
  *  Derived from arch/ppc/mm/init.c:
  *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
@@ -34,12 +34,71 @@
 #include <linux/pagemap.h>
 #include <linux/preempt.h>
 #include <linux/spinlock.h>
+#include <linux/lmb.h>
 
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
+#include <asm/code-patching.h>
 
 #include "mmu_decl.h"
 
+#ifdef CONFIG_PPC_BOOK3E
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
+	[MMU_PAGE_4K] = {
+		.shift	= 12,
+		.enc	= BOOK3E_PAGESZ_4K,
+	},
+	[MMU_PAGE_16K] = {
+		.shift	= 14,
+		.enc	= BOOK3E_PAGESZ_16K,
+	},
+	[MMU_PAGE_64K] = {
+		.shift	= 16,
+		.enc	= BOOK3E_PAGESZ_64K,
+	},
+	[MMU_PAGE_1M] = {
+		.shift	= 20,
+		.enc	= BOOK3E_PAGESZ_1M,
+	},
+	[MMU_PAGE_16M] = {
+		.shift	= 24,
+		.enc	= BOOK3E_PAGESZ_16M,
+	},
+	[MMU_PAGE_256M] = {
+		.shift	= 28,
+		.enc	= BOOK3E_PAGESZ_256M,
+	},
+	[MMU_PAGE_1G] = {
+		.shift	= 30,
+		.enc	= BOOK3E_PAGESZ_1GB,
+	},
+};
+static inline int mmu_get_tsize(int psize)
+{
+	return mmu_psize_defs[psize].enc;
+}
+#else
+static inline int mmu_get_tsize(int psize)
+{
+	/* This isn't used on !Book3E for now */
+	return 0;
+}
+#endif
+
+/* The variables below are currently only used on 64-bit Book3E
+ * though this will probably be made common with other nohash
+ * implementations at some point
+ */
+#ifdef CONFIG_PPC64
+
+int mmu_linear_psize;		/* Page size used for the linear mapping */
+int mmu_pte_psize;		/* Page size used for PTE pages */
+int mmu_vmemmap_psize;		/* Page size used for the virtual mem map */
+int book3e_htw_enabled;		/* Is HW tablewalk enabled ? */
+unsigned long linear_map_top;	/* Top of linear mapping */
+
+#endif /* CONFIG_PPC64 */
+
 /*
  * Base TLB flushing operations:
  *
@@ -67,18 +126,24 @@ void local_flush_tlb_mm(struct mm_struct *mm)
 }
 EXPORT_SYMBOL(local_flush_tlb_mm);
 
-void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
+			    int tsize, int ind)
 {
 	unsigned int pid;
 
 	preempt_disable();
-	pid = vma ? vma->vm_mm->context.id : 0;
+	pid = mm ? mm->context.id : 0;
 	if (pid != MMU_NO_CONTEXT)
-		_tlbil_va(vmaddr, pid);
+		_tlbil_va(vmaddr, pid, tsize, ind);
 	preempt_enable();
 }
-EXPORT_SYMBOL(local_flush_tlb_page);
 
+void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+	__local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
+			       mmu_get_tsize(mmu_virtual_psize), 0);
+}
+EXPORT_SYMBOL(local_flush_tlb_page);
 
 /*
  * And here are the SMP non-local implementations
@@ -87,9 +152,17 @@ EXPORT_SYMBOL(local_flush_tlb_page);
 
 static DEFINE_SPINLOCK(tlbivax_lock);
 
+static int mm_is_core_local(struct mm_struct *mm)
+{
+	return cpumask_subset(mm_cpumask(mm),
+			      topology_thread_cpumask(smp_processor_id()));
+}
+
 struct tlb_flush_param {
 	unsigned long addr;
 	unsigned int pid;
+	unsigned int tsize;
+	unsigned int ind;
 };
 
 static void do_flush_tlb_mm_ipi(void *param)
@@ -103,7 +176,7 @@ static void do_flush_tlb_page_ipi(void *param)
 {
 	struct tlb_flush_param *p = param;
 
-	_tlbil_va(p->addr, p->pid);
+	_tlbil_va(p->addr, p->pid, p->tsize, p->ind);
 }
 
 
@@ -131,7 +204,7 @@ void flush_tlb_mm(struct mm_struct *mm)
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
 		goto no_context;
-	if (!cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
+	if (!mm_is_core_local(mm)) {
 		struct tlb_flush_param p = { .pid = pid };
 		/* Ignores smp_processor_id() even if set. */
 		smp_call_function_many(mm_cpumask(mm),
@@ -143,37 +216,49 @@ void flush_tlb_mm(struct mm_struct *mm)
 }
 EXPORT_SYMBOL(flush_tlb_mm);
 
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
+		      int tsize, int ind)
 {
 	struct cpumask *cpu_mask;
 	unsigned int pid;
 
 	preempt_disable();
-	pid = vma ? vma->vm_mm->context.id : 0;
+	pid = mm ? mm->context.id : 0;
 	if (unlikely(pid == MMU_NO_CONTEXT))
 		goto bail;
-	cpu_mask = mm_cpumask(vma->vm_mm);
-	if (!cpumask_equal(cpu_mask, cpumask_of(smp_processor_id()))) {
+	cpu_mask = mm_cpumask(mm);
+	if (!mm_is_core_local(mm)) {
 		/* If broadcast tlbivax is supported, use it */
 		if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) {
 			int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL);
 			if (lock)
 				spin_lock(&tlbivax_lock);
-			_tlbivax_bcast(vmaddr, pid);
+			_tlbivax_bcast(vmaddr, pid, tsize, ind);
 			if (lock)
 				spin_unlock(&tlbivax_lock);
 			goto bail;
 		} else {
-			struct tlb_flush_param p = { .pid = pid, .addr = vmaddr };
+			struct tlb_flush_param p = {
+				.pid = pid,
+				.addr = vmaddr,
+				.tsize = tsize,
+				.ind = ind,
+			};
 			/* Ignores smp_processor_id() even if set in cpu_mask */
 			smp_call_function_many(cpu_mask,
 					       do_flush_tlb_page_ipi, &p, 1);
 		}
 	}
-	_tlbil_va(vmaddr, pid);
+	_tlbil_va(vmaddr, pid, tsize, ind);
  bail:
 	preempt_enable();
 }
+
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+	__flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
+			 mmu_get_tsize(mmu_virtual_psize), 0);
+}
 EXPORT_SYMBOL(flush_tlb_page);
 
 #endif /* CONFIG_SMP */
@@ -207,3 +292,156 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 	flush_tlb_mm(vma->vm_mm);
 }
 EXPORT_SYMBOL(flush_tlb_range);
+
+void tlb_flush(struct mmu_gather *tlb)
+{
+	flush_tlb_mm(tlb->mm);
+
+	/* Push out batch of freed page tables */
+	pte_free_finish();
+}
+
+/*
+ * Below are functions specific to the 64-bit variant of Book3E though that
+ * may change in the future
+ */
+
+#ifdef CONFIG_PPC64
+
+/*
+ * Handling of virtual linear page tables or indirect TLB entries
+ * flushing when PTE pages are freed
+ */
+void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
+{
+	int tsize = mmu_psize_defs[mmu_pte_psize].enc;
+
+	if (book3e_htw_enabled) {
+		unsigned long start = address & PMD_MASK;
+		unsigned long end = address + PMD_SIZE;
+		unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift;
+
+		/* This isn't the most optimal, ideally we would factor out the
+		 * while preempt & CPU mask mucking around, or even the IPI but
+		 * it will do for now
+		 */
+		while (start < end) {
+			__flush_tlb_page(tlb->mm, start, tsize, 1);
+			start += size;
+		}
+	} else {
+		unsigned long rmask = 0xf000000000000000ul;
+		unsigned long rid = (address & rmask) | 0x1000000000000000ul;
+		unsigned long vpte = address & ~rmask;
+
+#ifdef CONFIG_PPC_64K_PAGES
+		vpte = (vpte >> (PAGE_SHIFT - 4)) & ~0xfffful;
+#else
+		vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful;
+#endif
+		vpte |= rid;
+		__flush_tlb_page(tlb->mm, vpte, tsize, 0);
+	}
+}
+
+/*
+ * Early initialization of the MMU TLB code
+ */
+static void __early_init_mmu(int boot_cpu)
+{
+	extern unsigned int interrupt_base_book3e;
+	extern unsigned int exc_data_tlb_miss_htw_book3e;
+	extern unsigned int exc_instruction_tlb_miss_htw_book3e;
+
+	unsigned int *ibase = &interrupt_base_book3e;
+	unsigned int mas4;
+
+	/* XXX This will have to be decided at runtime, but right
+	 * now our boot and TLB miss code hard wires it. Ideally
+	 * we should find out a suitable page size and patch the
+	 * TLB miss code (either that or use the PACA to store
+	 * the value we want)
+	 */
+	mmu_linear_psize = MMU_PAGE_1G;
+
+	/* XXX This should be decided at runtime based on supported
+	 * page sizes in the TLB, but for now let's assume 16M is
+	 * always there and a good fit (which it probably is)
+	 */
+	mmu_vmemmap_psize = MMU_PAGE_16M;
+
+	/* Check if HW tablewalk is present, and if yes, enable it by:
+	 *
+	 * - patching the TLB miss handlers to branch to the
+	 *   one dedicates to it
+	 *
+	 * - setting the global book3e_htw_enabled
+	 *
+	 * - Set MAS4:INDD and default page size
+	 */
+
+	/* XXX This code only checks for TLB 0 capabilities and doesn't
+	 *     check what page size combos are supported by the HW. It
+	 *     also doesn't handle the case where a separate array holds
+	 *     the IND entries from the array loaded by the PT.
+	 */
+	if (boot_cpu) {
+		unsigned int tlb0cfg = mfspr(SPRN_TLB0CFG);
+
+		/* Check if HW loader is supported */
+		if ((tlb0cfg & TLBnCFG_IND) &&
+		    (tlb0cfg & TLBnCFG_PT)) {
+			patch_branch(ibase + (0x1c0 / 4),
+			     (unsigned long)&exc_data_tlb_miss_htw_book3e, 0);
+			patch_branch(ibase + (0x1e0 / 4),
+			     (unsigned long)&exc_instruction_tlb_miss_htw_book3e, 0);
+			book3e_htw_enabled = 1;
+		}
+		pr_info("MMU: Book3E Page Tables %s\n",
+			book3e_htw_enabled ? "Enabled" : "Disabled");
+	}
+
+	/* Set MAS4 based on page table setting */
+
+	mas4 = 0x4 << MAS4_WIMGED_SHIFT;
+	if (book3e_htw_enabled) {
+		mas4 |= mas4 | MAS4_INDD;
+#ifdef CONFIG_PPC_64K_PAGES
+		mas4 |=	BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT;
+		mmu_pte_psize = MMU_PAGE_256M;
+#else
+		mas4 |=	BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT;
+		mmu_pte_psize = MMU_PAGE_1M;
+#endif
+	} else {
+#ifdef CONFIG_PPC_64K_PAGES
+		mas4 |=	BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT;
+#else
+		mas4 |=	BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT;
+#endif
+		mmu_pte_psize = mmu_virtual_psize;
+	}
+	mtspr(SPRN_MAS4, mas4);
+
+	/* Set the global containing the top of the linear mapping
+	 * for use by the TLB miss code
+	 */
+	linear_map_top = lmb_end_of_DRAM();
+
+	/* A sync won't hurt us after mucking around with
+	 * the MMU configuration
+	 */
+	mb();
+}
+
+void __init early_init_mmu(void)
+{
+	__early_init_mmu(1);
+}
+
+void __cpuinit early_init_mmu_secondary(void)
+{
+	__early_init_mmu(0);
+}
+
+#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S
index 3037911279b1..bbdc5b577b85 100644
--- a/arch/powerpc/mm/tlb_nohash_low.S
+++ b/arch/powerpc/mm/tlb_nohash_low.S
@@ -39,7 +39,7 @@
 /*
  * 40x implementation needs only tlbil_va
  */
-_GLOBAL(_tlbil_va)
+_GLOBAL(__tlbil_va)
 	/* We run the search with interrupts disabled because we have to change
 	 * the PID and I don't want to preempt when that happens.
 	 */
@@ -71,7 +71,7 @@ _GLOBAL(_tlbil_va)
  * 440 implementation uses tlbsx/we for tlbil_va and a full sweep
  * of the TLB for everything else.
  */
-_GLOBAL(_tlbil_va)
+_GLOBAL(__tlbil_va)
 	mfspr	r5,SPRN_MMUCR
 	rlwimi	r5,r4,0,24,31			/* Set TID */
 
@@ -124,8 +124,6 @@ _GLOBAL(_tlbil_pid)
  * to have the larger code path before the _SECTION_ELSE
  */
 
-#define MMUCSR0_TLBFI	(MMUCSR0_TLB0FI | MMUCSR0_TLB1FI | \
-			 MMUCSR0_TLB2FI | MMUCSR0_TLB3FI)
 /*
  * Flush MMU TLB on the local processor
  */
@@ -170,7 +168,7 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBILX)
  * Flush MMU TLB for a particular address, but only on the local processor
  * (no broadcast)
  */
-_GLOBAL(_tlbil_va)
+_GLOBAL(__tlbil_va)
 	mfmsr	r10
 	wrteei	0
 	slwi	r4,r4,16
@@ -191,6 +189,85 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX)
 	isync
 1:	wrtee	r10
 	blr
+#elif defined(CONFIG_PPC_BOOK3E)
+/*
+ * New Book3E (>= 2.06) implementation
+ *
+ * Note: We may be able to get away without the interrupt masking stuff
+ * if we save/restore MAS6 on exceptions that might modify it
+ */
+_GLOBAL(_tlbil_pid)
+	slwi	r4,r3,MAS6_SPID_SHIFT
+	mfmsr	r10
+	wrteei	0
+	mtspr	SPRN_MAS6,r4
+	PPC_TLBILX_PID(0,0)
+	wrtee	r10
+	msync
+	isync
+	blr
+
+_GLOBAL(_tlbil_pid_noind)
+	slwi	r4,r3,MAS6_SPID_SHIFT
+	mfmsr	r10
+	ori	r4,r4,MAS6_SIND
+	wrteei	0
+	mtspr	SPRN_MAS6,r4
+	PPC_TLBILX_PID(0,0)
+	wrtee	r10
+	msync
+	isync
+	blr
+
+_GLOBAL(_tlbil_all)
+	PPC_TLBILX_ALL(0,0)
+	msync
+	isync
+	blr
+
+_GLOBAL(_tlbil_va)
+	mfmsr	r10
+	wrteei	0
+	cmpwi	cr0,r6,0
+	slwi	r4,r4,MAS6_SPID_SHIFT
+	rlwimi	r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK
+	beq	1f
+	rlwimi	r4,r6,MAS6_SIND_SHIFT,MAS6_SIND
+1:	mtspr	SPRN_MAS6,r4		/* assume AS=0 for now */
+	PPC_TLBILX_VA(0,r3)
+	msync
+	isync
+	wrtee	r10
+	blr
+
+_GLOBAL(_tlbivax_bcast)
+	mfmsr	r10
+	wrteei	0
+	cmpwi	cr0,r6,0
+	slwi	r4,r4,MAS6_SPID_SHIFT
+	rlwimi	r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK
+	beq	1f
+	rlwimi	r4,r6,MAS6_SIND_SHIFT,MAS6_SIND
+1:	mtspr	SPRN_MAS6,r4		/* assume AS=0 for now */
+	PPC_TLBIVAX(0,r3)
+	eieio
+	tlbsync
+	sync
+	wrtee	r10
+	blr
+
+_GLOBAL(set_context)
+#ifdef CONFIG_BDI_SWITCH
+	/* Context switch the PTE pointer for the Abatron BDI2000.
+	 * The PGDIR is the second parameter.
+	 */
+	lis	r5, abatron_pteptrs@h
+	ori	r5, r5, abatron_pteptrs@l
+	stw	r4, 0x4(r5)
+#endif
+	mtspr	SPRN_PID,r3
+	isync			/* Force context change */
+	blr
 #else
 #error Unsupported processor type !
 #endif