diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index ba3fff06fad1..1d97fe1c0e53 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -326,7 +326,7 @@ static void smp_ext_bitcall(int cpu, ec_bit_sig sig)
  */
 void smp_ptlb_callback(void *info)
 {
-	local_flush_tlb();
+	__tlb_flush_local();
 }
 
 void smp_ptlb_all(void)
diff --git a/include/asm-s390/pgalloc.h b/include/asm-s390/pgalloc.h
index e45d3c9a4b7e..6cbbfe4f6749 100644
--- a/include/asm-s390/pgalloc.h
+++ b/include/asm-s390/pgalloc.h
@@ -82,7 +82,6 @@ static inline void pgd_free(pgd_t *pgd)
  */
 #define pmd_alloc_one(mm,address)       ({ BUG(); ((pmd_t *)2); })
 #define pmd_free(x)                     do { } while (0)
-#define __pmd_free_tlb(tlb,x)		do { } while (0)
 #define pgd_populate(mm, pmd, pte)      BUG()
 #define pgd_populate_kernel(mm, pmd, pte)	BUG()
 #else /* __s390x__ */
@@ -118,12 +117,6 @@ static inline void pmd_free (pmd_t *pmd)
 	free_pages((unsigned long) pmd, PMD_ALLOC_ORDER);
 }
 
-#define __pmd_free_tlb(tlb,pmd)			\
-	do {					\
-		tlb_flush_mmu(tlb, 0, 0);	\
-		pmd_free(pmd);			\
-	 } while (0)
-
 static inline void
 pgd_populate_kernel(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd)
 {
@@ -224,14 +217,4 @@ static inline void pte_free(struct page *pte)
 	__free_page(pte);
 }
 
-#define __pte_free_tlb(tlb, pte)					\
-({									\
-	struct mmu_gather *__tlb = (tlb);				\
-	struct page *__pte = (pte);					\
-	struct page *shadow_page = get_shadow_page(__pte);		\
-	if (shadow_page)						\
-		tlb_remove_page(__tlb, shadow_page);			\
-	tlb_remove_page(__tlb, __pte);					\
-})
-
 #endif /* _S390_PGALLOC_H */
diff --git a/include/asm-s390/pgtable.h b/include/asm-s390/pgtable.h
index 39bb5192dc31..b424ab21f8bd 100644
--- a/include/asm-s390/pgtable.h
+++ b/include/asm-s390/pgtable.h
@@ -424,7 +424,8 @@ static inline pgd_t *get_shadow_pgd(pgd_t *pgdp)
  * within a page table are directly modified.  Thus, the following
  * hook is made available.
  */
-static inline void set_pte(pte_t *pteptr, pte_t pteval)
+static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
+			      pte_t *pteptr, pte_t pteval)
 {
 	pte_t *shadow_pte = get_shadow_pte(pteptr);
 
@@ -437,7 +438,6 @@ static inline void set_pte(pte_t *pteptr, pte_t pteval)
 			pte_val(*shadow_pte) = _PAGE_TYPE_EMPTY;
 	}
 }
-#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval)
 
 /*
  * pgd/pmd/pte query functions
@@ -508,7 +508,8 @@ static inline int pte_file(pte_t pte)
 	return (pte_val(pte) & mask) == _PAGE_TYPE_FILE;
 }
 
-#define pte_same(a,b)	(pte_val(a) == pte_val(b))
+#define __HAVE_ARCH_PTE_SAME
+#define pte_same(a,b)  (pte_val(a) == pte_val(b))
 
 /*
  * query functions pte_write/pte_dirty/pte_young only work if
@@ -663,24 +664,19 @@ static inline pte_t pte_mkyoung(pte_t pte)
 	return pte;
 }
 
-static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
+					    unsigned long addr, pte_t *ptep)
 {
 	return 0;
 }
 
-static inline int
-ptep_clear_flush_young(struct vm_area_struct *vma,
-			unsigned long address, pte_t *ptep)
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
+					 unsigned long address, pte_t *ptep)
 {
 	/* No need to flush TLB; bits are in storage key */
-	return ptep_test_and_clear_young(vma, address, ptep);
-}
-
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-	pte_t pte = *ptep;
-	pte_clear(mm, addr, ptep);
-	return pte;
+	return 0;
 }
 
 static inline void __ptep_ipte(unsigned long address, pte_t *ptep)
@@ -709,6 +705,32 @@ static inline void ptep_invalidate(unsigned long address, pte_t *ptep)
 		__ptep_ipte(address, ptep);
 }
 
+/*
+ * This is hard to understand. ptep_get_and_clear and ptep_clear_flush
+ * both clear the TLB for the unmapped pte. The reason is that
+ * ptep_get_and_clear is used in common code (e.g. change_pte_range)
+ * to modify an active pte. The sequence is
+ *   1) ptep_get_and_clear
+ *   2) set_pte_at
+ *   3) flush_tlb_range
+ * On s390 the tlb needs to get flushed with the modification of the pte
+ * if the pte is active. The only way how this can be implemented is to
+ * have ptep_get_and_clear do the tlb flush. In exchange flush_tlb_range
+ * is a nop.
+ */
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+#define ptep_get_and_clear(__mm, __address, __ptep)			\
+({									\
+	pte_t __pte = *(__ptep);					\
+	if (atomic_read(&(__mm)->mm_users) > 1 ||			\
+	    (__mm) != current->active_mm)				\
+		ptep_invalidate(__address, __ptep);			\
+	else								\
+		pte_clear((__mm), (__address), (__ptep));		\
+	__pte;								\
+})
+
+#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
 static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
 				     unsigned long address, pte_t *ptep)
 {
@@ -717,12 +739,40 @@ static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
 	return pte;
 }
 
-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+/*
+ * The batched pte unmap code uses ptep_get_and_clear_full to clear the
+ * ptes. Here an optimization is possible. tlb_gather_mmu flushes all
+ * tlbs of an mm if it can guarantee that the ptes of the mm_struct
+ * cannot be accessed while the batched unmap is running. In this case
+ * full==1 and a simple pte_clear is enough. See tlb.h.
+ */
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
+static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
+					    unsigned long addr,
+					    pte_t *ptep, int full)
 {
-	pte_t old_pte = *ptep;
-	set_pte_at(mm, addr, ptep, pte_wrprotect(old_pte));
+	pte_t pte = *ptep;
+
+	if (full)
+		pte_clear(mm, addr, ptep);
+	else
+		ptep_invalidate(addr, ptep);
+	return pte;
 }
 
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+#define ptep_set_wrprotect(__mm, __addr, __ptep)			\
+({									\
+	pte_t __pte = *(__ptep);					\
+	if (pte_write(__pte)) {						\
+		if (atomic_read(&(__mm)->mm_users) > 1 ||		\
+		    (__mm) != current->active_mm)			\
+			ptep_invalidate(__addr, __ptep);		\
+		set_pte_at(__mm, __addr, __ptep, pte_wrprotect(__pte));	\
+	}								\
+})
+
+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
 #define ptep_set_access_flags(__vma, __addr, __ptep, __entry, __dirty)	\
 ({									\
 	int __changed = !pte_same(*(__ptep), __entry);			\
@@ -740,11 +790,13 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
  * should therefore only be called if it is not mapped in any
  * address space.
  */
+#define __HAVE_ARCH_PAGE_TEST_DIRTY
 static inline int page_test_dirty(struct page *page)
 {
 	return (page_get_storage_key(page_to_phys(page)) & _PAGE_CHANGED) != 0;
 }
 
+#define __HAVE_ARCH_PAGE_CLEAR_DIRTY
 static inline void page_clear_dirty(struct page *page)
 {
 	page_set_storage_key(page_to_phys(page), PAGE_DEFAULT_KEY);
@@ -753,6 +805,7 @@ static inline void page_clear_dirty(struct page *page)
 /*
  * Test and clear referenced bit in storage key.
  */
+#define __HAVE_ARCH_PAGE_TEST_AND_CLEAR_YOUNG
 static inline int page_test_and_clear_young(struct page *page)
 {
 	unsigned long physpage = page_to_phys(page);
@@ -930,16 +983,6 @@ extern int remove_shared_memory(unsigned long start, unsigned long size);
 #define __HAVE_ARCH_MEMMAP_INIT
 extern void memmap_init(unsigned long, int, unsigned long, unsigned long);
 
-#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
-#define __HAVE_ARCH_PTE_SAME
-#define __HAVE_ARCH_PAGE_TEST_DIRTY
-#define __HAVE_ARCH_PAGE_CLEAR_DIRTY
-#define __HAVE_ARCH_PAGE_TEST_AND_CLEAR_YOUNG
 #include <asm-generic/pgtable.h>
 
 #endif /* _S390_PAGE_H */
diff --git a/include/asm-s390/tlb.h b/include/asm-s390/tlb.h
index 51bd957b85bd..55ae45ef31b5 100644
--- a/include/asm-s390/tlb.h
+++ b/include/asm-s390/tlb.h
@@ -2,19 +2,128 @@
 #define _S390_TLB_H
 
 /*
- * s390 doesn't need any special per-pte or
- * per-vma handling..
+ * TLB flushing on s390 is complicated. The following requirement
+ * from the principles of operation is the most arduous:
+ *
+ * "A valid table entry must not be changed while it is attached
+ * to any CPU and may be used for translation by that CPU except to
+ * (1) invalidate the entry by using INVALIDATE PAGE TABLE ENTRY,
+ * or INVALIDATE DAT TABLE ENTRY, (2) alter bits 56-63 of a page
+ * table entry, or (3) make a change by means of a COMPARE AND SWAP
+ * AND PURGE instruction that purges the TLB."
+ *
+ * The modification of a pte of an active mm struct therefore is
+ * a two step process: i) invalidate the pte, ii) store the new pte.
+ * This is true for the page protection bit as well.
+ * The only possible optimization is to flush at the beginning of
+ * a tlb_gather_mmu cycle if the mm_struct is currently not in use.
+ *
+ * Pages used for the page tables is a different story. FIXME: more
  */
-#define tlb_start_vma(tlb, vma) do { } while (0)
-#define tlb_end_vma(tlb, vma) do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
+
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <asm/processor.h>
+#include <asm/pgalloc.h>
+#include <asm/smp.h>
+#include <asm/tlbflush.h>
+
+#ifndef CONFIG_SMP
+#define TLB_NR_PTRS	1
+#else
+#define TLB_NR_PTRS	508
+#endif
+
+struct mmu_gather {
+	struct mm_struct *mm;
+	unsigned int fullmm;
+	unsigned int nr_ptes;
+	unsigned int nr_pmds;
+	void *array[TLB_NR_PTRS];
+};
+
+DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
+
+static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm,
+						unsigned int full_mm_flush)
+{
+	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
+
+	tlb->mm = mm;
+	tlb->fullmm = full_mm_flush || (num_online_cpus() == 1) ||
+		(atomic_read(&mm->mm_users) <= 1 && mm == current->active_mm);
+	tlb->nr_ptes = 0;
+	tlb->nr_pmds = TLB_NR_PTRS;
+	if (tlb->fullmm)
+		__tlb_flush_mm(mm);
+	return tlb;
+}
+
+static inline void tlb_flush_mmu(struct mmu_gather *tlb,
+				 unsigned long start, unsigned long end)
+{
+	if (!tlb->fullmm && (tlb->nr_ptes > 0 || tlb->nr_pmds < TLB_NR_PTRS))
+		__tlb_flush_mm(tlb->mm);
+	while (tlb->nr_ptes > 0)
+		pte_free(tlb->array[--tlb->nr_ptes]);
+	while (tlb->nr_pmds < TLB_NR_PTRS)
+		pmd_free((pmd_t *) tlb->array[tlb->nr_pmds++]);
+}
+
+static inline void tlb_finish_mmu(struct mmu_gather *tlb,
+				  unsigned long start, unsigned long end)
+{
+	tlb_flush_mmu(tlb, start, end);
+
+	/* keep the page table cache within bounds */
+	check_pgt_cache();
+
+	put_cpu_var(mmu_gathers);
+}
 
 /*
- * .. because we flush the whole mm when it
- * fills up.
+ * Release the page cache reference for a pte removed by
+ * tlb_ptep_clear_flush. In both flush modes the tlb fo a page cache page
+ * has already been freed, so just do free_page_and_swap_cache.
  */
-#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
+static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+{
+	free_page_and_swap_cache(page);
+}
 
-#include <asm-generic/tlb.h>
+/*
+ * pte_free_tlb frees a pte table and clears the CRSTE for the
+ * page table from the tlb.
+ */
+static inline void pte_free_tlb(struct mmu_gather *tlb, struct page *page)
+{
+	if (!tlb->fullmm) {
+		tlb->array[tlb->nr_ptes++] = page;
+		if (tlb->nr_ptes >= tlb->nr_pmds)
+			tlb_flush_mmu(tlb, 0, 0);
+	} else
+		pte_free(page);
+}
 
+/*
+ * pmd_free_tlb frees a pmd table and clears the CRSTE for the
+ * segment table entry from the tlb.
+ */
+static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
+{
+#ifdef __s390x__
+	if (!tlb->fullmm) {
+		tlb->array[--tlb->nr_pmds] = (struct page *) pmd;
+		if (tlb->nr_ptes >= tlb->nr_pmds)
+			tlb_flush_mmu(tlb, 0, 0);
+	} else
+		pmd_free(pmd);
 #endif
+}
+
+#define tlb_start_vma(tlb, vma)			do { } while (0)
+#define tlb_end_vma(tlb, vma)			do { } while (0)
+#define tlb_remove_tlb_entry(tlb, ptep, addr)	do { } while (0)
+#define tlb_migrate_finish(mm)			do { } while (0)
+
+#endif /* _S390_TLB_H */
diff --git a/include/asm-s390/tlbflush.h b/include/asm-s390/tlbflush.h
index 6de2632a3e4f..3a9985fbc8af 100644
--- a/include/asm-s390/tlbflush.h
+++ b/include/asm-s390/tlbflush.h
@@ -6,68 +6,19 @@
 #include <asm/pgalloc.h>
 
 /*
- * TLB flushing:
- *
- *  - flush_tlb() flushes the current mm struct TLBs
- *  - flush_tlb_all() flushes all processes TLBs 
- *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
- *  - flush_tlb_page(vma, vmaddr) flushes one page
- *  - flush_tlb_range(vma, start, end) flushes a range of pages
- *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
+ * Flush all tlb entries on the local cpu.
  */
+static inline void __tlb_flush_local(void)
+{
+	asm volatile("ptlb" : : : "memory");
+}
 
 /*
- * S/390 has three ways of flushing TLBs
- * 'ptlb' does a flush of the local processor
- * 'csp' flushes the TLBs on all PUs of a SMP
- * 'ipte' invalidates a pte in a page table and flushes that out of
- * the TLBs of all PUs of a SMP
+ * Flush all tlb entries on all cpus.
  */
-
-#define local_flush_tlb() \
-do {  asm volatile("ptlb": : :"memory"); } while (0)
-
-#ifndef CONFIG_SMP
-
-/*
- * We always need to flush, since s390 does not flush tlb
- * on each context switch
- */
-
-static inline void flush_tlb(void)
-{
-	local_flush_tlb();
-}
-static inline void flush_tlb_all(void)
-{
-	local_flush_tlb();
-}
-static inline void flush_tlb_mm(struct mm_struct *mm) 
-{
-	local_flush_tlb();
-}
-static inline void flush_tlb_page(struct vm_area_struct *vma,
-				  unsigned long addr)
-{
-	local_flush_tlb();
-}
-static inline void flush_tlb_range(struct vm_area_struct *vma,
-				   unsigned long start, unsigned long end)
-{
-	local_flush_tlb();
-}
-
-#define flush_tlb_kernel_range(start, end) \
-	local_flush_tlb();
-
-#else
-
-#include <asm/smp.h>
-
-extern void smp_ptlb_all(void);
-
-static inline void global_flush_tlb(void)
+static inline void __tlb_flush_global(void)
 {
+	extern void smp_ptlb_all(void);
 	register unsigned long reg2 asm("2");
 	register unsigned long reg3 asm("3");
 	register unsigned long reg4 asm("4");
@@ -89,66 +40,75 @@ static inline void global_flush_tlb(void)
 }
 
 /*
- * We only have to do global flush of tlb if process run since last
- * flush on any other pu than current. 
- * If we have threads (mm->count > 1) we always do a global flush, 
- * since the process runs on more than one processor at the same time.
+ * Flush all tlb entries of a page table on all cpus.
  */
+static inline void __tlb_flush_idte(pgd_t *pgd)
+{
+	asm volatile(
+		"	.insn	rrf,0xb98e0000,0,%0,%1,0"
+		: : "a" (2048), "a" (__pa(pgd) & PAGE_MASK) : "cc" );
+}
 
-static inline void __flush_tlb_mm(struct mm_struct * mm)
+static inline void __tlb_flush_mm(struct mm_struct * mm)
 {
 	cpumask_t local_cpumask;
 
 	if (unlikely(cpus_empty(mm->cpu_vm_mask)))
 		return;
+	/*
+	 * If the machine has IDTE we prefer to do a per mm flush
+	 * on all cpus instead of doing a local flush if the mm
+	 * only ran on the local cpu.
+	 */
 	if (MACHINE_HAS_IDTE) {
 		pgd_t *shadow_pgd = get_shadow_pgd(mm->pgd);
 
-		if (shadow_pgd) {
-			asm volatile(
-				"	.insn	rrf,0xb98e0000,0,%0,%1,0"
-				: : "a" (2048),
-				"a" (__pa(shadow_pgd) & PAGE_MASK) : "cc" );
-		}
-		asm volatile(
-			"	.insn	rrf,0xb98e0000,0,%0,%1,0"
-			: : "a" (2048), "a" (__pa(mm->pgd)&PAGE_MASK) : "cc");
+		if (shadow_pgd)
+			__tlb_flush_idte(shadow_pgd);
+		__tlb_flush_idte(mm->pgd);
 		return;
 	}
 	preempt_disable();
+	/*
+	 * If the process only ran on the local cpu, do a local flush.
+	 */
 	local_cpumask = cpumask_of_cpu(smp_processor_id());
 	if (cpus_equal(mm->cpu_vm_mask, local_cpumask))
-		local_flush_tlb();
+		__tlb_flush_local();
 	else
-		global_flush_tlb();
+		__tlb_flush_global();
 	preempt_enable();
 }
 
-static inline void flush_tlb(void)
+static inline void __tlb_flush_mm_cond(struct mm_struct * mm)
 {
-	__flush_tlb_mm(current->mm);
-}
-static inline void flush_tlb_all(void)
-{
-	global_flush_tlb();
-}
-static inline void flush_tlb_mm(struct mm_struct *mm) 
-{
-	__flush_tlb_mm(mm); 
-}
-static inline void flush_tlb_page(struct vm_area_struct *vma,
-				  unsigned long addr)
-{
-	__flush_tlb_mm(vma->vm_mm);
-}
-static inline void flush_tlb_range(struct vm_area_struct *vma,
-				   unsigned long start, unsigned long end)
-{
-	__flush_tlb_mm(vma->vm_mm); 
+	if (atomic_read(&mm->mm_users) <= 1 && mm == current->active_mm)
+		__tlb_flush_mm(mm);
 }
 
-#define flush_tlb_kernel_range(start, end) global_flush_tlb()
+/*
+ * TLB flushing:
+ *  flush_tlb() - flushes the current mm struct TLBs
+ *  flush_tlb_all() - flushes all processes TLBs
+ *  flush_tlb_mm(mm) - flushes the specified mm context TLB's
+ *  flush_tlb_page(vma, vmaddr) - flushes one page
+ *  flush_tlb_range(vma, start, end) - flushes a range of pages
+ *  flush_tlb_kernel_range(start, end) - flushes a range of kernel pages
+ */
 
-#endif
+/*
+ * flush_tlb_mm goes together with ptep_set_wrprotect for the
+ * copy_page_range operation and flush_tlb_range is related to
+ * ptep_get_and_clear for change_protection. ptep_set_wrprotect and
+ * ptep_get_and_clear do not flush the TLBs directly if the mm has
+ * only one user. At the end of the update the flush_tlb_mm and
+ * flush_tlb_range functions need to do the flush.
+ */
+#define flush_tlb()				do { } while (0)
+#define flush_tlb_all()				do { } while (0)
+#define flush_tlb_mm(mm)			__tlb_flush_mm_cond(mm)
+#define flush_tlb_page(vma, addr)		do { } while (0)
+#define flush_tlb_range(vma, start, end)	__tlb_flush_mm_cond(mm)
+#define flush_tlb_kernel_range(start, end)	__tlb_flush_mm(&init_mm)
 
 #endif /* _S390_TLBFLUSH_H */