From a841c673f1352f607fd3ba85de6c9c49ff2c1e12 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 20 Feb 2019 22:18:52 -0800
Subject: [PATCH 01/23] revert "initramfs: cleanup incomplete rootfs"

Revert ff1522bb7d9845 ("initramfs: cleanup incomplete rootfs").

Andy reports

: This breaks my setup where I have U-boot provided more size of initramfs
: than needed.  This allows a bit of flexibility to increase or decrease
: initramfs compressed image without taking care of bootloader.  The proper
: solution is to do this if we sure that we didn't get enough memory,
: otherwise I can't consider the error fatal to clean up rootfs.

Fixes: ff1522bb7d9845 ("initramfs: cleanup incomplete rootfs")
Reported-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Tested-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Cc: David Engraf <david.engraf@sysgo.com>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/initramfs.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/init/initramfs.c b/init/initramfs.c
index 7cea802d00ef..fca899622937 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -550,6 +550,7 @@ skip:
 	initrd_end = 0;
 }
 
+#ifdef CONFIG_BLK_DEV_RAM
 #define BUF_SIZE 1024
 static void __init clean_rootfs(void)
 {
@@ -596,6 +597,7 @@ static void __init clean_rootfs(void)
 	ksys_close(fd);
 	kfree(buf);
 }
+#endif
 
 static int __init populate_rootfs(void)
 {
@@ -638,10 +640,8 @@ static int __init populate_rootfs(void)
 		printk(KERN_INFO "Unpacking initramfs...\n");
 		err = unpack_to_rootfs((char *)initrd_start,
 			initrd_end - initrd_start);
-		if (err) {
+		if (err)
 			printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err);
-			clean_rootfs();
-		}
 		free_initrd();
 #endif
 	}

From 050c17f239fd53adb55aa768d4f41bc76c0fe045 Mon Sep 17 00:00:00 2001
From: Ralph Campbell <rcampbell@nvidia.com>
Date: Wed, 20 Feb 2019 22:18:58 -0800
Subject: [PATCH 02/23] numa: change get_mempolicy() to use nr_node_ids instead
 of MAX_NUMNODES

The system call, get_mempolicy() [1], passes an unsigned long *nodemask
pointer and an unsigned long maxnode argument which specifies the length
of the user's nodemask array in bits (which is rounded up).  The manual
page says that if the maxnode value is too small, get_mempolicy will
return EINVAL but there is no system call to return this minimum value.
To determine this value, some programs search /proc/<pid>/status for a
line starting with "Mems_allowed:" and use the number of digits in the
mask to determine the minimum value.  A recent change to the way this line
is formatted [2] causes these programs to compute a value less than
MAX_NUMNODES so get_mempolicy() returns EINVAL.

Change get_mempolicy(), the older compat version of get_mempolicy(), and
the copy_nodes_to_user() function to use nr_node_ids instead of
MAX_NUMNODES, thus preserving the defacto method of computing the minimum
size for the nodemask array and the maxnode argument.

[1] http://man7.org/linux/man-pages/man2/get_mempolicy.2.html
[2] https://lore.kernel.org/lkml/1545405631-6808-1-git-send-email-longman@redhat.com

Link: http://lkml.kernel.org/r/20190211180245.22295-1-rcampbell@nvidia.com
Fixes: 4fb8e5b89bcbbbb ("include/linux/nodemask.h: use nr_node_ids (not MAX_NUMNODES) in __nodemask_pr_numnodes()")
Signed-off-by: Ralph Campbell <rcampbell@nvidia.com>
Suggested-by: Alexander Duyck <alexander.duyck@gmail.com>
Cc: Waiman Long <longman@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d4496d9d34f5..ee2bce59d2bf 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1314,7 +1314,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 			      nodemask_t *nodes)
 {
 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
-	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
+	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
 
 	if (copy > nbytes) {
 		if (copy > PAGE_SIZE)
@@ -1491,7 +1491,7 @@ static int kernel_get_mempolicy(int __user *policy,
 	int uninitialized_var(pval);
 	nodemask_t nodes;
 
-	if (nmask != NULL && maxnode < MAX_NUMNODES)
+	if (nmask != NULL && maxnode < nr_node_ids)
 		return -EINVAL;
 
 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
@@ -1527,7 +1527,7 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
 	unsigned long nr_bits, alloc_size;
 	DECLARE_BITMAP(bm, MAX_NUMNODES);
 
-	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
+	nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 
 	if (nmask)

From e1db95befb3e9e3476629afec6e0f5d0707b9825 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Wed, 20 Feb 2019 22:19:01 -0800
Subject: [PATCH 03/23] kasan: fix assigning tags twice

When an object is kmalloc()'ed, two hooks are called: kasan_slab_alloc()
and kasan_kmalloc().  Right now we assign a tag twice, once in each of the
hooks.  Fix it by assigning a tag only in the former hook.

Link: http://lkml.kernel.org/r/ce8c6431da735aa7ec051fd6497153df690eb021.1549921721.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgeniy Stepanov <eugenis@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kostya Serebryany <kcc@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Qian Cai <cai@lca.pw>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/kasan/common.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 73c9cbfdedf4..09b534fbba17 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -361,10 +361,15 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
  *    get different tags.
  */
 static u8 assign_tag(struct kmem_cache *cache, const void *object,
-			bool init, bool krealloc)
+			bool init, bool keep_tag)
 {
-	/* Reuse the same tag for krealloc'ed objects. */
-	if (krealloc)
+	/*
+	 * 1. When an object is kmalloc()'ed, two hooks are called:
+	 *    kasan_slab_alloc() and kasan_kmalloc(). We assign the
+	 *    tag only in the first one.
+	 * 2. We reuse the same tag for krealloc'ed objects.
+	 */
+	if (keep_tag)
 		return get_tag(object);
 
 	/*
@@ -405,12 +410,6 @@ void * __must_check kasan_init_slab_obj(struct kmem_cache *cache,
 	return (void *)object;
 }
 
-void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object,
-					gfp_t flags)
-{
-	return kasan_kmalloc(cache, object, cache->object_size, flags);
-}
-
 static inline bool shadow_invalid(u8 tag, s8 shadow_byte)
 {
 	if (IS_ENABLED(CONFIG_KASAN_GENERIC))
@@ -467,7 +466,7 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
 }
 
 static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
-				size_t size, gfp_t flags, bool krealloc)
+				size_t size, gfp_t flags, bool keep_tag)
 {
 	unsigned long redzone_start;
 	unsigned long redzone_end;
@@ -485,7 +484,7 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
 				KASAN_SHADOW_SCALE_SIZE);
 
 	if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
-		tag = assign_tag(cache, object, false, krealloc);
+		tag = assign_tag(cache, object, false, keep_tag);
 
 	/* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */
 	kasan_unpoison_shadow(set_tag(object, tag), size);
@@ -498,10 +497,16 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
 	return set_tag(object, tag);
 }
 
+void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object,
+					gfp_t flags)
+{
+	return __kasan_kmalloc(cache, object, cache->object_size, flags, false);
+}
+
 void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
 				size_t size, gfp_t flags)
 {
-	return __kasan_kmalloc(cache, object, size, flags, false);
+	return __kasan_kmalloc(cache, object, size, flags, true);
 }
 EXPORT_SYMBOL(kasan_kmalloc);
 

From 53128245b43daad600d9fe72940206570e064112 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Wed, 20 Feb 2019 22:19:11 -0800
Subject: [PATCH 04/23] kasan, kmemleak: pass tagged pointers to kmemleak

Right now we call kmemleak hooks before assigning tags to pointers in
KASAN hooks.  As a result, when an objects gets allocated, kmemleak sees a
differently tagged pointer, compared to the one it sees when the object
gets freed.  Fix it by calling KASAN hooks before kmemleak's ones.

Link: http://lkml.kernel.org/r/cd825aa4897b0fc37d3316838993881daccbe9f5.1549921721.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reported-by: Qian Cai <cai@lca.pw>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgeniy Stepanov <eugenis@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kostya Serebryany <kcc@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slab.h        | 6 ++----
 mm/slab_common.c | 2 +-
 mm/slub.c        | 3 ++-
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/mm/slab.h b/mm/slab.h
index 4190c24ef0e9..638ea1b25d39 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -437,11 +437,9 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
 
 	flags &= gfp_allowed_mask;
 	for (i = 0; i < size; i++) {
-		void *object = p[i];
-
-		kmemleak_alloc_recursive(object, s->object_size, 1,
+		p[i] = kasan_slab_alloc(s, p[i], flags);
+		kmemleak_alloc_recursive(p[i], s->object_size, 1,
 					 s->flags, flags);
-		p[i] = kasan_slab_alloc(s, object, flags);
 	}
 
 	if (memcg_kmem_enabled())
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 81732d05e74a..fe524c8d0246 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1228,8 +1228,8 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
 	flags |= __GFP_COMP;
 	page = alloc_pages(flags, order);
 	ret = page ? page_address(page) : NULL;
-	kmemleak_alloc(ret, size, 1, flags);
 	ret = kasan_kmalloc_large(ret, size, flags);
+	kmemleak_alloc(ret, size, 1, flags);
 	return ret;
 }
 EXPORT_SYMBOL(kmalloc_order);
diff --git a/mm/slub.c b/mm/slub.c
index 1e3d0ec4e200..4a3d7686902f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1374,8 +1374,9 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
  */
 static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
 {
+	ptr = kasan_kmalloc_large(ptr, size, flags);
 	kmemleak_alloc(ptr, size, 1, flags);
-	return kasan_kmalloc_large(ptr, size, flags);
+	return ptr;
 }
 
 static __always_inline void kfree_hook(void *x)

From a2f775751d964e638818487544fa8320180d106e Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Wed, 20 Feb 2019 22:19:16 -0800
Subject: [PATCH 05/23] kmemleak: account for tagged pointers when calculating
 pointer range

kmemleak keeps two global variables, min_addr and max_addr, which store
the range of valid (encountered by kmemleak) pointer values, which it
later uses to speed up pointer lookup when scanning blocks.

With tagged pointers this range will get bigger than it needs to be.  This
patch makes kmemleak untag pointers before saving them to min_addr and
max_addr and when performing a lookup.

Link: http://lkml.kernel.org/r/16e887d442986ab87fe87a755815ad92fa431a5f.1550066133.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Tested-by: Qian Cai <cai@lca.pw>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgeniy Stepanov <eugenis@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kostya Serebryany <kcc@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/kmemleak.c    | 10 +++++++---
 mm/slab.h        |  1 +
 mm/slab_common.c |  1 +
 mm/slub.c        |  1 +
 4 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index f9d9dc250428..707fa5579f66 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -574,6 +574,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
 	unsigned long flags;
 	struct kmemleak_object *object, *parent;
 	struct rb_node **link, *rb_parent;
+	unsigned long untagged_ptr;
 
 	object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
 	if (!object) {
@@ -619,8 +620,9 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
 
 	write_lock_irqsave(&kmemleak_lock, flags);
 
-	min_addr = min(min_addr, ptr);
-	max_addr = max(max_addr, ptr + size);
+	untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
+	min_addr = min(min_addr, untagged_ptr);
+	max_addr = max(max_addr, untagged_ptr + size);
 	link = &object_tree_root.rb_node;
 	rb_parent = NULL;
 	while (*link) {
@@ -1333,6 +1335,7 @@ static void scan_block(void *_start, void *_end,
 	unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
 	unsigned long *end = _end - (BYTES_PER_POINTER - 1);
 	unsigned long flags;
+	unsigned long untagged_ptr;
 
 	read_lock_irqsave(&kmemleak_lock, flags);
 	for (ptr = start; ptr < end; ptr++) {
@@ -1347,7 +1350,8 @@ static void scan_block(void *_start, void *_end,
 		pointer = *ptr;
 		kasan_enable_current();
 
-		if (pointer < min_addr || pointer >= max_addr)
+		untagged_ptr = (unsigned long)kasan_reset_tag((void *)pointer);
+		if (untagged_ptr < min_addr || untagged_ptr >= max_addr)
 			continue;
 
 		/*
diff --git a/mm/slab.h b/mm/slab.h
index 638ea1b25d39..384105318779 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -438,6 +438,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
 	flags &= gfp_allowed_mask;
 	for (i = 0; i < size; i++) {
 		p[i] = kasan_slab_alloc(s, p[i], flags);
+		/* As p[i] might get tagged, call kmemleak hook after KASAN. */
 		kmemleak_alloc_recursive(p[i], s->object_size, 1,
 					 s->flags, flags);
 	}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index fe524c8d0246..f9d89c1b5977 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1229,6 +1229,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
 	page = alloc_pages(flags, order);
 	ret = page ? page_address(page) : NULL;
 	ret = kasan_kmalloc_large(ret, size, flags);
+	/* As ret might get tagged, call kmemleak hook after KASAN. */
 	kmemleak_alloc(ret, size, 1, flags);
 	return ret;
 }
diff --git a/mm/slub.c b/mm/slub.c
index 4a3d7686902f..f5a451c49190 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1375,6 +1375,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
 static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
 {
 	ptr = kasan_kmalloc_large(ptr, size, flags);
+	/* As ptr might get tagged, call kmemleak hook after KASAN. */
 	kmemleak_alloc(ptr, size, 1, flags);
 	return ptr;
 }

From a71012242837fe5e67d8c999cfc357174ed5dba0 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Wed, 20 Feb 2019 22:19:23 -0800
Subject: [PATCH 06/23] kasan, slub: move kasan_poison_slab hook before
 page_address

With tag based KASAN page_address() looks at the page flags to see whether
the resulting pointer needs to have a tag set.  Since we don't want to set
a tag when page_address() is called on SLAB pages, we call
page_kasan_tag_reset() in kasan_poison_slab().  However in allocate_slab()
page_address() is called before kasan_poison_slab().  Fix it by changing
the order.

[andreyknvl@google.com: fix compilation error when CONFIG_SLUB_DEBUG=n]
  Link: http://lkml.kernel.org/r/ac27cc0bbaeb414ed77bcd6671a877cf3546d56e.1550066133.git.andreyknvl@google.com
Link: http://lkml.kernel.org/r/cd895d627465a3f1c712647072d17f10883be2a1.1549921721.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgeniy Stepanov <eugenis@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kostya Serebryany <kcc@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Qian Cai <cai@lca.pw>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slub.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index f5a451c49190..a7e7c7f719f9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1075,6 +1075,16 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
 	init_tracking(s, object);
 }
 
+static void setup_page_debug(struct kmem_cache *s, void *addr, int order)
+{
+	if (!(s->flags & SLAB_POISON))
+		return;
+
+	metadata_access_enable();
+	memset(addr, POISON_INUSE, PAGE_SIZE << order);
+	metadata_access_disable();
+}
+
 static inline int alloc_consistency_checks(struct kmem_cache *s,
 					struct page *page,
 					void *object, unsigned long addr)
@@ -1330,6 +1340,8 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
 #else /* !CONFIG_SLUB_DEBUG */
 static inline void setup_object_debug(struct kmem_cache *s,
 			struct page *page, void *object) {}
+static inline void setup_page_debug(struct kmem_cache *s,
+			void *addr, int order) {}
 
 static inline int alloc_debug_processing(struct kmem_cache *s,
 	struct page *page, void *object, unsigned long addr) { return 0; }
@@ -1643,12 +1655,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 	if (page_is_pfmemalloc(page))
 		SetPageSlabPfmemalloc(page);
 
+	kasan_poison_slab(page);
+
 	start = page_address(page);
 
-	if (unlikely(s->flags & SLAB_POISON))
-		memset(start, POISON_INUSE, PAGE_SIZE << order);
-
-	kasan_poison_slab(page);
+	setup_page_debug(s, start, order);
 
 	shuffle = shuffle_freelist(s, page);
 

From 18e506610238eda2b0c5a19a123d3d6ec0ab2de6 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Wed, 20 Feb 2019 22:19:28 -0800
Subject: [PATCH 07/23] kasan, slub: fix conflicts with
 CONFIG_SLAB_FREELIST_HARDENED

CONFIG_SLAB_FREELIST_HARDENED hashes freelist pointer with the address of
the object where the pointer gets stored.  With tag based KASAN we don't
account for that when building freelist, as we call set_freepointer() with
the first argument untagged.  This patch changes the code to properly
propagate tags throughout the loop.

Link: http://lkml.kernel.org/r/3df171559c52201376f246bf7ce3184fe21c1dc7.1549921721.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reported-by: Qian Cai <cai@lca.pw>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Kostya Serebryany <kcc@google.com>
Cc: Evgeniy Stepanov <eugenis@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slub.c | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index a7e7c7f719f9..80da3a40b74d 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -303,11 +303,6 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
 		__p < (__addr) + (__objects) * (__s)->size; \
 		__p += (__s)->size)
 
-#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
-	for (__p = fixup_red_left(__s, __addr), __idx = 1; \
-		__idx <= __objects; \
-		__p += (__s)->size, __idx++)
-
 /* Determine object index from a given position */
 static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr)
 {
@@ -1664,17 +1659,16 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 	shuffle = shuffle_freelist(s, page);
 
 	if (!shuffle) {
-		for_each_object_idx(p, idx, s, start, page->objects) {
-			if (likely(idx < page->objects)) {
-				next = p + s->size;
-				next = setup_object(s, page, next);
-				set_freepointer(s, p, next);
-			} else
-				set_freepointer(s, p, NULL);
-		}
 		start = fixup_red_left(s, start);
 		start = setup_object(s, page, start);
 		page->freelist = start;
+		for (idx = 0, p = start; idx < page->objects - 1; idx++) {
+			next = p + s->size;
+			next = setup_object(s, page, next);
+			set_freepointer(s, p, next);
+			p = next;
+		}
+		set_freepointer(s, p, NULL);
 	}
 
 	page->inuse = page->objects;

From d36a63a943e37081e92e4abdf4a207fd2e83a006 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Wed, 20 Feb 2019 22:19:32 -0800
Subject: [PATCH 08/23] kasan, slub: fix more conflicts with
 CONFIG_SLAB_FREELIST_HARDENED

When CONFIG_KASAN_SW_TAGS is enabled, ptr_addr might be tagged.  Normally,
this doesn't cause any issues, as both set_freepointer() and
get_freepointer() are called with a pointer with the same tag.  However,
there are some issues with CONFIG_SLUB_DEBUG code.  For example, when
__free_slub() iterates over objects in a cache, it passes untagged
pointers to check_object().  check_object() in turns calls
get_freepointer() with an untagged pointer, which causes the freepointer
to be restored incorrectly.

Add kasan_reset_tag to freelist_ptr(). Also add a detailed comment.

Link: http://lkml.kernel.org/r/bf858f26ef32eb7bd24c665755b3aee4bc58d0e4.1550103861.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reported-by: Qian Cai <cai@lca.pw>
Tested-by: Qian Cai <cai@lca.pw>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slub.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/mm/slub.c b/mm/slub.c
index 80da3a40b74d..c80e6699357c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -249,7 +249,18 @@ static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
 				 unsigned long ptr_addr)
 {
 #ifdef CONFIG_SLAB_FREELIST_HARDENED
-	return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr);
+	/*
+	 * When CONFIG_KASAN_SW_TAGS is enabled, ptr_addr might be tagged.
+	 * Normally, this doesn't cause any issues, as both set_freepointer()
+	 * and get_freepointer() are called with a pointer with the same tag.
+	 * However, there are some issues with CONFIG_SLUB_DEBUG code. For
+	 * example, when __free_slub() iterates over objects in a cache, it
+	 * passes untagged pointers to check_object(). check_object() in turns
+	 * calls get_freepointer() with an untagged pointer, which causes the
+	 * freepointer to be restored incorrectly.
+	 */
+	return (void *)((unsigned long)ptr ^ s->random ^
+			(unsigned long)kasan_reset_tag((void *)ptr_addr));
 #else
 	return ptr;
 #endif

From 338cfaad4993d3bc35a740e28981747770a65f90 Mon Sep 17 00:00:00 2001
From: Qian Cai <cai@lca.pw>
Date: Wed, 20 Feb 2019 22:19:36 -0800
Subject: [PATCH 09/23] slub: fix SLAB_CONSISTENCY_CHECKS + KASAN_SW_TAGS

Enabling SLUB_DEBUG's SLAB_CONSISTENCY_CHECKS with KASAN_SW_TAGS
triggers endless false positives during boot below due to
check_valid_pointer() checks tagged pointers which have no addresses
that is valid within slab pages:

  BUG radix_tree_node (Tainted: G    B            ): Freelist Pointer check fails
  -----------------------------------------------------------------------------

  INFO: Slab objects=69 used=69 fp=0x          (null) flags=0x7ffffffc000200
  INFO: Object @offset=15060037153926966016 fp=0x

  Redzone: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 18 6b 06 00 08 80 ff d0  .........k......
  Object : 18 6b 06 00 08 80 ff d0 00 00 00 00 00 00 00 00  .k..............
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  Redzone: bb bb bb bb bb bb bb bb                          ........
  Padding: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a  ZZZZZZZZZZZZZZZZ
  CPU: 0 PID: 0 Comm: swapper/0 Tainted: G    B             5.0.0-rc5+ #18
  Call trace:
    dump_backtrace+0x0/0x450
    show_stack+0x20/0x2c
    __dump_stack+0x20/0x28
    dump_stack+0xa0/0xfc
    print_trailer+0x1bc/0x1d0
    object_err+0x40/0x50
    alloc_debug_processing+0xf0/0x19c
    ___slab_alloc+0x554/0x704
    kmem_cache_alloc+0x2f8/0x440
    radix_tree_node_alloc+0x90/0x2fc
    idr_get_free+0x1e8/0x6d0
    idr_alloc_u32+0x11c/0x2a4
    idr_alloc+0x74/0xe0
    worker_pool_assign_id+0x5c/0xbc
    workqueue_init_early+0x49c/0xd50
    start_kernel+0x52c/0xac4
  FIX radix_tree_node: Marking all objects used

Link: http://lkml.kernel.org/r/20190209044128.3290-1-cai@lca.pw
Signed-off-by: Qian Cai <cai@lca.pw>
Reviewed-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slub.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/slub.c b/mm/slub.c
index c80e6699357c..2d2830134e60 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -513,6 +513,7 @@ static inline int check_valid_pointer(struct kmem_cache *s,
 		return 1;
 
 	base = page_address(page);
+	object = kasan_reset_tag(object);
 	object = restore_red_left(s, object);
 	if (object < base || object >= base + page->objects * s->size ||
 		(object - base) % s->size) {

From b2b469939e93458753cfbf8282ad52636495965e Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 20 Feb 2019 22:19:42 -0800
Subject: [PATCH 10/23] proc, oom: do not report alien mms when setting
 oom_score_adj

Tetsuo has reported that creating a thousands of processes sharing MM
without SIGHAND (aka alien threads) and setting
/proc/<pid>/oom_score_adj will swamp the kernel log and takes ages [1]
to finish.  This is especially worrisome that all that printing is done
under RCU lock and this can potentially trigger RCU stall or softlockup
detector.

The primary reason for the printk was to catch potential users who might
depend on the behavior prior to 44a70adec910 ("mm, oom_adj: make sure
processes sharing mm have same view of oom_score_adj") but after more
than 2 years without a single report I guess it is safe to simply remove
the printk altogether.

The next step should be moving oom_score_adj over to the mm struct and
remove all the tasks crawling as suggested by [2]

[1] http://lkml.kernel.org/r/97fce864-6f75-bca5-14bc-12c9f890e740@i-love.sakura.ne.jp
[2] http://lkml.kernel.org/r/20190117155159.GA4087@dhcp22.suse.cz

Link: http://lkml.kernel.org/r/20190212102129.26288-1-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Yong-Taek Lee <ytk.lee@samsung.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 633a63462573..f5ed9512d193 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1086,10 +1086,6 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
 
 			task_lock(p);
 			if (!p->vfork_done && process_shares_mm(p, mm)) {
-				pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n",
-						task_pid_nr(p), p->comm,
-						p->signal->oom_score_adj, oom_adj,
-						task_pid_nr(task), task->comm);
 				p->signal->oom_score_adj = oom_adj;
 				if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
 					p->signal->oom_score_adj_min = (short)oom_adj;

From 311ade0eab192f0abee2f70bce761bf0d66990c4 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Wed, 20 Feb 2019 22:19:45 -0800
Subject: [PATCH 11/23] mm/debug.c: fix __dump_page() for poisoned pages

Evaluating page_mapping() on a poisoned page ends up dereferencing junk
and making PF_POISONED_CHECK() considerably crashier than intended:

    Unable to handle kernel NULL pointer dereference at virtual address 0000000000000006
    Mem abort info:
      ESR = 0x96000005
      Exception class = DABT (current EL), IL = 32 bits
      SET = 0, FnV = 0
      EA = 0, S1PTW = 0
    Data abort info:
      ISV = 0, ISS = 0x00000005
      CM = 0, WnR = 0
    user pgtable: 4k pages, 39-bit VAs, pgdp = 00000000c2f6ac38
    [0000000000000006] pgd=0000000000000000, pud=0000000000000000
    Internal error: Oops: 96000005 [#1] PREEMPT SMP
    Modules linked in:
    CPU: 2 PID: 491 Comm: bash Not tainted 5.0.0-rc1+ #1
    Hardware name: ARM LTD ARM Juno Development Platform/ARM Juno Development Platform, BIOS EDK II Dec 17 2018
    pstate: 00000005 (nzcv daif -PAN -UAO)
    pc : page_mapping+0x18/0x118
    lr : __dump_page+0x1c/0x398
    Process bash (pid: 491, stack limit = 0x000000004ebd4ecd)
    Call trace:
     page_mapping+0x18/0x118
     __dump_page+0x1c/0x398
     dump_page+0xc/0x18
     remove_store+0xbc/0x120
     dev_attr_store+0x18/0x28
     sysfs_kf_write+0x40/0x50
     kernfs_fop_write+0x130/0x1d8
     __vfs_write+0x30/0x180
     vfs_write+0xb4/0x1a0
     ksys_write+0x60/0xd0
     __arm64_sys_write+0x18/0x20
     el0_svc_common+0x94/0xf8
     el0_svc_handler+0x68/0x70
     el0_svc+0x8/0xc
    Code: f9400401 d1000422 f240003f 9a801040 (f9400402)
    ---[ end trace cdb5eb5bf435cecb ]---

Fix that by not inspecting the mapping until we've determined that it's
likely to be valid.  Now the above condition still ends up stopping the
kernel, but in the correct manner:

    page:ffffffbf20000000 is uninitialized and poisoned
    raw: ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff
    raw: ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff
    page dumped because: VM_BUG_ON_PAGE(PagePoisoned(p))
    ------------[ cut here ]------------
    kernel BUG at ./include/linux/mm.h:1006!
    Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
    Modules linked in:
    CPU: 1 PID: 483 Comm: bash Not tainted 5.0.0-rc1+ #3
    Hardware name: ARM LTD ARM Juno Development Platform/ARM Juno Development Platform, BIOS EDK II Dec 17 2018
    pstate: 40000005 (nZcv daif -PAN -UAO)
    pc : remove_store+0xbc/0x120
    lr : remove_store+0xbc/0x120
    ...

Link: http://lkml.kernel.org/r/03b53ee9d7e76cda4b9b5e1e31eea080db033396.1550071778.git.robin.murphy@arm.com
Fixes: 1c6fb1d89e73 ("mm: print more information about mapping in __dump_page")
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/debug.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/debug.c b/mm/debug.c
index 0abb987dad9b..1611cf00a137 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -44,7 +44,7 @@ const struct trace_print_flags vmaflag_names[] = {
 
 void __dump_page(struct page *page, const char *reason)
 {
-	struct address_space *mapping = page_mapping(page);
+	struct address_space *mapping;
 	bool page_poisoned = PagePoisoned(page);
 	int mapcount;
 
@@ -58,6 +58,8 @@ void __dump_page(struct page *page, const char *reason)
 		goto hex_only;
 	}
 
+	mapping = page_mapping(page);
+
 	/*
 	 * Avoid VM_BUG_ON() in page_mapcount().
 	 * page->_mapcount space in struct page is used by sl[aou]b pages to

From 94b3334cbebea34d56a7e6321c6fe9d89b309a49 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Wed, 20 Feb 2019 22:19:49 -0800
Subject: [PATCH 12/23] mm, page_alloc: fix a division by zero error when
 boosting watermarks v2

Yury Norov reported that an arm64 KVM instance could not boot since
after v5.0-rc1 and could addressed by reverting the patches

  1c30844d2dfe272d58c ("mm: reclaim small amounts of memory when an external
  73444bc4d8f92e46a20 ("mm, page_alloc: do not wake kswapd with zone lock held")

The problem is that a division by zero error is possible if boosting
occurs very early in boot if the system has very little memory.  This
patch avoids the division by zero error.

Link: http://lkml.kernel.org/r/20190213143012.GT9565@techsingularity.net
Fixes: 1c30844d2dfe ("mm: reclaim small amounts of memory when an external fragmentation event occurs")
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Reported-by: Yury Norov <yury.norov@gmail.com>
Tested-by: Yury Norov <yury.norov@gmail.com>
Tested-by: Will Deacon <will.deacon@arm.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7f79b78bc829..0b9f577b1a2a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2170,6 +2170,18 @@ static inline void boost_watermark(struct zone *zone)
 
 	max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
 			watermark_boost_factor, 10000);
+
+	/*
+	 * high watermark may be uninitialised if fragmentation occurs
+	 * very early in boot so do not boost. We do not fall
+	 * through and boost by pageblock_nr_pages as failing
+	 * allocations that early means that reclaim is not going
+	 * to help and it may even be impossible to reclaim the
+	 * boosted watermark resulting in a hang.
+	 */
+	if (!max_boost)
+		return;
+
 	max_boost = max(pageblock_nr_pages, max_boost);
 
 	zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,

From 6ea183d60c469560e7b08a83c9804299e84ec9eb Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 20 Feb 2019 22:19:54 -0800
Subject: [PATCH 13/23] mm: handle lru_add_drain_all for UP properly

Since for_each_cpu(cpu, mask) added by commit 2d3854a37e8b767a
("cpumask: introduce new API, without changing anything") did not
evaluate the mask argument if NR_CPUS == 1 due to CONFIG_SMP=n,
lru_add_drain_all() is hitting WARN_ON() at __flush_work() added by
commit 4d43d395fed12463 ("workqueue: Try to catch flush_work() without
INIT_WORK().") by unconditionally calling flush_work() [1].

Workaround this issue by using CONFIG_SMP=n specific lru_add_drain_all
implementation.  There is no real need to defer the implementation to
the workqueue as the draining is going to happen on the local cpu.  So
alias lru_add_drain_all to lru_add_drain which does all the necessary
work.

[akpm@linux-foundation.org: fix various build warnings]
[1] https://lkml.kernel.org/r/18a30387-6aa5-6123-e67c-57579ecc3f38@roeck-us.net
Link: http://lkml.kernel.org/r/20190213124334.GH4525@dhcp22.suse.cz
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Guenter Roeck <linux@roeck-us.net>
Debugged-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swap.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/mm/swap.c b/mm/swap.c
index 4929bc1be60e..4d7d37eb3c40 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -320,11 +320,6 @@ static inline void activate_page_drain(int cpu)
 {
 }
 
-static bool need_activate_page_drain(int cpu)
-{
-	return false;
-}
-
 void activate_page(struct page *page)
 {
 	struct zone *zone = page_zone(page);
@@ -653,13 +648,15 @@ void lru_add_drain(void)
 	put_cpu();
 }
 
+#ifdef CONFIG_SMP
+
+static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
+
 static void lru_add_drain_per_cpu(struct work_struct *dummy)
 {
 	lru_add_drain();
 }
 
-static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
-
 /*
  * Doesn't need any cpu hotplug locking because we do rely on per-cpu
  * kworkers being shut down before our page_alloc_cpu_dead callback is
@@ -702,6 +699,12 @@ void lru_add_drain_all(void)
 
 	mutex_unlock(&lock);
 }
+#else
+void lru_add_drain_all(void)
+{
+	lru_add_drain();
+}
+#endif
 
 /**
  * release_pages - batched put_page()

From 4e37504d1c49eec6434d0cc97278d2b51c9e8763 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Wed, 20 Feb 2019 22:19:59 -0800
Subject: [PATCH 14/23] psi: avoid divide-by-zero crash inside virtual machines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We've been seeing hard-to-trigger psi crashes when running inside VM
instances:

    divide error: 0000 [#1] SMP PTI
    Modules linked in: [...]
    CPU: 0 PID: 212 Comm: kworker/0:2 Not tainted 4.16.18-119_fbk9_3817_gfe944c98d695 #119
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015
    Workqueue: events psi_clock
    RIP: 0010:psi_update_stats+0x270/0x490
    RSP: 0018:ffffc90001117e10 EFLAGS: 00010246
    RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff8800a35a13f8
    RDX: 0000000000000000 RSI: ffff8800a35a1340 RDI: 0000000000000000
    RBP: 0000000000000658 R08: ffff8800a35a1470 R09: 0000000000000000
    R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
    R13: 0000000000000000 R14: 0000000000000000 R15: 00000000000f8502
    FS:  0000000000000000(0000) GS:ffff88023fc00000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: 00007fbe370fa000 CR3: 00000000b1e3a000 CR4: 00000000000006f0
    DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
    DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
    Call Trace:
     psi_clock+0x12/0x50
     process_one_work+0x1e0/0x390
     worker_thread+0x2b/0x3c0
     ? rescuer_thread+0x330/0x330
     kthread+0x113/0x130
     ? kthread_create_worker_on_cpu+0x40/0x40
     ? SyS_exit_group+0x10/0x10
     ret_from_fork+0x35/0x40
    Code: 48 0f 47 c7 48 01 c2 45 85 e4 48 89 16 0f 85 e6 00 00 00 4c 8b 49 10 4c 8b 51 08 49 69 d9 f2 07 00 00 48 6b c0 64 4c 8b 29 31 d2 <48> f7 f7 49 69 d5 8d 06 00 00 48 89 c5 4c 69 f0 00 98 0b 00 48

The Code-line points to `period` being 0 inside update_stats(), and we
divide by that when calculating that period's pressure percentage.

The elapsed period should never be 0.  The reason this can happen is due
to an off-by-one in the idle time / missing period calculation combined
with a coarse sched_clock() in the virtual machine.

The target time for aggregation is advanced into the future on a fixed
grid to prevent clock drift.  So when an aggregation runs after some idle
period, we can not just set it to "now + psi_period", but have to
calculate the downtime and advance the target time relative to itself.

However, if the aggregator was disabled exactly one psi_period (ns), we
drop one idle period in the calculation due to a > when we should do >=.
In that case, next_update will be advanced from 'now - psi_period' to
'now' when it should be moved to 'now + psi_period'.  The run finishes
with last_update == next_update == sched_clock().

With hardware clocks, this exact nanosecond match isn't likely in the
first place; but if it does happen, the clock will still have moved on and
the period non-zero by the time the worker runs.  A pointlessly short
period, but besides the extra work, no harm no foul.  However, a slow
sched_clock() like we have on VMs might not have advanced either by the
time the worker runs again.  And when we calculate the elapsed period, the
result, our pressure divisor, will be 0.  Ouch.

Fix this by correctly handling the situation when the elapsed time between
aggregation runs is precisely two periods, and advance the expiration
timestamp correctly to period into the future.

Link: http://lkml.kernel.org/r/20190214193157.15788-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Łukasz Siudut <lsiudut@fb.com
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched/psi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index c3484785b179..0e97ca9306ef 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -322,7 +322,7 @@ static bool update_stats(struct psi_group *group)
 	expires = group->next_update;
 	if (now < expires)
 		goto out;
-	if (now - expires > psi_period)
+	if (now - expires >= psi_period)
 		missed_periods = div_u64(now - expires, psi_period);
 
 	/*

From 1062af920c07f5b54cf5060fde3339da6df0cf6b Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 21 Feb 2019 08:48:09 -0800
Subject: [PATCH 15/23] tmpfs: fix link accounting when a tmpfile is linked in

tmpfs has a peculiarity of accounting hard links as if they were
separate inodes: so that when the number of inodes is limited, as it is
by default, a user cannot soak up an unlimited amount of unreclaimable
dcache memory just by repeatedly linking a file.

But when v3.11 added O_TMPFILE, and the ability to use linkat() on the
fd, we missed accommodating this new case in tmpfs: "df -i" shows that
an extra "inode" remains accounted after the file is unlinked and the fd
closed and the actual inode evicted.  If a user repeatedly links
tmpfiles into a tmpfs, the limit will be hit (ENOSPC) even after they
are deleted.

Just skip the extra reservation from shmem_link() in this case: there's
a sense in which this first link of a tmpfile is then cheaper than a
hard link of another file, but the accounting works out, and there's
still good limiting, so no need to do anything more complicated.

Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1902182134370.7035@eggly.anvils
Fixes: f4e0c30c191 ("allow the temp files created by open() to be linked to")
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Reported-by: Matej Kupljen <matej.kupljen@gmail.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/shmem.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 6ece1e2fe76e..0905215fb016 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2854,10 +2854,14 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
 	 * No ordinary (disk based) filesystem counts links as inodes;
 	 * but each new link needs a new dentry, pinning lowmem, and
 	 * tmpfs dentries cannot be pruned until they are unlinked.
+	 * But if an O_TMPFILE file is linked into the tmpfs, the
+	 * first link must skip that, to get the accounting right.
 	 */
-	ret = shmem_reserve_inode(inode->i_sb);
-	if (ret)
-		goto out;
+	if (inode->i_nlink) {
+		ret = shmem_reserve_inode(inode->i_sb);
+		if (ret)
+			goto out;
+	}
 
 	dir->i_size += BOGO_DIRENT_SIZE;
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);

From 3f41b609382388f95c0a05b69b8db0d706adafb4 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Wed, 20 Feb 2019 22:20:15 -0800
Subject: [PATCH 16/23] kasan: fix random seed generation for tag-based mode

There are two issues with assigning random percpu seeds right now:

1. We use for_each_possible_cpu() to iterate over cpus, but cpumask is
   not set up yet at the moment of kasan_init(), and thus we only set
   the seed for cpu #0.

2. A call to get_random_u32() always returns the same number and produces
   a message in dmesg, since the random subsystem is not yet initialized.

Fix 1 by calling kasan_init_tags() after cpumask is set up.

Fix 2 by using get_cycles() instead of get_random_u32(). This gives us
lower quality random numbers, but it's good enough, as KASAN is meant to
be used as a debugging tool and not a mitigation.

Link: http://lkml.kernel.org/r/1f815cc914b61f3516ed4cc9bfd9eeca9bd5d9de.1550677973.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/kernel/setup.c  | 3 +++
 arch/arm64/mm/kasan_init.c | 2 --
 mm/kasan/tags.c            | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index d09ec76f08cf..009849328289 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -339,6 +339,9 @@ void __init setup_arch(char **cmdline_p)
 	smp_init_cpus();
 	smp_build_mpidr_hash();
 
+	/* Init percpu seeds for random tags after cpus are set up. */
+	kasan_init_tags();
+
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
 	/*
 	 * Make sure init_thread_info.ttbr0 always generates translation
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index 4b55b15707a3..f37a86d2a69d 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -252,8 +252,6 @@ void __init kasan_init(void)
 	memset(kasan_early_shadow_page, KASAN_SHADOW_INIT, PAGE_SIZE);
 	cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
 
-	kasan_init_tags();
-
 	/* At this point kasan is fully initialized. Enable error messages */
 	init_task.kasan_depth = 0;
 	pr_info("KernelAddressSanitizer initialized\n");
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
index 0777649e07c4..63fca3172659 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/tags.c
@@ -46,7 +46,7 @@ void kasan_init_tags(void)
 	int cpu;
 
 	for_each_possible_cpu(cpu)
-		per_cpu(prng_state, cpu) = get_random_u32();
+		per_cpu(prng_state, cpu) = (u32)get_cycles();
 }
 
 /*

From dc15a8a2543cb9ebe67998eefe2880ce1a20d42e Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Wed, 20 Feb 2019 22:20:20 -0800
Subject: [PATCH 17/23] kasan: prevent tracing of tags.c

Similarly to commit 0d0c8de8788b ("kasan: mark file common so ftrace
doesn't trace it") add the -pg flag to mm/kasan/tags.c to prevent
conflicts with tracing.

Link: http://lkml.kernel.org/r/9c4c3ce5ccfb894c7fe66d91de7c1da2787b4da4.1550602886.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reported-by: Qian Cai <cai@lca.pw>
Tested-by: Qian Cai <cai@lca.pw>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Kostya Serebryany <kcc@google.com>
Cc: Evgeniy Stepanov <eugenis@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/kasan/Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index e2bb06c1b45e..5d1065efbd47 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -7,6 +7,8 @@ KCOV_INSTRUMENT := n
 
 CFLAGS_REMOVE_common.o = -pg
 CFLAGS_REMOVE_generic.o = -pg
+CFLAGS_REMOVE_tags.o = -pg
+
 # Function splitter causes unnecessary splits in __asan_load1/__asan_store1
 # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
 

From 219667c23c68eb3dbc0d5662b9246f28477fe529 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Wed, 20 Feb 2019 22:20:25 -0800
Subject: [PATCH 18/23] kasan, slab: fix conflicts with
 CONFIG_HARDENED_USERCOPY

Similarly to commit 96fedce27e13 ("kasan: make tag based mode work with
CONFIG_HARDENED_USERCOPY"), we need to reset pointer tags in
__check_heap_object() in mm/slab.c before doing any pointer math.

Link: http://lkml.kernel.org/r/9a5c0f958db10e69df5ff9f2b997866b56b7effc.1550602886.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Tested-by: Qian Cai <cai@lca.pw>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgeniy Stepanov <eugenis@google.com>
Cc: Kostya Serebryany <kcc@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slab.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/slab.c b/mm/slab.c
index 78eb8c5bf4e4..c84458281a88 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4408,6 +4408,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
 	unsigned int objnr;
 	unsigned long offset;
 
+	ptr = kasan_reset_tag(ptr);
+
 	/* Find and validate object. */
 	cachep = page->slab_cache;
 	objnr = obj_to_index(cachep, page, (void *)ptr);

From 51dedad06b5f6c3eea7ec1069631b1ef7796912a Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Wed, 20 Feb 2019 22:20:28 -0800
Subject: [PATCH 19/23] kasan, slab: make freelist stored without tags

Similarly to "kasan, slub: move kasan_poison_slab hook before
page_address", move kasan_poison_slab() before alloc_slabmgmt(), which
calls page_address(), to make page_address() return value to be
non-tagged.  This, combined with calling kasan_reset_tag() for off-slab
slab management object, leads to freelist being stored non-tagged.

Link: http://lkml.kernel.org/r/dfb53b44a4d00de3879a05a9f04c1f55e584f7a1.1550602886.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Tested-by: Qian Cai <cai@lca.pw>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgeniy Stepanov <eugenis@google.com>
Cc: Kostya Serebryany <kcc@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slab.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index c84458281a88..4ad95fcb1686 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2359,7 +2359,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
 	void *freelist;
 	void *addr = page_address(page);
 
-	page->s_mem = kasan_reset_tag(addr) + colour_off;
+	page->s_mem = addr + colour_off;
 	page->active = 0;
 
 	if (OBJFREELIST_SLAB(cachep))
@@ -2368,6 +2368,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
 		/* Slab management obj is off-slab. */
 		freelist = kmem_cache_alloc_node(cachep->freelist_cache,
 					      local_flags, nodeid);
+		freelist = kasan_reset_tag(freelist);
 		if (!freelist)
 			return NULL;
 	} else {
@@ -2681,6 +2682,13 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
 
 	offset *= cachep->colour_off;
 
+	/*
+	 * Call kasan_poison_slab() before calling alloc_slabmgmt(), so
+	 * page_address() in the latter returns a non-tagged pointer,
+	 * as it should be for slab pages.
+	 */
+	kasan_poison_slab(page);
+
 	/* Get slab management. */
 	freelist = alloc_slabmgmt(cachep, page, offset,
 			local_flags & ~GFP_CONSTRAINT_MASK, page_node);
@@ -2689,7 +2697,6 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
 
 	slab_map_pages(cachep, page, freelist);
 
-	kasan_poison_slab(page);
 	cache_init_objs(cachep, page);
 
 	if (gfpflags_allow_blocking(local_flags))

From 557ea25383a231fe3ffc72881ada35c24b960dbc Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Wed, 20 Feb 2019 22:20:33 -0800
Subject: [PATCH 20/23] kasan, slab: remove redundant kasan_slab_alloc hooks

kasan_slab_alloc() calls in kmem_cache_alloc() and kmem_cache_alloc_node()
are redundant as they are already called via slab_alloc/slab_alloc_node()->
slab_post_alloc_hook()->kasan_slab_alloc().  Remove them.

Link: http://lkml.kernel.org/r/4ca1655cdcfc4379c49c50f7bf80f81c4ad01485.1550602886.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Tested-by: Qian Cai <cai@lca.pw>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgeniy Stepanov <eugenis@google.com>
Cc: Kostya Serebryany <kcc@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slab.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index 4ad95fcb1686..91c1863df93d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3547,7 +3547,6 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 {
 	void *ret = slab_alloc(cachep, flags, _RET_IP_);
 
-	ret = kasan_slab_alloc(cachep, ret, flags);
 	trace_kmem_cache_alloc(_RET_IP_, ret,
 			       cachep->object_size, cachep->size, flags);
 
@@ -3637,7 +3636,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 {
 	void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
 
-	ret = kasan_slab_alloc(cachep, ret, flags);
 	trace_kmem_cache_alloc_node(_RET_IP_, ret,
 				    cachep->object_size, cachep->size,
 				    flags, nodeid);

From 6373dca16c911b2828ef8d836d7f6f1800e1bbbc Mon Sep 17 00:00:00 2001
From: Qian Cai <cai@lca.pw>
Date: Wed, 20 Feb 2019 22:20:37 -0800
Subject: [PATCH 21/23] slub: fix a crash with SLUB_DEBUG + KASAN_SW_TAGS

In process_slab(), "p = get_freepointer()" could return a tagged
pointer, but "addr = page_address()" always return a native pointer.  As
the result, slab_index() is messed up here,

    return (p - addr) / s->size;

All other callers of slab_index() have the same situation where "addr"
is from page_address(), so just need to untag "p".

    # cat /sys/kernel/slab/hugetlbfs_inode_cache/alloc_calls

    Unable to handle kernel paging request at virtual address 2bff808aa4856d48
    Mem abort info:
      ESR = 0x96000007
      Exception class = DABT (current EL), IL = 32 bits
      SET = 0, FnV = 0
      EA = 0, S1PTW = 0
    Data abort info:
      ISV = 0, ISS = 0x00000007
      CM = 0, WnR = 0
    swapper pgtable: 64k pages, 48-bit VAs, pgdp = 0000000002498338
    [2bff808aa4856d48] pgd=00000097fcfd0003, pud=00000097fcfd0003, pmd=00000097fca30003, pte=00e8008b24850712
    Internal error: Oops: 96000007 [#1] SMP
    CPU: 3 PID: 79210 Comm: read_all Tainted: G             L    5.0.0-rc7+ #84
    Hardware name: HPE Apollo 70             /C01_APACHE_MB         , BIOS L50_5.13_1.0.6 07/10/2018
    pstate: 00400089 (nzcv daIf +PAN -UAO)
    pc : get_map+0x78/0xec
    lr : get_map+0xa0/0xec
    sp : aeff808989e3f8e0
    x29: aeff808989e3f940 x28: ffff800826200000
    x27: ffff100012d47000 x26: 9700000000002500
    x25: 0000000000000001 x24: 52ff8008200131f8
    x23: 52ff8008200130a0 x22: 52ff800820013098
    x21: ffff800826200000 x20: ffff100013172ba0
    x19: 2bff808a8971bc00 x18: ffff1000148f5538
    x17: 000000000000001b x16: 00000000000000ff
    x15: ffff1000148f5000 x14: 00000000000000d2
    x13: 0000000000000001 x12: 0000000000000000
    x11: 0000000020000002 x10: 2bff808aa4856d48
    x9 : 0000020000000000 x8 : 68ff80082620ebb0
    x7 : 0000000000000000 x6 : ffff1000105da1dc
    x5 : 0000000000000000 x4 : 0000000000000000
    x3 : 0000000000000010 x2 : 2bff808a8971bc00
    x1 : ffff7fe002098800 x0 : ffff80082620ceb0
    Process read_all (pid: 79210, stack limit = 0x00000000f65b9361)
    Call trace:
     get_map+0x78/0xec
     process_slab+0x7c/0x47c
     list_locations+0xb0/0x3c8
     alloc_calls_show+0x34/0x40
     slab_attr_show+0x34/0x48
     sysfs_kf_seq_show+0x2e4/0x570
     kernfs_seq_show+0x12c/0x1a0
     seq_read+0x48c/0xf84
     kernfs_fop_read+0xd4/0x448
     __vfs_read+0x94/0x5d4
     vfs_read+0xcc/0x194
     ksys_read+0x6c/0xe8
     __arm64_sys_read+0x68/0xb0
     el0_svc_handler+0x230/0x3bc
     el0_svc+0x8/0xc
    Code: d3467d2a 9ac92329 8b0a0e6a f9800151 (c85f7d4b)
    ---[ end trace a383a9a44ff13176 ]---
    Kernel panic - not syncing: Fatal exception
    SMP: stopping secondary CPUs
    SMP: failed to stop secondary CPUs 1-7,32,40,127
    Kernel Offset: disabled
    CPU features: 0x002,20000c18
    Memory Limit: none
    ---[ end Kernel panic - not syncing: Fatal exception ]---

Link: http://lkml.kernel.org/r/20190220020251.82039-1-cai@lca.pw
Signed-off-by: Qian Cai <cai@lca.pw>
Reviewed-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/slub.c b/mm/slub.c
index 2d2830134e60..dc777761b6b7 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -317,7 +317,7 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
 /* Determine object index from a given position */
 static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr)
 {
-	return (p - addr) / s->size;
+	return (kasan_reset_tag(p) - addr) / s->size;
 }
 
 static inline unsigned int order_objects(unsigned int order, unsigned int size)

From 6c8fcc096be9d02f478c508052a41a4430506ab3 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Wed, 20 Feb 2019 22:20:42 -0800
Subject: [PATCH 22/23] mm: don't let userspace spam allocations warnings

memdump_user usually gets fed unchecked userspace input.  Blasting a
full backtrace into dmesg every time is a bit excessive - I'm not sure
on the kernel rule in general, but at least in drm we're trying not to
let unpriviledge userspace spam the logs freely.  Definitely not entire
warning backtraces.

It also means more filtering for our CI, because our testsuite exercises
these corner cases and so hits these a lot.

Link: http://lkml.kernel.org/r/20190220204058.11676-1-daniel.vetter@ffwll.ch
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Stancek <jstancek@redhat.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Bartosz Golaszewski <brgl@bgdev.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/util.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/util.c b/mm/util.c
index 1ea055138043..379319b1bcfd 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -150,7 +150,7 @@ void *memdup_user(const void __user *src, size_t len)
 {
 	void *p;
 
-	p = kmalloc_track_caller(len, GFP_USER);
+	p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
 	if (!p)
 		return ERR_PTR(-ENOMEM);
 

From 891cb2a72d821f930a39d5900cb7a3aa752c1d5b Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 20 Feb 2019 22:20:46 -0800
Subject: [PATCH 23/23] mm, memory_hotplug: fix off-by-one in
 is_pageblock_removable

Rong Chen has reported the following boot crash:

    PGD 0 P4D 0
    Oops: 0000 [#1] PREEMPT SMP PTI
    CPU: 1 PID: 239 Comm: udevd Not tainted 5.0.0-rc4-00149-gefad4e4 #1
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
    RIP: 0010:page_mapping+0x12/0x80
    Code: 5d c3 48 89 df e8 0e ad 02 00 85 c0 75 da 89 e8 5b 5d c3 0f 1f 44 00 00 53 48 89 fb 48 8b 43 08 48 8d 50 ff a8 01 48 0f 45 da <48> 8b 53 08 48 8d 42 ff 83 e2 01 48 0f 44 c3 48 83 38 ff 74 2f 48
    RSP: 0018:ffff88801fa87cd8 EFLAGS: 00010202
    RAX: ffffffffffffffff RBX: fffffffffffffffe RCX: 000000000000000a
    RDX: fffffffffffffffe RSI: ffffffff820b9a20 RDI: ffff88801e5c0000
    RBP: 6db6db6db6db6db7 R08: ffff88801e8bb000 R09: 0000000001b64d13
    R10: ffff88801fa87cf8 R11: 0000000000000001 R12: ffff88801e640000
    R13: ffffffff820b9a20 R14: ffff88801f145258 R15: 0000000000000001
    FS:  00007fb2079817c0(0000) GS:ffff88801dd00000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: 0000000000000006 CR3: 000000001fa82000 CR4: 00000000000006a0
    Call Trace:
     __dump_page+0x14/0x2c0
     is_mem_section_removable+0x24c/0x2c0
     removable_show+0x87/0xa0
     dev_attr_show+0x25/0x60
     sysfs_kf_seq_show+0xba/0x110
     seq_read+0x196/0x3f0
     __vfs_read+0x34/0x180
     vfs_read+0xa0/0x150
     ksys_read+0x44/0xb0
     do_syscall_64+0x5e/0x4a0
     entry_SYSCALL_64_after_hwframe+0x49/0xbe

and bisected it down to commit efad4e475c31 ("mm, memory_hotplug:
is_mem_section_removable do not pass the end of a zone").

The reason for the crash is that the mapping is garbage for poisoned
(uninitialized) page.  This shouldn't happen as all pages in the zone's
boundary should be initialized.

Later debugging revealed that the actual problem is an off-by-one when
evaluating the end_page.  'start_pfn + nr_pages' resp 'zone_end_pfn'
refers to a pfn after the range and as such it might belong to a
differen memory section.

This along with CONFIG_SPARSEMEM then makes the loop condition
completely bogus because a pointer arithmetic doesn't work for pages
from two different sections in that memory model.

Fix the issue by reworking is_pageblock_removable to be pfn based and
only use struct page where necessary.  This makes the code slightly
easier to follow and we will remove the problematic pointer arithmetic
completely.

Link: http://lkml.kernel.org/r/20190218181544.14616-1-mhocko@kernel.org
Fixes: efad4e475c31 ("mm, memory_hotplug: is_mem_section_removable do not pass the end of a zone")
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: <rong.a.chen@intel.com>
Tested-by: <rong.a.chen@intel.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory_hotplug.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 124e794867c5..1ad28323fb9f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1188,11 +1188,13 @@ static inline int pageblock_free(struct page *page)
 	return PageBuddy(page) && page_order(page) >= pageblock_order;
 }
 
-/* Return the start of the next active pageblock after a given page */
-static struct page *next_active_pageblock(struct page *page)
+/* Return the pfn of the start of the next active pageblock after a given pfn */
+static unsigned long next_active_pageblock(unsigned long pfn)
 {
+	struct page *page = pfn_to_page(pfn);
+
 	/* Ensure the starting page is pageblock-aligned */
-	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
+	BUG_ON(pfn & (pageblock_nr_pages - 1));
 
 	/* If the entire pageblock is free, move to the end of free page */
 	if (pageblock_free(page)) {
@@ -1200,16 +1202,16 @@ static struct page *next_active_pageblock(struct page *page)
 		/* be careful. we don't have locks, page_order can be changed.*/
 		order = page_order(page);
 		if ((order < MAX_ORDER) && (order >= pageblock_order))
-			return page + (1 << order);
+			return pfn + (1 << order);
 	}
 
-	return page + pageblock_nr_pages;
+	return pfn + pageblock_nr_pages;
 }
 
-static bool is_pageblock_removable_nolock(struct page *page)
+static bool is_pageblock_removable_nolock(unsigned long pfn)
 {
+	struct page *page = pfn_to_page(pfn);
 	struct zone *zone;
-	unsigned long pfn;
 
 	/*
 	 * We have to be careful here because we are iterating over memory
@@ -1232,13 +1234,14 @@ static bool is_pageblock_removable_nolock(struct page *page)
 /* Checks if this range of memory is likely to be hot-removable. */
 bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
 {
-	struct page *page = pfn_to_page(start_pfn);
-	unsigned long end_pfn = min(start_pfn + nr_pages, zone_end_pfn(page_zone(page)));
-	struct page *end_page = pfn_to_page(end_pfn);
+	unsigned long end_pfn, pfn;
+
+	end_pfn = min(start_pfn + nr_pages,
+			zone_end_pfn(page_zone(pfn_to_page(start_pfn))));
 
 	/* Check the starting page of each pageblock within the range */
-	for (; page < end_page; page = next_active_pageblock(page)) {
-		if (!is_pageblock_removable_nolock(page))
+	for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) {
+		if (!is_pageblock_removable_nolock(pfn))
 			return false;
 		cond_resched();
 	}