From 8ee912dab95f1483156b6e994004bfcc3158d798 Mon Sep 17 00:00:00 2001
From: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Date: Tue, 3 Oct 2017 16:14:15 -0700
Subject: [PATCH 01/51] alpha: fix build failures

The build of alpha allmodconfig is giving error:

  arch/alpha/include/asm/mmu_context.h: In function 'ev5_switch_mm':
  arch/alpha/include/asm/mmu_context.h:160:2: error:
	implicit declaration of function 'task_thread_info';
	did you mean 'init_thread_info'? [-Werror=implicit-function-declaration]

The file 'mmu_context.h' needed an extra header file.

Link: http://lkml.kernel.org/r/1505668810-7497-1-git-send-email-sudipm.mukherjee@gmail.com
Signed-off-by: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/asm/mmu_context.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/alpha/include/asm/mmu_context.h b/arch/alpha/include/asm/mmu_context.h
index 384bd47b5187..45c020a0fe76 100644
--- a/arch/alpha/include/asm/mmu_context.h
+++ b/arch/alpha/include/asm/mmu_context.h
@@ -8,6 +8,7 @@
  */
 
 #include <linux/mm_types.h>
+#include <linux/sched.h>
 
 #include <asm/machvec.h>
 #include <asm/compiler.h>

From 630cc2b30a42c70628368a412beb4a5e5dd71abe Mon Sep 17 00:00:00 2001
From: Jean Delvare <jdelvare@suse.de>
Date: Tue, 3 Oct 2017 16:14:18 -0700
Subject: [PATCH 02/51] kernel/params.c: align add_sysfs_param documentation
 with code

This parameter is named kp, so the documentation should use that.

Fixes: 9b473de87209 ("param: Fix duplicate module prefixes")
Link: http://lkml.kernel.org/r/20170919142656.64aea59e@endymion
Signed-off-by: Jean Delvare <jdelvare@suse.de>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/params.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/params.c b/kernel/params.c
index 60b2d8101355..1cd8f1a895a8 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -600,7 +600,7 @@ EXPORT_SYMBOL(kernel_param_unlock);
 /*
  * add_sysfs_param - add a parameter to sysfs
  * @mk: struct module_kobject
- * @kparam: the actual parameter definition to add to sysfs
+ * @kp: the actual parameter definition to add to sysfs
  * @name: name of parameter
  *
  * Create a kobject if for a (per-module) parameter if mp NULL, and

From e00e5a26e3d37b47ce8ca59611db1e6135790ef8 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 3 Oct 2017 16:14:21 -0700
Subject: [PATCH 03/51] scripts/spelling.txt: add more spelling mistakes to
 spelling.txt

Here are some of the more spelling mistakes and typos that I've found
while fixing up spelling mistakes in kernel error message text over the
past eight weeks.

[akpm@linux-foundation.org: s/|/||/, per Joe]
Link: http://lkml.kernel.org/r/20170919090818.5989-1-colin.king@canonical.com
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Stephen Boyd <sboyd@codeaurora.org>
Cc: Joe Perches <joe@perches.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/spelling.txt | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index 400ef35169c5..aa0cc49ad1ad 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -53,6 +53,7 @@ acumulator||accumulator
 adapater||adapter
 addional||additional
 additionaly||additionally
+additonal||additional
 addres||address
 adddress||address
 addreses||addresses
@@ -67,6 +68,8 @@ adviced||advised
 afecting||affecting
 againt||against
 agaist||against
+aggreataon||aggregation
+aggreation||aggregation
 albumns||albums
 alegorical||allegorical
 algined||aligned
@@ -80,6 +83,8 @@ aligment||alignment
 alignement||alignment
 allign||align
 alligned||aligned
+alllocate||allocate
+alloated||allocated
 allocatote||allocate
 allocatrd||allocated
 allocte||allocate
@@ -171,6 +176,7 @@ availale||available
 availavility||availability
 availble||available
 availiable||available
+availible||available
 avalable||available
 avaliable||available
 aysnc||async
@@ -203,6 +209,7 @@ broadcat||broadcast
 cacluated||calculated
 caculation||calculation
 calender||calendar
+calescing||coalescing
 calle||called
 callibration||calibration
 calucate||calculate
@@ -210,6 +217,7 @@ calulate||calculate
 cancelation||cancellation
 cancle||cancel
 capabilites||capabilities
+capabilty||capability
 capabitilies||capabilities
 capatibilities||capabilities
 capapbilities||capabilities
@@ -302,6 +310,7 @@ containts||contains
 contaisn||contains
 contant||contact
 contence||contents
+continious||continuous
 continous||continuous
 continously||continuously
 continueing||continuing
@@ -393,6 +402,7 @@ differrence||difference
 diffrent||different
 diffrentiate||differentiate
 difinition||definition
+dimesions||dimensions
 diplay||display
 direectly||directly
 disassocation||disassociation
@@ -449,6 +459,7 @@ equiped||equipped
 equivelant||equivalent
 equivilant||equivalent
 eror||error
+errorr||error
 estbalishment||establishment
 etsablishment||establishment
 etsbalishment||establishment
@@ -481,6 +492,7 @@ failied||failed
 faillure||failure
 failue||failure
 failuer||failure
+failng||failing
 faireness||fairness
 falied||failed
 faliure||failure
@@ -493,6 +505,7 @@ fetaure||feature
 fetaures||features
 fileystem||filesystem
 fimware||firmware
+firware||firmware
 finanize||finalize
 findn||find
 finilizes||finalizes
@@ -502,6 +515,7 @@ folloing||following
 followign||following
 followings||following
 follwing||following
+fonud||found
 forseeable||foreseeable
 forse||force
 fortan||fortran
@@ -532,6 +546,7 @@ grabing||grabbing
 grahical||graphical
 grahpical||graphical
 grapic||graphic
+grranted||granted
 guage||gauge
 guarenteed||guaranteed
 guarentee||guarantee
@@ -543,6 +558,7 @@ happend||happened
 harware||hardware
 heirarchically||hierarchically
 helpfull||helpful
+hybernate||hibernate
 hierachy||hierarchy
 hierarchie||hierarchy
 howver||however
@@ -565,16 +581,19 @@ implemenation||implementation
 implementaiton||implementation
 implementated||implemented
 implemention||implementation
+implementd||implemented
 implemetation||implementation
 implemntation||implementation
 implentation||implementation
 implmentation||implementation
 implmenting||implementing
+incative||inactive
 incomming||incoming
 incompatabilities||incompatibilities
 incompatable||incompatible
 inconsistant||inconsistent
 increas||increase
+incremeted||incremented
 incrment||increment
 indendation||indentation
 indended||intended
@@ -619,6 +638,7 @@ interger||integer
 intermittant||intermittent
 internel||internal
 interoprability||interoperability
+interuupt||interrupt
 interrface||interface
 interrrupt||interrupt
 interrup||interrupt
@@ -638,8 +658,10 @@ intrrupt||interrupt
 intterrupt||interrupt
 intuative||intuitive
 invaid||invalid
+invald||invalid
 invalde||invalid
 invalide||invalid
+invalidiate||invalidate
 invalud||invalid
 invididual||individual
 invokation||invocation
@@ -713,6 +735,7 @@ misformed||malformed
 mispelled||misspelled
 mispelt||misspelt
 mising||missing
+mismactch||mismatch
 missmanaged||mismanaged
 missmatch||mismatch
 miximum||maximum
@@ -731,6 +754,7 @@ multidimensionnal||multidimensional
 multple||multiple
 mumber||number
 muticast||multicast
+mutilcast||multicast
 mutiple||multiple
 mutli||multi
 nams||names
@@ -834,6 +858,7 @@ posible||possible
 positon||position
 possibilites||possibilities
 powerfull||powerful
+preample||preamble
 preapre||prepare
 preceeded||preceded
 preceeding||preceding
@@ -1059,6 +1084,7 @@ sturcture||structure
 subdirectoires||subdirectories
 suble||subtle
 substract||subtract
+submition||submission
 succesfully||successfully
 succesful||successful
 successed||succeeded
@@ -1078,6 +1104,7 @@ suppoted||supported
 suppported||supported
 suppport||support
 supress||suppress
+surpressed||suppressed
 surpresses||suppresses
 susbsystem||subsystem
 suspeneded||suspended
@@ -1091,6 +1118,7 @@ swithced||switched
 swithcing||switching
 swithed||switched
 swithing||switching
+swtich||switch
 symetric||symmetric
 synax||syntax
 synchonized||synchronized
@@ -1111,7 +1139,9 @@ therfore||therefore
 thier||their
 threds||threads
 threshhold||threshold
+thresold||threshold
 throught||through
+troughput||throughput
 thses||these
 tiggered||triggered
 tipically||typically
@@ -1120,6 +1150,7 @@ tmis||this
 torerable||tolerable
 tramsmitted||transmitted
 tramsmit||transmit
+tranasction||transaction
 tranfer||transfer
 transciever||transceiver
 transferd||transferred
@@ -1133,6 +1164,7 @@ trasmission||transmission
 treshold||threshold
 trigerring||triggering
 trun||turn
+tunning||tuning
 ture||true
 tyep||type
 udpate||update
@@ -1199,6 +1231,7 @@ visiters||visitors
 vitual||virtual
 wakeus||wakeups
 wating||waiting
+wiat||wait
 wether||whether
 whataver||whatever
 whcih||which

From fa87b91c94a8fd2c8502b6761be2d08a8e9bcf55 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Tue, 3 Oct 2017 16:14:24 -0700
Subject: [PATCH 04/51] include/linux/mm.h: fix typo in VM_MPX definition

There's a typo in recent change of VM_MPX definition.  We want it to be
VM_HIGH_ARCH_4, not VM_HIGH_ARCH_BIT_4.

This bug does cause visible regressions.  In arch_vma_name the vmflags
are tested against VM_MPX.  With the incorrect value of VM_MPX, a number
of vmas (such as the stack) test positive and end up being marked as
"[mpx]" in /proc/N/maps instead of their correct names.

This confuses tools like rr which expect to be able to find familiar
vmas.

Fixes: df3735c5b40f ("x86,mpx: make mpx depend on x86-64 to free up VMA flag")
Link: http://lkml.kernel.org/r/20170918140253.36856-1-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Kyle Huey <me@kylehuey.com>
Cc: <stable@vger.kernel.org>	[4.14+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f8c10d336e42..065d99deb847 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -240,7 +240,7 @@ extern unsigned int kobjsize(const void *objp);
 
 #if defined(CONFIG_X86_INTEL_MPX)
 /* MPX specific bounds table or bounds directory */
-# define VM_MPX		VM_HIGH_ARCH_BIT_4
+# define VM_MPX		VM_HIGH_ARCH_4
 #else
 # define VM_MPX		VM_NONE
 #endif

From 4b22927f0cbd58303aac689e378d20bf56267a39 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 3 Oct 2017 16:14:27 -0700
Subject: [PATCH 05/51] ksm: fix unlocked iteration over vmas in
 cmp_and_merge_page()

In this place mm is unlocked, so vmas or list may change.  Down read
mmap_sem to protect them from modifications.

Link: http://lkml.kernel.org/r/150512788393.10691.8868381099691121308.stgit@localhost.localdomain
Fixes: e86c59b1b12d ("mm/ksm: improve deduplication of zero pages with colouring")
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: zhong jiang <zhongjiang@huawei.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/ksm.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index 15dd7415f7b3..6cb60f46cce5 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1990,6 +1990,7 @@ static void stable_tree_append(struct rmap_item *rmap_item,
  */
 static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
 {
+	struct mm_struct *mm = rmap_item->mm;
 	struct rmap_item *tree_rmap_item;
 	struct page *tree_page = NULL;
 	struct stable_node *stable_node;
@@ -2062,9 +2063,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
 	if (ksm_use_zero_pages && (checksum == zero_checksum)) {
 		struct vm_area_struct *vma;
 
-		vma = find_mergeable_vma(rmap_item->mm, rmap_item->address);
+		down_read(&mm->mmap_sem);
+		vma = find_mergeable_vma(mm, rmap_item->address);
 		err = try_to_merge_one_page(vma, page,
 					    ZERO_PAGE(rmap_item->address));
+		up_read(&mm->mmap_sem);
 		/*
 		 * In case of failure, the page was not really empty, so we
 		 * need to continue. Otherwise we're done.

From 19bfbe22f59a207417b2679e7e83c180419c9ec5 Mon Sep 17 00:00:00 2001
From: Alexandru Moise <00moses.alexander00@gmail.com>
Date: Tue, 3 Oct 2017 16:14:31 -0700
Subject: [PATCH 06/51] mm, hugetlb, soft_offline: save compound page order
 before page migration

This fixes a bug in madvise() where if you'd try to soft offline a
hugepage via madvise(), while walking the address range you'd end up,
using the wrong page offset due to attempting to get the compound order
of a former but presently not compound page, due to dissolving the huge
page (since commit c3114a84f7f9: "mm: hugetlb: soft-offline: dissolve
source hugepage after successful migration").

As a result I ended up with all my free pages except one being offlined.

Link: http://lkml.kernel.org/r/20170912204306.GA12053@gmail.com
Fixes: c3114a84f7f9 ("mm: hugetlb: soft-offline: dissolve source hugepage after successful migration")
Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Shaohua Li <shli@fb.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: David Rientjes <rientjes@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/madvise.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 21261ff0466f..25bade36e9ca 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -625,18 +625,26 @@ static int madvise_inject_error(int behavior,
 {
 	struct page *page;
 	struct zone *zone;
+	unsigned int order;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	for (; start < end; start += PAGE_SIZE <<
-				compound_order(compound_head(page))) {
+
+	for (; start < end; start += PAGE_SIZE << order) {
 		int ret;
 
 		ret = get_user_pages_fast(start, 1, 0, &page);
 		if (ret != 1)
 			return ret;
 
+		/*
+		 * When soft offlining hugepages, after migrating the page
+		 * we dissolve it, therefore in the second loop "page" will
+		 * no longer be a compound page, and order will be 0.
+		 */
+		order = compound_order(compound_head(page));
+
 		if (PageHWPoison(page)) {
 			put_page(page);
 			continue;

From b78412b8300a8453b78d2c1b0b925b66493bb011 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Tue, 3 Oct 2017 16:14:34 -0700
Subject: [PATCH 07/51] sh: sh7722: remove nonexistent GPIO_PTQ7 to fix pinctrl
 registration

Patch series "sh: sh7722/sh7757i/sh7264/sh7269: Fix pinctrl registration",
v2.

Magnus Damm reported that on sh7722/Migo-R, pinctrl registration fails
with:

    sh-pfc pfc-sh7722: pin 0 already registered
    sh-pfc pfc-sh7722: error during pin registration
    sh-pfc pfc-sh7722: could not register: -22
    sh-pfc: probe of pfc-sh7722 failed with error -22

pinmux_pins[] is initialized through PINMUX_GPIO(), using designated
array initializers, where the GPIO_* enums serve as indices.  Apparently
GPIO_PTQ7 was defined in the enum, but never used.  If enum values are
defined, but never used, pinmux_pins[] contains (zero-filled) holes.
Hence such entries are treated as pin zero, which was registered before,
and pinctrl registration fails.

I can't see how this ever worked, as at the time of commit f5e25ae52fef
("sh-pfc: Add sh7722 pinmux support"), pinmux_gpios[] in
drivers/pinctrl/sh-pfc/pfc-sh7722.c already had the hole, and
drivers/pinctrl/core.c already had the check.

Some scripting revealed a few more broken drivers:
  - sh7757 has four holes, due to nonexistent GPIO_PT[JLNQ]7_RESV.
  - sh7264 and sh7269 define GPIO_PH[0-7], but don't use it with
    PINMUX_GPIO().

Patch 1 fixes the issue on sh7722, and was tested.  Patches 3-4 should
fix the issue on the other 3 SoCs, but was untested due to lack of
hardware.

This patch (of 4):

On sh7722/Migo-R, pinctrl registration fails with:

    sh-pfc pfc-sh7722: pin 0 already registered
    sh-pfc pfc-sh7722: error during pin registration
    sh-pfc pfc-sh7722: could not register: -22
    sh-pfc: probe of pfc-sh7722 failed with error -22

pinmux_pins[] is initialized through PINMUX_GPIO(), using designated array
initializers, where the GPIO_* enums serve as indices.  As GPIO_PTQ7 is
defined in the enum, but never used, pinmux_pins[] contains a
(zero-filled) hole.  Hence this entry is treated as pin zero, which was
registered before, and pinctrl registration fails.

According to the datasheet, port PTQ7 does not exist.  Hence remove
GPIO_PTQ7 from the enum to fix this.

Link: http://lkml.kernel.org/r/1505205657-18012-2-git-send-email-geert+renesas@glider.be
Fixes: 8d7b5b0af7e070b9 ("sh: Add sh7722 pinmux code")
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reported-by: Magnus Damm <magnus.damm@gmail.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Tested-by: Jacopo Mondi <jacopo+renesas@jmondi.org>
Cc: Rich Felker <dalias@libc.org>
Cc: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/sh/include/cpu-sh4/cpu/sh7722.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sh/include/cpu-sh4/cpu/sh7722.h b/arch/sh/include/cpu-sh4/cpu/sh7722.h
index 3bb74e534d0f..78961ab78a5a 100644
--- a/arch/sh/include/cpu-sh4/cpu/sh7722.h
+++ b/arch/sh/include/cpu-sh4/cpu/sh7722.h
@@ -67,7 +67,7 @@ enum {
 	GPIO_PTN3, GPIO_PTN2, GPIO_PTN1, GPIO_PTN0,
 
 	/* PTQ */
-	GPIO_PTQ7, GPIO_PTQ6, GPIO_PTQ5, GPIO_PTQ4,
+	GPIO_PTQ6, GPIO_PTQ5, GPIO_PTQ4,
 	GPIO_PTQ3, GPIO_PTQ2, GPIO_PTQ1, GPIO_PTQ0,
 
 	/* PTR */

From d8ce38f69843a56da044e56b6c16aecfbc3c6e39 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Tue, 3 Oct 2017 16:14:37 -0700
Subject: [PATCH 08/51] sh: sh7757: remove nonexistent GPIO_PT[JLNQ]7_RESV to
 fix pinctrl registration

Commit 3810e96056ff ("sh: modify pinmux for SH7757 2nd cut") renamed
GPIO_PT[JLNQ]7 to GPIO_PT[JLNQ]7_RESV, and removed the existing users
from the pinmux_pins[] array.

However, pinmux_pins[] is initialized through PINMUX_GPIO(), using
designated array initializers, where the GPIO_* enums serve as indices.
Hence entries were not really removed, but replaced by (zero-filled)
holes.  Such entries are treated as pin zero, which was registered
before, thus leading to pinctrl registration failures, as seen on
sh7722:

    sh-pfc pfc-sh7722: pin 0 already registered
    sh-pfc pfc-sh7722: error during pin registration
    sh-pfc pfc-sh7722: could not register: -22
    sh-pfc: probe of pfc-sh7722 failed with error -22

Remove GPIO_PT[JLNQ]7_RESV from the enum to fix this.

Link: http://lkml.kernel.org/r/1505205657-18012-3-git-send-email-geert+renesas@glider.be
Fixes: 3810e96056ffddf6 ("sh: modify pinmux for SH7757 2nd cut")
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Cc: Jacopo Mondi <jacopo+renesas@jmondi.org>
Cc: Magnus Damm <magnus.damm@gmail.com>
Cc: Rich Felker <dalias@libc.org>
Cc: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/sh/include/cpu-sh4/cpu/sh7757.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/sh/include/cpu-sh4/cpu/sh7757.h b/arch/sh/include/cpu-sh4/cpu/sh7757.h
index 5340f3bc1863..b40fb541e72a 100644
--- a/arch/sh/include/cpu-sh4/cpu/sh7757.h
+++ b/arch/sh/include/cpu-sh4/cpu/sh7757.h
@@ -40,7 +40,7 @@ enum {
 
 	/* PTJ */
 	GPIO_PTJ0, GPIO_PTJ1, GPIO_PTJ2, GPIO_PTJ3,
-	GPIO_PTJ4, GPIO_PTJ5, GPIO_PTJ6, GPIO_PTJ7_RESV,
+	GPIO_PTJ4, GPIO_PTJ5, GPIO_PTJ6,
 
 	/* PTK */
 	GPIO_PTK0, GPIO_PTK1, GPIO_PTK2, GPIO_PTK3,
@@ -48,7 +48,7 @@ enum {
 
 	/* PTL */
 	GPIO_PTL0, GPIO_PTL1, GPIO_PTL2, GPIO_PTL3,
-	GPIO_PTL4, GPIO_PTL5, GPIO_PTL6, GPIO_PTL7_RESV,
+	GPIO_PTL4, GPIO_PTL5, GPIO_PTL6,
 
 	/* PTM */
 	GPIO_PTM0, GPIO_PTM1, GPIO_PTM2, GPIO_PTM3,
@@ -56,7 +56,7 @@ enum {
 
 	/* PTN */
 	GPIO_PTN0, GPIO_PTN1, GPIO_PTN2, GPIO_PTN3,
-	GPIO_PTN4, GPIO_PTN5, GPIO_PTN6, GPIO_PTN7_RESV,
+	GPIO_PTN4, GPIO_PTN5, GPIO_PTN6,
 
 	/* PTO */
 	GPIO_PTO0, GPIO_PTO1, GPIO_PTO2, GPIO_PTO3,
@@ -68,7 +68,7 @@ enum {
 
 	/* PTQ */
 	GPIO_PTQ0, GPIO_PTQ1, GPIO_PTQ2, GPIO_PTQ3,
-	GPIO_PTQ4, GPIO_PTQ5, GPIO_PTQ6, GPIO_PTQ7_RESV,
+	GPIO_PTQ4, GPIO_PTQ5, GPIO_PTQ6,
 
 	/* PTR */
 	GPIO_PTR0, GPIO_PTR1, GPIO_PTR2, GPIO_PTR3,

From eae3df7e82318d798f45dedf111e241805ec7a4a Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Tue, 3 Oct 2017 16:14:41 -0700
Subject: [PATCH 09/51] sh: sh7264: remove nonexistent GPIO_PH[0-7] to fix
 pinctrl registration

Pinmux_pins[] is initialized through PINMUX_GPIO(), using designated
array initializers, where the GPIO_* enums serve as indices.  If enum
values are defined, but never used, pinmux_pins[] contains (zero-filled)
holes.  Such entries are treated as pin zero, which was registered
before, thus leading to pinctrl registration failures, as seen on
sh7722:

    sh-pfc pfc-sh7722: pin 0 already registered
    sh-pfc pfc-sh7722: error during pin registration
    sh-pfc pfc-sh7722: could not register: -22
    sh-pfc: probe of pfc-sh7722 failed with error -22

Remove GPIO_PH[0-7] from the enum to fix this.

Link: http://lkml.kernel.org/r/1505205657-18012-4-git-send-email-geert+renesas@glider.be
Fixes: 41797f75486d8ca3 ("sh: Add pinmux for sh7264")
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Cc: Jacopo Mondi <jacopo+renesas@jmondi.org>
Cc: Magnus Damm <magnus.damm@gmail.com>
Cc: Rich Felker <dalias@libc.org>
Cc: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/sh/include/cpu-sh2a/cpu/sh7264.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/sh/include/cpu-sh2a/cpu/sh7264.h b/arch/sh/include/cpu-sh2a/cpu/sh7264.h
index 4d1ef6d74bd6..2ae0e938b657 100644
--- a/arch/sh/include/cpu-sh2a/cpu/sh7264.h
+++ b/arch/sh/include/cpu-sh2a/cpu/sh7264.h
@@ -43,9 +43,7 @@ enum {
 	GPIO_PG7, GPIO_PG6, GPIO_PG5, GPIO_PG4,
 	GPIO_PG3, GPIO_PG2, GPIO_PG1, GPIO_PG0,
 
-	/* Port H */
-	GPIO_PH7, GPIO_PH6, GPIO_PH5, GPIO_PH4,
-	GPIO_PH3, GPIO_PH2, GPIO_PH1, GPIO_PH0,
+	/* Port H - Port H does not have a Data Register */
 
 	/* Port I - not on device */
 

From d9d73e81fe82fdf4ee65a48c26531edc04108349 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Tue, 3 Oct 2017 16:14:44 -0700
Subject: [PATCH 10/51] sh: sh7269: remove nonexistent GPIO_PH[0-7] to fix
 pinctrl registration

Pinmux_pins[] is initialized through PINMUX_GPIO(), using designated
array initializers, where the GPIO_* enums serve as indices.  If enum
values are defined, but never used, pinmux_pins[] contains (zero-filled)
holes.  Such entries are treated as pin zero, which was registered
before, thus leading to pinctrl registration failures, as seen on
sh7722:

    sh-pfc pfc-sh7722: pin 0 already registered
    sh-pfc pfc-sh7722: error during pin registration
    sh-pfc pfc-sh7722: could not register: -22
    sh-pfc: probe of pfc-sh7722 failed with error -22

Remove GPIO_PH[0-7] from the enum to fix this.

Link: http://lkml.kernel.org/r/1505205657-18012-5-git-send-email-geert+renesas@glider.be
Fixes: ef0fa5331a73e479 ("sh: Add pinmux for sh7269")
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Rich Felker <dalias@libc.org>
Cc: Magnus Damm <magnus.damm@gmail.com>
Cc: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Cc: Jacopo Mondi <jacopo+renesas@jmondi.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/sh/include/cpu-sh2a/cpu/sh7269.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/sh/include/cpu-sh2a/cpu/sh7269.h b/arch/sh/include/cpu-sh2a/cpu/sh7269.h
index 2a0ca8780f0d..13c495a9fc00 100644
--- a/arch/sh/include/cpu-sh2a/cpu/sh7269.h
+++ b/arch/sh/include/cpu-sh2a/cpu/sh7269.h
@@ -45,9 +45,7 @@ enum {
 	GPIO_PG7, GPIO_PG6, GPIO_PG5, GPIO_PG4,
 	GPIO_PG3, GPIO_PG2, GPIO_PG1, GPIO_PG0,
 
-	/* Port H */
-	GPIO_PH7, GPIO_PH6, GPIO_PH5, GPIO_PH4,
-	GPIO_PH3, GPIO_PH2, GPIO_PH1, GPIO_PH0,
+	/* Port H - Port H does not have a Data Register */
 
 	/* Port I - not on device */
 

From d5567c9df1ef001b2a7e6684b3b3498371ee4cae Mon Sep 17 00:00:00 2001
From: Vitaly Wool <vitalywool@gmail.com>
Date: Tue, 3 Oct 2017 16:14:47 -0700
Subject: [PATCH 11/51] z3fold: fix potential race in z3fold_reclaim_page

It is possible that on a (partially) unsuccessful page reclaim,
kref_put() called in z3fold_reclaim_page() does not yield page release,
but the page is released shortly afterwards by another thread.  Then
z3fold_reclaim_page() would try to list_add() that (released) page again
which is obviously a bug.

To avoid that, spin_lock() has to be taken earlier, before the
kref_put() call mentioned earlier.

Link: http://lkml.kernel.org/r/20170913162937.bfff21c7d12b12a5f47639fd@gmail.com
Signed-off-by: Vitaly Wool <vitalywool@gmail.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: <Oleksiy.Avramchenko@sony.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/z3fold.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/z3fold.c b/mm/z3fold.c
index 486550df32be..b04fa3ba1bf2 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -875,16 +875,18 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
 				goto next;
 		}
 next:
+		spin_lock(&pool->lock);
 		if (test_bit(PAGE_HEADLESS, &page->private)) {
 			if (ret == 0) {
+				spin_unlock(&pool->lock);
 				free_z3fold_page(page);
 				return 0;
 			}
 		} else if (kref_put(&zhdr->refcount, release_z3fold_page)) {
 			atomic64_dec(&pool->pages_nr);
+			spin_unlock(&pool->lock);
 			return 0;
 		}
-		spin_lock(&pool->lock);
 
 		/*
 		 * Add to the beginning of LRU.

From 4d4bbd8526a8fbeb2c090ea360211fceff952383 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Tue, 3 Oct 2017 16:14:50 -0700
Subject: [PATCH 12/51] mm, oom_reaper: skip mm structs with mmu notifiers

Andrea has noticed that the oom_reaper doesn't invalidate the range via
mmu notifiers (mmu_notifier_invalidate_range_start/end) and that can
corrupt the memory of the kvm guest for example.

tlb_flush_mmu_tlbonly already invokes mmu notifiers but that is not
sufficient as per Andrea:

 "mmu_notifier_invalidate_range cannot be used in replacement of
  mmu_notifier_invalidate_range_start/end. For KVM
  mmu_notifier_invalidate_range is a noop and rightfully so. A MMU
  notifier implementation has to implement either ->invalidate_range
  method or the invalidate_range_start/end methods, not both. And if you
  implement invalidate_range_start/end like KVM is forced to do, calling
  mmu_notifier_invalidate_range in common code is a noop for KVM.

  For those MMU notifiers that can get away only implementing
  ->invalidate_range, the ->invalidate_range is implicitly called by
  mmu_notifier_invalidate_range_end(). And only those secondary MMUs
  that share the same pagetable with the primary MMU (like AMD iommuv2)
  can get away only implementing ->invalidate_range"

As the callback is allowed to sleep and the implementation is out of
hand of the MM it is safer to simply bail out if there is an mmu
notifier registered.  In order to not fail too early make the
mm_has_notifiers check under the oom_lock and have a little nap before
failing to give the current oom victim some more time to exit.

[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/20170913113427.2291-1-mhocko@kernel.org
Fixes: aac453635549 ("mm, oom: introduce oom reaper")
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmu_notifier.h |  5 +++++
 mm/oom_kill.c                | 16 ++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 7b2e31b1745a..6866e8126982 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -400,6 +400,11 @@ extern void mmu_notifier_synchronize(void);
 
 #else /* CONFIG_MMU_NOTIFIER */
 
+static inline int mm_has_notifiers(struct mm_struct *mm)
+{
+	return 0;
+}
+
 static inline void mmu_notifier_release(struct mm_struct *mm)
 {
 }
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 99736e026712..dee0f75c3013 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -40,6 +40,7 @@
 #include <linux/ratelimit.h>
 #include <linux/kthread.h>
 #include <linux/init.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/tlb.h>
 #include "internal.h"
@@ -494,6 +495,21 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 		goto unlock_oom;
 	}
 
+	/*
+	 * If the mm has notifiers then we would need to invalidate them around
+	 * unmap_page_range and that is risky because notifiers can sleep and
+	 * what they do is basically undeterministic.  So let's have a short
+	 * sleep to give the oom victim some more time.
+	 * TODO: we really want to get rid of this ugly hack and make sure that
+	 * notifiers cannot block for unbounded amount of time and add
+	 * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
+	 */
+	if (mm_has_notifiers(mm)) {
+		up_read(&mm->mmap_sem);
+		schedule_timeout_idle(HZ);
+		goto unlock_oom;
+	}
+
 	/*
 	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
 	 * work on the mm anymore. The check for MMF_OOM_SKIP must run

From 72f0184c8a00c70179cfed6266e2e06b4d400065 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Tue, 3 Oct 2017 16:14:53 -0700
Subject: [PATCH 13/51] mm, memcg: remove hotplug locking from try_charge

The following lockdep splat has been noticed during LTP testing

  ======================================================
  WARNING: possible circular locking dependency detected
  4.13.0-rc3-next-20170807 #12 Not tainted
  ------------------------------------------------------
  a.out/4771 is trying to acquire lock:
   (cpu_hotplug_lock.rw_sem){++++++}, at: [<ffffffff812b4668>] drain_all_stock.part.35+0x18/0x140

  but task is already holding lock:
   (&mm->mmap_sem){++++++}, at: [<ffffffff8106eb35>] __do_page_fault+0x175/0x530

  which lock already depends on the new lock.

  the existing dependency chain (in reverse order) is:

  -> #3 (&mm->mmap_sem){++++++}:
         lock_acquire+0xc9/0x230
         __might_fault+0x70/0xa0
         _copy_to_user+0x23/0x70
         filldir+0xa7/0x110
         xfs_dir2_sf_getdents.isra.10+0x20c/0x2c0 [xfs]
         xfs_readdir+0x1fa/0x2c0 [xfs]
         xfs_file_readdir+0x30/0x40 [xfs]
         iterate_dir+0x17a/0x1a0
         SyS_getdents+0xb0/0x160
         entry_SYSCALL_64_fastpath+0x1f/0xbe

  -> #2 (&type->i_mutex_dir_key#3){++++++}:
         lock_acquire+0xc9/0x230
         down_read+0x51/0xb0
         lookup_slow+0xde/0x210
         walk_component+0x160/0x250
         link_path_walk+0x1a6/0x610
         path_openat+0xe4/0xd50
         do_filp_open+0x91/0x100
         file_open_name+0xf5/0x130
         filp_open+0x33/0x50
         kernel_read_file_from_path+0x39/0x80
         _request_firmware+0x39f/0x880
         request_firmware_direct+0x37/0x50
         request_microcode_fw+0x64/0xe0
         reload_store+0xf7/0x180
         dev_attr_store+0x18/0x30
         sysfs_kf_write+0x44/0x60
         kernfs_fop_write+0x113/0x1a0
         __vfs_write+0x37/0x170
         vfs_write+0xc7/0x1c0
         SyS_write+0x58/0xc0
         do_syscall_64+0x6c/0x1f0
         return_from_SYSCALL_64+0x0/0x7a

  -> #1 (microcode_mutex){+.+.+.}:
         lock_acquire+0xc9/0x230
         __mutex_lock+0x88/0x960
         mutex_lock_nested+0x1b/0x20
         microcode_init+0xbb/0x208
         do_one_initcall+0x51/0x1a9
         kernel_init_freeable+0x208/0x2a7
         kernel_init+0xe/0x104
         ret_from_fork+0x2a/0x40

  -> #0 (cpu_hotplug_lock.rw_sem){++++++}:
         __lock_acquire+0x153c/0x1550
         lock_acquire+0xc9/0x230
         cpus_read_lock+0x4b/0x90
         drain_all_stock.part.35+0x18/0x140
         try_charge+0x3ab/0x6e0
         mem_cgroup_try_charge+0x7f/0x2c0
         shmem_getpage_gfp+0x25f/0x1050
         shmem_fault+0x96/0x200
         __do_fault+0x1e/0xa0
         __handle_mm_fault+0x9c3/0xe00
         handle_mm_fault+0x16e/0x380
         __do_page_fault+0x24a/0x530
         do_page_fault+0x30/0x80
         page_fault+0x28/0x30

  other info that might help us debug this:

  Chain exists of:
    cpu_hotplug_lock.rw_sem --> &type->i_mutex_dir_key#3 --> &mm->mmap_sem

   Possible unsafe locking scenario:

         CPU0                    CPU1
         ----                    ----
    lock(&mm->mmap_sem);
                                 lock(&type->i_mutex_dir_key#3);
                                 lock(&mm->mmap_sem);
    lock(cpu_hotplug_lock.rw_sem);

   *** DEADLOCK ***

  2 locks held by a.out/4771:
   #0:  (&mm->mmap_sem){++++++}, at: [<ffffffff8106eb35>] __do_page_fault+0x175/0x530
   #1:  (percpu_charge_mutex){+.+...}, at: [<ffffffff812b4c97>] try_charge+0x397/0x6e0

The problem is very similar to the one fixed by commit a459eeb7b852
("mm, page_alloc: do not depend on cpu hotplug locks inside the
allocator").  We are taking hotplug locks while we can be sitting on top
of basically arbitrary locks.  This just calls for problems.

We can get rid of {get,put}_online_cpus, fortunately.  We do not have to
be worried about races with memory hotplug because drain_local_stock,
which is called from both the WQ draining and the memory hotplug
contexts, is always operating on the local cpu stock with IRQs disabled.

The only thing to be careful about is that the target memcg doesn't
vanish while we are still in drain_all_stock so take a reference on it.

Link: http://lkml.kernel.org/r/20170913090023.28322-1-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Artem Savkov <asavkov@redhat.com>
Tested-by: Artem Savkov <asavkov@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 15af3da5af02..696c6529e900 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1777,6 +1777,10 @@ static void drain_local_stock(struct work_struct *dummy)
 	struct memcg_stock_pcp *stock;
 	unsigned long flags;
 
+	/*
+	 * The only protection from memory hotplug vs. drain_stock races is
+	 * that we always operate on local CPU stock here with IRQ disabled
+	 */
 	local_irq_save(flags);
 
 	stock = this_cpu_ptr(&memcg_stock);
@@ -1821,27 +1825,33 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
 	/* If someone's already draining, avoid adding running more workers. */
 	if (!mutex_trylock(&percpu_charge_mutex))
 		return;
-	/* Notify other cpus that system-wide "drain" is running */
-	get_online_cpus();
+	/*
+	 * Notify other cpus that system-wide "drain" is running
+	 * We do not care about races with the cpu hotplug because cpu down
+	 * as well as workers from this path always operate on the local
+	 * per-cpu data. CPU up doesn't touch memcg_stock at all.
+	 */
 	curcpu = get_cpu();
 	for_each_online_cpu(cpu) {
 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
 		struct mem_cgroup *memcg;
 
 		memcg = stock->cached;
-		if (!memcg || !stock->nr_pages)
+		if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
 			continue;
-		if (!mem_cgroup_is_descendant(memcg, root_memcg))
+		if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
+			css_put(&memcg->css);
 			continue;
+		}
 		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
 			if (cpu == curcpu)
 				drain_local_stock(&stock->work);
 			else
 				schedule_work_on(cpu, &stock->work);
 		}
+		css_put(&memcg->css);
 	}
 	put_cpu();
-	put_online_cpus();
 	mutex_unlock(&percpu_charge_mutex);
 }
 

From 3f2eb0287ebd62ec8d6d544f830285302279e6bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
Date: Tue, 3 Oct 2017 16:14:57 -0700
Subject: [PATCH 14/51] mm/memcg: avoid page count check for zone device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix for 4.14, zone device page always have an elevated refcount of one
and thus page count sanity check in uncharge_page() is inappropriate for
them.

[mhocko@suse.com: nano-optimize VM_BUG_ON in uncharge_page]
Link: http://lkml.kernel.org/r/20170914190011.5217-1-jglisse@redhat.com
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Evgeny Baskakov <ebaskakov@nvidia.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 696c6529e900..d5f3a62887cf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5658,7 +5658,8 @@ static void uncharge_batch(const struct uncharge_gather *ug)
 static void uncharge_page(struct page *page, struct uncharge_gather *ug)
 {
 	VM_BUG_ON_PAGE(PageLRU(page), page);
-	VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
+	VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
+			!PageHWPoison(page) , page);
 
 	if (!page->mem_cgroup)
 		return;

From a1b2289cef92ef0e9a92afcd2e1ea71d5bcaaf64 Mon Sep 17 00:00:00 2001
From: Sherry Yang <sherryy@android.com>
Date: Tue, 3 Oct 2017 16:15:00 -0700
Subject: [PATCH 15/51] android: binder: drop lru lock in isolate callback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop the global lru lock in isolate callback before calling
zap_page_range which calls cond_resched, and re-acquire the global lru
lock before returning.  Also change return code to LRU_REMOVED_RETRY.

Use mmput_async when fail to acquire mmap sem in an atomic context.

Fix "BUG: sleeping function called from invalid context"
errors when CONFIG_DEBUG_ATOMIC_SLEEP is enabled.

Also restore mmput_async, which was initially introduced in commit
ec8d7c14ea14 ("mm, oom_reaper: do not mmput synchronously from the oom
reaper context"), and was removed in commit 212925802454 ("mm: oom: let
oom_reap_task and exit_mmap run concurrently").

Link: http://lkml.kernel.org/r/20170914182231.90908-1-sherryy@android.com
Fixes: f2517eb76f1f2 ("android: binder: Add global lru shrinker to binder")
Signed-off-by: Sherry Yang <sherryy@android.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reported-by: Kyle Yan <kyan@codeaurora.org>
Acked-by: Arve Hjønnevåg <arve@android.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Martijn Coenen <maco@google.com>
Cc: Todd Kjos <tkjos@google.com>
Cc: Riley Andrews <riandrews@android.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Hoeun Ryu <hoeun.ryu@gmail.com>
Cc: Christopher Lameter <cl@linux.com>
Cc: Vegard Nossum <vegard.nossum@oracle.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/android/binder_alloc.c | 18 ++++++++++++------
 include/linux/sched/mm.h       |  6 ++++++
 kernel/fork.c                  | 18 ++++++++++++++++++
 3 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index 8fe165844e47..064f5e31ec55 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -913,6 +913,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
 	struct binder_alloc *alloc;
 	uintptr_t page_addr;
 	size_t index;
+	struct vm_area_struct *vma;
 
 	alloc = page->alloc;
 	if (!mutex_trylock(&alloc->mutex))
@@ -923,16 +924,22 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
 
 	index = page - alloc->pages;
 	page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE;
-	if (alloc->vma) {
+	vma = alloc->vma;
+	if (vma) {
 		mm = get_task_mm(alloc->tsk);
 		if (!mm)
 			goto err_get_task_mm_failed;
 		if (!down_write_trylock(&mm->mmap_sem))
 			goto err_down_write_mmap_sem_failed;
+	}
 
+	list_lru_isolate(lru, item);
+	spin_unlock(lock);
+
+	if (vma) {
 		trace_binder_unmap_user_start(alloc, index);
 
-		zap_page_range(alloc->vma,
+		zap_page_range(vma,
 			       page_addr + alloc->user_buffer_offset,
 			       PAGE_SIZE);
 
@@ -950,13 +957,12 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
 
 	trace_binder_unmap_kernel_end(alloc, index);
 
-	list_lru_isolate(lru, item);
-
+	spin_lock(lock);
 	mutex_unlock(&alloc->mutex);
-	return LRU_REMOVED;
+	return LRU_REMOVED_RETRY;
 
 err_down_write_mmap_sem_failed:
-	mmput(mm);
+	mmput_async(mm);
 err_get_task_mm_failed:
 err_page_already_freed:
 	mutex_unlock(&alloc->mutex);
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 3a19c253bdb1..ae53e413fb13 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -84,6 +84,12 @@ static inline bool mmget_not_zero(struct mm_struct *mm)
 
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
+#ifdef CONFIG_MMU
+/* same as above but performs the slow path from the async context. Can
+ * be called from the atomic context as well
+ */
+void mmput_async(struct mm_struct *);
+#endif
 
 /* Grab a reference to a task's mm, if it is not already going away */
 extern struct mm_struct *get_task_mm(struct task_struct *task);
diff --git a/kernel/fork.c b/kernel/fork.c
index 10646182440f..e702cb9ffbd8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -946,6 +946,24 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
+#ifdef CONFIG_MMU
+static void mmput_async_fn(struct work_struct *work)
+{
+	struct mm_struct *mm = container_of(work, struct mm_struct,
+					    async_put_work);
+
+	__mmput(mm);
+}
+
+void mmput_async(struct mm_struct *mm)
+{
+	if (atomic_dec_and_test(&mm->mm_users)) {
+		INIT_WORK(&mm->async_put_work, mmput_async_fn);
+		schedule_work(&mm->async_put_work);
+	}
+}
+#endif
+
 /**
  * set_mm_exe_file - change a reference to the mm's executable file
  *

From 6818600ff094ca255a7fe31838ad50c29656c3c5 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 3 Oct 2017 16:15:03 -0700
Subject: [PATCH 16/51] mm,compaction: serialize waitqueue_active() checks (for
 real)

Andrea brought to my attention that the L->{L,S} guarantees are
completely bogus for this case.  I was looking at the diagram, from the
offending commit, when that _is_ the race, we had the load reordered
already.

What we need is at least S->L semantics, thus simply use
wq_has_sleeper() to serialize the call for good.

Link: http://lkml.kernel.org/r/20170914175313.GB811@linux-80c1.suse
Fixes: 46acef048a6 (mm,compaction: serialize waitqueue_active() checks)
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Reported-by: Andrea Parri <parri.andrea@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index fb548e4c7bd4..03d31a875341 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1999,17 +1999,14 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
 	if (pgdat->kcompactd_max_order < order)
 		pgdat->kcompactd_max_order = order;
 
-	/*
-	 * Pairs with implicit barrier in wait_event_freezable()
-	 * such that wakeups are not missed in the lockless
-	 * waitqueue_active() call.
-	 */
-	smp_acquire__after_ctrl_dep();
-
 	if (pgdat->kcompactd_classzone_idx > classzone_idx)
 		pgdat->kcompactd_classzone_idx = classzone_idx;
 
-	if (!waitqueue_active(&pgdat->kcompactd_wait))
+	/*
+	 * Pairs with implicit barrier in wait_event_freezable()
+	 * such that wakeups are not missed.
+	 */
+	if (!wq_has_sleeper(&pgdat->kcompactd_wait))
 		return;
 
 	if (!kcompactd_node_suitable(pgdat))

From 3552935742e0d5f0dafd823736f45bdaa7ba672c Mon Sep 17 00:00:00 2001
From: Vitaly Wool <vitalywool@gmail.com>
Date: Tue, 3 Oct 2017 16:15:06 -0700
Subject: [PATCH 17/51] z3fold: fix stale list handling

Fix the situation when clear_bit() is called for page->private before
the page pointer is actually assigned.  While at it, remove work_busy()
check because it is costly and does not give 100% guarantee anyway.

Signed-off-by: Vitaly Wool <vitalywool@gmail.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: <Oleksiy.Avramchenko@sony.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/z3fold.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/mm/z3fold.c b/mm/z3fold.c
index b04fa3ba1bf2..b2ba2ba585f3 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -250,6 +250,7 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
 
 	WARN_ON(!list_empty(&zhdr->buddy));
 	set_bit(PAGE_STALE, &page->private);
+	clear_bit(NEEDS_COMPACTING, &page->private);
 	spin_lock(&pool->lock);
 	if (!list_empty(&page->lru))
 		list_del(&page->lru);
@@ -303,7 +304,6 @@ static void free_pages_work(struct work_struct *w)
 		list_del(&zhdr->buddy);
 		if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
 			continue;
-		clear_bit(NEEDS_COMPACTING, &page->private);
 		spin_unlock(&pool->stale_lock);
 		cancel_work_sync(&zhdr->work);
 		free_z3fold_page(page);
@@ -624,10 +624,8 @@ lookup:
 	 * stale pages list. cancel_work_sync() can sleep so we must make
 	 * sure it won't be called in case we're in atomic context.
 	 */
-	if (zhdr && (can_sleep || !work_pending(&zhdr->work) ||
-	    !unlikely(work_busy(&zhdr->work)))) {
+	if (zhdr && (can_sleep || !work_pending(&zhdr->work))) {
 		list_del(&zhdr->buddy);
-		clear_bit(NEEDS_COMPACTING, &page->private);
 		spin_unlock(&pool->stale_lock);
 		if (can_sleep)
 			cancel_work_sync(&zhdr->work);

From 57148a64e823bb1f49112fa52a92a7f372cda892 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 3 Oct 2017 16:15:10 -0700
Subject: [PATCH 18/51] mm: meminit: mark init_reserved_page as __meminit

The function is called from __meminit context and calls other __meminit
functions but isn't it self mark as such today:

  WARNING: vmlinux.o(.text.unlikely+0x4516): Section mismatch in reference from the function init_reserved_page() to the function .meminit.text:early_pfn_to_nid()
  The function init_reserved_page() references the function __meminit early_pfn_to_nid().
  This is often because init_reserved_page lacks a __meminit annotation or the annotation of early_pfn_to_nid is wrong.

On most compilers, we don't notice this because the function gets
inlined all the time.  Adding __meminit here fixes the harmless warning
for the old versions and is generally the correct annotation.

Link: http://lkml.kernel.org/r/20170915193149.901180-1-arnd@arndb.de
Fixes: 7e18adb4f80b ("mm: meminit: initialise remaining struct pages in parallel with kswapd")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c841af88836a..38d165a87860 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1190,7 +1190,7 @@ static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
 }
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void init_reserved_page(unsigned long pfn)
+static void __meminit init_reserved_page(unsigned long pfn)
 {
 	pg_data_t *pgdat;
 	int nid, zid;

From 31d1e130f4a0f8f629a460167569577cac9b17c1 Mon Sep 17 00:00:00 2001
From: Ioan Nicu <ioan.nicu.ext@nokia.com>
Date: Tue, 3 Oct 2017 16:15:13 -0700
Subject: [PATCH 19/51] rapidio: remove global irq spinlocks from the subsystem

Locking of config and doorbell operations should be done only if the
underlying hardware requires it.

This patch removes the global spinlocks from the rapidio subsystem and
moves them to the mport drivers (fsl_rio and tsi721), only to the
necessary places.  For example, local config space read and write
operations (lcread/lcwrite) are atomic in all existing drivers, so there
should be no need for locking, while the cread/cwrite operations which
generate maintenance transactions need to be synchronized with a lock.

Later, each driver could chose to use a per-port lock instead of a
global one, or even more granular locking.

Link: http://lkml.kernel.org/r/20170824113023.GD50104@nokia.com
Signed-off-by: Ioan Nicu <ioan.nicu.ext@nokia.com>
Signed-off-by: Frank Kunz <frank.kunz@nokia.com>
Acked-by: Alexandre Bounine <alexandre.bounine@idt.com>
Cc: Matt Porter <mporter@kernel.crashing.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/sysdev/fsl_rio.c    | 17 ++++++++++++--
 arch/powerpc/sysdev/fsl_rmu.c    |  8 +++++++
 drivers/rapidio/devices/tsi721.c |  7 ++++++
 drivers/rapidio/rio-access.c     | 40 ++++----------------------------
 4 files changed, 35 insertions(+), 37 deletions(-)

diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c
index 9234be1e66f5..5011ffea4e4b 100644
--- a/arch/powerpc/sysdev/fsl_rio.c
+++ b/arch/powerpc/sysdev/fsl_rio.c
@@ -71,6 +71,8 @@
 #define RIWAR_WRTYP_ALLOC	0x00006000
 #define RIWAR_SIZE_MASK		0x0000003F
 
+static DEFINE_SPINLOCK(fsl_rio_config_lock);
+
 #define __fsl_read_rio_config(x, addr, err, op)		\
 	__asm__ __volatile__(				\
 		"1:	"op" %1,0(%2)\n"		\
@@ -184,6 +186,7 @@ fsl_rio_config_read(struct rio_mport *mport, int index, u16 destid,
 			u8 hopcount, u32 offset, int len, u32 *val)
 {
 	struct rio_priv *priv = mport->priv;
+	unsigned long flags;
 	u8 *data;
 	u32 rval, err = 0;
 
@@ -197,6 +200,8 @@ fsl_rio_config_read(struct rio_mport *mport, int index, u16 destid,
 	if (offset > (0x1000000 - len) || !IS_ALIGNED(offset, len))
 		return -EINVAL;
 
+	spin_lock_irqsave(&fsl_rio_config_lock, flags);
+
 	out_be32(&priv->maint_atmu_regs->rowtar,
 		 (destid << 22) | (hopcount << 12) | (offset >> 12));
 	out_be32(&priv->maint_atmu_regs->rowtear, (destid >> 10));
@@ -213,6 +218,7 @@ fsl_rio_config_read(struct rio_mport *mport, int index, u16 destid,
 		__fsl_read_rio_config(rval, data, err, "lwz");
 		break;
 	default:
+		spin_unlock_irqrestore(&fsl_rio_config_lock, flags);
 		return -EINVAL;
 	}
 
@@ -221,6 +227,7 @@ fsl_rio_config_read(struct rio_mport *mport, int index, u16 destid,
 			 err, destid, hopcount, offset);
 	}
 
+	spin_unlock_irqrestore(&fsl_rio_config_lock, flags);
 	*val = rval;
 
 	return err;
@@ -244,7 +251,10 @@ fsl_rio_config_write(struct rio_mport *mport, int index, u16 destid,
 			u8 hopcount, u32 offset, int len, u32 val)
 {
 	struct rio_priv *priv = mport->priv;
+	unsigned long flags;
 	u8 *data;
+	int ret = 0;
+
 	pr_debug
 		("fsl_rio_config_write:"
 		" index %d destid %d hopcount %d offset %8.8x len %d val %8.8x\n",
@@ -255,6 +265,8 @@ fsl_rio_config_write(struct rio_mport *mport, int index, u16 destid,
 	if (offset > (0x1000000 - len) || !IS_ALIGNED(offset, len))
 		return -EINVAL;
 
+	spin_lock_irqsave(&fsl_rio_config_lock, flags);
+
 	out_be32(&priv->maint_atmu_regs->rowtar,
 		 (destid << 22) | (hopcount << 12) | (offset >> 12));
 	out_be32(&priv->maint_atmu_regs->rowtear, (destid >> 10));
@@ -271,10 +283,11 @@ fsl_rio_config_write(struct rio_mport *mport, int index, u16 destid,
 		out_be32((u32 *) data, val);
 		break;
 	default:
-		return -EINVAL;
+		ret = -EINVAL;
 	}
+	spin_unlock_irqrestore(&fsl_rio_config_lock, flags);
 
-	return 0;
+	return ret;
 }
 
 static void fsl_rio_inbound_mem_init(struct rio_priv *priv)
diff --git a/arch/powerpc/sysdev/fsl_rmu.c b/arch/powerpc/sysdev/fsl_rmu.c
index ab7a74c75be8..88b35a3dcdc5 100644
--- a/arch/powerpc/sysdev/fsl_rmu.c
+++ b/arch/powerpc/sysdev/fsl_rmu.c
@@ -104,6 +104,8 @@
 
 #define DOORBELL_MESSAGE_SIZE	0x08
 
+static DEFINE_SPINLOCK(fsl_rio_doorbell_lock);
+
 struct rio_msg_regs {
 	u32 omr;
 	u32 osr;
@@ -626,9 +628,13 @@ err_out:
 int fsl_rio_doorbell_send(struct rio_mport *mport,
 				int index, u16 destid, u16 data)
 {
+	unsigned long flags;
+
 	pr_debug("fsl_doorbell_send: index %d destid %4.4x data %4.4x\n",
 		 index, destid, data);
 
+	spin_lock_irqsave(&fsl_rio_doorbell_lock, flags);
+
 	/* In the serial version silicons, such as MPC8548, MPC8641,
 	 * below operations is must be.
 	 */
@@ -638,6 +644,8 @@ int fsl_rio_doorbell_send(struct rio_mport *mport,
 	out_be32(&dbell->dbell_regs->oddatr, (index << 20) | data);
 	out_be32(&dbell->dbell_regs->odmr, 0x00000001);
 
+	spin_unlock_irqrestore(&fsl_rio_doorbell_lock, flags);
+
 	return 0;
 }
 
diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c
index 315a4be8dc1e..9a68914100ad 100644
--- a/drivers/rapidio/devices/tsi721.c
+++ b/drivers/rapidio/devices/tsi721.c
@@ -51,6 +51,8 @@ module_param(mbox_sel, byte, S_IRUGO);
 MODULE_PARM_DESC(mbox_sel,
 		 "RIO Messaging MBOX Selection Mask (default: 0x0f = all)");
 
+static DEFINE_SPINLOCK(tsi721_maint_lock);
+
 static void tsi721_omsg_handler(struct tsi721_device *priv, int ch);
 static void tsi721_imsg_handler(struct tsi721_device *priv, int ch);
 
@@ -124,12 +126,15 @@ static int tsi721_maint_dma(struct tsi721_device *priv, u32 sys_size,
 	void __iomem *regs = priv->regs + TSI721_DMAC_BASE(priv->mdma.ch_id);
 	struct tsi721_dma_desc *bd_ptr;
 	u32 rd_count, swr_ptr, ch_stat;
+	unsigned long flags;
 	int i, err = 0;
 	u32 op = do_wr ? MAINT_WR : MAINT_RD;
 
 	if (offset > (RIO_MAINT_SPACE_SZ - len) || (len != sizeof(u32)))
 		return -EINVAL;
 
+	spin_lock_irqsave(&tsi721_maint_lock, flags);
+
 	bd_ptr = priv->mdma.bd_base;
 
 	rd_count = ioread32(regs + TSI721_DMAC_DRDCNT);
@@ -197,7 +202,9 @@ static int tsi721_maint_dma(struct tsi721_device *priv, u32 sys_size,
 	 */
 	swr_ptr = ioread32(regs + TSI721_DMAC_DSWP);
 	iowrite32(swr_ptr, regs + TSI721_DMAC_DSRP);
+
 err_out:
+	spin_unlock_irqrestore(&tsi721_maint_lock, flags);
 
 	return err;
 }
diff --git a/drivers/rapidio/rio-access.c b/drivers/rapidio/rio-access.c
index a3824baca2e5..3ee9af83b638 100644
--- a/drivers/rapidio/rio-access.c
+++ b/drivers/rapidio/rio-access.c
@@ -13,17 +13,9 @@
 #include <linux/rio.h>
 #include <linux/module.h>
 
-/*
- * These interrupt-safe spinlocks protect all accesses to RIO
- * configuration space and doorbell access.
- */
-static DEFINE_SPINLOCK(rio_config_lock);
-static DEFINE_SPINLOCK(rio_doorbell_lock);
-
 /*
  *  Wrappers for all RIO configuration access functions.  They just check
- *  alignment, do locking and call the low-level functions pointed to
- *  by rio_mport->ops.
+ *  alignment and call the low-level functions pointed to by rio_mport->ops.
  */
 
 #define RIO_8_BAD 0
@@ -44,13 +36,10 @@ int __rio_local_read_config_##size \
 	(struct rio_mport *mport, u32 offset, type *value)		\
 {									\
 	int res;							\
-	unsigned long flags;						\
 	u32 data = 0;							\
 	if (RIO_##size##_BAD) return RIO_BAD_SIZE;			\
-	spin_lock_irqsave(&rio_config_lock, flags);			\
 	res = mport->ops->lcread(mport, mport->id, offset, len, &data);	\
 	*value = (type)data;						\
-	spin_unlock_irqrestore(&rio_config_lock, flags);		\
 	return res;							\
 }
 
@@ -67,13 +56,8 @@ int __rio_local_read_config_##size \
 int __rio_local_write_config_##size \
 	(struct rio_mport *mport, u32 offset, type value)		\
 {									\
-	int res;							\
-	unsigned long flags;						\
 	if (RIO_##size##_BAD) return RIO_BAD_SIZE;			\
-	spin_lock_irqsave(&rio_config_lock, flags);			\
-	res = mport->ops->lcwrite(mport, mport->id, offset, len, value);\
-	spin_unlock_irqrestore(&rio_config_lock, flags);		\
-	return res;							\
+	return mport->ops->lcwrite(mport, mport->id, offset, len, value);\
 }
 
 RIO_LOP_READ(8, u8, 1)
@@ -104,13 +88,10 @@ int rio_mport_read_config_##size \
 	(struct rio_mport *mport, u16 destid, u8 hopcount, u32 offset, type *value)	\
 {									\
 	int res;							\
-	unsigned long flags;						\
 	u32 data = 0;							\
 	if (RIO_##size##_BAD) return RIO_BAD_SIZE;			\
-	spin_lock_irqsave(&rio_config_lock, flags);			\
 	res = mport->ops->cread(mport, mport->id, destid, hopcount, offset, len, &data); \
 	*value = (type)data;						\
-	spin_unlock_irqrestore(&rio_config_lock, flags);		\
 	return res;							\
 }
 
@@ -127,13 +108,9 @@ int rio_mport_read_config_##size \
 int rio_mport_write_config_##size \
 	(struct rio_mport *mport, u16 destid, u8 hopcount, u32 offset, type value)	\
 {									\
-	int res;							\
-	unsigned long flags;						\
 	if (RIO_##size##_BAD) return RIO_BAD_SIZE;			\
-	spin_lock_irqsave(&rio_config_lock, flags);			\
-	res = mport->ops->cwrite(mport, mport->id, destid, hopcount, offset, len, value); \
-	spin_unlock_irqrestore(&rio_config_lock, flags);		\
-	return res;							\
+	return mport->ops->cwrite(mport, mport->id, destid, hopcount,	\
+			offset, len, value);				\
 }
 
 RIO_OP_READ(8, u8, 1)
@@ -162,14 +139,7 @@ EXPORT_SYMBOL_GPL(rio_mport_write_config_32);
  */
 int rio_mport_send_doorbell(struct rio_mport *mport, u16 destid, u16 data)
 {
-	int res;
-	unsigned long flags;
-
-	spin_lock_irqsave(&rio_doorbell_lock, flags);
-	res = mport->ops->dsend(mport, mport->id, destid, data);
-	spin_unlock_irqrestore(&rio_doorbell_lock, flags);
-
-	return res;
+	return mport->ops->dsend(mport, mport->id, destid, data);
 }
 
 EXPORT_SYMBOL_GPL(rio_mport_send_doorbell);

From a872eb2131e91ce7c89a8888974a5e22a272b12f Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Tue, 3 Oct 2017 16:15:16 -0700
Subject: [PATCH 20/51] mm: fix RODATA_TEST failure "rodata_test: test data was
 not read only"

On powerpc, RODATA_TEST fails with message the following messages:

  Freeing unused kernel memory: 528K
  rodata_test: test data was not read only

This is because GCC allocates it to .data section:

  c0695034 g     O .data	00000004 rodata_test_data

Since commit 056b9d8a7692 ("mm: remove rodata_test_data export, add
pr_fmt"), rodata_test_data is used only inside rodata_test.c By
declaring it static, it gets properly allocated into .rodata section
instead of .data:

  c04df710 l     O .rodata	00000004 rodata_test_data

Fixes: 056b9d8a7692 ("mm: remove rodata_test_data export, add pr_fmt")
Link: http://lkml.kernel.org/r/20170921093729.1080368AC1@po15668-vm-win7.idsi0.si.c-s.fr
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Kees Cook <keescook@chromium.org>
Cc: Jinbum Park <jinb.park7@gmail.com>
Cc: Segher Boessenkool <segher@kernel.crashing.org>
Cc: David Laight <David.Laight@ACULAB.COM>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/rodata_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/rodata_test.c b/mm/rodata_test.c
index 6bb4deb12e78..d908c8769b48 100644
--- a/mm/rodata_test.c
+++ b/mm/rodata_test.c
@@ -14,7 +14,7 @@
 #include <linux/uaccess.h>
 #include <asm/sections.h>
 
-const int rodata_test_data = 0xC3;
+static const int rodata_test_data = 0xC3;
 
 void rodata_test(void)
 {

From ae94264ed4b0cf7cd887947650db4c69acb62072 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Tue, 3 Oct 2017 16:15:19 -0700
Subject: [PATCH 21/51] zram: fix null dereference of handle

In testing I found handle passed to zs_map_object in __zram_bvec_read is
NULL so eh kernel goes oops in pin_object().

The reason is there is no routine to check the slot's freeing after
getting the slot's lock.  This patch fixes it.

[minchan@kernel.org: v2]
  Link: http://lkml.kernel.org/r/1505887347-10881-1-git-send-email-minchan@kernel.org
Link: http://lkml.kernel.org/r/1505788488-26723-1-git-send-email-minchan@kernel.org
Fixes: 1f7319c74275 ("zram: partial IO refactoring")
Signed-off-by: Minchan Kim <minchan@kernel.org>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c | 36 ++++++++++++-----------------------
 1 file changed, 12 insertions(+), 24 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 2981c27d3aae..f149d3e61234 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -766,27 +766,6 @@ static void zram_slot_unlock(struct zram *zram, u32 index)
 	bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value);
 }
 
-static bool zram_same_page_read(struct zram *zram, u32 index,
-				struct page *page,
-				unsigned int offset, unsigned int len)
-{
-	zram_slot_lock(zram, index);
-	if (unlikely(!zram_get_handle(zram, index) ||
-			zram_test_flag(zram, index, ZRAM_SAME))) {
-		void *mem;
-
-		zram_slot_unlock(zram, index);
-		mem = kmap_atomic(page);
-		zram_fill_page(mem + offset, len,
-					zram_get_element(zram, index));
-		kunmap_atomic(mem);
-		return true;
-	}
-	zram_slot_unlock(zram, index);
-
-	return false;
-}
-
 static void zram_meta_free(struct zram *zram, u64 disksize)
 {
 	size_t num_pages = disksize >> PAGE_SHIFT;
@@ -884,11 +863,20 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
 		zram_slot_unlock(zram, index);
 	}
 
-	if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE))
-		return 0;
-
 	zram_slot_lock(zram, index);
 	handle = zram_get_handle(zram, index);
+	if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) {
+		unsigned long value;
+		void *mem;
+
+		value = handle ? zram_get_element(zram, index) : 0;
+		mem = kmap_atomic(page);
+		zram_fill_page(mem, PAGE_SIZE, value);
+		kunmap_atomic(mem);
+		zram_slot_unlock(zram, index);
+		return 0;
+	}
+
 	size = zram_get_obj_size(zram, index);
 
 	src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);

From 5bdfca6435b8294490ffb5b7c8b7d8eac3814b06 Mon Sep 17 00:00:00 2001
From: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Date: Tue, 3 Oct 2017 16:15:23 -0700
Subject: [PATCH 22/51] m32r: define CPU_BIG_ENDIAN

The build of m32r allmodconfig is giving lots of build warnings about:

  include/linux/byteorder/big_endian.h:7:2:
	warning: #warning inconsistent configuration,
		needs CONFIG_CPU_BIG_ENDIAN [-Wcpp]
	#warning inconsistent configuration, needs CONFIG_CPU_BIG_ENDIAN

Define CPU_BIG_ENDIAN like the way CPU_LITTLE_ENDIAN is defined.

Link: http://lkml.kernel.org/r/1505678083-10320-1-git-send-email-sudipm.mukherjee@gmail.com
Signed-off-by: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/m32r/Kconfig | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index 87cde1e4b38c..0777f3a8a1f3 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -194,6 +194,10 @@ config TIMER_DIVIDE
 	int "Timer divider (integer)"
 	default "128"
 
+config CPU_BIG_ENDIAN
+        bool "Generate big endian code"
+	default n
+
 config CPU_LITTLE_ENDIAN
         bool "Generate little endian code"
 	default n

From f4e222c56c83b2aed7cc2b329fca7435508eefa1 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 3 Oct 2017 16:15:25 -0700
Subject: [PATCH 23/51] mm: have filemap_check_and_advance_wb_err clear
 AS_EIO/AS_ENOSPC

Eryu noticed that he could sometimes get a leftover error reported when
it shouldn't be on fsync with ext2 and non-journalled ext4.

The problem is that writeback_single_inode still uses filemap_fdatawait.
That picks up a previously set AS_EIO flag, which would ordinarily have
been cleared before.

Since we're mostly using this function as a replacement for
filemap_check_errors, have filemap_check_and_advance_wb_err clear AS_EIO
and AS_ENOSPC when reporting an error.  That should allow the new
function to better emulate the behavior of the old with respect to these
flags.

Link: http://lkml.kernel.org/r/20170922133331.28812-1-jlayton@kernel.org
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reported-by: Eryu Guan <eguan@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/filemap.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/mm/filemap.c b/mm/filemap.c
index db250d0e0565..594d73fef8b4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -620,6 +620,14 @@ int file_check_and_advance_wb_err(struct file *file)
 		trace_file_check_and_advance_wb_err(file, old);
 		spin_unlock(&file->f_lock);
 	}
+
+	/*
+	 * We're mostly using this function as a drop in replacement for
+	 * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
+	 * that the legacy code would have had on these flags.
+	 */
+	clear_bit(AS_EIO, &mapping->flags);
+	clear_bit(AS_ENOSPC, &mapping->flags);
 	return err;
 }
 EXPORT_SYMBOL(file_check_and_advance_wb_err);

From 24c92eb7dce0a299b8e1a8c5fa585844a53bf7f0 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Tue, 3 Oct 2017 16:15:29 -0700
Subject: [PATCH 24/51] mm: avoid marking swap cached page as lazyfree

MADV_FREE clears pte dirty bit and then marks the page lazyfree (clear
SwapBacked).  There is no lock to prevent the page is added to swap
cache between these two steps by page reclaim.  Page reclaim could add
the page to swap cache and unmap the page.  After page reclaim, the page
is added back to lru.  At that time, we probably start draining per-cpu
pagevec and mark the page lazyfree.  So the page could be in a state
with SwapBacked cleared and PG_swapcache set.  Next time there is a
refault in the virtual address, do_swap_page can find the page from swap
cache but the page has PageSwapCache false because SwapBacked isn't set,
so do_swap_page will bail out and do nothing.  The task will keep
running into fault handler.

Fixes: 802a3a92ad7a ("mm: reclaim MADV_FREE pages")
Link: http://lkml.kernel.org/r/6537ef3814398c0073630b03f176263bc81f0902.1506446061.git.shli@fb.com
Signed-off-by: Shaohua Li <shli@fb.com>
Reported-by: Artem Savkov <asavkov@redhat.com>
Tested-by: Artem Savkov <asavkov@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: <stable@vger.kernel.org>	[4.12+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/swap.c b/mm/swap.c
index 9295ae960d66..a77d68f2c1b6 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -575,7 +575,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
 			    void *arg)
 {
 	if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
-	    !PageUnevictable(page)) {
+	    !PageSwapCache(page) && !PageUnevictable(page)) {
 		bool active = PageActive(page);
 
 		del_page_from_lru_list(page, lruvec,
@@ -665,7 +665,7 @@ void deactivate_file_page(struct page *page)
 void mark_page_lazyfree(struct page *page)
 {
 	if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
-	    !PageUnevictable(page)) {
+	    !PageSwapCache(page) && !PageUnevictable(page)) {
 		struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
 
 		get_page(page);

From 9625456cc76391b7f3f2809579126542a8ed4d39 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Tue, 3 Oct 2017 16:15:32 -0700
Subject: [PATCH 25/51] mm: fix data corruption caused by lazyfree page

MADV_FREE clears pte dirty bit and then marks the page lazyfree (clear
SwapBacked).  There is no lock to prevent the page is added to swap
cache between these two steps by page reclaim.  If page reclaim finds
such page, it will simply add the page to swap cache without pageout the
page to swap because the page is marked as clean.  Next time, page fault
will read data from the swap slot which doesn't have the original data,
so we have a data corruption.  To fix issue, we mark the page dirty and
pageout the page.

However, we shouldn't dirty all pages which is clean and in swap cache.
swapin page is swap cache and clean too.  So we only dirty page which is
added into swap cache in page reclaim, which shouldn't be swapin page.
As Minchan suggested, simply dirty the page in add_to_swap can do the
job.

Fixes: 802a3a92ad7a ("mm: reclaim MADV_FREE pages")
Link: http://lkml.kernel.org/r/08c84256b007bf3f63c91d94383bd9eb6fee2daa.1506446061.git.shli@fb.com
Signed-off-by: Shaohua Li <shli@fb.com>
Reported-by: Artem Savkov <asavkov@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: <stable@vger.kernel.org>	[4.12+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swap_state.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/mm/swap_state.c b/mm/swap_state.c
index 71ce2d1ccbf7..ed91091d1e68 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -242,6 +242,17 @@ int add_to_swap(struct page *page)
 		 * clear SWAP_HAS_CACHE flag.
 		 */
 		goto fail;
+	/*
+	 * Normally the page will be dirtied in unmap because its pte should be
+	 * dirty. A special case is MADV_FREE page. The page'e pte could have
+	 * dirty bit cleared but the page's SwapBacked bit is still set because
+	 * clearing the dirty bit and SwapBacked bit has no lock protected. For
+	 * such page, unmap will not set dirty bit for it, so page reclaim will
+	 * not write the page out. This can cause data corruption when the page
+	 * is swap in later. Always setting the dirty bit for the page solves
+	 * the problem.
+	 */
+	set_page_dirty(page);
 
 	return 1;
 

From 7d790d2da386a52cfebcf0c898ba927bece9d4ab Mon Sep 17 00:00:00 2001
From: Reza Arbab <arbab@linux.vnet.ibm.com>
Date: Tue, 3 Oct 2017 16:15:35 -0700
Subject: [PATCH 26/51] mm/device-public-memory: fix edge case in
 _vm_normal_page()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With device public pages at the end of my memory space, I'm getting
output from _vm_normal_page():

  BUG: Bad page map in process migrate_pages  pte:c0800001ffff0d06 pmd:f95d3000
  addr:00007fff89330000 vm_flags:00100073 anon_vma:c0000000fa899320 mapping:          (null) index:7fff8933
  file:          (null) fault:          (null) mmap:          (null) readpage:          (null)
  CPU: 0 PID: 13963 Comm: migrate_pages Tainted: P    B      OE 4.14.0-rc1-wip #155
  Call Trace:
     dump_stack+0xb0/0xf4 (unreliable)
     print_bad_pte+0x28c/0x340
     _vm_normal_page+0xc0/0x140
     zap_pte_range+0x664/0xc10
     unmap_page_range+0x318/0x670
     unmap_vmas+0x74/0xe0
     exit_mmap+0xe8/0x1f0
     mmput+0xac/0x1f0
     do_exit+0x348/0xcd0
     do_group_exit+0x5c/0xf0
     SyS_exit_group+0x1c/0x20
     system_call+0x58/0x6c

The pfn causing this is the very last one.  Correct the bounds check
accordingly.

Fixes: df6ad69838fc ("mm/device-public-memory: device memory cache coherent with CPU")
Link: http://lkml.kernel.org/r/1506092178-20351-1-git-send-email-arbab@linux.vnet.ibm.com
Signed-off-by: Reza Arbab <arbab@linux.vnet.ibm.com>
Reviewed-by: Jérôme Glisse <jglisse@redhat.com>
Reviewed-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memory.c b/mm/memory.c
index ec4e15494901..a728bed16c20 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -845,7 +845,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 		 * vm_normal_page() so that we do not have to special case all
 		 * call site of vm_normal_page().
 		 */
-		if (likely(pfn < highest_memmap_pfn)) {
+		if (likely(pfn <= highest_memmap_pfn)) {
 			struct page *page = pfn_to_page(pfn);
 
 			if (is_device_public_page(page)) {

From 384632e67e0829deb8015ee6ad916b180049d252 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Tue, 3 Oct 2017 16:15:38 -0700
Subject: [PATCH 27/51] userfaultfd: non-cooperative: fix fork use after free

When reading the event from the uffd, we put it on a temporary
fork_event list to detect if we can still access it after releasing and
retaking the event_wqh.lock.

If fork aborts and removes the event from the fork_event all is fine as
long as we're still in the userfault read context and fork_event head is
still alive.

We've to put the event allocated in the fork kernel stack, back from
fork_event list-head to the event_wqh head, before returning from
userfaultfd_ctx_read, because the fork_event head lifetime is limited to
the userfaultfd_ctx_read stack lifetime.

Forgetting to move the event back to its event_wqh place then results in
__remove_wait_queue(&ctx->event_wqh, &ewq->wq); in
userfaultfd_event_wait_completion to remove it from a head that has been
already freed from the reader stack.

This could only happen if resolve_userfault_fork failed (for example if
there are no file descriptors available to allocate the fork uffd).  If
it succeeded it was put back correctly.

Furthermore, after find_userfault_evt receives a fork event, the forked
userfault context in fork_nctx and uwq->msg.arg.reserved.reserved1 can
be released by the fork thread as soon as the event_wqh.lock is
released.  Taking a reference on the fork_nctx before dropping the lock
prevents an use after free in resolve_userfault_fork().

If the fork side aborted and it already released everything, we still
try to succeed resolve_userfault_fork(), if possible.

Fixes: 893e26e61d04eac9 ("userfaultfd: non-cooperative: Add fork() event")
Link: http://lkml.kernel.org/r/20170920180413.26713-1-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reported-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c | 66 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 56 insertions(+), 10 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index ef4b48d1ea42..1c713fd5b3e6 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -588,6 +588,12 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 			break;
 		if (ACCESS_ONCE(ctx->released) ||
 		    fatal_signal_pending(current)) {
+			/*
+			 * &ewq->wq may be queued in fork_event, but
+			 * __remove_wait_queue ignores the head
+			 * parameter. It would be a problem if it
+			 * didn't.
+			 */
 			__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
 			if (ewq->msg.event == UFFD_EVENT_FORK) {
 				struct userfaultfd_ctx *new;
@@ -1061,6 +1067,12 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 					(unsigned long)
 					uwq->msg.arg.reserved.reserved1;
 				list_move(&uwq->wq.entry, &fork_event);
+				/*
+				 * fork_nctx can be freed as soon as
+				 * we drop the lock, unless we take a
+				 * reference on it.
+				 */
+				userfaultfd_ctx_get(fork_nctx);
 				spin_unlock(&ctx->event_wqh.lock);
 				ret = 0;
 				break;
@@ -1091,19 +1103,53 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 
 	if (!ret && msg->event == UFFD_EVENT_FORK) {
 		ret = resolve_userfault_fork(ctx, fork_nctx, msg);
+		spin_lock(&ctx->event_wqh.lock);
+		if (!list_empty(&fork_event)) {
+			/*
+			 * The fork thread didn't abort, so we can
+			 * drop the temporary refcount.
+			 */
+			userfaultfd_ctx_put(fork_nctx);
 
-		if (!ret) {
-			spin_lock(&ctx->event_wqh.lock);
-			if (!list_empty(&fork_event)) {
-				uwq = list_first_entry(&fork_event,
-						       typeof(*uwq),
-						       wq.entry);
-				list_del(&uwq->wq.entry);
-				__add_wait_queue(&ctx->event_wqh, &uwq->wq);
+			uwq = list_first_entry(&fork_event,
+					       typeof(*uwq),
+					       wq.entry);
+			/*
+			 * If fork_event list wasn't empty and in turn
+			 * the event wasn't already released by fork
+			 * (the event is allocated on fork kernel
+			 * stack), put the event back to its place in
+			 * the event_wq. fork_event head will be freed
+			 * as soon as we return so the event cannot
+			 * stay queued there no matter the current
+			 * "ret" value.
+			 */
+			list_del(&uwq->wq.entry);
+			__add_wait_queue(&ctx->event_wqh, &uwq->wq);
+
+			/*
+			 * Leave the event in the waitqueue and report
+			 * error to userland if we failed to resolve
+			 * the userfault fork.
+			 */
+			if (likely(!ret))
 				userfaultfd_event_complete(ctx, uwq);
-			}
-			spin_unlock(&ctx->event_wqh.lock);
+		} else {
+			/*
+			 * Here the fork thread aborted and the
+			 * refcount from the fork thread on fork_nctx
+			 * has already been released. We still hold
+			 * the reference we took before releasing the
+			 * lock above. If resolve_userfault_fork
+			 * failed we've to drop it because the
+			 * fork_nctx has to be freed in such case. If
+			 * it succeeded we'll hold it because the new
+			 * uffd references it.
+			 */
+			if (ret)
+				userfaultfd_ctx_put(fork_nctx);
 		}
+		spin_unlock(&ctx->event_wqh.lock);
 	}
 
 	return ret;

From c2315c187fa0d3ab363fdebe22718170b40473e3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 3 Oct 2017 16:15:42 -0700
Subject: [PATCH 28/51] exec: load_script: kill the onstack
 interp[BINPRM_BUF_SIZE] array

Patch series "exec: binfmt_misc: fix use-after-free, kill
iname[BINPRM_BUF_SIZE]".

It looks like this code was always wrong, then commit 948b701a607f
("binfmt_misc: add persistent opened binary handler for containers")
added more problems.

This patch (of 6):

load_script() can simply use i_name instead, it points into bprm->buf[]
and nobody can change this memory until we call prepare_binprm().

The only complication is that we need to also change the signature of
bprm_change_interp() but this change looks good too.

While at it, do whitespace/style cleanups.

NOTE: the real motivation for this change is that people want to
increase BINPRM_BUF_SIZE, we need to change load_misc_binary() too but
this looks more complicated because afaics it is very buggy.

Link: http://lkml.kernel.org/r/20170918163446.GA26793@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Travis Gummels <tgummels@redhat.com>
Cc: Ben Woodard <woodard@redhat.com>
Cc: Jim Foraker <foraker1@llnl.gov>
Cc: <tdhooge@llnl.gov>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_script.c      | 17 +++++++++--------
 fs/exec.c               |  2 +-
 include/linux/binfmts.h |  2 +-
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index afdf4e3cafc2..7cde3f46ad26 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -19,7 +19,6 @@ static int load_script(struct linux_binprm *bprm)
 	const char *i_arg, *i_name;
 	char *cp;
 	struct file *file;
-	char interp[BINPRM_BUF_SIZE];
 	int retval;
 
 	if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!'))
@@ -55,7 +54,7 @@ static int load_script(struct linux_binprm *bprm)
 			break;
 	}
 	for (cp = bprm->buf+2; (*cp == ' ') || (*cp == '\t'); cp++);
-	if (*cp == '\0') 
+	if (*cp == '\0')
 		return -ENOEXEC; /* No interpreter name found */
 	i_name = cp;
 	i_arg = NULL;
@@ -65,7 +64,6 @@ static int load_script(struct linux_binprm *bprm)
 		*cp++ = '\0';
 	if (*cp)
 		i_arg = cp;
-	strcpy (interp, i_name);
 	/*
 	 * OK, we've parsed out the interpreter name and
 	 * (optional) argument.
@@ -80,24 +78,27 @@ static int load_script(struct linux_binprm *bprm)
 	if (retval)
 		return retval;
 	retval = copy_strings_kernel(1, &bprm->interp, bprm);
-	if (retval < 0) return retval; 
+	if (retval < 0)
+		return retval;
 	bprm->argc++;
 	if (i_arg) {
 		retval = copy_strings_kernel(1, &i_arg, bprm);
-		if (retval < 0) return retval; 
+		if (retval < 0)
+			return retval;
 		bprm->argc++;
 	}
 	retval = copy_strings_kernel(1, &i_name, bprm);
-	if (retval) return retval; 
+	if (retval)
+		return retval;
 	bprm->argc++;
-	retval = bprm_change_interp(interp, bprm);
+	retval = bprm_change_interp(i_name, bprm);
 	if (retval < 0)
 		return retval;
 
 	/*
 	 * OK, now restart the process with the interpreter's dentry.
 	 */
-	file = open_exec(interp);
+	file = open_exec(i_name);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
diff --git a/fs/exec.c b/fs/exec.c
index ac34d9724684..5470d3c1892a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1410,7 +1410,7 @@ static void free_bprm(struct linux_binprm *bprm)
 	kfree(bprm);
 }
 
-int bprm_change_interp(char *interp, struct linux_binprm *bprm)
+int bprm_change_interp(const char *interp, struct linux_binprm *bprm)
 {
 	/* If a binfmt changed the interp, free it first. */
 	if (bprm->interp != bprm->filename)
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index fb44d6180ca0..18d05b5491f3 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -131,7 +131,7 @@ extern int setup_arg_pages(struct linux_binprm * bprm,
 			   int executable_stack);
 extern int transfer_args_to_stack(struct linux_binprm *bprm,
 				  unsigned long *sp_location);
-extern int bprm_change_interp(char *interp, struct linux_binprm *bprm);
+extern int bprm_change_interp(const char *interp, struct linux_binprm *bprm);
 extern int copy_strings_kernel(int argc, const char *const *argv,
 			       struct linux_binprm *bprm);
 extern int prepare_bprm_creds(struct linux_binprm *bprm);

From baba1b29731c79d605100087b8f02f9e1cf5a344 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 3 Oct 2017 16:15:45 -0700
Subject: [PATCH 29/51] exec: binfmt_misc: don't nullify Node->dentry in
 kill_node()

kill_node() nullifies/checks Node->dentry to avoid double free.  This
complicates the next changes and this is very confusing:

 - we do not need to check dentry != NULL under entries_lock,
   kill_node() is always called under inode_lock(d_inode(root)) and we
   rely on this inode_lock() anyway, without this lock the
   MISC_FMT_OPEN_FILE cleanup could race with itself.

 - if kill_inode() was already called and ->dentry == NULL we should not
   even try to close e->interp_file.

We can change bm_entry_write() to simply check !list_empty(list) before
kill_node.  Again, we rely on inode_lock(), in particular it saves us
from the race with bm_status_write(), another caller of kill_node().

Link: http://lkml.kernel.org/r/20170922143641.GA17210@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Ben Woodard <woodard@redhat.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Jim Foraker <foraker1@llnl.gov>
Cc: <tdhooge@llnl.gov>
Cc: Travis Gummels <tgummels@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_misc.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index ce7181ea60fa..6451e7520e05 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -603,11 +603,7 @@ static void kill_node(Node *e)
 	struct dentry *dentry;
 
 	write_lock(&entries_lock);
-	dentry = e->dentry;
-	if (dentry) {
-		list_del_init(&e->list);
-		e->dentry = NULL;
-	}
+	list_del_init(&e->list);
 	write_unlock(&entries_lock);
 
 	if ((e->flags & MISC_FMT_OPEN_FILE) && e->interp_file) {
@@ -615,12 +611,11 @@ static void kill_node(Node *e)
 		e->interp_file = NULL;
 	}
 
-	if (dentry) {
-		drop_nlink(d_inode(dentry));
-		d_drop(dentry);
-		dput(dentry);
-		simple_release_fs(&bm_mnt, &entry_count);
-	}
+	dentry = e->dentry;
+	drop_nlink(d_inode(dentry));
+	d_drop(dentry);
+	dput(dentry);
+	simple_release_fs(&bm_mnt, &entry_count);
 }
 
 /* /<entry> */
@@ -665,7 +660,8 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
 		root = file_inode(file)->i_sb->s_root;
 		inode_lock(d_inode(root));
 
-		kill_node(e);
+		if (!list_empty(&e->list))
+			kill_node(e);
 
 		inode_unlock(d_inode(root));
 		break;
@@ -794,7 +790,7 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer,
 		inode_lock(d_inode(root));
 
 		while (!list_empty(&entries))
-			kill_node(list_entry(entries.next, Node, list));
+			kill_node(list_first_entry(&entries, Node, list));
 
 		inode_unlock(d_inode(root));
 		break;

From 83f918274e4b841d6fb817861ea0c896fba0c179 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 3 Oct 2017 16:15:48 -0700
Subject: [PATCH 30/51] exec: binfmt_misc: shift filp_close(interp_file) from
 kill_node() to bm_evict_inode()

To ensure that load_misc_binary() can't use the partially destroyed
Node, see also the next patch.

The current logic looks wrong in any case, once we close interp_file it
doesn't make any sense to delay kfree(inode->i_private), this Node is no
longer valid.  Even if the MISC_FMT_OPEN_FILE/interp_file checks were
not racy (they are), load_misc_binary() should not try to reopen
->interpreter if MISC_FMT_OPEN_FILE is set but ->interp_file is NULL.

And I can't understand why do we use filp_close(), not fput().

Link: http://lkml.kernel.org/r/20170922143644.GA17216@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Ben Woodard <woodard@redhat.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Jim Foraker <foraker1@llnl.gov>
Cc: <tdhooge@llnl.gov>
Cc: Travis Gummels <tgummels@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_misc.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 6451e7520e05..203598ccb40a 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -594,8 +594,13 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
 
 static void bm_evict_inode(struct inode *inode)
 {
+	Node *e = inode->i_private;
+
+	if ((e->flags & MISC_FMT_OPEN_FILE) && e->interp_file)
+		filp_close(e->interp_file, NULL);
+
 	clear_inode(inode);
-	kfree(inode->i_private);
+	kfree(e);
 }
 
 static void kill_node(Node *e)
@@ -606,11 +611,6 @@ static void kill_node(Node *e)
 	list_del_init(&e->list);
 	write_unlock(&entries_lock);
 
-	if ((e->flags & MISC_FMT_OPEN_FILE) && e->interp_file) {
-		filp_close(e->interp_file, NULL);
-		e->interp_file = NULL;
-	}
-
 	dentry = e->dentry;
 	drop_nlink(d_inode(dentry));
 	d_drop(dentry);

From eb23aa0317eb1f08e8d9d36b8753d42f03b32764 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 3 Oct 2017 16:15:51 -0700
Subject: [PATCH 31/51] exec: binfmt_misc: remove the confusing e->interp_file
 != NULL checks

If MISC_FMT_OPEN_FILE flag is set e->interp_file must be valid or we
have a bug which should not be silently ignored.

Link: http://lkml.kernel.org/r/20170922143647.GA17222@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Ben Woodard <woodard@redhat.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Jim Foraker <foraker1@llnl.gov>
Cc: <tdhooge@llnl.gov>
Cc: Travis Gummels <tgummels@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_misc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 203598ccb40a..babfe7bd56f0 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -205,7 +205,7 @@ static int load_misc_binary(struct linux_binprm *bprm)
 	if (retval < 0)
 		goto error;
 
-	if (fmt->flags & MISC_FMT_OPEN_FILE && fmt->interp_file) {
+	if (fmt->flags & MISC_FMT_OPEN_FILE) {
 		interp_file = filp_clone_open(fmt->interp_file);
 		if (!IS_ERR(interp_file))
 			deny_write_access(interp_file);
@@ -596,7 +596,7 @@ static void bm_evict_inode(struct inode *inode)
 {
 	Node *e = inode->i_private;
 
-	if ((e->flags & MISC_FMT_OPEN_FILE) && e->interp_file)
+	if (e->flags & MISC_FMT_OPEN_FILE)
 		filp_close(e->interp_file, NULL);
 
 	clear_inode(inode);

From 43a4f2619038002f48c78698c42c05692d4b4eb2 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 3 Oct 2017 16:15:55 -0700
Subject: [PATCH 32/51] exec: binfmt_misc: fix race between load_misc_binary()
 and kill_node()

load_misc_binary() makes a local copy of fmt->interpreter under
entries_lock to avoid the race with kill_node() but this is not enough;
the whole Node can be freed after we drop entries_lock, not only the
->interpreter string.

Add dget/dput(fmt->dentry) to ensure bm_evict_inode() can't destroy/free
this Node.

Link: http://lkml.kernel.org/r/20170922143650.GA17227@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Ben Woodard <woodard@redhat.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Jim Foraker <foraker1@llnl.gov>
Cc: Travis Gummels <tgummels@redhat.com>
Cc: <tdhooge@llnl.gov>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_misc.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index babfe7bd56f0..f5f8c2541790 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -138,20 +138,23 @@ static int load_misc_binary(struct linux_binprm *bprm)
 
 	retval = -ENOEXEC;
 	if (!enabled)
-		goto ret;
+		return retval;
 
 	/* to keep locking time low, we copy the interpreter string */
 	read_lock(&entries_lock);
 	fmt = check_file(bprm);
-	if (fmt)
+	if (fmt) {
+		dget(fmt->dentry);
 		strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE);
+	}
 	read_unlock(&entries_lock);
 	if (!fmt)
-		goto ret;
+		return retval;
 
 	/* Need to be able to load the file after exec */
+	retval = -ENOENT;
 	if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
-		return -ENOENT;
+		goto ret;
 
 	if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) {
 		retval = remove_arg_zero(bprm);
@@ -238,6 +241,7 @@ static int load_misc_binary(struct linux_binprm *bprm)
 		goto error;
 
 ret:
+	dput(fmt->dentry);
 	return retval;
 error:
 	if (fd_binary > 0)

From 50097f74934e3ec8fb1e6f3087568b958972817d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 3 Oct 2017 16:15:58 -0700
Subject: [PATCH 33/51] exec: binfmt_misc: kill the onstack
 iname[BINPRM_BUF_SIZE] array

After the previous change "fmt" can't go away, we can kill
iname/iname_addr and use fmt->interpreter.

Link: http://lkml.kernel.org/r/20170922143653.GA17232@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Ben Woodard <woodard@redhat.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Jim Foraker <foraker1@llnl.gov>
Cc: <tdhooge@llnl.gov>
Cc: Travis Gummels <tgummels@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_misc.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index f5f8c2541790..2a46762def31 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -54,7 +54,7 @@ typedef struct {
 	int size;			/* size of magic/mask */
 	char *magic;			/* magic or filename extension */
 	char *mask;			/* mask, NULL for exact match */
-	char *interpreter;		/* filename of interpreter */
+	const char *interpreter;	/* filename of interpreter */
 	char *name;
 	struct dentry *dentry;
 	struct file *interp_file;
@@ -131,8 +131,6 @@ static int load_misc_binary(struct linux_binprm *bprm)
 {
 	Node *fmt;
 	struct file *interp_file = NULL;
-	char iname[BINPRM_BUF_SIZE];
-	const char *iname_addr = iname;
 	int retval;
 	int fd_binary = -1;
 
@@ -143,10 +141,8 @@ static int load_misc_binary(struct linux_binprm *bprm)
 	/* to keep locking time low, we copy the interpreter string */
 	read_lock(&entries_lock);
 	fmt = check_file(bprm);
-	if (fmt) {
+	if (fmt)
 		dget(fmt->dentry);
-		strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE);
-	}
 	read_unlock(&entries_lock);
 	if (!fmt)
 		return retval;
@@ -198,13 +194,13 @@ static int load_misc_binary(struct linux_binprm *bprm)
 	bprm->argc++;
 
 	/* add the interp as argv[0] */
-	retval = copy_strings_kernel(1, &iname_addr, bprm);
+	retval = copy_strings_kernel(1, &fmt->interpreter, bprm);
 	if (retval < 0)
 		goto error;
 	bprm->argc++;
 
 	/* Update interp in case binfmt_script needs it. */
-	retval = bprm_change_interp(iname, bprm);
+	retval = bprm_change_interp(fmt->interpreter, bprm);
 	if (retval < 0)
 		goto error;
 
@@ -213,7 +209,7 @@ static int load_misc_binary(struct linux_binprm *bprm)
 		if (!IS_ERR(interp_file))
 			deny_write_access(interp_file);
 	} else {
-		interp_file = open_exec(iname);
+		interp_file = open_exec(fmt->interpreter);
 	}
 	retval = PTR_ERR(interp_file);
 	if (IS_ERR(interp_file))

From 8cb5d7482810b7eb1bb05bf4f71bc93ce35e5896 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 3 Oct 2017 16:16:01 -0700
Subject: [PATCH 34/51] lib/lz4: make arrays static const, reduces object code
 size

Don't populate the read-only arrays dec32table and dec64table on the
stack, instead make them both static const.  Makes the object code
smaller by over 10K bytes:

  Before:
     text	   data	    bss	    dec	    hex	filename
    31500	      0	      0	  31500	   7b0c	lib/lz4/lz4_decompress.o

  After:
     text	   data	    bss	    dec	    hex	filename
    20237	    176	      0	  20413	   4fbd	lib/lz4/lz4_decompress.o

(gcc version 7.2.0 x86_64)

Link: http://lkml.kernel.org/r/20170921221939.20820-1-colin.king@canonical.com
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Cc: Sven Schmidt <4sschmid@informatik.uni-hamburg.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/lz4/lz4_decompress.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
index bd3574312b82..141734d255e4 100644
--- a/lib/lz4/lz4_decompress.c
+++ b/lib/lz4/lz4_decompress.c
@@ -85,8 +85,8 @@ static FORCE_INLINE int LZ4_decompress_generic(
 	const BYTE * const lowLimit = lowPrefix - dictSize;
 
 	const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
-	const unsigned int dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };
-	const int dec64table[] = { 0, 0, 0, -1, 0, 1, 2, 3 };
+	static const unsigned int dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };
+	static const int dec64table[] = { 0, 0, 0, -1, 0, 1, 2, 3 };
 
 	const int safeDecode = (endOnInput == endOnInputSize);
 	const int checkOffset = ((safeDecode) && (dictSize < (int)(64 * KB)));

From 7240767450d6d8380fb153e2998a1bb4ede7b029 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 3 Oct 2017 16:16:04 -0700
Subject: [PATCH 35/51] include/linux/bitfield.h: remove 32bit from FIELD_GET
 comment block

I do not see anything that restricts this macro to 32 bit width.

Link: http://lkml.kernel.org/r/1505921975-23379-1-git-send-email-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bitfield.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h
index 8b9d6fff002d..f2deb71958b2 100644
--- a/include/linux/bitfield.h
+++ b/include/linux/bitfield.h
@@ -92,7 +92,7 @@
 /**
  * FIELD_GET() - extract a bitfield element
  * @_mask: shifted mask defining the field's length and position
- * @_reg:  32bit value of entire bitfield
+ * @_reg:  value of entire bitfield
  *
  * FIELD_GET() extracts the field specified by @_mask from the
  * bitfield passed in as @_reg by masking and shifting it down.

From 3181c38e4df257852a0c0a53552fd5c869402886 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@kernel.org>
Date: Tue, 3 Oct 2017 16:16:07 -0700
Subject: [PATCH 36/51] kernel/sysctl.c: remove duplicate UINT_MAX check on
 do_proc_douintvec_conv()

do_proc_douintvec_conv() has two UINT_MAX checks, we can remove one.
This has no functional changes other than fixing a compiler warning:

  kernel/sysctl.c:2190]: (warning) Identical condition '*lvalp>UINT_MAX', second condition is always false

Fixes: 4f2fec00afa60 ("sysctl: simplify unsigned int support")
Link: http://lkml.kernel.org/r/20170919072918.12066-1-mcgrof@kernel.org
Signed-off-by: Luis R. Rodriguez <mcgrof@kernel.org>
Reported-by: David Binderman <dcb314@hotmail.com>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 423554ad3610..4da9e622471f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2186,8 +2186,6 @@ static int do_proc_douintvec_conv(unsigned long *lvalp,
 				  int write, void *data)
 {
 	if (write) {
-		if (*lvalp > UINT_MAX)
-			return -EINVAL;
 		if (*lvalp > UINT_MAX)
 			return -EINVAL;
 		*valp = *lvalp;

From f80c7dab95a1f0f968acbafe4426ee9525b6f6ab Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 3 Oct 2017 16:16:10 -0700
Subject: [PATCH 37/51] mm: memcontrol: use vmalloc fallback for large kmem
 memcg arrays

For quick per-memcg indexing, slab caches and list_lru structures
maintain linear arrays of descriptors.  As the number of concurrent
memory cgroups in the system goes up, this requires large contiguous
allocations (8k cgroups = order-5, 16k cgroups = order-6 etc.) for every
existing slab cache and list_lru, which can easily fail on loaded
systems.  E.g.:

  mkdir: page allocation failure: order:5, mode:0x14040c0(GFP_KERNEL|__GFP_COMP), nodemask=(null)
  CPU: 1 PID: 6399 Comm: mkdir Not tainted 4.13.0-mm1-00065-g720bbe532b7c-dirty #481
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-20170228_101828-anatol 04/01/2014
  Call Trace:
   ? __alloc_pages_direct_compact+0x4c/0x110
   __alloc_pages_nodemask+0xf50/0x1430
   alloc_pages_current+0x60/0xc0
   kmalloc_order_trace+0x29/0x1b0
   __kmalloc+0x1f4/0x320
   memcg_update_all_list_lrus+0xca/0x2e0
   mem_cgroup_css_alloc+0x612/0x670
   cgroup_apply_control_enable+0x19e/0x360
   cgroup_mkdir+0x322/0x490
   kernfs_iop_mkdir+0x55/0x80
   vfs_mkdir+0xd0/0x120
   SyS_mkdirat+0x6c/0xe0
   SyS_mkdir+0x14/0x20
   entry_SYSCALL_64_fastpath+0x18/0xad
  Mem-Info:
  active_anon:2965 inactive_anon:19 isolated_anon:0
   active_file:100270 inactive_file:98846 isolated_file:0
   unevictable:0 dirty:0 writeback:0 unstable:0
   slab_reclaimable:7328 slab_unreclaimable:16402
   mapped:771 shmem:52 pagetables:278 bounce:0
   free:13718 free_pcp:0 free_cma:0

This output is from an artificial reproducer, but we have repeatedly
observed order-7 failures in production in the Facebook fleet.  These
systems become useless as they cannot run more jobs, even though there
is plenty of memory to allocate 128 individual pages.

Use kvmalloc and kvzalloc to fall back to vmalloc space if these arrays
prove too large for allocating them physically contiguous.

Link: http://lkml.kernel.org/r/20170918184919.20644-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/list_lru.c    | 12 ++++++------
 mm/slab_common.c | 22 +++++++++++++++-------
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/mm/list_lru.c b/mm/list_lru.c
index 7a40fa2be858..f141f0c80ff3 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -325,12 +325,12 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru)
 {
 	int size = memcg_nr_cache_ids;
 
-	nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL);
+	nlru->memcg_lrus = kvmalloc(size * sizeof(void *), GFP_KERNEL);
 	if (!nlru->memcg_lrus)
 		return -ENOMEM;
 
 	if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) {
-		kfree(nlru->memcg_lrus);
+		kvfree(nlru->memcg_lrus);
 		return -ENOMEM;
 	}
 
@@ -340,7 +340,7 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru)
 static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
 {
 	__memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids);
-	kfree(nlru->memcg_lrus);
+	kvfree(nlru->memcg_lrus);
 }
 
 static int memcg_update_list_lru_node(struct list_lru_node *nlru,
@@ -351,12 +351,12 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
 	BUG_ON(old_size > new_size);
 
 	old = nlru->memcg_lrus;
-	new = kmalloc(new_size * sizeof(void *), GFP_KERNEL);
+	new = kvmalloc(new_size * sizeof(void *), GFP_KERNEL);
 	if (!new)
 		return -ENOMEM;
 
 	if (__memcg_init_list_lru_node(new, old_size, new_size)) {
-		kfree(new);
+		kvfree(new);
 		return -ENOMEM;
 	}
 
@@ -373,7 +373,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
 	nlru->memcg_lrus = new;
 	spin_unlock_irq(&nlru->lock);
 
-	kfree(old);
+	kvfree(old);
 	return 0;
 }
 
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 904a83be82de..80164599ca5d 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -165,9 +165,9 @@ static int init_memcg_params(struct kmem_cache *s,
 	if (!memcg_nr_cache_ids)
 		return 0;
 
-	arr = kzalloc(sizeof(struct memcg_cache_array) +
-		      memcg_nr_cache_ids * sizeof(void *),
-		      GFP_KERNEL);
+	arr = kvzalloc(sizeof(struct memcg_cache_array) +
+		       memcg_nr_cache_ids * sizeof(void *),
+		       GFP_KERNEL);
 	if (!arr)
 		return -ENOMEM;
 
@@ -178,15 +178,23 @@ static int init_memcg_params(struct kmem_cache *s,
 static void destroy_memcg_params(struct kmem_cache *s)
 {
 	if (is_root_cache(s))
-		kfree(rcu_access_pointer(s->memcg_params.memcg_caches));
+		kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
+}
+
+static void free_memcg_params(struct rcu_head *rcu)
+{
+	struct memcg_cache_array *old;
+
+	old = container_of(rcu, struct memcg_cache_array, rcu);
+	kvfree(old);
 }
 
 static int update_memcg_params(struct kmem_cache *s, int new_array_size)
 {
 	struct memcg_cache_array *old, *new;
 
-	new = kzalloc(sizeof(struct memcg_cache_array) +
-		      new_array_size * sizeof(void *), GFP_KERNEL);
+	new = kvzalloc(sizeof(struct memcg_cache_array) +
+		       new_array_size * sizeof(void *), GFP_KERNEL);
 	if (!new)
 		return -ENOMEM;
 
@@ -198,7 +206,7 @@ static int update_memcg_params(struct kmem_cache *s, int new_array_size)
 
 	rcu_assign_pointer(s->memcg_params.memcg_caches, new);
 	if (old)
-		kfree_rcu(old, rcu);
+		call_rcu(&old->rcu, free_memcg_params);
 	return 0;
 }
 

From a70e43a59de9316e6fbad3b65557d0a24c099aca Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 3 Oct 2017 16:16:13 -0700
Subject: [PATCH 38/51] lib/idr.c: fix comment for idr_replace()

idr_replace() returns the old value on success, not 0.

Link: http://lkml.kernel.org/r/20170918162642.37511-1-ebiggers3@gmail.com
Signed-off-by: Eric Biggers <ebiggers@google.com>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/idr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/idr.c b/lib/idr.c
index f9adf4805fd7..edd9b2be1651 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -146,8 +146,8 @@ EXPORT_SYMBOL(idr_get_next_ext);
  * idr_alloc() and idr_remove() (as long as the ID being removed is not
  * the one being replaced!).
  *
- * Returns: 0 on success.  %-ENOENT indicates that @id was not found.
- * %-EINVAL indicates that @id or @ptr were not valid.
+ * Returns: the old value on success.  %-ENOENT indicates that @id was not
+ * found.  %-EINVAL indicates that @id or @ptr were not valid.
  */
 void *idr_replace(struct idr *idr, void *ptr, int id)
 {

From f64ac5e6e30668216cf489d73ba8a96e372d78c6 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Tue, 3 Oct 2017 16:16:16 -0700
Subject: [PATCH 39/51] mm, memory_hotplug: add scheduling point to __add_pages

Patch series "mm, memory_hotplug: fix few soft lockups in memory
hotadd".

Johannes has noticed few soft lockups when adding a large nvdimm device.
All of them were caused by a long loop without any explicit cond_resched
which is a problem for !PREEMPT kernels.

The fix is quite straightforward.  Just make sure that cond_resched gets
called from time to time.

This patch (of 3):

__add_pages gets a pfn range to add and there is no upper bound for a
single call.  This is usually a memory block aligned size for the
regular memory hotplug - smaller sizes are usual for memory balloning
drivers, or the whole NUMA node for physical memory online.  There is no
explicit scheduling point in that code path though.

This can lead to long latencies while __add_pages is executed and we
have even seen a soft lockup report during nvdimm initialization with
!PREEMPT kernel

  NMI watchdog: BUG: soft lockup - CPU#11 stuck for 23s! [kworker/u641:3:832]
  [...]
  Workqueue: events_unbound async_run_entry_fn
  task: ffff881809270f40 ti: ffff881809274000 task.ti: ffff881809274000
  RIP: _raw_spin_unlock_irqrestore+0x11/0x20
  RSP: 0018:ffff881809277b10  EFLAGS: 00000286
  [...]
  Call Trace:
    sparse_add_one_section+0x13d/0x18e
    __add_pages+0x10a/0x1d0
    arch_add_memory+0x4a/0xc0
    devm_memremap_pages+0x29d/0x430
    pmem_attach_disk+0x2fd/0x3f0 [nd_pmem]
    nvdimm_bus_probe+0x64/0x110 [libnvdimm]
    driver_probe_device+0x1f7/0x420
    bus_for_each_drv+0x52/0x80
    __device_attach+0xb0/0x130
    bus_probe_device+0x87/0xa0
    device_add+0x3fc/0x5f0
    nd_async_device_register+0xe/0x40 [libnvdimm]
    async_run_entry_fn+0x43/0x150
    process_one_work+0x14e/0x410
    worker_thread+0x116/0x490
    kthread+0xc7/0xe0
    ret_from_fork+0x3f/0x70
  DWARF2 unwinder stuck at ret_from_fork+0x3f/0x70

Fix this by adding cond_resched once per each memory section in the
given pfn range.  Each section is constant amount of work which itself
is not too expensive but many of them will just add up.

Link: http://lkml.kernel.org/r/20170918121410.24466-2-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Johannes Thumshirn <jthumshirn@suse.de>
Tested-by: Johannes Thumshirn <jthumshirn@suse.de>
Cc: Dan Williams <dan.j.williams@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory_hotplug.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e882cb6da994..23d5bd968950 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -328,6 +328,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
 		if (err && (err != -EEXIST))
 			break;
 		err = 0;
+		cond_resched();
 	}
 	vmemmap_populate_print_last();
 out:

From 9b6e63cbf85b89b2dbffa4955dbf2df8250e5375 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Tue, 3 Oct 2017 16:16:19 -0700
Subject: [PATCH 40/51] mm, page_alloc: add scheduling point to
 memmap_init_zone

memmap_init_zone gets a pfn range to initialize and it can be really
large resulting in a soft lockup on non-preemptible kernels

  NMI watchdog: BUG: soft lockup - CPU#31 stuck for 23s! [kworker/u642:5:1720]
  [...]
  task: ffff88ecd7e902c0 ti: ffff88eca4e50000 task.ti: ffff88eca4e50000
  RIP: move_pfn_range_to_zone+0x185/0x1d0
  [...]
  Call Trace:
    devm_memremap_pages+0x2c7/0x430
    pmem_attach_disk+0x2fd/0x3f0 [nd_pmem]
    nvdimm_bus_probe+0x64/0x110 [libnvdimm]
    driver_probe_device+0x1f7/0x420
    bus_for_each_drv+0x52/0x80
    __device_attach+0xb0/0x130
    bus_probe_device+0x87/0xa0
    device_add+0x3fc/0x5f0
    nd_async_device_register+0xe/0x40 [libnvdimm]
    async_run_entry_fn+0x43/0x150
    process_one_work+0x14e/0x410
    worker_thread+0x116/0x490
    kthread+0xc7/0xe0
    ret_from_fork+0x3f/0x70

Fix this by adding a scheduling point once per page block.

Link: http://lkml.kernel.org/r/20170918121410.24466-3-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Johannes Thumshirn <jthumshirn@suse.de>
Tested-by: Johannes Thumshirn <jthumshirn@suse.de>
Cc: Dan Williams <dan.j.williams@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 38d165a87860..77e4d3c5c57b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5367,6 +5367,7 @@ not_early:
 
 			__init_single_page(page, pfn, zone, nid);
 			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+			cond_resched();
 		} else {
 			__init_single_pfn(pfn, zone, nid);
 		}

From 1fdcce6e16c54facc4f0688630d3b9ecfcaa411f Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Tue, 3 Oct 2017 16:16:23 -0700
Subject: [PATCH 41/51] memremap: add scheduling point to devm_memremap_pages

devm_memremap_pages is initializing struct pages in for_each_device_pfn
and that can take quite some time.  We have even seen a soft lockup
triggering on a non preemptive kernel

  NMI watchdog: BUG: soft lockup - CPU#61 stuck for 22s! [kworker/u641:11:1808]
  [...]
  RIP: 0010:[<ffffffff8118b6b7>]  [<ffffffff8118b6b7>] devm_memremap_pages+0x327/0x430
  [...]
  Call Trace:
    pmem_attach_disk+0x2fd/0x3f0 [nd_pmem]
    nvdimm_bus_probe+0x64/0x110 [libnvdimm]
    driver_probe_device+0x1f7/0x420
    bus_for_each_drv+0x52/0x80
    __device_attach+0xb0/0x130
    bus_probe_device+0x87/0xa0
    device_add+0x3fc/0x5f0
    nd_async_device_register+0xe/0x40 [libnvdimm]
    async_run_entry_fn+0x43/0x150
    process_one_work+0x14e/0x410
    worker_thread+0x116/0x490
    kthread+0xc7/0xe0
    ret_from_fork+0x3f/0x70

fix this by adding cond_resched every 1024 pages.

Link: http://lkml.kernel.org/r/20170918121410.24466-4-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Johannes Thumshirn <jthumshirn@suse.de>
Tested-by: Johannes Thumshirn <jthumshirn@suse.de>
Cc: Dan Williams <dan.j.williams@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/memremap.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/memremap.c b/kernel/memremap.c
index 6bcbfbf1a8fd..403ab9cdb949 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -350,7 +350,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 	pgprot_t pgprot = PAGE_KERNEL;
 	struct dev_pagemap *pgmap;
 	struct page_map *page_map;
-	int error, nid, is_ram;
+	int error, nid, is_ram, i = 0;
 
 	align_start = res->start & ~(SECTION_SIZE - 1);
 	align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -448,6 +448,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 		list_del(&page->lru);
 		page->pgmap = pgmap;
 		percpu_ref_get(ref);
+		if (!(++i % 1024))
+			cond_resched();
 	}
 	devres_add(dev, page_map);
 	return __va(res->start);

From c9653850c90d6050197bc76ba672e00a7771aea5 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Tue, 3 Oct 2017 16:16:26 -0700
Subject: [PATCH 42/51] kernel/kcmp.c: drop branch leftover typo

The else branch been left over and escaped the source code refresh.  Not
a problem but better clean it up.

Fixes: 0791e3644e5e ("kcmp: add KCMP_EPOLL_TFD mode to compare epoll target files")
Link: http://lkml.kernel.org/r/20170917165838.GA1887@uranus.lan
Reported-by: Eugene Syromiatnikov <esyr@redhat.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kcmp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index ea34ed8bb952..055bb2962a0b 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -131,7 +131,7 @@ static int kcmp_epoll_target(struct task_struct *task1,
 	if (filp_epoll) {
 		filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
 		fput(filp_epoll);
-	} else
+	}
 
 	if (IS_ERR(filp_tgt))
 		return PTR_ERR(filp_tgt);

From 1dd2bfc86818ddbc95f98e312e7704350223fd7d Mon Sep 17 00:00:00 2001
From: YASUAKI ISHIMATSU <yasu.isimatu@gmail.com>
Date: Tue, 3 Oct 2017 16:16:29 -0700
Subject: [PATCH 43/51] mm/memory_hotplug: change
 pfn_to_section_nr/section_nr_to_pfn macro to inline function

pfn_to_section_nr() and section_nr_to_pfn() are defined as macro.
pfn_to_section_nr() has no issue even if it is defined as macro.  But
section_nr_to_pfn() has overflow issue if sec is defined as int.

section_nr_to_pfn() just shifts sec by PFN_SECTION_SHIFT.  If sec is
defined as unsigned long, section_nr_to_pfn() returns pfn as 64 bit value.
But if sec is defined as int, section_nr_to_pfn() returns pfn as 32 bit
value.

__remove_section() calculates start_pfn using section_nr_to_pfn() and
scn_nr defined as int.  So if hot-removed memory address is over 16TB,
overflow issue occurs and section_nr_to_pfn() does not calculate correct
pfn.

To make callers use proper arg, the patch changes the macros to inline
functions.

Fixes: 815121d2b5cd ("memory_hotplug: clear zone when removing the memory")
Link: http://lkml.kernel.org/r/e643a387-e573-6bbf-d418-c60c8ee3d15e@gmail.com
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Reza Arbab <arbab@linux.vnet.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 10 ++++++++--
 mm/memory_hotplug.c    |  2 +-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 356a814e7c8e..c8f89417740b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1094,8 +1094,14 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn)
 #error Allocator MAX_ORDER exceeds SECTION_SIZE
 #endif
 
-#define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT)
-#define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT)
+static inline unsigned long pfn_to_section_nr(unsigned long pfn)
+{
+	return pfn >> PFN_SECTION_SHIFT;
+}
+static inline unsigned long section_nr_to_pfn(unsigned long sec)
+{
+	return sec << PFN_SECTION_SHIFT;
+}
 
 #define SECTION_ALIGN_UP(pfn)	(((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
 #define SECTION_ALIGN_DOWN(pfn)	((pfn) & PAGE_SECTION_MASK)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 23d5bd968950..efd1ad37bb57 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -551,7 +551,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
 		return ret;
 
 	scn_nr = __section_nr(ms);
-	start_pfn = section_nr_to_pfn(scn_nr);
+	start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
 	__remove_zone(zone, start_pfn);
 
 	sparse_remove_one_section(zone, ms, map_offset);

From d09b0137d204bebeaafed672bc5a244e9ac92edb Mon Sep 17 00:00:00 2001
From: YASUAKI ISHIMATSU <yasu.isimatu@gmail.com>
Date: Tue, 3 Oct 2017 16:16:32 -0700
Subject: [PATCH 44/51] mm/memory_hotplug: define
 find_{smallest|biggest}_section_pfn as unsigned long

find_{smallest|biggest}_section_pfn()s find the smallest/biggest section
and return the pfn of the section.  But the functions are defined as int.
So the functions always return 0x00000000 - 0xffffffff.  It means if
memory address is over 16TB, the functions does not work correctly.

To handle 64 bit value, the patch defines
find_{smallest|biggest}_section_pfn() as unsigned long.

Fixes: 815121d2b5cd ("memory_hotplug: clear zone when removing the memory")
Link: http://lkml.kernel.org/r/d9d5593a-d0a4-c4be-ab08-493df59a85c6@gmail.com
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Reza Arbab <arbab@linux.vnet.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory_hotplug.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index efd1ad37bb57..d4b5f29906b9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -338,7 +338,7 @@ EXPORT_SYMBOL_GPL(__add_pages);
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
 /* find the smallest valid pfn in the range [start_pfn, end_pfn) */
-static int find_smallest_section_pfn(int nid, struct zone *zone,
+static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
 				     unsigned long start_pfn,
 				     unsigned long end_pfn)
 {
@@ -363,7 +363,7 @@ static int find_smallest_section_pfn(int nid, struct zone *zone,
 }
 
 /* find the biggest valid pfn in the range [start_pfn, end_pfn). */
-static int find_biggest_section_pfn(int nid, struct zone *zone,
+static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
 				    unsigned long start_pfn,
 				    unsigned long end_pfn)
 {

From 90ceb2a3ad868f800eb1c9f4ede650daddd94b77 Mon Sep 17 00:00:00 2001
From: Jean Delvare <jdelvare@suse.de>
Date: Tue, 3 Oct 2017 16:16:35 -0700
Subject: [PATCH 45/51] kernel/params.c: fix the maximum length in
 param_get_string

The length parameter of strlcpy() is supposed to reflect the size of the
target buffer, not of the source string.  Harmless in this case as the
buffer is PAGE_SIZE long and the source string is always much shorter than
this, but conceptually wrong, so let's fix it.

Link: http://lkml.kernel.org/r/20170928162515.24846b4f@endymion
Signed-off-by: Jean Delvare <jdelvare@suse.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/params.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/params.c b/kernel/params.c
index 1cd8f1a895a8..8283ba045f4f 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -507,7 +507,7 @@ EXPORT_SYMBOL(param_set_copystring);
 int param_get_string(char *buffer, const struct kernel_param *kp)
 {
 	const struct kparam_string *kps = kp->str;
-	return strlcpy(buffer, kps->string, kps->maxlen);
+	return strlcpy(buffer, kps->string, PAGE_SIZE);
 }
 EXPORT_SYMBOL(param_get_string);
 

From 96802e6b1dbf29d3012b39503c5dd6d9d8e82955 Mon Sep 17 00:00:00 2001
From: Jean Delvare <jdelvare@suse.de>
Date: Tue, 3 Oct 2017 16:16:38 -0700
Subject: [PATCH 46/51] kernel/params.c: fix an overflow in param_attr_show

Function param_attr_show could overflow the buffer it is operating on.

The buffer size is PAGE_SIZE, and the string returned by
attribute->param->ops->get is generated by scnprintf(buffer, PAGE_SIZE,
...) so it could be PAGE_SIZE - 1 long, with the terminating '\0' at the
very end of the buffer.  Calling strcat(..., "\n") on this isn't safe, as
the '\0' will be replaced by '\n' (OK) and then another '\0' will be added
past the end of the buffer (not OK.)

Simply add the trailing '\n' when writing the attribute contents to the
buffer originally.  This is safe, and also faster.

Credits to Teradata for discovering this issue.

Link: http://lkml.kernel.org/r/20170928162602.60c379c7@endymion
Signed-off-by: Jean Delvare <jdelvare@suse.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/params.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/kernel/params.c b/kernel/params.c
index 8283ba045f4f..0cca488263dd 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -224,7 +224,7 @@ char *parse_args(const char *doing,
 	}								\
 	int param_get_##name(char *buffer, const struct kernel_param *kp) \
 	{								\
-		return scnprintf(buffer, PAGE_SIZE, format,		\
+		return scnprintf(buffer, PAGE_SIZE, format "\n",	\
 				*((type *)kp->arg));			\
 	}								\
 	const struct kernel_param_ops param_ops_##name = {			\
@@ -270,7 +270,7 @@ EXPORT_SYMBOL(param_set_charp);
 
 int param_get_charp(char *buffer, const struct kernel_param *kp)
 {
-	return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg));
+	return scnprintf(buffer, PAGE_SIZE, "%s\n", *((char **)kp->arg));
 }
 EXPORT_SYMBOL(param_get_charp);
 
@@ -301,7 +301,7 @@ EXPORT_SYMBOL(param_set_bool);
 int param_get_bool(char *buffer, const struct kernel_param *kp)
 {
 	/* Y and N chosen as being relatively non-coder friendly */
-	return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N');
+	return sprintf(buffer, "%c\n", *(bool *)kp->arg ? 'Y' : 'N');
 }
 EXPORT_SYMBOL(param_get_bool);
 
@@ -360,7 +360,7 @@ EXPORT_SYMBOL(param_set_invbool);
 
 int param_get_invbool(char *buffer, const struct kernel_param *kp)
 {
-	return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
+	return sprintf(buffer, "%c\n", (*(bool *)kp->arg) ? 'N' : 'Y');
 }
 EXPORT_SYMBOL(param_get_invbool);
 
@@ -460,8 +460,9 @@ static int param_array_get(char *buffer, const struct kernel_param *kp)
 	struct kernel_param p = *kp;
 
 	for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
+		/* Replace \n with comma */
 		if (i)
-			buffer[off++] = ',';
+			buffer[off - 1] = ',';
 		p.arg = arr->elem + arr->elemsize * i;
 		check_kparam_locked(p.mod);
 		ret = arr->ops->get(buffer + off, &p);
@@ -507,7 +508,7 @@ EXPORT_SYMBOL(param_set_copystring);
 int param_get_string(char *buffer, const struct kernel_param *kp)
 {
 	const struct kparam_string *kps = kp->str;
-	return strlcpy(buffer, kps->string, PAGE_SIZE);
+	return scnprintf(buffer, PAGE_SIZE, "%s\n", kps->string);
 }
 EXPORT_SYMBOL(param_get_string);
 
@@ -549,10 +550,6 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
 	kernel_param_lock(mk->mod);
 	count = attribute->param->ops->get(buf, attribute->param);
 	kernel_param_unlock(mk->mod);
-	if (count > 0) {
-		strcat(buf, "\n");
-		++count;
-	}
 	return count;
 }
 

From e0596c80f442d1e1221c17dbb71b2aed43909221 Mon Sep 17 00:00:00 2001
From: Jean Delvare <jdelvare@suse.de>
Date: Tue, 3 Oct 2017 16:16:41 -0700
Subject: [PATCH 47/51] kernel/params.c: improve STANDARD_PARAM_DEF readability

Align the parameters passed to STANDARD_PARAM_DEF for clarity.

Link: http://lkml.kernel.org/r/20170928162728.756143cc@endymion
Signed-off-by: Jean Delvare <jdelvare@suse.de>
Suggested-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/params.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/kernel/params.c b/kernel/params.c
index 0cca488263dd..cc9108c2a1fd 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -236,14 +236,14 @@ char *parse_args(const char *doing,
 	EXPORT_SYMBOL(param_ops_##name)
 
 
-STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
-STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
-STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
-STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
-STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
-STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
-STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
-STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
+STANDARD_PARAM_DEF(byte,	unsigned char,		"%hhu", kstrtou8);
+STANDARD_PARAM_DEF(short,	short,			"%hi",  kstrtos16);
+STANDARD_PARAM_DEF(ushort,	unsigned short,		"%hu",  kstrtou16);
+STANDARD_PARAM_DEF(int,		int,			"%i",   kstrtoint);
+STANDARD_PARAM_DEF(uint,	unsigned int,		"%u",   kstrtouint);
+STANDARD_PARAM_DEF(long,	long,			"%li",  kstrtol);
+STANDARD_PARAM_DEF(ulong,	unsigned long,		"%lu",  kstrtoul);
+STANDARD_PARAM_DEF(ullong,	unsigned long long,	"%llu", kstrtoull);
 
 int param_set_charp(const char *val, const struct kernel_param *kp)
 {

From 656d61ce9666209c4c4a13c71902d3ee70d1ff6f Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
Date: Tue, 3 Oct 2017 16:16:45 -0700
Subject: [PATCH 48/51] lib/ratelimit.c: use deferred printk() version

printk_ratelimit() invokes ___ratelimit() which may invoke a normal
printk() (pr_warn() in this particular case) to warn about suppressed
output.  Given that printk_ratelimit() may be called from anywhere, that
pr_warn() is dangerous - it may end up deadlocking the system.  Fix
___ratelimit() by using deferred printk().

Sasha reported the following lockdep error:

 : Unregister pv shared memory for cpu 8
 : select_fallback_rq: 3 callbacks suppressed
 : process 8583 (trinity-c78) no longer affine to cpu8
 :
 : ======================================================
 : WARNING: possible circular locking dependency detected
 : 4.14.0-rc2-next-20170927+ #252 Not tainted
 : ------------------------------------------------------
 : migration/8/62 is trying to acquire lock:
 : (&port_lock_key){-.-.}, at: serial8250_console_write()
 :
 : but task is already holding lock:
 : (&rq->lock){-.-.}, at: sched_cpu_dying()
 :
 : which lock already depends on the new lock.
 :
 :
 : the existing dependency chain (in reverse order) is:
 :
 : -> #3 (&rq->lock){-.-.}:
 : __lock_acquire()
 : lock_acquire()
 : _raw_spin_lock()
 : task_fork_fair()
 : sched_fork()
 : copy_process.part.31()
 : _do_fork()
 : kernel_thread()
 : rest_init()
 : start_kernel()
 : x86_64_start_reservations()
 : x86_64_start_kernel()
 : verify_cpu()
 :
 : -> #2 (&p->pi_lock){-.-.}:
 : __lock_acquire()
 : lock_acquire()
 : _raw_spin_lock_irqsave()
 : try_to_wake_up()
 : default_wake_function()
 : woken_wake_function()
 : __wake_up_common()
 : __wake_up_common_lock()
 : __wake_up()
 : tty_wakeup()
 : tty_port_default_wakeup()
 : tty_port_tty_wakeup()
 : uart_write_wakeup()
 : serial8250_tx_chars()
 : serial8250_handle_irq.part.25()
 : serial8250_default_handle_irq()
 : serial8250_interrupt()
 : __handle_irq_event_percpu()
 : handle_irq_event_percpu()
 : handle_irq_event()
 : handle_level_irq()
 : handle_irq()
 : do_IRQ()
 : ret_from_intr()
 : native_safe_halt()
 : default_idle()
 : arch_cpu_idle()
 : default_idle_call()
 : do_idle()
 : cpu_startup_entry()
 : rest_init()
 : start_kernel()
 : x86_64_start_reservations()
 : x86_64_start_kernel()
 : verify_cpu()
 :
 : -> #1 (&tty->write_wait){-.-.}:
 : __lock_acquire()
 : lock_acquire()
 : _raw_spin_lock_irqsave()
 : __wake_up_common_lock()
 : __wake_up()
 : tty_wakeup()
 : tty_port_default_wakeup()
 : tty_port_tty_wakeup()
 : uart_write_wakeup()
 : serial8250_tx_chars()
 : serial8250_handle_irq.part.25()
 : serial8250_default_handle_irq()
 : serial8250_interrupt()
 : __handle_irq_event_percpu()
 : handle_irq_event_percpu()
 : handle_irq_event()
 : handle_level_irq()
 : handle_irq()
 : do_IRQ()
 : ret_from_intr()
 : native_safe_halt()
 : default_idle()
 : arch_cpu_idle()
 : default_idle_call()
 : do_idle()
 : cpu_startup_entry()
 : rest_init()
 : start_kernel()
 : x86_64_start_reservations()
 : x86_64_start_kernel()
 : verify_cpu()
 :
 : -> #0 (&port_lock_key){-.-.}:
 : check_prev_add()
 : __lock_acquire()
 : lock_acquire()
 : _raw_spin_lock_irqsave()
 : serial8250_console_write()
 : univ8250_console_write()
 : console_unlock()
 : vprintk_emit()
 : vprintk_default()
 : vprintk_func()
 : printk()
 : ___ratelimit()
 : __printk_ratelimit()
 : select_fallback_rq()
 : sched_cpu_dying()
 : cpuhp_invoke_callback()
 : take_cpu_down()
 : multi_cpu_stop()
 : cpu_stopper_thread()
 : smpboot_thread_fn()
 : kthread()
 : ret_from_fork()
 :
 : other info that might help us debug this:
 :
 : Chain exists of:
 :   &port_lock_key --> &p->pi_lock --> &rq->lock
 :
 :  Possible unsafe locking scenario:
 :
 :        CPU0                    CPU1
 :        ----                    ----
 :   lock(&rq->lock);
 :                                lock(&p->pi_lock);
 :                                lock(&rq->lock);
 :   lock(&port_lock_key);
 :
 :  *** DEADLOCK ***
 :
 : 4 locks held by migration/8/62:
 : #0: (&p->pi_lock){-.-.}, at: sched_cpu_dying()
 : #1: (&rq->lock){-.-.}, at: sched_cpu_dying()
 : #2: (printk_ratelimit_state.lock){....}, at: ___ratelimit()
 : #3: (console_lock){+.+.}, at: vprintk_emit()
 :
 : stack backtrace:
 : CPU: 8 PID: 62 Comm: migration/8 Not tainted 4.14.0-rc2-next-20170927+ #252
 : Call Trace:
 : dump_stack()
 : print_circular_bug()
 : check_prev_add()
 : ? add_lock_to_list.isra.26()
 : ? check_usage()
 : ? kvm_clock_read()
 : ? kvm_sched_clock_read()
 : ? sched_clock()
 : ? check_preemption_disabled()
 : __lock_acquire()
 : ? __lock_acquire()
 : ? add_lock_to_list.isra.26()
 : ? debug_check_no_locks_freed()
 : ? memcpy()
 : lock_acquire()
 : ? serial8250_console_write()
 : _raw_spin_lock_irqsave()
 : ? serial8250_console_write()
 : serial8250_console_write()
 : ? serial8250_start_tx()
 : ? lock_acquire()
 : ? memcpy()
 : univ8250_console_write()
 : console_unlock()
 : ? __down_trylock_console_sem()
 : vprintk_emit()
 : vprintk_default()
 : vprintk_func()
 : printk()
 : ? show_regs_print_info()
 : ? lock_acquire()
 : ___ratelimit()
 : __printk_ratelimit()
 : select_fallback_rq()
 : sched_cpu_dying()
 : ? sched_cpu_starting()
 : ? rcutree_dying_cpu()
 : ? sched_cpu_starting()
 : cpuhp_invoke_callback()
 : ? cpu_disable_common()
 : take_cpu_down()
 : ? trace_hardirqs_off_caller()
 : ? cpuhp_invoke_callback()
 : multi_cpu_stop()
 : ? __this_cpu_preempt_check()
 : ? cpu_stop_queue_work()
 : cpu_stopper_thread()
 : ? cpu_stop_create()
 : smpboot_thread_fn()
 : ? sort_range()
 : ? schedule()
 : ? __kthread_parkme()
 : kthread()
 : ? sort_range()
 : ? kthread_create_on_node()
 : ret_from_fork()
 : process 9121 (trinity-c78) no longer affine to cpu8
 : smpboot: CPU 8 is now offline

Link: http://lkml.kernel.org/r/20170928120405.18273-1-sergey.senozhatsky@gmail.com
Fixes: 6b1d174b0c27b ("ratelimit: extend to print suppressed messages on release")
Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Reported-by: Sasha Levin <levinsasha928@gmail.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Borislav Petkov <bp@suse.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/ratelimit.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/ratelimit.c b/lib/ratelimit.c
index 08f8043cac61..d01f47135239 100644
--- a/lib/ratelimit.c
+++ b/lib/ratelimit.c
@@ -48,7 +48,9 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func)
 	if (time_is_before_jiffies(rs->begin + rs->interval)) {
 		if (rs->missed) {
 			if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) {
-				pr_warn("%s: %d callbacks suppressed\n", func, rs->missed);
+				printk_deferred(KERN_WARNING
+						"%s: %d callbacks suppressed\n",
+						func, rs->missed);
 				rs->missed = 0;
 			}
 		}

From d22e3d69ee1a3f83ff7cc943af63de48b6156dcf Mon Sep 17 00:00:00 2001
From: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Date: Tue, 3 Oct 2017 16:16:49 -0700
Subject: [PATCH 49/51] m32r: fix build failure

The allmodconfig build of m32r is failing with the error:

  lib/mpi/mpih-div.o: In function 'mpihelp_divrem':
  mpih-div.c:(.text+0x40): undefined reference to 'abort'
  mpih-div.c:(.text+0x40): relocation truncated to fit:
	R_M32R_26_PCREL_RELA against undefined symbol 'abort'

The function 'abort' was never defined for the m32r architecture.

Create 'abort' as is done in other arch like 'arm' and 'unicore32'.

Link: http://lkml.kernel.org/r/1506727220-6108-1-git-send-email-sudip.mukherjee@codethink.co.uk
Signed-off-by: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/m32r/kernel/traps.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c
index 647dd94a0c39..72b96f282689 100644
--- a/arch/m32r/kernel/traps.c
+++ b/arch/m32r/kernel/traps.c
@@ -114,6 +114,15 @@ static void set_eit_vector_entries(void)
 	_flush_cache_copyback_all();
 }
 
+void abort(void)
+{
+	BUG();
+
+	/* if that doesn't kill us, halt */
+	panic("Oops failed to kill thread");
+}
+EXPORT_SYMBOL(abort);
+
 void __init trap_init(void)
 {
 	set_eit_vector_entries();

From a08ffbef4ab799351d0610bbdbeaa1ee746b9065 Mon Sep 17 00:00:00 2001
From: Stafford Horne <shorne@gmail.com>
Date: Tue, 3 Oct 2017 16:16:51 -0700
Subject: [PATCH 50/51] checkpatch: fix ignoring cover-letter logic

Currently running checkpatch on a directory with a cover-letter.patch
file reports the following error:

  -----------------------------------------
  patches/smp-v2/v2-0000-cover-letter.patch
  -----------------------------------------

  ERROR: Does not appear to be a unified-diff format patch

The logic to suppress the unified-diff check for cover letters is there
but is checking $file instead of $filename.  Fix the variable to use the
correct one.

Link: http://lkml.kernel.org/r/20170909090406.31523-1-shorne@gmail.com
Signed-off-by: Stafford Horne <shorne@gmail.com>
Acked-by: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/checkpatch.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index dd2c262aebbf..8b80bac055e4 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -6390,7 +6390,7 @@ sub process {
 		exit(0);
 	}
 
-	if (!$is_patch && $file !~ /cover-letter\.patch$/) {
+	if (!$is_patch && $filename !~ /cover-letter\.patch$/) {
 		ERROR("NOT_UNIFIED_DIFF",
 		      "Does not appear to be a unified-diff format patch\n");
 	}

From 32e57c29e3c038ac802b7cc214a8795a4234055f Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Tue, 3 Oct 2017 16:16:54 -0700
Subject: [PATCH 51/51] include/linux/fs.h: fix comment about struct
 address_space

Before commit 9c5d760b8d22 ("mm: split gfp_mask and mapping flags into
separate fields") the private_* fields of struct adrress_space were
grouped together and using "ditto" in comments describing the last
fields was correct.

With introduction of gpf_mask between private_lock and private_list
"ditto" references the wrong description.

Fix it by using the elaborate description.

Link: http://lkml.kernel.org/r/1507009987-8746-1-git-send-email-rppt@linux.vnet.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 339e73742e73..13dab191a23e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -403,7 +403,7 @@ struct address_space {
 	unsigned long		flags;		/* error bits */
 	spinlock_t		private_lock;	/* for use by the address_space */
 	gfp_t			gfp_mask;	/* implicit gfp mask for allocations */
-	struct list_head	private_list;	/* ditto */
+	struct list_head	private_list;	/* for use by the address_space */
 	void			*private_data;	/* ditto */
 	errseq_t		wb_err;
 } __attribute__((aligned(sizeof(long)))) __randomize_layout;