Merge branch 'for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu into core/rcu

Pull RCU updates from Paul E. McKenney: - Documentation updates, yet again just simple changes. - Miscellaneous fixes, including a change to call_rcu()'s rcu_head alignment check. - Security-motivated list consistency checks, which are disabled by default behind DEBUG_LIST. - Torture-test updates. Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-11-23 10:04:28 +01:00 · 2016-11-23 10:04:28 +01:00 · af91a81131
parent 23400ac997 6190aaafd0
commit af91a81131
17 changed files with 222 additions and 108 deletions
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@ -547,7 +547,7 @@ The <tt>rcu_access_pointer()</tt> on line&nbsp;6 is similar to
 	It could reuse a value formerly fetched from this same pointer.
 	It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time
 	manner, resulting in <i>load tearing</i>, in turn resulting a bytewise
-	mash-up of two distince pointer values.
+	mash-up of two distinct pointer values.
 	It might even use value-speculation optimizations, where it makes
 	a wrong guess, but by the time it gets around to checking the
 	value, an update has changed the pointer to match the wrong guess.
@ -659,6 +659,29 @@ systems with more than one CPU:
 	In other words, a given instance of <tt>synchronize_rcu()</tt>
 	can avoid waiting on a given RCU read-side critical section only
 	if it can prove that <tt>synchronize_rcu()</tt> started first.
+
+	<p>
+	A related question is &ldquo;When <tt>rcu_read_lock()</tt>
+	doesn't generate any code, why does it matter how it relates
+	to a grace period?&rdquo;
+	The answer is that it is not the relationship of
+	<tt>rcu_read_lock()</tt> itself that is important, but rather
+	the relationship of the code within the enclosed RCU read-side
+	critical section to the code preceding and following the
+	grace period.
+	If we take this viewpoint, then a given RCU read-side critical
+	section begins before a given grace period when some access
+	preceding the grace period observes the effect of some access
+	within the critical section, in which case none of the accesses
+	within the critical section may observe the effects of any
+	access following the grace period.
+
+	<p>
+	As of late 2016, mathematical models of RCU take this
+	viewpoint, for example, see slides&nbsp;62 and&nbsp;63
+	of the
+	<a href="http://www2.rdrop.com/users/paulmck/scalability/paper/LinuxMM.2016.10.04c.LCE.pdf">2016 LinuxCon EU</a>
+	presentation.
 </font></td></tr>
 <tr><td>&nbsp;</td></tr>
 </table>
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@ -237,7 +237,7 @@ rcu_dereference()

 	The reader uses rcu_dereference() to fetch an RCU-protected
 	pointer, which returns a value that may then be safely
-	dereferenced.  Note that rcu_deference() does not actually
+	dereferenced.  Note that rcu_dereference() does not actually
 	dereference the pointer, instead, it protects the pointer for
 	later dereferencing.  It also executes any needed memory-barrier
 	instructions for a given CPU architecture.  Currently, only Alpha
--- a/drivers/misc/lkdtm.h
+++ b/drivers/misc/lkdtm.h
@ -21,6 +21,8 @@ void lkdtm_SPINLOCKUP(void);
 void lkdtm_HUNG_TASK(void);
 void lkdtm_ATOMIC_UNDERFLOW(void);
 void lkdtm_ATOMIC_OVERFLOW(void);
+void lkdtm_CORRUPT_LIST_ADD(void);
+void lkdtm_CORRUPT_LIST_DEL(void);

 /* lkdtm_heap.c */
 void lkdtm_OVERWRITE_ALLOCATION(void);
--- a/drivers/misc/lkdtm_bugs.c
+++ b/drivers/misc/lkdtm_bugs.c
@ -5,8 +5,13 @@
 * test source files.
 */
 #include "lkdtm.h"
+#include <linux/list.h>
 #include <linux/sched.h>

+struct lkdtm_list {
+	struct list_head node;
+};
+
 /*
 * Make sure our attempts to over run the kernel stack doesn't trigger
 * a compiler warning when CONFIG_FRAME_WARN is set. Then make sure we
@ -146,3 +151,66 @@ void lkdtm_ATOMIC_OVERFLOW(void)
 	pr_info("attempting bad atomic overflow\n");
 	atomic_inc(&over);
 }
+
+void lkdtm_CORRUPT_LIST_ADD(void)
+{
+	/*
+	 * Initially, an empty list via LIST_HEAD:
+	 *	test_head.next = &test_head
+	 *	test_head.prev = &test_head
+	 */
+	LIST_HEAD(test_head);
+	struct lkdtm_list good, bad;
+	void *target[2] = { };
+	void *redirection = &target;
+
+	pr_info("attempting good list addition\n");
+
+	/*
+	 * Adding to the list performs these actions:
+	 *	test_head.next->prev = &good.node
+	 *	good.node.next = test_head.next
+	 *	good.node.prev = test_head
+	 *	test_head.next = good.node
+	 */
+	list_add(&good.node, &test_head);
+
+	pr_info("attempting corrupted list addition\n");
+	/*
+	 * In simulating this "write what where" primitive, the "what" is
+	 * the address of &bad.node, and the "where" is the address held
+	 * by "redirection".
+	 */
+	test_head.next = redirection;
+	list_add(&bad.node, &test_head);
+
+	if (target[0] == NULL && target[1] == NULL)
+		pr_err("Overwrite did not happen, but no BUG?!\n");
+	else
+		pr_err("list_add() corruption not detected!\n");
+}
+
+void lkdtm_CORRUPT_LIST_DEL(void)
+{
+	LIST_HEAD(test_head);
+	struct lkdtm_list item;
+	void *target[2] = { };
+	void *redirection = &target;
+
+	list_add(&item.node, &test_head);
+
+	pr_info("attempting good list removal\n");
+	list_del(&item.node);
+
+	pr_info("attempting corrupted list removal\n");
+	list_add(&item.node, &test_head);
+
+	/* As with the list_add() test above, this corrupts "next". */
+	item.node.next = redirection;
+	list_del(&item.node);
+
+	if (target[0] == NULL && target[1] == NULL)
+		pr_err("Overwrite did not happen, but no BUG?!\n");
+	else
+		pr_err("list_del() corruption not detected!\n");
+}
--- a/drivers/misc/lkdtm_core.c
+++ b/drivers/misc/lkdtm_core.c
@ -197,6 +197,8 @@ struct crashtype crashtypes[] = {
 	CRASHTYPE(EXCEPTION),
 	CRASHTYPE(LOOP),
 	CRASHTYPE(OVERFLOW),
+	CRASHTYPE(CORRUPT_LIST_ADD),
+	CRASHTYPE(CORRUPT_LIST_DEL),
 	CRASHTYPE(CORRUPT_STACK),
 	CRASHTYPE(UNALIGNED_LOAD_STORE_WRITE),
 	CRASHTYPE(OVERWRITE_ALLOCATION),
--- a/include/linux/bug.h
+++ b/include/linux/bug.h
@ -121,4 +121,21 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr,
 }

 #endif	/* CONFIG_GENERIC_BUG */
+
+/*
+ * Since detected data corruption should stop operation on the affected
+ * structures, this returns false if the corruption condition is found.
+ */
+#define CHECK_DATA_CORRUPTION(condition, fmt, ...)			 \
+	do {								 \
+		if (unlikely(condition)) {				 \
+			if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \
+				pr_err(fmt, ##__VA_ARGS__);		 \
+				BUG();					 \
+			} else						 \
+				WARN(1, fmt, ##__VA_ARGS__);		 \
+			return false;					 \
+		}							 \
+	} while (0)
+
 #endif	/* _LINUX_BUG_H */
--- a/include/linux/list.h
+++ b/include/linux/list.h
@ -28,27 +28,42 @@ static inline void INIT_LIST_HEAD(struct list_head *list)
 	list->prev = list;
 }

+#ifdef CONFIG_DEBUG_LIST
+extern bool __list_add_valid(struct list_head *new,
+			      struct list_head *prev,
+			      struct list_head *next);
+extern bool __list_del_entry_valid(struct list_head *entry);
+#else
+static inline bool __list_add_valid(struct list_head *new,
+				struct list_head *prev,
+				struct list_head *next)
+{
+	return true;
+}
+static inline bool __list_del_entry_valid(struct list_head *entry)
+{
+	return true;
+}
+#endif
+
 /*
 * Insert a new entry between two known consecutive entries.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 */
-#ifndef CONFIG_DEBUG_LIST
 static inline void __list_add(struct list_head *new,
 			      struct list_head *prev,
 			      struct list_head *next)
 {
+	if (!__list_add_valid(new, prev, next))
+		return;
+
 	next->prev = new;
 	new->next = next;
 	new->prev = prev;
 	WRITE_ONCE(prev->next, new);
 }
-#else
-extern void __list_add(struct list_head *new,
-			      struct list_head *prev,
-			      struct list_head *next);
-#endif

 /**
 * list_add - add a new entry
@ -96,22 +111,20 @@ static inline void __list_del(struct list_head * prev, struct list_head * next)
 * Note: list_empty() on entry does not return true after this, the entry is
 * in an undefined state.
 */
-#ifndef CONFIG_DEBUG_LIST
 static inline void __list_del_entry(struct list_head *entry)
 {
+	if (!__list_del_entry_valid(entry))
+		return;
+
 	__list_del(entry->prev, entry->next);
 }

 static inline void list_del(struct list_head *entry)
 {
-	__list_del(entry->prev, entry->next);
+	__list_del_entry(entry);
 	entry->next = LIST_POISON1;
 	entry->prev = LIST_POISON2;
 }
-#else
-extern void __list_del_entry(struct list_head *entry);
-extern void list_del(struct list_head *entry);
-#endif

 /**
 * list_replace - replace old entry by new one
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@ -45,19 +45,17 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 */
-#ifndef CONFIG_DEBUG_LIST
 static inline void __list_add_rcu(struct list_head *new,
 		struct list_head *prev, struct list_head *next)
 {
+	if (!__list_add_valid(new, prev, next))
+		return;
+
 	new->next = next;
 	new->prev = prev;
 	rcu_assign_pointer(list_next_rcu(prev), new);
 	next->prev = new;
 }
-#else
-void __list_add_rcu(struct list_head *new,
-		    struct list_head *prev, struct list_head *next);
-#endif

 /**
 * list_add_rcu - add a new entry to rcu-protected list
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@ -698,7 +698,10 @@ TRACE_EVENT(rcu_batch_end,
 /*
 * Tracepoint for rcutorture readers.  The first argument is the name
 * of the RCU flavor from rcutorture's viewpoint and the second argument
- * is the callback address.
+ * is the callback address.  The third argument is the start time in
+ * seconds, and the last two arguments are the grace period numbers
+ * at the beginning and end of the read, respectively.  Note that the
+ * callback address can be NULL.
 */
 TRACE_EVENT(rcu_torture_read,

--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@ -289,15 +289,24 @@ static int rcu_torture_read_lock(void) __acquires(RCU)

 static void rcu_read_delay(struct torture_random_state *rrsp)
 {
+	unsigned long started;
+	unsigned long completed;
 	const unsigned long shortdelay_us = 200;
 	const unsigned long longdelay_ms = 50;
+	unsigned long long ts;

 	/* We want a short delay sometimes to make a reader delay the grace
 	 * period, and we want a long delay occasionally to trigger
 	 * force_quiescent_state. */

-	if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
+	if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) {
+		started = cur_ops->completed();
+		ts = rcu_trace_clock_local();
 		mdelay(longdelay_ms);
+		completed = cur_ops->completed();
+		do_trace_rcu_torture_read(cur_ops->name, NULL, ts,
+					  started, completed);
+	}
 	if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
 		udelay(shortdelay_us);
 #ifdef CONFIG_PREEMPT
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@ -1304,7 +1304,8 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp)
 	if (!rcu_kick_kthreads)
 		return;
 	j = READ_ONCE(rsp->jiffies_kick_kthreads);
-	if (time_after(jiffies, j) && rsp->gp_kthread) {
+	if (time_after(jiffies, j) && rsp->gp_kthread &&
+	    (rcu_gp_in_progress(rsp) || READ_ONCE(rsp->gp_flags))) {
 		WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name);
 		rcu_ftrace_dump(DUMP_ALL);
 		wake_up_process(rsp->gp_kthread);
@ -2828,8 +2829,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 * Also schedule RCU core processing.
 *
 * This function must be called from hardirq context.  It is normally
- * invoked from the scheduling-clock interrupt.  If rcu_pending returns
- * false, there is no point in invoking rcu_check_callbacks().
+ * invoked from the scheduling-clock interrupt.
 */
 void rcu_check_callbacks(int user)
 {
@ -3121,7 +3121,9 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
 	unsigned long flags;
 	struct rcu_data *rdp;

-	WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */
+	/* Misaligned rcu_head! */
+	WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
+
 	if (debug_rcu_head_queue(head)) {
 		/* Probable double call_rcu(), so leak the callback. */
 		WRITE_ONCE(head->func, rcu_leak_callback);
@ -3130,13 +3132,6 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
 	}
 	head->func = func;
 	head->next = NULL;
-
-	/*
-	 * Opportunistically note grace-period endings and beginnings.
-	 * Note that we might see a beginning right after we see an
-	 * end, but never vice versa, since this CPU has to pass through
-	 * a quiescent state betweentimes.
-	 */
 	local_irq_save(flags);
 	rdp = this_cpu_ptr(rsp->rda);

--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@ -404,6 +404,7 @@ struct rcu_data {
 	atomic_long_t exp_workdone1;	/* # done by others #1. */
 	atomic_long_t exp_workdone2;	/* # done by others #2. */
 	atomic_long_t exp_workdone3;	/* # done by others #3. */
+	int exp_dynticks_snap;		/* Double-check need for IPI. */

 	/* 7) Callback offloading. */
 #ifdef CONFIG_RCU_NOCB_CPU
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@ -358,8 +358,10 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
 			struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);

+			rdp->exp_dynticks_snap =
+				atomic_add_return(0, &rdtp->dynticks);
 			if (raw_smp_processor_id() == cpu ||
-			    !(atomic_add_return(0, &rdtp->dynticks) & 0x1) ||
+			    !(rdp->exp_dynticks_snap & 0x1) ||
 			    !(rnp->qsmaskinitnext & rdp->grpmask))
 				mask_ofl_test |= rdp->grpmask;
 		}
@ -377,9 +379,17 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
 		/* IPI the remaining CPUs for expedited quiescent state. */
 		for_each_leaf_node_possible_cpu(rnp, cpu) {
 			unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+			struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
 			if (!(mask_ofl_ipi & mask))
 				continue;
 retry_ipi:
+			if (atomic_add_return(0, &rdtp->dynticks) !=
+			    rdp->exp_dynticks_snap) {
+				mask_ofl_test |= mask;
+				continue;
+			}
 			ret = smp_call_function_single(cpu, func, rsp, 0);
 			if (!ret) {
 				mask_ofl_ipi &= ~mask;
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@ -1218,7 +1218,7 @@ config DEBUG_BUGVERBOSE

 config DEBUG_LIST
 	bool "Debug linked list manipulation"
-	depends on DEBUG_KERNEL
+	depends on DEBUG_KERNEL || BUG_ON_DATA_CORRUPTION
 	help
 	  Enable this to turn on extended checks in the linked-list
 	  walking routines.
@ -1434,7 +1434,8 @@ config RCU_TRACE
 	select TRACE_CLOCK
 	help
 	  This option provides tracing in RCU which presents stats
-	  in debugfs for debugging RCU implementation.
+	  in debugfs for debugging RCU implementation.  It also enables
+	  additional tracepoints for ftrace-style event tracing.

 	  Say Y here if you want to enable RCU tracing
 	  Say N if you are unsure.
@ -1964,6 +1965,16 @@ config TEST_STATIC_KEYS

 	  If unsure, say N.

+config BUG_ON_DATA_CORRUPTION
+	bool "Trigger a BUG when data corruption is detected"
+	select DEBUG_LIST
+	help
+	  Select this option if the kernel should BUG when it encounters
+	  data corruption in kernel memory structures when they get checked
+	  for validity.
+
+	  If unsure, say N.
+
 source "samples/Kconfig"

 source "lib/Kconfig.kgdb"
--- a/lib/list_debug.c
+++ b/lib/list_debug.c
@ -2,8 +2,7 @@
 * Copyright 2006, Red Hat, Inc., Dave Jones
 * Released under the General Public License (GPL).
 *
- * This file contains the linked list implementations for
- * DEBUG_LIST.
+ * This file contains the linked list validation for DEBUG_LIST.
 */

 #include <linux/export.h>
@ -13,88 +12,48 @@
 #include <linux/rculist.h>

 /*
- * Insert a new entry between two known consecutive entries.
- *
- * This is only for internal list manipulation where we know
- * the prev/next entries already!
+ * Check that the data structures for the list manipulations are reasonably
+ * valid. Failures here indicate memory corruption (and possibly an exploit
+ * attempt).
 */

-void __list_add(struct list_head *new,
-			      struct list_head *prev,
-			      struct list_head *next)
+bool __list_add_valid(struct list_head *new, struct list_head *prev,
+		      struct list_head *next)
 {
-	WARN(next->prev != prev,
-		"list_add corruption. next->prev should be "
-		"prev (%p), but was %p. (next=%p).\n",
+	CHECK_DATA_CORRUPTION(next->prev != prev,
+		"list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n",
 		prev, next->prev, next);
-	WARN(prev->next != next,
-		"list_add corruption. prev->next should be "
-		"next (%p), but was %p. (prev=%p).\n",
+	CHECK_DATA_CORRUPTION(prev->next != next,
+		"list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n",
 		next, prev->next, prev);
-	WARN(new == prev || new == next,
-	     "list_add double add: new=%p, prev=%p, next=%p.\n",
-	     new, prev, next);
-	next->prev = new;
-	new->next = next;
-	new->prev = prev;
-	WRITE_ONCE(prev->next, new);
-}
-EXPORT_SYMBOL(__list_add);
+	CHECK_DATA_CORRUPTION(new == prev || new == next,
+		"list_add double add: new=%p, prev=%p, next=%p.\n",
+		new, prev, next);

-void __list_del_entry(struct list_head *entry)
+	return true;
+}
+EXPORT_SYMBOL(__list_add_valid);
+
+bool __list_del_entry_valid(struct list_head *entry)
 {
 	struct list_head *prev, *next;

 	prev = entry->prev;
 	next = entry->next;

-	if (WARN(next == LIST_POISON1,
+	CHECK_DATA_CORRUPTION(next == LIST_POISON1,
 		"list_del corruption, %p->next is LIST_POISON1 (%p)\n",
-		entry, LIST_POISON1) ||
-	    WARN(prev == LIST_POISON2,
+		entry, LIST_POISON1);
+	CHECK_DATA_CORRUPTION(prev == LIST_POISON2,
 		"list_del corruption, %p->prev is LIST_POISON2 (%p)\n",
-		entry, LIST_POISON2) ||
-	    WARN(prev->next != entry,
-		"list_del corruption. prev->next should be %p, "
-		"but was %p\n", entry, prev->next) ||
-	    WARN(next->prev != entry,
-		"list_del corruption. next->prev should be %p, "
-		"but was %p\n", entry, next->prev))
-		return;
+		entry, LIST_POISON2);
+	CHECK_DATA_CORRUPTION(prev->next != entry,
+		"list_del corruption. prev->next should be %p, but was %p\n",
+		entry, prev->next);
+	CHECK_DATA_CORRUPTION(next->prev != entry,
+		"list_del corruption. next->prev should be %p, but was %p\n",
+		entry, next->prev);
+	return true;

-	__list_del(prev, next);
 }
-EXPORT_SYMBOL(__list_del_entry);
-
-/**
- * list_del - deletes entry from list.
- * @entry: the element to delete from the list.
- * Note: list_empty on entry does not return true after this, the entry is
- * in an undefined state.
- */
-void list_del(struct list_head *entry)
-{
-	__list_del_entry(entry);
-	entry->next = LIST_POISON1;
-	entry->prev = LIST_POISON2;
-}
-EXPORT_SYMBOL(list_del);
-
-/*
- * RCU variants.
- */
-void __list_add_rcu(struct list_head *new,
-		    struct list_head *prev, struct list_head *next)
-{
-	WARN(next->prev != prev,
-		"list_add_rcu corruption. next->prev should be prev (%p), but was %p. (next=%p).\n",
-		prev, next->prev, next);
-	WARN(prev->next != next,
-		"list_add_rcu corruption. prev->next should be next (%p), but was %p. (prev=%p).\n",
-		next, prev->next, prev);
-	new->next = next;
-	new->prev = prev;
-	rcu_assign_pointer(list_next_rcu(prev), new);
-	next->prev = new;
-}
-EXPORT_SYMBOL(__list_add_rcu);
+EXPORT_SYMBOL(__list_del_entry_valid);
--- a/tools/testing/selftests/rcutorture/.gitignore
+++ b/tools/testing/selftests/rcutorture/.gitignore
@ -1,6 +1,4 @@
 initrd
-linux-2.6
 b[0-9]*
-rcu-test-image
 res
 *.swp
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@ -303,6 +303,7 @@ then
 fi
 ___EOF___
 awk < $T/cfgcpu.pack \
+	-v TORTURE_BUILDONLY="$TORTURE_BUILDONLY" \
 	-v CONFIGDIR="$CONFIGFRAG/" \
 	-v KVM="$KVM" \
 	-v ncpus=$cpus \
@ -375,6 +376,10 @@ function dump(first, pastlast, batchnum)
 		njitter = ncpus;
 	else
 		njitter = ja[1];
+	if (TORTURE_BUILDONLY && njitter != 0) {
+		njitter = 0;
+		print "echo Build-only run, so suppressing jitter >> " rd "/log"
+	}
 	for (j = 0; j < njitter; j++)
 		print "jitter.sh " j " " dur " " ja[2] " " ja[3] "&"
 	print "wait"