diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index a4d3838130e4..39bcb74ea733 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -547,7 +547,7 @@ The rcu_access_pointer() on line 6 is similar to It could reuse a value formerly fetched from this same pointer. It could also fetch the pointer from gp in a byte-at-a-time manner, resulting in load tearing, in turn resulting a bytewise - mash-up of two distince pointer values. + mash-up of two distinct pointer values. It might even use value-speculation optimizations, where it makes a wrong guess, but by the time it gets around to checking the value, an update has changed the pointer to match the wrong guess. @@ -659,6 +659,29 @@ systems with more than one CPU: In other words, a given instance of synchronize_rcu() can avoid waiting on a given RCU read-side critical section only if it can prove that synchronize_rcu() started first. + +

+ A related question is “When rcu_read_lock() + doesn't generate any code, why does it matter how it relates + to a grace period?” + The answer is that it is not the relationship of + rcu_read_lock() itself that is important, but rather + the relationship of the code within the enclosed RCU read-side + critical section to the code preceding and following the + grace period. + If we take this viewpoint, then a given RCU read-side critical + section begins before a given grace period when some access + preceding the grace period observes the effect of some access + within the critical section, in which case none of the accesses + within the critical section may observe the effects of any + access following the grace period. + +

+ As of late 2016, mathematical models of RCU take this + viewpoint, for example, see slides 62 and 63 + of the + 2016 LinuxCon EU + presentation.   diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 204422719197..5cbd8b2395b8 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -237,7 +237,7 @@ rcu_dereference() The reader uses rcu_dereference() to fetch an RCU-protected pointer, which returns a value that may then be safely - dereferenced. Note that rcu_deference() does not actually + dereferenced. Note that rcu_dereference() does not actually dereference the pointer, instead, it protects the pointer for later dereferencing. It also executes any needed memory-barrier instructions for a given CPU architecture. Currently, only Alpha diff --git a/drivers/misc/lkdtm.h b/drivers/misc/lkdtm.h index fdf954c2107f..cfa1039c62e7 100644 --- a/drivers/misc/lkdtm.h +++ b/drivers/misc/lkdtm.h @@ -21,6 +21,8 @@ void lkdtm_SPINLOCKUP(void); void lkdtm_HUNG_TASK(void); void lkdtm_ATOMIC_UNDERFLOW(void); void lkdtm_ATOMIC_OVERFLOW(void); +void lkdtm_CORRUPT_LIST_ADD(void); +void lkdtm_CORRUPT_LIST_DEL(void); /* lkdtm_heap.c */ void lkdtm_OVERWRITE_ALLOCATION(void); diff --git a/drivers/misc/lkdtm_bugs.c b/drivers/misc/lkdtm_bugs.c index 182ae1894b32..f336206d4b1f 100644 --- a/drivers/misc/lkdtm_bugs.c +++ b/drivers/misc/lkdtm_bugs.c @@ -5,8 +5,13 @@ * test source files. */ #include "lkdtm.h" +#include #include +struct lkdtm_list { + struct list_head node; +}; + /* * Make sure our attempts to over run the kernel stack doesn't trigger * a compiler warning when CONFIG_FRAME_WARN is set. Then make sure we @@ -146,3 +151,66 @@ void lkdtm_ATOMIC_OVERFLOW(void) pr_info("attempting bad atomic overflow\n"); atomic_inc(&over); } + +void lkdtm_CORRUPT_LIST_ADD(void) +{ + /* + * Initially, an empty list via LIST_HEAD: + * test_head.next = &test_head + * test_head.prev = &test_head + */ + LIST_HEAD(test_head); + struct lkdtm_list good, bad; + void *target[2] = { }; + void *redirection = ⌖ + + pr_info("attempting good list addition\n"); + + /* + * Adding to the list performs these actions: + * test_head.next->prev = &good.node + * good.node.next = test_head.next + * good.node.prev = test_head + * test_head.next = good.node + */ + list_add(&good.node, &test_head); + + pr_info("attempting corrupted list addition\n"); + /* + * In simulating this "write what where" primitive, the "what" is + * the address of &bad.node, and the "where" is the address held + * by "redirection". + */ + test_head.next = redirection; + list_add(&bad.node, &test_head); + + if (target[0] == NULL && target[1] == NULL) + pr_err("Overwrite did not happen, but no BUG?!\n"); + else + pr_err("list_add() corruption not detected!\n"); +} + +void lkdtm_CORRUPT_LIST_DEL(void) +{ + LIST_HEAD(test_head); + struct lkdtm_list item; + void *target[2] = { }; + void *redirection = ⌖ + + list_add(&item.node, &test_head); + + pr_info("attempting good list removal\n"); + list_del(&item.node); + + pr_info("attempting corrupted list removal\n"); + list_add(&item.node, &test_head); + + /* As with the list_add() test above, this corrupts "next". */ + item.node.next = redirection; + list_del(&item.node); + + if (target[0] == NULL && target[1] == NULL) + pr_err("Overwrite did not happen, but no BUG?!\n"); + else + pr_err("list_del() corruption not detected!\n"); +} diff --git a/drivers/misc/lkdtm_core.c b/drivers/misc/lkdtm_core.c index f9154b8d67f6..7eeb71a75549 100644 --- a/drivers/misc/lkdtm_core.c +++ b/drivers/misc/lkdtm_core.c @@ -197,6 +197,8 @@ struct crashtype crashtypes[] = { CRASHTYPE(EXCEPTION), CRASHTYPE(LOOP), CRASHTYPE(OVERFLOW), + CRASHTYPE(CORRUPT_LIST_ADD), + CRASHTYPE(CORRUPT_LIST_DEL), CRASHTYPE(CORRUPT_STACK), CRASHTYPE(UNALIGNED_LOAD_STORE_WRITE), CRASHTYPE(OVERWRITE_ALLOCATION), diff --git a/include/linux/bug.h b/include/linux/bug.h index 292d6a10b0c2..baff2e8fc8a8 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -121,4 +121,21 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr, } #endif /* CONFIG_GENERIC_BUG */ + +/* + * Since detected data corruption should stop operation on the affected + * structures, this returns false if the corruption condition is found. + */ +#define CHECK_DATA_CORRUPTION(condition, fmt, ...) \ + do { \ + if (unlikely(condition)) { \ + if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \ + pr_err(fmt, ##__VA_ARGS__); \ + BUG(); \ + } else \ + WARN(1, fmt, ##__VA_ARGS__); \ + return false; \ + } \ + } while (0) + #endif /* _LINUX_BUG_H */ diff --git a/include/linux/list.h b/include/linux/list.h index 5809e9a2de5b..d1039ecaf94f 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -28,27 +28,42 @@ static inline void INIT_LIST_HEAD(struct list_head *list) list->prev = list; } +#ifdef CONFIG_DEBUG_LIST +extern bool __list_add_valid(struct list_head *new, + struct list_head *prev, + struct list_head *next); +extern bool __list_del_entry_valid(struct list_head *entry); +#else +static inline bool __list_add_valid(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ + return true; +} +static inline bool __list_del_entry_valid(struct list_head *entry) +{ + return true; +} +#endif + /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ -#ifndef CONFIG_DEBUG_LIST static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { + if (!__list_add_valid(new, prev, next)) + return; + next->prev = new; new->next = next; new->prev = prev; WRITE_ONCE(prev->next, new); } -#else -extern void __list_add(struct list_head *new, - struct list_head *prev, - struct list_head *next); -#endif /** * list_add - add a new entry @@ -96,22 +111,20 @@ static inline void __list_del(struct list_head * prev, struct list_head * next) * Note: list_empty() on entry does not return true after this, the entry is * in an undefined state. */ -#ifndef CONFIG_DEBUG_LIST static inline void __list_del_entry(struct list_head *entry) { + if (!__list_del_entry_valid(entry)) + return; + __list_del(entry->prev, entry->next); } static inline void list_del(struct list_head *entry) { - __list_del(entry->prev, entry->next); + __list_del_entry(entry); entry->next = LIST_POISON1; entry->prev = LIST_POISON2; } -#else -extern void __list_del_entry(struct list_head *entry); -extern void list_del(struct list_head *entry); -#endif /** * list_replace - replace old entry by new one diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 8beb98dcf14f..4f7a9561b8c4 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -45,19 +45,17 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) * This is only for internal list manipulation where we know * the prev/next entries already! */ -#ifndef CONFIG_DEBUG_LIST static inline void __list_add_rcu(struct list_head *new, struct list_head *prev, struct list_head *next) { + if (!__list_add_valid(new, prev, next)) + return; + new->next = next; new->prev = prev; rcu_assign_pointer(list_next_rcu(prev), new); next->prev = new; } -#else -void __list_add_rcu(struct list_head *new, - struct list_head *prev, struct list_head *next); -#endif /** * list_add_rcu - add a new entry to rcu-protected list diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index d3e756539d44..9d4f9b3a2b7b 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -698,7 +698,10 @@ TRACE_EVENT(rcu_batch_end, /* * Tracepoint for rcutorture readers. The first argument is the name * of the RCU flavor from rcutorture's viewpoint and the second argument - * is the callback address. + * is the callback address. The third argument is the start time in + * seconds, and the last two arguments are the grace period numbers + * at the beginning and end of the read, respectively. Note that the + * callback address can be NULL. */ TRACE_EVENT(rcu_torture_read, diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bf08fee53dc7..87c51225ceec 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -289,15 +289,24 @@ static int rcu_torture_read_lock(void) __acquires(RCU) static void rcu_read_delay(struct torture_random_state *rrsp) { + unsigned long started; + unsigned long completed; const unsigned long shortdelay_us = 200; const unsigned long longdelay_ms = 50; + unsigned long long ts; /* We want a short delay sometimes to make a reader delay the grace * period, and we want a long delay occasionally to trigger * force_quiescent_state. */ - if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) + if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { + started = cur_ops->completed(); + ts = rcu_trace_clock_local(); mdelay(longdelay_ms); + completed = cur_ops->completed(); + do_trace_rcu_torture_read(cur_ops->name, NULL, ts, + started, completed); + } if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) udelay(shortdelay_us); #ifdef CONFIG_PREEMPT diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 69a5611a7e7c..96c52e43f7ca 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1304,7 +1304,8 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp) if (!rcu_kick_kthreads) return; j = READ_ONCE(rsp->jiffies_kick_kthreads); - if (time_after(jiffies, j) && rsp->gp_kthread) { + if (time_after(jiffies, j) && rsp->gp_kthread && + (rcu_gp_in_progress(rsp) || READ_ONCE(rsp->gp_flags))) { WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name); rcu_ftrace_dump(DUMP_ALL); wake_up_process(rsp->gp_kthread); @@ -2828,8 +2829,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) * Also schedule RCU core processing. * * This function must be called from hardirq context. It is normally - * invoked from the scheduling-clock interrupt. If rcu_pending returns - * false, there is no point in invoking rcu_check_callbacks(). + * invoked from the scheduling-clock interrupt. */ void rcu_check_callbacks(int user) { @@ -3121,7 +3121,9 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, unsigned long flags; struct rcu_data *rdp; - WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */ + /* Misaligned rcu_head! */ + WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); + if (debug_rcu_head_queue(head)) { /* Probable double call_rcu(), so leak the callback. */ WRITE_ONCE(head->func, rcu_leak_callback); @@ -3130,13 +3132,6 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, } head->func = func; head->next = NULL; - - /* - * Opportunistically note grace-period endings and beginnings. - * Note that we might see a beginning right after we see an - * end, but never vice versa, since this CPU has to pass through - * a quiescent state betweentimes. - */ local_irq_save(flags); rdp = this_cpu_ptr(rsp->rda); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e99a5234d9ed..fe98dd24adf8 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -404,6 +404,7 @@ struct rcu_data { atomic_long_t exp_workdone1; /* # done by others #1. */ atomic_long_t exp_workdone2; /* # done by others #2. */ atomic_long_t exp_workdone3; /* # done by others #3. */ + int exp_dynticks_snap; /* Double-check need for IPI. */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 24343eb87b58..d3053e99fdb6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -358,8 +358,10 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + rdp->exp_dynticks_snap = + atomic_add_return(0, &rdtp->dynticks); if (raw_smp_processor_id() == cpu || - !(atomic_add_return(0, &rdtp->dynticks) & 0x1) || + !(rdp->exp_dynticks_snap & 0x1) || !(rnp->qsmaskinitnext & rdp->grpmask)) mask_ofl_test |= rdp->grpmask; } @@ -377,9 +379,17 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, /* IPI the remaining CPUs for expedited quiescent state. */ for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long mask = leaf_node_cpu_bit(rnp, cpu); + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + if (!(mask_ofl_ipi & mask)) continue; retry_ipi: + if (atomic_add_return(0, &rdtp->dynticks) != + rdp->exp_dynticks_snap) { + mask_ofl_test |= mask; + continue; + } ret = smp_call_function_single(cpu, func, rsp, 0); if (!ret) { mask_ofl_ipi &= ~mask; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a6c8db1d62f6..9bb7d825ba14 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1218,7 +1218,7 @@ config DEBUG_BUGVERBOSE config DEBUG_LIST bool "Debug linked list manipulation" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL || BUG_ON_DATA_CORRUPTION help Enable this to turn on extended checks in the linked-list walking routines. @@ -1434,7 +1434,8 @@ config RCU_TRACE select TRACE_CLOCK help This option provides tracing in RCU which presents stats - in debugfs for debugging RCU implementation. + in debugfs for debugging RCU implementation. It also enables + additional tracepoints for ftrace-style event tracing. Say Y here if you want to enable RCU tracing Say N if you are unsure. @@ -1964,6 +1965,16 @@ config TEST_STATIC_KEYS If unsure, say N. +config BUG_ON_DATA_CORRUPTION + bool "Trigger a BUG when data corruption is detected" + select DEBUG_LIST + help + Select this option if the kernel should BUG when it encounters + data corruption in kernel memory structures when they get checked + for validity. + + If unsure, say N. + source "samples/Kconfig" source "lib/Kconfig.kgdb" diff --git a/lib/list_debug.c b/lib/list_debug.c index 3859bf63561c..7f7bfa55eb6d 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -2,8 +2,7 @@ * Copyright 2006, Red Hat, Inc., Dave Jones * Released under the General Public License (GPL). * - * This file contains the linked list implementations for - * DEBUG_LIST. + * This file contains the linked list validation for DEBUG_LIST. */ #include @@ -13,88 +12,48 @@ #include /* - * Insert a new entry between two known consecutive entries. - * - * This is only for internal list manipulation where we know - * the prev/next entries already! + * Check that the data structures for the list manipulations are reasonably + * valid. Failures here indicate memory corruption (and possibly an exploit + * attempt). */ -void __list_add(struct list_head *new, - struct list_head *prev, - struct list_head *next) +bool __list_add_valid(struct list_head *new, struct list_head *prev, + struct list_head *next) { - WARN(next->prev != prev, - "list_add corruption. next->prev should be " - "prev (%p), but was %p. (next=%p).\n", + CHECK_DATA_CORRUPTION(next->prev != prev, + "list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", prev, next->prev, next); - WARN(prev->next != next, - "list_add corruption. prev->next should be " - "next (%p), but was %p. (prev=%p).\n", + CHECK_DATA_CORRUPTION(prev->next != next, + "list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", next, prev->next, prev); - WARN(new == prev || new == next, - "list_add double add: new=%p, prev=%p, next=%p.\n", - new, prev, next); - next->prev = new; - new->next = next; - new->prev = prev; - WRITE_ONCE(prev->next, new); -} -EXPORT_SYMBOL(__list_add); + CHECK_DATA_CORRUPTION(new == prev || new == next, + "list_add double add: new=%p, prev=%p, next=%p.\n", + new, prev, next); -void __list_del_entry(struct list_head *entry) + return true; +} +EXPORT_SYMBOL(__list_add_valid); + +bool __list_del_entry_valid(struct list_head *entry) { struct list_head *prev, *next; prev = entry->prev; next = entry->next; - if (WARN(next == LIST_POISON1, + CHECK_DATA_CORRUPTION(next == LIST_POISON1, "list_del corruption, %p->next is LIST_POISON1 (%p)\n", - entry, LIST_POISON1) || - WARN(prev == LIST_POISON2, + entry, LIST_POISON1); + CHECK_DATA_CORRUPTION(prev == LIST_POISON2, "list_del corruption, %p->prev is LIST_POISON2 (%p)\n", - entry, LIST_POISON2) || - WARN(prev->next != entry, - "list_del corruption. prev->next should be %p, " - "but was %p\n", entry, prev->next) || - WARN(next->prev != entry, - "list_del corruption. next->prev should be %p, " - "but was %p\n", entry, next->prev)) - return; + entry, LIST_POISON2); + CHECK_DATA_CORRUPTION(prev->next != entry, + "list_del corruption. prev->next should be %p, but was %p\n", + entry, prev->next); + CHECK_DATA_CORRUPTION(next->prev != entry, + "list_del corruption. next->prev should be %p, but was %p\n", + entry, next->prev); + return true; - __list_del(prev, next); } -EXPORT_SYMBOL(__list_del_entry); - -/** - * list_del - deletes entry from list. - * @entry: the element to delete from the list. - * Note: list_empty on entry does not return true after this, the entry is - * in an undefined state. - */ -void list_del(struct list_head *entry) -{ - __list_del_entry(entry); - entry->next = LIST_POISON1; - entry->prev = LIST_POISON2; -} -EXPORT_SYMBOL(list_del); - -/* - * RCU variants. - */ -void __list_add_rcu(struct list_head *new, - struct list_head *prev, struct list_head *next) -{ - WARN(next->prev != prev, - "list_add_rcu corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", - prev, next->prev, next); - WARN(prev->next != next, - "list_add_rcu corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", - next, prev->next, prev); - new->next = next; - new->prev = prev; - rcu_assign_pointer(list_next_rcu(prev), new); - next->prev = new; -} -EXPORT_SYMBOL(__list_add_rcu); +EXPORT_SYMBOL(__list_del_entry_valid); diff --git a/tools/testing/selftests/rcutorture/.gitignore b/tools/testing/selftests/rcutorture/.gitignore index 05838f6f2ebe..ccc240275d1c 100644 --- a/tools/testing/selftests/rcutorture/.gitignore +++ b/tools/testing/selftests/rcutorture/.gitignore @@ -1,6 +1,4 @@ initrd -linux-2.6 b[0-9]* -rcu-test-image res *.swp diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 0aed965f0062..3b3c1b693ee1 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -303,6 +303,7 @@ then fi ___EOF___ awk < $T/cfgcpu.pack \ + -v TORTURE_BUILDONLY="$TORTURE_BUILDONLY" \ -v CONFIGDIR="$CONFIGFRAG/" \ -v KVM="$KVM" \ -v ncpus=$cpus \ @@ -375,6 +376,10 @@ function dump(first, pastlast, batchnum) njitter = ncpus; else njitter = ja[1]; + if (TORTURE_BUILDONLY && njitter != 0) { + njitter = 0; + print "echo Build-only run, so suppressing jitter >> " rd "/log" + } for (j = 0; j < njitter; j++) print "jitter.sh " j " " dur " " ja[2] " " ja[3] "&" print "wait"