From d7c816733d501b59dbdc2483f2cc8e4431fd9160 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Aug 2016 14:42:08 -0700 Subject: [PATCH 01/18] list: Split list_add() debug checking into separate function Right now, __list_add() code is repeated either in list.h or in list_debug.c, but the only differences between the two versions are the debug checks. This commit therefore extracts these debug checks into a separate __list_add_valid() function and consolidates __list_add(). Additionally this new __list_add_valid() function will stop list manipulations if a corruption is detected, instead of allowing for further corruption that may lead to even worse conditions. This is slight refactoring of the same hardening done in PaX and Grsecurity. Signed-off-by: Kees Cook Acked-by: Steven Rostedt Signed-off-by: Paul E. McKenney Acked-by: Rik van Riel --- include/linux/list.h | 22 ++++++++++++++------ lib/list_debug.c | 48 +++++++++++++++++++++----------------------- 2 files changed, 39 insertions(+), 31 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index 5809e9a2de5b..b6da9b1dce4d 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -28,27 +28,37 @@ static inline void INIT_LIST_HEAD(struct list_head *list) list->prev = list; } +#ifdef CONFIG_DEBUG_LIST +extern bool __list_add_valid(struct list_head *new, + struct list_head *prev, + struct list_head *next); +#else +static inline bool __list_add_valid(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ + return true; +} +#endif + /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ -#ifndef CONFIG_DEBUG_LIST static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { + if (!__list_add_valid(new, prev, next)) + return; + next->prev = new; new->next = next; new->prev = prev; WRITE_ONCE(prev->next, new); } -#else -extern void __list_add(struct list_head *new, - struct list_head *prev, - struct list_head *next); -#endif /** * list_add - add a new entry diff --git a/lib/list_debug.c b/lib/list_debug.c index 3859bf63561c..149dd57b583b 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -2,8 +2,7 @@ * Copyright 2006, Red Hat, Inc., Dave Jones * Released under the General Public License (GPL). * - * This file contains the linked list implementations for - * DEBUG_LIST. + * This file contains the linked list validation for DEBUG_LIST. */ #include @@ -13,33 +12,32 @@ #include /* - * Insert a new entry between two known consecutive entries. - * - * This is only for internal list manipulation where we know - * the prev/next entries already! + * Check that the data structures for the list manipulations are reasonably + * valid. Failures here indicate memory corruption (and possibly an exploit + * attempt). */ -void __list_add(struct list_head *new, - struct list_head *prev, - struct list_head *next) +bool __list_add_valid(struct list_head *new, struct list_head *prev, + struct list_head *next) { - WARN(next->prev != prev, - "list_add corruption. next->prev should be " - "prev (%p), but was %p. (next=%p).\n", - prev, next->prev, next); - WARN(prev->next != next, - "list_add corruption. prev->next should be " - "next (%p), but was %p. (prev=%p).\n", - next, prev->next, prev); - WARN(new == prev || new == next, - "list_add double add: new=%p, prev=%p, next=%p.\n", - new, prev, next); - next->prev = new; - new->next = next; - new->prev = prev; - WRITE_ONCE(prev->next, new); + if (unlikely(next->prev != prev)) { + WARN(1, "list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", + prev, next->prev, next); + return false; + } + if (unlikely(prev->next != next)) { + WARN(1, "list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", + next, prev->next, prev); + return false; + } + if (unlikely(new == prev || new == next)) { + WARN(1, "list_add double add: new=%p, prev=%p, next=%p.\n", + new, prev, next); + return false; + } + return true; } -EXPORT_SYMBOL(__list_add); +EXPORT_SYMBOL(__list_add_valid); void __list_del_entry(struct list_head *entry) { From 54acd4397d7e7a725c94101180cd9f38ef701acc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Aug 2016 14:42:09 -0700 Subject: [PATCH 02/18] rculist: Consolidate DEBUG_LIST for list_add_rcu() This commit consolidates the debug checking for list_add_rcu() into the new single __list_add_valid() debug function. Notably, this commit fixes the sanity check that was added in commit 17a801f4bfeb ("list_debug: WARN for adding something already in the list"), which wasn't checking RCU-protected lists. Signed-off-by: Kees Cook Acked-by: Steven Rostedt Signed-off-by: Paul E. McKenney Acked-by: Rik van Riel --- include/linux/rculist.h | 8 +++----- lib/list_debug.c | 19 ------------------- 2 files changed, 3 insertions(+), 24 deletions(-) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 8beb98dcf14f..4f7a9561b8c4 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -45,19 +45,17 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) * This is only for internal list manipulation where we know * the prev/next entries already! */ -#ifndef CONFIG_DEBUG_LIST static inline void __list_add_rcu(struct list_head *new, struct list_head *prev, struct list_head *next) { + if (!__list_add_valid(new, prev, next)) + return; + new->next = next; new->prev = prev; rcu_assign_pointer(list_next_rcu(prev), new); next->prev = new; } -#else -void __list_add_rcu(struct list_head *new, - struct list_head *prev, struct list_head *next); -#endif /** * list_add_rcu - add a new entry to rcu-protected list diff --git a/lib/list_debug.c b/lib/list_debug.c index 149dd57b583b..d0b89b9d0736 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -77,22 +77,3 @@ void list_del(struct list_head *entry) entry->prev = LIST_POISON2; } EXPORT_SYMBOL(list_del); - -/* - * RCU variants. - */ -void __list_add_rcu(struct list_head *new, - struct list_head *prev, struct list_head *next) -{ - WARN(next->prev != prev, - "list_add_rcu corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", - prev, next->prev, next); - WARN(prev->next != next, - "list_add_rcu corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", - next, prev->next, prev); - new->next = next; - new->prev = prev; - rcu_assign_pointer(list_next_rcu(prev), new); - next->prev = new; -} -EXPORT_SYMBOL(__list_add_rcu); From 0cd340dcb05c4a43742fe156f36737bb2a321bfd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Aug 2016 14:42:10 -0700 Subject: [PATCH 03/18] list: Split list_del() debug checking into separate function Similar to the list_add() debug consolidation, this commit consolidates the debug checking performed during CONFIG_DEBUG_LIST into a new __list_del_entry_valid() function, and stops list updates when corruption is found. Refactored from same hardening in PaX and Grsecurity. Signed-off-by: Kees Cook Acked-by: Steven Rostedt Signed-off-by: Paul E. McKenney Acked-by: Rik van Riel --- include/linux/list.h | 15 ++++++++----- lib/list_debug.c | 53 +++++++++++++++++++------------------------- 2 files changed, 32 insertions(+), 36 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index b6da9b1dce4d..d1039ecaf94f 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -32,6 +32,7 @@ static inline void INIT_LIST_HEAD(struct list_head *list) extern bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next); +extern bool __list_del_entry_valid(struct list_head *entry); #else static inline bool __list_add_valid(struct list_head *new, struct list_head *prev, @@ -39,6 +40,10 @@ static inline bool __list_add_valid(struct list_head *new, { return true; } +static inline bool __list_del_entry_valid(struct list_head *entry) +{ + return true; +} #endif /* @@ -106,22 +111,20 @@ static inline void __list_del(struct list_head * prev, struct list_head * next) * Note: list_empty() on entry does not return true after this, the entry is * in an undefined state. */ -#ifndef CONFIG_DEBUG_LIST static inline void __list_del_entry(struct list_head *entry) { + if (!__list_del_entry_valid(entry)) + return; + __list_del(entry->prev, entry->next); } static inline void list_del(struct list_head *entry) { - __list_del(entry->prev, entry->next); + __list_del_entry(entry); entry->next = LIST_POISON1; entry->prev = LIST_POISON2; } -#else -extern void __list_del_entry(struct list_head *entry); -extern void list_del(struct list_head *entry); -#endif /** * list_replace - replace old entry by new one diff --git a/lib/list_debug.c b/lib/list_debug.c index d0b89b9d0736..276565fca2a6 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -39,41 +39,34 @@ bool __list_add_valid(struct list_head *new, struct list_head *prev, } EXPORT_SYMBOL(__list_add_valid); -void __list_del_entry(struct list_head *entry) +bool __list_del_entry_valid(struct list_head *entry) { struct list_head *prev, *next; prev = entry->prev; next = entry->next; - if (WARN(next == LIST_POISON1, - "list_del corruption, %p->next is LIST_POISON1 (%p)\n", - entry, LIST_POISON1) || - WARN(prev == LIST_POISON2, - "list_del corruption, %p->prev is LIST_POISON2 (%p)\n", - entry, LIST_POISON2) || - WARN(prev->next != entry, - "list_del corruption. prev->next should be %p, " - "but was %p\n", entry, prev->next) || - WARN(next->prev != entry, - "list_del corruption. next->prev should be %p, " - "but was %p\n", entry, next->prev)) - return; + if (unlikely(next == LIST_POISON1)) { + WARN(1, "list_del corruption, %p->next is LIST_POISON1 (%p)\n", + entry, LIST_POISON1); + return false; + } + if (unlikely(prev == LIST_POISON2)) { + WARN(1, "list_del corruption, %p->prev is LIST_POISON2 (%p)\n", + entry, LIST_POISON2); + return false; + } + if (unlikely(prev->next != entry)) { + WARN(1, "list_del corruption. prev->next should be %p, but was %p\n", + entry, prev->next); + return false; + } + if (unlikely(next->prev != entry)) { + WARN(1, "list_del corruption. next->prev should be %p, but was %p\n", + entry, next->prev); + return false; + } + return true; - __list_del(prev, next); } -EXPORT_SYMBOL(__list_del_entry); - -/** - * list_del - deletes entry from list. - * @entry: the element to delete from the list. - * Note: list_empty on entry does not return true after this, the entry is - * in an undefined state. - */ -void list_del(struct list_head *entry) -{ - __list_del_entry(entry); - entry->next = LIST_POISON1; - entry->prev = LIST_POISON2; -} -EXPORT_SYMBOL(list_del); +EXPORT_SYMBOL(__list_del_entry_valid); From de54ebbe26bb371a6f1fbc0593372232f04e3107 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Aug 2016 14:42:11 -0700 Subject: [PATCH 04/18] bug: Provide toggle for BUG on data corruption The kernel checks for cases of data structure corruption under some CONFIGs (e.g. CONFIG_DEBUG_LIST). When corruption is detected, some systems may want to BUG() immediately instead of letting the system run with known corruption. Usually these kinds of manipulation primitives can be used by security flaws to gain arbitrary memory write control. This provides a new config CONFIG_BUG_ON_DATA_CORRUPTION and a corresponding macro CHECK_DATA_CORRUPTION for handling these situations. Notably, even if not BUGing, the kernel should not continue processing the corrupted structure. This is inspired by similar hardening by Syed Rameez Mustafa in MSM kernels, and in PaX and Grsecurity, which is likely in response to earlier removal of the BUG calls in commit 924d9addb9b1 ("list debugging: use WARN() instead of BUG()"). Signed-off-by: Kees Cook Acked-by: Steven Rostedt Signed-off-by: Paul E. McKenney Acked-by: Rik van Riel --- include/linux/bug.h | 17 ++++++++++++++ lib/Kconfig.debug | 10 ++++++++ lib/list_debug.c | 57 +++++++++++++++++---------------------------- 3 files changed, 49 insertions(+), 35 deletions(-) diff --git a/include/linux/bug.h b/include/linux/bug.h index 292d6a10b0c2..baff2e8fc8a8 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -121,4 +121,21 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr, } #endif /* CONFIG_GENERIC_BUG */ + +/* + * Since detected data corruption should stop operation on the affected + * structures, this returns false if the corruption condition is found. + */ +#define CHECK_DATA_CORRUPTION(condition, fmt, ...) \ + do { \ + if (unlikely(condition)) { \ + if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \ + pr_err(fmt, ##__VA_ARGS__); \ + BUG(); \ + } else \ + WARN(1, fmt, ##__VA_ARGS__); \ + return false; \ + } \ + } while (0) + #endif /* _LINUX_BUG_H */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 33bc56cf60d7..07a6fac930c5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1960,6 +1960,16 @@ config TEST_STATIC_KEYS If unsure, say N. +config BUG_ON_DATA_CORRUPTION + bool "Trigger a BUG when data corruption is detected" + select CONFIG_DEBUG_LIST + help + Select this option if the kernel should BUG when it encounters + data corruption in kernel memory structures when they get checked + for validity. + + If unsure, say N. + source "samples/Kconfig" source "lib/Kconfig.kgdb" diff --git a/lib/list_debug.c b/lib/list_debug.c index 276565fca2a6..7f7bfa55eb6d 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -20,21 +20,16 @@ bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { - if (unlikely(next->prev != prev)) { - WARN(1, "list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", - prev, next->prev, next); - return false; - } - if (unlikely(prev->next != next)) { - WARN(1, "list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", - next, prev->next, prev); - return false; - } - if (unlikely(new == prev || new == next)) { - WARN(1, "list_add double add: new=%p, prev=%p, next=%p.\n", - new, prev, next); - return false; - } + CHECK_DATA_CORRUPTION(next->prev != prev, + "list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", + prev, next->prev, next); + CHECK_DATA_CORRUPTION(prev->next != next, + "list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", + next, prev->next, prev); + CHECK_DATA_CORRUPTION(new == prev || new == next, + "list_add double add: new=%p, prev=%p, next=%p.\n", + new, prev, next); + return true; } EXPORT_SYMBOL(__list_add_valid); @@ -46,26 +41,18 @@ bool __list_del_entry_valid(struct list_head *entry) prev = entry->prev; next = entry->next; - if (unlikely(next == LIST_POISON1)) { - WARN(1, "list_del corruption, %p->next is LIST_POISON1 (%p)\n", - entry, LIST_POISON1); - return false; - } - if (unlikely(prev == LIST_POISON2)) { - WARN(1, "list_del corruption, %p->prev is LIST_POISON2 (%p)\n", - entry, LIST_POISON2); - return false; - } - if (unlikely(prev->next != entry)) { - WARN(1, "list_del corruption. prev->next should be %p, but was %p\n", - entry, prev->next); - return false; - } - if (unlikely(next->prev != entry)) { - WARN(1, "list_del corruption. next->prev should be %p, but was %p\n", - entry, next->prev); - return false; - } + CHECK_DATA_CORRUPTION(next == LIST_POISON1, + "list_del corruption, %p->next is LIST_POISON1 (%p)\n", + entry, LIST_POISON1); + CHECK_DATA_CORRUPTION(prev == LIST_POISON2, + "list_del corruption, %p->prev is LIST_POISON2 (%p)\n", + entry, LIST_POISON2); + CHECK_DATA_CORRUPTION(prev->next != entry, + "list_del corruption. prev->next should be %p, but was %p\n", + entry, prev->next); + CHECK_DATA_CORRUPTION(next->prev != entry, + "list_del corruption. next->prev should be %p, but was %p\n", + entry, next->prev); return true; } From 6819d101dd739dd4e8cbe60a98c9ebb224ecc992 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Aug 2016 14:42:12 -0700 Subject: [PATCH 05/18] lkdtm: Add tests for struct list corruption When building under CONFIG_DEBUG_LIST, list addition and removal will be sanity-checked. This validates that the check is working as expected by setting up classic corruption attacks against list manipulations, available with the new lkdtm tests CORRUPT_LIST_ADD and CORRUPT_LIST_DEL. Signed-off-by: Kees Cook Acked-by: Steven Rostedt Signed-off-by: Paul E. McKenney Acked-by: Rik van Riel --- drivers/misc/lkdtm.h | 2 ++ drivers/misc/lkdtm_bugs.c | 68 +++++++++++++++++++++++++++++++++++++++ drivers/misc/lkdtm_core.c | 2 ++ 3 files changed, 72 insertions(+) diff --git a/drivers/misc/lkdtm.h b/drivers/misc/lkdtm.h index fdf954c2107f..cfa1039c62e7 100644 --- a/drivers/misc/lkdtm.h +++ b/drivers/misc/lkdtm.h @@ -21,6 +21,8 @@ void lkdtm_SPINLOCKUP(void); void lkdtm_HUNG_TASK(void); void lkdtm_ATOMIC_UNDERFLOW(void); void lkdtm_ATOMIC_OVERFLOW(void); +void lkdtm_CORRUPT_LIST_ADD(void); +void lkdtm_CORRUPT_LIST_DEL(void); /* lkdtm_heap.c */ void lkdtm_OVERWRITE_ALLOCATION(void); diff --git a/drivers/misc/lkdtm_bugs.c b/drivers/misc/lkdtm_bugs.c index 182ae1894b32..f336206d4b1f 100644 --- a/drivers/misc/lkdtm_bugs.c +++ b/drivers/misc/lkdtm_bugs.c @@ -5,8 +5,13 @@ * test source files. */ #include "lkdtm.h" +#include #include +struct lkdtm_list { + struct list_head node; +}; + /* * Make sure our attempts to over run the kernel stack doesn't trigger * a compiler warning when CONFIG_FRAME_WARN is set. Then make sure we @@ -146,3 +151,66 @@ void lkdtm_ATOMIC_OVERFLOW(void) pr_info("attempting bad atomic overflow\n"); atomic_inc(&over); } + +void lkdtm_CORRUPT_LIST_ADD(void) +{ + /* + * Initially, an empty list via LIST_HEAD: + * test_head.next = &test_head + * test_head.prev = &test_head + */ + LIST_HEAD(test_head); + struct lkdtm_list good, bad; + void *target[2] = { }; + void *redirection = ⌖ + + pr_info("attempting good list addition\n"); + + /* + * Adding to the list performs these actions: + * test_head.next->prev = &good.node + * good.node.next = test_head.next + * good.node.prev = test_head + * test_head.next = good.node + */ + list_add(&good.node, &test_head); + + pr_info("attempting corrupted list addition\n"); + /* + * In simulating this "write what where" primitive, the "what" is + * the address of &bad.node, and the "where" is the address held + * by "redirection". + */ + test_head.next = redirection; + list_add(&bad.node, &test_head); + + if (target[0] == NULL && target[1] == NULL) + pr_err("Overwrite did not happen, but no BUG?!\n"); + else + pr_err("list_add() corruption not detected!\n"); +} + +void lkdtm_CORRUPT_LIST_DEL(void) +{ + LIST_HEAD(test_head); + struct lkdtm_list item; + void *target[2] = { }; + void *redirection = ⌖ + + list_add(&item.node, &test_head); + + pr_info("attempting good list removal\n"); + list_del(&item.node); + + pr_info("attempting corrupted list removal\n"); + list_add(&item.node, &test_head); + + /* As with the list_add() test above, this corrupts "next". */ + item.node.next = redirection; + list_del(&item.node); + + if (target[0] == NULL && target[1] == NULL) + pr_err("Overwrite did not happen, but no BUG?!\n"); + else + pr_err("list_del() corruption not detected!\n"); +} diff --git a/drivers/misc/lkdtm_core.c b/drivers/misc/lkdtm_core.c index f9154b8d67f6..7eeb71a75549 100644 --- a/drivers/misc/lkdtm_core.c +++ b/drivers/misc/lkdtm_core.c @@ -197,6 +197,8 @@ struct crashtype crashtypes[] = { CRASHTYPE(EXCEPTION), CRASHTYPE(LOOP), CRASHTYPE(OVERFLOW), + CRASHTYPE(CORRUPT_LIST_ADD), + CRASHTYPE(CORRUPT_LIST_DEL), CRASHTYPE(CORRUPT_STACK), CRASHTYPE(UNALIGNED_LOAD_STORE_WRITE), CRASHTYPE(OVERWRITE_ALLOCATION), From 91a6cee6e30eda815d657d1c2fd4b869fe69d705 Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Sat, 20 Aug 2016 12:16:10 +0200 Subject: [PATCH 06/18] lib/Kconfig.debug: Fix typo in select statement Commit 484f29c7430b3 ("bug: Provide toggle for BUG on data corruption") added a Kconfig select statement on CONFIG_DEBUG_LIST, but the CONFIG_ prefix is only used in Make and C(PP) syntax. Remove the CONFIG_ prefix to correctly select the Kconfig option DEBUG_LIST. Signed-off-by: Valentin Rothberg Signed-off-by: Paul E. McKenney Acked-by: Kees Cook --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 07a6fac930c5..afa30fd52583 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1962,7 +1962,7 @@ config TEST_STATIC_KEYS config BUG_ON_DATA_CORRUPTION bool "Trigger a BUG when data corruption is detected" - select CONFIG_DEBUG_LIST + select DEBUG_LIST help Select this option if the kernel should BUG when it encounters data corruption in kernel memory structures when they get checked From 4520bcb2ba452863d744337db275c326835fca3f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 26 Aug 2016 17:42:00 +0200 Subject: [PATCH 07/18] bug: Avoid Kconfig warning for BUG_ON_DATA_CORRUPTION The CONFIG_DEBUG_LIST option is normally meant for kernel developers rather than production machines and is guarded by CONFIG_DEBUG_KERNEL. In contrast, the newly added CONFIG_BUG_ON_DATA_CORRUPTION is meant for security hardening and may be used on systems that intentionally do not enable CONFIG_DEBUG_KERNEL. In this configuration, we get a warning from Kconfig about the mismatched dependencies: warning: (BUG_ON_DATA_CORRUPTION) selects DEBUG_LIST which has unmet direct dependencies (DEBUG_KERNEL) This annotates the DEBUG_LIST option to be selectable by BUG_ON_DATA_CORRUPTION when DEBUG_KERNEL is disabled. Signed-off-by: Arnd Bergmann Fixes: 40cd725cfc7f ("bug: Provide toggle for BUG on data corruption") Acked-by: Rik van Riel Signed-off-by: Paul E. McKenney --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index afa30fd52583..0c6366b77062 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1214,7 +1214,7 @@ config DEBUG_BUGVERBOSE config DEBUG_LIST bool "Debug linked list manipulation" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL || BUG_ON_DATA_CORRUPTION help Enable this to turn on extended checks in the linked-list walking routines. From e2c85cb12c86b080ee344928618eb918fa227ac8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Oct 2016 10:33:24 -0700 Subject: [PATCH 08/18] documentation: Present updated RCU guarantee Recent memory-model work deduces the relationships of RCU read-side critical sections and grace periods based on the relationships of accesses within a critical section and accesses preceding and following the grace period. This commit therefore adds this viewpoint. Signed-off-by: Paul E. McKenney --- .../RCU/Design/Requirements/Requirements.html | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index a4d3838130e4..39bcb74ea733 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -547,7 +547,7 @@ The rcu_access_pointer() on line 6 is similar to It could reuse a value formerly fetched from this same pointer. It could also fetch the pointer from gp in a byte-at-a-time manner, resulting in load tearing, in turn resulting a bytewise - mash-up of two distince pointer values. + mash-up of two distinct pointer values. It might even use value-speculation optimizations, where it makes a wrong guess, but by the time it gets around to checking the value, an update has changed the pointer to match the wrong guess. @@ -659,6 +659,29 @@ systems with more than one CPU: In other words, a given instance of synchronize_rcu() can avoid waiting on a given RCU read-side critical section only if it can prove that synchronize_rcu() started first. + +

+ A related question is “When rcu_read_lock() + doesn't generate any code, why does it matter how it relates + to a grace period?” + The answer is that it is not the relationship of + rcu_read_lock() itself that is important, but rather + the relationship of the code within the enclosed RCU read-side + critical section to the code preceding and following the + grace period. + If we take this viewpoint, then a given RCU read-side critical + section begins before a given grace period when some access + preceding the grace period observes the effect of some access + within the critical section, in which case none of the accesses + within the critical section may observe the effects of any + access following the grace period. + +

+ As of late 2016, mathematical models of RCU take this + viewpoint, for example, see slides 62 and 63 + of the + 2016 LinuxCon EU + presentation.   From 8cf503d3378d641f8106c1803dc68cab88cd8fca Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Tue, 18 Oct 2016 00:54:03 -0400 Subject: [PATCH 09/18] Documentation/RCU: Fix minor typo deference should actually be dereference. Signed-off-by: Pranith Kumar Reviewed-by: Josh Triplett Signed-off-by: Paul E. McKenney --- Documentation/RCU/whatisRCU.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 204422719197..5cbd8b2395b8 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -237,7 +237,7 @@ rcu_dereference() The reader uses rcu_dereference() to fetch an RCU-protected pointer, which returns a value that may then be safely - dereferenced. Note that rcu_deference() does not actually + dereferenced. Note that rcu_dereference() does not actually dereference the pointer, instead, it protects the pointer for later dereferencing. It also executes any needed memory-barrier instructions for a given CPU architecture. Currently, only Alpha From b8f2ed538477d9ab803c6458f497df1b1c6cf4ce Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Aug 2016 06:51:47 -0700 Subject: [PATCH 10/18] rcu: Tighten up __call_rcu() rcu_head alignment check Commit 720abae3d68ae ("rcu: force alignment on struct callback_head/rcu_head") forced the rcu_head (AKA callback_head) structure's alignment to pointer size, that is, to 4-byte boundaries on 32-bit systems and to 8-byte boundaries on 64-bit systems. This commit therefore checks for this same alignment in __call_rcu(), which used to contain a looser check for two-byte alignment. Signed-off-by: Paul E. McKenney Tested-by: Geert Uytterhoeven Acked-by: Kirill A. Shutemov Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 69a5611a7e7c..37e4f7d2be0c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3121,7 +3121,9 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, unsigned long flags; struct rcu_data *rdp; - WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */ + /* Misaligned rcu_head! */ + WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); + if (debug_rcu_head_queue(head)) { /* Probable double call_rcu(), so leak the callback. */ WRITE_ONCE(head->func, rcu_leak_callback); From 5403d367a746cfc51676e3b39be600d94f587867 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Sep 2016 05:04:24 -0700 Subject: [PATCH 11/18] rcu: Remove obsolete rcu_check_callbacks() header comment In the deep past, rcu_check_callbacks() was only invoked if rcu_pending() returned true. Which was fine, but these days rcu_check_callbacks() is invoked unconditionally. This commit therefore removes the obsolete sentence from the header comment. Reported-by: Michalis Kokologiannakis Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 37e4f7d2be0c..d52780024f9f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2828,8 +2828,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) * Also schedule RCU core processing. * * This function must be called from hardirq context. It is normally - * invoked from the scheduling-clock interrupt. If rcu_pending returns - * false, there is no point in invoking rcu_check_callbacks(). + * invoked from the scheduling-clock interrupt. */ void rcu_check_callbacks(int user) { From 1f21b50b7785b26aff2a0770d8a674921514ff0c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Sep 2016 05:15:53 -0700 Subject: [PATCH 12/18] rcu: Remove obsolete comment from __call_rcu() The __call_rcu() comment about opportunistically noting grace period beginnings and endings is obsolete. RCU still does such opportunistic noting, but in __call_rcu_core() rather than __call_rcu(), and there already is an appropriate comment in __call_rcu_core(). This commit therefore removes the obsolete comment. Reported-by: Michalis Kokologiannakis Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d52780024f9f..865af187498e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3131,13 +3131,6 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, } head->func = func; head->next = NULL; - - /* - * Opportunistically note grace-period endings and beginnings. - * Note that we might see a beginning right after we see an - * end, but never vice versa, since this CPU has to pass through - * a quiescent state betweentimes. - */ local_irq_save(flags); rdp = this_cpu_ptr(rsp->rda); From f2151a0a3436edbed454f48f9f94321eb70e3d2b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 6 Oct 2016 05:42:39 -0700 Subject: [PATCH 13/18] rcu: RCU_TRACE enables event tracing as well as debugfs The commit brings the RCU_TRACE Kconfig option's help text up to date by noting that it enables additional event tracing as well as debugfs. Signed-off-by: Nikolay Borisov [ paulmck: Do some wordsmithing. ] Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- lib/Kconfig.debug | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 33bc56cf60d7..4a431f18f218 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1430,7 +1430,8 @@ config RCU_TRACE select TRACE_CLOCK help This option provides tracing in RCU which presents stats - in debugfs for debugging RCU implementation. + in debugfs for debugging RCU implementation. It also enables + additional tracepoints for ftrace-style event tracing. Say Y here if you want to enable RCU tracing Say N if you are unsure. From d0af39e89ec59fe7c92c4bcbc2d652ea4c0ee644 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Oct 2016 18:26:04 -0700 Subject: [PATCH 14/18] torture: Trace long read-side delays Although rcutorture will occasionally do a 50-millisecond grace-period delay, these delays are quite rare. And rightly so, because otherwise the read rate would be quite low. Thie means that it can be important to identify whether or not a given run contained a long-delay read. This commit therefore inserts a trace_rcu_torture_read() event to flag runs containing long delays. Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 5 ++++- kernel/rcu/rcutorture.c | 11 ++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index d3e756539d44..9d4f9b3a2b7b 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -698,7 +698,10 @@ TRACE_EVENT(rcu_batch_end, /* * Tracepoint for rcutorture readers. The first argument is the name * of the RCU flavor from rcutorture's viewpoint and the second argument - * is the callback address. + * is the callback address. The third argument is the start time in + * seconds, and the last two arguments are the grace period numbers + * at the beginning and end of the read, respectively. Note that the + * callback address can be NULL. */ TRACE_EVENT(rcu_torture_read, diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bf08fee53dc7..87c51225ceec 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -289,15 +289,24 @@ static int rcu_torture_read_lock(void) __acquires(RCU) static void rcu_read_delay(struct torture_random_state *rrsp) { + unsigned long started; + unsigned long completed; const unsigned long shortdelay_us = 200; const unsigned long longdelay_ms = 50; + unsigned long long ts; /* We want a short delay sometimes to make a reader delay the grace * period, and we want a long delay occasionally to trigger * force_quiescent_state. */ - if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) + if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { + started = cur_ops->completed(); + ts = rcu_trace_clock_local(); mdelay(longdelay_ms); + completed = cur_ops->completed(); + do_trace_rcu_torture_read(cur_ops->name, NULL, ts, + started, completed); + } if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) udelay(shortdelay_us); #ifdef CONFIG_PREEMPT From 0742ac3e2f9f4b8a3a394a270d8685078837662b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Oct 2016 06:09:59 -0700 Subject: [PATCH 15/18] rcu: Make expedited grace periods recheck dyntick idle state Expedited grace periods check dyntick-idle state, and avoid sending IPIs to idle CPUs, including those running guest OSes, and, on NOHZ_FULL kernels, nohz_full CPUs. However, the kernel has been observed checking a CPU while it was non-idle, but sending the IPI after it has gone idle. This commit therefore rechecks idle state immediately before sending the IPI, refraining from IPIing CPUs that have since gone idle. Reported-by: Rik van Riel Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 + kernel/rcu/tree_exp.h | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e99a5234d9ed..fe98dd24adf8 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -404,6 +404,7 @@ struct rcu_data { atomic_long_t exp_workdone1; /* # done by others #1. */ atomic_long_t exp_workdone2; /* # done by others #2. */ atomic_long_t exp_workdone3; /* # done by others #3. */ + int exp_dynticks_snap; /* Double-check need for IPI. */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 24343eb87b58..d3053e99fdb6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -358,8 +358,10 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + rdp->exp_dynticks_snap = + atomic_add_return(0, &rdtp->dynticks); if (raw_smp_processor_id() == cpu || - !(atomic_add_return(0, &rdtp->dynticks) & 0x1) || + !(rdp->exp_dynticks_snap & 0x1) || !(rnp->qsmaskinitnext & rdp->grpmask)) mask_ofl_test |= rdp->grpmask; } @@ -377,9 +379,17 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, /* IPI the remaining CPUs for expedited quiescent state. */ for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long mask = leaf_node_cpu_bit(rnp, cpu); + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + if (!(mask_ofl_ipi & mask)) continue; retry_ipi: + if (atomic_add_return(0, &rdtp->dynticks) != + rdp->exp_dynticks_snap) { + mask_ofl_test |= mask; + continue; + } ret = smp_call_function_single(cpu, func, rsp, 0); if (!ret) { mask_ofl_ipi &= ~mask; From aa3e0bf1aa76579c6c51a2c1116a630aa716379d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Mar 2016 10:43:23 -0700 Subject: [PATCH 16/18] rcu: Don't kick unless grace period or request The current code can result in spurious kicks when there are no grace periods in progress and no grace-period-related requests. This is sort of OK for a diagnostic aid, but the resulting ftrace-dump messages in dmesg are annoying. This commit therefore avoids spurious kicks in the common case. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 865af187498e..96c52e43f7ca 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1304,7 +1304,8 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp) if (!rcu_kick_kthreads) return; j = READ_ONCE(rsp->jiffies_kick_kthreads); - if (time_after(jiffies, j) && rsp->gp_kthread) { + if (time_after(jiffies, j) && rsp->gp_kthread && + (rcu_gp_in_progress(rsp) || READ_ONCE(rsp->gp_flags))) { WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name); rcu_ftrace_dump(DUMP_ALL); wake_up_process(rsp->gp_kthread); From 31f19ed4f41f0717b9604d6aa6f6015355ebae2d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 13 Sep 2016 08:05:14 -0700 Subject: [PATCH 17/18] torture: Remove obsolete files from rcutorture .gitignore Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- tools/testing/selftests/rcutorture/.gitignore | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/.gitignore b/tools/testing/selftests/rcutorture/.gitignore index 05838f6f2ebe..ccc240275d1c 100644 --- a/tools/testing/selftests/rcutorture/.gitignore +++ b/tools/testing/selftests/rcutorture/.gitignore @@ -1,6 +1,4 @@ initrd -linux-2.6 b[0-9]* -rcu-test-image res *.swp From 1f32ee65864830e326aae5890780fa630423a8e9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 27 Sep 2016 11:56:12 -0700 Subject: [PATCH 18/18] torture: Prevent jitter from delaying build-only runs Currently, if the --jitter flag specifies jitter for a --build-only run, the system will obediently build a kernel, refuse to launch it, launch the requested number of jitter processes, and wait for the specified kernel run time, which defaults to 30 minutes. This is of course quite pointless. This commit therefore disables jitter on build-only runs. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- tools/testing/selftests/rcutorture/bin/kvm.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 0aed965f0062..3b3c1b693ee1 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -303,6 +303,7 @@ then fi ___EOF___ awk < $T/cfgcpu.pack \ + -v TORTURE_BUILDONLY="$TORTURE_BUILDONLY" \ -v CONFIGDIR="$CONFIGFRAG/" \ -v KVM="$KVM" \ -v ncpus=$cpus \ @@ -375,6 +376,10 @@ function dump(first, pastlast, batchnum) njitter = ncpus; else njitter = ja[1]; + if (TORTURE_BUILDONLY && njitter != 0) { + njitter = 0; + print "echo Build-only run, so suppressing jitter >> " rd "/log" + } for (j = 0; j < njitter; j++) print "jitter.sh " j " " dur " " ja[2] " " ja[3] "&" print "wait"