From 9dd1220114e00d8ec5cdc20085bbe198b21e1985 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Mon, 11 Nov 2013 09:08:15 -0500 Subject: [PATCH 01/34] smp/cpumask: Make CONFIG_CPUMASK_OFFSTACK=y usable without debug dependency When CONFIG_CPUMASK_OFFSTACK was added in 2008, it was dependent upon CONFIG_DEBUG_PER_CPU_MAPS being enabled, or an architecture could select it. The debug dependency adds additional overhead that isn't required for operation of the feature and which is undesirable for distro kernels. CONFIG_CPUMASK_OFFSTACK=y is needed to increase the CONFIG_NR_CPUS value beyond 512 on x86. So drop the current dependency, its only real dependency is CONFIG_SMP=y. Signed-off-by: Josh Boyer Cc: Rusty Russell Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20131111140815.GB20328@hansolo.jdub.homelinux.org Signed-off-by: Ingo Molnar --- lib/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/Kconfig b/lib/Kconfig index b3c8be0da17f..50b47cde8b49 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -342,7 +342,8 @@ config CHECK_SIGNATURE bool config CPUMASK_OFFSTACK - bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS + bool "Force CPU masks off stack" + depends on SMP help Use dynamic allocation for cpumask_var_t, instead of putting them on the stack. This is a bit more expensive, but avoids From f1a83e652bedef88d6d77d3dc58250e08e7062bd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Nov 2013 16:42:47 +0100 Subject: [PATCH 02/34] lockdep: Correctly annotate hardirq context in irq_exit() There was a reported deadlock on -rt which lockdep didn't report. It turns out that in irq_exit() we tell lockdep that the hardirq context ends and then do all kinds of locking afterwards. To fix it, move trace_hardirq_exit() to the very end of irq_exit(), this ensures all locking in tick_irq_exit() and rcu_irq_exit() are properly recorded as happening from hardirq context. This however leads to the 'fun' little problem of running softirqs while in hardirq context. To cure this make the softirq code a little more complex (in the CONFIG_TRACE_IRQFLAGS case). Due to stack swizzling arch dependent trickery we cannot pass an argument to __do_softirq() to tell it if it was done from hardirq context or not; so use a side-band argument. When we do __do_softirq() from hardirq context, 'atomically' flip to softirq context and back, so that no locking goes without being in either hard- or soft-irq context. I didn't find any new problems in mainline using this patch, but it did show the -rt problem. Reported-by: Sebastian Andrzej Siewior Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-dgwc5cdksbn0jk09vbmcc9sa@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/softirq.c | 54 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/kernel/softirq.c b/kernel/softirq.c index b24988353458..eb0acf44b063 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -213,14 +213,52 @@ EXPORT_SYMBOL(local_bh_enable_ip); #define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) #define MAX_SOFTIRQ_RESTART 10 +#ifdef CONFIG_TRACE_IRQFLAGS +/* + * Convoluted means of passing __do_softirq() a message through the various + * architecture execute_on_stack() bits. + * + * When we run softirqs from irq_exit() and thus on the hardirq stack we need + * to keep the lockdep irq context tracking as tight as possible in order to + * not miss-qualify lock contexts and miss possible deadlocks. + */ +static DEFINE_PER_CPU(int, softirq_from_hardirq); + +static inline void lockdep_softirq_from_hardirq(void) +{ + this_cpu_write(softirq_from_hardirq, 1); +} + +static inline void lockdep_softirq_start(void) +{ + if (this_cpu_read(softirq_from_hardirq)) + trace_hardirq_exit(); + lockdep_softirq_enter(); +} + +static inline void lockdep_softirq_end(void) +{ + lockdep_softirq_exit(); + if (this_cpu_read(softirq_from_hardirq)) { + this_cpu_write(softirq_from_hardirq, 0); + trace_hardirq_enter(); + } +} + +#else +static inline void lockdep_softirq_from_hardirq(void) { } +static inline void lockdep_softirq_start(void) { } +static inline void lockdep_softirq_end(void) { } +#endif + asmlinkage void __do_softirq(void) { - struct softirq_action *h; - __u32 pending; unsigned long end = jiffies + MAX_SOFTIRQ_TIME; - int cpu; unsigned long old_flags = current->flags; int max_restart = MAX_SOFTIRQ_RESTART; + struct softirq_action *h; + __u32 pending; + int cpu; /* * Mask out PF_MEMALLOC s current task context is borrowed for the @@ -233,7 +271,7 @@ asmlinkage void __do_softirq(void) account_irq_enter_time(current); __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET); - lockdep_softirq_enter(); + lockdep_softirq_start(); cpu = smp_processor_id(); restart: @@ -280,16 +318,13 @@ restart: wakeup_softirqd(); } - lockdep_softirq_exit(); - + lockdep_softirq_end(); account_irq_exit_time(current); __local_bh_enable(SOFTIRQ_OFFSET); WARN_ON_ONCE(in_interrupt()); tsk_restore_flags(current, old_flags, PF_MEMALLOC); } - - asmlinkage void do_softirq(void) { __u32 pending; @@ -332,6 +367,7 @@ void irq_enter(void) static inline void invoke_softirq(void) { if (!force_irqthreads) { + lockdep_softirq_from_hardirq(); #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK /* * We can safely execute softirq on the current stack if @@ -377,13 +413,13 @@ void irq_exit(void) #endif account_irq_exit_time(current); - trace_hardirq_exit(); preempt_count_sub(HARDIRQ_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); tick_irq_exit(); rcu_irq_exit(); + trace_hardirq_exit(); /* must be last! */ } /* From e0edc78f25c020dea66742c05a7fbcb2ff3df629 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 22 Nov 2013 11:24:53 +0100 Subject: [PATCH 03/34] Documentation/memory-barriers.txt: Fix a typo in the data dependency description This typo has been there forever, it is 7.5 years old, looks like this section of our memory ordering documentation is a place where most eyes are glazed over already ;-) [ Also fix some stray spaces and stray tabs while at it, shrinking the file by 49 bytes. Visual output unchanged. ] Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Cc: Paul E. McKenney Link: http://lkml.kernel.org/n/tip-gncea9cb8igosblizfqMXrie@git.kernel.org Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 42 +++++++++++++++---------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index c8c42e64e953..020cccdbdd0c 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -500,7 +500,7 @@ odd-numbered bank is idle, one can see the new value of the pointer P (&B), but the old value of the variable B (2). -Another example of where data dependency barriers might by required is where a +Another example of where data dependency barriers might be required is where a number is read from memory and then used to calculate the index for an array access: @@ -882,12 +882,12 @@ cache it for later use. Consider: - CPU 1 CPU 2 + CPU 1 CPU 2 ======================= ======================= - LOAD B - DIVIDE } Divide instructions generally - DIVIDE } take a long time to perform - LOAD A + LOAD B + DIVIDE } Divide instructions generally + DIVIDE } take a long time to perform + LOAD A Which might appear as this: @@ -910,13 +910,13 @@ Which might appear as this: Placing a read barrier or a data dependency barrier just before the second load: - CPU 1 CPU 2 + CPU 1 CPU 2 ======================= ======================= - LOAD B - DIVIDE - DIVIDE + LOAD B + DIVIDE + DIVIDE - LOAD A + LOAD A will force any value speculatively obtained to be reconsidered to an extent dependent on the type of barrier used. If there was no change made to the @@ -1887,8 +1887,8 @@ functions: space should suffice for PCI. [*] NOTE! attempting to load from the same location as was written to may - cause a malfunction - consider the 16550 Rx/Tx serial registers for - example. + cause a malfunction - consider the 16550 Rx/Tx serial registers for + example. Used with prefetchable I/O memory, an mmiowb() barrier may be required to force stores to be ordered. @@ -1955,19 +1955,19 @@ barriers for the most part act at the interface between the CPU and its cache : +--------+ +--------+ : +--------+ +-----------+ | | | | : | | | | +--------+ - | CPU | | Memory | : | CPU | | | | | - | Core |--->| Access |----->| Cache |<-->| | | | + | CPU | | Memory | : | CPU | | | | | + | Core |--->| Access |----->| Cache |<-->| | | | | | | Queue | : | | | |--->| Memory | - | | | | : | | | | | | - +--------+ +--------+ : +--------+ | | | | + | | | | : | | | | | | + +--------+ +--------+ : +--------+ | | | | : | Cache | +--------+ : | Coherency | : | Mechanism | +--------+ +--------+ +--------+ : +--------+ | | | | | | | | : | | | | | | | CPU | | Memory | : | CPU | | |--->| Device | - | Core |--->| Access |----->| Cache |<-->| | | | - | | | Queue | : | | | | | | + | Core |--->| Access |----->| Cache |<-->| | | | + | | | Queue | : | | | | | | | | | | : | | | | +--------+ +--------+ +--------+ : +--------+ +-----------+ : @@ -2090,7 +2090,7 @@ CPU's caches by some other cache event: p = &v; q = p; - + x = *q; Reads from v before v updated in cache @@ -2115,7 +2115,7 @@ queue before processing any further requests: p = &v; q = p; - + smp_read_barrier_depends() From 5c4853b60ca8ec3d989ce05a5e995d15c3ed52c0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 20 Nov 2013 01:07:34 +0100 Subject: [PATCH 04/34] lockdep: Simplify a bit hardirq <-> softirq transitions Instead of saving the hardirq state on a per CPU variable, which require an explicit call before the softirq handling and some complication, just save and restore the hardirq tracing state through functions return values and parameters. It simplifies a bit the black magic that works around the fact that softirqs can be called from hardirqs while hardirqs can nest on softirqs but those two cases have very different semantics and only the latter case assume both states. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Linus Torvalds Cc: Andrew Morton Cc: Paul E. McKenney Link: http://lkml.kernel.org/r/1384906054-30676-1-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/softirq.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/kernel/softirq.c b/kernel/softirq.c index f84aa48c0e66..9a4500e4c189 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -213,40 +213,35 @@ EXPORT_SYMBOL(local_bh_enable_ip); #ifdef CONFIG_TRACE_IRQFLAGS /* - * Convoluted means of passing __do_softirq() a message through the various - * architecture execute_on_stack() bits. - * * When we run softirqs from irq_exit() and thus on the hardirq stack we need * to keep the lockdep irq context tracking as tight as possible in order to * not miss-qualify lock contexts and miss possible deadlocks. */ -static DEFINE_PER_CPU(int, softirq_from_hardirq); -static inline void lockdep_softirq_from_hardirq(void) +static inline bool lockdep_softirq_start(void) { - this_cpu_write(softirq_from_hardirq, 1); -} + bool in_hardirq = false; -static inline void lockdep_softirq_start(void) -{ - if (this_cpu_read(softirq_from_hardirq)) + if (trace_hardirq_context(current)) { + in_hardirq = true; trace_hardirq_exit(); + } + lockdep_softirq_enter(); + + return in_hardirq; } -static inline void lockdep_softirq_end(void) +static inline void lockdep_softirq_end(bool in_hardirq) { lockdep_softirq_exit(); - if (this_cpu_read(softirq_from_hardirq)) { - this_cpu_write(softirq_from_hardirq, 0); - trace_hardirq_enter(); - } -} + if (in_hardirq) + trace_hardirq_enter(); +} #else -static inline void lockdep_softirq_from_hardirq(void) { } -static inline void lockdep_softirq_start(void) { } -static inline void lockdep_softirq_end(void) { } +static inline bool lockdep_softirq_start(void) { return false; } +static inline void lockdep_softirq_end(bool in_hardirq) { } #endif asmlinkage void __do_softirq(void) @@ -255,6 +250,7 @@ asmlinkage void __do_softirq(void) unsigned long old_flags = current->flags; int max_restart = MAX_SOFTIRQ_RESTART; struct softirq_action *h; + bool in_hardirq; __u32 pending; int cpu; @@ -269,7 +265,7 @@ asmlinkage void __do_softirq(void) account_irq_enter_time(current); __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET); - lockdep_softirq_start(); + in_hardirq = lockdep_softirq_start(); cpu = smp_processor_id(); restart: @@ -316,7 +312,7 @@ restart: wakeup_softirqd(); } - lockdep_softirq_end(); + lockdep_softirq_end(in_hardirq); account_irq_exit_time(current); __local_bh_enable(SOFTIRQ_OFFSET); WARN_ON_ONCE(in_interrupt()); @@ -365,7 +361,6 @@ void irq_enter(void) static inline void invoke_softirq(void) { if (!force_irqthreads) { - lockdep_softirq_from_hardirq(); #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK /* * We can safely execute softirq on the current stack if From 8dce7a9a6f4ca7163161a80a4603b66c88c5de8e Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 13 Jun 2013 18:41:16 -0400 Subject: [PATCH 05/34] lockdep: Be nice about building from userspace Lockdep is an awesome piece of code which detects locking issues which are relevant both to userspace and kernelspace. We can easily make lockdep work in userspace since there is really no kernel spacific magic going on in the code. All we need is to wrap two functions which are used by lockdep and are very kernel specific. Doing that will allow tools located in tools/ to easily utilize lockdep's code for their own use. Signed-off-by: Sasha Levin Signed-off-by: Peter Zijlstra Cc: penberg@kernel.org Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/1352753446-24109-1-git-send-email-sasha.levin@oracle.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 576ba756a32d..eb8a54783fa0 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -590,6 +590,7 @@ static int very_verbose(struct lock_class *class) /* * Is this the address of a static object: */ +#ifdef __KERNEL__ static int static_obj(void *obj) { unsigned long start = (unsigned long) &_stext, @@ -616,6 +617,7 @@ static int static_obj(void *obj) */ return is_module_address(addr) || is_module_percpu_address(addr); } +#endif /* * To make lock name printouts unique, we calculate a unique @@ -4115,6 +4117,7 @@ void debug_check_no_locks_held(void) } EXPORT_SYMBOL_GPL(debug_check_no_locks_held); +#ifdef __KERNEL__ void debug_show_all_locks(void) { struct task_struct *g, *p; @@ -4172,6 +4175,7 @@ retry: read_unlock(&tasklist_lock); } EXPORT_SYMBOL_GPL(debug_show_all_locks); +#endif /* * Careful: only use this function if you are sure that From 5634bd7d2ab14fbf736b62b0788fb68e2cb0fde2 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 13 Jun 2013 18:41:17 -0400 Subject: [PATCH 06/34] liblockdep: Wrap kernel/locking/lockdep.c to allow usage from userspace kernel/locking/lockdep.c deals with validating locking scenarios for various architectures supported by the kernel. There isn't anything kernel specific going on in lockdep, and when we compare userspace to other architectures that don't have to deal with irqs such as s390, they become all too similar. We wrap kernel/locking/lockdep.c and include/linux/lockdep.h with several headers which allow us to build and use lockdep from userspace. We don't touch the kernel code itself which means that any work done on lockdep in the kernel will automatically benefit userspace lockdep as well! Signed-off-by: Sasha Levin Signed-off-by: Peter Zijlstra Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/1371163284-6346-3-git-send-email-sasha.levin@oracle.com Signed-off-by: Ingo Molnar --- tools/lib/lockdep/Makefile | 251 ++++++++++++++++++ tools/lib/lockdep/common.c | 33 +++ tools/lib/lockdep/lockdep.c | 2 + tools/lib/lockdep/lockdep_internals.h | 1 + tools/lib/lockdep/lockdep_states.h | 1 + tools/lib/lockdep/rbtree.c | 1 + tools/lib/lockdep/uinclude/asm/hweight.h | 3 + tools/lib/lockdep/uinclude/asm/sections.h | 3 + tools/lib/lockdep/uinclude/linux/bitops.h | 3 + tools/lib/lockdep/uinclude/linux/compiler.h | 7 + .../lib/lockdep/uinclude/linux/debug_locks.h | 12 + tools/lib/lockdep/uinclude/linux/delay.h | 3 + tools/lib/lockdep/uinclude/linux/export.h | 7 + tools/lib/lockdep/uinclude/linux/ftrace.h | 3 + tools/lib/lockdep/uinclude/linux/gfp.h | 3 + tools/lib/lockdep/uinclude/linux/hardirq.h | 11 + tools/lib/lockdep/uinclude/linux/hash.h | 1 + tools/lib/lockdep/uinclude/linux/interrupt.h | 3 + tools/lib/lockdep/uinclude/linux/irqflags.h | 38 +++ tools/lib/lockdep/uinclude/linux/kallsyms.h | 32 +++ .../lib/lockdep/uinclude/linux/kern_levels.h | 25 ++ tools/lib/lockdep/uinclude/linux/kernel.h | 44 +++ tools/lib/lockdep/uinclude/linux/kmemcheck.h | 8 + tools/lib/lockdep/uinclude/linux/linkage.h | 3 + tools/lib/lockdep/uinclude/linux/list.h | 1 + tools/lib/lockdep/uinclude/linux/lockdep.h | 55 ++++ tools/lib/lockdep/uinclude/linux/module.h | 6 + tools/lib/lockdep/uinclude/linux/mutex.h | 3 + tools/lib/lockdep/uinclude/linux/poison.h | 1 + tools/lib/lockdep/uinclude/linux/prefetch.h | 6 + tools/lib/lockdep/uinclude/linux/proc_fs.h | 3 + tools/lib/lockdep/uinclude/linux/rbtree.h | 1 + .../lockdep/uinclude/linux/rbtree_augmented.h | 2 + tools/lib/lockdep/uinclude/linux/rcu.h | 16 ++ tools/lib/lockdep/uinclude/linux/seq_file.h | 3 + tools/lib/lockdep/uinclude/linux/spinlock.h | 25 ++ tools/lib/lockdep/uinclude/linux/stacktrace.h | 32 +++ tools/lib/lockdep/uinclude/linux/stringify.h | 7 + tools/lib/lockdep/uinclude/linux/types.h | 58 ++++ .../lib/lockdep/uinclude/trace/events/lock.h | 3 + 40 files changed, 720 insertions(+) create mode 100644 tools/lib/lockdep/Makefile create mode 100644 tools/lib/lockdep/common.c create mode 100644 tools/lib/lockdep/lockdep.c create mode 100644 tools/lib/lockdep/lockdep_internals.h create mode 100644 tools/lib/lockdep/lockdep_states.h create mode 100644 tools/lib/lockdep/rbtree.c create mode 100644 tools/lib/lockdep/uinclude/asm/hweight.h create mode 100644 tools/lib/lockdep/uinclude/asm/sections.h create mode 100644 tools/lib/lockdep/uinclude/linux/bitops.h create mode 100644 tools/lib/lockdep/uinclude/linux/compiler.h create mode 100644 tools/lib/lockdep/uinclude/linux/debug_locks.h create mode 100644 tools/lib/lockdep/uinclude/linux/delay.h create mode 100644 tools/lib/lockdep/uinclude/linux/export.h create mode 100644 tools/lib/lockdep/uinclude/linux/ftrace.h create mode 100644 tools/lib/lockdep/uinclude/linux/gfp.h create mode 100644 tools/lib/lockdep/uinclude/linux/hardirq.h create mode 100644 tools/lib/lockdep/uinclude/linux/hash.h create mode 100644 tools/lib/lockdep/uinclude/linux/interrupt.h create mode 100644 tools/lib/lockdep/uinclude/linux/irqflags.h create mode 100644 tools/lib/lockdep/uinclude/linux/kallsyms.h create mode 100644 tools/lib/lockdep/uinclude/linux/kern_levels.h create mode 100644 tools/lib/lockdep/uinclude/linux/kernel.h create mode 100644 tools/lib/lockdep/uinclude/linux/kmemcheck.h create mode 100644 tools/lib/lockdep/uinclude/linux/linkage.h create mode 100644 tools/lib/lockdep/uinclude/linux/list.h create mode 100644 tools/lib/lockdep/uinclude/linux/lockdep.h create mode 100644 tools/lib/lockdep/uinclude/linux/module.h create mode 100644 tools/lib/lockdep/uinclude/linux/mutex.h create mode 100644 tools/lib/lockdep/uinclude/linux/poison.h create mode 100644 tools/lib/lockdep/uinclude/linux/prefetch.h create mode 100644 tools/lib/lockdep/uinclude/linux/proc_fs.h create mode 100644 tools/lib/lockdep/uinclude/linux/rbtree.h create mode 100644 tools/lib/lockdep/uinclude/linux/rbtree_augmented.h create mode 100644 tools/lib/lockdep/uinclude/linux/rcu.h create mode 100644 tools/lib/lockdep/uinclude/linux/seq_file.h create mode 100644 tools/lib/lockdep/uinclude/linux/spinlock.h create mode 100644 tools/lib/lockdep/uinclude/linux/stacktrace.h create mode 100644 tools/lib/lockdep/uinclude/linux/stringify.h create mode 100644 tools/lib/lockdep/uinclude/linux/types.h create mode 100644 tools/lib/lockdep/uinclude/trace/events/lock.h diff --git a/tools/lib/lockdep/Makefile b/tools/lib/lockdep/Makefile new file mode 100644 index 000000000000..da8b7aa3d351 --- /dev/null +++ b/tools/lib/lockdep/Makefile @@ -0,0 +1,251 @@ +# liblockdep version +LL_VERSION = 0 +LL_PATCHLEVEL = 0 +LL_EXTRAVERSION = 1 + +# file format version +FILE_VERSION = 1 + +MAKEFLAGS += --no-print-directory + + +# Makefiles suck: This macro sets a default value of $(2) for the +# variable named by $(1), unless the variable has been set by +# environment or command line. This is necessary for CC and AR +# because make sets default values, so the simpler ?= approach +# won't work as expected. +define allow-override + $(if $(or $(findstring environment,$(origin $(1))),\ + $(findstring command line,$(origin $(1)))),,\ + $(eval $(1) = $(2))) +endef + +# Allow setting CC and AR, or setting CROSS_COMPILE as a prefix. +$(call allow-override,CC,$(CROSS_COMPILE)gcc) +$(call allow-override,AR,$(CROSS_COMPILE)ar) + +INSTALL = install + +# Use DESTDIR for installing into a different root directory. +# This is useful for building a package. The program will be +# installed in this directory as if it was the root directory. +# Then the build tool can move it later. +DESTDIR ?= +DESTDIR_SQ = '$(subst ','\'',$(DESTDIR))' + +prefix ?= /usr/local +libdir_relative = lib +libdir = $(prefix)/$(libdir_relative) +bindir_relative = bin +bindir = $(prefix)/$(bindir_relative) + +export DESTDIR DESTDIR_SQ INSTALL + +# copy a bit from Linux kbuild + +ifeq ("$(origin V)", "command line") + VERBOSE = $(V) +endif +ifndef VERBOSE + VERBOSE = 0 +endif + +ifeq ("$(origin O)", "command line") + BUILD_OUTPUT := $(O) +endif + +ifeq ($(BUILD_SRC),) +ifneq ($(BUILD_OUTPUT),) + +define build_output + $(if $(VERBOSE:1=),@)$(MAKE) -C $(BUILD_OUTPUT) \ + BUILD_SRC=$(CURDIR) -f $(CURDIR)/Makefile $1 +endef + +saved-output := $(BUILD_OUTPUT) +BUILD_OUTPUT := $(shell cd $(BUILD_OUTPUT) && /bin/pwd) +$(if $(BUILD_OUTPUT),, \ + $(error output directory "$(saved-output)" does not exist)) + +all: sub-make + +gui: force + $(call build_output, all_cmd) + +$(filter-out gui,$(MAKECMDGOALS)): sub-make + +sub-make: force + $(call build_output, $(MAKECMDGOALS)) + + +# Leave processing to above invocation of make +skip-makefile := 1 + +endif # BUILD_OUTPUT +endif # BUILD_SRC + +# We process the rest of the Makefile if this is the final invocation of make +ifeq ($(skip-makefile),) + +srctree := $(if $(BUILD_SRC),$(BUILD_SRC),$(CURDIR)) +objtree := $(CURDIR) +src := $(srctree) +obj := $(objtree) + +export prefix libdir bindir src obj + +# Shell quotes +libdir_SQ = $(subst ','\'',$(libdir)) +bindir_SQ = $(subst ','\'',$(bindir)) + +LIB_FILE = liblockdep.a liblockdep.so +BIN_FILE = lockdep + +CONFIG_INCLUDES = +CONFIG_LIBS = +CONFIG_FLAGS = + +OBJ = $@ +N = + +export Q VERBOSE + +LIBLOCKDEP_VERSION = $(LL_VERSION).$(LL_PATCHLEVEL).$(LL_EXTRAVERSION) + +INCLUDES = -I. -I/usr/local/include -I./uinclude $(CONFIG_INCLUDES) + +# Set compile option CFLAGS if not set elsewhere +CFLAGS ?= -g -DCONFIG_LOCKDEP -DCONFIG_STACKTRACE -DCONFIG_PROVE_LOCKING -DBITS_PER_LONG=__WORDSIZE -DLIBLOCKDEP_VERSION='"$(LIBLOCKDEP_VERSION)"' -rdynamic -O0 -g + +override CFLAGS += $(CONFIG_FLAGS) $(INCLUDES) $(PLUGIN_DIR_SQ) + +ifeq ($(VERBOSE),1) + Q = + print_compile = + print_app_build = + print_fpic_compile = + print_shared_lib_compile = + print_install = +else + Q = @ + print_compile = echo ' CC '$(OBJ); + print_app_build = echo ' BUILD '$(OBJ); + print_fpic_compile = echo ' CC FPIC '$(OBJ); + print_shared_lib_compile = echo ' BUILD SHARED LIB '$(OBJ); + print_static_lib_build = echo ' BUILD STATIC LIB '$(OBJ); + print_install = echo ' INSTALL '$1' to $(DESTDIR_SQ)$2'; +endif + +do_fpic_compile = \ + ($(print_fpic_compile) \ + $(CC) -c $(CFLAGS) $(EXT) -fPIC $< -o $@) + +do_app_build = \ + ($(print_app_build) \ + $(CC) $^ -rdynamic -o $@ $(CONFIG_LIBS) $(LIBS)) + +do_compile_shared_library = \ + ($(print_shared_lib_compile) \ + $(CC) --shared $^ -o $@ -lpthread -ldl) + +do_build_static_lib = \ + ($(print_static_lib_build) \ + $(RM) $@; $(AR) rcs $@ $^) + + +define do_compile + $(print_compile) \ + $(CC) -c $(CFLAGS) $(EXT) $< -o $(obj)/$@; +endef + +$(obj)/%.o: $(src)/%.c + $(Q)$(call do_compile) + +%.o: $(src)/%.c + $(Q)$(call do_compile) + +PEVENT_LIB_OBJS = common.o lockdep.o preload.o rbtree.o + +ALL_OBJS = $(PEVENT_LIB_OBJS) + +CMD_TARGETS = $(LIB_FILE) + +TARGETS = $(CMD_TARGETS) + + +all: all_cmd + +all_cmd: $(CMD_TARGETS) + +liblockdep.so: $(PEVENT_LIB_OBJS) + $(Q)$(do_compile_shared_library) + +liblockdep.a: $(PEVENT_LIB_OBJS) + $(Q)$(do_build_static_lib) + +$(PEVENT_LIB_OBJS): %.o: $(src)/%.c + $(Q)$(do_fpic_compile) + +## make deps + +all_objs := $(sort $(ALL_OBJS)) +all_deps := $(all_objs:%.o=.%.d) + +# let .d file also depends on the source and header files +define check_deps + @set -e; $(RM) $@; \ + $(CC) -MM $(CFLAGS) $< > $@.$$$$; \ + sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \ + $(RM) $@.$$$$ +endef + +$(all_deps): .%.d: $(src)/%.c + $(Q)$(call check_deps) + +$(all_objs) : %.o : .%.d + +dep_includes := $(wildcard $(all_deps)) + +ifneq ($(dep_includes),) + include $(dep_includes) +endif + +### Detect environment changes +TRACK_CFLAGS = $(subst ','\'',$(CFLAGS)):$(ARCH):$(CROSS_COMPILE) + +tags: force + $(RM) tags + find . -name '*.[ch]' | xargs ctags --extra=+f --c-kinds=+px \ + --regex-c++='/_PE\(([^,)]*).*/PEVENT_ERRNO__\1/' + +TAGS: force + $(RM) TAGS + find . -name '*.[ch]' | xargs etags \ + --regex='/_PE(\([^,)]*\).*/PEVENT_ERRNO__\1/' + +define do_install + $(print_install) \ + if [ ! -d '$(DESTDIR_SQ)$2' ]; then \ + $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$2'; \ + fi; \ + $(INSTALL) $1 '$(DESTDIR_SQ)$2' +endef + +install_lib: all_cmd + $(Q)$(call do_install,$(LIB_FILE),$(libdir_SQ)) + $(Q)$(call do_install,$(BIN_FILE),$(bindir_SQ)) + +install: install_lib + +clean: + $(RM) *.o *~ $(TARGETS) *.a *.so $(VERSION_FILES) .*.d + $(RM) tags TAGS + +endif # skip-makefile + +PHONY += force +force: + +# Declare the contents of the .PHONY variable as phony. We keep that +# information in a variable so we can use it in if_changed and friends. +.PHONY: $(PHONY) diff --git a/tools/lib/lockdep/common.c b/tools/lib/lockdep/common.c new file mode 100644 index 000000000000..8ef602f18a32 --- /dev/null +++ b/tools/lib/lockdep/common.c @@ -0,0 +1,33 @@ +#include +#include +#include +#include +#include +#include + +static __thread struct task_struct current_obj; + +/* lockdep wants these */ +bool debug_locks = true; +bool debug_locks_silent; + +__attribute__((constructor)) static void liblockdep_init(void) +{ + lockdep_init(); +} + +__attribute__((destructor)) static void liblockdep_exit(void) +{ + debug_check_no_locks_held(¤t_obj); +} + +struct task_struct *__curr(void) +{ + if (current_obj.pid == 0) { + /* Makes lockdep output pretty */ + prctl(PR_GET_NAME, current_obj.comm); + current_obj.pid = syscall(__NR_gettid); + } + + return ¤t_obj; +} diff --git a/tools/lib/lockdep/lockdep.c b/tools/lib/lockdep/lockdep.c new file mode 100644 index 000000000000..f42b7e9aa48f --- /dev/null +++ b/tools/lib/lockdep/lockdep.c @@ -0,0 +1,2 @@ +#include +#include "../../../kernel/locking/lockdep.c" diff --git a/tools/lib/lockdep/lockdep_internals.h b/tools/lib/lockdep/lockdep_internals.h new file mode 100644 index 000000000000..29d0c954cc24 --- /dev/null +++ b/tools/lib/lockdep/lockdep_internals.h @@ -0,0 +1 @@ +#include "../../../kernel/locking/lockdep_internals.h" diff --git a/tools/lib/lockdep/lockdep_states.h b/tools/lib/lockdep/lockdep_states.h new file mode 100644 index 000000000000..248d235efda9 --- /dev/null +++ b/tools/lib/lockdep/lockdep_states.h @@ -0,0 +1 @@ +#include "../../../kernel/locking/lockdep_states.h" diff --git a/tools/lib/lockdep/rbtree.c b/tools/lib/lockdep/rbtree.c new file mode 100644 index 000000000000..f7f43033c8b7 --- /dev/null +++ b/tools/lib/lockdep/rbtree.c @@ -0,0 +1 @@ +#include "../../../lib/rbtree.c" diff --git a/tools/lib/lockdep/uinclude/asm/hweight.h b/tools/lib/lockdep/uinclude/asm/hweight.h new file mode 100644 index 000000000000..fab00ff936d1 --- /dev/null +++ b/tools/lib/lockdep/uinclude/asm/hweight.h @@ -0,0 +1,3 @@ + +/* empty file */ + diff --git a/tools/lib/lockdep/uinclude/asm/sections.h b/tools/lib/lockdep/uinclude/asm/sections.h new file mode 100644 index 000000000000..fab00ff936d1 --- /dev/null +++ b/tools/lib/lockdep/uinclude/asm/sections.h @@ -0,0 +1,3 @@ + +/* empty file */ + diff --git a/tools/lib/lockdep/uinclude/linux/bitops.h b/tools/lib/lockdep/uinclude/linux/bitops.h new file mode 100644 index 000000000000..fab00ff936d1 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/bitops.h @@ -0,0 +1,3 @@ + +/* empty file */ + diff --git a/tools/lib/lockdep/uinclude/linux/compiler.h b/tools/lib/lockdep/uinclude/linux/compiler.h new file mode 100644 index 000000000000..7ac838a1f196 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/compiler.h @@ -0,0 +1,7 @@ +#ifndef _LIBLOCKDEP_LINUX_COMPILER_H_ +#define _LIBLOCKDEP_LINUX_COMPILER_H_ + +#define __used __attribute__((__unused__)) +#define unlikely + +#endif diff --git a/tools/lib/lockdep/uinclude/linux/debug_locks.h b/tools/lib/lockdep/uinclude/linux/debug_locks.h new file mode 100644 index 000000000000..f38eb64df794 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/debug_locks.h @@ -0,0 +1,12 @@ +#ifndef _LIBLOCKDEP_DEBUG_LOCKS_H_ +#define _LIBLOCKDEP_DEBUG_LOCKS_H_ + +#include +#include + +#define DEBUG_LOCKS_WARN_ON(x) (x) + +extern bool debug_locks; +extern bool debug_locks_silent; + +#endif diff --git a/tools/lib/lockdep/uinclude/linux/delay.h b/tools/lib/lockdep/uinclude/linux/delay.h new file mode 100644 index 000000000000..fab00ff936d1 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/delay.h @@ -0,0 +1,3 @@ + +/* empty file */ + diff --git a/tools/lib/lockdep/uinclude/linux/export.h b/tools/lib/lockdep/uinclude/linux/export.h new file mode 100644 index 000000000000..6bdf3492c535 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/export.h @@ -0,0 +1,7 @@ +#ifndef _LIBLOCKDEP_LINUX_EXPORT_H_ +#define _LIBLOCKDEP_LINUX_EXPORT_H_ + +#define EXPORT_SYMBOL(sym) +#define EXPORT_SYMBOL_GPL(sym) + +#endif diff --git a/tools/lib/lockdep/uinclude/linux/ftrace.h b/tools/lib/lockdep/uinclude/linux/ftrace.h new file mode 100644 index 000000000000..fab00ff936d1 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/ftrace.h @@ -0,0 +1,3 @@ + +/* empty file */ + diff --git a/tools/lib/lockdep/uinclude/linux/gfp.h b/tools/lib/lockdep/uinclude/linux/gfp.h new file mode 100644 index 000000000000..fab00ff936d1 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/gfp.h @@ -0,0 +1,3 @@ + +/* empty file */ + diff --git a/tools/lib/lockdep/uinclude/linux/hardirq.h b/tools/lib/lockdep/uinclude/linux/hardirq.h new file mode 100644 index 000000000000..c8f3f8f58729 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/hardirq.h @@ -0,0 +1,11 @@ +#ifndef _LIBLOCKDEP_LINUX_HARDIRQ_H_ +#define _LIBLOCKDEP_LINUX_HARDIRQ_H_ + +#define SOFTIRQ_BITS 0UL +#define HARDIRQ_BITS 0UL +#define SOFTIRQ_SHIFT 0UL +#define HARDIRQ_SHIFT 0UL +#define hardirq_count() 0UL +#define softirq_count() 0UL + +#endif diff --git a/tools/lib/lockdep/uinclude/linux/hash.h b/tools/lib/lockdep/uinclude/linux/hash.h new file mode 100644 index 000000000000..0f8479858dc0 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/hash.h @@ -0,0 +1 @@ +#include "../../../include/linux/hash.h" diff --git a/tools/lib/lockdep/uinclude/linux/interrupt.h b/tools/lib/lockdep/uinclude/linux/interrupt.h new file mode 100644 index 000000000000..fab00ff936d1 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/interrupt.h @@ -0,0 +1,3 @@ + +/* empty file */ + diff --git a/tools/lib/lockdep/uinclude/linux/irqflags.h b/tools/lib/lockdep/uinclude/linux/irqflags.h new file mode 100644 index 000000000000..6cc296f0fad0 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/irqflags.h @@ -0,0 +1,38 @@ +#ifndef _LIBLOCKDEP_LINUX_TRACE_IRQFLAGS_H_ +#define _LIBLOCKDEP_LINUX_TRACE_IRQFLAGS_H_ + +# define trace_hardirq_context(p) 0 +# define trace_softirq_context(p) 0 +# define trace_hardirqs_enabled(p) 0 +# define trace_softirqs_enabled(p) 0 +# define trace_hardirq_enter() do { } while (0) +# define trace_hardirq_exit() do { } while (0) +# define lockdep_softirq_enter() do { } while (0) +# define lockdep_softirq_exit() do { } while (0) +# define INIT_TRACE_IRQFLAGS + +# define stop_critical_timings() do { } while (0) +# define start_critical_timings() do { } while (0) + +#define raw_local_irq_disable() do { } while (0) +#define raw_local_irq_enable() do { } while (0) +#define raw_local_irq_save(flags) ((flags) = 0) +#define raw_local_irq_restore(flags) do { } while (0) +#define raw_local_save_flags(flags) ((flags) = 0) +#define raw_irqs_disabled_flags(flags) do { } while (0) +#define raw_irqs_disabled() 0 +#define raw_safe_halt() + +#define local_irq_enable() do { } while (0) +#define local_irq_disable() do { } while (0) +#define local_irq_save(flags) ((flags) = 0) +#define local_irq_restore(flags) do { } while (0) +#define local_save_flags(flags) ((flags) = 0) +#define irqs_disabled() (1) +#define irqs_disabled_flags(flags) (0) +#define safe_halt() do { } while (0) + +#define trace_lock_release(x, y) +#define trace_lock_acquire(a, b, c, d, e, f, g) + +#endif diff --git a/tools/lib/lockdep/uinclude/linux/kallsyms.h b/tools/lib/lockdep/uinclude/linux/kallsyms.h new file mode 100644 index 000000000000..b0f2dbdf1a15 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/kallsyms.h @@ -0,0 +1,32 @@ +#ifndef _LIBLOCKDEP_LINUX_KALLSYMS_H_ +#define _LIBLOCKDEP_LINUX_KALLSYMS_H_ + +#include +#include + +#define KSYM_NAME_LEN 128 + +struct module; + +static inline const char *kallsyms_lookup(unsigned long addr, + unsigned long *symbolsize, + unsigned long *offset, + char **modname, char *namebuf) +{ + return NULL; +} + +#include +#include +static inline void print_ip_sym(unsigned long ip) +{ + char **name; + + name = backtrace_symbols((void **)&ip, 1); + + printf("%s\n", *name); + + free(name); +} + +#endif diff --git a/tools/lib/lockdep/uinclude/linux/kern_levels.h b/tools/lib/lockdep/uinclude/linux/kern_levels.h new file mode 100644 index 000000000000..3b9bade28698 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/kern_levels.h @@ -0,0 +1,25 @@ +#ifndef __KERN_LEVELS_H__ +#define __KERN_LEVELS_H__ + +#define KERN_SOH "" /* ASCII Start Of Header */ +#define KERN_SOH_ASCII '' + +#define KERN_EMERG KERN_SOH "" /* system is unusable */ +#define KERN_ALERT KERN_SOH "" /* action must be taken immediately */ +#define KERN_CRIT KERN_SOH "" /* critical conditions */ +#define KERN_ERR KERN_SOH "" /* error conditions */ +#define KERN_WARNING KERN_SOH "" /* warning conditions */ +#define KERN_NOTICE KERN_SOH "" /* normal but significant condition */ +#define KERN_INFO KERN_SOH "" /* informational */ +#define KERN_DEBUG KERN_SOH "" /* debug-level messages */ + +#define KERN_DEFAULT KERN_SOH "" /* the default kernel loglevel */ + +/* + * Annotation for a "continued" line of log printout (only done after a + * line that had no enclosing \n). Only to be used by core/arch code + * during early bootup (a continued line is not SMP-safe otherwise). + */ +#define KERN_CONT "" + +#endif diff --git a/tools/lib/lockdep/uinclude/linux/kernel.h b/tools/lib/lockdep/uinclude/linux/kernel.h new file mode 100644 index 000000000000..a11e3c357be7 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/kernel.h @@ -0,0 +1,44 @@ +#ifndef _LIBLOCKDEP_LINUX_KERNEL_H_ +#define _LIBLOCKDEP_LINUX_KERNEL_H_ + +#include +#include +#include +#include +#include + +#ifndef container_of +#define container_of(ptr, type, member) ({ \ + const typeof(((type *)0)->member) * __mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); }) +#endif + +#define max(x, y) ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void) (&_max1 == &_max2); \ + _max1 > _max2 ? _max1 : _max2; }) + +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +#define WARN_ON(x) (x) +#define WARN_ON_ONCE(x) (x) +#define likely(x) (x) +#define WARN(x, y, z) (x) +#define uninitialized_var(x) x +#define __init +#define noinline +#define list_add_tail_rcu list_add_tail + +#ifndef CALLER_ADDR0 +#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +#endif + +#ifndef _RET_IP_ +#define _RET_IP_ CALLER_ADDR0 +#endif + +#ifndef _THIS_IP_ +#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; }) +#endif + +#endif diff --git a/tools/lib/lockdep/uinclude/linux/kmemcheck.h b/tools/lib/lockdep/uinclude/linux/kmemcheck.h new file mode 100644 index 000000000000..94d598bc6abe --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/kmemcheck.h @@ -0,0 +1,8 @@ +#ifndef _LIBLOCKDEP_LINUX_KMEMCHECK_H_ +#define _LIBLOCKDEP_LINUX_KMEMCHECK_H_ + +static inline void kmemcheck_mark_initialized(void *address, unsigned int n) +{ +} + +#endif diff --git a/tools/lib/lockdep/uinclude/linux/linkage.h b/tools/lib/lockdep/uinclude/linux/linkage.h new file mode 100644 index 000000000000..fab00ff936d1 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/linkage.h @@ -0,0 +1,3 @@ + +/* empty file */ + diff --git a/tools/lib/lockdep/uinclude/linux/list.h b/tools/lib/lockdep/uinclude/linux/list.h new file mode 100644 index 000000000000..6e9ef31ed82e --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/list.h @@ -0,0 +1 @@ +#include "../../../include/linux/list.h" diff --git a/tools/lib/lockdep/uinclude/linux/lockdep.h b/tools/lib/lockdep/uinclude/linux/lockdep.h new file mode 100644 index 000000000000..d0f5d6e50214 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/lockdep.h @@ -0,0 +1,55 @@ +#ifndef _LIBLOCKDEP_LOCKDEP_H_ +#define _LIBLOCKDEP_LOCKDEP_H_ + +#include +#include +#include +#include +#include + + +#define MAX_LOCK_DEPTH 2000UL + +#include "../../../include/linux/lockdep.h" + +struct task_struct { + u64 curr_chain_key; + int lockdep_depth; + unsigned int lockdep_recursion; + struct held_lock held_locks[MAX_LOCK_DEPTH]; + gfp_t lockdep_reclaim_gfp; + int pid; + char comm[17]; +}; + +extern struct task_struct *__curr(void); + +#define current (__curr()) + +#define debug_locks_off() 1 +#define task_pid_nr(tsk) ((tsk)->pid) + +#define KSYM_NAME_LEN 128 +#define printk printf + +#define list_del_rcu list_del + +#define atomic_t unsigned long +#define atomic_inc(x) ((*(x))++) + +static struct new_utsname *init_utsname(void) +{ + static struct new_utsname n = (struct new_utsname) { + .release = "liblockdep", + .version = LIBLOCKDEP_VERSION, + }; + + return &n; +} + +#define print_tainted() "" +#define static_obj(x) 1 + +#define debug_show_all_locks() + +#endif diff --git a/tools/lib/lockdep/uinclude/linux/module.h b/tools/lib/lockdep/uinclude/linux/module.h new file mode 100644 index 000000000000..09c7a7be8ccc --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/module.h @@ -0,0 +1,6 @@ +#ifndef _LIBLOCKDEP_LINUX_MODULE_H_ +#define _LIBLOCKDEP_LINUX_MODULE_H_ + +#define module_param(name, type, perm) + +#endif diff --git a/tools/lib/lockdep/uinclude/linux/mutex.h b/tools/lib/lockdep/uinclude/linux/mutex.h new file mode 100644 index 000000000000..fab00ff936d1 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/mutex.h @@ -0,0 +1,3 @@ + +/* empty file */ + diff --git a/tools/lib/lockdep/uinclude/linux/poison.h b/tools/lib/lockdep/uinclude/linux/poison.h new file mode 100644 index 000000000000..0c27bdf14233 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/poison.h @@ -0,0 +1 @@ +#include "../../../include/linux/poison.h" diff --git a/tools/lib/lockdep/uinclude/linux/prefetch.h b/tools/lib/lockdep/uinclude/linux/prefetch.h new file mode 100644 index 000000000000..d73fe6f850ac --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/prefetch.h @@ -0,0 +1,6 @@ +#ifndef _LIBLOCKDEP_LINUX_PREFETCH_H_ +#define _LIBLOCKDEP_LINUX_PREFETCH_H + +static inline void prefetch(void *a __attribute__((unused))) { } + +#endif diff --git a/tools/lib/lockdep/uinclude/linux/proc_fs.h b/tools/lib/lockdep/uinclude/linux/proc_fs.h new file mode 100644 index 000000000000..fab00ff936d1 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/proc_fs.h @@ -0,0 +1,3 @@ + +/* empty file */ + diff --git a/tools/lib/lockdep/uinclude/linux/rbtree.h b/tools/lib/lockdep/uinclude/linux/rbtree.h new file mode 100644 index 000000000000..965901db4862 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/rbtree.h @@ -0,0 +1 @@ +#include "../../../include/linux/rbtree.h" diff --git a/tools/lib/lockdep/uinclude/linux/rbtree_augmented.h b/tools/lib/lockdep/uinclude/linux/rbtree_augmented.h new file mode 100644 index 000000000000..c3759477379c --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/rbtree_augmented.h @@ -0,0 +1,2 @@ +#define __always_inline +#include "../../../include/linux/rbtree_augmented.h" diff --git a/tools/lib/lockdep/uinclude/linux/rcu.h b/tools/lib/lockdep/uinclude/linux/rcu.h new file mode 100644 index 000000000000..4c99fcb5da27 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/rcu.h @@ -0,0 +1,16 @@ +#ifndef _LIBLOCKDEP_RCU_H_ +#define _LIBLOCKDEP_RCU_H_ + +int rcu_scheduler_active; + +static inline int rcu_lockdep_current_cpu_online(void) +{ + return 1; +} + +static inline int rcu_is_cpu_idle(void) +{ + return 1; +} + +#endif diff --git a/tools/lib/lockdep/uinclude/linux/seq_file.h b/tools/lib/lockdep/uinclude/linux/seq_file.h new file mode 100644 index 000000000000..fab00ff936d1 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/seq_file.h @@ -0,0 +1,3 @@ + +/* empty file */ + diff --git a/tools/lib/lockdep/uinclude/linux/spinlock.h b/tools/lib/lockdep/uinclude/linux/spinlock.h new file mode 100644 index 000000000000..68c1aa2bcba5 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/spinlock.h @@ -0,0 +1,25 @@ +#ifndef _LIBLOCKDEP_SPINLOCK_H_ +#define _LIBLOCKDEP_SPINLOCK_H_ + +#include +#include + +#define arch_spinlock_t pthread_mutex_t +#define __ARCH_SPIN_LOCK_UNLOCKED PTHREAD_MUTEX_INITIALIZER + +static inline void arch_spin_lock(arch_spinlock_t *mutex) +{ + pthread_mutex_lock(mutex); +} + +static inline void arch_spin_unlock(arch_spinlock_t *mutex) +{ + pthread_mutex_unlock(mutex); +} + +static inline bool arch_spin_is_locked(arch_spinlock_t *mutex) +{ + return true; +} + +#endif diff --git a/tools/lib/lockdep/uinclude/linux/stacktrace.h b/tools/lib/lockdep/uinclude/linux/stacktrace.h new file mode 100644 index 000000000000..39aecc6b19d1 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/stacktrace.h @@ -0,0 +1,32 @@ +#ifndef _LIBLOCKDEP_LINUX_STACKTRACE_H_ +#define _LIBLOCKDEP_LINUX_STACKTRACE_H_ + +#include + +struct stack_trace { + unsigned int nr_entries, max_entries; + unsigned long *entries; + int skip; +}; + +static inline void print_stack_trace(struct stack_trace *trace, int spaces) +{ + backtrace_symbols_fd((void **)trace->entries, trace->nr_entries, 1); +} + +#define save_stack_trace(trace) \ + ((trace)->nr_entries = \ + backtrace((void **)(trace)->entries, (trace)->max_entries)) + +static inline int dump_stack(void) +{ + void *array[64]; + size_t size; + + size = backtrace(array, 64); + backtrace_symbols_fd(array, size, 1); + + return 0; +} + +#endif diff --git a/tools/lib/lockdep/uinclude/linux/stringify.h b/tools/lib/lockdep/uinclude/linux/stringify.h new file mode 100644 index 000000000000..05dfcd1ac118 --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/stringify.h @@ -0,0 +1,7 @@ +#ifndef _LIBLOCKDEP_LINUX_STRINGIFY_H_ +#define _LIBLOCKDEP_LINUX_STRINGIFY_H_ + +#define __stringify_1(x...) #x +#define __stringify(x...) __stringify_1(x) + +#endif diff --git a/tools/lib/lockdep/uinclude/linux/types.h b/tools/lib/lockdep/uinclude/linux/types.h new file mode 100644 index 000000000000..929938f426de --- /dev/null +++ b/tools/lib/lockdep/uinclude/linux/types.h @@ -0,0 +1,58 @@ +#ifndef _LIBLOCKDEP_LINUX_TYPES_H_ +#define _LIBLOCKDEP_LINUX_TYPES_H_ + +#include +#include + +#define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */ +#include + +struct page; +struct kmem_cache; + +typedef unsigned gfp_t; + +typedef __u64 u64; +typedef __s64 s64; + +typedef __u32 u32; +typedef __s32 s32; + +typedef __u16 u16; +typedef __s16 s16; + +typedef __u8 u8; +typedef __s8 s8; + +#ifdef __CHECKER__ +#define __bitwise__ __attribute__((bitwise)) +#else +#define __bitwise__ +#endif +#ifdef __CHECK_ENDIAN__ +#define __bitwise __bitwise__ +#else +#define __bitwise +#endif + + +typedef __u16 __bitwise __le16; +typedef __u16 __bitwise __be16; +typedef __u32 __bitwise __le32; +typedef __u32 __bitwise __be32; +typedef __u64 __bitwise __le64; +typedef __u64 __bitwise __be64; + +struct list_head { + struct list_head *next, *prev; +}; + +struct hlist_head { + struct hlist_node *first; +}; + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + +#endif diff --git a/tools/lib/lockdep/uinclude/trace/events/lock.h b/tools/lib/lockdep/uinclude/trace/events/lock.h new file mode 100644 index 000000000000..fab00ff936d1 --- /dev/null +++ b/tools/lib/lockdep/uinclude/trace/events/lock.h @@ -0,0 +1,3 @@ + +/* empty file */ + From 45e6207464b59dca63c8a9a79a7befbbf6a68fdb Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 13 Jun 2013 18:41:18 -0400 Subject: [PATCH 07/34] liblockdep: Add public headers for pthread_mutex_t implementation These headers provide the same API as their pthread mutex counterparts. The design here is to allow to easily switch to liblockdep lock validation just by adding a "liblockdep_" to pthread_mutex_*() calls, which means that it's easy to integrate liblockdep into existing codebases. Signed-off-by: Sasha Levin Signed-off-by: Peter Zijlstra Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/1371163284-6346-4-git-send-email-sasha.levin@oracle.com Signed-off-by: Ingo Molnar --- tools/lib/lockdep/include/liblockdep/common.h | 50 +++++++++++++ tools/lib/lockdep/include/liblockdep/mutex.h | 70 +++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 tools/lib/lockdep/include/liblockdep/common.h create mode 100644 tools/lib/lockdep/include/liblockdep/mutex.h diff --git a/tools/lib/lockdep/include/liblockdep/common.h b/tools/lib/lockdep/include/liblockdep/common.h new file mode 100644 index 000000000000..0bda630027c3 --- /dev/null +++ b/tools/lib/lockdep/include/liblockdep/common.h @@ -0,0 +1,50 @@ +#ifndef _LIBLOCKDEP_COMMON_H +#define _LIBLOCKDEP_COMMON_H + +#include + +#define NR_LOCKDEP_CACHING_CLASSES 2 +#define MAX_LOCKDEP_SUBCLASSES 8UL + +#ifndef CALLER_ADDR0 +#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +#endif + +#ifndef _RET_IP_ +#define _RET_IP_ CALLER_ADDR0 +#endif + +#ifndef _THIS_IP_ +#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; }) +#endif + +struct lockdep_subclass_key { + char __one_byte; +}; + +struct lock_class_key { + struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES]; +}; + +struct lockdep_map { + struct lock_class_key *key; + struct lock_class *class_cache[NR_LOCKDEP_CACHING_CLASSES]; + const char *name; +#ifdef CONFIG_LOCK_STAT + int cpu; + unsigned long ip; +#endif +}; + +void lockdep_init_map(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, int subclass); +void lock_acquire(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, + struct lockdep_map *nest_lock, unsigned long ip); +void lock_release(struct lockdep_map *lock, int nested, + unsigned long ip); + +#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ + { .name = (_name), .key = (void *)(_key), } + +#endif diff --git a/tools/lib/lockdep/include/liblockdep/mutex.h b/tools/lib/lockdep/include/liblockdep/mutex.h new file mode 100644 index 000000000000..c342f7087147 --- /dev/null +++ b/tools/lib/lockdep/include/liblockdep/mutex.h @@ -0,0 +1,70 @@ +#ifndef _LIBLOCKDEP_MUTEX_H +#define _LIBLOCKDEP_MUTEX_H + +#include +#include "common.h" + +struct liblockdep_pthread_mutex { + pthread_mutex_t mutex; + struct lockdep_map dep_map; +}; + +typedef struct liblockdep_pthread_mutex liblockdep_pthread_mutex_t; + +#define LIBLOCKDEP_PTHREAD_MUTEX_INITIALIZER(mtx) \ + (const struct liblockdep_pthread_mutex) { \ + .mutex = PTHREAD_MUTEX_INITIALIZER, \ + .dep_map = STATIC_LOCKDEP_MAP_INIT(#mtx, &((&(mtx))->dep_map)), \ +} + +static inline int __mutex_init(liblockdep_pthread_mutex_t *lock, + const char *name, + struct lock_class_key *key, + const pthread_mutexattr_t *__mutexattr) +{ + lockdep_init_map(&lock->dep_map, name, key, 0); + return pthread_mutex_init(&lock->mutex, __mutexattr); +} + +#define liblockdep_pthread_mutex_init(mutex, mutexattr) \ +({ \ + static struct lock_class_key __key; \ + \ + __mutex_init((mutex), #mutex, &__key, (mutexattr)); \ +}) + +static inline int liblockdep_pthread_mutex_lock(liblockdep_pthread_mutex_t *lock) +{ + lock_acquire(&lock->dep_map, 0, 0, 0, 2, NULL, (unsigned long)_RET_IP_); + return pthread_mutex_lock(&lock->mutex); +} + +static inline int liblockdep_pthread_mutex_unlock(liblockdep_pthread_mutex_t *lock) +{ + lock_release(&lock->dep_map, 0, (unsigned long)_RET_IP_); + return pthread_mutex_unlock(&lock->mutex); +} + +static inline int liblockdep_pthread_mutex_trylock(liblockdep_pthread_mutex_t *lock) +{ + lock_acquire(&lock->dep_map, 0, 1, 0, 2, NULL, (unsigned long)_RET_IP_); + return pthread_mutex_trylock(&lock->mutex) == 0 ? 1 : 0; +} + +static inline int liblockdep_pthread_mutex_destroy(liblockdep_pthread_mutex_t *lock) +{ + return pthread_mutex_destroy(&lock->mutex); +} + +#ifdef __USE_LIBLOCKDEP + +#define pthread_mutex_t liblockdep_pthread_mutex_t +#define pthread_mutex_init liblockdep_pthread_mutex_init +#define pthread_mutex_lock liblockdep_pthread_mutex_lock +#define pthread_mutex_unlock liblockdep_pthread_mutex_unlock +#define pthread_mutex_trylock liblockdep_pthread_mutex_trylock +#define pthread_mutex_destroy liblockdep_pthread_mutex_destroy + +#endif + +#endif From 878f968eeb852383ff79dc3f181db24e5b52fd75 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 13 Jun 2013 18:41:19 -0400 Subject: [PATCH 08/34] liblockdep: Add pthread_mutex_t test suite This is a rather simple and basic test suite to test common locking issues. Beyond tests, it also shows how to use the library. Signed-off-by: Sasha Levin Signed-off-by: Peter Zijlstra Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/1371163284-6346-5-git-send-email-sasha.levin@oracle.com Signed-off-by: Ingo Molnar --- tools/lib/lockdep/run_tests.sh | 27 ++++++++++++++++++++++++ tools/lib/lockdep/tests/AA.c | 13 ++++++++++++ tools/lib/lockdep/tests/ABBA.c | 13 ++++++++++++ tools/lib/lockdep/tests/ABBCCA.c | 15 +++++++++++++ tools/lib/lockdep/tests/ABBCCDDA.c | 17 +++++++++++++++ tools/lib/lockdep/tests/ABCABC.c | 15 +++++++++++++ tools/lib/lockdep/tests/ABCDBCDA.c | 17 +++++++++++++++ tools/lib/lockdep/tests/ABCDBDDA.c | 17 +++++++++++++++ tools/lib/lockdep/tests/common.h | 12 +++++++++++ tools/lib/lockdep/tests/unlock_balance.c | 12 +++++++++++ 10 files changed, 158 insertions(+) create mode 100644 tools/lib/lockdep/run_tests.sh create mode 100644 tools/lib/lockdep/tests/AA.c create mode 100644 tools/lib/lockdep/tests/ABBA.c create mode 100644 tools/lib/lockdep/tests/ABBCCA.c create mode 100644 tools/lib/lockdep/tests/ABBCCDDA.c create mode 100644 tools/lib/lockdep/tests/ABCABC.c create mode 100644 tools/lib/lockdep/tests/ABCDBCDA.c create mode 100644 tools/lib/lockdep/tests/ABCDBDDA.c create mode 100644 tools/lib/lockdep/tests/common.h create mode 100644 tools/lib/lockdep/tests/unlock_balance.c diff --git a/tools/lib/lockdep/run_tests.sh b/tools/lib/lockdep/run_tests.sh new file mode 100644 index 000000000000..5334ad9d39b7 --- /dev/null +++ b/tools/lib/lockdep/run_tests.sh @@ -0,0 +1,27 @@ +#! /bin/bash + +make &> /dev/null + +for i in `ls tests/*.c`; do + testname=$(basename -s .c "$i") + gcc -o tests/$testname -pthread -lpthread $i liblockdep.a -Iinclude -D__USE_LIBLOCKDEP &> /dev/null + echo -ne "$testname... " + if [ $(timeout 1 ./tests/$testname | wc -l) -gt 0 ]; then + echo "PASSED!" + else + echo "FAILED!" + fi + rm tests/$testname +done + +for i in `ls tests/*.c`; do + testname=$(basename -s .c "$i") + gcc -o tests/$testname -pthread -lpthread -Iinclude $i &> /dev/null + echo -ne "(PRELOAD) $testname... " + if [ $(timeout 1 ./lockdep ./tests/$testname | wc -l) -gt 0 ]; then + echo "PASSED!" + else + echo "FAILED!" + fi + rm tests/$testname +done diff --git a/tools/lib/lockdep/tests/AA.c b/tools/lib/lockdep/tests/AA.c new file mode 100644 index 000000000000..0f782ff404ac --- /dev/null +++ b/tools/lib/lockdep/tests/AA.c @@ -0,0 +1,13 @@ +#include + +void main(void) +{ + pthread_mutex_t a, b; + + pthread_mutex_init(&a, NULL); + pthread_mutex_init(&b, NULL); + + pthread_mutex_lock(&a); + pthread_mutex_lock(&b); + pthread_mutex_lock(&a); +} diff --git a/tools/lib/lockdep/tests/ABBA.c b/tools/lib/lockdep/tests/ABBA.c new file mode 100644 index 000000000000..07f0e29d5485 --- /dev/null +++ b/tools/lib/lockdep/tests/ABBA.c @@ -0,0 +1,13 @@ +#include +#include "common.h" + +void main(void) +{ + pthread_mutex_t a, b; + + pthread_mutex_init(&a, NULL); + pthread_mutex_init(&b, NULL); + + LOCK_UNLOCK_2(a, b); + LOCK_UNLOCK_2(b, a); +} diff --git a/tools/lib/lockdep/tests/ABBCCA.c b/tools/lib/lockdep/tests/ABBCCA.c new file mode 100644 index 000000000000..843db09ac666 --- /dev/null +++ b/tools/lib/lockdep/tests/ABBCCA.c @@ -0,0 +1,15 @@ +#include +#include "common.h" + +void main(void) +{ + pthread_mutex_t a, b, c; + + pthread_mutex_init(&a, NULL); + pthread_mutex_init(&b, NULL); + pthread_mutex_init(&c, NULL); + + LOCK_UNLOCK_2(a, b); + LOCK_UNLOCK_2(b, c); + LOCK_UNLOCK_2(c, a); +} diff --git a/tools/lib/lockdep/tests/ABBCCDDA.c b/tools/lib/lockdep/tests/ABBCCDDA.c new file mode 100644 index 000000000000..33620e268f85 --- /dev/null +++ b/tools/lib/lockdep/tests/ABBCCDDA.c @@ -0,0 +1,17 @@ +#include +#include "common.h" + +void main(void) +{ + pthread_mutex_t a, b, c, d; + + pthread_mutex_init(&a, NULL); + pthread_mutex_init(&b, NULL); + pthread_mutex_init(&c, NULL); + pthread_mutex_init(&d, NULL); + + LOCK_UNLOCK_2(a, b); + LOCK_UNLOCK_2(b, c); + LOCK_UNLOCK_2(c, d); + LOCK_UNLOCK_2(d, a); +} diff --git a/tools/lib/lockdep/tests/ABCABC.c b/tools/lib/lockdep/tests/ABCABC.c new file mode 100644 index 000000000000..3fee51e3a68a --- /dev/null +++ b/tools/lib/lockdep/tests/ABCABC.c @@ -0,0 +1,15 @@ +#include +#include "common.h" + +void main(void) +{ + pthread_mutex_t a, b, c; + + pthread_mutex_init(&a, NULL); + pthread_mutex_init(&b, NULL); + pthread_mutex_init(&c, NULL); + + LOCK_UNLOCK_2(a, b); + LOCK_UNLOCK_2(c, a); + LOCK_UNLOCK_2(b, c); +} diff --git a/tools/lib/lockdep/tests/ABCDBCDA.c b/tools/lib/lockdep/tests/ABCDBCDA.c new file mode 100644 index 000000000000..427ba562c75b --- /dev/null +++ b/tools/lib/lockdep/tests/ABCDBCDA.c @@ -0,0 +1,17 @@ +#include +#include "common.h" + +void main(void) +{ + pthread_mutex_t a, b, c, d; + + pthread_mutex_init(&a, NULL); + pthread_mutex_init(&b, NULL); + pthread_mutex_init(&c, NULL); + pthread_mutex_init(&d, NULL); + + LOCK_UNLOCK_2(a, b); + LOCK_UNLOCK_2(c, d); + LOCK_UNLOCK_2(b, c); + LOCK_UNLOCK_2(d, a); +} diff --git a/tools/lib/lockdep/tests/ABCDBDDA.c b/tools/lib/lockdep/tests/ABCDBDDA.c new file mode 100644 index 000000000000..680c6cf3e919 --- /dev/null +++ b/tools/lib/lockdep/tests/ABCDBDDA.c @@ -0,0 +1,17 @@ +#include +#include "common.h" + +void main(void) +{ + pthread_mutex_t a, b, c, d; + + pthread_mutex_init(&a, NULL); + pthread_mutex_init(&b, NULL); + pthread_mutex_init(&c, NULL); + pthread_mutex_init(&d, NULL); + + LOCK_UNLOCK_2(a, b); + LOCK_UNLOCK_2(c, d); + LOCK_UNLOCK_2(b, d); + LOCK_UNLOCK_2(d, a); +} diff --git a/tools/lib/lockdep/tests/common.h b/tools/lib/lockdep/tests/common.h new file mode 100644 index 000000000000..d89e94d47d86 --- /dev/null +++ b/tools/lib/lockdep/tests/common.h @@ -0,0 +1,12 @@ +#ifndef _LIBLOCKDEP_TEST_COMMON_H +#define _LIBLOCKDEP_TEST_COMMON_H + +#define LOCK_UNLOCK_2(a, b) \ + do { \ + pthread_mutex_lock(&(a)); \ + pthread_mutex_lock(&(b)); \ + pthread_mutex_unlock(&(b)); \ + pthread_mutex_unlock(&(a)); \ + } while(0) + +#endif diff --git a/tools/lib/lockdep/tests/unlock_balance.c b/tools/lib/lockdep/tests/unlock_balance.c new file mode 100644 index 000000000000..0bc62de686f7 --- /dev/null +++ b/tools/lib/lockdep/tests/unlock_balance.c @@ -0,0 +1,12 @@ +#include + +void main(void) +{ + pthread_mutex_t a; + + pthread_mutex_init(&a, NULL); + + pthread_mutex_lock(&a); + pthread_mutex_unlock(&a); + pthread_mutex_unlock(&a); +} From 5a52c9b480e09a782618dbf08de57f9ca54c8b49 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 13 Jun 2013 18:41:20 -0400 Subject: [PATCH 09/34] liblockdep: Add public headers for pthread_rwlock_t implementation Both pthreads and lockdep support dealing with rwlocks, so here's the liblockdep implementation for those. Signed-off-by: Sasha Levin Signed-off-by: Peter Zijlstra Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/1371163284-6346-6-git-send-email-sasha.levin@oracle.com Signed-off-by: Ingo Molnar --- tools/lib/lockdep/include/liblockdep/rwlock.h | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 tools/lib/lockdep/include/liblockdep/rwlock.h diff --git a/tools/lib/lockdep/include/liblockdep/rwlock.h b/tools/lib/lockdep/include/liblockdep/rwlock.h new file mode 100644 index 000000000000..a680ab8c2e36 --- /dev/null +++ b/tools/lib/lockdep/include/liblockdep/rwlock.h @@ -0,0 +1,86 @@ +#ifndef _LIBLOCKDEP_RWLOCK_H +#define _LIBLOCKDEP_RWLOCK_H + +#include +#include "common.h" + +struct liblockdep_pthread_rwlock { + pthread_rwlock_t rwlock; + struct lockdep_map dep_map; +}; + +typedef struct liblockdep_pthread_rwlock liblockdep_pthread_rwlock_t; + +#define LIBLOCKDEP_PTHREAD_RWLOCK_INITIALIZER(rwl) \ + (struct liblockdep_pthread_rwlock) { \ + .rwlock = PTHREAD_RWLOCK_INITIALIZER, \ + .dep_map = STATIC_LOCKDEP_MAP_INIT(#rwl, &((&(rwl))->dep_map)), \ +} + +static inline int __rwlock_init(liblockdep_pthread_rwlock_t *lock, + const char *name, + struct lock_class_key *key, + const pthread_rwlockattr_t *attr) +{ + lockdep_init_map(&lock->dep_map, name, key, 0); + + return pthread_rwlock_init(&lock->rwlock, attr); +} + +#define liblockdep_pthread_rwlock_init(lock, attr) \ +({ \ + static struct lock_class_key __key; \ + \ + __rwlock_init((lock), #lock, &__key, (attr)); \ +}) + +static inline int liblockdep_pthread_rwlock_rdlock(liblockdep_pthread_rwlock_t *lock) +{ + lock_acquire(&lock->dep_map, 0, 0, 2, 2, NULL, (unsigned long)_RET_IP_); + return pthread_rwlock_rdlock(&lock->rwlock); + +} + +static inline int liblockdep_pthread_rwlock_unlock(liblockdep_pthread_rwlock_t *lock) +{ + lock_release(&lock->dep_map, 0, (unsigned long)_RET_IP_); + return pthread_rwlock_unlock(&lock->rwlock); +} + +static inline int liblockdep_pthread_rwlock_wrlock(liblockdep_pthread_rwlock_t *lock) +{ + lock_acquire(&lock->dep_map, 0, 0, 0, 2, NULL, (unsigned long)_RET_IP_); + return pthread_rwlock_wrlock(&lock->rwlock); +} + +static inline int liblockdep_pthread_rwlock_tryrdlock(liblockdep_pthread_rwlock_t *lock) +{ + lock_acquire(&lock->dep_map, 0, 1, 2, 2, NULL, (unsigned long)_RET_IP_); + return pthread_rwlock_tryrdlock(&lock->rwlock) == 0 ? 1 : 0; +} + +static inline int liblockdep_pthread_rwlock_trywlock(liblockdep_pthread_rwlock_t *lock) +{ + lock_acquire(&lock->dep_map, 0, 1, 0, 2, NULL, (unsigned long)_RET_IP_); + return pthread_rwlock_trywlock(&lock->rwlock) == 0 ? 1 : 0; +} + +static inline int liblockdep_rwlock_destroy(liblockdep_pthread_rwlock_t *lock) +{ + return pthread_rwlock_destroy(&lock->rwlock); +} + +#ifdef __USE_LIBLOCKDEP + +#define pthread_rwlock_t liblockdep_pthread_rwlock_t +#define pthread_rwlock_init liblockdep_pthread_rwlock_init +#define pthread_rwlock_rdlock liblockdep_pthread_rwlock_rdlock +#define pthread_rwlock_unlock liblockdep_pthread_rwlock_unlock +#define pthread_rwlock_wrlock liblockdep_pthread_rwlock_wrlock +#define pthread_rwlock_tryrdlock liblockdep_pthread_rwlock_tryrdlock +#define pthread_rwlock_trywlock liblockdep_pthread_rwlock_trywlock +#define pthread_rwlock_destroy liblockdep_rwlock_destroy + +#endif + +#endif From dbe941827eab53194eda5cd350a4e1414f192658 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 13 Jun 2013 18:41:21 -0400 Subject: [PATCH 10/34] liblockdep: Add pthread_rwlock_t test suite A simple test to make sure we handle rwlocks correctly. Signed-off-by: Sasha Levin Signed-off-by: Peter Zijlstra Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/1371163284-6346-7-git-send-email-sasha.levin@oracle.com Signed-off-by: Ingo Molnar --- tools/lib/lockdep/tests/WW.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 tools/lib/lockdep/tests/WW.c diff --git a/tools/lib/lockdep/tests/WW.c b/tools/lib/lockdep/tests/WW.c new file mode 100644 index 000000000000..d44f77d71029 --- /dev/null +++ b/tools/lib/lockdep/tests/WW.c @@ -0,0 +1,13 @@ +#include + +void main(void) +{ + pthread_rwlock_t a, b; + + pthread_rwlock_init(&a, NULL); + pthread_rwlock_init(&b, NULL); + + pthread_rwlock_wrlock(&a); + pthread_rwlock_rdlock(&b); + pthread_rwlock_wrlock(&a); +} From 231941eec8aeee4f0ac210a28e484200b20f74d8 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 13 Jun 2013 18:41:22 -0400 Subject: [PATCH 11/34] liblockdep: Support using LD_PRELOAD This allows lockdep to be used without being compiled in the original program. Usage is quite simple: LD_PRELOAD=/path/to/liblockdep.so /path/to/my/program And magically, you'll have lockdep checking in your program! Signed-off-by: Sasha Levin Signed-off-by: Peter Zijlstra Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/1371163284-6346-8-git-send-email-sasha.levin@oracle.com Signed-off-by: Ingo Molnar --- tools/lib/lockdep/preload.c | 447 ++++++++++++++++++++++++++++++++++++ 1 file changed, 447 insertions(+) create mode 100644 tools/lib/lockdep/preload.c diff --git a/tools/lib/lockdep/preload.c b/tools/lib/lockdep/preload.c new file mode 100644 index 000000000000..f8465a811aa5 --- /dev/null +++ b/tools/lib/lockdep/preload.c @@ -0,0 +1,447 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include "include/liblockdep/mutex.h" +#include "../../../include/linux/rbtree.h" + +/** + * struct lock_lookup - liblockdep's view of a single unique lock + * @orig: pointer to the original pthread lock, used for lookups + * @dep_map: lockdep's dep_map structure + * @key: lockdep's key structure + * @node: rb-tree node used to store the lock in a global tree + * @name: a unique name for the lock + */ +struct lock_lookup { + void *orig; /* Original pthread lock, used for lookups */ + struct lockdep_map dep_map; /* Since all locks are dynamic, we need + * a dep_map and a key for each lock */ + /* + * Wait, there's no support for key classes? Yup :( + * Most big projects wrap the pthread api with their own calls to + * be compatible with different locking methods. This means that + * "classes" will be brokes since the function that creates all + * locks will point to a generic locking function instead of the + * actual code that wants to do the locking. + */ + struct lock_class_key key; + struct rb_node node; +#define LIBLOCKDEP_MAX_LOCK_NAME 22 + char name[LIBLOCKDEP_MAX_LOCK_NAME]; +}; + +/* This is where we store our locks */ +static struct rb_root locks = RB_ROOT; +static pthread_rwlock_t locks_rwlock = PTHREAD_RWLOCK_INITIALIZER; + +/* pthread mutex API */ + +#ifdef __GLIBC__ +extern int __pthread_mutex_init(pthread_mutex_t *mutex, const pthread_mutexattr_t *attr); +extern int __pthread_mutex_lock(pthread_mutex_t *mutex); +extern int __pthread_mutex_trylock(pthread_mutex_t *mutex); +extern int __pthread_mutex_unlock(pthread_mutex_t *mutex); +extern int __pthread_mutex_destroy(pthread_mutex_t *mutex); +#else +#define __pthread_mutex_init NULL +#define __pthread_mutex_lock NULL +#define __pthread_mutex_trylock NULL +#define __pthread_mutex_unlock NULL +#define __pthread_mutex_destroy NULL +#endif +static int (*ll_pthread_mutex_init)(pthread_mutex_t *mutex, + const pthread_mutexattr_t *attr) = __pthread_mutex_init; +static int (*ll_pthread_mutex_lock)(pthread_mutex_t *mutex) = __pthread_mutex_lock; +static int (*ll_pthread_mutex_trylock)(pthread_mutex_t *mutex) = __pthread_mutex_trylock; +static int (*ll_pthread_mutex_unlock)(pthread_mutex_t *mutex) = __pthread_mutex_unlock; +static int (*ll_pthread_mutex_destroy)(pthread_mutex_t *mutex) = __pthread_mutex_destroy; + +/* pthread rwlock API */ + +#ifdef __GLIBC__ +extern int __pthread_rwlock_init(pthread_rwlock_t *rwlock, const pthread_rwlockattr_t *attr); +extern int __pthread_rwlock_destroy(pthread_rwlock_t *rwlock); +extern int __pthread_rwlock_wrlock(pthread_rwlock_t *rwlock); +extern int __pthread_rwlock_trywrlock(pthread_rwlock_t *rwlock); +extern int __pthread_rwlock_rdlock(pthread_rwlock_t *rwlock); +extern int __pthread_rwlock_tryrdlock(pthread_rwlock_t *rwlock); +extern int __pthread_rwlock_unlock(pthread_rwlock_t *rwlock); +#else +#define __pthread_rwlock_init NULL +#define __pthread_rwlock_destroy NULL +#define __pthread_rwlock_wrlock NULL +#define __pthread_rwlock_trywrlock NULL +#define __pthread_rwlock_rdlock NULL +#define __pthread_rwlock_tryrdlock NULL +#define __pthread_rwlock_unlock NULL +#endif + +static int (*ll_pthread_rwlock_init)(pthread_rwlock_t *rwlock, + const pthread_rwlockattr_t *attr) = __pthread_rwlock_init; +static int (*ll_pthread_rwlock_destroy)(pthread_rwlock_t *rwlock) = __pthread_rwlock_destroy; +static int (*ll_pthread_rwlock_rdlock)(pthread_rwlock_t *rwlock) = __pthread_rwlock_rdlock; +static int (*ll_pthread_rwlock_tryrdlock)(pthread_rwlock_t *rwlock) = __pthread_rwlock_tryrdlock; +static int (*ll_pthread_rwlock_trywrlock)(pthread_rwlock_t *rwlock) = __pthread_rwlock_trywrlock; +static int (*ll_pthread_rwlock_wrlock)(pthread_rwlock_t *rwlock) = __pthread_rwlock_wrlock; +static int (*ll_pthread_rwlock_unlock)(pthread_rwlock_t *rwlock) = __pthread_rwlock_unlock; + +enum { none, prepare, done, } __init_state; +static void init_preload(void); +static void try_init_preload(void) +{ + if (!__init_state != done) + init_preload(); +} + +static struct rb_node **__get_lock_node(void *lock, struct rb_node **parent) +{ + struct rb_node **node = &locks.rb_node; + struct lock_lookup *l; + + *parent = NULL; + + while (*node) { + l = rb_entry(*node, struct lock_lookup, node); + + *parent = *node; + if (lock < l->orig) + node = &l->node.rb_left; + else if (lock > l->orig) + node = &l->node.rb_right; + else + return node; + } + + return node; +} + +#ifndef LIBLOCKDEP_STATIC_ENTRIES +#define LIBLOCKDEP_STATIC_ENTRIES 1024 +#endif + +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) + +static struct lock_lookup __locks[LIBLOCKDEP_STATIC_ENTRIES]; +static int __locks_nr; + +static inline bool is_static_lock(struct lock_lookup *lock) +{ + return lock >= __locks && lock < __locks + ARRAY_SIZE(__locks); +} + +static struct lock_lookup *alloc_lock(void) +{ + if (__init_state != done) { + /* + * Some programs attempt to initialize and use locks in their + * allocation path. This means that a call to malloc() would + * result in locks being initialized and locked. + * + * Why is it an issue for us? dlsym() below will try allocating + * to give us the original function. Since this allocation will + * result in a locking operations, we have to let pthread deal + * with it, but we can't! we don't have the pointer to the + * original API since we're inside dlsym() trying to get it + */ + + int idx = __locks_nr++; + if (idx >= ARRAY_SIZE(__locks)) { + fprintf(stderr, + "LOCKDEP error: insufficient LIBLOCKDEP_STATIC_ENTRIES\n"); + exit(EX_UNAVAILABLE); + } + return __locks + idx; + } + + return malloc(sizeof(struct lock_lookup)); +} + +static inline void free_lock(struct lock_lookup *lock) +{ + if (likely(!is_static_lock(lock))) + free(lock); +} + +/** + * __get_lock - find or create a lock instance + * @lock: pointer to a pthread lock function + * + * Try to find an existing lock in the rbtree using the provided pointer. If + * one wasn't found - create it. + */ +static struct lock_lookup *__get_lock(void *lock) +{ + struct rb_node **node, *parent; + struct lock_lookup *l; + + ll_pthread_rwlock_rdlock(&locks_rwlock); + node = __get_lock_node(lock, &parent); + ll_pthread_rwlock_unlock(&locks_rwlock); + if (*node) { + return rb_entry(*node, struct lock_lookup, node); + } + + /* We didn't find the lock, let's create it */ + l = alloc_lock(); + if (l == NULL) + return NULL; + + l->orig = lock; + /* + * Currently the name of the lock is the ptr value of the pthread lock, + * while not optimal, it makes debugging a bit easier. + * + * TODO: Get the real name of the lock using libdwarf + */ + sprintf(l->name, "%p", lock); + lockdep_init_map(&l->dep_map, l->name, &l->key, 0); + + ll_pthread_rwlock_wrlock(&locks_rwlock); + /* This might have changed since the last time we fetched it */ + node = __get_lock_node(lock, &parent); + rb_link_node(&l->node, parent, node); + rb_insert_color(&l->node, &locks); + ll_pthread_rwlock_unlock(&locks_rwlock); + + return l; +} + +static void __del_lock(struct lock_lookup *lock) +{ + ll_pthread_rwlock_wrlock(&locks_rwlock); + rb_erase(&lock->node, &locks); + ll_pthread_rwlock_unlock(&locks_rwlock); + free_lock(lock); +} + +int pthread_mutex_init(pthread_mutex_t *mutex, + const pthread_mutexattr_t *attr) +{ + int r; + + /* + * We keep trying to init our preload module because there might be + * code in init sections that tries to touch locks before we are + * initialized, in that case we'll need to manually call preload + * to get us going. + * + * Funny enough, kernel's lockdep had the same issue, and used + * (almost) the same solution. See look_up_lock_class() in + * kernel/locking/lockdep.c for details. + */ + try_init_preload(); + + r = ll_pthread_mutex_init(mutex, attr); + if (r == 0) + /* + * We do a dummy initialization here so that lockdep could + * warn us if something fishy is going on - such as + * initializing a held lock. + */ + __get_lock(mutex); + + return r; +} + +int pthread_mutex_lock(pthread_mutex_t *mutex) +{ + int r; + + try_init_preload(); + + lock_acquire(&__get_lock(mutex)->dep_map, 0, 0, 0, 2, NULL, + (unsigned long)_RET_IP_); + /* + * Here's the thing with pthread mutexes: unlike the kernel variant, + * they can fail. + * + * This means that the behaviour here is a bit different from what's + * going on in the kernel: there we just tell lockdep that we took the + * lock before actually taking it, but here we must deal with the case + * that locking failed. + * + * To do that we'll "release" the lock if locking failed - this way + * we'll get lockdep doing the correct checks when we try to take + * the lock, and if that fails - we'll be back to the correct + * state by releasing it. + */ + r = ll_pthread_mutex_lock(mutex); + if (r) + lock_release(&__get_lock(mutex)->dep_map, 0, (unsigned long)_RET_IP_); + + return r; +} + +int pthread_mutex_trylock(pthread_mutex_t *mutex) +{ + int r; + + try_init_preload(); + + lock_acquire(&__get_lock(mutex)->dep_map, 0, 1, 0, 2, NULL, (unsigned long)_RET_IP_); + r = ll_pthread_mutex_trylock(mutex); + if (r) + lock_release(&__get_lock(mutex)->dep_map, 0, (unsigned long)_RET_IP_); + + return r; +} + +int pthread_mutex_unlock(pthread_mutex_t *mutex) +{ + int r; + + try_init_preload(); + + lock_release(&__get_lock(mutex)->dep_map, 0, (unsigned long)_RET_IP_); + /* + * Just like taking a lock, only in reverse! + * + * If we fail releasing the lock, tell lockdep we're holding it again. + */ + r = ll_pthread_mutex_unlock(mutex); + if (r) + lock_acquire(&__get_lock(mutex)->dep_map, 0, 0, 0, 2, NULL, (unsigned long)_RET_IP_); + + return r; +} + +int pthread_mutex_destroy(pthread_mutex_t *mutex) +{ + try_init_preload(); + + /* + * Let's see if we're releasing a lock that's held. + * + * TODO: Hook into free() and add that check there as well. + */ + debug_check_no_locks_freed(mutex, mutex + sizeof(*mutex)); + __del_lock(__get_lock(mutex)); + return ll_pthread_mutex_destroy(mutex); +} + +/* This is the rwlock part, very similar to what happened with mutex above */ +int pthread_rwlock_init(pthread_rwlock_t *rwlock, + const pthread_rwlockattr_t *attr) +{ + int r; + + try_init_preload(); + + r = ll_pthread_rwlock_init(rwlock, attr); + if (r == 0) + __get_lock(rwlock); + + return r; +} + +int pthread_rwlock_destroy(pthread_rwlock_t *rwlock) +{ + try_init_preload(); + + debug_check_no_locks_freed(rwlock, rwlock + sizeof(*rwlock)); + __del_lock(__get_lock(rwlock)); + return ll_pthread_rwlock_destroy(rwlock); +} + +int pthread_rwlock_rdlock(pthread_rwlock_t *rwlock) +{ + int r; + + init_preload(); + + lock_acquire(&__get_lock(rwlock)->dep_map, 0, 0, 2, 2, NULL, (unsigned long)_RET_IP_); + r = ll_pthread_rwlock_rdlock(rwlock); + if (r) + lock_release(&__get_lock(rwlock)->dep_map, 0, (unsigned long)_RET_IP_); + + return r; +} + +int pthread_rwlock_tryrdlock(pthread_rwlock_t *rwlock) +{ + int r; + + init_preload(); + + lock_acquire(&__get_lock(rwlock)->dep_map, 0, 1, 2, 2, NULL, (unsigned long)_RET_IP_); + r = ll_pthread_rwlock_tryrdlock(rwlock); + if (r) + lock_release(&__get_lock(rwlock)->dep_map, 0, (unsigned long)_RET_IP_); + + return r; +} + +int pthread_rwlock_trywrlock(pthread_rwlock_t *rwlock) +{ + int r; + + init_preload(); + + lock_acquire(&__get_lock(rwlock)->dep_map, 0, 1, 0, 2, NULL, (unsigned long)_RET_IP_); + r = ll_pthread_rwlock_trywrlock(rwlock); + if (r) + lock_release(&__get_lock(rwlock)->dep_map, 0, (unsigned long)_RET_IP_); + + return r; +} + +int pthread_rwlock_wrlock(pthread_rwlock_t *rwlock) +{ + int r; + + init_preload(); + + lock_acquire(&__get_lock(rwlock)->dep_map, 0, 0, 0, 2, NULL, (unsigned long)_RET_IP_); + r = ll_pthread_rwlock_wrlock(rwlock); + if (r) + lock_release(&__get_lock(rwlock)->dep_map, 0, (unsigned long)_RET_IP_); + + return r; +} + +int pthread_rwlock_unlock(pthread_rwlock_t *rwlock) +{ + int r; + + init_preload(); + + lock_release(&__get_lock(rwlock)->dep_map, 0, (unsigned long)_RET_IP_); + r = ll_pthread_rwlock_unlock(rwlock); + if (r) + lock_acquire(&__get_lock(rwlock)->dep_map, 0, 0, 0, 2, NULL, (unsigned long)_RET_IP_); + + return r; +} + +__attribute__((constructor)) static void init_preload(void) +{ + if (__init_state != done) + return; + +#ifndef __GLIBC__ + __init_state = prepare; + + ll_pthread_mutex_init = dlsym(RTLD_NEXT, "pthread_mutex_init"); + ll_pthread_mutex_lock = dlsym(RTLD_NEXT, "pthread_mutex_lock"); + ll_pthread_mutex_trylock = dlsym(RTLD_NEXT, "pthread_mutex_trylock"); + ll_pthread_mutex_unlock = dlsym(RTLD_NEXT, "pthread_mutex_unlock"); + ll_pthread_mutex_destroy = dlsym(RTLD_NEXT, "pthread_mutex_destroy"); + + ll_pthread_rwlock_init = dlsym(RTLD_NEXT, "pthread_rwlock_init"); + ll_pthread_rwlock_destroy = dlsym(RTLD_NEXT, "pthread_rwlock_destroy"); + ll_pthread_rwlock_rdlock = dlsym(RTLD_NEXT, "pthread_rwlock_rdlock"); + ll_pthread_rwlock_tryrdlock = dlsym(RTLD_NEXT, "pthread_rwlock_tryrdlock"); + ll_pthread_rwlock_wrlock = dlsym(RTLD_NEXT, "pthread_rwlock_wrlock"); + ll_pthread_rwlock_trywrlock = dlsym(RTLD_NEXT, "pthread_rwlock_trywrlock"); + ll_pthread_rwlock_unlock = dlsym(RTLD_NEXT, "pthread_rwlock_unlock"); +#endif + + printf("%p\n", ll_pthread_mutex_trylock);fflush(stdout); + + lockdep_init(); + + __init_state = done; +} From f612ac05b7ce66919507d25f4c81e4272f7a8705 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 13 Jun 2013 18:41:23 -0400 Subject: [PATCH 12/34] liblockdep: Add the 'lockdep' user-space utility This is a simple wrapper to make using liblockdep on existing applications much easier. After running 'make && make install', it becomes quite simple to test things with liblockdep. For example, to try it on perf: lockdep perf No other integration required. Signed-off-by: Sasha Levin Signed-off-by: Peter Zijlstra Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/1371163284-6346-9-git-send-email-sasha.levin@oracle.com [ Changed it to load ./liblockdep.so, so it can be tested in situ. ] Signed-off-by: Ingo Molnar --- tools/lib/lockdep/lockdep | 3 +++ 1 file changed, 3 insertions(+) create mode 100755 tools/lib/lockdep/lockdep diff --git a/tools/lib/lockdep/lockdep b/tools/lib/lockdep/lockdep new file mode 100755 index 000000000000..49af9fe19f5b --- /dev/null +++ b/tools/lib/lockdep/lockdep @@ -0,0 +1,3 @@ +#!/bin/bash + +LD_PRELOAD="./liblockdep.so $LD_PRELOAD" "$@" From 1acd437c59c7dd78d8f1e5f07bea6a18b7b5cd77 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 13 Jun 2013 18:41:24 -0400 Subject: [PATCH 13/34] liblockdep: Add a MAINTAINERS entry Signed-off-by: Sasha Levin Signed-off-by: Peter Zijlstra Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/1371163284-6346-10-git-send-email-sasha.levin@oracle.com Signed-off-by: Ingo Molnar --- MAINTAINERS | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 63f30484932b..9e62574cca4a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5111,6 +5111,11 @@ F: drivers/lguest/ F: include/linux/lguest*.h F: tools/lguest/ +LIBLOCKDEP +M: Sasha Levin +S: Maintained +F: tools/lib/lockdep/ + LINUX FOR IBM pSERIES (RS/6000) M: Paul Mackerras W: http://www.ibm.com/linux/ltc/projects/ppc From 854ff82ab085aed7509f6fb90f79ce6c41ee1fa4 Mon Sep 17 00:00:00 2001 From: Henrik Austad Date: Wed, 27 Nov 2013 23:18:00 +0100 Subject: [PATCH 14/34] Documentation/robust-futex-API: Count properly to 4 A strictly monotonically increasing sequence is normally used in numbered lists as they provide an intuitive ordering of the elements. However, in situations where race conditions can appear, this assumption breaks down and you can end up with unpredictable results, leading to a rather confusing list :-) This changes the numbered list 1,2,2,2 to the more intuitive 1,2,3,4. Introduced in: 2eec9ad91f71 [PATCH] lightweight robust futexes: docs Signed-off-by: Henrik Austad Link: http://lkml.kernel.org/r/1385590680-8110-1-git-send-email-henrik@austad.us Signed-off-by: Ingo Molnar --- Documentation/robust-futex-ABI.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/robust-futex-ABI.txt b/Documentation/robust-futex-ABI.txt index fd1cd8aae4eb..16eb314f56cc 100644 --- a/Documentation/robust-futex-ABI.txt +++ b/Documentation/robust-futex-ABI.txt @@ -146,8 +146,8 @@ On removal: 1) set the 'list_op_pending' word to the address of the 'lock entry' to be removed, 2) remove the lock entry for this lock from the 'head' list, - 2) release the futex lock, and - 2) clear the 'lock_op_pending' word. + 3) release the futex lock, and + 4) clear the 'lock_op_pending' word. On exit, the kernel will consider the address stored in 'list_op_pending' and the address of each 'lock word' found by walking From 962d9c5757b2ebd30678a88a4f475ae997f4d5dd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 30 Nov 2013 15:20:15 +0100 Subject: [PATCH 15/34] Revert "smp/cpumask: Make CONFIG_CPUMASK_OFFSTACK=y usable without debug dependency" This reverts commit 9dd1220114e00d8ec5cdc20085bbe198b21e1985. Revert it until Linus's concerns are addressed: this option should not allow nonsensical CONFIG_CPUMASK_OFFSTACK and CONFIG_NR_CPUS values, and it should probably select sane defaults as well. Cc: Josh Boyer Cc: Rusty Russell Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-etcruvuw9neycYf0Rripxrjv@git.kernel.org Signed-off-by: Ingo Molnar --- lib/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/Kconfig b/lib/Kconfig index 3f7fb4de6ee7..06dc74200a51 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -342,8 +342,7 @@ config CHECK_SIGNATURE bool config CPUMASK_OFFSTACK - bool "Force CPU masks off stack" - depends on SMP + bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS help Use dynamic allocation for cpumask_var_t, instead of putting them on the stack. This is a bit more expensive, but avoids From 2ecf810121c7eae34473b8fa108112036bc61127 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Dec 2013 13:59:04 -0800 Subject: [PATCH 16/34] Documentation/memory-barriers.txt: Add needed ACCESS_ONCE() calls to memory-barriers.txt The Documentation/memory-barriers.txt file was written before the need for ACCESS_ONCE() was fully appreciated. It therefore contains no ACCESS_ONCE() calls, which can be a problem when people lift examples from it. This commit therefore adds ACCESS_ONCE() calls. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Reviewed-by: Peter Zijlstra Cc: Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1386799151-2219-1-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 206 ++++++++++++++++++------------ 1 file changed, 126 insertions(+), 80 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 020cccdbdd0c..1d067235b0bc 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -194,18 +194,22 @@ There are some minimal guarantees that may be expected of a CPU: (*) On any given CPU, dependent memory accesses will be issued in order, with respect to itself. This means that for: - Q = P; D = *Q; + ACCESS_ONCE(Q) = P; smp_read_barrier_depends(); D = ACCESS_ONCE(*Q); the CPU will issue the following memory operations: Q = LOAD P, D = LOAD *Q - and always in that order. + and always in that order. On most systems, smp_read_barrier_depends() + does nothing, but it is required for DEC Alpha. The ACCESS_ONCE() + is required to prevent compiler mischief. Please note that you + should normally use something like rcu_dereference() instead of + open-coding smp_read_barrier_depends(). (*) Overlapping loads and stores within a particular CPU will appear to be ordered within that CPU. This means that for: - a = *X; *X = b; + a = ACCESS_ONCE(*X); ACCESS_ONCE(*X) = b; the CPU will only issue the following sequence of memory operations: @@ -213,7 +217,7 @@ There are some minimal guarantees that may be expected of a CPU: And for: - *X = c; d = *X; + ACCESS_ONCE(*X) = c; d = ACCESS_ONCE(*X); the CPU will only issue: @@ -224,6 +228,41 @@ There are some minimal guarantees that may be expected of a CPU: And there are a number of things that _must_ or _must_not_ be assumed: + (*) It _must_not_ be assumed that the compiler will do what you want with + memory references that are not protected by ACCESS_ONCE(). Without + ACCESS_ONCE(), the compiler is within its rights to do all sorts + of "creative" transformations: + + (-) Repeat the load, possibly getting a different value on the second + and subsequent loads. This is especially prone to happen when + register pressure is high. + + (-) Merge adjacent loads and stores to the same location. The most + familiar example is the transformation from: + + while (a) + do_something(); + + to something like: + + if (a) + for (;;) + do_something(); + + Using ACCESS_ONCE() as follows prevents this sort of optimization: + + while (ACCESS_ONCE(a)) + do_something(); + + (-) "Store tearing", where a single store in the source code is split + into smaller stores in the object code. Note that gcc really + will do this on some architectures when storing certain constants. + It can be cheaper to do a series of immediate stores than to + form the constant in a register and then to store that register. + + (-) "Load tearing", which splits loads in a manner analogous to + store tearing. + (*) It _must_not_ be assumed that independent loads and stores will be issued in the order given. This means that for: @@ -450,14 +489,14 @@ The usage requirements of data dependency barriers are a little subtle, and it's not always obvious that they're needed. To illustrate, consider the following sequence of events: - CPU 1 CPU 2 - =============== =============== + CPU 1 CPU 2 + =============== =============== { A == 1, B == 2, C = 3, P == &A, Q == &C } B = 4; - P = &B - Q = P; - D = *Q; + ACCESS_ONCE(P) = &B + Q = ACCESS_ONCE(P); + D = *Q; There's a clear data dependency here, and it would seem that by the end of the sequence, Q must be either &A or &B, and that: @@ -477,15 +516,15 @@ Alpha). To deal with this, a data dependency barrier or better must be inserted between the address load and the data load: - CPU 1 CPU 2 - =============== =============== + CPU 1 CPU 2 + =============== =============== { A == 1, B == 2, C = 3, P == &A, Q == &C } B = 4; - P = &B - Q = P; - - D = *Q; + ACCESS_ONCE(P) = &B + Q = ACCESS_ONCE(P); + + D = *Q; This enforces the occurrence of one of the two implications, and prevents the third possibility from arising. @@ -504,21 +543,22 @@ Another example of where data dependency barriers might be required is where a number is read from memory and then used to calculate the index for an array access: - CPU 1 CPU 2 - =============== =============== + CPU 1 CPU 2 + =============== =============== { M[0] == 1, M[1] == 2, M[3] = 3, P == 0, Q == 3 } M[1] = 4; - P = 1 - Q = P; - - D = M[Q]; + ACCESS_ONCE(P) = 1 + Q = ACCESS_ONCE(P); + + D = M[Q]; -The data dependency barrier is very important to the RCU system, for example. -See rcu_dereference() in include/linux/rcupdate.h. This permits the current -target of an RCU'd pointer to be replaced with a new modified target, without -the replacement target appearing to be incompletely initialised. +The data dependency barrier is very important to the RCU system, +for example. See rcu_assign_pointer() and rcu_dereference() in +include/linux/rcupdate.h. This permits the current target of an RCU'd +pointer to be replaced with a new modified target, without the replacement +target appearing to be incompletely initialised. See also the subsection on "Cache Coherency" for a more thorough example. @@ -530,22 +570,23 @@ A control dependency requires a full read memory barrier, not simply a data dependency barrier to make it work correctly. Consider the following bit of code: - q = &a; + q = ACCESS_ONCE(a); if (p) { - q = &b; + q = ACCESS_ONCE(b); } x = *q; This will not have the desired effect because there is no actual data -dependency, but rather a control dependency that the CPU may short-circuit by -attempting to predict the outcome in advance. In such a case what's actually -required is: +dependency, but rather a control dependency that the CPU may short-circuit +by attempting to predict the outcome in advance, so that other CPUs see +the load from b as having happened before the load from a. In such a +case what's actually required is: - q = &a; + q = ACCESS_ONCE(a); if (p) { - q = &b; + q = ACCESS_ONCE(b); } x = *q; @@ -561,23 +602,23 @@ barrier, though a general barrier would also be viable. Similarly a read barrier or a data dependency barrier should always be paired with at least an write barrier, though, again, a general barrier is viable: - CPU 1 CPU 2 - =============== =============== - a = 1; + CPU 1 CPU 2 + =============== =============== + ACCESS_ONCE(a) = 1; - b = 2; x = b; - - y = a; + ACCESS_ONCE(b) = 2; x = ACCESS_ONCE(b); + + y = ACCESS_ONCE(a); Or: - CPU 1 CPU 2 - =============== =============================== + CPU 1 CPU 2 + =============== =============================== a = 1; - b = &a; x = b; - - y = *x; + ACCESS_ONCE(b) = &a; x = ACCESS_ONCE(b); + + y = *x; Basically, the read barrier always has to be there, even though it can be of the "weaker" type. @@ -586,13 +627,13 @@ the "weaker" type. match the loads after the read barrier or the data dependency barrier, and vice versa: - CPU 1 CPU 2 - =============== =============== - a = 1; }---- --->{ v = c - b = 2; } \ / { w = d - \ - c = 3; } / \ { x = a; - d = 4; }---- --->{ y = b; + CPU 1 CPU 2 + =================== =================== + ACCESS_ONCE(a) = 1; }---- --->{ v = ACCESS_ONCE(c); + ACCESS_ONCE(b) = 2; } \ / { w = ACCESS_ONCE(d); + \ + ACCESS_ONCE(c) = 3; } / \ { x = ACCESS_ONCE(a); + ACCESS_ONCE(d) = 4; }---- --->{ y = ACCESS_ONCE(b); EXAMPLES OF MEMORY BARRIER SEQUENCES @@ -1435,12 +1476,12 @@ three CPUs; then should the following sequence of events occur: CPU 1 CPU 2 =============================== =============================== - *A = a; *E = e; + ACCESS_ONCE(*A) = a; ACCESS_ONCE(*E) = e; LOCK M LOCK Q - *B = b; *F = f; - *C = c; *G = g; + ACCESS_ONCE(*B) = b; ACCESS_ONCE(*F) = f; + ACCESS_ONCE(*C) = c; ACCESS_ONCE(*G) = g; UNLOCK M UNLOCK Q - *D = d; *H = h; + ACCESS_ONCE(*D) = d; ACCESS_ONCE(*H) = h; Then there is no guarantee as to what order CPU 3 will see the accesses to *A through *H occur in, other than the constraints imposed by the separate locks @@ -1460,17 +1501,17 @@ However, if the following occurs: CPU 1 CPU 2 =============================== =============================== - *A = a; - LOCK M [1] - *B = b; - *C = c; - UNLOCK M [1] - *D = d; *E = e; - LOCK M [2] - *F = f; - *G = g; - UNLOCK M [2] - *H = h; + ACCESS_ONCE(*A) = a; + LOCK M [1] + ACCESS_ONCE(*B) = b; + ACCESS_ONCE(*C) = c; + UNLOCK M [1] + ACCESS_ONCE(*D) = d; ACCESS_ONCE(*E) = e; + LOCK M [2] + ACCESS_ONCE(*F) = f; + ACCESS_ONCE(*G) = g; + UNLOCK M [2] + ACCESS_ONCE(*H) = h; CPU 3 might see: @@ -2177,11 +2218,11 @@ A programmer might take it for granted that the CPU will perform memory operations in exactly the order specified, so that if the CPU is, for example, given the following piece of code to execute: - a = *A; - *B = b; - c = *C; - d = *D; - *E = e; + a = ACCESS_ONCE(*A); + ACCESS_ONCE(*B) = b; + c = ACCESS_ONCE(*C); + d = ACCESS_ONCE(*D); + ACCESS_ONCE(*E) = e; they would then expect that the CPU will complete the memory operation for each instruction before moving on to the next one, leading to a definite sequence of @@ -2228,12 +2269,12 @@ However, it is guaranteed that a CPU will be self-consistent: it will see its _own_ accesses appear to be correctly ordered, without the need for a memory barrier. For instance with the following code: - U = *A; - *A = V; - *A = W; - X = *A; - *A = Y; - Z = *A; + U = ACCESS_ONCE(*A); + ACCESS_ONCE(*A) = V; + ACCESS_ONCE(*A) = W; + X = ACCESS_ONCE(*A); + ACCESS_ONCE(*A) = Y; + Z = ACCESS_ONCE(*A); and assuming no intervention by an external influence, it can be assumed that the final result will appear to be: @@ -2250,7 +2291,12 @@ accesses: in that order, but, without intervention, the sequence may have almost any combination of elements combined or discarded, provided the program's view of -the world remains consistent. +the world remains consistent. Note that ACCESS_ONCE() is -not- optional +in the above example, as there are architectures where a given CPU might +interchange successive loads to the same location. On such architectures, +ACCESS_ONCE() does whatever is necessary to prevent this, for example, on +Itanium the volatile casts used by ACCESS_ONCE() cause GCC to emit the +special ld.acq and st.rel instructions that prevent such reordering. The compiler may also combine, discard or defer elements of the sequence before the CPU even sees them. @@ -2264,13 +2310,13 @@ may be reduced to: *A = W; -since, without a write barrier, it can be assumed that the effect of the -storage of V to *A is lost. Similarly: +since, without either a write barrier or an ACCESS_ONCE(), it can be +assumed that the effect of the storage of V to *A is lost. Similarly: *A = Y; Z = *A; -may, without a memory barrier, be reduced to: +may, without a memory barrier or an ACCESS_ONCE(), be reduced to: *A = Y; Z = Y; From fb2b581968db140586e8d7db38ff278f60872313 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Dec 2013 13:59:05 -0800 Subject: [PATCH 17/34] Documentation/memory-barriers.txt: Add long atomic examples to memory-barriers.txt Although the atomic_long_t functions are quite useful, they are a bit obscure. This commit therefore adds the common ones alongside their atomic_t counterparts in Documentation/memory-barriers.txt. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Reviewed-by: Peter Zijlstra Cc: Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1386799151-2219-2-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 1d067235b0bc..2d22da095a60 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -1728,21 +1728,23 @@ explicit lock operations, described later). These include: xchg(); cmpxchg(); - atomic_xchg(); - atomic_cmpxchg(); - atomic_inc_return(); - atomic_dec_return(); - atomic_add_return(); - atomic_sub_return(); - atomic_inc_and_test(); - atomic_dec_and_test(); - atomic_sub_and_test(); - atomic_add_negative(); - atomic_add_unless(); /* when succeeds (returns 1) */ + atomic_xchg(); atomic_long_xchg(); + atomic_cmpxchg(); atomic_long_cmpxchg(); + atomic_inc_return(); atomic_long_inc_return(); + atomic_dec_return(); atomic_long_dec_return(); + atomic_add_return(); atomic_long_add_return(); + atomic_sub_return(); atomic_long_sub_return(); + atomic_inc_and_test(); atomic_long_inc_and_test(); + atomic_dec_and_test(); atomic_long_dec_and_test(); + atomic_sub_and_test(); atomic_long_sub_and_test(); + atomic_add_negative(); atomic_long_add_negative(); test_and_set_bit(); test_and_clear_bit(); test_and_change_bit(); + /* when succeeds (returns 1) */ + atomic_add_unless(); atomic_long_add_unless(); + These are used for such things as implementing LOCK-class and UNLOCK-class operations and adjusting reference counters towards object destruction, and as such the implicit memory barrier effects are necessary. From 18c03c61444a211237f3d4782353cb38dba795df Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Dec 2013 13:59:06 -0800 Subject: [PATCH 18/34] Documentation/memory-barriers.txt: Prohibit speculative writes No SMP architecture currently supporting Linux allows speculative writes, so this commit updates Documentation/memory-barriers.txt to prohibit them in Linux core code. It also records restrictions on their use. Signed-off-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Reviewed-by: Peter Zijlstra Cc: Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1386799151-2219-3-git-send-email-paulmck@linux.vnet.ibm.com [ Paul modified the original patch from Peter. ] Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 183 ++++++++++++++++++++++++++++-- 1 file changed, 175 insertions(+), 8 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 2d22da095a60..deafa36aeea1 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -571,11 +571,10 @@ dependency barrier to make it work correctly. Consider the following bit of code: q = ACCESS_ONCE(a); - if (p) { - - q = ACCESS_ONCE(b); + if (q) { + /* BUG: No data dependency!!! */ + p = ACCESS_ONCE(b); } - x = *q; This will not have the desired effect because there is no actual data dependency, but rather a control dependency that the CPU may short-circuit @@ -584,11 +583,176 @@ the load from b as having happened before the load from a. In such a case what's actually required is: q = ACCESS_ONCE(a); - if (p) { + if (q) { - q = ACCESS_ONCE(b); + p = ACCESS_ONCE(b); } - x = *q; + +However, stores are not speculated. This means that ordering -is- provided +in the following example: + + q = ACCESS_ONCE(a); + if (ACCESS_ONCE(q)) { + ACCESS_ONCE(b) = p; + } + +Please note that ACCESS_ONCE() is not optional! Without the ACCESS_ONCE(), +the compiler is within its rights to transform this example: + + q = a; + if (q) { + b = p; /* BUG: Compiler can reorder!!! */ + do_something(); + } else { + b = p; /* BUG: Compiler can reorder!!! */ + do_something_else(); + } + +into this, which of course defeats the ordering: + + b = p; + q = a; + if (q) + do_something(); + else + do_something_else(); + +Worse yet, if the compiler is able to prove (say) that the value of +variable 'a' is always non-zero, it would be well within its rights +to optimize the original example by eliminating the "if" statement +as follows: + + q = a; + b = p; /* BUG: Compiler can reorder!!! */ + do_something(); + +The solution is again ACCESS_ONCE(), which preserves the ordering between +the load from variable 'a' and the store to variable 'b': + + q = ACCESS_ONCE(a); + if (q) { + ACCESS_ONCE(b) = p; + do_something(); + } else { + ACCESS_ONCE(b) = p; + do_something_else(); + } + +You could also use barrier() to prevent the compiler from moving +the stores to variable 'b', but barrier() would not prevent the +compiler from proving to itself that a==1 always, so ACCESS_ONCE() +is also needed. + +It is important to note that control dependencies absolutely require a +a conditional. For example, the following "optimized" version of +the above example breaks ordering: + + q = ACCESS_ONCE(a); + ACCESS_ONCE(b) = p; /* BUG: No ordering vs. load from a!!! */ + if (q) { + /* ACCESS_ONCE(b) = p; -- moved up, BUG!!! */ + do_something(); + } else { + /* ACCESS_ONCE(b) = p; -- moved up, BUG!!! */ + do_something_else(); + } + +It is of course legal for the prior load to be part of the conditional, +for example, as follows: + + if (ACCESS_ONCE(a) > 0) { + ACCESS_ONCE(b) = q / 2; + do_something(); + } else { + ACCESS_ONCE(b) = q / 3; + do_something_else(); + } + +This will again ensure that the load from variable 'a' is ordered before the +stores to variable 'b'. + +In addition, you need to be careful what you do with the local variable 'q', +otherwise the compiler might be able to guess the value and again remove +the needed conditional. For example: + + q = ACCESS_ONCE(a); + if (q % MAX) { + ACCESS_ONCE(b) = p; + do_something(); + } else { + ACCESS_ONCE(b) = p; + do_something_else(); + } + +If MAX is defined to be 1, then the compiler knows that (q % MAX) is +equal to zero, in which case the compiler is within its rights to +transform the above code into the following: + + q = ACCESS_ONCE(a); + ACCESS_ONCE(b) = p; + do_something_else(); + +This transformation loses the ordering between the load from variable 'a' +and the store to variable 'b'. If you are relying on this ordering, you +should do something like the following: + + q = ACCESS_ONCE(a); + BUILD_BUG_ON(MAX <= 1); /* Order load from a with store to b. */ + if (q % MAX) { + ACCESS_ONCE(b) = p; + do_something(); + } else { + ACCESS_ONCE(b) = p; + do_something_else(); + } + +Finally, control dependencies do -not- provide transitivity. This is +demonstrated by two related examples: + + CPU 0 CPU 1 + ===================== ===================== + r1 = ACCESS_ONCE(x); r2 = ACCESS_ONCE(y); + if (r1 >= 0) if (r2 >= 0) + ACCESS_ONCE(y) = 1; ACCESS_ONCE(x) = 1; + + assert(!(r1 == 1 && r2 == 1)); + +The above two-CPU example will never trigger the assert(). However, +if control dependencies guaranteed transitivity (which they do not), +then adding the following two CPUs would guarantee a related assertion: + + CPU 2 CPU 3 + ===================== ===================== + ACCESS_ONCE(x) = 2; ACCESS_ONCE(y) = 2; + + assert(!(r1 == 2 && r2 == 2 && x == 1 && y == 1)); /* FAILS!!! */ + +But because control dependencies do -not- provide transitivity, the +above assertion can fail after the combined four-CPU example completes. +If you need the four-CPU example to provide ordering, you will need +smp_mb() between the loads and stores in the CPU 0 and CPU 1 code fragments. + +In summary: + + (*) Control dependencies can order prior loads against later stores. + However, they do -not- guarantee any other sort of ordering: + Not prior loads against later loads, nor prior stores against + later anything. If you need these other forms of ordering, + use smb_rmb(), smp_wmb(), or, in the case of prior stores and + later loads, smp_mb(). + + (*) Control dependencies require at least one run-time conditional + between the prior load and the subsequent store. If the compiler + is able to optimize the conditional away, it will have also + optimized away the ordering. Careful use of ACCESS_ONCE() can + help to preserve the needed conditional. + + (*) Control dependencies require that the compiler avoid reordering the + dependency into nonexistence. Careful use of ACCESS_ONCE() or + barrier() can help to preserve your control dependency. + + (*) Control dependencies do -not- provide transitivity. If you + need transitivity, use smp_mb(). SMP BARRIER PAIRING @@ -1083,7 +1247,10 @@ compiler from moving the memory accesses either side of it to the other side: barrier(); -This is a general barrier - lesser varieties of compiler barrier do not exist. +This is a general barrier -- there are no read-read or write-write variants +of barrier(). Howevever, ACCESS_ONCE() can be thought of as a weak form +for barrier() that affects only the specific accesses flagged by the +ACCESS_ONCE(). The compiler barrier has no direct effect on the CPU, which may then reorder things however it wishes. From 692118dac47e65f5131686b1103ebfebf0cbfa8e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Dec 2013 13:59:07 -0800 Subject: [PATCH 19/34] Documentation/memory-barriers.txt: Document ACCESS_ONCE() The situations in which ACCESS_ONCE() is required are not well documented, so this commit adds some verbiage to memory-barriers.txt. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Reviewed-by: Peter Zijlstra Cc: Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1386799151-2219-4-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 306 ++++++++++++++++++++++++++---- 1 file changed, 271 insertions(+), 35 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index deafa36aeea1..919fd604969d 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -231,37 +231,8 @@ And there are a number of things that _must_ or _must_not_ be assumed: (*) It _must_not_ be assumed that the compiler will do what you want with memory references that are not protected by ACCESS_ONCE(). Without ACCESS_ONCE(), the compiler is within its rights to do all sorts - of "creative" transformations: - - (-) Repeat the load, possibly getting a different value on the second - and subsequent loads. This is especially prone to happen when - register pressure is high. - - (-) Merge adjacent loads and stores to the same location. The most - familiar example is the transformation from: - - while (a) - do_something(); - - to something like: - - if (a) - for (;;) - do_something(); - - Using ACCESS_ONCE() as follows prevents this sort of optimization: - - while (ACCESS_ONCE(a)) - do_something(); - - (-) "Store tearing", where a single store in the source code is split - into smaller stores in the object code. Note that gcc really - will do this on some architectures when storing certain constants. - It can be cheaper to do a series of immediate stores than to - form the constant in a register and then to store that register. - - (-) "Load tearing", which splits loads in a manner analogous to - store tearing. + of "creative" transformations, which are covered in the Compiler + Barrier section. (*) It _must_not_ be assumed that independent loads and stores will be issued in the order given. This means that for: @@ -749,7 +720,8 @@ In summary: (*) Control dependencies require that the compiler avoid reordering the dependency into nonexistence. Careful use of ACCESS_ONCE() or - barrier() can help to preserve your control dependency. + barrier() can help to preserve your control dependency. Please + see the Compiler Barrier section for more information. (*) Control dependencies do -not- provide transitivity. If you need transitivity, use smp_mb(). @@ -1248,12 +1220,276 @@ compiler from moving the memory accesses either side of it to the other side: barrier(); This is a general barrier -- there are no read-read or write-write variants -of barrier(). Howevever, ACCESS_ONCE() can be thought of as a weak form +of barrier(). However, ACCESS_ONCE() can be thought of as a weak form for barrier() that affects only the specific accesses flagged by the ACCESS_ONCE(). -The compiler barrier has no direct effect on the CPU, which may then reorder -things however it wishes. +The barrier() function has the following effects: + + (*) Prevents the compiler from reordering accesses following the + barrier() to precede any accesses preceding the barrier(). + One example use for this property is to ease communication between + interrupt-handler code and the code that was interrupted. + + (*) Within a loop, forces the compiler to load the variables used + in that loop's conditional on each pass through that loop. + +The ACCESS_ONCE() function can prevent any number of optimizations that, +while perfectly safe in single-threaded code, can be fatal in concurrent +code. Here are some examples of these sorts of optimizations: + + (*) The compiler is within its rights to merge successive loads from + the same variable. Such merging can cause the compiler to "optimize" + the following code: + + while (tmp = a) + do_something_with(tmp); + + into the following code, which, although in some sense legitimate + for single-threaded code, is almost certainly not what the developer + intended: + + if (tmp = a) + for (;;) + do_something_with(tmp); + + Use ACCESS_ONCE() to prevent the compiler from doing this to you: + + while (tmp = ACCESS_ONCE(a)) + do_something_with(tmp); + + (*) The compiler is within its rights to reload a variable, for example, + in cases where high register pressure prevents the compiler from + keeping all data of interest in registers. The compiler might + therefore optimize the variable 'tmp' out of our previous example: + + while (tmp = a) + do_something_with(tmp); + + This could result in the following code, which is perfectly safe in + single-threaded code, but can be fatal in concurrent code: + + while (a) + do_something_with(a); + + For example, the optimized version of this code could result in + passing a zero to do_something_with() in the case where the variable + a was modified by some other CPU between the "while" statement and + the call to do_something_with(). + + Again, use ACCESS_ONCE() to prevent the compiler from doing this: + + while (tmp = ACCESS_ONCE(a)) + do_something_with(tmp); + + Note that if the compiler runs short of registers, it might save + tmp onto the stack. The overhead of this saving and later restoring + is why compilers reload variables. Doing so is perfectly safe for + single-threaded code, so you need to tell the compiler about cases + where it is not safe. + + (*) The compiler is within its rights to omit a load entirely if it knows + what the value will be. For example, if the compiler can prove that + the value of variable 'a' is always zero, it can optimize this code: + + while (tmp = a) + do_something_with(tmp); + + Into this: + + do { } while (0); + + This transformation is a win for single-threaded code because it gets + rid of a load and a branch. The problem is that the compiler will + carry out its proof assuming that the current CPU is the only one + updating variable 'a'. If variable 'a' is shared, then the compiler's + proof will be erroneous. Use ACCESS_ONCE() to tell the compiler + that it doesn't know as much as it thinks it does: + + while (tmp = ACCESS_ONCE(a)) + do_something_with(tmp); + + But please note that the compiler is also closely watching what you + do with the value after the ACCESS_ONCE(). For example, suppose you + do the following and MAX is a preprocessor macro with the value 1: + + while ((tmp = ACCESS_ONCE(a)) % MAX) + do_something_with(tmp); + + Then the compiler knows that the result of the "%" operator applied + to MAX will always be zero, again allowing the compiler to optimize + the code into near-nonexistence. (It will still load from the + variable 'a'.) + + (*) Similarly, the compiler is within its rights to omit a store entirely + if it knows that the variable already has the value being stored. + Again, the compiler assumes that the current CPU is the only one + storing into the variable, which can cause the compiler to do the + wrong thing for shared variables. For example, suppose you have + the following: + + a = 0; + /* Code that does not store to variable a. */ + a = 0; + + The compiler sees that the value of variable 'a' is already zero, so + it might well omit the second store. This would come as a fatal + surprise if some other CPU might have stored to variable 'a' in the + meantime. + + Use ACCESS_ONCE() to prevent the compiler from making this sort of + wrong guess: + + ACCESS_ONCE(a) = 0; + /* Code that does not store to variable a. */ + ACCESS_ONCE(a) = 0; + + (*) The compiler is within its rights to reorder memory accesses unless + you tell it not to. For example, consider the following interaction + between process-level code and an interrupt handler: + + void process_level(void) + { + msg = get_message(); + flag = true; + } + + void interrupt_handler(void) + { + if (flag) + process_message(msg); + } + + There is nothing to prevent the the compiler from transforming + process_level() to the following, in fact, this might well be a + win for single-threaded code: + + void process_level(void) + { + flag = true; + msg = get_message(); + } + + If the interrupt occurs between these two statement, then + interrupt_handler() might be passed a garbled msg. Use ACCESS_ONCE() + to prevent this as follows: + + void process_level(void) + { + ACCESS_ONCE(msg) = get_message(); + ACCESS_ONCE(flag) = true; + } + + void interrupt_handler(void) + { + if (ACCESS_ONCE(flag)) + process_message(ACCESS_ONCE(msg)); + } + + Note that the ACCESS_ONCE() wrappers in interrupt_handler() + are needed if this interrupt handler can itself be interrupted + by something that also accesses 'flag' and 'msg', for example, + a nested interrupt or an NMI. Otherwise, ACCESS_ONCE() is not + needed in interrupt_handler() other than for documentation purposes. + (Note also that nested interrupts do not typically occur in modern + Linux kernels, in fact, if an interrupt handler returns with + interrupts enabled, you will get a WARN_ONCE() splat.) + + You should assume that the compiler can move ACCESS_ONCE() past + code not containing ACCESS_ONCE(), barrier(), or similar primitives. + + This effect could also be achieved using barrier(), but ACCESS_ONCE() + is more selective: With ACCESS_ONCE(), the compiler need only forget + the contents of the indicated memory locations, while with barrier() + the compiler must discard the value of all memory locations that + it has currented cached in any machine registers. Of course, + the compiler must also respect the order in which the ACCESS_ONCE()s + occur, though the CPU of course need not do so. + + (*) The compiler is within its rights to invent stores to a variable, + as in the following example: + + if (a) + b = a; + else + b = 42; + + The compiler might save a branch by optimizing this as follows: + + b = 42; + if (a) + b = a; + + In single-threaded code, this is not only safe, but also saves + a branch. Unfortunately, in concurrent code, this optimization + could cause some other CPU to see a spurious value of 42 -- even + if variable 'a' was never zero -- when loading variable 'b'. + Use ACCESS_ONCE() to prevent this as follows: + + if (a) + ACCESS_ONCE(b) = a; + else + ACCESS_ONCE(b) = 42; + + The compiler can also invent loads. These are usually less + damaging, but they can result in cache-line bouncing and thus in + poor performance and scalability. Use ACCESS_ONCE() to prevent + invented loads. + + (*) For aligned memory locations whose size allows them to be accessed + with a single memory-reference instruction, prevents "load tearing" + and "store tearing," in which a single large access is replaced by + multiple smaller accesses. For example, given an architecture having + 16-bit store instructions with 7-bit immediate fields, the compiler + might be tempted to use two 16-bit store-immediate instructions to + implement the following 32-bit store: + + p = 0x00010002; + + Please note that GCC really does use this sort of optimization, + which is not surprising given that it would likely take more + than two instructions to build the constant and then store it. + This optimization can therefore be a win in single-threaded code. + In fact, a recent bug (since fixed) caused GCC to incorrectly use + this optimization in a volatile store. In the absence of such bugs, + use of ACCESS_ONCE() prevents store tearing in the following example: + + ACCESS_ONCE(p) = 0x00010002; + + Use of packed structures can also result in load and store tearing, + as in this example: + + struct __attribute__((__packed__)) foo { + short a; + int b; + short c; + }; + struct foo foo1, foo2; + ... + + foo2.a = foo1.a; + foo2.b = foo1.b; + foo2.c = foo1.c; + + Because there are no ACCESS_ONCE() wrappers and no volatile markings, + the compiler would be well within its rights to implement these three + assignment statements as a pair of 32-bit loads followed by a pair + of 32-bit stores. This would result in load tearing on 'foo1.b' + and store tearing on 'foo2.b'. ACCESS_ONCE() again prevents tearing + in this example: + + foo2.a = foo1.a; + ACCESS_ONCE(foo2.b) = ACCESS_ONCE(foo1.b); + foo2.c = foo1.c; + +All that aside, it is never necessary to use ACCESS_ONCE() on a variable +that has been marked volatile. For example, because 'jiffies' is marked +volatile, it is never necessary to say ACCESS_ONCE(jiffies). The reason +for this is that ACCESS_ONCE() is implemented as a volatile cast, which +has no effect when its argument is already marked volatile. + +Please note that these compiler barriers have no direct effect on the CPU, +which may then reorder things however it wishes. CPU MEMORY BARRIERS From 01352fb81658cbf78c55844de8e3d1d606bbf3f8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Dec 2013 13:59:08 -0800 Subject: [PATCH 20/34] locking: Add an smp_mb__after_unlock_lock() for UNLOCK+BLOCK barrier The Linux kernel has traditionally required that an UNLOCK+LOCK pair act as a full memory barrier when either (1) that UNLOCK+LOCK pair was executed by the same CPU or task, or (2) the same lock variable was used for the UNLOCK and LOCK. It now seems likely that very few places in the kernel rely on this full-memory-barrier semantic, and with the advent of queued locks, providing this semantic either requires complex reasoning, or for some architectures, added overhead. This commit therefore adds a smp_mb__after_unlock_lock(), which may be placed after a LOCK primitive to restore the full-memory-barrier semantic. All definitions are currently no-ops, but will be upgraded for some architectures when queued locks arrive. Signed-off-by: Paul E. McKenney Reviewed-by: Peter Zijlstra Cc: Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1386799151-2219-5-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/spinlock.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 75f34949d9ab..3f2867ff0ced 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -130,6 +130,16 @@ do { \ #define smp_mb__before_spinlock() smp_wmb() #endif +/* + * Place this after a lock-acquisition primitive to guarantee that + * an UNLOCK+LOCK pair act as a full barrier. This guarantee applies + * if the UNLOCK and LOCK are executed by the same CPU or if the + * UNLOCK and LOCK operate on the same lock variable. + */ +#ifndef smp_mb__after_unlock_lock +#define smp_mb__after_unlock_lock() do { } while (0) +#endif + /** * raw_spin_unlock_wait - wait until the spinlock gets unlocked * @lock: the spinlock in question. From 17eb88e068430014deb709e5af34197cdf2390c9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Dec 2013 13:59:09 -0800 Subject: [PATCH 21/34] Documentation/memory-barriers.txt: Downgrade UNLOCK+BLOCK Historically, an UNLOCK+LOCK pair executed by one CPU, by one task, or on a given lock variable has implied a full memory barrier. In a recent LKML thread, the wisdom of this historical approach was called into question: http://www.spinics.net/lists/linux-mm/msg65653.html, in part due to the memory-order complexities of low-handoff-overhead queued locks on x86 systems. This patch therefore removes this guarantee from the documentation, and further documents how to restore it via a new smp_mb__after_unlock_lock() primitive. Signed-off-by: Paul E. McKenney Reviewed-by: Peter Zijlstra Cc: Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1386799151-2219-6-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 84 +++++++++++++++++++++++++------ 1 file changed, 69 insertions(+), 15 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 919fd604969d..cb753c8158f2 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -402,12 +402,18 @@ And a couple of implicit varieties: Memory operations that occur after an UNLOCK operation may appear to happen before it completes. - LOCK and UNLOCK operations are guaranteed to appear with respect to each - other strictly in the order specified. - The use of LOCK and UNLOCK operations generally precludes the need for other sorts of memory barrier (but note the exceptions mentioned in the - subsection "MMIO write barrier"). + subsection "MMIO write barrier"). In addition, an UNLOCK+LOCK pair + is -not- guaranteed to act as a full memory barrier. However, + after a LOCK on a given lock variable, all memory accesses preceding any + prior UNLOCK on that same variable are guaranteed to be visible. + In other words, within a given lock variable's critical section, + all accesses of all previous critical sections for that lock variable + are guaranteed to have completed. + + This means that LOCK acts as a minimal "acquire" operation and + UNLOCK acts as a minimal "release" operation. Memory barriers are only required where there's a possibility of interaction @@ -1633,8 +1639,12 @@ for each construct. These operations all imply certain barriers: Memory operations issued after the LOCK will be completed after the LOCK operation has completed. - Memory operations issued before the LOCK may be completed after the LOCK - operation has completed. + Memory operations issued before the LOCK may be completed after the + LOCK operation has completed. An smp_mb__before_spinlock(), combined + with a following LOCK, orders prior loads against subsequent stores + and stores and prior stores against subsequent stores. Note that + this is weaker than smp_mb()! The smp_mb__before_spinlock() + primitive is free on many architectures. (2) UNLOCK operation implication: @@ -1654,9 +1664,6 @@ for each construct. These operations all imply certain barriers: All LOCK operations issued before an UNLOCK operation will be completed before the UNLOCK operation. - All UNLOCK operations issued before a LOCK operation will be completed - before the LOCK operation. - (5) Failed conditional LOCK implication: Certain variants of the LOCK operation may fail, either due to being @@ -1664,9 +1671,6 @@ for each construct. These operations all imply certain barriers: signal whilst asleep waiting for the lock to become available. Failed locks do not imply any sort of barrier. -Therefore, from (1), (2) and (4) an UNLOCK followed by an unconditional LOCK is -equivalent to a full barrier, but a LOCK followed by an UNLOCK is not. - [!] Note: one of the consequences of LOCKs and UNLOCKs being only one-way barriers is that the effects of instructions outside of a critical section may seep into the inside of the critical section. @@ -1677,13 +1681,57 @@ LOCK, and an access following the UNLOCK to happen before the UNLOCK, and the two accesses can themselves then cross: *A = a; - LOCK - UNLOCK + LOCK M + UNLOCK M *B = b; may occur as: - LOCK, STORE *B, STORE *A, UNLOCK + LOCK M, STORE *B, STORE *A, UNLOCK M + +This same reordering can of course occur if the LOCK and UNLOCK are +to the same lock variable, but only from the perspective of another +CPU not holding that lock. + +In short, an UNLOCK followed by a LOCK may -not- be assumed to be a full +memory barrier because it is possible for a preceding UNLOCK to pass a +later LOCK from the viewpoint of the CPU, but not from the viewpoint +of the compiler. Note that deadlocks cannot be introduced by this +interchange because if such a deadlock threatened, the UNLOCK would +simply complete. + +If it is necessary for an UNLOCK-LOCK pair to produce a full barrier, +the LOCK can be followed by an smp_mb__after_unlock_lock() invocation. +This will produce a full barrier if either (a) the UNLOCK and the LOCK +are executed by the same CPU or task, or (b) the UNLOCK and LOCK act +on the same lock variable. The smp_mb__after_unlock_lock() primitive +is free on many architectures. Without smp_mb__after_unlock_lock(), +the critical sections corresponding to the UNLOCK and the LOCK can cross: + + *A = a; + UNLOCK M + LOCK N + *B = b; + +could occur as: + + LOCK N, STORE *B, STORE *A, UNLOCK M + +With smp_mb__after_unlock_lock(), they cannot, so that: + + *A = a; + UNLOCK M + LOCK N + smp_mb__after_unlock_lock(); + *B = b; + +will always occur as either of the following: + + STORE *A, UNLOCK, LOCK, STORE *B + STORE *A, LOCK, UNLOCK, STORE *B + +If the UNLOCK and LOCK were instead both operating on the same lock +variable, only the first of these two alternatives can occur. Locks and semaphores may not provide any guarantee of ordering on UP compiled systems, and so cannot be counted on in such a situation to actually achieve @@ -1911,6 +1959,7 @@ However, if the following occurs: UNLOCK M [1] ACCESS_ONCE(*D) = d; ACCESS_ONCE(*E) = e; LOCK M [2] + smp_mb__after_unlock_lock(); ACCESS_ONCE(*F) = f; ACCESS_ONCE(*G) = g; UNLOCK M [2] @@ -1928,6 +1977,11 @@ But assuming CPU 1 gets the lock first, CPU 3 won't see any of: *F, *G or *H preceding LOCK M [2] *A, *B, *C, *E, *F or *G following UNLOCK M [2] +Note that the smp_mb__after_unlock_lock() is critically important +here: Without it CPU 3 might see some of the above orderings. +Without smp_mb__after_unlock_lock(), the accesses are not guaranteed +to be seen in order unless CPU 3 holds lock M. + LOCKS VS I/O ACCESSES --------------------- From 6303b9c87d52eaedc82968d3ff59c471e7682afc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Dec 2013 13:59:10 -0800 Subject: [PATCH 22/34] rcu: Apply smp_mb__after_unlock_lock() to preserve grace periods RCU must ensure that there is the equivalent of a full memory barrier between any memory access preceding grace period and any memory access following that same grace period, regardless of which CPU(s) happen to execute the two memory accesses. Therefore, downgrading UNLOCK+LOCK to no longer imply a full memory barrier requires some adjustments to RCU. This commit therefore adds smp_mb__after_unlock_lock() invocations as needed after the RCU lock acquisitions that need to be part of a full-memory-barrier UNLOCK+LOCK. Signed-off-by: Paul E. McKenney Reviewed-by: Peter Zijlstra Cc: Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1386799151-2219-7-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/rcu/tree.c | 18 +++++++++++++++++- kernel/rcu/tree_plugin.h | 13 +++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index dd081987a8ec..a6205a05b5e4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1133,8 +1133,10 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) * hold it, acquire the root rcu_node structure's lock in order to * start one (if needed). */ - if (rnp != rnp_root) + if (rnp != rnp_root) { raw_spin_lock(&rnp_root->lock); + smp_mb__after_unlock_lock(); + } /* * Get a new grace-period number. If there really is no grace @@ -1354,6 +1356,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_restore(flags); return; } + smp_mb__after_unlock_lock(); __note_gp_changes(rsp, rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -1368,6 +1371,7 @@ static int rcu_gp_init(struct rcu_state *rsp) rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); if (rsp->gp_flags == 0) { /* Spurious wakeup, tell caller to go back to sleep. */ raw_spin_unlock_irq(&rnp->lock); @@ -1409,6 +1413,7 @@ static int rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; @@ -1463,6 +1468,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) /* Clear flag to prevent immediate re-entry. */ if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); rsp->gp_flags &= ~RCU_GP_FLAG_FQS; raw_spin_unlock_irq(&rnp->lock); } @@ -1480,6 +1486,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); gp_duration = jiffies - rsp->gp_start; if (gp_duration > rsp->gp_max) rsp->gp_max = gp_duration; @@ -1505,6 +1512,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); ACCESS_ONCE(rnp->completed) = rsp->gpnum; rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) @@ -1515,6 +1523,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); rcu_nocb_gp_set(rnp, nocb); rsp->completed = rsp->gpnum; /* Declare grace period done. */ @@ -1749,6 +1758,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, rnp_c = rnp; rnp = rnp->parent; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); WARN_ON_ONCE(rnp_c->qsmask); } @@ -1778,6 +1788,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum) { @@ -1992,6 +2003,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) mask = rdp->grpmask; /* rnp->grplo is constant. */ do { raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); rnp->qsmaskinit &= ~mask; if (rnp->qsmaskinit != 0) { if (rnp != rdp->mynode) @@ -2202,6 +2214,7 @@ static void force_qs_rnp(struct rcu_state *rsp, cond_resched(); mask = 0; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); if (!rcu_gp_in_progress(rsp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -2231,6 +2244,7 @@ static void force_qs_rnp(struct rcu_state *rsp, rnp = rcu_get_root(rsp); if (rnp->qsmask == 0) { raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ } } @@ -2263,6 +2277,7 @@ static void force_quiescent_state(struct rcu_state *rsp) /* Reached the root of the rcu_node tree, acquire lock. */ raw_spin_lock_irqsave(&rnp_old->lock, flags); + smp_mb__after_unlock_lock(); raw_spin_unlock(&rnp_old->fqslock); if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { rsp->n_force_qs_lh++; @@ -2378,6 +2393,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, struct rcu_node *rnp_root = rcu_get_root(rsp); raw_spin_lock(&rnp_root->lock); + smp_mb__after_unlock_lock(); rcu_start_gp(rsp); raw_spin_unlock(&rnp_root->lock); } else { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6abb03dff5c0..eb161d80cbde 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -204,6 +204,7 @@ static void rcu_preempt_note_context_switch(int cpu) rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; t->rcu_blocked_node = rnp; @@ -312,6 +313,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); } @@ -381,6 +383,7 @@ void rcu_read_unlock_special(struct task_struct *t) for (;;) { rnp = t->rcu_blocked_node; raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); if (rnp == t->rcu_blocked_node) break; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ @@ -605,6 +608,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, while (!list_empty(lp)) { t = list_entry(lp->next, typeof(*t), rcu_node_entry); raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ + smp_mb__after_unlock_lock(); list_del(&t->rcu_node_entry); t->rcu_blocked_node = rnp_root; list_add(&t->rcu_node_entry, lp_root); @@ -629,6 +633,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, * in this case. */ raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ + smp_mb__after_unlock_lock(); if (rnp_root->boost_tasks != NULL && rnp_root->boost_tasks != rnp_root->gp_tasks && rnp_root->boost_tasks != rnp_root->exp_tasks) @@ -772,6 +777,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long mask; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); for (;;) { if (!sync_rcu_preempt_exp_done(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -787,6 +793,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ rnp = rnp->parent; raw_spin_lock(&rnp->lock); /* irqs already disabled */ + smp_mb__after_unlock_lock(); rnp->expmask &= ~mask; } } @@ -806,6 +813,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) int must_wait = 0; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); if (list_empty(&rnp->blkd_tasks)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { @@ -886,6 +894,7 @@ void synchronize_rcu_expedited(void) /* Initialize ->expmask for all non-leaf rcu_node structures. */ rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); rnp->expmask = rnp->qsmaskinit; raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -1191,6 +1200,7 @@ static int rcu_boost(struct rcu_node *rnp) return 0; /* Nothing left to boost. */ raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); /* * Recheck under the lock: all tasks in need of boosting @@ -1377,6 +1387,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, if (IS_ERR(t)) return PTR_ERR(t); raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); sp.sched_priority = RCU_BOOST_PRIO; @@ -1769,6 +1780,7 @@ static void rcu_prepare_for_idle(int cpu) continue; rnp = rdp->mynode; raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); rcu_accelerate_cbs(rsp, rnp, rdp); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } @@ -2209,6 +2221,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) struct rcu_node *rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); c = rcu_start_future_gp(rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); From 919fc6e34831d1c2b58bfb5ae261dc3facc9b269 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Dec 2013 13:59:11 -0800 Subject: [PATCH 23/34] powerpc: Full barrier for smp_mb__after_unlock_lock() The powerpc lock acquisition sequence is as follows: lwarx; cmpwi; bne; stwcx.; lwsync; Lock release is as follows: lwsync; stw; If CPU 0 does a store (say, x=1) then a lock release, and CPU 1 does a lock acquisition then a load (say, r1=y), then there is no guarantee of a full memory barrier between the store to 'x' and the load from 'y'. To see this, suppose that CPUs 0 and 1 are hardware threads in the same core that share a store buffer, and that CPU 2 is in some other core, and that CPU 2 does the following: y = 1; sync; r2 = x; If 'x' and 'y' are both initially zero, then the lock acquisition and release sequences above can result in r1 and r2 both being equal to zero, which could not happen if unlock+lock was a full barrier. This commit therefore makes powerpc's smp_mb__after_unlock_lock() be a full barrier. Signed-off-by: Paul E. McKenney Acked-by: Benjamin Herrenschmidt Reviewed-by: Peter Zijlstra Cc: Paul Mackerras Cc: linuxppc-dev@lists.ozlabs.org Cc: Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1386799151-2219-8-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/spinlock.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index 5f54a744dcc5..f6e78d63fb6a 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -28,6 +28,8 @@ #include #include +#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ + #define arch_spin_is_locked(x) ((x)->slock != 0) #ifdef CONFIG_PPC64 From 91f30a17024ff0d8345e11228af33ee938b13426 Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Wed, 4 Dec 2013 13:58:13 +0800 Subject: [PATCH 24/34] mutexes: Give more informative mutex warning in the !lock->owner case When mutex debugging is enabled and an imbalanced mutex_unlock() is called, we get the following, slightly confusing warning: [ 364.208284] DEBUG_LOCKS_WARN_ON(lock->owner != current) But in that case the warning is due to an imbalanced mutex_unlock() call, and the lock->owner is NULL - so the message is misleading. So improve the message by testing for this case specifically: DEBUG_LOCKS_WARN_ON(!lock->owner) Signed-off-by: Liu, Chuansheng Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Cc: Paul E. McKenney Link: http://lkml.kernel.org/r/1386136693.3650.48.camel@cliu38-desktop-build [ Improved the changelog, changed the patch to use !lock->owner consistently. ] Signed-off-by: Ingo Molnar --- kernel/locking/mutex-debug.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 7e3443fe1f48..faf6f5b53e77 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -75,7 +75,12 @@ void debug_mutex_unlock(struct mutex *lock) return; DEBUG_LOCKS_WARN_ON(lock->magic != lock); - DEBUG_LOCKS_WARN_ON(lock->owner != current); + + if (!lock->owner) + DEBUG_LOCKS_WARN_ON(!lock->owner); + else + DEBUG_LOCKS_WARN_ON(lock->owner != current); + DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); mutex_clear_owner(lock); } From 2e4f5382d12a441b5cccfdde00308df15c2ce300 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Nov 2013 14:57:36 +0100 Subject: [PATCH 25/34] locking/doc: Rename LOCK/UNLOCK to ACQUIRE/RELEASE The LOCK and UNLOCK barriers as described in our barrier document are generally known as ACQUIRE and RELEASE barriers in other literature. Since we plan to introduce the acquire and release nomenclature in generic kernel primitives we should amend the document to avoid confusion as to what an acquire/release means. Reviewed-by: "Paul E. McKenney" Signed-off-by: Peter Zijlstra Acked-by: Mathieu Desnoyers Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker Cc: Michael Ellerman Cc: Michael Neuling Cc: Russell King Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Victor Kaplansky Cc: Tony Luck Cc: Oleg Nesterov Link: http://lkml.kernel.org/r/20131217092435.GC21999@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 237 +++++++++++++++--------------- 1 file changed, 121 insertions(+), 116 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index cb753c8158f2..102dc19c4119 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -381,39 +381,44 @@ Memory barriers come in four basic varieties: And a couple of implicit varieties: - (5) LOCK operations. + (5) ACQUIRE operations. This acts as a one-way permeable barrier. It guarantees that all memory - operations after the LOCK operation will appear to happen after the LOCK - operation with respect to the other components of the system. + operations after the ACQUIRE operation will appear to happen after the + ACQUIRE operation with respect to the other components of the system. + ACQUIRE operations include LOCK operations and smp_load_acquire() + operations. - Memory operations that occur before a LOCK operation may appear to happen - after it completes. + Memory operations that occur before an ACQUIRE operation may appear to + happen after it completes. - A LOCK operation should almost always be paired with an UNLOCK operation. + An ACQUIRE operation should almost always be paired with a RELEASE + operation. - (6) UNLOCK operations. + (6) RELEASE operations. This also acts as a one-way permeable barrier. It guarantees that all - memory operations before the UNLOCK operation will appear to happen before - the UNLOCK operation with respect to the other components of the system. + memory operations before the RELEASE operation will appear to happen + before the RELEASE operation with respect to the other components of the + system. RELEASE operations include UNLOCK operations and + smp_store_release() operations. - Memory operations that occur after an UNLOCK operation may appear to + Memory operations that occur after a RELEASE operation may appear to happen before it completes. - The use of LOCK and UNLOCK operations generally precludes the need for - other sorts of memory barrier (but note the exceptions mentioned in the - subsection "MMIO write barrier"). In addition, an UNLOCK+LOCK pair - is -not- guaranteed to act as a full memory barrier. However, - after a LOCK on a given lock variable, all memory accesses preceding any - prior UNLOCK on that same variable are guaranteed to be visible. - In other words, within a given lock variable's critical section, - all accesses of all previous critical sections for that lock variable - are guaranteed to have completed. + The use of ACQUIRE and RELEASE operations generally precludes the need + for other sorts of memory barrier (but note the exceptions mentioned in + the subsection "MMIO write barrier"). In addition, a RELEASE+ACQUIRE + pair is -not- guaranteed to act as a full memory barrier. However, after + an ACQUIRE on a given variable, all memory accesses preceding any prior + RELEASE on that same variable are guaranteed to be visible. In other + words, within a given variable's critical section, all accesses of all + previous critical sections for that variable are guaranteed to have + completed. - This means that LOCK acts as a minimal "acquire" operation and - UNLOCK acts as a minimal "release" operation. + This means that ACQUIRE acts as a minimal "acquire" operation and + RELEASE acts as a minimal "release" operation. Memory barriers are only required where there's a possibility of interaction @@ -1585,7 +1590,7 @@ There are some more advanced barrier functions: clear_bit( ... ); This prevents memory operations before the clear leaking to after it. See - the subsection on "Locking Functions" with reference to UNLOCK operation + the subsection on "Locking Functions" with reference to RELEASE operation implications. See Documentation/atomic_ops.txt for more information. See the "Atomic @@ -1619,8 +1624,8 @@ provide more substantial guarantees, but these may not be relied upon outside of arch specific code. -LOCKING FUNCTIONS ------------------ +ACQUIRING FUNCTIONS +------------------- The Linux kernel has a number of locking constructs: @@ -1631,106 +1636,106 @@ The Linux kernel has a number of locking constructs: (*) R/W semaphores (*) RCU -In all cases there are variants on "LOCK" operations and "UNLOCK" operations +In all cases there are variants on "ACQUIRE" operations and "RELEASE" operations for each construct. These operations all imply certain barriers: - (1) LOCK operation implication: + (1) ACQUIRE operation implication: - Memory operations issued after the LOCK will be completed after the LOCK - operation has completed. + Memory operations issued after the ACQUIRE will be completed after the + ACQUIRE operation has completed. - Memory operations issued before the LOCK may be completed after the - LOCK operation has completed. An smp_mb__before_spinlock(), combined - with a following LOCK, orders prior loads against subsequent stores - and stores and prior stores against subsequent stores. Note that - this is weaker than smp_mb()! The smp_mb__before_spinlock() - primitive is free on many architectures. + Memory operations issued before the ACQUIRE may be completed after the + ACQUIRE operation has completed. An smp_mb__before_spinlock(), combined + with a following ACQUIRE, orders prior loads against subsequent stores and + stores and prior stores against subsequent stores. Note that this is + weaker than smp_mb()! The smp_mb__before_spinlock() primitive is free on + many architectures. - (2) UNLOCK operation implication: + (2) RELEASE operation implication: - Memory operations issued before the UNLOCK will be completed before the - UNLOCK operation has completed. + Memory operations issued before the RELEASE will be completed before the + RELEASE operation has completed. - Memory operations issued after the UNLOCK may be completed before the - UNLOCK operation has completed. + Memory operations issued after the RELEASE may be completed before the + RELEASE operation has completed. - (3) LOCK vs LOCK implication: + (3) ACQUIRE vs ACQUIRE implication: - All LOCK operations issued before another LOCK operation will be completed - before that LOCK operation. + All ACQUIRE operations issued before another ACQUIRE operation will be + completed before that ACQUIRE operation. - (4) LOCK vs UNLOCK implication: + (4) ACQUIRE vs RELEASE implication: - All LOCK operations issued before an UNLOCK operation will be completed - before the UNLOCK operation. + All ACQUIRE operations issued before a RELEASE operation will be + completed before the RELEASE operation. - (5) Failed conditional LOCK implication: + (5) Failed conditional ACQUIRE implication: - Certain variants of the LOCK operation may fail, either due to being - unable to get the lock immediately, or due to receiving an unblocked + Certain locking variants of the ACQUIRE operation may fail, either due to + being unable to get the lock immediately, or due to receiving an unblocked signal whilst asleep waiting for the lock to become available. Failed locks do not imply any sort of barrier. -[!] Note: one of the consequences of LOCKs and UNLOCKs being only one-way - barriers is that the effects of instructions outside of a critical section - may seep into the inside of the critical section. +[!] Note: one of the consequences of lock ACQUIREs and RELEASEs being only +one-way barriers is that the effects of instructions outside of a critical +section may seep into the inside of the critical section. -A LOCK followed by an UNLOCK may not be assumed to be full memory barrier -because it is possible for an access preceding the LOCK to happen after the -LOCK, and an access following the UNLOCK to happen before the UNLOCK, and the -two accesses can themselves then cross: +An ACQUIRE followed by a RELEASE may not be assumed to be full memory barrier +because it is possible for an access preceding the ACQUIRE to happen after the +ACQUIRE, and an access following the RELEASE to happen before the RELEASE, and +the two accesses can themselves then cross: *A = a; - LOCK M - UNLOCK M + ACQUIRE M + RELEASE M *B = b; may occur as: - LOCK M, STORE *B, STORE *A, UNLOCK M + ACQUIRE M, STORE *B, STORE *A, RELEASE M -This same reordering can of course occur if the LOCK and UNLOCK are -to the same lock variable, but only from the perspective of another -CPU not holding that lock. +This same reordering can of course occur if the lock's ACQUIRE and RELEASE are +to the same lock variable, but only from the perspective of another CPU not +holding that lock. -In short, an UNLOCK followed by a LOCK may -not- be assumed to be a full -memory barrier because it is possible for a preceding UNLOCK to pass a -later LOCK from the viewpoint of the CPU, but not from the viewpoint +In short, a RELEASE followed by an ACQUIRE may -not- be assumed to be a full +memory barrier because it is possible for a preceding RELEASE to pass a +later ACQUIRE from the viewpoint of the CPU, but not from the viewpoint of the compiler. Note that deadlocks cannot be introduced by this -interchange because if such a deadlock threatened, the UNLOCK would +interchange because if such a deadlock threatened, the RELEASE would simply complete. -If it is necessary for an UNLOCK-LOCK pair to produce a full barrier, -the LOCK can be followed by an smp_mb__after_unlock_lock() invocation. -This will produce a full barrier if either (a) the UNLOCK and the LOCK -are executed by the same CPU or task, or (b) the UNLOCK and LOCK act -on the same lock variable. The smp_mb__after_unlock_lock() primitive -is free on many architectures. Without smp_mb__after_unlock_lock(), -the critical sections corresponding to the UNLOCK and the LOCK can cross: +If it is necessary for a RELEASE-ACQUIRE pair to produce a full barrier, the +ACQUIRE can be followed by an smp_mb__after_unlock_lock() invocation. This +will produce a full barrier if either (a) the RELEASE and the ACQUIRE are +executed by the same CPU or task, or (b) the RELEASE and ACQUIRE act on the +same variable. The smp_mb__after_unlock_lock() primitive is free on many +architectures. Without smp_mb__after_unlock_lock(), the critical sections +corresponding to the RELEASE and the ACQUIRE can cross: *A = a; - UNLOCK M - LOCK N + RELEASE M + ACQUIRE N *B = b; could occur as: - LOCK N, STORE *B, STORE *A, UNLOCK M + ACQUIRE N, STORE *B, STORE *A, RELEASE M With smp_mb__after_unlock_lock(), they cannot, so that: *A = a; - UNLOCK M - LOCK N + RELEASE M + ACQUIRE N smp_mb__after_unlock_lock(); *B = b; will always occur as either of the following: - STORE *A, UNLOCK, LOCK, STORE *B - STORE *A, LOCK, UNLOCK, STORE *B + STORE *A, RELEASE, ACQUIRE, STORE *B + STORE *A, ACQUIRE, RELEASE, STORE *B -If the UNLOCK and LOCK were instead both operating on the same lock +If the RELEASE and ACQUIRE were instead both operating on the same lock variable, only the first of these two alternatives can occur. Locks and semaphores may not provide any guarantee of ordering on UP compiled @@ -1745,33 +1750,33 @@ As an example, consider the following: *A = a; *B = b; - LOCK + ACQUIRE *C = c; *D = d; - UNLOCK + RELEASE *E = e; *F = f; The following sequence of events is acceptable: - LOCK, {*F,*A}, *E, {*C,*D}, *B, UNLOCK + ACQUIRE, {*F,*A}, *E, {*C,*D}, *B, RELEASE [+] Note that {*F,*A} indicates a combined access. But none of the following are: - {*F,*A}, *B, LOCK, *C, *D, UNLOCK, *E - *A, *B, *C, LOCK, *D, UNLOCK, *E, *F - *A, *B, LOCK, *C, UNLOCK, *D, *E, *F - *B, LOCK, *C, *D, UNLOCK, {*F,*A}, *E + {*F,*A}, *B, ACQUIRE, *C, *D, RELEASE, *E + *A, *B, *C, ACQUIRE, *D, RELEASE, *E, *F + *A, *B, ACQUIRE, *C, RELEASE, *D, *E, *F + *B, ACQUIRE, *C, *D, RELEASE, {*F,*A}, *E INTERRUPT DISABLING FUNCTIONS ----------------------------- -Functions that disable interrupts (LOCK equivalent) and enable interrupts -(UNLOCK equivalent) will act as compiler barriers only. So if memory or I/O +Functions that disable interrupts (ACQUIRE equivalent) and enable interrupts +(RELEASE equivalent) will act as compiler barriers only. So if memory or I/O barriers are required in such a situation, they must be provided from some other means. @@ -1910,17 +1915,17 @@ Other functions that imply barriers: (*) schedule() and similar imply full memory barriers. -================================= -INTER-CPU LOCKING BARRIER EFFECTS -================================= +=================================== +INTER-CPU ACQUIRING BARRIER EFFECTS +=================================== On SMP systems locking primitives give a more substantial form of barrier: one that does affect memory access ordering on other CPUs, within the context of conflict on any particular lock. -LOCKS VS MEMORY ACCESSES ------------------------- +ACQUIRES VS MEMORY ACCESSES +--------------------------- Consider the following: the system has a pair of spinlocks (M) and (Q), and three CPUs; then should the following sequence of events occur: @@ -1928,24 +1933,24 @@ three CPUs; then should the following sequence of events occur: CPU 1 CPU 2 =============================== =============================== ACCESS_ONCE(*A) = a; ACCESS_ONCE(*E) = e; - LOCK M LOCK Q + ACQUIRE M ACQUIRE Q ACCESS_ONCE(*B) = b; ACCESS_ONCE(*F) = f; ACCESS_ONCE(*C) = c; ACCESS_ONCE(*G) = g; - UNLOCK M UNLOCK Q + RELEASE M RELEASE Q ACCESS_ONCE(*D) = d; ACCESS_ONCE(*H) = h; Then there is no guarantee as to what order CPU 3 will see the accesses to *A through *H occur in, other than the constraints imposed by the separate locks on the separate CPUs. It might, for example, see: - *E, LOCK M, LOCK Q, *G, *C, *F, *A, *B, UNLOCK Q, *D, *H, UNLOCK M + *E, ACQUIRE M, ACQUIRE Q, *G, *C, *F, *A, *B, RELEASE Q, *D, *H, RELEASE M But it won't see any of: - *B, *C or *D preceding LOCK M - *A, *B or *C following UNLOCK M - *F, *G or *H preceding LOCK Q - *E, *F or *G following UNLOCK Q + *B, *C or *D preceding ACQUIRE M + *A, *B or *C following RELEASE M + *F, *G or *H preceding ACQUIRE Q + *E, *F or *G following RELEASE Q However, if the following occurs: @@ -1953,29 +1958,29 @@ However, if the following occurs: CPU 1 CPU 2 =============================== =============================== ACCESS_ONCE(*A) = a; - LOCK M [1] + ACQUIRE M [1] ACCESS_ONCE(*B) = b; ACCESS_ONCE(*C) = c; - UNLOCK M [1] + RELEASE M [1] ACCESS_ONCE(*D) = d; ACCESS_ONCE(*E) = e; - LOCK M [2] + ACQUIRE M [2] smp_mb__after_unlock_lock(); ACCESS_ONCE(*F) = f; ACCESS_ONCE(*G) = g; - UNLOCK M [2] + RELEASE M [2] ACCESS_ONCE(*H) = h; CPU 3 might see: - *E, LOCK M [1], *C, *B, *A, UNLOCK M [1], - LOCK M [2], *H, *F, *G, UNLOCK M [2], *D + *E, ACQUIRE M [1], *C, *B, *A, RELEASE M [1], + ACQUIRE M [2], *H, *F, *G, RELEASE M [2], *D But assuming CPU 1 gets the lock first, CPU 3 won't see any of: - *B, *C, *D, *F, *G or *H preceding LOCK M [1] - *A, *B or *C following UNLOCK M [1] - *F, *G or *H preceding LOCK M [2] - *A, *B, *C, *E, *F or *G following UNLOCK M [2] + *B, *C, *D, *F, *G or *H preceding ACQUIRE M [1] + *A, *B or *C following RELEASE M [1] + *F, *G or *H preceding ACQUIRE M [2] + *A, *B, *C, *E, *F or *G following RELEASE M [2] Note that the smp_mb__after_unlock_lock() is critically important here: Without it CPU 3 might see some of the above orderings. @@ -1983,8 +1988,8 @@ Without smp_mb__after_unlock_lock(), the accesses are not guaranteed to be seen in order unless CPU 3 holds lock M. -LOCKS VS I/O ACCESSES ---------------------- +ACQUIRES VS I/O ACCESSES +------------------------ Under certain circumstances (especially involving NUMA), I/O accesses within two spinlocked sections on two different CPUs may be seen as interleaved by the @@ -2202,13 +2207,13 @@ explicit lock operations, described later). These include: /* when succeeds (returns 1) */ atomic_add_unless(); atomic_long_add_unless(); -These are used for such things as implementing LOCK-class and UNLOCK-class +These are used for such things as implementing ACQUIRE-class and RELEASE-class operations and adjusting reference counters towards object destruction, and as such the implicit memory barrier effects are necessary. The following operations are potential problems as they do _not_ imply memory -barriers, but might be used for implementing such things as UNLOCK-class +barriers, but might be used for implementing such things as RELEASE-class operations: atomic_set(); @@ -2250,7 +2255,7 @@ The following operations are special locking primitives: clear_bit_unlock(); __clear_bit_unlock(); -These implement LOCK-class and UNLOCK-class operations. These should be used in +These implement ACQUIRE-class and RELEASE-class operations. These should be used in preference to other operations when implementing locking primitives, because their implementations can be optimised on many architectures. From 1de7da377bd880ff23917f78924d0e908329d978 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Nov 2013 12:01:59 +0100 Subject: [PATCH 26/34] arch: Move smp_mb__{before,after}_atomic_{inc,dec}.h into asm/atomic.h Move the barriers functions that depend on the atomic implementation into the atomic implementation. Reviewed-by: Paul E. McKenney Signed-off-by: Peter Zijlstra Acked-by: Vineet Gupta [for arch/arc bits] Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Cc: Paul E. McKenney Link: http://lkml.kernel.org/r/20131213150640.786183683@infradead.org Signed-off-by: Ingo Molnar --- arch/arc/include/asm/atomic.h | 5 +++++ arch/arc/include/asm/barrier.h | 5 ----- arch/hexagon/include/asm/atomic.h | 6 +++++- arch/hexagon/include/asm/barrier.h | 4 ---- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h index 83f03ca6caf6..03e494f695d1 100644 --- a/arch/arc/include/asm/atomic.h +++ b/arch/arc/include/asm/atomic.h @@ -190,6 +190,11 @@ static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr) #endif /* !CONFIG_ARC_HAS_LLSC */ +#define smp_mb__before_atomic_dec() barrier() +#define smp_mb__after_atomic_dec() barrier() +#define smp_mb__before_atomic_inc() barrier() +#define smp_mb__after_atomic_inc() barrier() + /** * __atomic_add_unless - add unless the number is a given value * @v: pointer of type atomic_t diff --git a/arch/arc/include/asm/barrier.h b/arch/arc/include/asm/barrier.h index f6cb7c4ffb35..c32245c3d1e9 100644 --- a/arch/arc/include/asm/barrier.h +++ b/arch/arc/include/asm/barrier.h @@ -30,11 +30,6 @@ #define smp_wmb() barrier() #endif -#define smp_mb__before_atomic_dec() barrier() -#define smp_mb__after_atomic_dec() barrier() -#define smp_mb__before_atomic_inc() barrier() -#define smp_mb__after_atomic_inc() barrier() - #define smp_read_barrier_depends() do { } while (0) #endif diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h index 8a64ff2337f6..7aae4cb2a29a 100644 --- a/arch/hexagon/include/asm/atomic.h +++ b/arch/hexagon/include/asm/atomic.h @@ -160,8 +160,12 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u) #define atomic_sub_and_test(i, v) (atomic_sub_return(i, (v)) == 0) #define atomic_add_negative(i, v) (atomic_add_return(i, (v)) < 0) - #define atomic_inc_return(v) (atomic_add_return(1, v)) #define atomic_dec_return(v) (atomic_sub_return(1, v)) +#define smp_mb__before_atomic_dec() barrier() +#define smp_mb__after_atomic_dec() barrier() +#define smp_mb__before_atomic_inc() barrier() +#define smp_mb__after_atomic_inc() barrier() + #endif diff --git a/arch/hexagon/include/asm/barrier.h b/arch/hexagon/include/asm/barrier.h index 1041a8e70ce8..4e863daea25b 100644 --- a/arch/hexagon/include/asm/barrier.h +++ b/arch/hexagon/include/asm/barrier.h @@ -29,10 +29,6 @@ #define smp_read_barrier_depends() barrier() #define smp_wmb() barrier() #define smp_mb() barrier() -#define smp_mb__before_atomic_dec() barrier() -#define smp_mb__after_atomic_dec() barrier() -#define smp_mb__before_atomic_inc() barrier() -#define smp_mb__after_atomic_inc() barrier() /* Set a value and use a memory barrier. Used by the scheduler somewhere. */ #define set_mb(var, value) \ From 93ea02bb84354370e51de803a9405f171f3edf88 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Nov 2013 14:57:36 +0100 Subject: [PATCH 27/34] arch: Clean up asm/barrier.h implementations using asm-generic/barrier.h We're going to be adding a few new barrier primitives, and in order to avoid endless duplication make more agressive use of asm-generic/barrier.h. Change the asm-generic/barrier.h such that it allows partial barrier definitions and fills out the rest with defaults. There are a few architectures (m32r, m68k) that could probably do away with their barrier.h file entirely but are kept for now due to their unconventional nop() implementation. Suggested-by: Geert Uytterhoeven Reviewed-by: "Paul E. McKenney" Reviewed-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra Cc: Michael Ellerman Cc: Michael Neuling Cc: Russell King Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Victor Kaplansky Cc: Tony Luck Cc: Oleg Nesterov Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20131213150640.846368594@infradead.org Signed-off-by: Ingo Molnar --- arch/alpha/include/asm/barrier.h | 25 ++------- arch/arc/include/asm/Kbuild | 1 + arch/avr32/include/asm/barrier.h | 17 ++---- arch/blackfin/include/asm/barrier.h | 18 +----- arch/cris/include/asm/Kbuild | 1 + arch/cris/include/asm/barrier.h | 25 --------- arch/frv/include/asm/barrier.h | 8 +-- arch/hexagon/include/asm/Kbuild | 1 + arch/m32r/include/asm/barrier.h | 80 +-------------------------- arch/m68k/include/asm/barrier.h | 14 +---- arch/microblaze/include/asm/Kbuild | 1 + arch/microblaze/include/asm/barrier.h | 27 --------- arch/mn10300/include/asm/Kbuild | 1 + arch/mn10300/include/asm/barrier.h | 37 ------------- arch/parisc/include/asm/Kbuild | 1 + arch/parisc/include/asm/barrier.h | 35 ------------ arch/score/include/asm/Kbuild | 1 + arch/score/include/asm/barrier.h | 16 ------ arch/sh/include/asm/barrier.h | 21 +------ arch/sparc/include/asm/barrier_32.h | 12 +--- arch/tile/include/asm/barrier.h | 68 +---------------------- arch/unicore32/include/asm/barrier.h | 11 +--- arch/xtensa/include/asm/barrier.h | 9 +-- include/asm-generic/barrier.h | 42 +++++++++----- 24 files changed, 58 insertions(+), 414 deletions(-) delete mode 100644 arch/cris/include/asm/barrier.h delete mode 100644 arch/microblaze/include/asm/barrier.h delete mode 100644 arch/mn10300/include/asm/barrier.h delete mode 100644 arch/parisc/include/asm/barrier.h delete mode 100644 arch/score/include/asm/barrier.h diff --git a/arch/alpha/include/asm/barrier.h b/arch/alpha/include/asm/barrier.h index ce8860a0b32d..3832bdb794fe 100644 --- a/arch/alpha/include/asm/barrier.h +++ b/arch/alpha/include/asm/barrier.h @@ -3,33 +3,18 @@ #include -#define mb() \ -__asm__ __volatile__("mb": : :"memory") +#define mb() __asm__ __volatile__("mb": : :"memory") +#define rmb() __asm__ __volatile__("mb": : :"memory") +#define wmb() __asm__ __volatile__("wmb": : :"memory") -#define rmb() \ -__asm__ __volatile__("mb": : :"memory") - -#define wmb() \ -__asm__ __volatile__("wmb": : :"memory") - -#define read_barrier_depends() \ -__asm__ __volatile__("mb": : :"memory") +#define read_barrier_depends() __asm__ __volatile__("mb": : :"memory") #ifdef CONFIG_SMP #define __ASM_SMP_MB "\tmb\n" -#define smp_mb() mb() -#define smp_rmb() rmb() -#define smp_wmb() wmb() -#define smp_read_barrier_depends() read_barrier_depends() #else #define __ASM_SMP_MB -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define smp_read_barrier_depends() do { } while (0) #endif -#define set_mb(var, value) \ -do { var = value; mb(); } while (0) +#include #endif /* __BARRIER_H */ diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild index 5943f7f9d325..e07c786011af 100644 --- a/arch/arc/include/asm/Kbuild +++ b/arch/arc/include/asm/Kbuild @@ -47,3 +47,4 @@ generic-y += user.h generic-y += vga.h generic-y += xor.h generic-y += preempt.h +generic-y += barrier.h diff --git a/arch/avr32/include/asm/barrier.h b/arch/avr32/include/asm/barrier.h index 0961275373db..715100790fd0 100644 --- a/arch/avr32/include/asm/barrier.h +++ b/arch/avr32/include/asm/barrier.h @@ -8,22 +8,15 @@ #ifndef __ASM_AVR32_BARRIER_H #define __ASM_AVR32_BARRIER_H -#define nop() asm volatile("nop") - -#define mb() asm volatile("" : : : "memory") -#define rmb() mb() -#define wmb() asm volatile("sync 0" : : : "memory") -#define read_barrier_depends() do { } while(0) -#define set_mb(var, value) do { var = value; mb(); } while(0) +/* + * Weirdest thing ever.. no full barrier, but it has a write barrier! + */ +#define wmb() asm volatile("sync 0" : : : "memory") #ifdef CONFIG_SMP # error "The AVR32 port does not support SMP" -#else -# define smp_mb() barrier() -# define smp_rmb() barrier() -# define smp_wmb() barrier() -# define smp_read_barrier_depends() do { } while(0) #endif +#include #endif /* __ASM_AVR32_BARRIER_H */ diff --git a/arch/blackfin/include/asm/barrier.h b/arch/blackfin/include/asm/barrier.h index ebb189507dd7..19283a16ac08 100644 --- a/arch/blackfin/include/asm/barrier.h +++ b/arch/blackfin/include/asm/barrier.h @@ -23,26 +23,10 @@ # define rmb() do { barrier(); smp_check_barrier(); } while (0) # define wmb() do { barrier(); smp_mark_barrier(); } while (0) # define read_barrier_depends() do { barrier(); smp_check_barrier(); } while (0) -#else -# define mb() barrier() -# define rmb() barrier() -# define wmb() barrier() -# define read_barrier_depends() do { } while (0) #endif -#else /* !CONFIG_SMP */ - -#define mb() barrier() -#define rmb() barrier() -#define wmb() barrier() -#define read_barrier_depends() do { } while (0) - #endif /* !CONFIG_SMP */ -#define smp_mb() mb() -#define smp_rmb() rmb() -#define smp_wmb() wmb() -#define set_mb(var, value) do { var = value; mb(); } while (0) -#define smp_read_barrier_depends() read_barrier_depends() +#include #endif /* _BLACKFIN_BARRIER_H */ diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild index b06caf649a95..35ec2e5ca832 100644 --- a/arch/cris/include/asm/Kbuild +++ b/arch/cris/include/asm/Kbuild @@ -12,3 +12,4 @@ generic-y += trace_clock.h generic-y += vga.h generic-y += xor.h generic-y += preempt.h +generic-y += barrier.h diff --git a/arch/cris/include/asm/barrier.h b/arch/cris/include/asm/barrier.h deleted file mode 100644 index 198ad7fa6b25..000000000000 --- a/arch/cris/include/asm/barrier.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef __ASM_CRIS_BARRIER_H -#define __ASM_CRIS_BARRIER_H - -#define nop() __asm__ __volatile__ ("nop"); - -#define barrier() __asm__ __volatile__("": : :"memory") -#define mb() barrier() -#define rmb() mb() -#define wmb() mb() -#define read_barrier_depends() do { } while(0) -#define set_mb(var, value) do { var = value; mb(); } while (0) - -#ifdef CONFIG_SMP -#define smp_mb() mb() -#define smp_rmb() rmb() -#define smp_wmb() wmb() -#define smp_read_barrier_depends() read_barrier_depends() -#else -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define smp_read_barrier_depends() do { } while(0) -#endif - -#endif /* __ASM_CRIS_BARRIER_H */ diff --git a/arch/frv/include/asm/barrier.h b/arch/frv/include/asm/barrier.h index 06776ad9f5e9..abbef470154c 100644 --- a/arch/frv/include/asm/barrier.h +++ b/arch/frv/include/asm/barrier.h @@ -17,13 +17,7 @@ #define mb() asm volatile ("membar" : : :"memory") #define rmb() asm volatile ("membar" : : :"memory") #define wmb() asm volatile ("membar" : : :"memory") -#define read_barrier_depends() do { } while (0) -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define smp_read_barrier_depends() do {} while(0) -#define set_mb(var, value) \ - do { var = (value); barrier(); } while (0) +#include #endif /* _ASM_BARRIER_H */ diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index 67c3450309b7..a614ec9747a6 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild @@ -54,3 +54,4 @@ generic-y += ucontext.h generic-y += unaligned.h generic-y += xor.h generic-y += preempt.h +generic-y += barrier.h diff --git a/arch/m32r/include/asm/barrier.h b/arch/m32r/include/asm/barrier.h index 6976621efd3f..1a40265e8d88 100644 --- a/arch/m32r/include/asm/barrier.h +++ b/arch/m32r/include/asm/barrier.h @@ -11,84 +11,6 @@ #define nop() __asm__ __volatile__ ("nop" : : ) -/* - * Memory barrier. - * - * mb() prevents loads and stores being reordered across this point. - * rmb() prevents loads being reordered across this point. - * wmb() prevents stores being reordered across this point. - */ -#define mb() barrier() -#define rmb() mb() -#define wmb() mb() - -/** - * read_barrier_depends - Flush all pending reads that subsequents reads - * depend on. - * - * No data-dependent reads from memory-like regions are ever reordered - * over this barrier. All reads preceding this primitive are guaranteed - * to access memory (but not necessarily other CPUs' caches) before any - * reads following this primitive that depend on the data return by - * any of the preceding reads. This primitive is much lighter weight than - * rmb() on most CPUs, and is never heavier weight than is - * rmb(). - * - * These ordering constraints are respected by both the local CPU - * and the compiler. - * - * Ordering is not guaranteed by anything other than these primitives, - * not even by data dependencies. See the documentation for - * memory_barrier() for examples and URLs to more information. - * - * For example, the following code would force ordering (the initial - * value of "a" is zero, "b" is one, and "p" is "&a"): - * - * - * CPU 0 CPU 1 - * - * b = 2; - * memory_barrier(); - * p = &b; q = p; - * read_barrier_depends(); - * d = *q; - * - * - * - * because the read of "*q" depends on the read of "p" and these - * two reads are separated by a read_barrier_depends(). However, - * the following code, with the same initial values for "a" and "b": - * - * - * CPU 0 CPU 1 - * - * a = 2; - * memory_barrier(); - * b = 3; y = b; - * read_barrier_depends(); - * x = a; - * - * - * does not enforce ordering, since there is no data dependency between - * the read of "a" and the read of "b". Therefore, on some CPUs, such - * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() - * in cases like this where there are no data dependencies. - **/ - -#define read_barrier_depends() do { } while (0) - -#ifdef CONFIG_SMP -#define smp_mb() mb() -#define smp_rmb() rmb() -#define smp_wmb() wmb() -#define smp_read_barrier_depends() read_barrier_depends() -#define set_mb(var, value) do { (void) xchg(&var, value); } while (0) -#else -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define smp_read_barrier_depends() do { } while (0) -#define set_mb(var, value) do { var = value; barrier(); } while (0) -#endif +#include #endif /* _ASM_M32R_BARRIER_H */ diff --git a/arch/m68k/include/asm/barrier.h b/arch/m68k/include/asm/barrier.h index 445ce22c23cb..15c5f77c1614 100644 --- a/arch/m68k/include/asm/barrier.h +++ b/arch/m68k/include/asm/barrier.h @@ -1,20 +1,8 @@ #ifndef _M68K_BARRIER_H #define _M68K_BARRIER_H -/* - * Force strict CPU ordering. - * Not really required on m68k... - */ #define nop() do { asm volatile ("nop"); barrier(); } while (0) -#define mb() barrier() -#define rmb() barrier() -#define wmb() barrier() -#define read_barrier_depends() ((void)0) -#define set_mb(var, value) ({ (var) = (value); wmb(); }) -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define smp_read_barrier_depends() ((void)0) +#include #endif /* _M68K_BARRIER_H */ diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild index ce0bbf8f5640..f77fb6630b11 100644 --- a/arch/microblaze/include/asm/Kbuild +++ b/arch/microblaze/include/asm/Kbuild @@ -4,3 +4,4 @@ generic-y += exec.h generic-y += trace_clock.h generic-y += syscalls.h generic-y += preempt.h +generic-y += barrier.h diff --git a/arch/microblaze/include/asm/barrier.h b/arch/microblaze/include/asm/barrier.h deleted file mode 100644 index df5be3e87044..000000000000 --- a/arch/microblaze/include/asm/barrier.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (C) 2006 Atmark Techno, Inc. - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ - -#ifndef _ASM_MICROBLAZE_BARRIER_H -#define _ASM_MICROBLAZE_BARRIER_H - -#define nop() asm volatile ("nop") - -#define smp_read_barrier_depends() do {} while (0) -#define read_barrier_depends() do {} while (0) - -#define mb() barrier() -#define rmb() mb() -#define wmb() mb() -#define set_mb(var, value) do { var = value; mb(); } while (0) -#define set_wmb(var, value) do { var = value; wmb(); } while (0) - -#define smp_mb() mb() -#define smp_rmb() rmb() -#define smp_wmb() wmb() - -#endif /* _ASM_MICROBLAZE_BARRIER_H */ diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild index 74742dc6a3da..367ef399ddf7 100644 --- a/arch/mn10300/include/asm/Kbuild +++ b/arch/mn10300/include/asm/Kbuild @@ -3,3 +3,4 @@ generic-y += clkdev.h generic-y += exec.h generic-y += trace_clock.h generic-y += preempt.h +generic-y += barrier.h diff --git a/arch/mn10300/include/asm/barrier.h b/arch/mn10300/include/asm/barrier.h deleted file mode 100644 index 2bd97a5c8af7..000000000000 --- a/arch/mn10300/include/asm/barrier.h +++ /dev/null @@ -1,37 +0,0 @@ -/* MN10300 memory barrier definitions - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ -#ifndef _ASM_BARRIER_H -#define _ASM_BARRIER_H - -#define nop() asm volatile ("nop") - -#define mb() asm volatile ("": : :"memory") -#define rmb() mb() -#define wmb() asm volatile ("": : :"memory") - -#ifdef CONFIG_SMP -#define smp_mb() mb() -#define smp_rmb() rmb() -#define smp_wmb() wmb() -#define set_mb(var, value) do { xchg(&var, value); } while (0) -#else /* CONFIG_SMP */ -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define set_mb(var, value) do { var = value; mb(); } while (0) -#endif /* CONFIG_SMP */ - -#define set_wmb(var, value) do { var = value; wmb(); } while (0) - -#define read_barrier_depends() do {} while (0) -#define smp_read_barrier_depends() do {} while (0) - -#endif /* _ASM_BARRIER_H */ diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index a603b9ebe54c..8df06d0074f4 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -5,3 +5,4 @@ generic-y += word-at-a-time.h auxvec.h user.h cputime.h emergency-restart.h \ poll.h xor.h clkdev.h exec.h generic-y += trace_clock.h generic-y += preempt.h +generic-y += barrier.h diff --git a/arch/parisc/include/asm/barrier.h b/arch/parisc/include/asm/barrier.h deleted file mode 100644 index e77d834aa803..000000000000 --- a/arch/parisc/include/asm/barrier.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef __PARISC_BARRIER_H -#define __PARISC_BARRIER_H - -/* -** This is simply the barrier() macro from linux/kernel.h but when serial.c -** uses tqueue.h uses smp_mb() defined using barrier(), linux/kernel.h -** hasn't yet been included yet so it fails, thus repeating the macro here. -** -** PA-RISC architecture allows for weakly ordered memory accesses although -** none of the processors use it. There is a strong ordered bit that is -** set in the O-bit of the page directory entry. Operating systems that -** can not tolerate out of order accesses should set this bit when mapping -** pages. The O-bit of the PSW should also be set to 1 (I don't believe any -** of the processor implemented the PSW O-bit). The PCX-W ERS states that -** the TLB O-bit is not implemented so the page directory does not need to -** have the O-bit set when mapping pages (section 3.1). This section also -** states that the PSW Y, Z, G, and O bits are not implemented. -** So it looks like nothing needs to be done for parisc-linux (yet). -** (thanks to chada for the above comment -ggg) -** -** The __asm__ op below simple prevents gcc/ld from reordering -** instructions across the mb() "call". -*/ -#define mb() __asm__ __volatile__("":::"memory") /* barrier() */ -#define rmb() mb() -#define wmb() mb() -#define smp_mb() mb() -#define smp_rmb() mb() -#define smp_wmb() mb() -#define smp_read_barrier_depends() do { } while(0) -#define read_barrier_depends() do { } while(0) - -#define set_mb(var, value) do { var = value; mb(); } while (0) - -#endif /* __PARISC_BARRIER_H */ diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild index f3414ade77a3..ee2993b6e5d1 100644 --- a/arch/score/include/asm/Kbuild +++ b/arch/score/include/asm/Kbuild @@ -5,3 +5,4 @@ generic-y += clkdev.h generic-y += trace_clock.h generic-y += xor.h generic-y += preempt.h +generic-y += barrier.h diff --git a/arch/score/include/asm/barrier.h b/arch/score/include/asm/barrier.h deleted file mode 100644 index 0eacb6471e6d..000000000000 --- a/arch/score/include/asm/barrier.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _ASM_SCORE_BARRIER_H -#define _ASM_SCORE_BARRIER_H - -#define mb() barrier() -#define rmb() barrier() -#define wmb() barrier() -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() - -#define read_barrier_depends() do {} while (0) -#define smp_read_barrier_depends() do {} while (0) - -#define set_mb(var, value) do {var = value; wmb(); } while (0) - -#endif /* _ASM_SCORE_BARRIER_H */ diff --git a/arch/sh/include/asm/barrier.h b/arch/sh/include/asm/barrier.h index 72c103dae300..43715308b068 100644 --- a/arch/sh/include/asm/barrier.h +++ b/arch/sh/include/asm/barrier.h @@ -26,29 +26,14 @@ #if defined(CONFIG_CPU_SH4A) || defined(CONFIG_CPU_SH5) #define mb() __asm__ __volatile__ ("synco": : :"memory") #define rmb() mb() -#define wmb() __asm__ __volatile__ ("synco": : :"memory") +#define wmb() mb() #define ctrl_barrier() __icbi(PAGE_OFFSET) -#define read_barrier_depends() do { } while(0) #else -#define mb() __asm__ __volatile__ ("": : :"memory") -#define rmb() mb() -#define wmb() __asm__ __volatile__ ("": : :"memory") #define ctrl_barrier() __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop") -#define read_barrier_depends() do { } while(0) -#endif - -#ifdef CONFIG_SMP -#define smp_mb() mb() -#define smp_rmb() rmb() -#define smp_wmb() wmb() -#define smp_read_barrier_depends() read_barrier_depends() -#else -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define smp_read_barrier_depends() do { } while(0) #endif #define set_mb(var, value) do { (void)xchg(&var, value); } while (0) +#include + #endif /* __ASM_SH_BARRIER_H */ diff --git a/arch/sparc/include/asm/barrier_32.h b/arch/sparc/include/asm/barrier_32.h index c1b76654ee76..ae69eda288f4 100644 --- a/arch/sparc/include/asm/barrier_32.h +++ b/arch/sparc/include/asm/barrier_32.h @@ -1,15 +1,7 @@ #ifndef __SPARC_BARRIER_H #define __SPARC_BARRIER_H -/* XXX Change this if we ever use a PSO mode kernel. */ -#define mb() __asm__ __volatile__ ("" : : : "memory") -#define rmb() mb() -#define wmb() mb() -#define read_barrier_depends() do { } while(0) -#define set_mb(__var, __value) do { __var = __value; mb(); } while(0) -#define smp_mb() __asm__ __volatile__("":::"memory") -#define smp_rmb() __asm__ __volatile__("":::"memory") -#define smp_wmb() __asm__ __volatile__("":::"memory") -#define smp_read_barrier_depends() do { } while(0) +#include /* for nop() */ +#include #endif /* !(__SPARC_BARRIER_H) */ diff --git a/arch/tile/include/asm/barrier.h b/arch/tile/include/asm/barrier.h index a9a73da5865d..b5a05d050a8f 100644 --- a/arch/tile/include/asm/barrier.h +++ b/arch/tile/include/asm/barrier.h @@ -22,59 +22,6 @@ #include #include -/* - * read_barrier_depends - Flush all pending reads that subsequents reads - * depend on. - * - * No data-dependent reads from memory-like regions are ever reordered - * over this barrier. All reads preceding this primitive are guaranteed - * to access memory (but not necessarily other CPUs' caches) before any - * reads following this primitive that depend on the data return by - * any of the preceding reads. This primitive is much lighter weight than - * rmb() on most CPUs, and is never heavier weight than is - * rmb(). - * - * These ordering constraints are respected by both the local CPU - * and the compiler. - * - * Ordering is not guaranteed by anything other than these primitives, - * not even by data dependencies. See the documentation for - * memory_barrier() for examples and URLs to more information. - * - * For example, the following code would force ordering (the initial - * value of "a" is zero, "b" is one, and "p" is "&a"): - * - * - * CPU 0 CPU 1 - * - * b = 2; - * memory_barrier(); - * p = &b; q = p; - * read_barrier_depends(); - * d = *q; - * - * - * because the read of "*q" depends on the read of "p" and these - * two reads are separated by a read_barrier_depends(). However, - * the following code, with the same initial values for "a" and "b": - * - * - * CPU 0 CPU 1 - * - * a = 2; - * memory_barrier(); - * b = 3; y = b; - * read_barrier_depends(); - * x = a; - * - * - * does not enforce ordering, since there is no data dependency between - * the read of "a" and the read of "b". Therefore, on some CPUs, such - * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() - * in cases like this where there are no data dependencies. - */ -#define read_barrier_depends() do { } while (0) - #define __sync() __insn_mf() #include @@ -125,20 +72,7 @@ mb_incoherent(void) #define mb() fast_mb() #define iob() fast_iob() -#ifdef CONFIG_SMP -#define smp_mb() mb() -#define smp_rmb() rmb() -#define smp_wmb() wmb() -#define smp_read_barrier_depends() read_barrier_depends() -#else -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define smp_read_barrier_depends() do { } while (0) -#endif - -#define set_mb(var, value) \ - do { var = value; mb(); } while (0) +#include #endif /* !__ASSEMBLY__ */ #endif /* _ASM_TILE_BARRIER_H */ diff --git a/arch/unicore32/include/asm/barrier.h b/arch/unicore32/include/asm/barrier.h index a6620e5336b6..83d6a520f4bd 100644 --- a/arch/unicore32/include/asm/barrier.h +++ b/arch/unicore32/include/asm/barrier.h @@ -14,15 +14,6 @@ #define dsb() __asm__ __volatile__ ("" : : : "memory") #define dmb() __asm__ __volatile__ ("" : : : "memory") -#define mb() barrier() -#define rmb() barrier() -#define wmb() barrier() -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define read_barrier_depends() do { } while (0) -#define smp_read_barrier_depends() do { } while (0) - -#define set_mb(var, value) do { var = value; smp_mb(); } while (0) +#include #endif /* __UNICORE_BARRIER_H__ */ diff --git a/arch/xtensa/include/asm/barrier.h b/arch/xtensa/include/asm/barrier.h index ef021677d536..e1ee6b51dfc5 100644 --- a/arch/xtensa/include/asm/barrier.h +++ b/arch/xtensa/include/asm/barrier.h @@ -9,21 +9,14 @@ #ifndef _XTENSA_SYSTEM_H #define _XTENSA_SYSTEM_H -#define smp_read_barrier_depends() do { } while(0) -#define read_barrier_depends() do { } while(0) - #define mb() ({ __asm__ __volatile__("memw" : : : "memory"); }) #define rmb() barrier() #define wmb() mb() #ifdef CONFIG_SMP #error smp_* not defined -#else -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() #endif -#define set_mb(var, value) do { var = value; mb(); } while (0) +#include #endif /* _XTENSA_SYSTEM_H */ diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 639d7a4d033b..d12a90f93689 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -1,4 +1,5 @@ -/* Generic barrier definitions, based on MN10300 definitions. +/* + * Generic barrier definitions, originally based on MN10300 definitions. * * It should be possible to use these on really simple architectures, * but it serves more as a starting point for new ports. @@ -16,35 +17,50 @@ #ifndef __ASSEMBLY__ -#define nop() asm volatile ("nop") +#include + +#ifndef nop +#define nop() asm volatile ("nop") +#endif /* - * Force strict CPU ordering. - * And yes, this is required on UP too when we're talking - * to devices. + * Force strict CPU ordering. And yes, this is required on UP too when we're + * talking to devices. * - * This implementation only contains a compiler barrier. + * Fall back to compiler barriers if nothing better is provided. */ -#define mb() asm volatile ("": : :"memory") +#ifndef mb +#define mb() barrier() +#endif + +#ifndef rmb #define rmb() mb() -#define wmb() asm volatile ("": : :"memory") +#endif + +#ifndef wmb +#define wmb() mb() +#endif + +#ifndef read_barrier_depends +#define read_barrier_depends() do { } while (0) +#endif #ifdef CONFIG_SMP #define smp_mb() mb() #define smp_rmb() rmb() #define smp_wmb() wmb() +#define smp_read_barrier_depends() read_barrier_depends() #else #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() +#define smp_read_barrier_depends() do { } while (0) #endif -#define set_mb(var, value) do { var = value; mb(); } while (0) -#define set_wmb(var, value) do { var = value; wmb(); } while (0) - -#define read_barrier_depends() do {} while (0) -#define smp_read_barrier_depends() do {} while (0) +#ifndef set_mb +#define set_mb(var, value) do { (var) = (value); mb(); } while (0) +#endif #endif /* !__ASSEMBLY__ */ #endif /* __ASM_GENERIC_BARRIER_H */ From 47933ad41a86a4a9b50bed7c9b9bd2ba242aac63 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Nov 2013 14:57:36 +0100 Subject: [PATCH 28/34] arch: Introduce smp_load_acquire(), smp_store_release() A number of situations currently require the heavyweight smp_mb(), even though there is no need to order prior stores against later loads. Many architectures have much cheaper ways to handle these situations, but the Linux kernel currently has no portable way to make use of them. This commit therefore supplies smp_load_acquire() and smp_store_release() to remedy this situation. The new smp_load_acquire() primitive orders the specified load against any subsequent reads or writes, while the new smp_store_release() primitive orders the specifed store against any prior reads or writes. These primitives allow array-based circular FIFOs to be implemented without an smp_mb(), and also allow a theoretical hole in rcu_assign_pointer() to be closed at no additional expense on most architectures. In addition, the RCU experience transitioning from explicit smp_read_barrier_depends() and smp_wmb() to rcu_dereference() and rcu_assign_pointer(), respectively resulted in substantial improvements in readability. It therefore seems likely that replacing other explicit barriers with smp_load_acquire() and smp_store_release() will provide similar benefits. It appears that roughly half of the explicit barriers in core kernel code might be so replaced. [Changelog by PaulMck] Reviewed-by: "Paul E. McKenney" Signed-off-by: Peter Zijlstra Acked-by: Will Deacon Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker Cc: Mathieu Desnoyers Cc: Michael Ellerman Cc: Michael Neuling Cc: Russell King Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Victor Kaplansky Cc: Tony Luck Cc: Oleg Nesterov Link: http://lkml.kernel.org/r/20131213150640.908486364@infradead.org Signed-off-by: Ingo Molnar --- arch/arm/include/asm/barrier.h | 15 +++++++++ arch/arm64/include/asm/barrier.h | 50 +++++++++++++++++++++++++++++ arch/ia64/include/asm/barrier.h | 23 +++++++++++++ arch/metag/include/asm/barrier.h | 15 +++++++++ arch/mips/include/asm/barrier.h | 15 +++++++++ arch/powerpc/include/asm/barrier.h | 21 +++++++++++- arch/s390/include/asm/barrier.h | 15 +++++++++ arch/sparc/include/asm/barrier_64.h | 15 +++++++++ arch/x86/include/asm/barrier.h | 43 ++++++++++++++++++++++++- include/asm-generic/barrier.h | 15 +++++++++ include/linux/compiler.h | 9 ++++++ 11 files changed, 234 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/barrier.h b/arch/arm/include/asm/barrier.h index 60f15e274e6d..2f59f7443396 100644 --- a/arch/arm/include/asm/barrier.h +++ b/arch/arm/include/asm/barrier.h @@ -59,6 +59,21 @@ #define smp_wmb() dmb(ishst) #endif +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ___p1; \ +}) + #define read_barrier_depends() do { } while(0) #define smp_read_barrier_depends() do { } while(0) diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index d4a63338a53c..78e20ba8806b 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h @@ -35,10 +35,60 @@ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() + +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ___p1; \ +}) + #else + #define smp_mb() asm volatile("dmb ish" : : : "memory") #define smp_rmb() asm volatile("dmb ishld" : : : "memory") #define smp_wmb() asm volatile("dmb ishst" : : : "memory") + +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + switch (sizeof(*p)) { \ + case 4: \ + asm volatile ("stlr %w1, %0" \ + : "=Q" (*p) : "r" (v) : "memory"); \ + break; \ + case 8: \ + asm volatile ("stlr %1, %0" \ + : "=Q" (*p) : "r" (v) : "memory"); \ + break; \ + } \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1; \ + compiletime_assert_atomic_type(*p); \ + switch (sizeof(*p)) { \ + case 4: \ + asm volatile ("ldar %w0, %1" \ + : "=r" (___p1) : "Q" (*p) : "memory"); \ + break; \ + case 8: \ + asm volatile ("ldar %0, %1" \ + : "=r" (___p1) : "Q" (*p) : "memory"); \ + break; \ + } \ + ___p1; \ +}) + #endif #define read_barrier_depends() do { } while(0) diff --git a/arch/ia64/include/asm/barrier.h b/arch/ia64/include/asm/barrier.h index 60576e06b6fb..d0a69aa35e27 100644 --- a/arch/ia64/include/asm/barrier.h +++ b/arch/ia64/include/asm/barrier.h @@ -45,13 +45,36 @@ # define smp_rmb() rmb() # define smp_wmb() wmb() # define smp_read_barrier_depends() read_barrier_depends() + #else + # define smp_mb() barrier() # define smp_rmb() barrier() # define smp_wmb() barrier() # define smp_read_barrier_depends() do { } while(0) + #endif +/* + * IA64 GCC turns volatile stores into st.rel and volatile loads into ld.acq no + * need for asm trickery! + */ + +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ___p1; \ +}) + /* * XXX check on this ---I suspect what Linus really wants here is * acquire vs release semantics but we can't discuss this stuff with diff --git a/arch/metag/include/asm/barrier.h b/arch/metag/include/asm/barrier.h index c90bfc6bf648..5d6b4b407dda 100644 --- a/arch/metag/include/asm/barrier.h +++ b/arch/metag/include/asm/barrier.h @@ -82,4 +82,19 @@ static inline void fence(void) #define smp_read_barrier_depends() do { } while (0) #define set_mb(var, value) do { var = value; smp_mb(); } while (0) +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ___p1; \ +}) + #endif /* _ASM_METAG_BARRIER_H */ diff --git a/arch/mips/include/asm/barrier.h b/arch/mips/include/asm/barrier.h index f26d8e1bf3c3..e1aa4e4c2984 100644 --- a/arch/mips/include/asm/barrier.h +++ b/arch/mips/include/asm/barrier.h @@ -180,4 +180,19 @@ #define nudge_writes() mb() #endif +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ___p1; \ +}) + #endif /* __ASM_BARRIER_H */ diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index ae782254e731..f89da808ce31 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -45,11 +45,15 @@ # define SMPWMB eieio #endif +#define __lwsync() __asm__ __volatile__ (stringify_in_c(LWSYNC) : : :"memory") + #define smp_mb() mb() -#define smp_rmb() __asm__ __volatile__ (stringify_in_c(LWSYNC) : : :"memory") +#define smp_rmb() __lwsync() #define smp_wmb() __asm__ __volatile__ (stringify_in_c(SMPWMB) : : :"memory") #define smp_read_barrier_depends() read_barrier_depends() #else +#define __lwsync() barrier() + #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() @@ -65,4 +69,19 @@ #define data_barrier(x) \ asm volatile("twi 0,%0,0; isync" : : "r" (x) : "memory"); +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + __lwsync(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + __lwsync(); \ + ___p1; \ +}) + #endif /* _ASM_POWERPC_BARRIER_H */ diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h index 16760eeb79b0..578680f6207a 100644 --- a/arch/s390/include/asm/barrier.h +++ b/arch/s390/include/asm/barrier.h @@ -32,4 +32,19 @@ #define set_mb(var, value) do { var = value; mb(); } while (0) +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ___p1; \ +}) + #endif /* __ASM_BARRIER_H */ diff --git a/arch/sparc/include/asm/barrier_64.h b/arch/sparc/include/asm/barrier_64.h index 95d45986f908..b5aad964558e 100644 --- a/arch/sparc/include/asm/barrier_64.h +++ b/arch/sparc/include/asm/barrier_64.h @@ -53,4 +53,19 @@ do { __asm__ __volatile__("ba,pt %%xcc, 1f\n\t" \ #define smp_read_barrier_depends() do { } while(0) +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ___p1; \ +}) + #endif /* !(__SPARC64_BARRIER_H) */ diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index c6cd358a1eec..04a48903b2eb 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -92,12 +92,53 @@ #endif #define smp_read_barrier_depends() read_barrier_depends() #define set_mb(var, value) do { (void)xchg(&var, value); } while (0) -#else +#else /* !SMP */ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() #define smp_read_barrier_depends() do { } while (0) #define set_mb(var, value) do { var = value; barrier(); } while (0) +#endif /* SMP */ + +#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) + +/* + * For either of these options x86 doesn't have a strong TSO memory + * model and we should fall back to full barriers. + */ + +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ___p1; \ +}) + +#else /* regular x86 TSO memory ordering */ + +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ___p1; \ +}) + #endif /* diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index d12a90f93689..6f692f8ac664 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -62,5 +62,20 @@ #define set_mb(var, value) do { (var) = (value); mb(); } while (0) #endif +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ___p1; \ +}) + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_GENERIC_BARRIER_H */ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 92669cd182a6..fe7a686dfd8d 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -298,6 +298,11 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) #endif +/* Is this type a native word size -- useful for atomic operations */ +#ifndef __native_word +# define __native_word(t) (sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) +#endif + /* Compile time object size, -1 for unknown */ #ifndef __compiletime_object_size # define __compiletime_object_size(obj) -1 @@ -337,6 +342,10 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); #define compiletime_assert(condition, msg) \ _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__) +#define compiletime_assert_atomic_type(t) \ + compiletime_assert(__native_word(t), \ + "Need native word sized stores/loads for atomicity.") + /* * Prevent the compiler from merging or refetching accesses. The compiler * is also forbidden from reordering successive instances of ACCESS_ONCE(), From 0d00c7b20c7716ce08399570ea48813ecf001aa8 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Sun, 12 Jan 2014 15:31:22 -0800 Subject: [PATCH 29/34] futexes: Clean up various details - Remove unnecessary head variables. - Delete unused parameter in queue_unlock(). Reviewed-by: Darren Hart Reviewed-by: Peter Zijlstra Reviewed-by: Paul E. McKenney Reviewed-by: Thomas Gleixner Signed-off-by: Jason Low Signed-off-by: Davidlohr Bueso Cc: Mike Galbraith Cc: Jeff Mahoney Cc: Linus Torvalds Cc: Scott Norton Cc: Tom Vaden Cc: Aswin Chandramouleeswaran Cc: Waiman Long Cc: Andrew Morton Link: http://lkml.kernel.org/r/1389569486-25487-2-git-send-email-davidlohr@hp.com Signed-off-by: Ingo Molnar --- kernel/futex.c | 39 ++++++++++++--------------------------- 1 file changed, 12 insertions(+), 27 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index f6ff0191ecf7..085f5fa0b342 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -598,13 +598,10 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, { struct futex_pi_state *pi_state = NULL; struct futex_q *this, *next; - struct plist_head *head; struct task_struct *p; pid_t pid = uval & FUTEX_TID_MASK; - head = &hb->chain; - - plist_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, &hb->chain, list) { if (match_futex(&this->key, key)) { /* * Another waiter already exists - bump up @@ -986,7 +983,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) { struct futex_hash_bucket *hb; struct futex_q *this, *next; - struct plist_head *head; union futex_key key = FUTEX_KEY_INIT; int ret; @@ -999,9 +995,8 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) hb = hash_futex(&key); spin_lock(&hb->lock); - head = &hb->chain; - plist_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, &hb->chain, list) { if (match_futex (&this->key, &key)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; @@ -1034,7 +1029,6 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_hash_bucket *hb1, *hb2; - struct plist_head *head; struct futex_q *this, *next; int ret, op_ret; @@ -1082,9 +1076,7 @@ retry_private: goto retry; } - head = &hb1->chain; - - plist_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, &hb1->chain, list) { if (match_futex (&this->key, &key1)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; @@ -1097,10 +1089,8 @@ retry_private: } if (op_ret > 0) { - head = &hb2->chain; - op_ret = 0; - plist_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, &hb2->chain, list) { if (match_futex (&this->key, &key2)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; @@ -1270,7 +1260,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, int drop_count = 0, task_count = 0, ret; struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; - struct plist_head *head1; struct futex_q *this, *next; u32 curval2; @@ -1393,8 +1382,7 @@ retry_private: } } - head1 = &hb1->chain; - plist_for_each_entry_safe(this, next, head1, list) { + plist_for_each_entry_safe(this, next, &hb1->chain, list) { if (task_count - nr_wake >= nr_requeue) break; @@ -1494,7 +1482,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) } static inline void -queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) +queue_unlock(struct futex_hash_bucket *hb) __releases(&hb->lock) { spin_unlock(&hb->lock); @@ -1867,7 +1855,7 @@ retry_private: ret = get_futex_value_locked(&uval, uaddr); if (ret) { - queue_unlock(q, *hb); + queue_unlock(*hb); ret = get_user(uval, uaddr); if (ret) @@ -1881,7 +1869,7 @@ retry_private: } if (uval != val) { - queue_unlock(q, *hb); + queue_unlock(*hb); ret = -EWOULDBLOCK; } @@ -2029,7 +2017,7 @@ retry_private: * Task is exiting and we just wait for the * exit to complete. */ - queue_unlock(&q, hb); + queue_unlock(hb); put_futex_key(&q.key); cond_resched(); goto retry; @@ -2081,7 +2069,7 @@ retry_private: goto out_put_key; out_unlock_put_key: - queue_unlock(&q, hb); + queue_unlock(hb); out_put_key: put_futex_key(&q.key); @@ -2091,7 +2079,7 @@ out: return ret != -EINTR ? ret : -ERESTARTNOINTR; uaddr_faulted: - queue_unlock(&q, hb); + queue_unlock(hb); ret = fault_in_user_writeable(uaddr); if (ret) @@ -2113,7 +2101,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { struct futex_hash_bucket *hb; struct futex_q *this, *next; - struct plist_head *head; union futex_key key = FUTEX_KEY_INIT; u32 uval, vpid = task_pid_vnr(current); int ret; @@ -2153,9 +2140,7 @@ retry: * Ok, other tasks may need to be woken up - check waiters * and do the wakeup if necessary: */ - head = &hb->chain; - - plist_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, &hb->chain, list) { if (!match_futex (&this->key, &key)) continue; ret = wake_futex_pi(uaddr, uval, this); From a52b89ebb6d4499be38780db8d176c5d3a6fbc17 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sun, 12 Jan 2014 15:31:23 -0800 Subject: [PATCH 30/34] futexes: Increase hash table size for better performance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the futex global hash table suffers from its fixed, smallish (for today's standards) size of 256 entries, as well as its lack of NUMA awareness. Large systems, using many futexes, can be prone to high amounts of collisions; where these futexes hash to the same bucket and lead to extra contention on the same hb->lock. Furthermore, cacheline bouncing is a reality when we have multiple hb->locks residing on the same cacheline and different futexes hash to adjacent buckets. This patch keeps the current static size of 16 entries for small systems, or otherwise, 256 * ncpus (or larger as we need to round the number to a power of 2). Note that this number of CPUs accounts for all CPUs that can ever be available in the system, taking into consideration things like hotpluging. While we do impose extra overhead at bootup by making the hash table larger, this is a one time thing, and does not shadow the benefits of this patch. Furthermore, as suggested by tglx, by cache aligning the hash buckets we can avoid access across cacheline boundaries and also avoid massive cache line bouncing if multiple cpus are hammering away at different hash buckets which happen to reside in the same cache line. Also, similar to other core kernel components (pid, dcache, tcp), by using alloc_large_system_hash() we benefit from its NUMA awareness and thus the table is distributed among the nodes instead of in a single one. For a custom microbenchmark that pounds on the uaddr hashing -- making the wait path fail at futex_wait_setup() returning -EWOULDBLOCK for large amounts of futexes, we can see the following benefits on a 80-core, 8-socket 1Tb server: +---------+--------------------+------------------------+-----------------------+-------------------------------+ | threads | baseline (ops/sec) | aligned-only (ops/sec) | large table (ops/sec) | large table+aligned (ops/sec) | +---------+--------------------+------------------------+-----------------------+-------------------------------+ |     512 |              32426 | 50531  (+55.8%)        | 255274  (+687.2%)     | 292553  (+802.2%)             | |     256 |              65360 | 99588  (+52.3%)        | 443563  (+578.6%)     | 508088  (+677.3%)             | |     128 |             125635 | 200075 (+59.2%)        | 742613  (+491.1%)     | 835452  (+564.9%)             | |      80 |             193559 | 323425 (+67.1%)        | 1028147 (+431.1%)     | 1130304 (+483.9%)             | |      64 |             247667 | 443740 (+79.1%)        | 997300  (+302.6%)     | 1145494 (+362.5%)             | |      32 |             628412 | 721401 (+14.7%)        | 965996  (+53.7%)      | 1122115 (+78.5%)              | +---------+--------------------+------------------------+-----------------------+-------------------------------+ Reviewed-by: Darren Hart Reviewed-by: Peter Zijlstra Reviewed-by: Paul E. McKenney Reviewed-by: Waiman Long Reviewed-and-tested-by: Jason Low Reviewed-by: Thomas Gleixner Signed-off-by: Davidlohr Bueso Cc: Mike Galbraith Cc: Jeff Mahoney Cc: Linus Torvalds Cc: Scott Norton Cc: Tom Vaden Cc: Aswin Chandramouleeswaran Link: http://lkml.kernel.org/r/1389569486-25487-3-git-send-email-davidlohr@hp.com Signed-off-by: Ingo Molnar --- kernel/futex.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 085f5fa0b342..577481d5c59d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -63,6 +63,7 @@ #include #include #include +#include #include @@ -70,8 +71,6 @@ int __read_mostly futex_cmpxchg_enabled; -#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) - /* * Futex flags used to encode options to functions and preserve them across * restarts. @@ -149,9 +148,11 @@ static const struct futex_q futex_q_init = { struct futex_hash_bucket { spinlock_t lock; struct plist_head chain; -}; +} ____cacheline_aligned_in_smp; -static struct futex_hash_bucket futex_queues[1<both.word, (sizeof(key->both.word)+sizeof(key->both.ptr))/4, key->both.offset); - return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)]; + return &futex_queues[hash & (futex_hashsize - 1)]; } /* @@ -2719,7 +2720,18 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, static int __init futex_init(void) { u32 curval; - int i; + unsigned long i; + +#if CONFIG_BASE_SMALL + futex_hashsize = 16; +#else + futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); +#endif + + futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), + futex_hashsize, 0, + futex_hashsize < 256 ? HASH_SMALL : 0, + NULL, NULL, futex_hashsize, futex_hashsize); /* * This will fail and we want it. Some arch implementations do @@ -2734,7 +2746,7 @@ static int __init futex_init(void) if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) futex_cmpxchg_enabled = 1; - for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { + for (i = 0; i < futex_hashsize; i++) { plist_head_init(&futex_queues[i].chain); spin_lock_init(&futex_queues[i].lock); } From 99b60ce69734dfeda58c6184a326b9475ce1dba3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 12 Jan 2014 15:31:24 -0800 Subject: [PATCH 31/34] futexes: Document multiprocessor ordering guarantees That's essential, if you want to hack on futexes. Reviewed-by: Darren Hart Reviewed-by: Peter Zijlstra Reviewed-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Signed-off-by: Davidlohr Bueso Cc: Mike Galbraith Cc: Jeff Mahoney Cc: Linus Torvalds Cc: Randy Dunlap Cc: Scott Norton Cc: Tom Vaden Cc: Aswin Chandramouleeswaran Cc: Waiman Long Cc: Jason Low Cc: Andrew Morton Link: http://lkml.kernel.org/r/1389569486-25487-4-git-send-email-davidlohr@hp.com Signed-off-by: Ingo Molnar --- kernel/futex.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/kernel/futex.c b/kernel/futex.c index 577481d5c59d..fcc6850483fb 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -69,6 +69,63 @@ #include "locking/rtmutex_common.h" +/* + * Basic futex operation and ordering guarantees: + * + * The waiter reads the futex value in user space and calls + * futex_wait(). This function computes the hash bucket and acquires + * the hash bucket lock. After that it reads the futex user space value + * again and verifies that the data has not changed. If it has not + * changed it enqueues itself into the hash bucket, releases the hash + * bucket lock and schedules. + * + * The waker side modifies the user space value of the futex and calls + * futex_wake(). This functions computes the hash bucket and acquires + * the hash bucket lock. Then it looks for waiters on that futex in the + * hash bucket and wakes them. + * + * Note that the spin_lock serializes waiters and wakers, so that the + * following scenario is avoided: + * + * CPU 0 CPU 1 + * val = *futex; + * sys_futex(WAIT, futex, val); + * futex_wait(futex, val); + * uval = *futex; + * *futex = newval; + * sys_futex(WAKE, futex); + * futex_wake(futex); + * if (queue_empty()) + * return; + * if (uval == val) + * lock(hash_bucket(futex)); + * queue(); + * unlock(hash_bucket(futex)); + * schedule(); + * + * This would cause the waiter on CPU 0 to wait forever because it + * missed the transition of the user space value from val to newval + * and the waker did not find the waiter in the hash bucket queue. + * The spinlock serializes that: + * + * CPU 0 CPU 1 + * val = *futex; + * sys_futex(WAIT, futex, val); + * futex_wait(futex, val); + * lock(hash_bucket(futex)); + * uval = *futex; + * *futex = newval; + * sys_futex(WAKE, futex); + * futex_wake(futex); + * lock(hash_bucket(futex)); + * if (uval == val) + * queue(); + * unlock(hash_bucket(futex)); + * schedule(); if (!queue_empty()) + * wake_waiters(futex); + * unlock(hash_bucket(futex)); + */ + int __read_mostly futex_cmpxchg_enabled; /* From b0c29f79ecea0b6fbcefc999e70f2843ae8306db Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sun, 12 Jan 2014 15:31:25 -0800 Subject: [PATCH 32/34] futexes: Avoid taking the hb->lock if there's nothing to wake up In futex_wake() there is clearly no point in taking the hb->lock if we know beforehand that there are no tasks to be woken. While the hash bucket's plist head is a cheap way of knowing this, we cannot rely 100% on it as there is a racy window between the futex_wait call and when the task is actually added to the plist. To this end, we couple it with the spinlock check as tasks trying to enter the critical region are most likely potential waiters that will be added to the plist, thus preventing tasks sleeping forever if wakers don't acknowledge all possible waiters. Furthermore, the futex ordering guarantees are preserved, ensuring that waiters either observe the changed user space value before blocking or is woken by a concurrent waker. For wakers, this is done by relying on the barriers in get_futex_key_refs() -- for archs that do not have implicit mb in atomic_inc(), we explicitly add them through a new futex_get_mm function. For waiters we rely on the fact that spin_lock calls already update the head counter, so spinners are visible even if the lock hasn't been acquired yet. For more details please refer to the updated comments in the code and related discussion: https://lkml.org/lkml/2013/11/26/556 Special thanks to tglx for careful review and feedback. Suggested-by: Linus Torvalds Reviewed-by: Darren Hart Reviewed-by: Thomas Gleixner Reviewed-by: Peter Zijlstra Signed-off-by: Davidlohr Bueso Cc: Paul E. McKenney Cc: Mike Galbraith Cc: Jeff Mahoney Cc: Scott Norton Cc: Tom Vaden Cc: Aswin Chandramouleeswaran Cc: Waiman Long Cc: Jason Low Cc: Andrew Morton Link: http://lkml.kernel.org/r/1389569486-25487-5-git-send-email-davidlohr@hp.com Signed-off-by: Ingo Molnar --- kernel/futex.c | 117 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 92 insertions(+), 25 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index fcc6850483fb..30971b5c0e2d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -75,17 +75,20 @@ * The waiter reads the futex value in user space and calls * futex_wait(). This function computes the hash bucket and acquires * the hash bucket lock. After that it reads the futex user space value - * again and verifies that the data has not changed. If it has not - * changed it enqueues itself into the hash bucket, releases the hash - * bucket lock and schedules. + * again and verifies that the data has not changed. If it has not changed + * it enqueues itself into the hash bucket, releases the hash bucket lock + * and schedules. * * The waker side modifies the user space value of the futex and calls - * futex_wake(). This functions computes the hash bucket and acquires - * the hash bucket lock. Then it looks for waiters on that futex in the - * hash bucket and wakes them. + * futex_wake(). This function computes the hash bucket and acquires the + * hash bucket lock. Then it looks for waiters on that futex in the hash + * bucket and wakes them. * - * Note that the spin_lock serializes waiters and wakers, so that the - * following scenario is avoided: + * In futex wake up scenarios where no tasks are blocked on a futex, taking + * the hb spinlock can be avoided and simply return. In order for this + * optimization to work, ordering guarantees must exist so that the waiter + * being added to the list is acknowledged when the list is concurrently being + * checked by the waker, avoiding scenarios like the following: * * CPU 0 CPU 1 * val = *futex; @@ -106,24 +109,52 @@ * This would cause the waiter on CPU 0 to wait forever because it * missed the transition of the user space value from val to newval * and the waker did not find the waiter in the hash bucket queue. - * The spinlock serializes that: * - * CPU 0 CPU 1 + * The correct serialization ensures that a waiter either observes + * the changed user space value before blocking or is woken by a + * concurrent waker: + * + * CPU 0 CPU 1 * val = *futex; * sys_futex(WAIT, futex, val); * futex_wait(futex, val); - * lock(hash_bucket(futex)); - * uval = *futex; - * *futex = newval; - * sys_futex(WAKE, futex); - * futex_wake(futex); - * lock(hash_bucket(futex)); + * + * waiters++; + * mb(); (A) <-- paired with -. + * | + * lock(hash_bucket(futex)); | + * | + * uval = *futex; | + * | *futex = newval; + * | sys_futex(WAKE, futex); + * | futex_wake(futex); + * | + * `-------> mb(); (B) * if (uval == val) - * queue(); + * queue(); * unlock(hash_bucket(futex)); - * schedule(); if (!queue_empty()) - * wake_waiters(futex); - * unlock(hash_bucket(futex)); + * schedule(); if (waiters) + * lock(hash_bucket(futex)); + * wake_waiters(futex); + * unlock(hash_bucket(futex)); + * + * Where (A) orders the waiters increment and the futex value read -- this + * is guaranteed by the head counter in the hb spinlock; and where (B) + * orders the write to futex and the waiters read -- this is done by the + * barriers in get_futex_key_refs(), through either ihold or atomic_inc, + * depending on the futex type. + * + * This yields the following case (where X:=waiters, Y:=futex): + * + * X = Y = 0 + * + * w[X]=1 w[Y]=1 + * MB MB + * r[Y]=y r[X]=x + * + * Which guarantees that x==0 && y==0 is impossible; which translates back into + * the guarantee that we cannot both miss the futex variable change and the + * enqueue. */ int __read_mostly futex_cmpxchg_enabled; @@ -211,6 +242,36 @@ static unsigned long __read_mostly futex_hashsize; static struct futex_hash_bucket *futex_queues; +static inline void futex_get_mm(union futex_key *key) +{ + atomic_inc(&key->private.mm->mm_count); + /* + * Ensure futex_get_mm() implies a full barrier such that + * get_futex_key() implies a full barrier. This is relied upon + * as full barrier (B), see the ordering comment above. + */ + smp_mb__after_atomic_inc(); +} + +static inline bool hb_waiters_pending(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP + /* + * Tasks trying to enter the critical region are most likely + * potential waiters that will be added to the plist. Ensure + * that wakers won't miss to-be-slept tasks in the window between + * the wait call and the actual plist_add. + */ + if (spin_is_locked(&hb->lock)) + return true; + smp_rmb(); /* Make sure we check the lock state first */ + + return !plist_head_empty(&hb->chain); +#else + return true; +#endif +} + /* * We hash on the keys returned from get_futex_key (see below). */ @@ -245,10 +306,10 @@ static void get_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - ihold(key->shared.inode); + ihold(key->shared.inode); /* implies MB (B) */ break; case FUT_OFF_MMSHARED: - atomic_inc(&key->private.mm->mm_count); + futex_get_mm(key); /* implies MB (B) */ break; } } @@ -322,7 +383,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) if (!fshared) { key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); + get_futex_key_refs(key); /* implies MB (B) */ return 0; } @@ -429,7 +490,7 @@ again: key->shared.pgoff = basepage_index(page); } - get_futex_key_refs(key); + get_futex_key_refs(key); /* implies MB (B) */ out: unlock_page(page_head); @@ -1052,6 +1113,11 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) goto out; hb = hash_futex(&key); + + /* Make sure we really have tasks to wakeup */ + if (!hb_waiters_pending(hb)) + goto out_put_key; + spin_lock(&hb->lock); plist_for_each_entry_safe(this, next, &hb->chain, list) { @@ -1072,6 +1138,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) } spin_unlock(&hb->lock); +out_put_key: put_futex_key(&key); out: return ret; @@ -1535,7 +1602,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) hb = hash_futex(&q->key); q->lock_ptr = &hb->lock; - spin_lock(&hb->lock); + spin_lock(&hb->lock); /* implies MB (A) */ return hb; } From f549ed1abc7e4f0292ce08c4143c64a610c8b2cb Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 14 Jan 2014 16:36:10 +1100 Subject: [PATCH 33/34] arch: Re-sort some Kbuild files to hopefully help avoid some conflicts Checkin: 93ea02bb8435 arch: Clean up asm/barrier.h implementations using asm-generic/barrier.h ... unfortunately left some Kbuild files out of order, which caused unnecessary merge conflicts, in particular with checkin: e3fec2f74f7f lib: Add missing arch generic-y entries for asm-generic/hash.h Put them back in order to make the upcoming merges cleaner. Signed-off-by: Stephen Rothwell Link: http://lkml.kernel.org/r/20140114164420.d296fbcc4be3a5f126c86069@canb.auug.org.au Signed-off-by: H. Peter Anvin Cc: Geert Uytterhoeven Cc: "Paul E. McKenney" Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: David Miller --- arch/arc/include/asm/Kbuild | 2 +- arch/cris/include/asm/Kbuild | 2 +- arch/hexagon/include/asm/Kbuild | 2 +- arch/microblaze/include/asm/Kbuild | 2 +- arch/mn10300/include/asm/Kbuild | 2 +- arch/parisc/include/asm/Kbuild | 2 +- arch/score/include/asm/Kbuild | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild index e07c786011af..9ae21c198007 100644 --- a/arch/arc/include/asm/Kbuild +++ b/arch/arc/include/asm/Kbuild @@ -1,4 +1,5 @@ generic-y += auxvec.h +generic-y += barrier.h generic-y += bugs.h generic-y += bitsperlong.h generic-y += clkdev.h @@ -47,4 +48,3 @@ generic-y += user.h generic-y += vga.h generic-y += xor.h generic-y += preempt.h -generic-y += barrier.h diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild index 35ec2e5ca832..199b1a9dab89 100644 --- a/arch/cris/include/asm/Kbuild +++ b/arch/cris/include/asm/Kbuild @@ -3,6 +3,7 @@ header-y += arch-v10/ header-y += arch-v32/ +generic-y += barrier.h generic-y += clkdev.h generic-y += exec.h generic-y += kvm_para.h @@ -12,4 +13,3 @@ generic-y += trace_clock.h generic-y += vga.h generic-y += xor.h generic-y += preempt.h -generic-y += barrier.h diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index a614ec9747a6..ada843c701ef 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild @@ -2,6 +2,7 @@ header-y += ucontext.h generic-y += auxvec.h +generic-y += barrier.h generic-y += bug.h generic-y += bugs.h generic-y += clkdev.h @@ -54,4 +55,3 @@ generic-y += ucontext.h generic-y += unaligned.h generic-y += xor.h generic-y += preempt.h -generic-y += barrier.h diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild index f77fb6630b11..a82426589fff 100644 --- a/arch/microblaze/include/asm/Kbuild +++ b/arch/microblaze/include/asm/Kbuild @@ -1,7 +1,7 @@ +generic-y += barrier.h generic-y += clkdev.h generic-y += exec.h generic-y += trace_clock.h generic-y += syscalls.h generic-y += preempt.h -generic-y += barrier.h diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild index 367ef399ddf7..032143ec2324 100644 --- a/arch/mn10300/include/asm/Kbuild +++ b/arch/mn10300/include/asm/Kbuild @@ -1,6 +1,6 @@ +generic-y += barrier.h generic-y += clkdev.h generic-y += exec.h generic-y += trace_clock.h generic-y += preempt.h -generic-y += barrier.h diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index 8df06d0074f4..34b0be4ca52d 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -1,8 +1,8 @@ +generic-y += barrier.h generic-y += word-at-a-time.h auxvec.h user.h cputime.h emergency-restart.h \ segment.h topology.h vga.h device.h percpu.h hw_irq.h mutex.h \ div64.h irq_regs.h kdebug.h kvm_para.h local64.h local.h param.h \ poll.h xor.h clkdev.h exec.h generic-y += trace_clock.h generic-y += preempt.h -generic-y += barrier.h diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild index ee2993b6e5d1..fe7471eb0167 100644 --- a/arch/score/include/asm/Kbuild +++ b/arch/score/include/asm/Kbuild @@ -1,8 +1,8 @@ header-y += +generic-y += barrier.h generic-y += clkdev.h generic-y += trace_clock.h generic-y += xor.h generic-y += preempt.h -generic-y += barrier.h From 63b1a81699c2a45c9f737419b1ec1da0ecf92812 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 16 Jan 2014 14:54:50 +0100 Subject: [PATCH 34/34] futexes: Fix futex_hashsize initialization "futexes: Increase hash table size for better performance" introduces a new alloc_large_system_hash() call. alloc_large_system_hash() however may allocate less memory than requested, e.g. limited by MAX_ORDER. Hence pass a pointer to alloc_large_system_hash() which will contain the hash shift when the function returns. Afterwards correctly set futex_hashsize. Fixes a crash on s390 where the requested allocation size was 4MB but only 1MB was allocated. Signed-off-by: Heiko Carstens Cc: Darren Hart Cc: Peter Zijlstra Cc: Paul E. McKenney Cc: Waiman Long Cc: Jason Low Cc: Davidlohr Bueso Link: http://lkml.kernel.org/r/20140116135450.GA4345@osiris Signed-off-by: Ingo Molnar --- kernel/futex.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 30971b5c0e2d..1ddc4498f1e1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2844,6 +2844,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, static int __init futex_init(void) { u32 curval; + unsigned int futex_shift; unsigned long i; #if CONFIG_BASE_SMALL @@ -2855,8 +2856,9 @@ static int __init futex_init(void) futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), futex_hashsize, 0, futex_hashsize < 256 ? HASH_SMALL : 0, - NULL, NULL, futex_hashsize, futex_hashsize); - + &futex_shift, NULL, + futex_hashsize, futex_hashsize); + futex_hashsize = 1UL << futex_shift; /* * This will fail and we want it. Some arch implementations do * runtime detection of the futex_atomic_cmpxchg_inatomic()