alistair23-linux/kernel/locking/mcs_spinlock.h
Waiman Long a33fda35e3 locking/qspinlock: Introduce a simple generic 4-byte queued spinlock
This patch introduces a new generic queued spinlock implementation that
can serve as an alternative to the default ticket spinlock. Compared
with the ticket spinlock, this queued spinlock should be almost as fair
as the ticket spinlock. It has about the same speed in single-thread
and it can be much faster in high contention situations especially when
the spinlock is embedded within the data structure to be protected.

Only in light to moderate contention where the average queue depth
is around 1-3 will this queued spinlock be potentially a bit slower
due to the higher slowpath overhead.

This queued spinlock is especially suit to NUMA machines with a large
number of cores as the chance of spinlock contention is much higher
in those machines. The cost of contention is also higher because of
slower inter-node memory traffic.

Due to the fact that spinlocks are acquired with preemption disabled,
the process will not be migrated to another CPU while it is trying
to get a spinlock. Ignoring interrupt handling, a CPU can only be
contending in one spinlock at any one time. Counting soft IRQ, hard
IRQ and NMI, a CPU can only have a maximum of 4 concurrent lock waiting
activities.  By allocating a set of per-cpu queue nodes and used them
to form a waiting queue, we can encode the queue node address into a
much smaller 24-bit size (including CPU number and queue node index)
leaving one byte for the lock.

Please note that the queue node is only needed when waiting for the
lock. Once the lock is acquired, the queue node can be released to
be used later.

Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Daniel J Blueman <daniel@numascale.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Douglas Hatch <doug.hatch@hp.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paolo Bonzini <paolo.bonzini@gmail.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Scott J Norton <scott.norton@hp.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: virtualization@lists.linux-foundation.org
Cc: xen-devel@lists.xenproject.org
Link: http://lkml.kernel.org/r/1429901803-29771-2-git-send-email-Waiman.Long@hp.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-05-08 12:36:25 +02:00

113 lines
3.3 KiB
C

/*
* MCS lock defines
*
* This file contains the main data structure and API definitions of MCS lock.
*
* The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
* with the desirable properties of being fair, and with each cpu trying
* to acquire the lock spinning on a local variable.
* It avoids expensive cache bouncings that common test-and-set spin-lock
* implementations incur.
*/
#ifndef __LINUX_MCS_SPINLOCK_H
#define __LINUX_MCS_SPINLOCK_H
#include <asm/mcs_spinlock.h>
struct mcs_spinlock {
struct mcs_spinlock *next;
int locked; /* 1 if lock acquired */
int count; /* nesting count, see qspinlock.c */
};
#ifndef arch_mcs_spin_lock_contended
/*
* Using smp_load_acquire() provides a memory barrier that ensures
* subsequent operations happen after the lock is acquired.
*/
#define arch_mcs_spin_lock_contended(l) \
do { \
while (!(smp_load_acquire(l))) \
cpu_relax_lowlatency(); \
} while (0)
#endif
#ifndef arch_mcs_spin_unlock_contended
/*
* smp_store_release() provides a memory barrier to ensure all
* operations in the critical section has been completed before
* unlocking.
*/
#define arch_mcs_spin_unlock_contended(l) \
smp_store_release((l), 1)
#endif
/*
* Note: the smp_load_acquire/smp_store_release pair is not
* sufficient to form a full memory barrier across
* cpus for many architectures (except x86) for mcs_unlock and mcs_lock.
* For applications that need a full barrier across multiple cpus
* with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be
* used after mcs_lock.
*/
/*
* In order to acquire the lock, the caller should declare a local node and
* pass a reference of the node to this function in addition to the lock.
* If the lock has already been acquired, then this will proceed to spin
* on this node->locked until the previous lock holder sets the node->locked
* in mcs_spin_unlock().
*/
static inline
void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
struct mcs_spinlock *prev;
/* Init node */
node->locked = 0;
node->next = NULL;
prev = xchg(lock, node);
if (likely(prev == NULL)) {
/*
* Lock acquired, don't need to set node->locked to 1. Threads
* only spin on its own node->locked value for lock acquisition.
* However, since this thread can immediately acquire the lock
* and does not proceed to spin on its own node->locked, this
* value won't be used. If a debug mode is needed to
* audit lock status, then set node->locked value here.
*/
return;
}
WRITE_ONCE(prev->next, node);
/* Wait until the lock holder passes the lock down. */
arch_mcs_spin_lock_contended(&node->locked);
}
/*
* Releases the lock. The caller should pass in the corresponding node that
* was used to acquire the lock.
*/
static inline
void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
struct mcs_spinlock *next = READ_ONCE(node->next);
if (likely(!next)) {
/*
* Release the lock by setting it to NULL
*/
if (likely(cmpxchg(lock, node, NULL) == node))
return;
/* Wait until the next pointer is set */
while (!(next = READ_ONCE(node->next)))
cpu_relax_lowlatency();
}
/* Pass lock to next waiter. */
arch_mcs_spin_unlock_contended(&next->locked);
}
#endif /* __LINUX_MCS_SPINLOCK_H */