diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-06-22 14:54:22 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-06-22 14:54:22 -0700 |
commit | 1bf7067c6e173dc10411704db48338ed69c05565 (patch) | |
tree | 06d731d9647c525fa598d03d7ec957ff9772ff40 /kernel | |
parent | fc934d40178ad4e551a17e2733241d9f29fddd70 (diff) | |
parent | 68722101ec3a0e179408a13708dd020e04f54aab (diff) | |
download | linux-next-1bf7067c6e173dc10411704db48338ed69c05565.tar.gz |
Merge branch 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull locking updates from Ingo Molnar:
"The main changes are:
- 'qspinlock' support, enabled on x86: queued spinlocks - these are
now the spinlock variant used by x86 as they outperform ticket
spinlocks in every category. (Waiman Long)
- 'pvqspinlock' support on x86: paravirtualized variant of queued
spinlocks. (Waiman Long, Peter Zijlstra)
- 'qrwlock' support, enabled on x86: queued rwlocks. Similar to
queued spinlocks, they are now the variant used by x86:
CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y
CONFIG_QUEUED_SPINLOCKS=y
CONFIG_ARCH_USE_QUEUED_RWLOCKS=y
CONFIG_QUEUED_RWLOCKS=y
- various lockdep fixlets
- various locking primitives cleanups, further WRITE_ONCE()
propagation"
* 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (24 commits)
locking/lockdep: Remove hard coded array size dependency
locking/qrwlock: Don't contend with readers when setting _QW_WAITING
lockdep: Do not break user-visible string
locking/arch: Rename set_mb() to smp_store_mb()
locking/arch: Add WRITE_ONCE() to set_mb()
rtmutex: Warn if trylock is called from hard/softirq context
arch: Remove __ARCH_HAVE_CMPXCHG
locking/rtmutex: Drop usage of __HAVE_ARCH_CMPXCHG
locking/qrwlock: Rename QUEUE_RWLOCK to QUEUED_RWLOCKS
locking/pvqspinlock: Rename QUEUED_SPINLOCK to QUEUED_SPINLOCKS
locking/pvqspinlock: Replace xchg() by the more descriptive set_mb()
locking/pvqspinlock, x86: Enable PV qspinlock for Xen
locking/pvqspinlock, x86: Enable PV qspinlock for KVM
locking/pvqspinlock, x86: Implement the paravirt qspinlock call patching
locking/pvqspinlock: Implement simple paravirt support for the qspinlock
locking/qspinlock: Revert to test-and-set on hypervisors
locking/qspinlock: Use a simple write to grab the lock
locking/qspinlock: Optimize for smaller NR_CPUS
locking/qspinlock: Extract out code snippets for the next patch
locking/qspinlock: Add pending bit
...
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Kconfig.locks | 13 | ||||
-rw-r--r-- | kernel/futex.c | 2 | ||||
-rw-r--r-- | kernel/locking/Makefile | 3 | ||||
-rw-r--r-- | kernel/locking/lockdep.c | 3 | ||||
-rw-r--r-- | kernel/locking/mcs_spinlock.h | 1 | ||||
-rw-r--r-- | kernel/locking/qrwlock.c | 30 | ||||
-rw-r--r-- | kernel/locking/qspinlock.c | 473 | ||||
-rw-r--r-- | kernel/locking/qspinlock_paravirt.h | 325 | ||||
-rw-r--r-- | kernel/locking/rtmutex.c | 13 | ||||
-rw-r--r-- | kernel/locking/rwsem-xadd.c | 44 | ||||
-rw-r--r-- | kernel/sched/wait.c | 4 |
11 files changed, 894 insertions, 17 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 08561f1acd13..ebdb0043203a 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -235,9 +235,16 @@ config LOCK_SPIN_ON_OWNER def_bool y depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER -config ARCH_USE_QUEUE_RWLOCK +config ARCH_USE_QUEUED_SPINLOCKS bool -config QUEUE_RWLOCK - def_bool y if ARCH_USE_QUEUE_RWLOCK +config QUEUED_SPINLOCKS + def_bool y if ARCH_USE_QUEUED_SPINLOCKS + depends on SMP + +config ARCH_USE_QUEUED_RWLOCKS + bool + +config QUEUED_RWLOCKS + def_bool y if ARCH_USE_QUEUED_RWLOCKS depends on SMP diff --git a/kernel/futex.c b/kernel/futex.c index 2579e407ff67..55ca63ad9622 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2055,7 +2055,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, { /* * The task state is guaranteed to be set before another task can - * wake it. set_current_state() is implemented using set_mb() and + * wake it. set_current_state() is implemented using smp_store_mb() and * queue_me() calls spin_unlock() upon completion, both serializing * access to the hash list and forcing another memory barrier. */ diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index de7a416cca2a..7dd5c9918e4c 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -17,6 +17,7 @@ obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o +obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o @@ -25,5 +26,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o -obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o +obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index aaeae885d9af..456614136f1a 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4067,8 +4067,7 @@ void __init lockdep_info(void) #ifdef CONFIG_DEBUG_LOCKDEP if (lockdep_init_error) { - printk("WARNING: lockdep init error! lock-%s was acquired" - "before lockdep_init\n", lock_init_error); + printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error); printk("Call stack leading to lockdep invocation was:\n"); print_stack_trace(&lockdep_init_trace, 0); } diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 75e114bdf3f2..fd91aaa4554c 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -17,6 +17,7 @@ struct mcs_spinlock { struct mcs_spinlock *next; int locked; /* 1 if lock acquired */ + int count; /* nesting count, see qspinlock.c */ }; #ifndef arch_mcs_spin_lock_contended diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index f956ede7f90d..6c5da483966b 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -1,5 +1,5 @@ /* - * Queue read/write lock + * Queued read/write locks * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -22,6 +22,26 @@ #include <linux/hardirq.h> #include <asm/qrwlock.h> +/* + * This internal data structure is used for optimizing access to some of + * the subfields within the atomic_t cnts. + */ +struct __qrwlock { + union { + atomic_t cnts; + struct { +#ifdef __LITTLE_ENDIAN + u8 wmode; /* Writer mode */ + u8 rcnts[3]; /* Reader counts */ +#else + u8 rcnts[3]; /* Reader counts */ + u8 wmode; /* Writer mode */ +#endif + }; + }; + arch_spinlock_t lock; +}; + /** * rspin_until_writer_unlock - inc reader count & spin until writer is gone * @lock : Pointer to queue rwlock structure @@ -107,10 +127,10 @@ void queue_write_lock_slowpath(struct qrwlock *lock) * or wait for a previous writer to go away. */ for (;;) { - cnts = atomic_read(&lock->cnts); - if (!(cnts & _QW_WMASK) && - (atomic_cmpxchg(&lock->cnts, cnts, - cnts | _QW_WAITING) == cnts)) + struct __qrwlock *l = (struct __qrwlock *)lock; + + if (!READ_ONCE(l->wmode) && + (cmpxchg(&l->wmode, 0, _QW_WAITING) == 0)) break; cpu_relax_lowlatency(); diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c new file mode 100644 index 000000000000..38c49202d532 --- /dev/null +++ b/kernel/locking/qspinlock.c @@ -0,0 +1,473 @@ +/* + * Queued spinlock + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. + * (C) Copyright 2013-2014 Red Hat, Inc. + * (C) Copyright 2015 Intel Corp. + * + * Authors: Waiman Long <waiman.long@hp.com> + * Peter Zijlstra <peterz@infradead.org> + */ + +#ifndef _GEN_PV_LOCK_SLOWPATH + +#include <linux/smp.h> +#include <linux/bug.h> +#include <linux/cpumask.h> +#include <linux/percpu.h> +#include <linux/hardirq.h> +#include <linux/mutex.h> +#include <asm/byteorder.h> +#include <asm/qspinlock.h> + +/* + * The basic principle of a queue-based spinlock can best be understood + * by studying a classic queue-based spinlock implementation called the + * MCS lock. The paper below provides a good description for this kind + * of lock. + * + * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf + * + * This queued spinlock implementation is based on the MCS lock, however to make + * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing + * API, we must modify it somehow. + * + * In particular; where the traditional MCS lock consists of a tail pointer + * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to + * unlock the next pending (next->locked), we compress both these: {tail, + * next->locked} into a single u32 value. + * + * Since a spinlock disables recursion of its own context and there is a limit + * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there + * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now + * we can encode the tail by combining the 2-bit nesting level with the cpu + * number. With one byte for the lock value and 3 bytes for the tail, only a + * 32-bit word is now needed. Even though we only need 1 bit for the lock, + * we extend it to a full byte to achieve better performance for architectures + * that support atomic byte write. + * + * We also change the first spinner to spin on the lock bit instead of its + * node; whereby avoiding the need to carry a node from lock to unlock, and + * preserving existing lock API. This also makes the unlock code simpler and + * faster. + * + * N.B. The current implementation only supports architectures that allow + * atomic operations on smaller 8-bit and 16-bit data types. + * + */ + +#include "mcs_spinlock.h" + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +#define MAX_NODES 8 +#else +#define MAX_NODES 4 +#endif + +/* + * Per-CPU queue node structures; we can never have more than 4 nested + * contexts: task, softirq, hardirq, nmi. + * + * Exactly fits one 64-byte cacheline on a 64-bit architecture. + * + * PV doubles the storage and uses the second cacheline for PV state. + */ +static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]); + +/* + * We must be able to distinguish between no-tail and the tail at 0:0, + * therefore increment the cpu number by one. + */ + +static inline u32 encode_tail(int cpu, int idx) +{ + u32 tail; + +#ifdef CONFIG_DEBUG_SPINLOCK + BUG_ON(idx > 3); +#endif + tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; + tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ + + return tail; +} + +static inline struct mcs_spinlock *decode_tail(u32 tail) +{ + int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; + int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; + + return per_cpu_ptr(&mcs_nodes[idx], cpu); +} + +#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) + +/* + * By using the whole 2nd least significant byte for the pending bit, we + * can allow better optimization of the lock acquisition for the pending + * bit holder. + * + * This internal structure is also used by the set_locked function which + * is not restricted to _Q_PENDING_BITS == 8. + */ +struct __qspinlock { + union { + atomic_t val; +#ifdef __LITTLE_ENDIAN + struct { + u8 locked; + u8 pending; + }; + struct { + u16 locked_pending; + u16 tail; + }; +#else + struct { + u16 tail; + u16 locked_pending; + }; + struct { + u8 reserved[2]; + u8 pending; + u8 locked; + }; +#endif + }; +}; + +#if _Q_PENDING_BITS == 8 +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + * + * Lock stealing is not allowed if this function is used. + */ +static __always_inline void clear_pending_set_locked(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL); +} + +/* + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail) + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) +{ + struct __qspinlock *l = (void *)lock; + + return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; +} + +#else /* _Q_PENDING_BITS == 8 */ + +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + */ +static __always_inline void clear_pending_set_locked(struct qspinlock *lock) +{ + atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); +} + +/** + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail) + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) +{ + u32 old, new, val = atomic_read(&lock->val); + + for (;;) { + new = (val & _Q_LOCKED_PENDING_MASK) | tail; + old = atomic_cmpxchg(&lock->val, val, new); + if (old == val) + break; + + val = old; + } + return old; +} +#endif /* _Q_PENDING_BITS == 8 */ + +/** + * set_locked - Set the lock bit and own the lock + * @lock: Pointer to queued spinlock structure + * + * *,*,0 -> *,0,1 + */ +static __always_inline void set_locked(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + WRITE_ONCE(l->locked, _Q_LOCKED_VAL); +} + + +/* + * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for + * all the PV callbacks. + */ + +static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } +static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } +static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { } + +static __always_inline void __pv_wait_head(struct qspinlock *lock, + struct mcs_spinlock *node) { } + +#define pv_enabled() false + +#define pv_init_node __pv_init_node +#define pv_wait_node __pv_wait_node +#define pv_kick_node __pv_kick_node +#define pv_wait_head __pv_wait_head + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath +#endif + +#endif /* _GEN_PV_LOCK_SLOWPATH */ + +/** + * queued_spin_lock_slowpath - acquire the queued spinlock + * @lock: Pointer to queued spinlock structure + * @val: Current value of the queued spinlock 32-bit word + * + * (queue tail, pending bit, lock value) + * + * fast : slow : unlock + * : : + * uncontended (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0) + * : | ^--------.------. / : + * : v \ \ | : + * pending : (0,1,1) +--> (0,1,0) \ | : + * : | ^--' | | : + * : v | | : + * uncontended : (n,x,y) +--> (n,0,0) --' | : + * queue : | ^--' | : + * : v | : + * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' : + * queue : ^--' : + */ +void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) +{ + struct mcs_spinlock *prev, *next, *node; + u32 new, old, tail; + int idx; + + BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); + + if (pv_enabled()) + goto queue; + + if (virt_queued_spin_lock(lock)) + return; + + /* + * wait for in-progress pending->locked hand-overs + * + * 0,1,0 -> 0,0,1 + */ + if (val == _Q_PENDING_VAL) { + while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL) + cpu_relax(); + } + + /* + * trylock || pending + * + * 0,0,0 -> 0,0,1 ; trylock + * 0,0,1 -> 0,1,1 ; pending + */ + for (;;) { + /* + * If we observe any contention; queue. + */ + if (val & ~_Q_LOCKED_MASK) + goto queue; + + new = _Q_LOCKED_VAL; + if (val == new) + new |= _Q_PENDING_VAL; + + old = atomic_cmpxchg(&lock->val, val, new); + if (old == val) + break; + + val = old; + } + + /* + * we won the trylock + */ + if (new == _Q_LOCKED_VAL) + return; + + /* + * we're pending, wait for the owner to go away. + * + * *,1,1 -> *,1,0 + * + * this wait loop must be a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because not all clear_pending_set_locked() + * implementations imply full barriers. + */ + while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK) + cpu_relax(); + + /* + * take ownership and clear the pending bit. + * + * *,1,0 -> *,0,1 + */ + clear_pending_set_locked(lock); + return; + + /* + * End of pending bit optimistic spinning and beginning of MCS + * queuing. + */ +queue: + node = this_cpu_ptr(&mcs_nodes[0]); + idx = node->count++; + tail = encode_tail(smp_processor_id(), idx); + + node += idx; + node->locked = 0; + node->next = NULL; + pv_init_node(node); + + /* + * We touched a (possibly) cold cacheline in the per-cpu queue node; + * attempt the trylock once more in the hope someone let go while we + * weren't watching. + */ + if (queued_spin_trylock(lock)) + goto release; + + /* + * We have already touched the queueing cacheline; don't bother with + * pending stuff. + * + * p,*,* -> n,*,* + */ + old = xchg_tail(lock, tail); + + /* + * if there was a previous node; link it and wait until reaching the + * head of the waitqueue. + */ + if (old & _Q_TAIL_MASK) { + prev = decode_tail(old); + WRITE_ONCE(prev->next, node); + + pv_wait_node(node); + arch_mcs_spin_lock_contended(&node->locked); + } + + /* + * we're at the head of the waitqueue, wait for the owner & pending to + * go away. + * + * *,x,y -> *,0,0 + * + * this wait loop must use a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because the set_locked() function below + * does not imply a full barrier. + * + */ + pv_wait_head(lock, node); + while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK) + cpu_relax(); + + /* + * claim the lock: + * + * n,0,0 -> 0,0,1 : lock, uncontended + * *,0,0 -> *,0,1 : lock, contended + * + * If the queue head is the only one in the queue (lock value == tail), + * clear the tail code and grab the lock. Otherwise, we only need + * to grab the lock. + */ + for (;;) { + if (val != tail) { + set_locked(lock); + break; + } + old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL); + if (old == val) + goto release; /* No contention */ + + val = old; + } + + /* + * contended path; wait for next, release. + */ + while (!(next = READ_ONCE(node->next))) + cpu_relax(); + + arch_mcs_spin_unlock_contended(&next->locked); + pv_kick_node(next); + +release: + /* + * release the node + */ + this_cpu_dec(mcs_nodes[0].count); +} +EXPORT_SYMBOL(queued_spin_lock_slowpath); + +/* + * Generate the paravirt code for queued_spin_unlock_slowpath(). + */ +#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS) +#define _GEN_PV_LOCK_SLOWPATH + +#undef pv_enabled +#define pv_enabled() true + +#undef pv_init_node +#undef pv_wait_node +#undef pv_kick_node +#undef pv_wait_head + +#undef queued_spin_lock_slowpath +#define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath + +#include "qspinlock_paravirt.h" +#include "qspinlock.c" + +#endif diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h new file mode 100644 index 000000000000..04ab18151cc8 --- /dev/null +++ b/kernel/locking/qspinlock_paravirt.h @@ -0,0 +1,325 @@ +#ifndef _GEN_PV_LOCK_SLOWPATH +#error "do not include this file" +#endif + +#include <linux/hash.h> +#include <linux/bootmem.h> + +/* + * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead + * of spinning them. + * + * This relies on the architecture to provide two paravirt hypercalls: + * + * pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val + * pv_kick(cpu) -- wakes a suspended vcpu + * + * Using these we implement __pv_queued_spin_lock_slowpath() and + * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and + * native_queued_spin_unlock(). + */ + +#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) + +enum vcpu_state { + vcpu_running = 0, + vcpu_halted, +}; + +struct pv_node { + struct mcs_spinlock mcs; + struct mcs_spinlock __res[3]; + + int cpu; + u8 state; +}; + +/* + * Lock and MCS node addresses hash table for fast lookup + * + * Hashing is done on a per-cacheline basis to minimize the need to access + * more than one cacheline. + * + * Dynamically allocate a hash table big enough to hold at least 4X the + * number of possible cpus in the system. Allocation is done on page + * granularity. So the minimum number of hash buckets should be at least + * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page. + * + * Since we should not be holding locks from NMI context (very rare indeed) the + * max load factor is 0.75, which is around the point where open addressing + * breaks down. + * + */ +struct pv_hash_entry { + struct qspinlock *lock; + struct pv_node *node; +}; + +#define PV_HE_PER_LINE (SMP_CACHE_BYTES / sizeof(struct pv_hash_entry)) +#define PV_HE_MIN (PAGE_SIZE / sizeof(struct pv_hash_entry)) + +static struct pv_hash_entry *pv_lock_hash; +static unsigned int pv_lock_hash_bits __read_mostly; + +/* + * Allocate memory for the PV qspinlock hash buckets + * + * This function should be called from the paravirt spinlock initialization + * routine. + */ +void __init __pv_init_lock_hash(void) +{ + int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE); + + if (pv_hash_size < PV_HE_MIN) + pv_hash_size = PV_HE_MIN; + + /* + * Allocate space from bootmem which should be page-size aligned + * and hence cacheline aligned. + */ + pv_lock_hash = alloc_large_system_hash("PV qspinlock", + sizeof(struct pv_hash_entry), + pv_hash_size, 0, HASH_EARLY, + &pv_lock_hash_bits, NULL, + pv_hash_size, pv_hash_size); +} + +#define for_each_hash_entry(he, offset, hash) \ + for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0; \ + offset < (1 << pv_lock_hash_bits); \ + offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)]) + +static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node) +{ + unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits); + struct pv_hash_entry *he; + + for_each_hash_entry(he, offset, hash) { + if (!cmpxchg(&he->lock, NULL, lock)) { + WRITE_ONCE(he->node, node); + return &he->lock; + } + } + /* + * Hard assume there is a free entry for us. + * + * This is guaranteed by ensuring every blocked lock only ever consumes + * a single entry, and since we only have 4 nesting levels per CPU + * and allocated 4*nr_possible_cpus(), this must be so. + * + * The single entry is guaranteed by having the lock owner unhash + * before it releases. + */ + BUG(); +} + +static struct pv_node *pv_unhash(struct qspinlock *lock) +{ + unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits); + struct pv_hash_entry *he; + struct pv_node *node; + + for_each_hash_entry(he, offset, hash) { + if (READ_ONCE(he->lock) == lock) { + node = READ_ONCE(he->node); + WRITE_ONCE(he->lock, NULL); + return node; + } + } + /* + * Hard assume we'll find an entry. + * + * This guarantees a limited lookup time and is itself guaranteed by + * having the lock owner do the unhash -- IFF the unlock sees the + * SLOW flag, there MUST be a hash entry. + */ + BUG(); +} + +/* + * Initialize the PV part of the mcs_spinlock node. + */ +static void pv_init_node(struct mcs_spinlock *node) +{ + struct pv_node *pn = (struct pv_node *)node; + + BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock)); + + pn->cpu = smp_processor_id(); + pn->state = vcpu_running; +} + +/* + * Wait for node->locked to become true, halt the vcpu after a short spin. + * pv_kick_node() is used to wake the vcpu again. + */ +static void pv_wait_node(struct mcs_spinlock *node) +{ + struct pv_node *pn = (struct pv_node *)node; + int loop; + + for (;;) { + for (loop = SPIN_THRESHOLD; loop; loop--) { + if (READ_ONCE(node->locked)) + return; + cpu_relax(); + } + + /* + * Order pn->state vs pn->locked thusly: + * + * [S] pn->state = vcpu_halted [S] next->locked = 1 + * MB MB + * [L] pn->locked [RmW] pn->state = vcpu_running + * + * Matches the xchg() from pv_kick_node(). + */ + smp_store_mb(pn->state, vcpu_halted); + + if (!READ_ONCE(node->locked)) + pv_wait(&pn->state, vcpu_halted); + + /* + * Reset the vCPU state to avoid unncessary CPU kicking + */ + WRITE_ONCE(pn->state, vcpu_running); + + /* + * If the locked flag is still not set after wakeup, it is a + * spurious wakeup and the vCPU should wait again. However, + * there is a pretty high overhead for CPU halting and kicking. + * So it is better to spin for a while in the hope that the + * MCS lock will be released soon. + */ + } + /* + * By now our node->locked should be 1 and our caller will not actually + * spin-wait for it. We do however rely on our caller to do a + * load-acquire for us. + */ +} + +/* + * Called after setting next->locked = 1, used to wake those stuck in + * pv_wait_node(). + */ +static void pv_kick_node(struct mcs_spinlock *node) +{ + struct pv_node *pn = (struct pv_node *)node; + + /* + * Note that because node->locked is already set, this actual + * mcs_spinlock entry could be re-used already. + * + * This should be fine however, kicking people for no reason is + * harmless. + * + * See the comment in pv_wait_node(). + */ + if (xchg(&pn->state, vcpu_running) == vcpu_halted) + pv_kick(pn->cpu); +} + +/* + * Wait for l->locked to become clear; halt the vcpu after a short spin. + * __pv_queued_spin_unlock() will wake us. + */ +static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) +{ + struct pv_node *pn = (struct pv_node *)node; + struct __qspinlock *l = (void *)lock; + struct qspinlock **lp = NULL; + int loop; + + for (;;) { + for (loop = SPIN_THRESHOLD; loop; loop--) { + if (!READ_ONCE(l->locked)) + return; + cpu_relax(); + } + + WRITE_ONCE(pn->state, vcpu_halted); + if (!lp) { /* ONCE */ + lp = pv_hash(lock, pn); + /* + * lp must be set before setting _Q_SLOW_VAL + * + * [S] lp = lock [RmW] l = l->locked = 0 + * MB MB + * [S] l->locked = _Q_SLOW_VAL [L] lp + * + * Matches the cmpxchg() in __pv_queued_spin_unlock(). + */ + if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { + /* + * The lock is free and _Q_SLOW_VAL has never + * been set. Therefore we need to unhash before + * getting the lock. + */ + WRITE_ONCE(*lp, NULL); + return; + } + } + pv_wait(&l->locked, _Q_SLOW_VAL); + + /* + * The unlocker should have freed the lock before kicking the + * CPU. So if the lock is still not free, it is a spurious + * wakeup and so the vCPU should wait again after spinning for + * a while. + */ + } + + /* + * Lock is unlocked now; the caller will acquire it without waiting. + * As with pv_wait_node() we rely on the caller to do a load-acquire + * for us. + */ +} + +/* + * PV version of the unlock function to be used in stead of + * queued_spin_unlock(). + */ +__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + struct pv_node *node; + + /* + * We must not unlock if SLOW, because in that case we must first + * unhash. Otherwise it would be possible to have multiple @lock + * entries, which would be BAD. + */ + if (likely(cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) == _Q_LOCKED_VAL)) + return; + + /* + * Since the above failed to release, this must be the SLOW path. + * Therefore start by looking up the blocked node and unhashing it. + */ + node = pv_unhash(lock); + + /* + * Now that we have a reference to the (likely) blocked pv_node, + * release the lock. + */ + smp_store_release(&l->locked, 0); + + /* + * At this point the memory pointed at by lock can be freed/reused, + * however we can still use the pv_node to kick the CPU. + */ + if (READ_ONCE(node->state) == vcpu_halted) + pv_kick(node->cpu); +} +/* + * Include the architecture specific callee-save thunk of the + * __pv_queued_spin_unlock(). This thunk is put together with + * __pv_queued_spin_unlock() near the top of the file to make sure + * that the callee-save thunk and the real unlock function are close + * to each other sharing consecutive instruction cachelines. + */ +#include <asm/qspinlock_paravirt.h> + diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index b025295f4966..30ec5b46cd8c 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -70,10 +70,10 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) } /* - * We can speed up the acquire/release, if the architecture - * supports cmpxchg and if there's no debugging state to be set up + * We can speed up the acquire/release, if there's no debugging state to be + * set up. */ -#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) +#ifndef CONFIG_DEBUG_RT_MUTEXES # define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) { @@ -1443,10 +1443,17 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); * * @lock: the rt_mutex to be locked * + * This function can only be called in thread context. It's safe to + * call it from atomic regions, but not from hard interrupt or soft + * interrupt context. + * * Returns 1 on success and 0 on contention */ int __sched rt_mutex_trylock(struct rt_mutex *lock) { + if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq())) + return 0; + return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); } EXPORT_SYMBOL_GPL(rt_mutex_trylock); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 3417d0172a5d..0f189714e457 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -409,11 +409,24 @@ done: return taken; } +/* + * Return true if the rwsem has active spinner + */ +static inline bool rwsem_has_spinner(struct rw_semaphore *sem) +{ + return osq_is_locked(&sem->osq); +} + #else static bool rwsem_optimistic_spin(struct rw_semaphore *sem) { return false; } + +static inline bool rwsem_has_spinner(struct rw_semaphore *sem) +{ + return false; +} #endif /* @@ -496,7 +509,38 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; + /* + * If a spinner is present, it is not necessary to do the wakeup. + * Try to do wakeup only if the trylock succeeds to minimize + * spinlock contention which may introduce too much delay in the + * unlock operation. + * + * spinning writer up_write/up_read caller + * --------------- ----------------------- + * [S] osq_unlock() [L] osq + * MB RMB + * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock) + * + * Here, it is important to make sure that there won't be a missed + * wakeup while the rwsem is free and the only spinning writer goes + * to sleep without taking the rwsem. Even when the spinning writer + * is just going to break out of the waiting loop, it will still do + * a trylock in rwsem_down_write_failed() before sleeping. IOW, if + * rwsem_has_spinner() is true, it will guarantee at least one + * trylock attempt on the rwsem later on. + */ + if (rwsem_has_spinner(sem)) { + /* + * The smp_rmb() here is to make sure that the spinner + * state is consulted before reading the wait_lock. + */ + smp_rmb(); + if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags)) + return sem; + goto locked; + } raw_spin_lock_irqsave(&sem->wait_lock, flags); +locked: /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 852143a79f36..9bc82329eaad 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -341,7 +341,7 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss * an event. */ - set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ + smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ return timeout; } @@ -354,7 +354,7 @@ int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) * doesn't imply write barrier and the users expects write * barrier semantics on wakeup functions. The following * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() - * and is paired with set_mb() in wait_woken(). + * and is paired with smp_store_mb() in wait_woken(). */ smp_wmb(); /* C */ wait->flags |= WQ_FLAG_WOKEN; |