Merge branch 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull locking updates from Ingo Molnar:
 "The main changes in this cycle were:

   - Implement wraparound-safe refcount_t and kref_t types based on
     generic atomic primitives (Peter Zijlstra)

   - Improve and fix the ww_mutex code (Nicolai Hähnle)

   - Add self-tests to the ww_mutex code (Chris Wilson)

   - Optimize percpu-rwsems with the 'rcuwait' mechanism (Davidlohr
     Bueso)

   - Micro-optimize the current-task logic all around the core kernel
     (Davidlohr Bueso)

   - Tidy up after recent optimizations: remove stale code and APIs,
     clean up the code (Waiman Long)

   - ... plus misc fixes, updates and cleanups"

* 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (50 commits)
  fork: Fix task_struct alignment
  locking/spinlock/debug: Remove spinlock lockup detection code
  lockdep: Fix incorrect condition to print bug msgs for MAX_LOCKDEP_CHAIN_HLOCKS
  lkdtm: Convert to refcount_t testing
  kref: Implement 'struct kref' using refcount_t
  refcount_t: Introduce a special purpose refcount type
  sched/wake_q: Clarify queue reinit comment
  sched/wait, rcuwait: Fix typo in comment
  locking/mutex: Fix lockdep_assert_held() fail
  locking/rtmutex: Flip unlikely() branch to likely() in __rt_mutex_slowlock()
  locking/rwsem: Reinit wake_q after use
  locking/rwsem: Remove unnecessary atomic_long_t casts
  jump_labels: Move header guard #endif down where it belongs
  locking/atomic, kref: Implement kref_put_lock()
  locking/ww_mutex: Turn off __must_check for now
  locking/atomic, kref: Avoid more abuse
  locking/atomic, kref: Use kref_get_unless_zero() more
  locking/atomic, kref: Kill kref_sub()
  locking/atomic, kref: Add kref_read()
  locking/atomic, kref: Add KREF_INIT()
  ...
Linus Torvalds 2017-02-20 13:23:30 -08:00
commit 42e1b14b6e
116 changed files with 1872 additions and 769 deletions
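
Most of the diff below is a mechanical conversion from open-coded kref internals (atomic_read of kref.refcount, ATOMIC_INIT initializers, kref_sub) to the new accessors added in this series. A minimal before/after sketch of that pattern; struct foo and foo_release() are made-up names for illustration, only the kref calls come from the updated <linux/kref.h>:

#include <linux/kref.h>
#include <linux/slab.h>

struct foo {
	struct kref kref;	/* now backed by refcount_t rather than atomic_t */
};

static void foo_release(struct kref *kref)
{
	kfree(container_of(kref, struct foo, kref));
}

static void foo_example(struct foo *f)
{
	kref_init(&f->kref);				/* or .kref = KREF_INIT(1) in an initializer */

	pr_debug("refs: %u\n", kref_read(&f->kref));	/* was: atomic_read(&f->kref.refcount) */

	kref_get(&f->kref);
	kref_put(&f->kref, foo_release);		/* kref_sub() is gone; put once per reference */
	kref_put(&f->kref, foo_release);		/* final put invokes foo_release() */
}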

View File

@ -309,11 +309,15 @@ Design:
normal mutex locks, which are far more common. As such there is only a small
increase in code size if wait/wound mutexes are not used.
We maintain the following invariants for the wait list:
(1) Waiters with an acquire context are sorted by stamp order; waiters
without an acquire context are interspersed in FIFO order.
(2) Among waiters with contexts, only the first one can have other locks
acquired already (ctx->acquired > 0). Note that this waiter may come
after other waiters without contexts in the list.
In general, not much contention is expected. The locks are typically used to
serialize access to resources for devices. The only way to make wakeups
smarter would be at the cost of adding a field to struct mutex_waiter. This
would add overhead to all cases where normal mutexes are used, and
ww_mutexes are generally less performance sensitive.
serialize access to resources for devices.
Lockdep:
Special care has been taken to warn for as many cases of api abuse
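
For reference, the acquire/backoff pattern this design text describes, sketched against the <linux/ww_mutex.h> API; the demo_buf structure and demo_lock_pair() function are invented for illustration:

#include <linux/kernel.h>
#include <linux/ww_mutex.h>

static DEFINE_WW_CLASS(demo_ww_class);

struct demo_buf {
	struct ww_mutex lock;
	/* ... payload ... */
};

static void demo_lock_pair(struct demo_buf *a, struct demo_buf *b)
{
	struct ww_acquire_ctx ctx;

	ww_acquire_init(&ctx, &demo_ww_class);

	/* The first lock of a fresh context blocks rather than returning -EDEADLK. */
	ww_mutex_lock(&a->lock, &ctx);

	while (ww_mutex_lock(&b->lock, &ctx) == -EDEADLK) {
		/* Wounded by an older context: back off, wait for the
		 * contended lock, then retry the remaining one. */
		ww_mutex_unlock(&a->lock);
		ww_mutex_lock_slow(&b->lock, &ctx);
		swap(a, b);	/* 'a' now points at the lock we hold */
	}
	ww_acquire_done(&ctx);

	/* ... both objects locked, do the work ... */

	ww_mutex_unlock(&a->lock);
	ww_mutex_unlock(&b->lock);
	ww_acquire_fini(&ctx);
}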

View File

@ -76,7 +76,7 @@ static ssize_t rng_dev_read (struct file *filp, char __user *buf, size_t size,
add_sigio_fd(random_fd);
add_wait_queue(&host_read_wait, &wait);
set_task_state(current, TASK_INTERRUPTIBLE);
set_current_state(TASK_INTERRUPTIBLE);
schedule();
remove_wait_queue(&host_read_wait, &wait);

View File

@ -23,9 +23,6 @@
/* How long a lock should spin before we consider blocking */
#define SPIN_THRESHOLD (1 << 15)
extern struct static_key paravirt_ticketlocks_enabled;
static __always_inline bool static_key_false(struct static_key *key);
#include <asm/qspinlock.h>
/*

View File

@ -32,8 +32,7 @@ static void bug_at(unsigned char *ip, int line)
* Something went wrong. Crash the box, as something could be
* corrupting the kernel.
*/
pr_warning("Unexpected op at %pS [%p] (%02x %02x %02x %02x %02x) %s:%d\n",
ip, ip, ip[0], ip[1], ip[2], ip[3], ip[4], __FILE__, line);
pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph) %d\n", ip, ip, ip, line);
BUG();
}

View File

@ -620,18 +620,4 @@ void __init kvm_spinlock_init(void)
}
}
static __init int kvm_spinlock_init_jump(void)
{
if (!kvm_para_available())
return 0;
if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
return 0;
static_key_slow_inc(&paravirt_ticketlocks_enabled);
printk(KERN_INFO "KVM setup paravirtual spinlock\n");
return 0;
}
early_initcall(kvm_spinlock_init_jump);
#endif /* CONFIG_PARAVIRT_SPINLOCKS */

View File

@ -42,6 +42,3 @@ struct pv_lock_ops pv_lock_ops = {
#endif /* SMP */
};
EXPORT_SYMBOL(pv_lock_ops);
struct static_key paravirt_ticketlocks_enabled = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL(paravirt_ticketlocks_enabled);

View File

@ -141,25 +141,6 @@ void __init xen_init_spinlocks(void)
pv_lock_ops.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen);
}
/*
* While the jump_label init code needs to happend _after_ the jump labels are
* enabled and before SMP is started. Hence we use pre-SMP initcall level
* init. We cannot do it in xen_init_spinlocks as that is done before
* jump labels are activated.
*/
static __init int xen_init_spinlocks_jump(void)
{
if (!xen_pvspin)
return 0;
if (!xen_domain())
return 0;
static_key_slow_inc(&paravirt_ticketlocks_enabled);
return 0;
}
early_initcall(xen_init_spinlocks_jump);
static __init int xen_parse_nopvspin(char *arg)
{
xen_pvspin = false;

View File

@ -1070,7 +1070,7 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
.done = 0,
.flags = flags,
.error = 0,
.kref = { ATOMIC_INIT(2) },
.kref = KREF_INIT(2),
};
if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in drbd_bm_aio_ctx_destroy() */

View File

@ -2948,7 +2948,6 @@ void drbd_delete_device(struct drbd_device *device)
struct drbd_resource *resource = device->resource;
struct drbd_connection *connection;
struct drbd_peer_device *peer_device;
int refs = 3;
/* move to free_peer_device() */
for_each_peer_device(peer_device, device)
@ -2956,13 +2955,15 @@ void drbd_delete_device(struct drbd_device *device)
drbd_debugfs_device_cleanup(device);
for_each_connection(connection, resource) {
idr_remove(&connection->peer_devices, device->vnr);
refs++;
kref_put(&device->kref, drbd_destroy_device);
}
idr_remove(&resource->devices, device->vnr);
kref_put(&device->kref, drbd_destroy_device);
idr_remove(&drbd_devices, device_to_minor(device));
kref_put(&device->kref, drbd_destroy_device);
del_gendisk(device->vdisk);
synchronize_rcu();
kref_sub(&device->kref, refs, drbd_destroy_device);
kref_put(&device->kref, drbd_destroy_device);
}
static int __init drbd_init(void)

View File

@ -421,7 +421,6 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
struct drbd_peer_device *peer_device = first_peer_device(device);
unsigned s = req->rq_state;
int c_put = 0;
int k_put = 0;
if (drbd_suspended(device) && !((s | clear) & RQ_COMPLETION_SUSP))
set |= RQ_COMPLETION_SUSP;
@ -437,6 +436,8 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
/* intent: get references */
kref_get(&req->kref);
if (!(s & RQ_LOCAL_PENDING) && (set & RQ_LOCAL_PENDING))
atomic_inc(&req->completion_ref);
@ -473,15 +474,12 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
if (!(s & RQ_LOCAL_ABORTED) && (set & RQ_LOCAL_ABORTED)) {
D_ASSERT(device, req->rq_state & RQ_LOCAL_PENDING);
/* local completion may still come in later,
* we need to keep the req object around. */
kref_get(&req->kref);
++c_put;
}
if ((s & RQ_LOCAL_PENDING) && (clear & RQ_LOCAL_PENDING)) {
if (req->rq_state & RQ_LOCAL_ABORTED)
++k_put;
kref_put(&req->kref, drbd_req_destroy);
else
++c_put;
list_del_init(&req->req_pending_local);
@ -503,7 +501,7 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
if (s & RQ_NET_SENT)
atomic_sub(req->i.size >> 9, &device->ap_in_flight);
if (s & RQ_EXP_BARR_ACK)
++k_put;
kref_put(&req->kref, drbd_req_destroy);
req->net_done_jif = jiffies;
/* in ahead/behind mode, or just in case,
@ -516,25 +514,16 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
/* potentially complete and destroy */
if (k_put || c_put) {
/* Completion does it's own kref_put. If we are going to
* kref_sub below, we need req to be still around then. */
int at_least = k_put + !!c_put;
int refcount = atomic_read(&req->kref.refcount);
if (refcount < at_least)
drbd_err(device,
"mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n",
s, req->rq_state, refcount, at_least);
}
/* If we made progress, retry conflicting peer requests, if any. */
if (req->i.waiting)
wake_up(&device->misc_wait);
if (c_put)
k_put += drbd_req_put_completion_ref(req, m, c_put);
if (k_put)
kref_sub(&req->kref, k_put, drbd_req_destroy);
if (c_put) {
if (drbd_req_put_completion_ref(req, m, c_put))
kref_put(&req->kref, drbd_req_destroy);
} else {
kref_put(&req->kref, drbd_req_destroy);
}
}
static void drbd_report_io_error(struct drbd_device *device, struct drbd_request *req)

View File

@ -1535,7 +1535,7 @@ static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
dout("%s: obj %p (was %d)\n", __func__, obj_request,
atomic_read(&obj_request->kref.refcount));
kref_read(&obj_request->kref));
kref_get(&obj_request->kref);
}
@ -1544,14 +1544,14 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
rbd_assert(obj_request != NULL);
dout("%s: obj %p (was %d)\n", __func__, obj_request,
atomic_read(&obj_request->kref.refcount));
kref_read(&obj_request->kref));
kref_put(&obj_request->kref, rbd_obj_request_destroy);
}
static void rbd_img_request_get(struct rbd_img_request *img_request)
{
dout("%s: img %p (was %d)\n", __func__, img_request,
atomic_read(&img_request->kref.refcount));
kref_read(&img_request->kref));
kref_get(&img_request->kref);
}
@ -1562,7 +1562,7 @@ static void rbd_img_request_put(struct rbd_img_request *img_request)
{
rbd_assert(img_request != NULL);
dout("%s: img %p (was %d)\n", __func__, img_request,
atomic_read(&img_request->kref.refcount));
kref_read(&img_request->kref));
if (img_request_child_test(img_request))
kref_put(&img_request->kref, rbd_parent_request_destroy);
else

View File

@ -770,7 +770,7 @@ static void virtblk_remove(struct virtio_device *vdev)
/* Stop all the virtqueues. */
vdev->config->reset(vdev);
refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount);
refc = kref_read(&disk_to_dev(vblk->disk)->kobj.kref);
put_disk(vblk->disk);
vdev->config->del_vqs(vdev);
kfree(vblk->vqs);

View File

@ -376,7 +376,7 @@ void drm_gem_cma_describe(struct drm_gem_cma_object *cma_obj,
off = drm_vma_node_start(&obj->vma_node);
seq_printf(m, "%2d (%2d) %08llx %pad %p %zu",
obj->name, obj->refcount.refcount.counter,
obj->name, kref_read(&obj->refcount),
off, &cma_obj->paddr, cma_obj->vaddr, obj->size);
seq_printf(m, "\n");

View File

@ -118,7 +118,7 @@ static int drm_gem_one_name_info(int id, void *ptr, void *data)
seq_printf(m, "%6d %8zd %7d %8d\n",
obj->name, obj->size,
obj->handle_count,
atomic_read(&obj->refcount.refcount));
kref_read(&obj->refcount));
return 0;
}

View File

@ -159,7 +159,7 @@ EXPORT_SYMBOL(drm_mode_object_find);
void drm_mode_object_unreference(struct drm_mode_object *obj)
{
if (obj->free_cb) {
DRM_DEBUG("OBJ ID: %d (%d)\n", obj->id, atomic_read(&obj->refcount.refcount));
DRM_DEBUG("OBJ ID: %d (%d)\n", obj->id, kref_read(&obj->refcount));
kref_put(&obj->refcount, obj->free_cb);
}
}
@ -176,7 +176,7 @@ EXPORT_SYMBOL(drm_mode_object_unreference);
void drm_mode_object_reference(struct drm_mode_object *obj)
{
if (obj->free_cb) {
DRM_DEBUG("OBJ ID: %d (%d)\n", obj->id, atomic_read(&obj->refcount.refcount));
DRM_DEBUG("OBJ ID: %d (%d)\n", obj->id, kref_read(&obj->refcount));
kref_get(&obj->refcount);
}
}

View File

@ -486,7 +486,7 @@ static void etnaviv_gem_describe(struct drm_gem_object *obj, struct seq_file *m)
seq_printf(m, "%08x: %c %2d (%2d) %08lx %p %zd\n",
etnaviv_obj->flags, is_active(etnaviv_obj) ? 'A' : 'I',
obj->name, obj->refcount.refcount.counter,
obj->name, kref_read(&obj->refcount),
off, etnaviv_obj->vaddr, obj->size);
rcu_read_lock();

View File

@ -256,7 +256,7 @@ extern void drm_gem_object_unreference_unlocked(struct drm_gem_object *);
static inline bool
i915_gem_object_is_dead(const struct drm_i915_gem_object *obj)
{
return atomic_read(&obj->base.refcount.refcount) == 0;
return kref_read(&obj->base.refcount) == 0;
}
static inline bool

View File

@ -642,7 +642,7 @@ void msm_gem_describe(struct drm_gem_object *obj, struct seq_file *m)
seq_printf(m, "%08x: %c %2d (%2d) %08llx %p\t",
msm_obj->flags, is_active(msm_obj) ? 'A' : 'I',
obj->name, obj->refcount.refcount.counter,
obj->name, kref_read(&obj->refcount),
off, msm_obj->vaddr);
for (id = 0; id < priv->num_aspaces; id++)

View File

@ -527,7 +527,7 @@ static bool nouveau_fence_no_signaling(struct dma_fence *f)
* caller should have a reference on the fence,
* else fence could get freed here
*/
WARN_ON(atomic_read(&fence->base.refcount.refcount) <= 1);
WARN_ON(kref_read(&fence->base.refcount) <= 1);
/*
* This needs uevents to work correctly, but dma_fence_add_callback relies on

View File

@ -1033,7 +1033,7 @@ void omap_gem_describe(struct drm_gem_object *obj, struct seq_file *m)
off = drm_vma_node_start(&obj->vma_node);
seq_printf(m, "%08x: %2d (%2d) %08llx %pad (%2d) %p %4d",
omap_obj->flags, obj->name, obj->refcount.refcount.counter,
omap_obj->flags, obj->name, kref_read(&obj->refcount),
off, &omap_obj->paddr, omap_obj->paddr_cnt,
omap_obj->vaddr, omap_obj->roll);

View File

@ -140,8 +140,8 @@ static void ttm_bo_release_list(struct kref *list_kref)
struct ttm_bo_device *bdev = bo->bdev;
size_t acc_size = bo->acc_size;
BUG_ON(atomic_read(&bo->list_kref.refcount));
BUG_ON(atomic_read(&bo->kref.refcount));
BUG_ON(kref_read(&bo->list_kref));
BUG_ON(kref_read(&bo->kref));
BUG_ON(atomic_read(&bo->cpu_writers));
BUG_ON(bo->mem.mm_node != NULL);
BUG_ON(!list_empty(&bo->lru));
@ -181,61 +181,46 @@ void ttm_bo_add_to_lru(struct ttm_buffer_object *bo)
}
EXPORT_SYMBOL(ttm_bo_add_to_lru);
int ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
static void ttm_bo_ref_bug(struct kref *list_kref)
{
BUG();
}
void ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
{
struct ttm_bo_device *bdev = bo->bdev;
int put_count = 0;
if (bdev->driver->lru_removal)
bdev->driver->lru_removal(bo);
if (!list_empty(&bo->swap)) {
list_del_init(&bo->swap);
++put_count;
kref_put(&bo->list_kref, ttm_bo_ref_bug);
}
if (!list_empty(&bo->lru)) {
list_del_init(&bo->lru);
++put_count;
kref_put(&bo->list_kref, ttm_bo_ref_bug);
}
return put_count;
}
static void ttm_bo_ref_bug(struct kref *list_kref)
{
BUG();
}
void ttm_bo_list_ref_sub(struct ttm_buffer_object *bo, int count,
bool never_free)
{
kref_sub(&bo->list_kref, count,
(never_free) ? ttm_bo_ref_bug : ttm_bo_release_list);
}
void ttm_bo_del_sub_from_lru(struct ttm_buffer_object *bo)
{
int put_count;
spin_lock(&bo->glob->lru_lock);
put_count = ttm_bo_del_from_lru(bo);
ttm_bo_del_from_lru(bo);
spin_unlock(&bo->glob->lru_lock);
ttm_bo_list_ref_sub(bo, put_count, true);
}
EXPORT_SYMBOL(ttm_bo_del_sub_from_lru);
void ttm_bo_move_to_lru_tail(struct ttm_buffer_object *bo)
{
struct ttm_bo_device *bdev = bo->bdev;
int put_count = 0;
lockdep_assert_held(&bo->resv->lock.base);
if (bdev->driver->lru_removal)
bdev->driver->lru_removal(bo);
put_count = ttm_bo_del_from_lru(bo);
ttm_bo_list_ref_sub(bo, put_count, true);
ttm_bo_del_from_lru(bo);
ttm_bo_add_to_lru(bo);
}
EXPORT_SYMBOL(ttm_bo_move_to_lru_tail);
@ -447,7 +432,6 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
{
struct ttm_bo_device *bdev = bo->bdev;
struct ttm_bo_global *glob = bo->glob;
int put_count;
int ret;
spin_lock(&glob->lru_lock);
@ -455,13 +439,10 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
if (!ret) {
if (!ttm_bo_wait(bo, false, true)) {
put_count = ttm_bo_del_from_lru(bo);
ttm_bo_del_from_lru(bo);
spin_unlock(&glob->lru_lock);
ttm_bo_cleanup_memtype_use(bo);
ttm_bo_list_ref_sub(bo, put_count, true);
return;
} else
ttm_bo_flush_all_fences(bo);
@ -504,7 +485,6 @@ static int ttm_bo_cleanup_refs_and_unlock(struct ttm_buffer_object *bo,
bool no_wait_gpu)
{
struct ttm_bo_global *glob = bo->glob;
int put_count;
int ret;
ret = ttm_bo_wait(bo, false, true);
@ -554,15 +534,13 @@ static int ttm_bo_cleanup_refs_and_unlock(struct ttm_buffer_object *bo,
return ret;
}
put_count = ttm_bo_del_from_lru(bo);
ttm_bo_del_from_lru(bo);
list_del_init(&bo->ddestroy);
++put_count;
kref_put(&bo->list_kref, ttm_bo_ref_bug);
spin_unlock(&glob->lru_lock);
ttm_bo_cleanup_memtype_use(bo);
ttm_bo_list_ref_sub(bo, put_count, true);
return 0;
}
@ -740,7 +718,7 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
struct ttm_bo_global *glob = bdev->glob;
struct ttm_mem_type_manager *man = &bdev->man[mem_type];
struct ttm_buffer_object *bo;
int ret = -EBUSY, put_count;
int ret = -EBUSY;
spin_lock(&glob->lru_lock);
list_for_each_entry(bo, &man->lru, lru) {
@ -771,13 +749,11 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
return ret;
}
put_count = ttm_bo_del_from_lru(bo);
ttm_bo_del_from_lru(bo);
spin_unlock(&glob->lru_lock);
BUG_ON(ret != 0);
ttm_bo_list_ref_sub(bo, put_count, true);
ret = ttm_bo_evict(bo, interruptible, no_wait_gpu);
ttm_bo_unreserve(bo);
@ -1669,7 +1645,6 @@ static int ttm_bo_swapout(struct ttm_mem_shrink *shrink)
container_of(shrink, struct ttm_bo_global, shrink);
struct ttm_buffer_object *bo;
int ret = -EBUSY;
int put_count;
uint32_t swap_placement = (TTM_PL_FLAG_CACHED | TTM_PL_FLAG_SYSTEM);
spin_lock(&glob->lru_lock);
@ -1692,11 +1667,9 @@ static int ttm_bo_swapout(struct ttm_mem_shrink *shrink)
return ret;
}
put_count = ttm_bo_del_from_lru(bo);
ttm_bo_del_from_lru(bo);
spin_unlock(&glob->lru_lock);
ttm_bo_list_ref_sub(bo, put_count, true);
/**
* Move to system cached
*/

View File

@ -48,9 +48,7 @@ static void ttm_eu_del_from_lru_locked(struct list_head *list)
list_for_each_entry(entry, list, head) {
struct ttm_buffer_object *bo = entry->bo;
unsigned put_count = ttm_bo_del_from_lru(bo);
ttm_bo_list_ref_sub(bo, put_count, true);
ttm_bo_del_from_lru(bo);
}
}

View File

@ -304,7 +304,7 @@ bool ttm_ref_object_exists(struct ttm_object_file *tfile,
* Verify that the ref->obj pointer was actually valid!
*/
rmb();
if (unlikely(atomic_read(&ref->kref.refcount) == 0))
if (unlikely(kref_read(&ref->kref) == 0))
goto out_false;
rcu_read_unlock();

View File

@ -55,14 +55,14 @@
#define put_ep(ep) { \
PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __func__, __LINE__, \
ep, atomic_read(&((ep)->kref.refcount))); \
WARN_ON(atomic_read(&((ep)->kref.refcount)) < 1); \
ep, kref_read(&((ep)->kref))); \
WARN_ON(kref_read(&((ep)->kref)) < 1); \
kref_put(&((ep)->kref), __free_ep); \
}
#define get_ep(ep) { \
PDBG("get_ep (via %s:%u) ep %p, refcnt %d\n", __func__, __LINE__, \
ep, atomic_read(&((ep)->kref.refcount))); \
ep, kref_read(&((ep)->kref))); \
kref_get(&((ep)->kref)); \
}

View File

@ -961,7 +961,7 @@ int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp,
case IWCH_QP_STATE_RTS:
switch (attrs->next_state) {
case IWCH_QP_STATE_CLOSING:
BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2);
BUG_ON(kref_read(&qhp->ep->com.kref) < 2);
qhp->attr.state = IWCH_QP_STATE_CLOSING;
if (!internal) {
abort=0;

View File

@ -672,14 +672,14 @@ enum c4iw_mmid_state {
#define c4iw_put_ep(ep) { \
PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __func__, __LINE__, \
ep, atomic_read(&((ep)->kref.refcount))); \
WARN_ON(atomic_read(&((ep)->kref.refcount)) < 1); \
ep, kref_read(&((ep)->kref))); \
WARN_ON(kref_read(&((ep)->kref)) < 1); \
kref_put(&((ep)->kref), _c4iw_free_ep); \
}
#define c4iw_get_ep(ep) { \
PDBG("get_ep (via %s:%u) ep %p, refcnt %d\n", __func__, __LINE__, \
ep, atomic_read(&((ep)->kref.refcount))); \
ep, kref_read(&((ep)->kref))); \
kref_get(&((ep)->kref)); \
}
void _c4iw_free_ep(struct kref *kref);

View File

@ -1580,7 +1580,7 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp,
case C4IW_QP_STATE_RTS:
switch (attrs->next_state) {
case C4IW_QP_STATE_CLOSING:
BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2);
BUG_ON(kref_read(&qhp->ep->com.kref) < 2);
t4_set_wq_in_error(&qhp->wq);
set_state(qhp, C4IW_QP_STATE_CLOSING);
ep = qhp->ep;

View File

@ -80,7 +80,7 @@ usnic_ib_show_config(struct device *device, struct device_attribute *attr,
left = PAGE_SIZE;
mutex_lock(&us_ibdev->usdev_lock);
if (atomic_read(&us_ibdev->vf_cnt.refcount) > 0) {
if (kref_read(&us_ibdev->vf_cnt) > 0) {
char *busname;
/*
@ -99,7 +99,7 @@ usnic_ib_show_config(struct device *device, struct device_attribute *attr,
PCI_FUNC(us_ibdev->pdev->devfn),
netdev_name(us_ibdev->netdev),
us_ibdev->ufdev->mac,
atomic_read(&us_ibdev->vf_cnt.refcount));
kref_read(&us_ibdev->vf_cnt));
UPDATE_PTR_LEFT(n, ptr, left);
for (res_type = USNIC_VNIC_RES_TYPE_EOL;
@ -147,7 +147,7 @@ usnic_ib_show_max_vf(struct device *device, struct device_attribute *attr,
us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
return scnprintf(buf, PAGE_SIZE, "%u\n",
atomic_read(&us_ibdev->vf_cnt.refcount));
kref_read(&us_ibdev->vf_cnt));
}
static ssize_t

View File

@ -291,11 +291,11 @@ int usnic_ib_query_device(struct ib_device *ibdev,
qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ],
us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]);
props->max_qp = qp_per_vf *
atomic_read(&us_ibdev->vf_cnt.refcount);
kref_read(&us_ibdev->vf_cnt);
props->device_cap_flags = IB_DEVICE_PORT_ACTIVE_EVENT |
IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
props->max_cq = us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ] *
atomic_read(&us_ibdev->vf_cnt.refcount);
kref_read(&us_ibdev->vf_cnt);
props->max_pd = USNIC_UIOM_MAX_PD_CNT;
props->max_mr = USNIC_UIOM_MAX_MR_CNT;
props->local_ca_ack_delay = 0;

View File

@ -794,7 +794,7 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c)
DECLARE_WAITQUEUE(wait, current);
add_wait_queue(&c->free_buffer_wait, &wait);
set_task_state(current, TASK_UNINTERRUPTIBLE);
set_current_state(TASK_UNINTERRUPTIBLE);
dm_bufio_unlock(c);
io_schedule();

View File

@ -1210,14 +1210,14 @@ continue_locked:
spin_unlock_irq(&cc->write_thread_wait.lock);
if (unlikely(kthread_should_stop())) {
set_task_state(current, TASK_RUNNING);
set_current_state(TASK_RUNNING);
remove_wait_queue(&cc->write_thread_wait, &wait);
break;
}
schedule();
set_task_state(current, TASK_RUNNING);
set_current_state(TASK_RUNNING);
spin_lock_irq(&cc->write_thread_wait.lock);
__remove_wait_queue(&cc->write_thread_wait, &wait);
goto continue_locked;

View File

@ -120,7 +120,7 @@ static int __check_holder(struct block_lock *lock)
static void __wait(struct waiter *w)
{
for (;;) {
set_task_state(current, TASK_UNINTERRUPTIBLE);
set_current_state(TASK_UNINTERRUPTIBLE);
if (!w->task)
break;
@ -128,7 +128,7 @@ static void __wait(struct waiter *w)
schedule();
}
set_task_state(current, TASK_RUNNING);
set_current_state(TASK_RUNNING);
}
static void __wake_waiter(struct waiter *w)

View File

@ -1396,7 +1396,7 @@ int genwqe_device_remove(struct genwqe_dev *cd)
* application which will decrease this reference from
* 1/unused to 0/illegal and not from 2/used 1/empty.
*/
rc = atomic_read(&cd->cdev_genwqe.kobj.kref.refcount);
rc = kref_read(&cd->cdev_genwqe.kobj.kref);
if (rc != 1) {
dev_err(&pci_dev->dev,
"[%s] err: cdev_genwqe...refcount=%d\n", __func__, rc);

View File

@ -19,8 +19,12 @@ void lkdtm_SOFTLOCKUP(void);
void lkdtm_HARDLOCKUP(void);
void lkdtm_SPINLOCKUP(void);
void lkdtm_HUNG_TASK(void);
void lkdtm_ATOMIC_UNDERFLOW(void);
void lkdtm_ATOMIC_OVERFLOW(void);
void lkdtm_REFCOUNT_SATURATE_INC(void);
void lkdtm_REFCOUNT_SATURATE_ADD(void);
void lkdtm_REFCOUNT_ZERO_DEC(void);
void lkdtm_REFCOUNT_ZERO_INC(void);
void lkdtm_REFCOUNT_ZERO_SUB(void);
void lkdtm_REFCOUNT_ZERO_ADD(void);
void lkdtm_CORRUPT_LIST_ADD(void);
void lkdtm_CORRUPT_LIST_DEL(void);

View File

@ -6,6 +6,7 @@
*/
#include "lkdtm.h"
#include <linux/list.h>
#include <linux/refcount.h>
#include <linux/sched.h>
struct lkdtm_list {
@ -129,28 +130,86 @@ void lkdtm_HUNG_TASK(void)
schedule();
}
void lkdtm_ATOMIC_UNDERFLOW(void)
void lkdtm_REFCOUNT_SATURATE_INC(void)
{
atomic_t under = ATOMIC_INIT(INT_MIN);
refcount_t over = REFCOUNT_INIT(UINT_MAX - 1);
pr_info("attempting good atomic increment\n");
atomic_inc(&under);
atomic_dec(&under);
pr_info("attempting good refcount decrement\n");
refcount_dec(&over);
refcount_inc(&over);
pr_info("attempting bad atomic underflow\n");
atomic_dec(&under);
pr_info("attempting bad refcount inc overflow\n");
refcount_inc(&over);
refcount_inc(&over);
if (refcount_read(&over) == UINT_MAX)
pr_err("Correctly stayed saturated, but no BUG?!\n");
else
pr_err("Fail: refcount wrapped\n");
}
void lkdtm_ATOMIC_OVERFLOW(void)
void lkdtm_REFCOUNT_SATURATE_ADD(void)
{
atomic_t over = ATOMIC_INIT(INT_MAX);
refcount_t over = REFCOUNT_INIT(UINT_MAX - 1);
pr_info("attempting good atomic decrement\n");
atomic_dec(&over);
atomic_inc(&over);
pr_info("attempting good refcount decrement\n");
refcount_dec(&over);
refcount_inc(&over);
pr_info("attempting bad atomic overflow\n");
atomic_inc(&over);
pr_info("attempting bad refcount add overflow\n");
refcount_add(2, &over);
if (refcount_read(&over) == UINT_MAX)
pr_err("Correctly stayed saturated, but no BUG?!\n");
else
pr_err("Fail: refcount wrapped\n");
}
void lkdtm_REFCOUNT_ZERO_DEC(void)
{
refcount_t zero = REFCOUNT_INIT(1);
pr_info("attempting bad refcount decrement to zero\n");
refcount_dec(&zero);
if (refcount_read(&zero) == 0)
pr_err("Stayed at zero, but no BUG?!\n");
else
pr_err("Fail: refcount went crazy\n");
}
void lkdtm_REFCOUNT_ZERO_SUB(void)
{
refcount_t zero = REFCOUNT_INIT(1);
pr_info("attempting bad refcount subtract past zero\n");
if (!refcount_sub_and_test(2, &zero))
pr_info("wrap attempt was noticed\n");
if (refcount_read(&zero) == 1)
pr_err("Correctly stayed above 0, but no BUG?!\n");
else
pr_err("Fail: refcount wrapped\n");
}
void lkdtm_REFCOUNT_ZERO_INC(void)
{
refcount_t zero = REFCOUNT_INIT(0);
pr_info("attempting bad refcount increment from zero\n");
refcount_inc(&zero);
if (refcount_read(&zero) == 0)
pr_err("Stayed at zero, but no BUG?!\n");
else
pr_err("Fail: refcount went past zero\n");
}
void lkdtm_REFCOUNT_ZERO_ADD(void)
{
refcount_t zero = REFCOUNT_INIT(0);
pr_info("attempting bad refcount addition from zero\n");
refcount_add(2, &zero);
if (refcount_read(&zero) == 0)
pr_err("Stayed at zero, but no BUG?!\n");
else
pr_err("Fail: refcount went past zero\n");
}
void lkdtm_CORRUPT_LIST_ADD(void)

View File

@ -220,8 +220,12 @@ struct crashtype crashtypes[] = {
CRASHTYPE(WRITE_RO),
CRASHTYPE(WRITE_RO_AFTER_INIT),
CRASHTYPE(WRITE_KERN),
CRASHTYPE(ATOMIC_UNDERFLOW),
CRASHTYPE(ATOMIC_OVERFLOW),
CRASHTYPE(REFCOUNT_SATURATE_INC),
CRASHTYPE(REFCOUNT_SATURATE_ADD),
CRASHTYPE(REFCOUNT_ZERO_DEC),
CRASHTYPE(REFCOUNT_ZERO_INC),
CRASHTYPE(REFCOUNT_ZERO_SUB),
CRASHTYPE(REFCOUNT_ZERO_ADD),
CRASHTYPE(USERCOPY_HEAP_SIZE_TO),
CRASHTYPE(USERCOPY_HEAP_SIZE_FROM),
CRASHTYPE(USERCOPY_HEAP_FLAG_TO),

View File

@ -67,7 +67,7 @@ static ssize_t mei_dbgfs_read_meclients(struct file *fp, char __user *ubuf,
me_cl->props.max_number_of_connections,
me_cl->props.max_msg_length,
me_cl->props.single_recv_buf,
atomic_read(&me_cl->refcnt.refcount));
kref_read(&me_cl->refcnt));
mei_me_cl_put(me_cl);
}

View File

@ -155,7 +155,7 @@ static void pnv_php_detach_device_nodes(struct device_node *parent)
pnv_php_detach_device_nodes(dn);
of_node_put(dn);
refcount = atomic_read(&dn->kobj.kref.refcount);
refcount = kref_read(&dn->kobj.kref);
if (refcount != 1)
pr_warn("Invalid refcount %d on <%s>\n",
refcount, of_node_full_name(dn));

View File

@ -345,7 +345,7 @@ EXPORT_SYMBOL_GPL(pci_create_slot);
void pci_destroy_slot(struct pci_slot *slot)
{
dev_dbg(&slot->bus->dev, "dev %02x, dec refcount to %d\n",
slot->number, atomic_read(&slot->kobj.kref.refcount) - 1);
slot->number, kref_read(&slot->kobj.kref) - 1);
mutex_lock(&pci_slot_mutex);
kobject_put(&slot->kobj);

View File

@ -74,7 +74,7 @@ static void bnx2fc_cmd_timeout(struct work_struct *work)
&io_req->req_flags)) {
/* Handle internally generated ABTS timeout */
BNX2FC_IO_DBG(io_req, "ABTS timed out refcnt = %d\n",
io_req->refcount.refcount.counter);
kref_read(&io_req->refcount));
if (!(test_and_set_bit(BNX2FC_FLAG_ABTS_DONE,
&io_req->req_flags))) {
/*
@ -1141,7 +1141,7 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd)
return SUCCESS;
}
BNX2FC_IO_DBG(io_req, "eh_abort - refcnt = %d\n",
io_req->refcount.refcount.counter);
kref_read(&io_req->refcount));
/* Hold IO request across abort processing */
kref_get(&io_req->refcount);
@ -1299,7 +1299,7 @@ void bnx2fc_process_cleanup_compl(struct bnx2fc_cmd *io_req,
{
BNX2FC_IO_DBG(io_req, "Entered process_cleanup_compl "
"refcnt = %d, cmd_type = %d\n",
io_req->refcount.refcount.counter, io_req->cmd_type);
kref_read(&io_req->refcount), io_req->cmd_type);
bnx2fc_scsi_done(io_req, DID_ERROR);
kref_put(&io_req->refcount, bnx2fc_cmd_release);
if (io_req->wait_for_comp)
@ -1318,7 +1318,7 @@ void bnx2fc_process_abts_compl(struct bnx2fc_cmd *io_req,
BNX2FC_IO_DBG(io_req, "Entered process_abts_compl xid = 0x%x"
"refcnt = %d, cmd_type = %d\n",
io_req->xid,
io_req->refcount.refcount.counter, io_req->cmd_type);
kref_read(&io_req->refcount), io_req->cmd_type);
if (test_and_set_bit(BNX2FC_FLAG_ABTS_DONE,
&io_req->req_flags)) {

View File

@ -301,7 +301,7 @@ static inline void __cxgbi_sock_put(const char *fn, struct cxgbi_sock *csk)
{
log_debug(1 << CXGBI_DBG_SOCK,
"%s, put csk 0x%p, ref %u-1.\n",
fn, csk, atomic_read(&csk->refcnt.refcount));
fn, csk, kref_read(&csk->refcnt));
kref_put(&csk->refcnt, cxgbi_sock_free);
}
#define cxgbi_sock_put(csk) __cxgbi_sock_put(__func__, csk)
@ -310,7 +310,7 @@ static inline void __cxgbi_sock_get(const char *fn, struct cxgbi_sock *csk)
{
log_debug(1 << CXGBI_DBG_SOCK,
"%s, get csk 0x%p, ref %u+1.\n",
fn, csk, atomic_read(&csk->refcnt.refcount));
fn, csk, kref_read(&csk->refcnt));
kref_get(&csk->refcnt);
}
#define cxgbi_sock_get(csk) __cxgbi_sock_get(__func__, csk)

View File

@ -607,7 +607,7 @@ lpfc_debugfs_nodelist_data(struct lpfc_vport *vport, char *buf, int size)
len += snprintf(buf+len, size-len, "usgmap:%x ",
ndlp->nlp_usg_map);
len += snprintf(buf+len, size-len, "refcnt:%x",
atomic_read(&ndlp->kref.refcount));
kref_read(&ndlp->kref));
len += snprintf(buf+len, size-len, "\n");
}
spin_unlock_irq(shost->host_lock);

View File

@ -3690,7 +3690,7 @@ lpfc_mbx_cmpl_dflt_rpi(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
lpfc_printf_vlog(ndlp->vport, KERN_INFO, LOG_NODE,
"0006 rpi%x DID:%x flg:%x %d map:%x %p\n",
ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag,
atomic_read(&ndlp->kref.refcount),
kref_read(&ndlp->kref),
ndlp->nlp_usg_map, ndlp);
if (NLP_CHK_NODE_ACT(ndlp)) {
lpfc_nlp_put(ndlp);

View File

@ -3440,7 +3440,7 @@ lpfc_mbx_cmpl_reg_login(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
lpfc_printf_vlog(vport, KERN_INFO, LOG_SLI,
"0002 rpi:%x DID:%x flg:%x %d map:%x %p\n",
ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag,
atomic_read(&ndlp->kref.refcount),
kref_read(&ndlp->kref),
ndlp->nlp_usg_map, ndlp);
if (ndlp->nlp_flag & NLP_REG_LOGIN_SEND)
ndlp->nlp_flag &= ~NLP_REG_LOGIN_SEND;
@ -3861,7 +3861,7 @@ out:
lpfc_printf_vlog(vport, KERN_INFO, LOG_SLI,
"0003 rpi:%x DID:%x flg:%x %d map%x %p\n",
ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag,
atomic_read(&ndlp->kref.refcount),
kref_read(&ndlp->kref),
ndlp->nlp_usg_map, ndlp);
if (vport->port_state < LPFC_VPORT_READY) {
@ -4238,7 +4238,7 @@ lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
"0277 lpfc_enable_node: ndlp:x%p "
"usgmap:x%x refcnt:%d\n",
(void *)ndlp, ndlp->nlp_usg_map,
atomic_read(&ndlp->kref.refcount));
kref_read(&ndlp->kref));
return NULL;
}
/* The ndlp should not already be in active mode */
@ -4248,7 +4248,7 @@ lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
"0278 lpfc_enable_node: ndlp:x%p "
"usgmap:x%x refcnt:%d\n",
(void *)ndlp, ndlp->nlp_usg_map,
atomic_read(&ndlp->kref.refcount));
kref_read(&ndlp->kref));
return NULL;
}
@ -4272,7 +4272,7 @@ lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
"0008 rpi:%x DID:%x flg:%x refcnt:%d "
"map:%x %p\n", ndlp->nlp_rpi, ndlp->nlp_DID,
ndlp->nlp_flag,
atomic_read(&ndlp->kref.refcount),
kref_read(&ndlp->kref),
ndlp->nlp_usg_map, ndlp);
}
@ -4546,7 +4546,7 @@ lpfc_unreg_rpi(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
(bf_get(lpfc_sli_intf_if_type,
&phba->sli4_hba.sli_intf) ==
LPFC_SLI_INTF_IF_TYPE_2) &&
(atomic_read(&ndlp->kref.refcount) > 0)) {
(kref_read(&ndlp->kref) > 0)) {
mbox->context1 = lpfc_nlp_get(ndlp);
mbox->mbox_cmpl =
lpfc_sli4_unreg_rpi_cmpl_clr;
@ -4695,14 +4695,14 @@ lpfc_cleanup_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
"0280 lpfc_cleanup_node: ndlp:x%p "
"usgmap:x%x refcnt:%d\n",
(void *)ndlp, ndlp->nlp_usg_map,
atomic_read(&ndlp->kref.refcount));
kref_read(&ndlp->kref));
lpfc_dequeue_node(vport, ndlp);
} else {
lpfc_printf_vlog(vport, KERN_WARNING, LOG_NODE,
"0281 lpfc_cleanup_node: ndlp:x%p "
"usgmap:x%x refcnt:%d\n",
(void *)ndlp, ndlp->nlp_usg_map,
atomic_read(&ndlp->kref.refcount));
kref_read(&ndlp->kref));
lpfc_disable_node(vport, ndlp);
}
@ -4791,7 +4791,7 @@ lpfc_nlp_remove(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
lpfc_printf_vlog(vport, KERN_INFO, LOG_NODE,
"0005 rpi:%x DID:%x flg:%x %d map:%x %p\n",
ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag,
atomic_read(&ndlp->kref.refcount),
kref_read(&ndlp->kref),
ndlp->nlp_usg_map, ndlp);
if ((mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL))
!= NULL) {
@ -5557,7 +5557,7 @@ lpfc_mbx_cmpl_fdmi_reg_login(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
lpfc_printf_vlog(vport, KERN_INFO, LOG_SLI,
"0004 rpi:%x DID:%x flg:%x %d map:%x %p\n",
ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag,
atomic_read(&ndlp->kref.refcount),
kref_read(&ndlp->kref),
ndlp->nlp_usg_map, ndlp);
/*
* Start issuing Fabric-Device Management Interface (FDMI) command to
@ -5728,7 +5728,7 @@ lpfc_nlp_init(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
"0007 rpi:%x DID:%x flg:%x refcnt:%d "
"map:%x %p\n", ndlp->nlp_rpi, ndlp->nlp_DID,
ndlp->nlp_flag,
atomic_read(&ndlp->kref.refcount),
kref_read(&ndlp->kref),
ndlp->nlp_usg_map, ndlp);
ndlp->active_rrqs_xri_bitmap =
@ -5767,7 +5767,7 @@ lpfc_nlp_release(struct kref *kref)
"0279 lpfc_nlp_release: ndlp:x%p did %x "
"usgmap:x%x refcnt:%d rpi:%x\n",
(void *)ndlp, ndlp->nlp_DID, ndlp->nlp_usg_map,
atomic_read(&ndlp->kref.refcount), ndlp->nlp_rpi);
kref_read(&ndlp->kref), ndlp->nlp_rpi);
/* remove ndlp from action. */
lpfc_nlp_remove(ndlp->vport, ndlp);
@ -5804,7 +5804,7 @@ lpfc_nlp_get(struct lpfc_nodelist *ndlp)
lpfc_debugfs_disc_trc(ndlp->vport, LPFC_DISC_TRC_NODE,
"node get: did:x%x flg:x%x refcnt:x%x",
ndlp->nlp_DID, ndlp->nlp_flag,
atomic_read(&ndlp->kref.refcount));
kref_read(&ndlp->kref));
/* The check of ndlp usage to prevent incrementing the
* ndlp reference count that is in the process of being
* released.
@ -5817,7 +5817,7 @@ lpfc_nlp_get(struct lpfc_nodelist *ndlp)
"0276 lpfc_nlp_get: ndlp:x%p "
"usgmap:x%x refcnt:%d\n",
(void *)ndlp, ndlp->nlp_usg_map,
atomic_read(&ndlp->kref.refcount));
kref_read(&ndlp->kref));
return NULL;
} else
kref_get(&ndlp->kref);
@ -5844,7 +5844,7 @@ lpfc_nlp_put(struct lpfc_nodelist *ndlp)
lpfc_debugfs_disc_trc(ndlp->vport, LPFC_DISC_TRC_NODE,
"node put: did:x%x flg:x%x refcnt:x%x",
ndlp->nlp_DID, ndlp->nlp_flag,
atomic_read(&ndlp->kref.refcount));
kref_read(&ndlp->kref));
phba = ndlp->phba;
spin_lock_irqsave(&phba->ndlp_lock, flags);
/* Check the ndlp memory free acknowledge flag to avoid the
@ -5857,7 +5857,7 @@ lpfc_nlp_put(struct lpfc_nodelist *ndlp)
"0274 lpfc_nlp_put: ndlp:x%p "
"usgmap:x%x refcnt:%d\n",
(void *)ndlp, ndlp->nlp_usg_map,
atomic_read(&ndlp->kref.refcount));
kref_read(&ndlp->kref));
return 1;
}
/* Check the ndlp inactivate log flag to avoid the possible
@ -5870,7 +5870,7 @@ lpfc_nlp_put(struct lpfc_nodelist *ndlp)
"0275 lpfc_nlp_put: ndlp:x%p "
"usgmap:x%x refcnt:%d\n",
(void *)ndlp, ndlp->nlp_usg_map,
atomic_read(&ndlp->kref.refcount));
kref_read(&ndlp->kref));
return 1;
}
/* For last put, mark the ndlp usage flags to make sure no
@ -5878,7 +5878,7 @@ lpfc_nlp_put(struct lpfc_nodelist *ndlp)
* in between the process when the final kref_put has been
* invoked on this ndlp.
*/
if (atomic_read(&ndlp->kref.refcount) == 1) {
if (kref_read(&ndlp->kref) == 1) {
/* Indicate ndlp is put to inactive state. */
NLP_SET_IACT_REQ(ndlp);
/* Acknowledge ndlp memory free has been seen. */
@ -5906,8 +5906,8 @@ lpfc_nlp_not_used(struct lpfc_nodelist *ndlp)
lpfc_debugfs_disc_trc(ndlp->vport, LPFC_DISC_TRC_NODE,
"node not used: did:x%x flg:x%x refcnt:x%x",
ndlp->nlp_DID, ndlp->nlp_flag,
atomic_read(&ndlp->kref.refcount));
if (atomic_read(&ndlp->kref.refcount) == 1)
kref_read(&ndlp->kref));
if (kref_read(&ndlp->kref) == 1)
if (lpfc_nlp_put(ndlp))
return 1;
return 0;

View File

@ -2660,8 +2660,7 @@ lpfc_cleanup(struct lpfc_vport *vport)
"usgmap:x%x refcnt:%d\n",
ndlp->nlp_DID, (void *)ndlp,
ndlp->nlp_usg_map,
atomic_read(
&ndlp->kref.refcount));
kref_read(&ndlp->kref));
}
break;
}

View File

@ -371,7 +371,7 @@ static int tcm_qla2xxx_write_pending(struct se_cmd *se_cmd)
*/
pr_debug("write_pending aborted cmd[%p] refcount %d "
"transport_state %x, t_state %x, se_cmd_flags %x\n",
cmd,cmd->se_cmd.cmd_kref.refcount.counter,
cmd, kref_read(&cmd->se_cmd.cmd_kref),
cmd->se_cmd.transport_state,
cmd->se_cmd.t_state,
cmd->se_cmd.se_cmd_flags);
@ -584,7 +584,7 @@ static int tcm_qla2xxx_queue_data_in(struct se_cmd *se_cmd)
*/
pr_debug("queue_data_in aborted cmd[%p] refcount %d "
"transport_state %x, t_state %x, se_cmd_flags %x\n",
cmd,cmd->se_cmd.cmd_kref.refcount.counter,
cmd, kref_read(&cmd->se_cmd.cmd_kref),
cmd->se_cmd.transport_state,
cmd->se_cmd.t_state,
cmd->se_cmd.se_cmd_flags);

View File

@ -1300,7 +1300,7 @@ static int ion_debug_heap_show(struct seq_file *s, void *unused)
seq_printf(s, "%16s %16u %16zu %d %d\n",
buffer->task_comm, buffer->pid,
buffer->size, buffer->kmap_cnt,
atomic_read(&buffer->ref.refcount));
kref_read(&buffer->ref));
total_orphaned_size += buffer->size;
}
}

View File

@ -188,7 +188,7 @@ bool comedi_buf_is_mmapped(struct comedi_subdevice *s)
{
struct comedi_buf_map *bm = s->async->buf_map;
return bm && (atomic_read(&bm->refcount.refcount) > 1);
return bm && (kref_read(&bm->refcount) > 1);
}
int comedi_buf_alloc(struct comedi_device *dev, struct comedi_subdevice *s,

View File

@ -107,7 +107,7 @@ void __noreturn lbug_with_loc(struct libcfs_debug_msg_data *msgdata)
libcfs_debug_dumplog();
if (libcfs_panic_on_lbug)
panic("LBUG");
set_task_state(current, TASK_UNINTERRUPTIBLE);
set_current_state(TASK_UNINTERRUPTIBLE);
while (1)
schedule();
}

View File

@ -788,7 +788,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration(
* __core_scsi3_add_registration()
*/
dest_lun = rcu_dereference_check(deve_tmp->se_lun,
atomic_read(&deve_tmp->pr_kref.refcount) != 0);
kref_read(&deve_tmp->pr_kref) != 0);
pr_reg_atp = __core_scsi3_do_alloc_registration(dev,
nacl_tmp, dest_lun, deve_tmp,
@ -1463,7 +1463,7 @@ static int core_scsi3_lunacl_depend_item(struct se_dev_entry *se_deve)
* For nacl->dynamic_node_acl=1
*/
lun_acl = rcu_dereference_check(se_deve->se_lun_acl,
atomic_read(&se_deve->pr_kref.refcount) != 0);
kref_read(&se_deve->pr_kref) != 0);
if (!lun_acl)
return 0;
@ -1478,7 +1478,7 @@ static void core_scsi3_lunacl_undepend_item(struct se_dev_entry *se_deve)
* For nacl->dynamic_node_acl=1
*/
lun_acl = rcu_dereference_check(se_deve->se_lun_acl,
atomic_read(&se_deve->pr_kref.refcount) != 0);
kref_read(&se_deve->pr_kref) != 0);
if (!lun_acl) {
kref_put(&se_deve->pr_kref, target_pr_kref_release);
return;
@ -1759,7 +1759,7 @@ core_scsi3_decode_spec_i_port(
* 2nd loop which will never fail.
*/
dest_lun = rcu_dereference_check(dest_se_deve->se_lun,
atomic_read(&dest_se_deve->pr_kref.refcount) != 0);
kref_read(&dest_se_deve->pr_kref) != 0);
dest_pr_reg = __core_scsi3_alloc_registration(cmd->se_dev,
dest_node_acl, dest_lun, dest_se_deve,
@ -3466,7 +3466,7 @@ after_iport_check:
iport_ptr);
if (!dest_pr_reg) {
struct se_lun *dest_lun = rcu_dereference_check(dest_se_deve->se_lun,
atomic_read(&dest_se_deve->pr_kref.refcount) != 0);
kref_read(&dest_se_deve->pr_kref) != 0);
spin_unlock(&dev->dev_reservation_lock);
if (core_scsi3_alloc_registration(cmd->se_dev, dest_node_acl,

View File

@ -454,7 +454,7 @@ static void ft_sess_free(struct kref *kref)
void ft_sess_put(struct ft_sess *sess)
{
int sess_held = atomic_read(&sess->kref.refcount);
int sess_held = kref_read(&sess->kref);
BUG_ON(!sess_held);
kref_put(&sess->kref, ft_sess_free);

View File

@ -200,7 +200,6 @@ static struct ld_semaphore __sched *
down_read_failed(struct ld_semaphore *sem, long count, long timeout)
{
struct ldsem_waiter waiter;
struct task_struct *tsk = current;
long adjust = -LDSEM_ACTIVE_BIAS + LDSEM_WAIT_BIAS;
/* set up my own style of waitqueue */
@ -221,8 +220,8 @@ down_read_failed(struct ld_semaphore *sem, long count, long timeout)
list_add_tail(&waiter.list, &sem->read_wait);
sem->wait_readers++;
waiter.task = tsk;
get_task_struct(tsk);
waiter.task = current;
get_task_struct(current);
/* if there are no active locks, wake the new lock owner(s) */
if ((count & LDSEM_ACTIVE_MASK) == 0)
@ -232,7 +231,7 @@ down_read_failed(struct ld_semaphore *sem, long count, long timeout)
/* wait to be given the lock */
for (;;) {
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
set_current_state(TASK_UNINTERRUPTIBLE);
if (!waiter.task)
break;
@ -241,7 +240,7 @@ down_read_failed(struct ld_semaphore *sem, long count, long timeout)
timeout = schedule_timeout(timeout);
}
__set_task_state(tsk, TASK_RUNNING);
__set_current_state(TASK_RUNNING);
if (!timeout) {
/* lock timed out but check if this task was just
@ -268,7 +267,6 @@ static struct ld_semaphore __sched *
down_write_failed(struct ld_semaphore *sem, long count, long timeout)
{
struct ldsem_waiter waiter;
struct task_struct *tsk = current;
long adjust = -LDSEM_ACTIVE_BIAS;
int locked = 0;
@ -289,16 +287,16 @@ down_write_failed(struct ld_semaphore *sem, long count, long timeout)
list_add_tail(&waiter.list, &sem->write_wait);
waiter.task = tsk;
waiter.task = current;
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
set_current_state(TASK_UNINTERRUPTIBLE);
for (;;) {
if (!timeout)
break;
raw_spin_unlock_irq(&sem->wait_lock);
timeout = schedule_timeout(timeout);
raw_spin_lock_irq(&sem->wait_lock);
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
set_current_state(TASK_UNINTERRUPTIBLE);
locked = writer_trylock(sem);
if (locked)
break;
@ -309,7 +307,7 @@ down_write_failed(struct ld_semaphore *sem, long count, long timeout)
list_del(&waiter.list);
raw_spin_unlock_irq(&sem->wait_lock);
__set_task_state(tsk, TASK_RUNNING);
__set_current_state(TASK_RUNNING);
/* lock wait may have timed out */
if (!locked)

View File

@ -3698,7 +3698,7 @@ static void ffs_closed(struct ffs_data *ffs)
goto done;
if (opts->no_configfs || !opts->func_inst.group.cg_item.ci_parent
|| !atomic_read(&opts->func_inst.group.cg_item.ci_kref.refcount))
|| !kref_read(&opts->func_inst.group.cg_item.ci_kref))
goto done;
ci = opts->func_inst.group.cg_item.ci_parent->ci_parent;

View File

@ -409,7 +409,7 @@ static void __exit mon_exit(void)
printk(KERN_ERR TAG
": Outstanding opens (%d) on usb%d, leaking...\n",
mbus->nreaders, mbus->u_bus->busnum);
atomic_set(&mbus->ref.refcount, 2); /* Force leak */
kref_get(&mbus->ref); /* Force leak */
}
mon_dissolve(mbus, mbus->u_bus);

View File

@ -122,7 +122,7 @@ void exofs_sysfs_dbg_print(void)
list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) {
printk(KERN_INFO "%s: name %s ref %d\n",
__func__, kobject_name(k_name),
(int)atomic_read(&k_name->kref.refcount));
(int)kref_read(&k_name->kref));
}
#endif
}

View File

@ -256,7 +256,7 @@ struct fuse_io_priv {
#define FUSE_IO_PRIV_SYNC(f) \
{ \
.refcnt = { ATOMIC_INIT(1) }, \
.refcnt = KREF_INIT(1), \
.async = 0, \
.file = f, \
}

View File

@ -349,7 +349,7 @@ static void sc_show_sock_container(struct seq_file *seq,
" func key: 0x%08x\n"
" func type: %u\n",
sc,
atomic_read(&sc->sc_kref.refcount),
kref_read(&sc->sc_kref),
&saddr, inet ? ntohs(sport) : 0,
&daddr, inet ? ntohs(dport) : 0,
sc->sc_node->nd_name,

View File

@ -97,7 +97,7 @@
typeof(sc) __sc = (sc); \
mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p " \
"pg_off %zu] " fmt, __sc, \
atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock, \
kref_read(&__sc->sc_kref), __sc->sc_sock, \
__sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off , \
##args); \
} while (0)

View File

@ -81,7 +81,7 @@ static void __dlm_print_lock(struct dlm_lock *lock)
lock->ml.type, lock->ml.convert_type, lock->ml.node,
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
atomic_read(&lock->lock_refs.refcount),
kref_read(&lock->lock_refs),
(list_empty(&lock->ast_list) ? 'y' : 'n'),
(lock->ast_pending ? 'y' : 'n'),
(list_empty(&lock->bast_list) ? 'y' : 'n'),
@ -106,7 +106,7 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
printk("lockres: %s, owner=%u, state=%u\n",
buf, res->owner, res->state);
printk(" last used: %lu, refcnt: %u, on purge list: %s\n",
res->last_used, atomic_read(&res->refs.refcount),
res->last_used, kref_read(&res->refs),
list_empty(&res->purge) ? "no" : "yes");
printk(" on dirty list: %s, on reco list: %s, "
"migrating pending: %s\n",
@ -298,7 +298,7 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
mle_type, mle->master, mle->new_master,
!list_empty(&mle->hb_events),
!!mle->inuse,
atomic_read(&mle->mle_refs.refcount));
kref_read(&mle->mle_refs));
out += snprintf(buf + out, len - out, "Maybe=");
out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES,
@ -494,7 +494,7 @@ static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
lock->ast_pending, lock->bast_pending,
lock->convert_pending, lock->lock_pending,
lock->cancel_pending, lock->unlock_pending,
atomic_read(&lock->lock_refs.refcount));
kref_read(&lock->lock_refs));
spin_unlock(&lock->spinlock);
return out;
@ -521,7 +521,7 @@ static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len)
!list_empty(&res->recovering),
res->inflight_locks, res->migration_pending,
atomic_read(&res->asts_reserved),
atomic_read(&res->refs.refcount));
kref_read(&res->refs));
/* refmap */
out += snprintf(buf + out, len - out, "RMAP:");
@ -777,7 +777,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
/* Purge Count: xxx Refs: xxx */
out += snprintf(buf + out, len - out,
"Purge Count: %d Refs: %d\n", dlm->purge_count,
atomic_read(&dlm->dlm_refs.refcount));
kref_read(&dlm->dlm_refs));
/* Dead Node: xxx */
out += snprintf(buf + out, len - out,

View File

@ -2072,7 +2072,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks);
mlog(0, "context init: refcount %u\n",
atomic_read(&dlm->dlm_refs.refcount));
kref_read(&dlm->dlm_refs));
leave:
if (ret < 0 && dlm) {

View File

@ -233,7 +233,7 @@ static void __dlm_put_mle(struct dlm_master_list_entry *mle)
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&dlm->master_lock);
if (!atomic_read(&mle->mle_refs.refcount)) {
if (!kref_read(&mle->mle_refs)) {
/* this may or may not crash, but who cares.
* it's a BUG. */
mlog(ML_ERROR, "bad mle: %p\n", mle);
@ -1124,9 +1124,9 @@ recheck:
unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);
/*
if (atomic_read(&mle->mle_refs.refcount) < 2)
if (kref_read(&mle->mle_refs) < 2)
mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle,
atomic_read(&mle->mle_refs.refcount),
kref_read(&mle->mle_refs),
res->lockname.len, res->lockname.name);
*/
atomic_set(&mle->woken, 0);
@ -1979,7 +1979,7 @@ ok:
* on this mle. */
spin_lock(&dlm->master_lock);
rr = atomic_read(&mle->mle_refs.refcount);
rr = kref_read(&mle->mle_refs);
if (mle->inuse > 0) {
if (extra_ref && rr < 3)
err = 1;

View File

@ -251,7 +251,7 @@ leave:
mlog(0, "lock %u:%llu should be gone now! refs=%d\n",
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
atomic_read(&lock->lock_refs.refcount)-1);
kref_read(&lock->lock_refs)-1);
dlm_lock_put(lock);
}
if (actions & DLM_UNLOCK_CALL_AST)

View File

@ -33,7 +33,7 @@
*/
static inline void __down_read(struct rw_semaphore *sem)
{
if (unlikely(atomic_long_inc_return_acquire((atomic_long_t *)&sem->count) <= 0))
if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0))
rwsem_down_read_failed(sem);
}
@ -58,7 +58,7 @@ static inline void __down_write(struct rw_semaphore *sem)
long tmp;
tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
(atomic_long_t *)&sem->count);
&sem->count);
if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
rwsem_down_write_failed(sem);
}
@ -68,7 +68,7 @@ static inline int __down_write_killable(struct rw_semaphore *sem)
long tmp;
tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
(atomic_long_t *)&sem->count);
&sem->count);
if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
if (IS_ERR(rwsem_down_write_failed_killable(sem)))
return -EINTR;
@ -91,7 +91,7 @@ static inline void __up_read(struct rw_semaphore *sem)
{
long tmp;
tmp = atomic_long_dec_return_release((atomic_long_t *)&sem->count);
tmp = atomic_long_dec_return_release(&sem->count);
if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0))
rwsem_wake(sem);
}
@ -102,7 +102,7 @@ static inline void __up_read(struct rw_semaphore *sem)
static inline void __up_write(struct rw_semaphore *sem)
{
if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS,
(atomic_long_t *)&sem->count) < 0))
&sem->count) < 0))
rwsem_wake(sem);
}
@ -120,8 +120,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
* read-locked region is ok to be re-ordered into the
* write side. As such, rely on RELEASE semantics.
*/
tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS,
(atomic_long_t *)&sem->count);
tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count);
if (tmp < 0)
rwsem_downgrade_wake(sem);
}

View File

@ -247,7 +247,7 @@ static inline void drm_framebuffer_unreference(struct drm_framebuffer *fb)
*/
static inline uint32_t drm_framebuffer_read_refcount(struct drm_framebuffer *fb)
{
return atomic_read(&fb->base.refcount.refcount);
return kref_read(&fb->base.refcount);
}
/**

View File

@ -332,19 +332,6 @@ extern int ttm_bo_validate(struct ttm_buffer_object *bo,
*/
extern void ttm_bo_unref(struct ttm_buffer_object **bo);
/**
* ttm_bo_list_ref_sub
*
* @bo: The buffer object.
* @count: The number of references with which to decrease @bo::list_kref;
* @never_free: The refcount should not reach zero with this operation.
*
* Release @count lru list references to this buffer object.
*/
extern void ttm_bo_list_ref_sub(struct ttm_buffer_object *bo, int count,
bool never_free);
/**
* ttm_bo_add_to_lru
*
@ -367,7 +354,7 @@ extern void ttm_bo_add_to_lru(struct ttm_buffer_object *bo);
* and is usually called just immediately after the bo has been reserved to
* avoid recursive reservation from lru lists.
*/
extern int ttm_bo_del_from_lru(struct ttm_buffer_object *bo);
extern void ttm_bo_del_from_lru(struct ttm_buffer_object *bo);
/**
* ttm_bo_move_to_lru_tail

View File

@ -878,7 +878,7 @@ static inline int ttm_bo_reserve(struct ttm_buffer_object *bo,
{
int ret;
WARN_ON(!atomic_read(&bo->kref.refcount));
WARN_ON(!kref_read(&bo->kref));
ret = __ttm_bo_reserve(bo, interruptible, no_wait, ticket);
if (likely(ret == 0))
@ -903,7 +903,7 @@ static inline int ttm_bo_reserve_slowpath(struct ttm_buffer_object *bo,
{
int ret = 0;
WARN_ON(!atomic_read(&bo->kref.refcount));
WARN_ON(!kref_read(&bo->kref));
if (interruptible)
ret = ww_mutex_lock_slow_interruptible(&bo->resv->lock,

View File

@ -402,6 +402,6 @@ extern bool ____wrong_branch_error(void);
#define static_branch_enable(x) static_key_enable(&(x)->key)
#define static_branch_disable(x) static_key_disable(&(x)->key)
#endif /* _LINUX_JUMP_LABEL_H */
#endif /* __ASSEMBLY__ */
#endif /* _LINUX_JUMP_LABEL_H */

View File

@ -15,22 +15,27 @@
#ifndef _KREF_H_
#define _KREF_H_
#include <linux/bug.h>
#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/refcount.h>
struct kref {
atomic_t refcount;
refcount_t refcount;
};
#define KREF_INIT(n) { .refcount = REFCOUNT_INIT(n), }
/**
* kref_init - initialize object.
* @kref: object in question.
*/
static inline void kref_init(struct kref *kref)
{
atomic_set(&kref->refcount, 1);
refcount_set(&kref->refcount, 1);
}
static inline unsigned int kref_read(const struct kref *kref)
{
return refcount_read(&kref->refcount);
}
/**
@ -39,41 +44,7 @@ static inline void kref_init(struct kref *kref)
*/
static inline void kref_get(struct kref *kref)
{
/* If refcount was 0 before incrementing then we have a race
* condition when this kref is freeing by some other thread right now.
* In this case one should use kref_get_unless_zero()
*/
WARN_ON_ONCE(atomic_inc_return(&kref->refcount) < 2);
}
/**
* kref_sub - subtract a number of refcounts for object.
* @kref: object.
* @count: Number of recounts to subtract.
* @release: pointer to the function that will clean up the object when the
* last reference to the object is released.
* This pointer is required, and it is not acceptable to pass kfree
* in as this function. If the caller does pass kfree to this
* function, you will be publicly mocked mercilessly by the kref
* maintainer, and anyone else who happens to notice it. You have
* been warned.
*
* Subtract @count from the refcount, and if 0, call release().
* Return 1 if the object was removed, otherwise return 0. Beware, if this
* function returns 0, you still can not count on the kref from remaining in
* memory. Only use the return value if you want to see if the kref is now
* gone, not present.
*/
static inline int kref_sub(struct kref *kref, unsigned int count,
void (*release)(struct kref *kref))
{
WARN_ON(release == NULL);
if (atomic_sub_and_test((int) count, &kref->refcount)) {
release(kref);
return 1;
}
return 0;
refcount_inc(&kref->refcount);
}
/**
@ -95,7 +66,13 @@ static inline int kref_sub(struct kref *kref, unsigned int count,
*/
static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref))
{
return kref_sub(kref, 1, release);
WARN_ON(release == NULL);
if (refcount_dec_and_test(&kref->refcount)) {
release(kref);
return 1;
}
return 0;
}
static inline int kref_put_mutex(struct kref *kref,
@ -103,12 +80,21 @@ static inline int kref_put_mutex(struct kref *kref,
struct mutex *lock)
{
WARN_ON(release == NULL);
if (unlikely(!atomic_add_unless(&kref->refcount, -1, 1))) {
mutex_lock(lock);
if (unlikely(!atomic_dec_and_test(&kref->refcount))) {
mutex_unlock(lock);
return 0;
}
if (refcount_dec_and_mutex_lock(&kref->refcount, lock)) {
release(kref);
return 1;
}
return 0;
}
static inline int kref_put_lock(struct kref *kref,
void (*release)(struct kref *kref),
spinlock_t *lock)
{
WARN_ON(release == NULL);
if (refcount_dec_and_lock(&kref->refcount, lock)) {
release(kref);
return 1;
}
@ -133,6 +119,6 @@ static inline int kref_put_mutex(struct kref *kref,
*/
static inline int __must_check kref_get_unless_zero(struct kref *kref)
{
return atomic_add_unless(&kref->refcount, 1, 0);
return refcount_inc_not_zero(&kref->refcount);
}
#endif /* _KREF_H_ */
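
A small usage sketch for the kref_put_lock() helper added above; the cache structure and list are invented, only the kref/spinlock calls follow the header. As with kref_put_mutex(), the release callback runs with the lock held and is expected to drop it:

#include <linux/kref.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(cache_lock);
static LIST_HEAD(cache_list);

struct cache_entry {
	struct kref kref;
	struct list_head node;
};

static void cache_entry_release(struct kref *kref)
{
	struct cache_entry *e = container_of(kref, struct cache_entry, kref);

	/* kref_put_lock() acquired cache_lock for us on the final put */
	list_del(&e->node);
	spin_unlock(&cache_lock);
	kfree(e);
}

static void cache_entry_put(struct cache_entry *e)
{
	/* Only takes cache_lock when the count actually hits zero,
	 * courtesy of refcount_dec_and_lock(). */
	kref_put_lock(&e->kref, cache_entry_release, &cache_lock);
}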

View File

@ -20,6 +20,8 @@
#include <linux/osq_lock.h>
#include <linux/debug_locks.h>
struct ww_acquire_ctx;
/*
* Simple, straightforward mutexes with strict semantics:
*
@ -65,7 +67,7 @@ struct mutex {
static inline struct task_struct *__mutex_owner(struct mutex *lock)
{
return (struct task_struct *)(atomic_long_read(&lock->owner) & ~0x03);
return (struct task_struct *)(atomic_long_read(&lock->owner) & ~0x07);
}
/*
@ -75,6 +77,7 @@ static inline struct task_struct *__mutex_owner(struct mutex *lock)
struct mutex_waiter {
struct list_head list;
struct task_struct *task;
struct ww_acquire_ctx *ww_ctx;
#ifdef CONFIG_DEBUG_MUTEXES
void *magic;
#endif

View File

@ -4,15 +4,15 @@
#include <linux/atomic.h>
#include <linux/rwsem.h>
#include <linux/percpu.h>
#include <linux/wait.h>
#include <linux/rcuwait.h>
#include <linux/rcu_sync.h>
#include <linux/lockdep.h>
struct percpu_rw_semaphore {
struct rcu_sync rss;
unsigned int __percpu *read_count;
struct rw_semaphore rw_sem;
wait_queue_head_t writer;
struct rw_semaphore rw_sem; /* slowpath */
struct rcuwait writer; /* blocked writer */
int readers_block;
};
@ -22,7 +22,7 @@ static struct percpu_rw_semaphore name = { \
.rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC), \
.read_count = &__percpu_rwsem_rc_##name, \
.rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \
.writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer), \
.writer = __RCUWAIT_INITIALIZER(name.writer), \
}
extern int __percpu_down_read(struct percpu_rw_semaphore *, int);

View File

@ -80,6 +80,7 @@
/********** kernel/mutexes **********/
#define MUTEX_DEBUG_INIT 0x11
#define MUTEX_DEBUG_FREE 0x22
#define MUTEX_POISON_WW_CTX ((void *) 0x500 + POISON_POINTER_DELTA)
/********** lib/flex_array.c **********/
#define FLEX_ARRAY_FREE 0x6c /* for use-after-free poisoning */

View File

@ -0,0 +1,63 @@
#ifndef _LINUX_RCUWAIT_H_
#define _LINUX_RCUWAIT_H_
#include <linux/rcupdate.h>
/*
* rcuwait provides a way of blocking and waking up a single
* task in an rcu-safe manner, where it is forbidden to use it
* after exit_notify(): task_struct is not properly rcu protected,
* unless dealing with rcu-aware lists, ie: find_task_by_*().
*
* Alternatively we have task_rcu_dereference(), but the return
* semantics have different implications which would break the
* wakeup side. The only time @task is non-nil is when a user is
* blocked on (or checking whether it needs to block on) a condition;
* it is reset as soon as we know the condition has been met and we
* are awoken.
*/
struct rcuwait {
struct task_struct *task;
};
#define __RCUWAIT_INITIALIZER(name) \
{ .task = NULL, }
static inline void rcuwait_init(struct rcuwait *w)
{
w->task = NULL;
}
extern void rcuwait_wake_up(struct rcuwait *w);
/*
* The caller is responsible for locking around rcuwait_wait_event(),
* such that writes to @task are properly serialized.
*/
#define rcuwait_wait_event(w, condition) \
({ \
/* \
* Complain if we are called after do_exit()/exit_notify(), \
* as we cannot rely on the rcu critical region for the \
* wakeup side. \
*/ \
WARN_ON(current->exit_state); \
\
rcu_assign_pointer((w)->task, current); \
for (;;) { \
/* \
* Implicit barrier (A) pairs with (B) in \
* rcuwait_wake_up(). \
*/ \
set_current_state(TASK_UNINTERRUPTIBLE); \
if (condition) \
break; \
\
schedule(); \
} \
\
WRITE_ONCE((w)->task, NULL); \
__set_current_state(TASK_RUNNING); \
})
#endif /* _LINUX_RCUWAIT_H_ */
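A minimal sketch of the intended calling pattern for the new rcuwait primitive: a single task blocks on a condition while another context flips the condition and wakes it. The 'done' flag and function names are hypothetical; only one task may block on an rcuwait at a time, and callers must serialize waiters themselves (the percpu-rwsem conversion below does this via its rw_sem):

#include <linux/rcuwait.h>

static struct rcuwait my_wait = __RCUWAIT_INITIALIZER(my_wait);
static bool done;

/* Waiter side: sleeps in TASK_UNINTERRUPTIBLE until @done is observed true. */
static void wait_for_done(void)
{
	rcuwait_wait_event(&my_wait, READ_ONCE(done));
}

/* Waker side: publish the condition, then wake the (single) blocked task. */
static void signal_done(void)
{
	WRITE_ONCE(done, true);
	rcuwait_wake_up(&my_wait);	/* barrier (B) pairs with (A) in the waiter */
}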

View File

@ -0,0 +1,294 @@
#ifndef _LINUX_REFCOUNT_H
#define _LINUX_REFCOUNT_H
/*
* Variant of atomic_t specialized for reference counts.
*
* The interface matches the atomic_t interface (to aid in porting) but only
* provides the few functions one should use for reference counting.
*
* It differs in that the counter saturates at UINT_MAX and will not move once
* there. This avoids wrapping the counter and causing 'spurious'
* use-after-free issues.
*
* Memory ordering rules are slightly relaxed wrt regular atomic_t functions
* and provide only what is strictly required for refcounts.
*
* The increments are fully relaxed; these will not provide ordering. The
* rationale is that whatever is used to obtain the object we're increasing the
* reference count on will provide the ordering. For locked data structures,
* it's the lock acquire; for RCU/lockless data structures it's the dependent
* load.
*
* Do note that inc_not_zero() provides a control dependency which will order
* future stores against the inc; this ensures we'll never modify the object
* if we did not in fact acquire a reference.
*
* The decrements will provide release ordering, such that all the prior loads
* and stores will be issued before; they also provide a control dependency,
* which will order us against the subsequent free().
*
* The control dependency is against the load of the cmpxchg (ll/sc) that
* succeeded. This means the stores aren't fully ordered, but this is fine
* because the 1->0 transition indicates no concurrency.
*
* Note that the allocator is responsible for ordering things between free()
* and alloc().
*
*/
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#ifdef CONFIG_DEBUG_REFCOUNT
#define REFCOUNT_WARN(cond, str) WARN_ON(cond)
#define __refcount_check __must_check
#else
#define REFCOUNT_WARN(cond, str) (void)(cond)
#define __refcount_check
#endif
typedef struct refcount_struct {
atomic_t refs;
} refcount_t;
#define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), }
static inline void refcount_set(refcount_t *r, unsigned int n)
{
atomic_set(&r->refs, n);
}
static inline unsigned int refcount_read(const refcount_t *r)
{
return atomic_read(&r->refs);
}
static inline __refcount_check
bool refcount_add_not_zero(unsigned int i, refcount_t *r)
{
unsigned int old, new, val = atomic_read(&r->refs);
for (;;) {
if (!val)
return false;
if (unlikely(val == UINT_MAX))
return true;
new = val + i;
if (new < val)
new = UINT_MAX;
old = atomic_cmpxchg_relaxed(&r->refs, val, new);
if (old == val)
break;
val = old;
}
REFCOUNT_WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
return true;
}
static inline void refcount_add(unsigned int i, refcount_t *r)
{
REFCOUNT_WARN(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n");
}
/*
* Similar to atomic_inc_not_zero(), will saturate at UINT_MAX and WARN.
*
* Provides no memory ordering, it is assumed the caller has guaranteed the
* object memory to be stable (RCU, etc.). It does provide a control dependency
* and thereby orders future stores. See the comment on top.
*/
static inline __refcount_check
bool refcount_inc_not_zero(refcount_t *r)
{
unsigned int old, new, val = atomic_read(&r->refs);
for (;;) {
new = val + 1;
if (!val)
return false;
if (unlikely(!new))
return true;
old = atomic_cmpxchg_relaxed(&r->refs, val, new);
if (old == val)
break;
val = old;
}
REFCOUNT_WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
return true;
}
/*
* Similar to atomic_inc(), will saturate at UINT_MAX and WARN.
*
* Provides no memory ordering, it is assumed the caller already has a
* reference on the object, will WARN when this is not so.
*/
static inline void refcount_inc(refcount_t *r)
{
REFCOUNT_WARN(!refcount_inc_not_zero(r), "refcount_t: increment on 0; use-after-free.\n");
}
/*
* Similar to atomic_dec_and_test(), it will WARN on underflow and fail to
* decrement when saturated at UINT_MAX.
*
* Provides release memory ordering, such that prior loads and stores are done
* before, and provides a control dependency such that free() must come after.
* See the comment on top.
*/
static inline __refcount_check
bool refcount_sub_and_test(unsigned int i, refcount_t *r)
{
unsigned int old, new, val = atomic_read(&r->refs);
for (;;) {
if (unlikely(val == UINT_MAX))
return false;
new = val - i;
if (new > val) {
REFCOUNT_WARN(new > val, "refcount_t: underflow; use-after-free.\n");
return false;
}
old = atomic_cmpxchg_release(&r->refs, val, new);
if (old == val)
break;
val = old;
}
return !new;
}
static inline __refcount_check
bool refcount_dec_and_test(refcount_t *r)
{
return refcount_sub_and_test(1, r);
}
/*
* Similar to atomic_dec(), it will WARN on underflow and fail to decrement
* when saturated at UINT_MAX.
*
* Provides release memory ordering, such that prior loads and stores are done
* before.
*/
static inline
void refcount_dec(refcount_t *r)
{
REFCOUNT_WARN(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n");
}
/*
* No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the
* success thereof.
*
* Like all decrement operations, it provides release memory order and provides
* a control dependency.
*
* It can be used like a try-delete operator; this explicit case is provided
* rather than a generic cmpxchg, because the latter would allow implementing
* unsafe operations.
*/
static inline __refcount_check
bool refcount_dec_if_one(refcount_t *r)
{
return atomic_cmpxchg_release(&r->refs, 1, 0) == 1;
}
/*
* No atomic_t counterpart, it decrements unless the value is 1, in which case
* it will return false.
*
* Was often done like: atomic_add_unless(&var, -1, 1)
*/
static inline __refcount_check
bool refcount_dec_not_one(refcount_t *r)
{
unsigned int old, new, val = atomic_read(&r->refs);
for (;;) {
if (unlikely(val == UINT_MAX))
return true;
if (val == 1)
return false;
new = val - 1;
if (new > val) {
REFCOUNT_WARN(new > val, "refcount_t: underflow; use-after-free.\n");
return true;
}
old = atomic_cmpxchg_release(&r->refs, val, new);
if (old == val)
break;
val = old;
}
return true;
}
/*
* Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail
* to decrement when saturated at UINT_MAX.
*
* Provides release memory ordering, such that prior loads and stores are done
* before, and provides a control dependency such that free() must come after.
* See the comment on top.
*/
static inline __refcount_check
bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock)
{
if (refcount_dec_not_one(r))
return false;
mutex_lock(lock);
if (!refcount_dec_and_test(r)) {
mutex_unlock(lock);
return false;
}
return true;
}
/*
* Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to
* decrement when saturated at UINT_MAX.
*
* Provides release memory ordering, such that prior loads and stores are done
* before, and provides a control dependency such that free() must come after.
* See the comment on top.
*/
static inline __refcount_check
bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock)
{
if (refcount_dec_not_one(r))
return false;
spin_lock(lock);
if (!refcount_dec_and_test(r)) {
spin_unlock(lock);
return false;
}
return true;
}
#endif /* _LINUX_REFCOUNT_H */
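A short sketch of the intended refcount_t calling pattern, including the dec-and-lock idiom for objects that also sit on a locked list. struct obj, the list and the helpers are hypothetical; only the refcount_*() calls come from this header:

#include <linux/refcount.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct obj {
	refcount_t ref;
	struct list_head node;
};

static LIST_HEAD(obj_list);
static DEFINE_SPINLOCK(obj_lock);	/* protects obj_list */

/* The caller must already hold a reference. */
static struct obj *obj_get(struct obj *o)
{
	refcount_inc(&o->ref);
	return o;
}

/* Lookup under the list lock; may race with the final put. */
static struct obj *obj_find_first(void)
{
	struct obj *o, *found = NULL;

	spin_lock(&obj_lock);
	list_for_each_entry(o, &obj_list, node) {
		if (refcount_inc_not_zero(&o->ref)) {
			found = o;
			break;
		}
	}
	spin_unlock(&obj_lock);
	return found;
}

static void obj_put(struct obj *o)
{
	/* Returns true, with obj_lock held, only on the final 1->0 drop. */
	if (refcount_dec_and_lock(&o->ref, &obj_lock)) {
		list_del(&o->node);
		spin_unlock(&obj_lock);
		kfree(o);
	}
}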

View File

@ -226,7 +226,7 @@ extern void proc_sched_set_task(struct task_struct *p);
extern char ___assert_task_state[1 - 2*!!(
sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
/* Convenience macros for the sake of set_task_state */
/* Convenience macros for the sake of set_current_state */
#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
#define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED)
#define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED)
@ -253,17 +253,6 @@ extern char ___assert_task_state[1 - 2*!!(
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
#define __set_task_state(tsk, state_value) \
do { \
(tsk)->task_state_change = _THIS_IP_; \
(tsk)->state = (state_value); \
} while (0)
#define set_task_state(tsk, state_value) \
do { \
(tsk)->task_state_change = _THIS_IP_; \
smp_store_mb((tsk)->state, (state_value)); \
} while (0)
#define __set_current_state(state_value) \
do { \
current->task_state_change = _THIS_IP_; \
@ -276,20 +265,6 @@ extern char ___assert_task_state[1 - 2*!!(
} while (0)
#else
/*
* @tsk had better be current, or you get to keep the pieces.
*
* The only reason is that computing current can be more expensive than
* using a pointer that's already available.
*
* Therefore, see set_current_state().
*/
#define __set_task_state(tsk, state_value) \
do { (tsk)->state = (state_value); } while (0)
#define set_task_state(tsk, state_value) \
smp_store_mb((tsk)->state, (state_value))
/*
* set_current_state() includes a barrier so that the write of current->state
* is correctly serialised wrt the caller's subsequent test of whether to
@ -1018,8 +993,8 @@ enum cpu_idle_type {
*
* The DEFINE_WAKE_Q macro declares and initializes the list head.
* wake_up_q() does NOT reinitialize the list; it's expected to be
* called near the end of a function, where the fact that the queue is
* not used again will be easy to see by inspection.
* called near the end of a function. Otherwise, the list can be
* re-initialized for later re-use by wake_q_init().
*
* Note that this can cause spurious wakeups. schedule() callers
* must ensure the call is done inside a loop, confirming that the
@ -1039,6 +1014,12 @@ struct wake_q_head {
#define DEFINE_WAKE_Q(name) \
struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
static inline void wake_q_init(struct wake_q_head *head)
{
head->first = WAKE_Q_TAIL;
head->lastp = &head->first;
}
extern void wake_q_add(struct wake_q_head *head,
struct task_struct *task);
extern void wake_up_q(struct wake_q_head *head);
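A small sketch of the pattern the new wake_q_init() enables: wakeups are queued while a lock is held, issued after it is dropped, and the head is reinitialized before the same function queues more wakeups, as __rwsem_down_write_failed_common() below now does. The lock and tasks here are hypothetical:

#include <linux/sched.h>
#include <linux/spinlock.h>

static void wake_in_two_batches(spinlock_t *lock,
				struct task_struct *t1,
				struct task_struct *t2)
{
	DEFINE_WAKE_Q(wake_q);

	spin_lock(lock);
	wake_q_add(&wake_q, t1);	/* defer the wakeup while locked */
	spin_unlock(lock);

	wake_up_q(&wake_q);		/* issues wakeups, does NOT reinit the head */
	wake_q_init(&wake_q);		/* required before reusing the same head */

	spin_lock(lock);
	wake_q_add(&wake_q, t2);
	spin_unlock(lock);

	wake_up_q(&wake_q);
}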

View File

@ -180,8 +180,6 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define raw_spin_lock_nested(lock, subclass) \
_raw_spin_lock_nested(lock, subclass)
# define raw_spin_lock_bh_nested(lock, subclass) \
_raw_spin_lock_bh_nested(lock, subclass)
# define raw_spin_lock_nest_lock(lock, nest_lock) \
do { \
@ -197,7 +195,6 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
# define raw_spin_lock_nested(lock, subclass) \
_raw_spin_lock(((void)(subclass), (lock)))
# define raw_spin_lock_nest_lock(lock, nest_lock) _raw_spin_lock(lock)
# define raw_spin_lock_bh_nested(lock, subclass) _raw_spin_lock_bh(lock)
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
@ -317,11 +314,6 @@ do { \
raw_spin_lock_nested(spinlock_check(lock), subclass); \
} while (0)
#define spin_lock_bh_nested(lock, subclass) \
do { \
raw_spin_lock_bh_nested(spinlock_check(lock), subclass);\
} while (0)
#define spin_lock_nest_lock(lock, nest_lock) \
do { \
raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \

View File

@ -22,8 +22,6 @@ int in_lock_functions(unsigned long addr);
void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) __acquires(lock);
void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
__acquires(lock);
void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
__acquires(lock);
void __lockfunc
_raw_spin_lock_nest_lock(raw_spinlock_t *lock, struct lockdep_map *map)
__acquires(lock);

View File

@ -57,7 +57,6 @@
#define _raw_spin_lock(lock) __LOCK(lock)
#define _raw_spin_lock_nested(lock, subclass) __LOCK(lock)
#define _raw_spin_lock_bh_nested(lock, subclass) __LOCK(lock)
#define _raw_read_lock(lock) __LOCK(lock)
#define _raw_write_lock(lock) __LOCK(lock)
#define _raw_spin_lock_bh(lock) __LOCK_BH(lock)

View File

@ -198,7 +198,7 @@ static inline struct cache_head *cache_get(struct cache_head *h)
static inline void cache_put(struct cache_head *h, struct cache_detail *cd)
{
if (atomic_read(&h->ref.refcount) <= 2 &&
if (kref_read(&h->ref) <= 2 &&
h->expiry_time < cd->nextcheck)
cd->nextcheck = h->expiry_time;
kref_put(&h->ref, cd->cache_put);

View File

@ -51,10 +51,10 @@ struct ww_mutex {
};
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define __WW_CLASS_MUTEX_INITIALIZER(lockname, ww_class) \
, .ww_class = &ww_class
# define __WW_CLASS_MUTEX_INITIALIZER(lockname, class) \
, .ww_class = class
#else
# define __WW_CLASS_MUTEX_INITIALIZER(lockname, ww_class)
# define __WW_CLASS_MUTEX_INITIALIZER(lockname, class)
#endif
#define __WW_CLASS_INITIALIZER(ww_class) \
@ -63,7 +63,7 @@ struct ww_mutex {
, .mutex_name = #ww_class "_mutex" }
#define __WW_MUTEX_INITIALIZER(lockname, class) \
{ .base = { \__MUTEX_INITIALIZER(lockname) } \
{ .base = __MUTEX_INITIALIZER(lockname.base) \
__WW_CLASS_MUTEX_INITIALIZER(lockname, class) }
#define DEFINE_WW_CLASS(classname) \
@ -186,11 +186,6 @@ static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx)
#endif
}
extern int __must_check __ww_mutex_lock(struct ww_mutex *lock,
struct ww_acquire_ctx *ctx);
extern int __must_check __ww_mutex_lock_interruptible(struct ww_mutex *lock,
struct ww_acquire_ctx *ctx);
/**
* ww_mutex_lock - acquire the w/w mutex
* @lock: the mutex to be acquired
@ -220,14 +215,7 @@ extern int __must_check __ww_mutex_lock_interruptible(struct ww_mutex *lock,
*
* A mutex acquired with this function must be released with ww_mutex_unlock.
*/
static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
if (ctx)
return __ww_mutex_lock(lock, ctx);
mutex_lock(&lock->base);
return 0;
}
extern int /* __must_check */ ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx);
/**
* ww_mutex_lock_interruptible - acquire the w/w mutex, interruptible
@ -259,14 +247,8 @@ static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ct
*
* A mutex acquired with this function must be released with ww_mutex_unlock.
*/
static inline int __must_check ww_mutex_lock_interruptible(struct ww_mutex *lock,
struct ww_acquire_ctx *ctx)
{
if (ctx)
return __ww_mutex_lock_interruptible(lock, ctx);
else
return mutex_lock_interruptible(&lock->base);
}
extern int __must_check ww_mutex_lock_interruptible(struct ww_mutex *lock,
struct ww_acquire_ctx *ctx);
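Both lock calls are now out of line and, as the mutex.c changes further down show, accept a NULL acquire context, in which case they degrade to a plain (or interruptible) mutex lock of lock->base with no wound/wait handling. A minimal sketch; the class and lock names are hypothetical:

#include <linux/ww_mutex.h>

static DEFINE_WW_CLASS(my_ww_class);
static DEFINE_WW_MUTEX(my_lock, &my_ww_class);

static void single_lock_user(void)
{
	/* No context: behaves like mutex_lock()/mutex_unlock() on my_lock.base. */
	ww_mutex_lock(&my_lock, NULL);
	/* ... critical section ... */
	ww_mutex_unlock(&my_lock);
}

Multi-lock users keep passing a ww_acquire_ctx and handling -EDEADLK, as the locktorture and test-ww_mutex additions later in this diff illustrate.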
/**
* ww_mutex_lock_slow - slowpath acquiring of the w/w mutex

View File

@ -987,7 +987,7 @@ static inline void hci_conn_drop(struct hci_conn *conn)
static inline void hci_dev_put(struct hci_dev *d)
{
BT_DBG("%s orig refcnt %d", d->name,
atomic_read(&d->dev.kobj.kref.refcount));
kref_read(&d->dev.kobj.kref));
put_device(&d->dev);
}
@ -995,7 +995,7 @@ static inline void hci_dev_put(struct hci_dev *d)
static inline struct hci_dev *hci_dev_hold(struct hci_dev *d)
{
BT_DBG("%s orig refcnt %d", d->name,
atomic_read(&d->dev.kobj.kref.refcount));
kref_read(&d->dev.kobj.kref));
get_device(&d->dev);
return d;

View File

@ -23,9 +23,7 @@ int version_string(LINUX_VERSION_CODE);
#endif
struct uts_namespace init_uts_ns = {
.kref = {
.refcount = ATOMIC_INIT(2),
},
.kref = KREF_INIT(2),
.name = {
.sysname = UTS_SYSNAME,
.nodename = UTS_NODENAME,

View File

@ -55,6 +55,7 @@
#include <linux/shm.h>
#include <linux/kcov.h>
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@ -282,6 +283,35 @@ retry:
return task;
}
void rcuwait_wake_up(struct rcuwait *w)
{
struct task_struct *task;
rcu_read_lock();
/*
* Order condition vs @task, such that everything prior to the load
* of @task is visible. This is the condition as to why the user called
* rcuwait_wake_up() in the first place. Pairs with set_current_state()
* barrier (A) in rcuwait_wait_event().
*
* WAIT WAKE
* [S] tsk = current [S] cond = true
* MB (A) MB (B)
* [L] cond [L] tsk
*/
smp_rmb(); /* (B) */
/*
* Avoid using task_rcu_dereference() magic as long as we are careful,
* see comment in rcuwait_wait_event() regarding ->exit_state.
*/
task = rcu_dereference(w->task);
if (task)
wake_up_process(task);
rcu_read_unlock();
}
struct task_struct *try_get_task_struct(struct task_struct **ptask)
{
struct task_struct *task;
@ -468,12 +498,12 @@ assign_new_owner:
* Turn us into a lazy TLB process if we
* aren't already..
*/
static void exit_mm(struct task_struct *tsk)
static void exit_mm(void)
{
struct mm_struct *mm = tsk->mm;
struct mm_struct *mm = current->mm;
struct core_state *core_state;
mm_release(tsk, mm);
mm_release(current, mm);
if (!mm)
return;
sync_mm_rss(mm);
@ -491,7 +521,7 @@ static void exit_mm(struct task_struct *tsk)
up_read(&mm->mmap_sem);
self.task = tsk;
self.task = current;
self.next = xchg(&core_state->dumper.next, &self);
/*
* Implies mb(), the result of xchg() must be visible
@ -501,22 +531,22 @@ static void exit_mm(struct task_struct *tsk)
complete(&core_state->startup);
for (;;) {
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
set_current_state(TASK_UNINTERRUPTIBLE);
if (!self.task) /* see coredump_finish() */
break;
freezable_schedule();
}
__set_task_state(tsk, TASK_RUNNING);
__set_current_state(TASK_RUNNING);
down_read(&mm->mmap_sem);
}
atomic_inc(&mm->mm_count);
BUG_ON(mm != tsk->active_mm);
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
task_lock(tsk);
tsk->mm = NULL;
task_lock(current);
current->mm = NULL;
up_read(&mm->mmap_sem);
enter_lazy_tlb(mm, current);
task_unlock(tsk);
task_unlock(current);
mm_update_next_owner(mm);
mmput(mm);
if (test_thread_flag(TIF_MEMDIE))
@ -823,7 +853,7 @@ void __noreturn do_exit(long code)
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
exit_mm(tsk);
exit_mm();
if (group_dead)
acct_process();

View File

@ -432,11 +432,13 @@ void __init fork_init(void)
int i;
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
#define ARCH_MIN_TASKALIGN 0
#endif
int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
/* create a slab on which task_structs can be allocated */
task_struct_cachep = kmem_cache_create("task_struct",
arch_task_struct_size, ARCH_MIN_TASKALIGN,
arch_task_struct_size, align,
SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL);
#endif

View File

@ -28,3 +28,4 @@ obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o

View File

@ -2203,7 +2203,7 @@ cache_hit:
* Important for check_no_collision().
*/
if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) {
if (debug_locks_off_graph_unlock())
if (!debug_locks_off_graph_unlock())
return 0;
print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!");

View File

@ -372,6 +372,78 @@ static struct lock_torture_ops mutex_lock_ops = {
.name = "mutex_lock"
};
#include <linux/ww_mutex.h>
static DEFINE_WW_CLASS(torture_ww_class);
static DEFINE_WW_MUTEX(torture_ww_mutex_0, &torture_ww_class);
static DEFINE_WW_MUTEX(torture_ww_mutex_1, &torture_ww_class);
static DEFINE_WW_MUTEX(torture_ww_mutex_2, &torture_ww_class);
static int torture_ww_mutex_lock(void)
__acquires(torture_ww_mutex_0)
__acquires(torture_ww_mutex_1)
__acquires(torture_ww_mutex_2)
{
LIST_HEAD(list);
struct reorder_lock {
struct list_head link;
struct ww_mutex *lock;
} locks[3], *ll, *ln;
struct ww_acquire_ctx ctx;
locks[0].lock = &torture_ww_mutex_0;
list_add(&locks[0].link, &list);
locks[1].lock = &torture_ww_mutex_1;
list_add(&locks[1].link, &list);
locks[2].lock = &torture_ww_mutex_2;
list_add(&locks[2].link, &list);
ww_acquire_init(&ctx, &torture_ww_class);
list_for_each_entry(ll, &list, link) {
int err;
err = ww_mutex_lock(ll->lock, &ctx);
if (!err)
continue;
ln = ll;
list_for_each_entry_continue_reverse(ln, &list, link)
ww_mutex_unlock(ln->lock);
if (err != -EDEADLK)
return err;
ww_mutex_lock_slow(ll->lock, &ctx);
list_move(&ll->link, &list);
}
ww_acquire_fini(&ctx);
return 0;
}
static void torture_ww_mutex_unlock(void)
__releases(torture_ww_mutex_0)
__releases(torture_ww_mutex_1)
__releases(torture_ww_mutex_2)
{
ww_mutex_unlock(&torture_ww_mutex_0);
ww_mutex_unlock(&torture_ww_mutex_1);
ww_mutex_unlock(&torture_ww_mutex_2);
}
static struct lock_torture_ops ww_mutex_lock_ops = {
.writelock = torture_ww_mutex_lock,
.write_delay = torture_mutex_delay,
.task_boost = torture_boost_dummy,
.writeunlock = torture_ww_mutex_unlock,
.readlock = NULL,
.read_delay = NULL,
.readunlock = NULL,
.name = "ww_mutex_lock"
};
#ifdef CONFIG_RT_MUTEXES
static DEFINE_RT_MUTEX(torture_rtmutex);
@ -797,6 +869,7 @@ static int __init lock_torture_init(void)
&spin_lock_ops, &spin_lock_irq_ops,
&rw_lock_ops, &rw_lock_irq_ops,
&mutex_lock_ops,
&ww_mutex_lock_ops,
#ifdef CONFIG_RT_MUTEXES
&rtmutex_lock_ops,
#endif

View File

@ -26,20 +26,3 @@ extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
extern void debug_mutex_unlock(struct mutex *lock);
extern void debug_mutex_init(struct mutex *lock, const char *name,
struct lock_class_key *key);
#define spin_lock_mutex(lock, flags) \
do { \
struct mutex *l = container_of(lock, struct mutex, wait_lock); \
\
DEBUG_LOCKS_WARN_ON(in_interrupt()); \
local_irq_save(flags); \
arch_spin_lock(&(lock)->rlock.raw_lock);\
DEBUG_LOCKS_WARN_ON(l->magic != l); \
} while (0)
#define spin_unlock_mutex(lock, flags) \
do { \
arch_spin_unlock(&(lock)->rlock.raw_lock); \
local_irq_restore(flags); \
preempt_check_resched(); \
} while (0)

View File

@ -50,16 +50,17 @@ EXPORT_SYMBOL(__mutex_init);
/*
* @owner: contains: 'struct task_struct *' to the current lock owner,
* NULL means not owned. Since task_struct pointers are aligned at
* ARCH_MIN_TASKALIGN (which is at least sizeof(void *)), we have low
* bits to store extra state.
* at least L1_CACHE_BYTES, we have low bits to store extra state.
*
* Bit0 indicates a non-empty waiter list; unlock must issue a wakeup.
* Bit1 indicates unlock needs to hand the lock to the top-waiter
* Bit2 indicates handoff has been done and we're waiting for pickup.
*/
#define MUTEX_FLAG_WAITERS 0x01
#define MUTEX_FLAG_HANDOFF 0x02
#define MUTEX_FLAG_PICKUP 0x04
#define MUTEX_FLAGS 0x03
#define MUTEX_FLAGS 0x07
static inline struct task_struct *__owner_task(unsigned long owner)
{
@ -72,38 +73,29 @@ static inline unsigned long __owner_flags(unsigned long owner)
}
/*
* Actual trylock that will work on any unlocked state.
*
* When setting the owner field, we must preserve the low flag bits.
*
* Be careful with @handoff, only set that in a wait-loop (where you set
* HANDOFF) to avoid recursive lock attempts.
* Trylock variant that returns the owning task on failure.
*/
static inline bool __mutex_trylock(struct mutex *lock, const bool handoff)
static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock)
{
unsigned long owner, curr = (unsigned long)current;
owner = atomic_long_read(&lock->owner);
for (;;) { /* must loop, can race against a flag */
unsigned long old, flags = __owner_flags(owner);
unsigned long task = owner & ~MUTEX_FLAGS;
if (__owner_task(owner)) {
if (handoff && unlikely(__owner_task(owner) == current)) {
/*
* Provide ACQUIRE semantics for the lock-handoff.
*
* We cannot easily use load-acquire here, since
* the actual load is a failed cmpxchg, which
* doesn't imply any barriers.
*
* Also, this is a fairly unlikely scenario, and
* this contains the cost.
*/
smp_mb(); /* ACQUIRE */
return true;
}
if (task) {
if (likely(task != curr))
break;
return false;
if (likely(!(flags & MUTEX_FLAG_PICKUP)))
break;
flags &= ~MUTEX_FLAG_PICKUP;
} else {
#ifdef CONFIG_DEBUG_MUTEXES
DEBUG_LOCKS_WARN_ON(flags & MUTEX_FLAG_PICKUP);
#endif
}
/*
@ -111,15 +103,24 @@ static inline bool __mutex_trylock(struct mutex *lock, const bool handoff)
* past the point where we acquire it. This would be possible
* if we (accidentally) set the bit on an unlocked mutex.
*/
if (handoff)
flags &= ~MUTEX_FLAG_HANDOFF;
flags &= ~MUTEX_FLAG_HANDOFF;
old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags);
if (old == owner)
return true;
return NULL;
owner = old;
}
return __owner_task(owner);
}
/*
* Actual trylock that will work on any unlocked state.
*/
static inline bool __mutex_trylock(struct mutex *lock)
{
return !__mutex_trylock_or_owner(lock);
}
#ifndef CONFIG_DEBUG_LOCK_ALLOC
@ -171,9 +172,9 @@ static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_wait
/*
* Give up ownership to a specific task, when @task = NULL, this is equivalent
* to a regular unlock. Clears HANDOFF, preserves WAITERS. Provides RELEASE
* semantics like a regular unlock, the __mutex_trylock() provides matching
* ACQUIRE semantics for the handoff.
* to a regular unlock. Sets PICKUP on a handoff, clears HANDOFF, preserves
* WAITERS. Provides RELEASE semantics like a regular unlock, the
* __mutex_trylock() provides a matching ACQUIRE semantics for the handoff.
*/
static void __mutex_handoff(struct mutex *lock, struct task_struct *task)
{
@ -184,10 +185,13 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task)
#ifdef CONFIG_DEBUG_MUTEXES
DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP);
#endif
new = (owner & MUTEX_FLAG_WAITERS);
new |= (unsigned long)task;
if (task)
new |= MUTEX_FLAG_PICKUP;
old = atomic_long_cmpxchg_release(&lock->owner, owner, new);
if (old == owner)
@ -237,8 +241,8 @@ void __sched mutex_lock(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock);
#endif
static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
struct ww_acquire_ctx *ww_ctx)
static __always_inline void
ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
{
#ifdef CONFIG_DEBUG_MUTEXES
/*
@ -277,17 +281,50 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
ww_ctx->acquired++;
}
static inline bool __sched
__ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
{
return a->stamp - b->stamp <= LONG_MAX &&
(a->stamp != b->stamp || a > b);
}
/*
* Wake up any waiters that may have to back off when the lock is held by the
* given context.
*
* Due to the invariants on the wait list, this can only affect the first
* waiter with a context.
*
* The current task must not be on the wait list.
*/
static void __sched
__ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
{
struct mutex_waiter *cur;
lockdep_assert_held(&lock->wait_lock);
list_for_each_entry(cur, &lock->wait_list, list) {
if (!cur->ww_ctx)
continue;
if (cur->ww_ctx->acquired > 0 &&
__ww_ctx_stamp_after(cur->ww_ctx, ww_ctx)) {
debug_mutex_wake_waiter(lock, cur);
wake_up_process(cur->task);
}
break;
}
}
/*
* After acquiring lock with fastpath or when we lost out in contested
* slowpath, set ctx and wake up any waiters so they can recheck.
*/
static __always_inline void
ww_mutex_set_context_fastpath(struct ww_mutex *lock,
struct ww_acquire_ctx *ctx)
ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
unsigned long flags;
struct mutex_waiter *cur;
ww_mutex_lock_acquired(lock, ctx);
lock->ctx = ctx;
@ -311,46 +348,79 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock,
* Uh oh, we raced in fastpath, wake up everyone in this case,
* so they can see the new lock->ctx.
*/
spin_lock_mutex(&lock->base.wait_lock, flags);
list_for_each_entry(cur, &lock->base.wait_list, list) {
debug_mutex_wake_waiter(&lock->base, cur);
wake_up_process(cur->task);
}
spin_unlock_mutex(&lock->base.wait_lock, flags);
spin_lock(&lock->base.wait_lock);
__ww_mutex_wakeup_for_backoff(&lock->base, ctx);
spin_unlock(&lock->base.wait_lock);
}
/*
* After acquiring lock in the slowpath set ctx and wake up any
* waiters so they can recheck.
* After acquiring lock in the slowpath set ctx.
*
* Unlike for the fast path, the caller ensures that waiters are woken up where
* necessary.
*
* Callers must hold the mutex wait_lock.
*/
static __always_inline void
ww_mutex_set_context_slowpath(struct ww_mutex *lock,
struct ww_acquire_ctx *ctx)
ww_mutex_set_context_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
struct mutex_waiter *cur;
ww_mutex_lock_acquired(lock, ctx);
lock->ctx = ctx;
/*
* Give any possible sleeping processes the chance to wake up,
* so they can recheck if they have to back off.
*/
list_for_each_entry(cur, &lock->base.wait_list, list) {
debug_mutex_wake_waiter(&lock->base, cur);
wake_up_process(cur->task);
}
}
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
static inline
bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
struct mutex_waiter *waiter)
{
struct ww_mutex *ww;
ww = container_of(lock, struct ww_mutex, base);
/*
* If ww->ctx is set the contents are undefined; only
* by acquiring wait_lock is there a guarantee that
* they are not invalid when reading.
*
* As such, when deadlock detection needs to be
* performed the optimistic spinning cannot be done.
*
* Check this in every inner iteration because we may
* be racing against another thread's ww_mutex_lock.
*/
if (ww_ctx->acquired > 0 && READ_ONCE(ww->ctx))
return false;
/*
* If we aren't on the wait list yet, cancel the spin
* if there are waiters. We want to avoid stealing the
* lock from a waiter with an earlier stamp, since the
* other thread may already own a lock that we also
* need.
*/
if (!waiter && (atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS))
return false;
/*
* Similarly, stop spinning if we are no longer the
* first waiter.
*/
if (waiter && !__mutex_waiter_is_first(lock, waiter))
return false;
return true;
}
/*
* Look out! "owner" is an entirely speculative pointer
* access and not reliable.
* Look out! "owner" is an entirely speculative pointer access and not
* reliable.
*
* "noinline" so that this function shows up on perf profiles.
*/
static noinline
bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner,
struct ww_acquire_ctx *ww_ctx, struct mutex_waiter *waiter)
{
bool ret = true;
@ -373,6 +443,11 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
break;
}
if (ww_ctx && !ww_mutex_spin_on_owner(lock, ww_ctx, waiter)) {
ret = false;
break;
}
cpu_relax();
}
rcu_read_unlock();
@ -431,12 +506,10 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
* with the spinner at the head of the OSQ, if present, until the owner is
* changed to itself.
*/
static bool mutex_optimistic_spin(struct mutex *lock,
struct ww_acquire_ctx *ww_ctx,
const bool use_ww_ctx, const bool waiter)
static __always_inline bool
mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
const bool use_ww_ctx, struct mutex_waiter *waiter)
{
struct task_struct *task = current;
if (!waiter) {
/*
* The purpose of the mutex_can_spin_on_owner() function is
@ -460,40 +533,17 @@ static bool mutex_optimistic_spin(struct mutex *lock,
for (;;) {
struct task_struct *owner;
if (use_ww_ctx && ww_ctx->acquired > 0) {
struct ww_mutex *ww;
ww = container_of(lock, struct ww_mutex, base);
/*
* If ww->ctx is set the contents are undefined, only
* by acquiring wait_lock there is a guarantee that
* they are not invalid when reading.
*
* As such, when deadlock detection needs to be
* performed the optimistic spinning cannot be done.
*/
if (READ_ONCE(ww->ctx))
goto fail_unlock;
}
/* Try to acquire the mutex... */
owner = __mutex_trylock_or_owner(lock);
if (!owner)
break;
/*
* If there's an owner, wait for it to either
* There's an owner, wait for it to either
* release the lock or go to sleep.
*/
owner = __mutex_owner(lock);
if (owner) {
if (waiter && owner == task) {
smp_mb(); /* ACQUIRE */
break;
}
if (!mutex_spin_on_owner(lock, owner))
goto fail_unlock;
}
/* Try to acquire the mutex if it is unlocked. */
if (__mutex_trylock(lock, waiter))
break;
if (!mutex_spin_on_owner(lock, owner, ww_ctx, waiter))
goto fail_unlock;
/*
* The cpu_relax() call is a compiler barrier which forces
@ -532,9 +582,9 @@ fail:
return false;
}
#else
static bool mutex_optimistic_spin(struct mutex *lock,
struct ww_acquire_ctx *ww_ctx,
const bool use_ww_ctx, const bool waiter)
static __always_inline bool
mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
const bool use_ww_ctx, struct mutex_waiter *waiter)
{
return false;
}
@ -594,24 +644,89 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock)
EXPORT_SYMBOL(ww_mutex_unlock);
static inline int __sched
__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
__ww_mutex_lock_check_stamp(struct mutex *lock, struct mutex_waiter *waiter,
struct ww_acquire_ctx *ctx)
{
struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
struct mutex_waiter *cur;
if (!hold_ctx)
return 0;
if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx))
goto deadlock;
if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
(ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
#ifdef CONFIG_DEBUG_MUTEXES
DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
ctx->contending_lock = ww;
#endif
return -EDEADLK;
/*
* If there is a waiter in front of us that has a context, then its
* stamp is earlier than ours and we must back off.
*/
cur = waiter;
list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) {
if (cur->ww_ctx)
goto deadlock;
}
return 0;
deadlock:
#ifdef CONFIG_DEBUG_MUTEXES
DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
ctx->contending_lock = ww;
#endif
return -EDEADLK;
}
static inline int __sched
__ww_mutex_add_waiter(struct mutex_waiter *waiter,
struct mutex *lock,
struct ww_acquire_ctx *ww_ctx)
{
struct mutex_waiter *cur;
struct list_head *pos;
if (!ww_ctx) {
list_add_tail(&waiter->list, &lock->wait_list);
return 0;
}
/*
* Add the waiter before the first waiter with a higher stamp.
* Waiters without a context are skipped to avoid starving
* them.
*/
pos = &lock->wait_list;
list_for_each_entry_reverse(cur, &lock->wait_list, list) {
if (!cur->ww_ctx)
continue;
if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) {
/* Back off immediately if necessary. */
if (ww_ctx->acquired > 0) {
#ifdef CONFIG_DEBUG_MUTEXES
struct ww_mutex *ww;
ww = container_of(lock, struct ww_mutex, base);
DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
ww_ctx->contending_lock = ww;
#endif
return -EDEADLK;
}
break;
}
pos = &cur->list;
/*
* Wake up the waiter so that it gets a chance to back
* off.
*/
if (cur->ww_ctx->acquired > 0) {
debug_mutex_wake_waiter(lock, cur);
wake_up_process(cur->task);
}
}
list_add_tail(&waiter->list, pos);
return 0;
}
/*
@ -622,15 +737,15 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
struct lockdep_map *nest_lock, unsigned long ip,
struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
{
struct task_struct *task = current;
struct mutex_waiter waiter;
unsigned long flags;
bool first = false;
struct ww_mutex *ww;
int ret;
if (use_ww_ctx) {
ww = container_of(lock, struct ww_mutex, base);
might_sleep();
ww = container_of(lock, struct ww_mutex, base);
if (use_ww_ctx && ww_ctx) {
if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
return -EALREADY;
}
@ -638,36 +753,54 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
preempt_disable();
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
if (__mutex_trylock(lock, false) ||
mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, false)) {
if (__mutex_trylock(lock) ||
mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, NULL)) {
/* got the lock, yay! */
lock_acquired(&lock->dep_map, ip);
if (use_ww_ctx)
if (use_ww_ctx && ww_ctx)
ww_mutex_set_context_fastpath(ww, ww_ctx);
preempt_enable();
return 0;
}
spin_lock_mutex(&lock->wait_lock, flags);
spin_lock(&lock->wait_lock);
/*
* After waiting to acquire the wait_lock, try again.
*/
if (__mutex_trylock(lock, false))
if (__mutex_trylock(lock)) {
if (use_ww_ctx && ww_ctx)
__ww_mutex_wakeup_for_backoff(lock, ww_ctx);
goto skip_wait;
}
debug_mutex_lock_common(lock, &waiter);
debug_mutex_add_waiter(lock, &waiter, task);
debug_mutex_add_waiter(lock, &waiter, current);
/* add waiting tasks to the end of the waitqueue (FIFO): */
list_add_tail(&waiter.list, &lock->wait_list);
waiter.task = task;
lock_contended(&lock->dep_map, ip);
if (!use_ww_ctx) {
/* add waiting tasks to the end of the waitqueue (FIFO): */
list_add_tail(&waiter.list, &lock->wait_list);
#ifdef CONFIG_DEBUG_MUTEXES
waiter.ww_ctx = MUTEX_POISON_WW_CTX;
#endif
} else {
/* Add in stamp order, waking up waiters that must back off. */
ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx);
if (ret)
goto err_early_backoff;
waiter.ww_ctx = ww_ctx;
}
waiter.task = current;
if (__mutex_waiter_is_first(lock, &waiter))
__mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
lock_contended(&lock->dep_map, ip);
set_task_state(task, state);
set_current_state(state);
for (;;) {
/*
* Once we hold wait_lock, we're serialized against
@ -675,7 +808,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* before testing the error conditions to make sure we pick up
* the handoff.
*/
if (__mutex_trylock(lock, first))
if (__mutex_trylock(lock))
goto acquired;
/*
@ -683,42 +816,47 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* wait_lock. This ensures the lock cancellation is ordered
* against mutex_unlock() and wake-ups do not go missing.
*/
if (unlikely(signal_pending_state(state, task))) {
if (unlikely(signal_pending_state(state, current))) {
ret = -EINTR;
goto err;
}
if (use_ww_ctx && ww_ctx->acquired > 0) {
ret = __ww_mutex_lock_check_stamp(lock, ww_ctx);
if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) {
ret = __ww_mutex_lock_check_stamp(lock, &waiter, ww_ctx);
if (ret)
goto err;
}
spin_unlock_mutex(&lock->wait_lock, flags);
spin_unlock(&lock->wait_lock);
schedule_preempt_disabled();
if (!first && __mutex_waiter_is_first(lock, &waiter)) {
first = true;
__mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
/*
* ww_mutex needs to always recheck its position since its waiter
* list is not FIFO ordered.
*/
if ((use_ww_ctx && ww_ctx) || !first) {
first = __mutex_waiter_is_first(lock, &waiter);
if (first)
__mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
}
set_task_state(task, state);
set_current_state(state);
/*
* Here we order against unlock; we must either see it change
* state back to RUNNING and fall through the next schedule(),
* or we must see its unlock and acquire.
*/
if ((first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, true)) ||
__mutex_trylock(lock, first))
if (__mutex_trylock(lock) ||
(first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, &waiter)))
break;
spin_lock_mutex(&lock->wait_lock, flags);
spin_lock(&lock->wait_lock);
}
spin_lock_mutex(&lock->wait_lock, flags);
spin_lock(&lock->wait_lock);
acquired:
__set_task_state(task, TASK_RUNNING);
__set_current_state(TASK_RUNNING);
mutex_remove_waiter(lock, &waiter, task);
mutex_remove_waiter(lock, &waiter, current);
if (likely(list_empty(&lock->wait_list)))
__mutex_clear_flag(lock, MUTEX_FLAGS);
@ -728,30 +866,44 @@ skip_wait:
/* got the lock - cleanup and rejoice! */
lock_acquired(&lock->dep_map, ip);
if (use_ww_ctx)
if (use_ww_ctx && ww_ctx)
ww_mutex_set_context_slowpath(ww, ww_ctx);
spin_unlock_mutex(&lock->wait_lock, flags);
spin_unlock(&lock->wait_lock);
preempt_enable();
return 0;
err:
__set_task_state(task, TASK_RUNNING);
mutex_remove_waiter(lock, &waiter, task);
spin_unlock_mutex(&lock->wait_lock, flags);
__set_current_state(TASK_RUNNING);
mutex_remove_waiter(lock, &waiter, current);
err_early_backoff:
spin_unlock(&lock->wait_lock);
debug_mutex_free_waiter(&waiter);
mutex_release(&lock->dep_map, 1, ip);
preempt_enable();
return ret;
}
static int __sched
__mutex_lock(struct mutex *lock, long state, unsigned int subclass,
struct lockdep_map *nest_lock, unsigned long ip)
{
return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false);
}
static int __sched
__ww_mutex_lock(struct mutex *lock, long state, unsigned int subclass,
struct lockdep_map *nest_lock, unsigned long ip,
struct ww_acquire_ctx *ww_ctx)
{
return __mutex_lock_common(lock, state, subclass, nest_lock, ip, ww_ctx, true);
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void __sched
mutex_lock_nested(struct mutex *lock, unsigned int subclass)
{
might_sleep();
__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
subclass, NULL, _RET_IP_, NULL, 0);
__mutex_lock(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_nested);
@ -759,27 +911,21 @@ EXPORT_SYMBOL_GPL(mutex_lock_nested);
void __sched
_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
{
might_sleep();
__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
0, nest, _RET_IP_, NULL, 0);
__mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_);
}
EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
int __sched
mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
{
might_sleep();
return __mutex_lock_common(lock, TASK_KILLABLE,
subclass, NULL, _RET_IP_, NULL, 0);
return __mutex_lock(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
int __sched
mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
{
might_sleep();
return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
subclass, NULL, _RET_IP_, NULL, 0);
return __mutex_lock(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@ -824,35 +970,37 @@ ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
}
int __sched
__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
int ret;
might_sleep();
ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
0, &ctx->dep_map, _RET_IP_, ctx, 1);
if (!ret && ctx->acquired > 1)
ret = __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE,
0, ctx ? &ctx->dep_map : NULL, _RET_IP_,
ctx);
if (!ret && ctx && ctx->acquired > 1)
return ww_mutex_deadlock_injection(lock, ctx);
return ret;
}
EXPORT_SYMBOL_GPL(__ww_mutex_lock);
EXPORT_SYMBOL_GPL(ww_mutex_lock);
int __sched
__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
int ret;
might_sleep();
ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
0, &ctx->dep_map, _RET_IP_, ctx, 1);
ret = __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE,
0, ctx ? &ctx->dep_map : NULL, _RET_IP_,
ctx);
if (!ret && ctx->acquired > 1)
if (!ret && ctx && ctx->acquired > 1)
return ww_mutex_deadlock_injection(lock, ctx);
return ret;
}
EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible);
#endif
@ -862,8 +1010,8 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip)
{
struct task_struct *next = NULL;
unsigned long owner, flags;
DEFINE_WAKE_Q(wake_q);
unsigned long owner;
mutex_release(&lock->dep_map, 1, ip);
@ -880,6 +1028,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
#ifdef CONFIG_DEBUG_MUTEXES
DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP);
#endif
if (owner & MUTEX_FLAG_HANDOFF)
@ -897,7 +1046,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
owner = old;
}
spin_lock_mutex(&lock->wait_lock, flags);
spin_lock(&lock->wait_lock);
debug_mutex_unlock(lock);
if (!list_empty(&lock->wait_list)) {
/* get the first entry from the wait-list: */
@ -914,7 +1063,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
if (owner & MUTEX_FLAG_HANDOFF)
__mutex_handoff(lock, next);
spin_unlock_mutex(&lock->wait_lock, flags);
spin_unlock(&lock->wait_lock);
wake_up_q(&wake_q);
}
@ -977,37 +1126,34 @@ EXPORT_SYMBOL_GPL(mutex_lock_io);
static noinline void __sched
__mutex_lock_slowpath(struct mutex *lock)
{
__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0,
NULL, _RET_IP_, NULL, 0);
__mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
}
static noinline int __sched
__mutex_lock_killable_slowpath(struct mutex *lock)
{
return __mutex_lock_common(lock, TASK_KILLABLE, 0,
NULL, _RET_IP_, NULL, 0);
return __mutex_lock(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
}
static noinline int __sched
__mutex_lock_interruptible_slowpath(struct mutex *lock)
{
return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0,
NULL, _RET_IP_, NULL, 0);
return __mutex_lock(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
}
static noinline int __sched
__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0,
NULL, _RET_IP_, ctx, 1);
return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, NULL,
_RET_IP_, ctx);
}
static noinline int __sched
__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
struct ww_acquire_ctx *ctx)
{
return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0,
NULL, _RET_IP_, ctx, 1);
return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, NULL,
_RET_IP_, ctx);
}
#endif
@ -1028,7 +1174,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
*/
int __sched mutex_trylock(struct mutex *lock)
{
bool locked = __mutex_trylock(lock, false);
bool locked = __mutex_trylock(lock);
if (locked)
mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
@ -1039,32 +1185,34 @@ EXPORT_SYMBOL(mutex_trylock);
#ifndef CONFIG_DEBUG_LOCK_ALLOC
int __sched
__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
might_sleep();
if (__mutex_trylock_fast(&lock->base)) {
ww_mutex_set_context_fastpath(lock, ctx);
if (ctx)
ww_mutex_set_context_fastpath(lock, ctx);
return 0;
}
return __ww_mutex_lock_slowpath(lock, ctx);
}
EXPORT_SYMBOL(__ww_mutex_lock);
EXPORT_SYMBOL(ww_mutex_lock);
int __sched
__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
might_sleep();
if (__mutex_trylock_fast(&lock->base)) {
ww_mutex_set_context_fastpath(lock, ctx);
if (ctx)
ww_mutex_set_context_fastpath(lock, ctx);
return 0;
}
return __ww_mutex_lock_interruptible_slowpath(lock, ctx);
}
EXPORT_SYMBOL(__ww_mutex_lock_interruptible);
EXPORT_SYMBOL(ww_mutex_lock_interruptible);
#endif

View File

@ -9,10 +9,6 @@
* !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
*/
#define spin_lock_mutex(lock, flags) \
do { spin_lock(lock); (void)(flags); } while (0)
#define spin_unlock_mutex(lock, flags) \
do { spin_unlock(lock); (void)(flags); } while (0)
#define mutex_remove_waiter(lock, waiter, task) \
__list_del((waiter)->list.prev, (waiter)->list.next)

View File

@ -1,7 +1,6 @@
#include <linux/atomic.h>
#include <linux/rwsem.h>
#include <linux/percpu.h>
#include <linux/wait.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/rcupdate.h>
@ -18,7 +17,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
__init_rwsem(&sem->rw_sem, name, rwsem_key);
init_waitqueue_head(&sem->writer);
rcuwait_init(&sem->writer);
sem->readers_block = 0;
return 0;
}
@ -103,7 +102,7 @@ void __percpu_up_read(struct percpu_rw_semaphore *sem)
__this_cpu_dec(*sem->read_count);
/* Prod writer to recheck readers_active */
wake_up(&sem->writer);
rcuwait_wake_up(&sem->writer);
}
EXPORT_SYMBOL_GPL(__percpu_up_read);
@ -160,7 +159,7 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
*/
/* Wait for all now active readers to complete. */
wait_event(sem->writer, readers_active_check(sem));
rcuwait_wait_event(&sem->writer, readers_active_check(sem));
}
EXPORT_SYMBOL_GPL(percpu_down_write);

View File

@ -263,7 +263,7 @@ pv_wait_early(struct pv_node *prev, int loop)
if ((loop & PV_PREV_CHECK_MASK) != 0)
return false;
return READ_ONCE(prev->state) != vcpu_running;
return READ_ONCE(prev->state) != vcpu_running || vcpu_is_preempted(prev->cpu);
}
/*

View File

@ -1179,7 +1179,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
* TASK_INTERRUPTIBLE checks for signals and
* timeout. Ignored otherwise.
*/
if (unlikely(state == TASK_INTERRUPTIBLE)) {
if (likely(state == TASK_INTERRUPTIBLE)) {
/* Signal pending? */
if (signal_pending(current))
ret = -EINTR;

View File

@ -128,7 +128,6 @@ __rwsem_wake_one_writer(struct rw_semaphore *sem)
void __sched __down_read(struct rw_semaphore *sem)
{
struct rwsem_waiter waiter;
struct task_struct *tsk;
unsigned long flags;
raw_spin_lock_irqsave(&sem->wait_lock, flags);
@ -140,13 +139,12 @@ void __sched __down_read(struct rw_semaphore *sem)
goto out;
}
tsk = current;
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
set_current_state(TASK_UNINTERRUPTIBLE);
/* set up my own style of waitqueue */
waiter.task = tsk;
waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_READ;
get_task_struct(tsk);
get_task_struct(current);
list_add_tail(&waiter.list, &sem->wait_list);
@ -158,10 +156,10 @@ void __sched __down_read(struct rw_semaphore *sem)
if (!waiter.task)
break;
schedule();
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
set_current_state(TASK_UNINTERRUPTIBLE);
}
__set_task_state(tsk, TASK_RUNNING);
__set_current_state(TASK_RUNNING);
out:
;
}
@ -194,15 +192,13 @@ int __down_read_trylock(struct rw_semaphore *sem)
int __sched __down_write_common(struct rw_semaphore *sem, int state)
{
struct rwsem_waiter waiter;
struct task_struct *tsk;
unsigned long flags;
int ret = 0;
raw_spin_lock_irqsave(&sem->wait_lock, flags);
/* set up my own style of waitqueue */
tsk = current;
waiter.task = tsk;
waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_WRITE;
list_add_tail(&waiter.list, &sem->wait_list);
@ -220,7 +216,7 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state)
ret = -EINTR;
goto out;
}
set_task_state(tsk, state);
set_current_state(state);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
schedule();
raw_spin_lock_irqsave(&sem->wait_lock, flags);

View File

@ -224,10 +224,9 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
{
long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
struct rwsem_waiter waiter;
struct task_struct *tsk = current;
DEFINE_WAKE_Q(wake_q);
waiter.task = tsk;
waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_READ;
raw_spin_lock_irq(&sem->wait_lock);
@ -254,13 +253,13 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
/* wait to be given the lock */
while (true) {
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
set_current_state(TASK_UNINTERRUPTIBLE);
if (!waiter.task)
break;
schedule();
}
__set_task_state(tsk, TASK_RUNNING);
__set_current_state(TASK_RUNNING);
return sem;
}
EXPORT_SYMBOL(rwsem_down_read_failed);
@ -503,8 +502,6 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
* wake any read locks that were queued ahead of us.
*/
if (count > RWSEM_WAITING_BIAS) {
DEFINE_WAKE_Q(wake_q);
__rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
/*
* The wakeup is normally called _after_ the wait_lock
@ -514,6 +511,11 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
* for attempting rwsem_try_write_lock().
*/
wake_up_q(&wake_q);
/*
* Reinitialize wake_q after use.
*/
wake_q_init(&wake_q);
}
} else

View File

@ -204,19 +204,18 @@ struct semaphore_waiter {
static inline int __sched __down_common(struct semaphore *sem, long state,
long timeout)
{
struct task_struct *task = current;
struct semaphore_waiter waiter;
list_add_tail(&waiter.list, &sem->wait_list);
waiter.task = task;
waiter.task = current;
waiter.up = false;
for (;;) {
if (signal_pending_state(state, task))
if (signal_pending_state(state, current))
goto interrupted;
if (unlikely(timeout <= 0))
goto timed_out;
__set_task_state(task, state);
__set_current_state(state);
raw_spin_unlock_irq(&sem->lock);
timeout = schedule_timeout(timeout);
raw_spin_lock_irq(&sem->lock);

View File

@ -363,14 +363,6 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
}
EXPORT_SYMBOL(_raw_spin_lock_nested);
void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
{
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
}
EXPORT_SYMBOL(_raw_spin_lock_bh_nested);
unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
int subclass)
{

View File

@ -103,38 +103,14 @@ static inline void debug_spin_unlock(raw_spinlock_t *lock)
lock->owner_cpu = -1;
}
static void __spin_lock_debug(raw_spinlock_t *lock)
{
u64 i;
u64 loops = loops_per_jiffy * HZ;
for (i = 0; i < loops; i++) {
if (arch_spin_trylock(&lock->raw_lock))
return;
__delay(1);
}
/* lockup suspected: */
spin_dump(lock, "lockup suspected");
#ifdef CONFIG_SMP
trigger_all_cpu_backtrace();
#endif
/*
* The trylock above was causing a livelock. Give the lower level arch
* specific lock code a chance to acquire the lock. We have already
* printed a warning/backtrace at this point. The non-debug arch
* specific code might actually succeed in acquiring the lock. If it is
* not successful, the end-result is the same - there is no forward
* progress.
*/
arch_spin_lock(&lock->raw_lock);
}
/*
* We are now relying on the NMI watchdog to detect lockup instead of doing
* the detection here with an unfair lock which can cause problem of its own.
*/
void do_raw_spin_lock(raw_spinlock_t *lock)
{
debug_spin_lock_before(lock);
if (unlikely(!arch_spin_trylock(&lock->raw_lock)))
__spin_lock_debug(lock);
arch_spin_lock(&lock->raw_lock);
debug_spin_lock_after(lock);
}
@ -172,32 +148,6 @@ static void rwlock_bug(rwlock_t *lock, const char *msg)
#define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg)
#if 0 /* __write_lock_debug() can lock up - maybe this can too? */
static void __read_lock_debug(rwlock_t *lock)
{
u64 i;
u64 loops = loops_per_jiffy * HZ;
int print_once = 1;
for (;;) {
for (i = 0; i < loops; i++) {
if (arch_read_trylock(&lock->raw_lock))
return;
__delay(1);
}
/* lockup suspected: */
if (print_once) {
print_once = 0;
printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, "
"%s/%d, %p\n",
raw_smp_processor_id(), current->comm,
current->pid, lock);
dump_stack();
}
}
}
#endif
void do_raw_read_lock(rwlock_t *lock)
{
RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
@ -247,32 +197,6 @@ static inline void debug_write_unlock(rwlock_t *lock)
lock->owner_cpu = -1;
}
#if 0 /* This can cause lockups */
static void __write_lock_debug(rwlock_t *lock)
{
u64 i;
u64 loops = loops_per_jiffy * HZ;
int print_once = 1;
for (;;) {
for (i = 0; i < loops; i++) {
if (arch_write_trylock(&lock->raw_lock))
return;
__delay(1);
}
/* lockup suspected: */
if (print_once) {
print_once = 0;
printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, "
"%s/%d, %p\n",
raw_smp_processor_id(), current->comm,
current->pid, lock);
dump_stack();
}
}
}
#endif
void do_raw_write_lock(rwlock_t *lock)
{
debug_write_lock_before(lock);

View File

@ -0,0 +1,646 @@
/*
* Module-based API test facility for ww_mutexes
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, you can access it online at
* http://www.gnu.org/licenses/gpl-2.0.html.
*/
#include <linux/kernel.h>
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/slab.h>
#include <linux/ww_mutex.h>
static DEFINE_WW_CLASS(ww_class);
struct workqueue_struct *wq;
struct test_mutex {
struct work_struct work;
struct ww_mutex mutex;
struct completion ready, go, done;
unsigned int flags;
};
#define TEST_MTX_SPIN BIT(0)
#define TEST_MTX_TRY BIT(1)
#define TEST_MTX_CTX BIT(2)
#define __TEST_MTX_LAST BIT(3)
static void test_mutex_work(struct work_struct *work)
{
struct test_mutex *mtx = container_of(work, typeof(*mtx), work);
complete(&mtx->ready);
wait_for_completion(&mtx->go);
if (mtx->flags & TEST_MTX_TRY) {
while (!ww_mutex_trylock(&mtx->mutex))
cpu_relax();
} else {
ww_mutex_lock(&mtx->mutex, NULL);
}
complete(&mtx->done);
ww_mutex_unlock(&mtx->mutex);
}
static int __test_mutex(unsigned int flags)
{
#define TIMEOUT (HZ / 16)
struct test_mutex mtx;
struct ww_acquire_ctx ctx;
int ret;
ww_mutex_init(&mtx.mutex, &ww_class);
ww_acquire_init(&ctx, &ww_class);
INIT_WORK_ONSTACK(&mtx.work, test_mutex_work);
init_completion(&mtx.ready);
init_completion(&mtx.go);
init_completion(&mtx.done);
mtx.flags = flags;
schedule_work(&mtx.work);
wait_for_completion(&mtx.ready);
ww_mutex_lock(&mtx.mutex, (flags & TEST_MTX_CTX) ? &ctx : NULL);
complete(&mtx.go);
if (flags & TEST_MTX_SPIN) {
unsigned long timeout = jiffies + TIMEOUT;
ret = 0;
do {
if (completion_done(&mtx.done)) {
ret = -EINVAL;
break;
}
cpu_relax();
} while (time_before(jiffies, timeout));
} else {
ret = wait_for_completion_timeout(&mtx.done, TIMEOUT);
}
ww_mutex_unlock(&mtx.mutex);
ww_acquire_fini(&ctx);
if (ret) {
pr_err("%s(flags=%x): mutual exclusion failure\n",
__func__, flags);
ret = -EINVAL;
}
flush_work(&mtx.work);
destroy_work_on_stack(&mtx.work);
return ret;
#undef TIMEOUT
}
static int test_mutex(void)
{
int ret;
int i;
for (i = 0; i < __TEST_MTX_LAST; i++) {
ret = __test_mutex(i);
if (ret)
return ret;
}
return 0;
}
static int test_aa(void)
{
struct ww_mutex mutex;
struct ww_acquire_ctx ctx;
int ret;
ww_mutex_init(&mutex, &ww_class);
ww_acquire_init(&ctx, &ww_class);
ww_mutex_lock(&mutex, &ctx);
if (ww_mutex_trylock(&mutex)) {
pr_err("%s: trylocked itself!\n", __func__);
ww_mutex_unlock(&mutex);
ret = -EINVAL;
goto out;
}
ret = ww_mutex_lock(&mutex, &ctx);
if (ret != -EALREADY) {
pr_err("%s: missed deadlock for recursing, ret=%d\n",
__func__, ret);
if (!ret)
ww_mutex_unlock(&mutex);
ret = -EINVAL;
goto out;
}
ret = 0;
out:
ww_mutex_unlock(&mutex);
ww_acquire_fini(&ctx);
return ret;
}
struct test_abba {
struct work_struct work;
struct ww_mutex a_mutex;
struct ww_mutex b_mutex;
struct completion a_ready;
struct completion b_ready;
bool resolve;
int result;
};
static void test_abba_work(struct work_struct *work)
{
struct test_abba *abba = container_of(work, typeof(*abba), work);
struct ww_acquire_ctx ctx;
int err;
ww_acquire_init(&ctx, &ww_class);
ww_mutex_lock(&abba->b_mutex, &ctx);
complete(&abba->b_ready);
wait_for_completion(&abba->a_ready);
err = ww_mutex_lock(&abba->a_mutex, &ctx);
if (abba->resolve && err == -EDEADLK) {
ww_mutex_unlock(&abba->b_mutex);
ww_mutex_lock_slow(&abba->a_mutex, &ctx);
err = ww_mutex_lock(&abba->b_mutex, &ctx);
}
if (!err)
ww_mutex_unlock(&abba->a_mutex);
ww_mutex_unlock(&abba->b_mutex);
ww_acquire_fini(&ctx);
abba->result = err;
}
static int test_abba(bool resolve)
{
struct test_abba abba;
struct ww_acquire_ctx ctx;
int err, ret;
ww_mutex_init(&abba.a_mutex, &ww_class);
ww_mutex_init(&abba.b_mutex, &ww_class);
INIT_WORK_ONSTACK(&abba.work, test_abba_work);
init_completion(&abba.a_ready);
init_completion(&abba.b_ready);
abba.resolve = resolve;
schedule_work(&abba.work);
ww_acquire_init(&ctx, &ww_class);
ww_mutex_lock(&abba.a_mutex, &ctx);
complete(&abba.a_ready);
wait_for_completion(&abba.b_ready);
err = ww_mutex_lock(&abba.b_mutex, &ctx);
if (resolve && err == -EDEADLK) {
ww_mutex_unlock(&abba.a_mutex);
ww_mutex_lock_slow(&abba.b_mutex, &ctx);
err = ww_mutex_lock(&abba.a_mutex, &ctx);
}
if (!err)
ww_mutex_unlock(&abba.b_mutex);
ww_mutex_unlock(&abba.a_mutex);
ww_acquire_fini(&ctx);
flush_work(&abba.work);
destroy_work_on_stack(&abba.work);
ret = 0;
if (resolve) {
if (err || abba.result) {
pr_err("%s: failed to resolve ABBA deadlock, A err=%d, B err=%d\n",
__func__, err, abba.result);
ret = -EINVAL;
}
} else {
if (err != -EDEADLK && abba.result != -EDEADLK) {
pr_err("%s: missed ABBA deadlock, A err=%d, B err=%d\n",
__func__, err, abba.result);
ret = -EINVAL;
}
}
return ret;
}
struct test_cycle {
struct work_struct work;
struct ww_mutex a_mutex;
struct ww_mutex *b_mutex;
struct completion *a_signal;
struct completion b_signal;
int result;
};
static void test_cycle_work(struct work_struct *work)
{
struct test_cycle *cycle = container_of(work, typeof(*cycle), work);
struct ww_acquire_ctx ctx;
int err;
ww_acquire_init(&ctx, &ww_class);
ww_mutex_lock(&cycle->a_mutex, &ctx);
complete(cycle->a_signal);
wait_for_completion(&cycle->b_signal);
err = ww_mutex_lock(cycle->b_mutex, &ctx);
if (err == -EDEADLK) {
ww_mutex_unlock(&cycle->a_mutex);
ww_mutex_lock_slow(cycle->b_mutex, &ctx);
err = ww_mutex_lock(&cycle->a_mutex, &ctx);
}
if (!err)
ww_mutex_unlock(cycle->b_mutex);
ww_mutex_unlock(&cycle->a_mutex);
ww_acquire_fini(&ctx);
cycle->result = err;
}
static int __test_cycle(unsigned int nthreads)
{
struct test_cycle *cycles;
unsigned int n, last = nthreads - 1;
int ret;
cycles = kmalloc_array(nthreads, sizeof(*cycles), GFP_KERNEL);
if (!cycles)
return -ENOMEM;
for (n = 0; n < nthreads; n++) {
struct test_cycle *cycle = &cycles[n];
ww_mutex_init(&cycle->a_mutex, &ww_class);
if (n == last)
cycle->b_mutex = &cycles[0].a_mutex;
else
cycle->b_mutex = &cycles[n + 1].a_mutex;
if (n == 0)
cycle->a_signal = &cycles[last].b_signal;
else
cycle->a_signal = &cycles[n - 1].b_signal;
init_completion(&cycle->b_signal);
INIT_WORK(&cycle->work, test_cycle_work);
cycle->result = 0;
}
for (n = 0; n < nthreads; n++)
queue_work(wq, &cycles[n].work);
flush_workqueue(wq);
ret = 0;
for (n = 0; n < nthreads; n++) {
struct test_cycle *cycle = &cycles[n];
if (!cycle->result)
continue;
pr_err("cylic deadlock not resolved, ret[%d/%d] = %d\n",
n, nthreads, cycle->result);
ret = -EINVAL;
break;
}
for (n = 0; n < nthreads; n++)
ww_mutex_destroy(&cycles[n].a_mutex);
kfree(cycles);
return ret;
}
static int test_cycle(unsigned int ncpus)
{
unsigned int n;
int ret;
for (n = 2; n <= ncpus + 1; n++) {
ret = __test_cycle(n);
if (ret)
return ret;
}
return 0;
}
struct stress {
struct work_struct work;
struct ww_mutex *locks;
int nlocks;
int nloops;
};
static int *get_random_order(int count)
{
int *order;
int n, r, tmp;
order = kmalloc_array(count, sizeof(*order), GFP_TEMPORARY);
if (!order)
return order;
for (n = 0; n < count; n++)
order[n] = n;
for (n = count - 1; n > 1; n--) {
r = get_random_int() % (n + 1);
if (r != n) {
tmp = order[n];
order[n] = order[r];
order[r] = tmp;
}
}
return order;
}
static void dummy_load(struct stress *stress)
{
usleep_range(1000, 2000);
}
static void stress_inorder_work(struct work_struct *work)
{
struct stress *stress = container_of(work, typeof(*stress), work);
const int nlocks = stress->nlocks;
struct ww_mutex *locks = stress->locks;
struct ww_acquire_ctx ctx;
int *order;
order = get_random_order(nlocks);
if (!order)
return;
ww_acquire_init(&ctx, &ww_class);
do {
int contended = -1;
int n, err;
retry:
err = 0;
for (n = 0; n < nlocks; n++) {
if (n == contended)
continue;
err = ww_mutex_lock(&locks[order[n]], &ctx);
if (err < 0)
break;
}
if (!err)
dummy_load(stress);
if (contended > n)
ww_mutex_unlock(&locks[order[contended]]);
contended = n;
while (n--)
ww_mutex_unlock(&locks[order[n]]);
if (err == -EDEADLK) {
ww_mutex_lock_slow(&locks[order[contended]], &ctx);
goto retry;
}
if (err) {
pr_err_once("stress (%s) failed with %d\n",
__func__, err);
break;
}
} while (--stress->nloops);
ww_acquire_fini(&ctx);
kfree(order);
kfree(stress);
}
struct reorder_lock {
struct list_head link;
struct ww_mutex *lock;
};
static void stress_reorder_work(struct work_struct *work)
{
struct stress *stress = container_of(work, typeof(*stress), work);
LIST_HEAD(locks);
struct ww_acquire_ctx ctx;
struct reorder_lock *ll, *ln;
int *order;
int n, err;
order = get_random_order(stress->nlocks);
if (!order)
return;
for (n = 0; n < stress->nlocks; n++) {
ll = kmalloc(sizeof(*ll), GFP_KERNEL);
if (!ll)
goto out;
ll->lock = &stress->locks[order[n]];
list_add(&ll->link, &locks);
}
kfree(order);
order = NULL;
ww_acquire_init(&ctx, &ww_class);
do {
list_for_each_entry(ll, &locks, link) {
err = ww_mutex_lock(ll->lock, &ctx);
if (!err)
continue;
ln = ll;
list_for_each_entry_continue_reverse(ln, &locks, link)
ww_mutex_unlock(ln->lock);
if (err != -EDEADLK) {
pr_err_once("stress (%s) failed with %d\n",
__func__, err);
break;
}
ww_mutex_lock_slow(ll->lock, &ctx);
list_move(&ll->link, &locks); /* restarts iteration */
}
dummy_load(stress);
list_for_each_entry(ll, &locks, link)
ww_mutex_unlock(ll->lock);
} while (--stress->nloops);
ww_acquire_fini(&ctx);
out:
list_for_each_entry_safe(ll, ln, &locks, link)
kfree(ll);
kfree(order);
kfree(stress);
}
static void stress_one_work(struct work_struct *work)
{
struct stress *stress = container_of(work, typeof(*stress), work);
const int nlocks = stress->nlocks;
struct ww_mutex *lock = stress->locks + (get_random_int() % nlocks);
int err;
do {
err = ww_mutex_lock(lock, NULL);
if (!err) {
dummy_load(stress);
ww_mutex_unlock(lock);
} else {
pr_err_once("stress (%s) failed with %d\n",
__func__, err);
break;
}
} while (--stress->nloops);
kfree(stress);
}
#define STRESS_INORDER BIT(0)
#define STRESS_REORDER BIT(1)
#define STRESS_ONE BIT(2)
#define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE)
static int stress(int nlocks, int nthreads, int nloops, unsigned int flags)
{
struct ww_mutex *locks;
int n;
locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL);
if (!locks)
return -ENOMEM;
for (n = 0; n < nlocks; n++)
ww_mutex_init(&locks[n], &ww_class);
for (n = 0; nthreads; n++) {
struct stress *stress;
void (*fn)(struct work_struct *work);
fn = NULL;
switch (n & 3) {
case 0:
if (flags & STRESS_INORDER)
fn = stress_inorder_work;
break;
case 1:
if (flags & STRESS_REORDER)
fn = stress_reorder_work;
break;
case 2:
if (flags & STRESS_ONE)
fn = stress_one_work;
break;
}
if (!fn)
continue;
stress = kmalloc(sizeof(*stress), GFP_KERNEL);
if (!stress)
break;
INIT_WORK(&stress->work, fn);
stress->locks = locks;
stress->nlocks = nlocks;
stress->nloops = nloops;
queue_work(wq, &stress->work);
nthreads--;
}
flush_workqueue(wq);
for (n = 0; n < nlocks; n++)
ww_mutex_destroy(&locks[n]);
kfree(locks);
return 0;
}
static int __init test_ww_mutex_init(void)
{
int ncpus = num_online_cpus();
int ret;
wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0);
if (!wq)
return -ENOMEM;
ret = test_mutex();
if (ret)
return ret;
ret = test_aa();
if (ret)
return ret;
ret = test_abba(false);
if (ret)
return ret;
ret = test_abba(true);
if (ret)
return ret;
ret = test_cycle(ncpus);
if (ret)
return ret;
ret = stress(16, 2*ncpus, 1<<10, STRESS_INORDER);
if (ret)
return ret;
ret = stress(16, 2*ncpus, 1<<10, STRESS_REORDER);
if (ret)
return ret;
ret = stress(4096, hweight32(STRESS_ALL)*ncpus, 1<<12, STRESS_ALL);
if (ret)
return ret;
return 0;
}
static void __exit test_ww_mutex_exit(void)
{
destroy_workqueue(wq);
}
module_init(test_ww_mutex_init);
module_exit(test_ww_mutex_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Intel Corporation");

View File

@@ -68,9 +68,7 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
* the scheme scales to up to 4 million PIDs, runtime.
*/
struct pid_namespace init_pid_ns = {
.kref = {
.refcount = ATOMIC_INIT(2),
},
.kref = KREF_INIT(2),
.pidmap = {
[ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
},
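
The hunk above works because 'struct kref' is now layered on refcount_t, with KREF_INIT() for static initialization and kref_read() replacing direct atomic_read() of the counter. A minimal sketch of that surrounding API follows; the structure and helper names are hypothetical, not from this commit.

#include <linux/kref.h>

struct foo {
	struct kref ref;
	/* ... payload ... */
};

/* Static initialization, matching the init_pid_ns conversion above. */
static struct foo default_foo = { .ref = KREF_INIT(1) };

static struct foo *foo_get(struct foo *f)
{
	kref_get(&f->ref);		/* saturates on overflow instead of wrapping */
	return f;
}

static unsigned int foo_users(struct foo *f)
{
	return kref_read(&f->ref);	/* instead of atomic_read(&f->ref.refcount) */
}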

View File

@@ -716,6 +716,19 @@ source "lib/Kconfig.kmemcheck"
source "lib/Kconfig.kasan"
config DEBUG_REFCOUNT
bool "Verbose refcount checks"
help
Say Y here if you want reference counters (refcount_t and kref) to
generate WARNs on dubious usage. Without this, refcount_t will still
be a saturating counter and will avoid use-after-free by turning it
into a resource-leak denial-of-service.
Use of this option will increase kernel text size but will alert the
admin to potential abuse.
If in doubt, say "N".
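
To make the help text above concrete, here is a minimal sketch of the refcount_t pattern it describes; the structure and helpers are illustrative assumptions, not part of the diff.

#include <linux/refcount.h>
#include <linux/slab.h>

struct blob {
	refcount_t users;
	/* ... payload ... */
};

static struct blob *blob_alloc(void)
{
	struct blob *b = kzalloc(sizeof(*b), GFP_KERNEL);

	if (b)
		refcount_set(&b->users, 1);	/* initial reference */
	return b;
}

static void blob_get(struct blob *b)
{
	/* Saturates instead of wrapping; dubious usage triggers the WARNs above. */
	refcount_inc(&b->users);
}

static void blob_put(struct blob *b)
{
	if (refcount_dec_and_test(&b->users))
		kfree(b);
}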
endmenu # "Memory Debugging"
config ARCH_HAS_KCOV
@@ -1166,6 +1179,18 @@ config LOCK_TORTURE_TEST
Say M if you want these torture tests to build as a module.
Say N if you are unsure.
config WW_MUTEX_SELFTEST
tristate "Wait/wound mutex selftests"
help
This option provides a kernel module that runs tests on the
struct ww_mutex locking API.
It is recommended to enable DEBUG_WW_MUTEX_SLOWPATH in conjunction
with this test harness.
Say M if you want these self tests to build as a module.
Say N if you are unsure.
endmenu # lock debugging
config TRACE_IRQFLAGS

Some files were not shown because too many files have changed in this diff.