NFS client updates for Linux 4.8

Highlights include: Stable bugfixes: - nfs: don't create zero-length requests - Several LAYOUTGET bugfixes Features: - Several performance related features - More aggressive caching when we can rely on close-to-open cache consistency - Remove serialisation of O_DIRECT reads and writes - Optimise several code paths to not flush to disk unnecessarily. However allow for the idiosyncracies of pNFS for those layout types that need to issue a LAYOUTCOMMIT before the metadata can be updated on the server. - SUNRPC updates to the client data receive path - pNFS/SCSI support RH/Fedora dm-mpath device nodes - pNFS files/flexfiles can now use unprivileged ports when the generic NFS mount options allow it. Bugfixes: - Don't use RDMA direct data placement together with data integrity or privacy security flavours - Remove the RDMA ALLPHYSICAL memory registration mode as it has potential security holes. - Several layout recall fixes to improve NFSv4.1 protocol compliance. - Fix an Oops in the pNFS files and flexfiles connection setup to the DS - Allow retry of operations that used a returned delegation stateid - Don't mark the inode as revalidated if a LAYOUTCOMMIT is outstanding - Fix writeback races in nfs4_copy_range() and nfs42_proc_deallocate() -----BEGIN PGP SIGNATURE----- Version: GnuPG v1 iQIcBAABAgAGBQJXnSq8AAoJEGcL54qWCgDyn8cP/RCHLekUCq7Klh+NAnEsvuBi C7w9YpVHaC83/8Q0tR6LyFShSBJBWi/clWwO0IEomkNK/MuO77v4iyPujtEyqowK 0+eWFh/e8CsTf7mNGoi0avrHAZDB3deSuOQeYbwnNWHmd7qKVkB6tHus8LQjk852 eqwYmZ4kVr+eaCO6MttCCxJHf6datPnsbe0stiC9MpxmCzsdpZmFptfauidsFX+p 0U1IHi/ABN6zIFoc4R0iXXbaDb8ErxGw32SWIb8cnnWwdlSD8I0+Jqxs4opp23LY lAm9E0vtDJ49bJBllYl0dUmizdhJ3+NefK4aqPh5H5h3Csub+MLIsuQv/+r2AOhH qLBi5kThpspPhGHZ40VDmfV825+csUPTc8WkDaNLvb4f4UGIPakK/KBrBtxiqn+P 0etvYiWBuoBaqRVQpstawnyDdnBK0IMF/3LAULo+ozo7iTkpaZmOALYgPcBUYw2f d6pxZGeNN0GwWfjDmoUDGC07OpO/CSN5WouArgKsp5+VhjzPxjyaZLCnUhzHzXiM RV1oBytEs/iw2BLXX809noM9mqHYkdgSVmrZ9OvvDMslcLHaslpq6eaJKZSWqV2J 
fAws6rbcZdTFSnbAWr0OSxct6w6BijEjc3/uk+wWRtw9nkOhFqtlxI3y7k4odpW9 wVcEmRNkxfA0LlYNXWuL =WNyE -----END PGP SIGNATURE----- Merge tag 'nfs-for-4.8-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs Pull NFS client updates from Trond Myklebust: "Highlights include: Stable bugfixes: - nfs: don't create zero-length requests - several LAYOUTGET bugfixes Features: - several performance related features - more aggressive caching when we can rely on close-to-open cache consistency - remove serialisation of O_DIRECT reads and writes - optimise several code paths to not flush to disk unnecessarily. However, allow for the idiosyncrasies of pNFS for those layout types that need to issue a LAYOUTCOMMIT before the metadata can be updated on the server. - SUNRPC updates to the client data receive path - pNFS/SCSI support RH/Fedora dm-mpath device nodes - pNFS files/flexfiles can now use unprivileged ports when the generic NFS mount options allow it. Bugfixes: - Don't use RDMA direct data placement together with data integrity or privacy security flavours - Remove the RDMA ALLPHYSICAL memory registration mode as it has potential security holes. - Several layout recall fixes to improve NFSv4.1 protocol compliance. 
- Fix an Oops in the pNFS files and flexfiles connection setup to the DS - Allow retry of operations that used a returned delegation stateid - Don't mark the inode as revalidated if a LAYOUTCOMMIT is outstanding - Fix writeback races in nfs4_copy_range() and nfs42_proc_deallocate()" * tag 'nfs-for-4.8-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (104 commits) pNFS: Actively set attributes as invalid if LAYOUTCOMMIT is outstanding NFSv4: Clean up lookup of SECINFO_NO_NAME NFSv4.2: Fix warning "variable ‘stateids’ set but not used" NFSv4: Fix warning "no previous prototype for ‘nfs4_listxattr’" SUNRPC: Fix a compiler warning in fs/nfs/clnt.c pNFS: Remove redundant smp_mb() from pnfs_init_lseg() pNFS: Cleanup - do layout segment initialisation in one place pNFS: Remove redundant stateid invalidation pNFS: Remove redundant pnfs_mark_layout_returned_if_empty() pNFS: Clear the layout metadata if the server changed the layout stateid pNFS: Cleanup - don't open code pnfs_mark_layout_stateid_invalid() NFS: pnfs_mark_matching_lsegs_return() should match the layout sequence id pNFS: Do not set plh_return_seq for non-callback related layoutreturns pNFS: Ensure layoutreturn acts as a completion for layout callbacks pNFS: Fix CB_LAYOUTRECALL stateid verification pNFS: Always update the layout barrier seqid on LAYOUTGET pNFS: Always update the layout stateid if NFS_LAYOUT_INVALID_STID is set pNFS: Clear the layout return tracking on layout reinitialisation pNFS: LAYOUTRETURN should only update the stateid if the layout is valid nfs: don't create zero-length requests ...
2016-07-30 16:33:25 -07:00 · 2016-07-30 16:33:25 -07:00 · 7f155c7026
parent d761f3ed6e 944171cbf4
commit 7f155c7026
55 changed files with 1784 additions and 1536 deletions
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@ -6,7 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o
 CFLAGS_nfstrace.o += -I$(src)
 nfs-y 			:= client.o dir.o file.o getroot.o inode.o super.o \
-			   direct.o pagelist.o read.o symlink.o unlink.o \
+			   io.o direct.o pagelist.o read.o symlink.o unlink.o \
 			   write.o namespace.o mount_clnt.o nfstrace.o
 nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o
 nfs-$(CONFIG_SYSCTL)	+= sysctl.o
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@ -65,8 +65,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
 		if (!p)
 			return -EIO;
 		b->simple.nr_sigs = be32_to_cpup(p++);
-		if (!b->simple.nr_sigs) {
+		if (!b->simple.nr_sigs || b->simple.nr_sigs > PNFS_BLOCK_MAX_UUIDS) {
-			dprintk("no signature\n");
+			dprintk("Bad signature count: %d\n", b->simple.nr_sigs);
 			return -EIO;
 		}
@ -89,7 +89,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
 			memcpy(&b->simple.sigs[i].sig, p,
 				b->simple.sigs[i].sig_len);
-			b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
+			b->simple.len += 8 + 4 + \
 				(XDR_QUADLEN(b->simple.sigs[i].sig_len) << 2);
 		}
 		break;
 	case PNFS_BLOCK_VOLUME_SLICE:
@ -104,7 +105,12 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
 		p = xdr_inline_decode(xdr, 4);
 		if (!p)
 			return -EIO;
 		b->concat.volumes_count = be32_to_cpup(p++);
 		if (b->concat.volumes_count > PNFS_BLOCK_MAX_DEVICES) {
 			dprintk("Too many volumes: %d\n", b->concat.volumes_count);
 			return -EIO;
 		}
 		p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
 		if (!p)
@ -116,8 +122,13 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
 		p = xdr_inline_decode(xdr, 8 + 4);
 		if (!p)
 			return -EIO;
 		p = xdr_decode_hyper(p, &b->stripe.chunk_size);
 		b->stripe.volumes_count = be32_to_cpup(p++);
 		if (b->stripe.volumes_count > PNFS_BLOCK_MAX_DEVICES) {
 			dprintk("Too many volumes: %d\n", b->stripe.volumes_count);
 			return -EIO;
 		}
 		p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
 		if (!p)
@ -224,18 +235,20 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
 		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
 {
 	struct pnfs_block_volume *v = &volumes[idx];
 	struct block_device *bdev;
 	dev_t dev;
 	dev = bl_resolve_deviceid(server, v, gfp_mask);
 	if (!dev)
 		return -EIO;
-	d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
+	bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
-	if (IS_ERR(d->bdev)) {
+	if (IS_ERR(bdev)) {
 		printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
-			MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
+			MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
-		return PTR_ERR(d->bdev);
+		return PTR_ERR(bdev);
 	}
 	d->bdev = bdev;
 	d->len = i_size_read(d->bdev->bd_inode);
@ -287,44 +300,71 @@ bl_validate_designator(struct pnfs_block_volume *v)
 	}
 }
 /*
 * Try to open the udev path for the WWN.  At least on Debian the udev
 * by-id path will always point to the dm-multipath device if one exists.
 */
 static struct block_device *
 bl_open_udev_path(struct pnfs_block_volume *v)
 {
 	struct block_device *bdev;
 	const char *devname;
 	devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%*phN",
 				v->scsi.designator_len, v->scsi.designator);
 	if (!devname)
 		return ERR_PTR(-ENOMEM);
 	bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL);
 	if (IS_ERR(bdev)) {
 		pr_warn("pNFS: failed to open device %s (%ld)\n",
 			devname, PTR_ERR(bdev));
 	}
 	kfree(devname);
 	return bdev;
 }
 /*
 * Try to open the RH/Fedora specific dm-mpath udev path for this WWN, as the
 * wwn- links will only point to the first discovered SCSI device there.
 */
 static struct block_device *
 bl_open_dm_mpath_udev_path(struct pnfs_block_volume *v)
 {
 	struct block_device *bdev;
 	const char *devname;
 	devname = kasprintf(GFP_KERNEL,
 			"/dev/disk/by-id/dm-uuid-mpath-%d%*phN",
 			v->scsi.designator_type,
 			v->scsi.designator_len, v->scsi.designator);
 	if (!devname)
 		return ERR_PTR(-ENOMEM);
 	bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL);
 	kfree(devname);
 	return bdev;
 }
 static int
 bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
 		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
 {
 	struct pnfs_block_volume *v = &volumes[idx];
 	struct block_device *bdev;
 	const struct pr_ops *ops;
 	const char *devname;
 	int error;
 	if (!bl_validate_designator(v))
 		return -EINVAL;
-	switch (v->scsi.designator_len) {
+	bdev = bl_open_dm_mpath_udev_path(v);
-	case 8:
+	if (IS_ERR(bdev))
-		devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN",
+		bdev = bl_open_udev_path(v);
-				v->scsi.designator);
+	if (IS_ERR(bdev))
-		break;
+		return PTR_ERR(bdev);
-	case 12:
+	d->bdev = bdev;
 		devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN",
 				v->scsi.designator);
 		break;
 	case 16:
 		devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN",
 				v->scsi.designator);
 		break;
 	default:
 		return -EINVAL;
 	}
 	d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL);
 	if (IS_ERR(d->bdev)) {
 		pr_warn("pNFS: failed to open device %s (%ld)\n",
 			devname, PTR_ERR(d->bdev));
 		kfree(devname);
 		return PTR_ERR(d->bdev);
 	}
 	kfree(devname);
 	d->len = i_size_read(d->bdev->bd_inode);
 	d->map = bl_map_simple;
@ -352,7 +392,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
 	return 0;
 out_blkdev_put:
-	blkdev_put(d->bdev, FMODE_READ);
+	blkdev_put(d->bdev, FMODE_READ | FMODE_WRITE);
 	return error;
 }
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@ -121,6 +121,16 @@ ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
 	return be;
 }
 static void __ext_put_deviceids(struct list_head *head)
 {
 	struct pnfs_block_extent *be, *tmp;
 	list_for_each_entry_safe(be, tmp, head, be_list) {
 		nfs4_put_deviceid_node(be->be_device);
 		kfree(be);
 	}
 }
 static void
 __ext_tree_insert(struct rb_root *root,
 		struct pnfs_block_extent *new, bool merge_ok)
@ -163,7 +173,8 @@ free_new:
 }
 static int
-__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
+__ext_tree_remove(struct rb_root *root,
 		sector_t start, sector_t end, struct list_head *tmp)
 {
 	struct pnfs_block_extent *be;
 	sector_t len1 = 0, len2 = 0;
@ -223,8 +234,7 @@ __ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
 			struct pnfs_block_extent *next = ext_tree_next(be);
 			rb_erase(&be->be_node, root);
-			nfs4_put_deviceid_node(be->be_device);
+			list_add_tail(&be->be_list, tmp);
 			kfree(be);
 			be = next;
 		}
@ -350,16 +360,18 @@ int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
 		sector_t start, sector_t end)
 {
 	int err, err2;
 	LIST_HEAD(tmp);
 	spin_lock(&bl->bl_ext_lock);
-	err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+	err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
 	if (rw) {
-		err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end);
+		err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp);
 		if (!err)
 			err = err2;
 	}
 	spin_unlock(&bl->bl_ext_lock);
 	__ext_put_deviceids(&tmp);
 	return err;
 }
@ -396,12 +408,13 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
 	sector_t end = start + len;
 	struct pnfs_block_extent *be;
 	int err = 0;
 	LIST_HEAD(tmp);
 	spin_lock(&bl->bl_ext_lock);
 	/*
 	 * First remove all COW extents or holes from written to range.
 	 */
-	err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+	err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
 	if (err)
 		goto out;
@ -459,6 +472,8 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
 	}
 out:
 	spin_unlock(&bl->bl_ext_lock);
 	__ext_put_deviceids(&tmp);
 	return err;
 }
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@ -119,27 +119,30 @@ out:
 * hashed by filehandle.
 */
 static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
-		struct nfs_fh *fh, nfs4_stateid *stateid)
+		struct nfs_fh *fh)
 {
 	struct nfs_server *server;
 	struct nfs_inode *nfsi;
 	struct inode *ino;
 	struct pnfs_layout_hdr *lo;
 restart:
 	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
 		list_for_each_entry(lo, &server->layouts, plh_layouts) {
-			if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid))
+			nfsi = NFS_I(lo->plh_inode);
 			if (nfs_compare_fh(fh, &nfsi->fh))
 				continue;
-			if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh))
+			if (nfsi->layout != lo)
 				continue;
 			ino = igrab(lo->plh_inode);
 			if (!ino)
 				break;
 			spin_lock(&ino->i_lock);
 			/* Is this layout in the process of being freed? */
-			if (NFS_I(ino)->layout != lo) {
+			if (nfsi->layout != lo) {
 				spin_unlock(&ino->i_lock);
 				iput(ino);
-				break;
+				goto restart;
 			}
 			pnfs_get_layout_hdr(lo);
 			spin_unlock(&ino->i_lock);
@ -151,13 +154,13 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
 }
 static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
-		struct nfs_fh *fh, nfs4_stateid *stateid)
+		struct nfs_fh *fh)
 {
 	struct pnfs_layout_hdr *lo;
 	spin_lock(&clp->cl_lock);
 	rcu_read_lock();
-	lo = get_layout_by_fh_locked(clp, fh, stateid);
+	lo = get_layout_by_fh_locked(clp, fh);
 	rcu_read_unlock();
 	spin_unlock(&clp->cl_lock);
@ -167,17 +170,39 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
 /*
 * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing)
 */
-static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo,
+static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo,
 					const nfs4_stateid *new)
 {
 	u32 oldseq, newseq;
-	oldseq = be32_to_cpu(lo->plh_stateid.seqid);
+	/* Is the stateid still not initialised? */
-	newseq = be32_to_cpu(new->seqid);
+	if (!pnfs_layout_is_valid(lo))
 		return NFS4ERR_DELAY;
 	/* Mismatched stateid? */
 	if (!nfs4_stateid_match_other(&lo->plh_stateid, new))
 		return NFS4ERR_BAD_STATEID;
 	newseq = be32_to_cpu(new->seqid);
 	/* Are we already in a layout recall situation? */
 	if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) &&
 	    lo->plh_return_seq != 0) {
 		if (newseq < lo->plh_return_seq)
 			return NFS4ERR_OLD_STATEID;
 		if (newseq > lo->plh_return_seq)
 			return NFS4ERR_DELAY;
 		goto out;
 	}
 	/* Check that the stateid matches what we think it should be. */
 	oldseq = be32_to_cpu(lo->plh_stateid.seqid);
 	if (newseq > oldseq + 1)
-		return false;
+		return NFS4ERR_DELAY;
-	return true;
+	/* Crazy server! */
 	if (newseq <= oldseq)
 		return NFS4ERR_OLD_STATEID;
 out:
 	return NFS_OK;
 }
 static u32 initiate_file_draining(struct nfs_client *clp,
@ -188,7 +213,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 	u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
 	LIST_HEAD(free_me_list);
-	lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid);
+	lo = get_layout_by_fh(clp, &args->cbl_fh);
 	if (!lo) {
 		trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL,
 				&args->cbl_stateid, -rv);
@ -196,18 +221,15 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 	}
 	ino = lo->plh_inode;
 	spin_lock(&ino->i_lock);
 	if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) {
 		rv = NFS4ERR_DELAY;
 		goto unlock;
 	}
 	pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
 	spin_unlock(&ino->i_lock);
 	pnfs_layoutcommit_inode(ino, false);
 	spin_lock(&ino->i_lock);
 	rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid);
 	if (rv != NFS_OK)
 		goto unlock;
 	pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
 	/*
 	 * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return)
 	 */
@ -223,11 +245,13 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 		goto unlock;
 	}
 	/* Embrace your forgetfulness! */
 	rv = NFS4ERR_NOMATCHING_LAYOUT;
 	if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
 		NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
 			&args->cbl_range);
 	}
 	pnfs_mark_layout_returned_if_empty(lo);
 unlock:
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&free_me_list);
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@ -925,7 +925,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
 	if (hdr_arg.minorversion == 0) {
 		cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident);
 		if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))
-			return rpc_drop_reply;
+			goto out_invalidcred;
 	}
 	cps.minorversion = hdr_arg.minorversion;
@ -953,6 +953,10 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
 	nfs_put_client(cps.clp);
 	dprintk("%s: done, status = %u\n", __func__, ntohl(status));
 	return rpc_success;
 out_invalidcred:
 	pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n");
 	return rpc_autherr_badcred;
 }
 /*
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@ -367,8 +367,6 @@ nfs_found_client(const struct nfs_client_initdata *cl_init,
 */
 struct nfs_client *
 nfs_get_client(const struct nfs_client_initdata *cl_init,
 	       const struct rpc_timeout *timeparms,
 	       const char *ip_addr,
 	       rpc_authflavor_t authflavour)
 {
 	struct nfs_client *clp, *new = NULL;
@ -399,7 +397,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
 					&nn->nfs_client_list);
 			spin_unlock(&nn->nfs_client_lock);
 			new->cl_flags = cl_init->init_flags;
-			return rpc_ops->init_client(new, timeparms, ip_addr);
+			return rpc_ops->init_client(new, cl_init);
 		}
 		spin_unlock(&nn->nfs_client_lock);
@ -470,7 +468,7 @@ EXPORT_SYMBOL_GPL(nfs_init_timeout_values);
 * Create an RPC client handle
 */
 int nfs_create_rpc_client(struct nfs_client *clp,
-			  const struct rpc_timeout *timeparms,
+			  const struct nfs_client_initdata *cl_init,
 			  rpc_authflavor_t flavor)
 {
 	struct rpc_clnt		*clnt = NULL;
@ -479,8 +477,9 @@ int nfs_create_rpc_client(struct nfs_client *clp,
 		.protocol	= clp->cl_proto,
 		.address	= (struct sockaddr *)&clp->cl_addr,
 		.addrsize	= clp->cl_addrlen,
-		.timeout	= timeparms,
+		.timeout	= cl_init->timeparms,
 		.servername	= clp->cl_hostname,
 		.nodename	= cl_init->nodename,
 		.program	= &nfs_program,
 		.version	= clp->rpc_ops->version,
 		.authflavor	= flavor,
@ -591,14 +590,12 @@ EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient);
 * nfs_init_client - Initialise an NFS2 or NFS3 client
 *
 * @clp: nfs_client to initialise
- * @timeparms: timeout parameters for underlying RPC transport
+ * @cl_init: Initialisation parameters
 * @ip_addr: IP presentation address (not used)
 *
 * Returns pointer to an NFS client, or an ERR_PTR value.
 */
 struct nfs_client *nfs_init_client(struct nfs_client *clp,
-		    const struct rpc_timeout *timeparms,
+				   const struct nfs_client_initdata *cl_init)
 		    const char *ip_addr)
 {
 	int error;
@ -612,7 +609,7 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp,
 	 * Create a client RPC handle for doing FSSTAT with UNIX auth only
 	 * - RFC 2623, sec 2.3.2
 	 */
-	error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
+	error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
 	if (error < 0)
 		goto error;
 	nfs_mark_client_ready(clp, NFS_CS_READY);
@ -633,6 +630,7 @@ static int nfs_init_server(struct nfs_server *server,
 			   const struct nfs_parsed_mount_data *data,
 			   struct nfs_subversion *nfs_mod)
 {
 	struct rpc_timeout timeparms;
 	struct nfs_client_initdata cl_init = {
 		.hostname = data->nfs_server.hostname,
 		.addr = (const struct sockaddr *)&data->nfs_server.address,
@ -640,8 +638,8 @@ static int nfs_init_server(struct nfs_server *server,
 		.nfs_mod = nfs_mod,
 		.proto = data->nfs_server.protocol,
 		.net = data->net,
 		.timeparms = &timeparms,
 	};
 	struct rpc_timeout timeparms;
 	struct nfs_client *clp;
 	int error;
@ -653,7 +651,7 @@ static int nfs_init_server(struct nfs_server *server,
 		set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
 	/* Allocate or find a client reference we can use */
-	clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX);
+	clp = nfs_get_client(&cl_init, RPC_AUTH_UNIX);
 	if (IS_ERR(clp)) {
 		dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
 		return PTR_ERR(clp);
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@ -2252,21 +2252,37 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, st
 	return NULL;
 }
-static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res, bool may_block)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_access_entry *cache;
-	int err = -ENOENT;
+	bool retry = true;
 	int err;
 	spin_lock(&inode->i_lock);
-	if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
+	for(;;) {
-		goto out_zap;
+		if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
-	cache = nfs_access_search_rbtree(inode, cred);
+			goto out_zap;
-	if (cache == NULL)
+		cache = nfs_access_search_rbtree(inode, cred);
-		goto out;
+		err = -ENOENT;
-	if (!nfs_have_delegated_attributes(inode) &&
+		if (cache == NULL)
-	    !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
+			goto out;
-		goto out_stale;
+		/* Found an entry, is our attribute cache valid? */
 		if (!nfs_attribute_cache_expired(inode) &&
 		    !(nfsi->cache_validity & NFS_INO_INVALID_ATTR))
 			break;
 		err = -ECHILD;
 		if (!may_block)
 			goto out;
 		if (!retry)
 			goto out_zap;
 		spin_unlock(&inode->i_lock);
 		err = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
 		if (err)
 			return err;
 		spin_lock(&inode->i_lock);
 		retry = false;
 	}
 	res->jiffies = cache->jiffies;
 	res->cred = cache->cred;
 	res->mask = cache->mask;
@ -2275,12 +2291,6 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
 out:
 	spin_unlock(&inode->i_lock);
 	return err;
 out_stale:
 	rb_erase(&cache->rb_node, &nfsi->access_cache);
 	list_del(&cache->lru);
 	spin_unlock(&inode->i_lock);
 	nfs_access_free_entry(cache);
 	return -ENOENT;
 out_zap:
 	spin_unlock(&inode->i_lock);
 	nfs_access_zap_cache(inode);
@ -2307,13 +2317,12 @@ static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred,
 		cache = NULL;
 	if (cache == NULL)
 		goto out;
-	if (!nfs_have_delegated_attributes(inode) &&
+	err = nfs_revalidate_inode_rcu(NFS_SERVER(inode), inode);
-	    !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
+	if (err)
 		goto out;
 	res->jiffies = cache->jiffies;
 	res->cred = cache->cred;
 	res->mask = cache->mask;
 	err = 0;
 out:
 	rcu_read_unlock();
 	return err;
@ -2402,18 +2411,19 @@ EXPORT_SYMBOL_GPL(nfs_access_set_mask);
 static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
 {
 	struct nfs_access_entry cache;
 	bool may_block = (mask & MAY_NOT_BLOCK) == 0;
 	int status;
 	trace_nfs_access_enter(inode);
 	status = nfs_access_get_cached_rcu(inode, cred, &cache);
 	if (status != 0)
-		status = nfs_access_get_cached(inode, cred, &cache);
+		status = nfs_access_get_cached(inode, cred, &cache, may_block);
 	if (status == 0)
 		goto out_cached;
 	status = -ECHILD;
-	if (mask & MAY_NOT_BLOCK)
+	if (!may_block)
 		goto out;
 	/* Be clever: ask server to check for all possible rights */
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@ -196,6 +196,12 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
 	WARN_ON_ONCE(verfp->committed < 0);
 }
 static int nfs_direct_cmp_verf(const struct nfs_writeverf *v1,
 		const struct nfs_writeverf *v2)
 {
 	return nfs_write_verifier_cmp(&v1->verifier, &v2->verifier);
 }
 /*
 * nfs_direct_cmp_hdr_verf - compare verifier for pgio header
 * @dreq - direct request possibly spanning multiple servers
@ -215,7 +221,7 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
 		nfs_direct_set_hdr_verf(dreq, hdr);
 		return 0;
 	}
-	return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
+	return nfs_direct_cmp_verf(verfp, &hdr->verf);
 }
 /*
@ -238,7 +244,7 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
 	if (verfp->committed < 0)
 		return 1;
-	return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
+	return nfs_direct_cmp_verf(verfp, &data->verf);
 }
 /**
@ -366,22 +372,10 @@ out:
 * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
 * the iocb is still valid here if this is a synchronous request.
 */
-static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
+static void nfs_direct_complete(struct nfs_direct_req *dreq)
 {
 	struct inode *inode = dreq->inode;
 	if (dreq->iocb && write) {
 		loff_t pos = dreq->iocb->ki_pos + dreq->count;
 		spin_lock(&inode->i_lock);
 		if (i_size_read(inode) < pos)
 			i_size_write(inode, pos);
 		spin_unlock(&inode->i_lock);
 	}
 	if (write)
 		nfs_zap_mapping(inode, inode->i_mapping);
 	inode_dio_end(inode);
 	if (dreq->iocb) {
@ -436,7 +430,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
 	}
 out_put:
 	if (put_dreq(dreq))
-		nfs_direct_complete(dreq, false);
+		nfs_direct_complete(dreq);
 	hdr->release(hdr);
 }
@ -542,7 +536,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 	}
 	if (put_dreq(dreq))
-		nfs_direct_complete(dreq, false);
+		nfs_direct_complete(dreq);
 	return 0;
 }
@ -583,17 +577,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
 	if (!count)
 		goto out;
 	inode_lock(inode);
 	result = nfs_sync_mapping(mapping);
 	if (result)
 		goto out_unlock;
 	task_io_account_read(count);
 	result = -ENOMEM;
 	dreq = nfs_direct_req_alloc();
 	if (dreq == NULL)
-		goto out_unlock;
+		goto out;
 	dreq->inode = inode;
 	dreq->bytes_left = dreq->max_count = count;
@ -608,10 +597,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 	nfs_start_io_direct(inode);
 	NFS_I(inode)->read_io += count;
 	result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
-	inode_unlock(inode);
+	nfs_end_io_direct(inode);
 	if (!result) {
 		result = nfs_direct_wait(dreq);
@ -619,13 +610,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
 			iocb->ki_pos += result;
 	}
 	nfs_direct_req_release(dreq);
 	return result;
 out_release:
 	nfs_direct_req_release(dreq);
 out_unlock:
 	inode_unlock(inode);
 out:
 	return result;
 }
@ -657,6 +643,8 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
 	dreq->count = 0;
 	dreq->verf.committed = NFS_INVALID_STABLE_HOW;
 	nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
 	for (i = 0; i < dreq->mirror_count; i++)
 		dreq->mirrors[i].count = 0;
 	get_dreq(dreq);
@ -775,7 +763,8 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
 			nfs_direct_write_reschedule(dreq);
 			break;
 		default:
-			nfs_direct_complete(dreq, true);
+			nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
 			nfs_direct_complete(dreq);
 	}
 }
@ -991,6 +980,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 {
 	ssize_t result = -EINVAL;
 	size_t count;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
@ -1001,34 +991,24 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
 		file, iov_iter_count(iter), (long long) iocb->ki_pos);
-	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES,
+	result = generic_write_checks(iocb, iter);
-		      iov_iter_count(iter));
+	if (result <= 0)
 		return result;
 	count = result;
 	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
 	pos = iocb->ki_pos;
 	end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT;
-	inode_lock(inode);
+	task_io_account_write(count);
 	result = nfs_sync_mapping(mapping);
 	if (result)
 		goto out_unlock;
 	if (mapping->nrpages) {
 		result = invalidate_inode_pages2_range(mapping,
 					pos >> PAGE_SHIFT, end);
 		if (result)
 			goto out_unlock;
 	}
 	task_io_account_write(iov_iter_count(iter));
 	result = -ENOMEM;
 	dreq = nfs_direct_req_alloc();
 	if (!dreq)
-		goto out_unlock;
+		goto out;
 	dreq->inode = inode;
-	dreq->bytes_left = dreq->max_count = iov_iter_count(iter);
+	dreq->bytes_left = dreq->max_count = count;
 	dreq->io_start = pos;
 	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
 	l_ctx = nfs_get_lock_context(dreq->ctx);
@ -1040,6 +1020,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 	nfs_start_io_direct(inode);
 	result = nfs_direct_write_schedule_iovec(dreq, iter, pos);
 	if (mapping->nrpages) {
@ -1047,30 +1029,19 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 					      pos >> PAGE_SHIFT, end);
 	}
-	inode_unlock(inode);
+	nfs_end_io_direct(inode);
 	if (!result) {
 		result = nfs_direct_wait(dreq);
 		if (result > 0) {
 			struct inode *inode = mapping->host;
 			iocb->ki_pos = pos + result;
 			spin_lock(&inode->i_lock);
 			if (i_size_read(inode) < iocb->ki_pos)
 				i_size_write(inode, iocb->ki_pos);
 			spin_unlock(&inode->i_lock);
 			/* XXX: should check the generic_write_sync retval */
 			generic_write_sync(iocb, result);
 		}
 	}
 	nfs_direct_req_release(dreq);
 	return result;
 out_release:
 	nfs_direct_req_release(dreq);
-out_unlock:
+out:
 	inode_unlock(inode);
 	return result;
 }
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@ -170,12 +170,14 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
 		iocb->ki_filp,
 		iov_iter_count(to), (unsigned long) iocb->ki_pos);
-	result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping);
+	nfs_start_io_read(inode);
 	result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
 	if (!result) {
 		result = generic_file_read_iter(iocb, to);
 		if (result > 0)
 			nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
 	}
 	nfs_end_io_read(inode);
 	return result;
 }
 EXPORT_SYMBOL_GPL(nfs_file_read);
@ -191,12 +193,14 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
 	dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n",
 		filp, (unsigned long) count, (unsigned long long) *ppos);
-	res = nfs_revalidate_mapping_protected(inode, filp->f_mapping);
+	nfs_start_io_read(inode);
 	res = nfs_revalidate_mapping(inode, filp->f_mapping);
 	if (!res) {
 		res = generic_file_splice_read(filp, ppos, pipe, count, flags);
 		if (res > 0)
 			nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res);
 	}
 	nfs_end_io_read(inode);
 	return res;
 }
 EXPORT_SYMBOL_GPL(nfs_file_splice_read);
@ -272,16 +276,13 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	trace_nfs_fsync_enter(inode);
 	inode_dio_wait(inode);
 	do {
 		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
 		if (ret != 0)
 			break;
 		inode_lock(inode);
 		ret = nfs_file_fsync_commit(file, start, end, datasync);
 		if (!ret)
 			ret = pnfs_sync_inode(inode, !!datasync);
 		inode_unlock(inode);
 		/*
 		 * If nfs_file_fsync_commit detected a server reboot, then
 		 * resend all dirty pages that might have been covered by
@ -359,19 +360,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 		file, mapping->host->i_ino, len, (long long) pos);
 start:
 	/*
 	 * Prevent starvation issues if someone is doing a consistency
 	 * sync-to-disk
 	 */
 	ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
 				 nfs_wait_bit_killable, TASK_KILLABLE);
 	if (ret)
 		return ret;
 	/*
 	 * Wait for O_DIRECT to complete
 	 */
 	inode_dio_wait(mapping->host);
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
@ -432,7 +420,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
 		return status;
 	NFS_I(mapping->host)->write_io += copied;
-	if (nfs_ctx_key_to_expire(ctx)) {
+	if (nfs_ctx_key_to_expire(ctx, mapping->host)) {
 		status = nfs_wb_all(mapping->host);
 		if (status < 0)
 			return status;
@ -470,31 +458,8 @@ static void nfs_invalidate_page(struct page *page, unsigned int offset,
 */
 static int nfs_release_page(struct page *page, gfp_t gfp)
 {
 	struct address_space *mapping = page->mapping;
 	dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
 	/* Always try to initiate a 'commit' if relevant, but only
 	 * wait for it if the caller allows blocking.  Even then,
 	 * only wait 1 second and only if the 'bdi' is not congested.
 	 * Waiting indefinitely can cause deadlocks when the NFS
 	 * server is on this machine, when a new TCP connection is
 	 * needed and in other rare cases.  There is no particular
 	 * need to wait extensively here.  A short wait has the
 	 * benefit that someone else can worry about the freezer.
 	 */
 	if (mapping) {
 		struct nfs_server *nfss = NFS_SERVER(mapping->host);
 		nfs_commit_inode(mapping->host, 0);
 		if (gfpflags_allow_blocking(gfp) &&
 		    !bdi_write_congested(&nfss->backing_dev_info)) {
 			wait_on_page_bit_killable_timeout(page, PG_private,
 							  HZ);
 			if (PagePrivate(page))
 				set_bdi_congested(&nfss->backing_dev_info,
 						  BLK_RW_ASYNC);
 		}
 	}
 	/* If PagePrivate() is set, then the page is not freeable */
 	if (PagePrivate(page))
 		return 0;
@ -604,6 +569,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		filp, filp->f_mapping->host->i_ino,
 		(long long)page_offset(page));
 	sb_start_pagefault(inode->i_sb);
 	/* make sure the cache has finished storing the page */
 	nfs_fscache_wait_on_page_write(NFS_I(inode), page);
@ -630,6 +597,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 out_unlock:
 	unlock_page(page);
 out:
 	sb_end_pagefault(inode->i_sb);
 	return ret;
 }
@ -645,7 +613,7 @@ static int nfs_need_check_write(struct file *filp, struct inode *inode)
 	ctx = nfs_file_open_context(filp);
 	if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) ||
-	    nfs_ctx_key_to_expire(ctx))
+	    nfs_ctx_key_to_expire(ctx, inode))
 		return 1;
 	return 0;
 }
@ -656,23 +624,17 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
 	struct inode *inode = file_inode(file);
 	unsigned long written = 0;
 	ssize_t result;
 	size_t count = iov_iter_count(from);
 	result = nfs_key_timeout_notify(file, inode);
 	if (result)
 		return result;
-	if (iocb->ki_flags & IOCB_DIRECT) {
+	if (iocb->ki_flags & IOCB_DIRECT)
 		result = generic_write_checks(iocb, from);
 		if (result <= 0)
 			return result;
 		return nfs_file_direct_write(iocb, from);
 	}
 	dprintk("NFS: write(%pD2, %zu@%Ld)\n",
-		file, count, (long long) iocb->ki_pos);
+		file, iov_iter_count(from), (long long) iocb->ki_pos);
 	result = -EBUSY;
 	if (IS_SWAPFILE(inode))
 		goto out_swapfile;
 	/*
@ -684,28 +646,33 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
 			goto out;
 	}
-	result = count;
+	nfs_start_io_write(inode);
-	if (!count)
+	result = generic_write_checks(iocb, from);
 	if (result > 0) {
 		current->backing_dev_info = inode_to_bdi(inode);
 		result = generic_perform_write(file, from, iocb->ki_pos);
 		current->backing_dev_info = NULL;
 	}
 	nfs_end_io_write(inode);
 	if (result <= 0)
 		goto out;
-	result = generic_file_write_iter(iocb, from);
+	written = generic_write_sync(iocb, result);
-	if (result > 0)
+	iocb->ki_pos += written;
 		written = result;
 	/* Return error values */
-	if (result >= 0 && nfs_need_check_write(file, inode)) {
+	if (nfs_need_check_write(file, inode)) {
 		int err = vfs_fsync(file, 0);
 		if (err < 0)
 			result = err;
 	}
-	if (result > 0)
+	nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
 		nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
 out:
 	return result;
 out_swapfile:
 	printk(KERN_INFO "NFS: attempt to write to active swap file!\n");
-	goto out;
+	return -EBUSY;
 }
 EXPORT_SYMBOL_GPL(nfs_file_write);
@ -779,11 +746,6 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 	return status;
 }
 /*
 * Return 1 when *ts has a zero seconds field and at most 1000 nanoseconds,
 * and 0 otherwise.
 */
 static int
 is_time_granular(struct timespec *ts)
 {
 	if (ts->tv_sec != 0)
 		return 0;
 	return ts->tv_nsec <= 1000;
 }
 static int
 do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
@ -817,12 +779,8 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 	 * This makes locking act as a cache coherency point.
 	 */
 	nfs_sync_mapping(filp->f_mapping);
-	if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) {
+	if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
-		if (is_time_granular(&NFS_SERVER(inode)->time_delta))
+		nfs_zap_mapping(inode, filp->f_mapping);
 			__nfs_revalidate_inode(NFS_SERVER(inode), inode);
 		else
 			nfs_zap_caches(inode);
 	}
 out:
 	return status;
 }
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@ -255,13 +255,16 @@ static int filelayout_read_done_cb(struct rpc_task *task,
 static void
 filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
 {
 	loff_t end_offs = 0;
 	if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
-	    hdr->res.verf->committed != NFS_DATA_SYNC)
+	    hdr->res.verf->committed == NFS_FILE_SYNC)
 		return;
 	if (hdr->res.verf->committed == NFS_DATA_SYNC)
 		end_offs = hdr->mds_offset + (loff_t)hdr->res.count;
-	pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
+	/* Note: if the write is unstable, don't set end_offs until commit */
-			hdr->mds_offset + hdr->res.count);
+	pnfs_set_layoutcommit(hdr->inode, hdr->lseg, end_offs);
 	dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
 		(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
 }
@ -354,6 +357,12 @@ static int filelayout_write_done_cb(struct rpc_task *task,
 	}
 	filelayout_set_layoutcommit(hdr);
 	/* zero out the fattr */
 	hdr->fattr.valid = 0;
 	if (task->tk_status >= 0)
 		nfs_writeback_update_inode(hdr);
 	return 0;
 }
@ -375,8 +384,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
 		return -EAGAIN;
 	}
-	if (data->verf.committed == NFS_UNSTABLE)
+	pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
 		pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
 	return 0;
 }
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@ -1325,15 +1325,16 @@ ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg)
 * we always send layoutcommit after DS writes.
 */
 static void
-ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
+ff_layout_set_layoutcommit(struct inode *inode,
 		struct pnfs_layout_segment *lseg,
 		loff_t end_offset)
 {
-	if (!ff_layout_need_layoutcommit(hdr->lseg))
+	if (!ff_layout_need_layoutcommit(lseg))
 		return;
-	pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
+	pnfs_set_layoutcommit(inode, lseg, end_offset);
-			hdr->mds_offset + hdr->res.count);
+	dprintk("%s inode %lu pls_end_pos %llu\n", __func__, inode->i_ino,
-	dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
+		(unsigned long long) NFS_I(inode)->layout->plh_lwb);
 		(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
 }
 static bool
@ -1469,6 +1470,7 @@ static void ff_layout_read_release(void *data)
 static int ff_layout_write_done_cb(struct rpc_task *task,
 				struct nfs_pgio_header *hdr)
 {
 	loff_t end_offs = 0;
 	int err;
 	trace_nfs4_pnfs_write(hdr, task->tk_status);
@ -1494,7 +1496,10 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
 	if (hdr->res.verf->committed == NFS_FILE_SYNC ||
 	    hdr->res.verf->committed == NFS_DATA_SYNC)
-		ff_layout_set_layoutcommit(hdr);
+		end_offs = hdr->mds_offset + (loff_t)hdr->res.count;
 	/* Note: if the write is unstable, don't set end_offs until commit */
 	ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, end_offs);
 	/* zero out fattr since we don't care DS attr at all */
 	hdr->fattr.valid = 0;
@ -1530,9 +1535,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
 		return -EAGAIN;
 	}
-	if (data->verf.committed == NFS_UNSTABLE
+	ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb);
 	    && ff_layout_need_layoutcommit(data->lseg))
 		pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
 	return 0;
 }
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@ -662,9 +662,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	trace_nfs_getattr_enter(inode);
 	/* Flush out writes to the server in order to update c/mtime.  */
 	if (S_ISREG(inode->i_mode)) {
-		inode_lock(inode);
+		err = filemap_write_and_wait(inode->i_mapping);
 		err = nfs_sync_inode(inode);
 		inode_unlock(inode);
 		if (err)
 			goto out;
 	}
@ -879,7 +877,10 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
 	struct nfs_inode *nfsi = NFS_I(inode);
 	spin_lock(&inode->i_lock);
-	list_add(&ctx->list, &nfsi->open_files);
+	if (ctx->mode & FMODE_WRITE)
 		list_add(&ctx->list, &nfsi->open_files);
 	else
 		list_add_tail(&ctx->list, &nfsi->open_files);
 	spin_unlock(&inode->i_lock);
 }
 EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
@ -972,6 +973,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	if (NFS_STALE(inode))
 		goto out;
 	/* pNFS: Attributes aren't updated until we layoutcommit */
 	if (S_ISREG(inode->i_mode)) {
 		status = pnfs_sync_inode(inode, false);
 		if (status)
 			goto out;
 	}
 	status = -ENOMEM;
 	fattr = nfs_alloc_fattr();
 	if (fattr == NULL)
@ -1122,14 +1130,12 @@ out:
 }
 /**
- * __nfs_revalidate_mapping - Revalidate the pagecache
+ * nfs_revalidate_mapping - Revalidate the pagecache
 * @inode - pointer to host inode
 * @mapping - pointer to mapping
 * @may_lock - take inode->i_mutex?
 */
-static int __nfs_revalidate_mapping(struct inode *inode,
+int nfs_revalidate_mapping(struct inode *inode,
-		struct address_space *mapping,
+		struct address_space *mapping)
 		bool may_lock)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	unsigned long *bitlock = &nfsi->flags;
@ -1178,12 +1184,7 @@ static int __nfs_revalidate_mapping(struct inode *inode,
 	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
 	spin_unlock(&inode->i_lock);
 	trace_nfs_invalidate_mapping_enter(inode);
-	if (may_lock) {
+	ret = nfs_invalidate_mapping(inode, mapping);
 		inode_lock(inode);
 		ret = nfs_invalidate_mapping(inode, mapping);
 		inode_unlock(inode);
 	} else
 		ret = nfs_invalidate_mapping(inode, mapping);
 	trace_nfs_invalidate_mapping_exit(inode, ret);
 	clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
@ -1193,27 +1194,28 @@ out:
 	return ret;
 }
-/**
+static bool nfs_file_has_writers(struct nfs_inode *nfsi)
 * nfs_revalidate_mapping - Revalidate the pagecache
 * @inode - pointer to host inode
 * @mapping - pointer to mapping
 */
 int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 {
-	return __nfs_revalidate_mapping(inode, mapping, false);
+	struct inode *inode = &nfsi->vfs_inode;
 	assert_spin_locked(&inode->i_lock);
 	if (!S_ISREG(inode->i_mode))
 		return false;
 	if (list_empty(&nfsi->open_files))
 		return false;
 	/* Note: This relies on nfsi->open_files being ordered with writers
 	 *       being placed at the head of the list.
 	 *       See nfs_inode_attach_open_context()
 	 */
 	return (list_first_entry(&nfsi->open_files,
 			struct nfs_open_context,
 			list)->mode & FMODE_WRITE) == FMODE_WRITE;
 }
-/**
+static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi)
 * nfs_revalidate_mapping_protected - Revalidate the pagecache
 * @inode - pointer to host inode
 * @mapping - pointer to mapping
 *
 * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex
 * while invalidating the mapping.
 */
 int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping)
 {
-	return __nfs_revalidate_mapping(inode, mapping, true);
+	return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi);
 }
 static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
@ -1280,22 +1282,24 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
 		return -EIO;
-	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
+	if (!nfs_file_has_buffered_writers(nfsi)) {
-			inode->i_version != fattr->change_attr)
+		/* Verify a few of the more important attributes */
-		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+		if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode->i_version != fattr->change_attr)
 			invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE;
-	/* Verify a few of the more important attributes */
+		if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
-	if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
+			invalid |= NFS_INO_INVALID_ATTR;
 		invalid |= NFS_INO_INVALID_ATTR;
-	if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+		if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&inode->i_ctime, &fattr->ctime))
-		cur_size = i_size_read(inode);
+			invalid |= NFS_INO_INVALID_ATTR;
-		new_isize = nfs_size_to_loff_t(fattr->size);
+
-		if (cur_size != new_isize)
+		if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
-			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+			cur_size = i_size_read(inode);
 			new_isize = nfs_size_to_loff_t(fattr->size);
 			if (cur_size != new_isize)
 				invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 		}
 	}
 	if (nfsi->nrequests != 0)
 		invalid &= ~NFS_INO_REVAL_PAGECACHE;
 	/* Have any file permissions changed? */
 	if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
@ -1470,28 +1474,12 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
 		((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
 }
 /*
 * While a pnfs LAYOUTCOMMIT is outstanding, don't trust the
 * change_attribute, mtime, ctime or size: clear the corresponding
 * bits in fattr->valid.
 */
 static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode,
 		struct nfs_fattr *fattr)
 {
 	if (!pnfs_layoutcommit_outstanding(inode))
 		return;
 	fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE | NFS_ATTR_FATTR_MTIME |
 			  NFS_ATTR_FATTR_CTIME | NFS_ATTR_FATTR_SIZE);
 }
 static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
 {
 	int ret;
 	trace_nfs_refresh_inode_enter(inode);
 	nfs_inode_attrs_handle_layoutcommit(inode, fattr);
 	if (nfs_inode_attrs_need_update(inode, fattr))
 		ret = nfs_update_inode(inode, fattr);
 	else
@ -1527,7 +1515,7 @@ EXPORT_SYMBOL_GPL(nfs_refresh_inode);
 static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
 {
-	unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+	unsigned long invalid = NFS_INO_INVALID_ATTR;
 	/*
 	 * Don't revalidate the pagecache if we hold a delegation, but do
@ -1676,6 +1664,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	unsigned long invalid = 0;
 	unsigned long now = jiffies;
 	unsigned long save_cache_validity;
 	bool have_writers = nfs_file_has_buffered_writers(nfsi);
 	bool cache_revalidated = true;
 	dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",
@ -1725,17 +1714,25 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	/* Do atomic weak cache consistency updates */
 	invalid |= nfs_wcc_update_inode(inode, fattr);
 	if (pnfs_layoutcommit_outstanding(inode)) {
 		nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATTR;
 		cache_revalidated = false;
 	}
 	/* More cache consistency checks */
 	if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
 		if (inode->i_version != fattr->change_attr) {
 			dprintk("NFS: change_attr change on server for file %s/%ld\n",
 					inode->i_sb->s_id, inode->i_ino);
-			invalid |= NFS_INO_INVALID_ATTR
+			/* Could it be a race with writeback? */
-				| NFS_INO_INVALID_DATA
+			if (!have_writers) {
-				| NFS_INO_INVALID_ACCESS
+				invalid |= NFS_INO_INVALID_ATTR
-				| NFS_INO_INVALID_ACL;
+					| NFS_INO_INVALID_DATA
-			if (S_ISDIR(inode->i_mode))
+					| NFS_INO_INVALID_ACCESS
-				nfs_force_lookup_revalidate(inode);
+					| NFS_INO_INVALID_ACL;
 				if (S_ISDIR(inode->i_mode))
 					nfs_force_lookup_revalidate(inode);
 			}
 			inode->i_version = fattr->change_attr;
 		}
 	} else {
@ -1768,9 +1765,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		if (new_isize != cur_isize) {
 			/* Do we perhaps have any outstanding writes, or has
 			 * the file grown beyond our last write? */
-			if ((nfsi->nrequests == 0) || new_isize > cur_isize) {
+			if (nfsi->nrequests == 0 || new_isize > cur_isize) {
 				i_size_write(inode, new_isize);
-				invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+				if (!have_writers)
 					invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
 			}
 			dprintk("NFS: isize change on server for file %s/%ld "
 					"(%Ld to %Ld)\n",
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@ -66,13 +66,16 @@ struct nfs_clone_mount {
 struct nfs_client_initdata {
 	unsigned long init_flags;
-	const char *hostname;
+	const char *hostname;			/* Hostname of the server */
-	const struct sockaddr *addr;
+	const struct sockaddr *addr;		/* Address of the server */
 	const char *nodename;			/* Hostname of the client */
 	const char *ip_addr;			/* IP address of the client */
 	size_t addrlen;
 	struct nfs_subversion *nfs_mod;
 	int proto;
 	u32 minorversion;
 	struct net *net;
 	const struct rpc_timeout *timeparms;
 };
 /*
@ -147,9 +150,8 @@ extern void nfs_umount(const struct nfs_mount_request *info);
 extern const struct rpc_program nfs_program;
 extern void nfs_clients_init(struct net *net);
 extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *);
-int nfs_create_rpc_client(struct nfs_client *, const struct rpc_timeout *, rpc_authflavor_t);
+int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t);
 struct nfs_client *nfs_get_client(const struct nfs_client_initdata *,
 				  const struct rpc_timeout *, const char *,
 				  rpc_authflavor_t);
 int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);
 void nfs_server_insert_lists(struct nfs_server *);
@ -184,7 +186,7 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *,
 					   rpc_authflavor_t);
 extern int nfs_wait_client_init_complete(const struct nfs_client *clp);
 extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
-extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
+extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
 					     const struct sockaddr *ds_addr,
 					     int ds_addrlen, int ds_proto,
 					     unsigned int ds_timeo,
@ -193,7 +195,7 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 					     rpc_authflavor_t au_flavor);
 extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
 						struct inode *);
-extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
 			const struct sockaddr *ds_addr, int ds_addrlen,
 			int ds_proto, unsigned int ds_timeo,
 			unsigned int ds_retrans, rpc_authflavor_t au_flavor);
@ -338,8 +340,7 @@ nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src)
 /* proc.c */
 void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
 extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
-			   const struct rpc_timeout *timeparms,
+			   const struct nfs_client_initdata *);
 			   const char *ip_addr);
 /* dir.c */
 extern void nfs_force_use_readdirplus(struct inode *dir);
@ -411,6 +412,19 @@ extern void __exit unregister_nfs_fs(void);
 extern bool nfs_sb_active(struct super_block *sb);
 extern void nfs_sb_deactive(struct super_block *sb);
 /* io.c */
 extern void nfs_start_io_read(struct inode *inode);
 extern void nfs_end_io_read(struct inode *inode);
 extern void nfs_start_io_write(struct inode *inode);
 extern void nfs_end_io_write(struct inode *inode);
 extern void nfs_start_io_direct(struct inode *inode);
 extern void nfs_end_io_direct(struct inode *inode);
 /* Return true when the NFS_INO_ODIRECT bit is clear in nfsi->flags. */
 static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi)
 {
 	return !test_bit(NFS_INO_ODIRECT, &nfsi->flags);
 }
 /* namespace.c */
 #define NFS_PATH_CANONICAL 1
 extern char *nfs_path(char **p, struct dentry *dentry,
@ -496,9 +510,29 @@ void nfs_init_cinfo(struct nfs_commit_info *cinfo,
 		    struct inode *inode,
 		    struct nfs_direct_req *dreq);
 int nfs_key_timeout_notify(struct file *filp, struct inode *inode);
-bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx);
+bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode);
 void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio);
 int nfs_filemap_write_and_wait_range(struct address_space *mapping,
 		loff_t lstart, loff_t lend);
 #ifdef CONFIG_NFS_V4_1
 /*
 * Set direct_verf.committed to NFS_INVALID_STABLE_HOW in each of the
 * cinfo->nbuckets entries of cinfo->buckets.
 */
 static inline
 void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
 {
 	int idx = 0;
 	while (idx < cinfo->nbuckets) {
 		cinfo->buckets[idx].direct_verf.committed = NFS_INVALID_STABLE_HOW;
 		idx++;
 	}
 }
 #else
 /* Stub used when CONFIG_NFS_V4_1 is not defined: does nothing. */
 static inline
 void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
 {
 }
 #endif
 #ifdef CONFIG_MIGRATION
 extern int nfs_migrate_page(struct address_space *,
 		struct page *, struct page *, enum migrate_mode);
@ -506,6 +540,13 @@ extern int nfs_migrate_page(struct address_space *,
 #define nfs_migrate_page NULL
 #endif
 /*
 * Compare the data members of two write verifiers with memcmp().
 * Returns memcmp()'s result: zero when the bytes are identical,
 * non-zero otherwise.
 */
 static inline int
 nfs_write_verifier_cmp(const struct nfs_write_verifier *v1,
 		const struct nfs_write_verifier *v2)
 {
 	const size_t len = sizeof(v1->data);
 	return memcmp(v1->data, v2->data, len);
 }
 /* unlink.c */
 extern struct rpc_task *
 nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
@ -521,8 +562,7 @@ extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
 /* nfs4proc.c */
 extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
 extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
-			    const struct rpc_timeout *timeparms,
+			    const struct nfs_client_initdata *);
 			    const char *ip_addr);
 extern int nfs40_walk_client_list(struct nfs_client *clp,
 				struct nfs_client **result,
 				struct rpc_cred *cred);
--- a/fs/nfs/io.c
+++ b/fs/nfs/io.c
@ -0,0 +1,147 @@
 /*
 * Copyright (c) 2016 Trond Myklebust
 *
 * I/O and data path helper functionality.
 */
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/bitops.h>
 #include <linux/rwsem.h>
 #include <linux/fs.h>
 #include <linux/nfs_fs.h>
 #include "internal.h"
 /*
 * If NFS_INO_ODIRECT is set in nfsi->flags, clear it and then call
 * inode_dio_wait() on the inode. Does nothing when the flag is clear.
 * Call with exclusively locked inode->i_rwsem
 */
 static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode)
 {
 	if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags))
 		return;
 	clear_bit(NFS_INO_ODIRECT, &nfsi->flags);
 	inode_dio_wait(inode);
 }
 /**
 * nfs_start_io_read - declare the file is being used for buffered reads
 * @inode - file inode
 *
 * Declare that a buffered read operation is about to start, and ensure
 * that we block all direct I/O.
 * On exit, the function ensures that the NFS_INO_ODIRECT flag is unset,
 * and holds a shared lock on inode->i_rwsem to ensure that the flag
 * cannot be changed.
 * In practice, this means that buffered read operations are allowed to
 * execute in parallel, thanks to the shared lock, whereas direct I/O
 * operations need to wait to grab an exclusive lock in order to set
 * NFS_INO_ODIRECT.
 * Note that buffered writes and truncates both take a write lock on
 * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
 *
 * The shared lock is released by nfs_end_io_read().
 */
 void
 nfs_start_io_read(struct inode *inode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	/* Be an optimist! Fast path: flag already clear, keep the shared lock */
 	down_read(&inode->i_rwsem);
 	if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0)
 		return;
 	up_read(&inode->i_rwsem);
 	/*
 	 * Slow path: nfs_block_o_direct() must be called with i_rwsem held
 	 * exclusively, so retake the lock for writing before clearing the flag.
 	 */
 	down_write(&inode->i_rwsem);
 	nfs_block_o_direct(nfsi, inode);
 	/* Return holding the lock shared, exactly as on the fast path */
 	downgrade_write(&inode->i_rwsem);
 }
 /**
 * nfs_end_io_read - declare that the buffered read operation is done
 * @inode - file inode
 *
 * Declare that a buffered read operation is done, and release the shared
 * lock on inode->i_rwsem.
 * Counterpart of nfs_start_io_read(), which returns with that shared
 * lock held.
 */
 void
 nfs_end_io_read(struct inode *inode)
 {
 	up_read(&inode->i_rwsem);
 }
 /**
 * nfs_start_io_write - declare the file is being used for buffered writes
 * @inode - file inode
 *
 * Declare that a buffered write operation is about to start, and ensure
 * that we block all direct I/O.
 * On exit, the NFS_INO_ODIRECT flag is unset and the caller holds an
 * exclusive lock on inode->i_rwsem, which is released by
 * nfs_end_io_write().
 */
 void
 nfs_start_io_write(struct inode *inode)
 {
 	down_write(&inode->i_rwsem);
 	/* Exclusive lock held, as nfs_block_o_direct() requires */
 	nfs_block_o_direct(NFS_I(inode), inode);
 }
 /**
 * nfs_end_io_write - declare that the buffered write operation is done
 * @inode - file inode
 *
 * Declare that a buffered write operation is done, and release the
 * lock on inode->i_rwsem.
 * Counterpart of nfs_start_io_write(), which returns with that lock
 * held exclusively.
 */
 void
 nfs_end_io_write(struct inode *inode)
 {
 	up_write(&inode->i_rwsem);
 }
 /*
 * If NFS_INO_ODIRECT is not yet set in nfsi->flags, set it and then call
 * nfs_wb_all() on the inode; the return value of nfs_wb_all() is not
 * checked. Does nothing when the flag is already set.
 * Call with exclusively locked inode->i_rwsem
 */
 static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode)
 {
 	if (test_bit(NFS_INO_ODIRECT, &nfsi->flags))
 		return;
 	set_bit(NFS_INO_ODIRECT, &nfsi->flags);
 	nfs_wb_all(inode);
 }
 /**
 * nfs_start_io_direct - declare the file is being used for direct i/o
 * @inode - file inode
 *
 * Declare that a direct I/O operation is about to start, and ensure
 * that we block all buffered I/O.
 * On exit, the function ensures that the NFS_INO_ODIRECT flag is set,
 * and holds a shared lock on inode->i_rwsem to ensure that the flag
 * cannot be changed.
 * In practice, this means that direct I/O operations are allowed to
 * execute in parallel, thanks to the shared lock, whereas buffered I/O
 * operations need to wait to grab an exclusive lock in order to clear
 * NFS_INO_ODIRECT.
 * Note that buffered writes and truncates both take a write lock on
 * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
 *
 * The shared lock is released by nfs_end_io_direct().
 */
 void
 nfs_start_io_direct(struct inode *inode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	/* Be an optimist! Fast path: flag already set, keep the shared lock */
 	down_read(&inode->i_rwsem);
 	if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0)
 		return;
 	up_read(&inode->i_rwsem);
 	/*
 	 * Slow path: nfs_block_buffered() must be called with i_rwsem held
 	 * exclusively, so retake the lock for writing before setting the flag.
 	 */
 	down_write(&inode->i_rwsem);
 	nfs_block_buffered(nfsi, inode);
 	/* Return holding the lock shared, exactly as on the fast path */
 	downgrade_write(&inode->i_rwsem);
 }
 /**
 * nfs_end_io_direct - declare that the direct i/o operation is done
 * @inode - file inode
 *
 * Declare that a direct I/O operation is done, and release the shared
 * lock on inode->i_rwsem.
 * Counterpart of nfs_start_io_direct(), which returns with that shared
 * lock held.
 */
 void
 nfs_end_io_direct(struct inode *inode)
 {
 	up_read(&inode->i_rwsem);
 }
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@ -76,19 +76,23 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source,
 * low timeout interval so that if a connection is lost, we retry through
 * the MDS.
 */
-struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
 		const struct sockaddr *ds_addr, int ds_addrlen,
 		int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
 		rpc_authflavor_t au_flavor)
 {
 	struct rpc_timeout ds_timeout;
 	struct nfs_client *mds_clp = mds_srv->nfs_client;
 	struct nfs_client_initdata cl_init = {
 		.addr = ds_addr,
 		.addrlen = ds_addrlen,
 		.nodename = mds_clp->cl_rpcclient->cl_nodename,
 		.ip_addr = mds_clp->cl_ipaddr,
 		.nfs_mod = &nfs_v3,
 		.proto = ds_proto,
 		.net = mds_clp->cl_net,
 		.timeparms = &ds_timeout,
 	};
 	struct rpc_timeout ds_timeout;
 	struct nfs_client *clp;
 	char buf[INET6_ADDRSTRLEN + 1];
@ -97,10 +101,12 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
 		return ERR_PTR(-EINVAL);
 	cl_init.hostname = buf;
 	if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
 		set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
 	/* Use the MDS nfs_client cl_ipaddr. */
 	nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
-	clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
+	clp = nfs_get_client(&cl_init, au_flavor);
 			     au_flavor);
 	return clp;
 }
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@ -113,15 +113,17 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
 	if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE))
 		return -EOPNOTSUPP;
 	nfs_wb_all(inode);
 	inode_lock(inode);
 	err = nfs_sync_inode(inode);
 	if (err)
 		goto out_unlock;
 	err = nfs42_proc_fallocate(&msg, filep, offset, len);
 	if (err == 0)
 		truncate_pagecache_range(inode, offset, (offset + len) -1);
 	if (err == -EOPNOTSUPP)
 		NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
-
+out_unlock:
 	inode_unlock(inode);
 	return err;
 }
@ -154,11 +156,20 @@ static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src,
 	if (status)
 		return status;
 	status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping,
 			pos_src, pos_src + (loff_t)count - 1);
 	if (status)
 		return status;
 	status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context,
 				     dst_lock, FMODE_WRITE);
 	if (status)
 		return status;
 	status = nfs_sync_inode(dst_inode);
 	if (status)
 		return status;
 	status = nfs4_call_sync(server->client, server, &msg,
 				&args.seq_args, &res.seq_res, 0);
 	if (status == -ENOTSUPP)
@ -258,7 +269,11 @@ static loff_t _nfs42_proc_llseek(struct file *filep,
 	if (status)
 		return status;
-	nfs_wb_all(inode);
+	status = nfs_filemap_write_and_wait_range(inode->i_mapping,
 			offset, LLONG_MAX);
 	if (status)
 		return status;
 	status = nfs4_call_sync(server->client, server, &msg,
 				&args.seq_args, &res.seq_res, 0);
 	if (status == -ENOTSUPP)
@ -336,8 +351,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
 			 * Mark the bad layout state as invalid, then retry
 			 * with the current stateid.
 			 */
-			set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+			pnfs_mark_layout_stateid_invalid(lo, &head);
 			pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0);
 			spin_unlock(&inode->i_lock);
 			pnfs_free_lseg_list(&head);
 		} else
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@ -330,13 +330,21 @@ static int decode_write_response(struct xdr_stream *xdr,
 				 struct nfs42_write_res *res)
 {
 	__be32 *p;
 	int stateids;
 	p = xdr_inline_decode(xdr, 4 + 8 + 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	stateids = be32_to_cpup(p++);
+	/*
 	 * We never use asynchronous mode, so warn if a server returns
 	 * a stateid.
 	 */
 	if (unlikely(*p != 0)) {
 		pr_err_once("%s: server has set unrequested "
 				"asynchronous mode\n", __func__);
 		return -EREMOTEIO;
 	}
 	p++;
 	p = xdr_decode_hyper(p, &res->count);
 	res->verifier.committed = be32_to_cpup(p);
 	return decode_verifier(xdr, &res->verifier.verifier);
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@ -185,6 +185,7 @@ struct nfs4_state {
 struct nfs4_exception {
 	struct nfs4_state *state;
 	struct inode *inode;
 	nfs4_stateid *stateid;
 	long timeout;
 	unsigned char delay : 1,
 		      recovering : 1,
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@ -349,10 +349,10 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
 * Returns pointer to an NFS client, or an ERR_PTR value.
 */
 struct nfs_client *nfs4_init_client(struct nfs_client *clp,
-				    const struct rpc_timeout *timeparms,
+				    const struct nfs_client_initdata *cl_init)
 				    const char *ip_addr)
 {
 	char buf[INET6_ADDRSTRLEN + 1];
 	const char *ip_addr = cl_init->ip_addr;
 	struct nfs_client *old;
 	int error;
@ -370,9 +370,9 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
 	__set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
 	__set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
-	error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I);
+	error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I);
 	if (error == -EINVAL)
-		error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
+		error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
 	if (error < 0)
 		goto error;
@ -793,10 +793,12 @@ static int nfs4_set_client(struct nfs_server *server,
 		.hostname = hostname,
 		.addr = addr,
 		.addrlen = addrlen,
 		.ip_addr = ip_addr,
 		.nfs_mod = &nfs_v4,
 		.proto = proto,
 		.minorversion = minorversion,
 		.net = net,
 		.timeparms = timeparms,
 	};
 	struct nfs_client *clp;
 	int error;
@ -809,7 +811,7 @@ static int nfs4_set_client(struct nfs_server *server,
 		set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
 	/* Allocate or find a client reference we can use */
-	clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour);
+	clp = nfs_get_client(&cl_init, authflavour);
 	if (IS_ERR(clp)) {
 		error = PTR_ERR(clp);
 		goto error;
@ -842,20 +844,24 @@ error:
 * low timeout interval so that if a connection is lost, we retry through
 * the MDS.
 */
-struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
+struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
 		const struct sockaddr *ds_addr, int ds_addrlen,
 		int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
 		u32 minor_version, rpc_authflavor_t au_flavor)
 {
 	struct rpc_timeout ds_timeout;
 	struct nfs_client *mds_clp = mds_srv->nfs_client;
 	struct nfs_client_initdata cl_init = {
 		.addr = ds_addr,
 		.addrlen = ds_addrlen,
 		.nodename = mds_clp->cl_rpcclient->cl_nodename,
 		.ip_addr = mds_clp->cl_ipaddr,
 		.nfs_mod = &nfs_v4,
 		.proto = ds_proto,
 		.minorversion = minor_version,
 		.net = mds_clp->cl_net,
 		.timeparms = &ds_timeout,
 	};
 	struct rpc_timeout ds_timeout;
 	struct nfs_client *clp;
 	char buf[INET6_ADDRSTRLEN + 1];
@ -863,14 +869,16 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 		return ERR_PTR(-EINVAL);
 	cl_init.hostname = buf;
 	if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
 		__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
 	/*
 	 * Set an authflavor equual to the MDS value. Use the MDS nfs_client
 	 * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
 	 * (section 13.1 RFC 5661).
 	 */
 	nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
-	clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
+	clp = nfs_get_client(&cl_init, au_flavor);
 			     au_flavor);
 	dprintk("<-- %s %p\n", __func__, clp);
 	return clp;
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@ -66,7 +66,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
 	if (openflags & O_TRUNC) {
 		attr.ia_valid |= ATTR_SIZE;
 		attr.ia_size = 0;
-		nfs_sync_inode(inode);
+		filemap_write_and_wait(inode->i_mapping);
 	}
 	inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL);
@ -133,21 +133,9 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
 				    struct file *file_out, loff_t pos_out,
 				    size_t count, unsigned int flags)
 {
-	struct inode *in_inode = file_inode(file_in);
+	if (file_inode(file_in) == file_inode(file_out))
 	struct inode *out_inode = file_inode(file_out);
 	int ret;
 	if (in_inode == out_inode)
 		return -EINVAL;
 	/* flush any pending writes */
 	ret = nfs_sync_inode(in_inode);
 	if (ret)
 		return ret;
 	ret = nfs_sync_inode(out_inode);
 	if (ret)
 		return ret;
 	return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
 }
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@ -363,6 +363,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
 {
 	struct nfs_client *clp = server->nfs_client;
 	struct nfs4_state *state = exception->state;
 	const nfs4_stateid *stateid = exception->stateid;
 	struct inode *inode = exception->inode;
 	int ret = errorcode;
@ -376,9 +377,18 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
 		case -NFS4ERR_DELEG_REVOKED:
 		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_BAD_STATEID:
-			if (inode && nfs_async_inode_return_delegation(inode,
+			if (inode) {
-						NULL) == 0)
+				int err;
-				goto wait_on_recovery;
+
 				err = nfs_async_inode_return_delegation(inode,
 						stateid);
 				if (err == 0)
 					goto wait_on_recovery;
 				if (stateid != NULL && stateid->type == NFS4_DELEGATION_STATEID_TYPE) {
 					exception->retry = 1;
 					break;
 				}
 			}
 			if (state == NULL)
 				break;
 			ret = nfs4_schedule_stateid_recovery(server, state);
@ -427,6 +437,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
 		case -NFS4ERR_DELAY:
 			nfs_inc_server_stats(server, NFSIOS_DELAY);
 		case -NFS4ERR_GRACE:
 		case -NFS4ERR_LAYOUTTRYLATER:
 		case -NFS4ERR_RECALLCONFLICT:
 			exception->delay = 1;
 			return 0;
@ -2669,10 +2680,61 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
 	return res;
 }
-static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
+static int _nfs4_do_setattr(struct inode *inode,
-			    struct nfs_fattr *fattr, struct iattr *sattr,
+			    struct nfs_setattrargs *arg,
-			    struct nfs4_state *state, struct nfs4_label *ilabel,
+			    struct nfs_setattrres *res,
-			    struct nfs4_label *olabel)
+			    struct rpc_cred *cred,
 			    struct nfs4_state *state)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
        struct rpc_message msg = {
 		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
 		.rpc_argp	= arg,
 		.rpc_resp	= res,
 		.rpc_cred	= cred,
        };
 	struct rpc_cred *delegation_cred = NULL;
 	unsigned long timestamp = jiffies;
 	fmode_t fmode;
 	bool truncate;
 	int status;
 	nfs_fattr_init(res->fattr);
 	/* Servers should only apply open mode checks for file size changes */
 	truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false;
 	fmode = truncate ? FMODE_WRITE : FMODE_READ;
 	if (nfs4_copy_delegation_stateid(inode, fmode, &arg->stateid, &delegation_cred)) {
 		/* Use that stateid */
 	} else if (truncate && state != NULL) {
 		struct nfs_lockowner lockowner = {
 			.l_owner = current->files,
 			.l_pid = current->tgid,
 		};
 		if (!nfs4_valid_open_stateid(state))
 			return -EBADF;
 		if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner,
 				&arg->stateid, &delegation_cred) == -EIO)
 			return -EBADF;
 	} else
 		nfs4_stateid_copy(&arg->stateid, &zero_stateid);
 	if (delegation_cred)
 		msg.rpc_cred = delegation_cred;
 	status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1);
 	put_rpccred(delegation_cred);
 	if (status == 0 && state != NULL)
 		renew_lease(server, timestamp);
 	trace_nfs4_setattr(inode, &arg->stateid, status);
 	return status;
 }
 static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 			   struct nfs_fattr *fattr, struct iattr *sattr,
 			   struct nfs4_state *state, struct nfs4_label *ilabel,
 			   struct nfs4_label *olabel)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
        struct nfs_setattrargs  arg = {
@ -2687,67 +2749,19 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 		.label		= olabel,
 		.server		= server,
        };
-        struct rpc_message msg = {
+	struct nfs4_exception exception = {
-		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
+		.state = state,
-		.rpc_argp	= &arg,
+		.inode = inode,
-		.rpc_resp	= &res,
+		.stateid = &arg.stateid,
-		.rpc_cred	= cred,
+	};
-        };
+	int err;
 	struct rpc_cred *delegation_cred = NULL;
 	unsigned long timestamp = jiffies;
 	fmode_t fmode;
 	bool truncate;
 	int status;
 	arg.bitmask = nfs4_bitmask(server, ilabel);
 	if (ilabel)
 		arg.bitmask = nfs4_bitmask(server, olabel);
 	nfs_fattr_init(fattr);
 	/* Servers should only apply open mode checks for file size changes */
 	truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false;
 	fmode = truncate ? FMODE_WRITE : FMODE_READ;
 	if (nfs4_copy_delegation_stateid(inode, fmode, &arg.stateid, &delegation_cred)) {
 		/* Use that stateid */
 	} else if (truncate && state != NULL) {
 		struct nfs_lockowner lockowner = {
 			.l_owner = current->files,
 			.l_pid = current->tgid,
 		};
 		if (!nfs4_valid_open_stateid(state))
 			return -EBADF;
 		if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner,
 				&arg.stateid, &delegation_cred) == -EIO)
 			return -EBADF;
 	} else
 		nfs4_stateid_copy(&arg.stateid, &zero_stateid);
 	if (delegation_cred)
 		msg.rpc_cred = delegation_cred;
 	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
 	put_rpccred(delegation_cred);
 	if (status == 0 && state != NULL)
 		renew_lease(server, timestamp);
 	trace_nfs4_setattr(inode, &arg.stateid, status);
 	return status;
 }
 static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 			   struct nfs_fattr *fattr, struct iattr *sattr,
 			   struct nfs4_state *state, struct nfs4_label *ilabel,
 			   struct nfs4_label *olabel)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs4_exception exception = {
 		.state = state,
 		.inode = inode,
 	};
 	int err;
 	do {
-		err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
+		err = _nfs4_do_setattr(inode, &arg, &res, cred, state);
 		switch (err) {
 		case -NFS4ERR_OPENMODE:
 			if (!(sattr->ia_valid & ATTR_SIZE)) {
@ -3267,13 +3281,6 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
 	return status;
 }
 static int nfs4_do_find_root_sec(struct nfs_server *server,
 		struct nfs_fh *fhandle, struct nfs_fsinfo *info)
 {
 	int mv = server->nfs_client->cl_minorversion;
 	return nfs_v4_minor_ops[mv]->find_root_sec(server, fhandle, info);
 }
 /**
 * nfs4_proc_get_rootfh - get file handle for server's pseudoroot
 * @server: initialized nfs_server handle
@ -3293,7 +3300,8 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
 		status = nfs4_lookup_root(server, fhandle, info);
 	if (auth_probe || status == NFS4ERR_WRONGSEC)
-		status = nfs4_do_find_root_sec(server, fhandle, info);
+		status = server->nfs_client->cl_mvops->find_root_sec(server,
 				fhandle, info);
 	if (status == 0)
 		status = nfs4_server_capabilities(server, fhandle);
@ -4392,7 +4400,8 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
 				 struct rpc_message *msg)
 {
 	hdr->timestamp   = jiffies;
-	hdr->pgio_done_cb = nfs4_read_done_cb;
+	if (!hdr->pgio_done_cb)
 		hdr->pgio_done_cb = nfs4_read_done_cb;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
 	nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0);
 }
@ -7869,11 +7878,13 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
 	struct inode *inode = lgp->args.inode;
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct pnfs_layout_hdr *lo;
-	int status = task->tk_status;
+	int nfs4err = task->tk_status;
 	int err, status = 0;
 	LIST_HEAD(head);
 	dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status);
-	switch (status) {
+	switch (nfs4err) {
 	case 0:
 		goto out;
@ -7905,45 +7916,42 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
 			status = -EOVERFLOW;
 			goto out;
 		}
-		/* Fallthrough */
+		status = -EBUSY;
 		break;
 	case -NFS4ERR_RECALLCONFLICT:
 		nfs4_handle_exception(server, -NFS4ERR_RECALLCONFLICT,
 					exception);
 		status = -ERECALLCONFLICT;
-		goto out;
+		break;
 	case -NFS4ERR_EXPIRED:
 	case -NFS4ERR_BAD_STATEID:
 		exception->timeout = 0;
 		spin_lock(&inode->i_lock);
-		if (nfs4_stateid_match(&lgp->args.stateid,
+		lo = NFS_I(inode)->layout;
 		/* If the open stateid was bad, then recover it. */
 		if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
 		    nfs4_stateid_match_other(&lgp->args.stateid,
 					&lgp->args.ctx->state->stateid)) {
 			spin_unlock(&inode->i_lock);
 			/* If the open stateid was bad, then recover it. */
 			exception->state = lgp->args.ctx->state;
 			break;
 		}
 		lo = NFS_I(inode)->layout;
 		if (lo && !test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) &&
 		    nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) {
 			LIST_HEAD(head);
-			/*
+		/*
-			 * Mark the bad layout state as invalid, then retry
+		 * Mark the bad layout state as invalid, then retry
-			 * with the current stateid.
+		 */
-			 */
+		pnfs_mark_layout_stateid_invalid(lo, &head);
-			set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+		spin_unlock(&inode->i_lock);
-			pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0);
+		pnfs_free_lseg_list(&head);
-			spin_unlock(&inode->i_lock);
+		status = -EAGAIN;
-			pnfs_free_lseg_list(&head);
+		goto out;
 			status = -EAGAIN;
 			goto out;
 		} else
 			spin_unlock(&inode->i_lock);
 	}
-	status = nfs4_handle_exception(server, status, exception);
+	err = nfs4_handle_exception(server, nfs4err, exception);
-	if (exception->retry)
+	if (!status) {
-		status = -EAGAIN;
+		if (exception->retry)
 			status = -EAGAIN;
 		else
 			status = err;
 	}
 out:
 	dprintk("<-- %s\n", __func__);
 	return status;
@ -8129,8 +8137,7 @@ static void nfs4_layoutreturn_release(void *calldata)
 	spin_lock(&lo->plh_inode->i_lock);
 	pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range,
 			be32_to_cpu(lrp->args.stateid.seqid));
-	pnfs_mark_layout_returned_if_empty(lo);
+	if (lrp->res.lrs_present && pnfs_layout_is_valid(lo))
 	if (lrp->res.lrs_present)
 		pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
 	pnfs_clear_layoutreturn_waitbit(lo);
 	spin_unlock(&lo->plh_inode->i_lock);
@ -8835,7 +8842,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
 #endif
 };
-ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
+static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
 {
 	ssize_t error, error2;
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@ -1985,9 +1985,14 @@ encode_layoutcommit(struct xdr_stream *xdr,
 	p = xdr_encode_hyper(p, args->lastbytewritten + 1);	/* length */
 	*p = cpu_to_be32(0); /* reclaim */
 	encode_nfs4_stateid(xdr, &args->stateid);
-	p = reserve_space(xdr, 20);
+	if (args->lastbytewritten != U64_MAX) {
-	*p++ = cpu_to_be32(1); /* newoffset = TRUE */
+		p = reserve_space(xdr, 20);
-	p = xdr_encode_hyper(p, args->lastbytewritten);
+		*p++ = cpu_to_be32(1); /* newoffset = TRUE */
 		p = xdr_encode_hyper(p, args->lastbytewritten);
 	} else {
 		p = reserve_space(xdr, 12);
 		*p++ = cpu_to_be32(0); /* newoffset = FALSE */
 	}
 	*p++ = cpu_to_be32(0); /* Never send time_modify_changed */
 	*p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@ -37,7 +37,6 @@
 			{ 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \
 			{ 1 << NFS_INO_STALE, "STALE" }, \
 			{ 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \
 			{ 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
 			{ 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
 			{ 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \
 			{ 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" })
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@ -259,7 +259,7 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
 * is required.
 * Note that caller must hold inode->i_lock.
 */
-static int
+int
 pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
 		struct list_head *lseg_list)
 {
@ -334,14 +334,17 @@ pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
 }
 static void
-init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
+pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
 		const struct pnfs_layout_range *range,
 		const nfs4_stateid *stateid)
 {
 	INIT_LIST_HEAD(&lseg->pls_list);
 	INIT_LIST_HEAD(&lseg->pls_lc_list);
 	atomic_set(&lseg->pls_refcount, 1);
 	smp_mb();
 	set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
 	lseg->pls_layout = lo;
 	lseg->pls_range = *range;
 	lseg->pls_seq = be32_to_cpu(stateid->seqid);
 }
 static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
@ -486,15 +489,6 @@ pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
 	       (end2 == NFS4_MAX_UINT64 || end2 > start1);
 }
 static bool
 should_free_lseg(const struct pnfs_layout_range *lseg_range,
 		 const struct pnfs_layout_range *recall_range)
 {
 	return (recall_range->iomode == IOMODE_ANY ||
 		lseg_range->iomode == recall_range->iomode) &&
 	       pnfs_lseg_range_intersecting(lseg_range, recall_range);
 }
 static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
 		struct list_head *tmp_list)
 {
@ -533,6 +527,27 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
 	return (s32)(s1 - s2) > 0;
 }
 static bool
 pnfs_should_free_range(const struct pnfs_layout_range *lseg_range,
 		 const struct pnfs_layout_range *recall_range)
 {
 	return (recall_range->iomode == IOMODE_ANY ||
 		lseg_range->iomode == recall_range->iomode) &&
 	       pnfs_lseg_range_intersecting(lseg_range, recall_range);
 }
 static bool
 pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg,
 		const struct pnfs_layout_range *recall_range,
 		u32 seq)
 {
 	if (seq != 0 && pnfs_seqid_is_newer(lseg->pls_seq, seq))
 		return false;
 	if (recall_range == NULL)
 		return true;
 	return pnfs_should_free_range(&lseg->pls_range, recall_range);
 }
 /**
 * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later
 * @lo: layout header containing the lsegs
@ -562,10 +577,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 	if (list_empty(&lo->plh_segs))
 		return 0;
 	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
-		if (!recall_range ||
+		if (pnfs_match_lseg_recall(lseg, recall_range, seq)) {
 		    should_free_lseg(&lseg->pls_range, recall_range)) {
 			if (seq && pnfs_seqid_is_newer(lseg->pls_seq, seq))
 				continue;
 			dprintk("%s: freeing lseg %p iomode %d seq %u"
 				"offset %llu length %llu\n", __func__,
 				lseg, lseg->pls_range.iomode, lseg->pls_seq,
@ -761,24 +773,25 @@ void
 pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
 			bool update_barrier)
 {
-	u32 oldseq, newseq, new_barrier;
+	u32 oldseq, newseq, new_barrier = 0;
-	int empty = list_empty(&lo->plh_segs);
+	bool invalid = !pnfs_layout_is_valid(lo);
 	oldseq = be32_to_cpu(lo->plh_stateid.seqid);
 	newseq = be32_to_cpu(new->seqid);
-	if (empty || pnfs_seqid_is_newer(newseq, oldseq)) {
+	if (invalid || pnfs_seqid_is_newer(newseq, oldseq)) {
 		nfs4_stateid_copy(&lo->plh_stateid, new);
-		if (update_barrier) {
+		/*
-			new_barrier = be32_to_cpu(new->seqid);
+		 * Because of wraparound, we want to keep the barrier
-		} else {
+		 * "close" to the current seqids.
-			/* Because of wraparound, we want to keep the barrier
+		 */
-			 * "close" to the current seqids.
+		new_barrier = newseq - atomic_read(&lo->plh_outstanding);
 			 */
 			new_barrier = newseq - atomic_read(&lo->plh_outstanding);
 		}
 		if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
 			lo->plh_barrier = new_barrier;
 	}
 	if (update_barrier)
 		new_barrier = be32_to_cpu(new->seqid);
 	else if (new_barrier == 0)
 		return;
 	if (invalid || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
 		lo->plh_barrier = new_barrier;
 }
 static bool
@ -873,15 +886,37 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
 	rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
 }
 static void
 pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
 {
 	lo->plh_return_iomode = 0;
 	lo->plh_return_seq = 0;
 	clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
 }
 static bool
-pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
+pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
 		nfs4_stateid *stateid,
 		enum pnfs_iomode *iomode)
 {
 	if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
 		return false;
 	lo->plh_return_iomode = 0;
 	lo->plh_return_seq = 0;
 	pnfs_get_layout_hdr(lo);
-	clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+	if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
 		if (stateid != NULL) {
 			nfs4_stateid_copy(stateid, &lo->plh_stateid);
 			if (lo->plh_return_seq != 0)
 				stateid->seqid = cpu_to_be32(lo->plh_return_seq);
 		}
 		if (iomode != NULL)
 			*iomode = lo->plh_return_iomode;
 		pnfs_clear_layoutreturn_info(lo);
 		return true;
 	}
 	if (stateid != NULL)
 		nfs4_stateid_copy(stateid, &lo->plh_stateid);
 	if (iomode != NULL)
 		*iomode = IOMODE_ANY;
 	return true;
 }
@ -949,10 +984,7 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
 		enum pnfs_iomode iomode;
 		bool send;
-		nfs4_stateid_copy(&stateid, &lo->plh_stateid);
+		send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
 		stateid.seqid = cpu_to_be32(lo->plh_return_seq);
 		iomode = lo->plh_return_iomode;
 		send = pnfs_prepare_layoutreturn(lo);
 		spin_unlock(&inode->i_lock);
 		if (send) {
 			/* Send an async layoutreturn so we dont deadlock */
@ -989,7 +1021,6 @@ _pnfs_return_layout(struct inode *ino)
 		dprintk("NFS: %s no layout to return\n", __func__);
 		goto out;
 	}
 	nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid);
 	/* Reference matched in nfs4_layoutreturn_release */
 	pnfs_get_layout_hdr(lo);
 	empty = list_empty(&lo->plh_segs);
@ -1012,8 +1043,7 @@ _pnfs_return_layout(struct inode *ino)
 		goto out_put_layout_hdr;
 	}
-	set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+	send = pnfs_prepare_layoutreturn(lo, &stateid, NULL);
 	send = pnfs_prepare_layoutreturn(lo);
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&tmp_list);
 	if (send)
@ -1080,11 +1110,10 @@ bool pnfs_roc(struct inode *ino)
 			goto out_noroc;
 	}
 	nfs4_stateid_copy(&stateid, &lo->plh_stateid);
 	/* always send layoutreturn if being marked so */
-	if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED,
+	if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
-				   &lo->plh_flags))
+		layoutreturn = pnfs_prepare_layoutreturn(lo,
-		layoutreturn = pnfs_prepare_layoutreturn(lo);
+				&stateid, NULL);
 	list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
 		/* If we are sending layoutreturn, invalidate all valid lsegs */
@ -1132,7 +1161,6 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
 	spin_lock(&ino->i_lock);
 	lo = NFS_I(ino)->layout;
 	pnfs_mark_layout_returned_if_empty(lo);
 	if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
 		lo->plh_barrier = barrier;
 	spin_unlock(&ino->i_lock);
@ -1505,7 +1533,7 @@ pnfs_update_layout(struct inode *ino,
 	struct pnfs_layout_segment *lseg = NULL;
 	nfs4_stateid stateid;
 	long timeout = 0;
-	unsigned long giveup = jiffies + rpc_get_timeout(server->client);
+	unsigned long giveup = jiffies + (clp->cl_lease_time << 1);
 	bool first;
 	if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
@ -1645,33 +1673,44 @@ lookup_again:
 	lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags);
 	trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
 				 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
 	atomic_dec(&lo->plh_outstanding);
 	if (IS_ERR(lseg)) {
 		switch(PTR_ERR(lseg)) {
-		case -ERECALLCONFLICT:
+		case -EBUSY:
 			if (time_after(jiffies, giveup))
 				lseg = NULL;
 			break;
 		case -ERECALLCONFLICT:
 			/* Huh? We hold no layouts, how is there a recall? */
 			if (first) {
 				lseg = NULL;
 				break;
 			}
 			/* Destroy the existing layout and start over */
 			if (time_after(jiffies, giveup))
 				pnfs_destroy_layout(NFS_I(ino));
 			/* Fallthrough */
 		case -EAGAIN:
-			pnfs_put_layout_hdr(lo);
+			break;
 			if (first)
 				pnfs_clear_first_layoutget(lo);
 			if (lseg) {
 				trace_pnfs_update_layout(ino, pos, count,
 					iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
 				goto lookup_again;
 			}
 			/* Fallthrough */
 		default:
 			if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
 				pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
 				lseg = NULL;
 			}
 			goto out_put_layout_hdr;
 		}
 		if (lseg) {
 			if (first)
 				pnfs_clear_first_layoutget(lo);
 			trace_pnfs_update_layout(ino, pos, count,
 				iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
 			pnfs_put_layout_hdr(lo);
 			goto lookup_again;
 		}
 	} else {
 		pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
 	}
 	atomic_dec(&lo->plh_outstanding);
 out_put_layout_hdr:
 	if (first)
 		pnfs_clear_first_layoutget(lo);
@ -1735,9 +1774,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 		return lseg;
 	}
-	init_lseg(lo, lseg);
+	pnfs_init_lseg(lo, lseg, &res->range, &res->stateid);
 	lseg->pls_range = res->range;
 	lseg->pls_seq = be32_to_cpu(res->stateid.seqid);
 	spin_lock(&ino->i_lock);
 	if (pnfs_layoutgets_blocked(lo)) {
@ -1758,16 +1795,19 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 		 * inode invalid, and don't bother validating the stateid
 		 * sequence number.
 		 */
-		pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL, 0);
+		pnfs_mark_layout_stateid_invalid(lo, &free_me);
 		nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
 		lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
 	}
 	clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
 	pnfs_get_lseg(lseg);
 	pnfs_layout_insert_lseg(lo, lseg, &free_me);
 	if (!pnfs_layout_is_valid(lo)) {
 		pnfs_clear_layoutreturn_info(lo);
 		clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
 	}
 	if (res->return_on_close)
 		set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
@ -1787,14 +1827,14 @@ static void
 pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
 			 u32 seq)
 {
-	if (lo->plh_return_iomode == iomode)
+	if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode)
 		return;
 	if (lo->plh_return_iomode != 0)
 		iomode = IOMODE_ANY;
 	lo->plh_return_iomode = iomode;
 	set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
-	if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq))
+	if (seq != 0) {
 		WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq);
 		lo->plh_return_seq = seq;
 	}
 }
 /**
@ -1824,7 +1864,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
 	assert_spin_locked(&lo->plh_inode->i_lock);
 	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
-		if (should_free_lseg(&lseg->pls_range, return_range)) {
+		if (pnfs_match_lseg_recall(lseg, return_range, seq)) {
 			dprintk("%s: marking lseg %p iomode %d "
 				"offset %llu length %llu\n", __func__,
 				lseg, lseg->pls_range.iomode,
@ -1855,19 +1895,17 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
 	bool return_now = false;
 	spin_lock(&inode->i_lock);
-	pnfs_set_plh_return_info(lo, range.iomode, lseg->pls_seq);
+	pnfs_set_plh_return_info(lo, range.iomode, 0);
 	/*
 	 * mark all matching lsegs so that we are sure to have no live
 	 * segments at hand when sending layoutreturn. See pnfs_put_lseg()
 	 * for how it works.
 	 */
-	if (!pnfs_mark_matching_lsegs_return(lo, &free_me,
+	if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0)) {
 						&range, lseg->pls_seq)) {
 		nfs4_stateid stateid;
-		enum pnfs_iomode iomode = lo->plh_return_iomode;
+		enum pnfs_iomode iomode;
-		nfs4_stateid_copy(&stateid, &lo->plh_stateid);
+		return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
 		return_now = pnfs_prepare_layoutreturn(lo);
 		spin_unlock(&inode->i_lock);
 		if (return_now)
 			pnfs_send_layoutreturn(lo, &stateid, iomode, false);
@ -2382,7 +2420,10 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 	nfs_fattr_init(&data->fattr);
 	data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
 	data->res.fattr = &data->fattr;
-	data->args.lastbytewritten = end_pos - 1;
+	if (end_pos != 0)
 		data->args.lastbytewritten = end_pos - 1;
 	else
 		data->args.lastbytewritten = U64_MAX;
 	data->res.server = NFS_SERVER(inode);
 	if (ld->prepare_layoutcommit) {
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@ -268,6 +268,8 @@ int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
 				struct list_head *tmp_list,
 				const struct pnfs_layout_range *recall_range,
 				u32 seq);
 int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
 		struct list_head *lseg_list);
 bool pnfs_roc(struct inode *ino);
 void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
@ -375,6 +377,11 @@ static inline bool nfs_have_layout(struct inode *inode)
 	return NFS_I(inode)->layout != NULL;
 }
 static inline bool pnfs_layout_is_valid(const struct pnfs_layout_hdr *lo)
 {
 	return test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) == 0;
 }
 static inline struct nfs4_deviceid_node *
 nfs4_get_deviceid(struct nfs4_deviceid_node *d)
 {
@ -545,19 +552,6 @@ pnfs_calc_offset_length(u64 offset, u64 end)
 	return 1 + end - offset;
 }
 /**
 * pnfs_mark_layout_returned_if_empty - marks the layout as returned
 * @lo: layout header
 *
 * Note: Caller must hold inode->i_lock
 */
 static inline void
 pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo)
 {
 	if (list_empty(&lo->plh_segs))
 		set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
 }
 static inline void
 pnfs_copy_range(struct pnfs_layout_range *dst,
 		const struct pnfs_layout_range *src)
@ -628,6 +622,13 @@ pnfs_sync_inode(struct inode *inode, bool datasync)
 	return 0;
 }
 static inline bool
 pnfs_layoutcommit_outstanding(struct inode *inode)
 {
 	return false;
 }
 static inline bool
 pnfs_roc(struct inode *ino)
 {
@ -716,13 +717,6 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
 	return false;
 }
 static inline bool
 pnfs_layoutcommit_outstanding(struct inode *inode)
 {
 	return false;
 }
 static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
 {
 	return NULL;
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@ -595,7 +595,7 @@ static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
 }
 static struct nfs_client *(*get_v3_ds_connect)(
-			struct nfs_client *mds_clp,
+			struct nfs_server *mds_srv,
 			const struct sockaddr *ds_addr,
 			int ds_addrlen,
 			int ds_proto,
@ -654,7 +654,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
 			rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
 					rpc_clnt_test_and_add_xprt, NULL);
 		} else
-			clp = get_v3_ds_connect(mds_srv->nfs_client,
+			clp = get_v3_ds_connect(mds_srv,
 					(struct sockaddr *)&da->da_addr,
 					da->da_addrlen, IPPROTO_TCP,
 					timeo, retrans, au_flavor);
@ -690,7 +690,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
 		dprintk("%s: DS %s: trying address %s\n",
 			__func__, ds->ds_remotestr, da->da_remotestr);
-		clp = nfs4_set_ds_client(mds_srv->nfs_client,
+		clp = nfs4_set_ds_client(mds_srv,
 					(struct sockaddr *)&da->da_addr,
 					da->da_addrlen, IPPROTO_TCP,
 					timeo, retrans, minor_version,
@ -940,6 +940,13 @@ EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
 int
 pnfs_nfs_generic_sync(struct inode *inode, bool datasync)
 {
 	int ret;
 	if (!pnfs_layoutcommit_outstanding(inode))
 		return 0;
 	ret = nfs_commit_inode(inode, FLUSH_SYNC);
 	if (ret < 0)
 		return ret;
 	if (datasync)
 		return 0;
 	return pnfs_layoutcommit_inode(inode, true);
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@ -1684,6 +1684,7 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
 {
 	rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR;
 	unsigned int i;
 	int use_auth_null = false;
 	/*
 	 * If the sec= mount option is used, the specified flavor or AUTH_NULL
@ -1691,14 +1692,21 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
 	 *
 	 * AUTH_NULL has a special meaning when it's in the server list - it
 	 * means that the server will ignore the rpc creds, so any flavor
-	 * can be used.
+	 * can be used but still use the sec= that was specified.
 	 */
 	for (i = 0; i < count; i++) {
 		flavor = server_authlist[i];
-		if (nfs_auth_info_match(&args->auth_info, flavor) ||
+		if (nfs_auth_info_match(&args->auth_info, flavor))
 		    flavor == RPC_AUTH_NULL)
 			goto out;
 		if (flavor == RPC_AUTH_NULL)
 			use_auth_null = true;
 	}
 	if (use_auth_null) {
 		flavor = RPC_AUTH_NULL;
 		goto out;
 	}
 	dfprintk(MOUNT,
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@ -625,7 +625,7 @@ static int nfs_writepage_locked(struct page *page,
 	int err;
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
-	nfs_pageio_init_write(&pgio, inode, wb_priority(wbc),
+	nfs_pageio_init_write(&pgio, inode, 0,
 				false, &nfs_async_write_completion_ops);
 	err = nfs_do_writepage(page, wbc, &pgio, launder);
 	nfs_pageio_complete(&pgio);
@ -657,16 +657,9 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
 int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
 	struct inode *inode = mapping->host;
 	unsigned long *bitlock = &NFS_I(inode)->flags;
 	struct nfs_pageio_descriptor pgio;
 	int err;
 	/* Stop dirtying of new pages while we sync */
 	err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING,
 			nfs_wait_bit_killable, TASK_KILLABLE);
 	if (err)
 		goto out_err;
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
 	nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false,
@ -674,10 +667,6 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
 	nfs_pageio_complete(&pgio);
 	clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
 	smp_mb__after_atomic();
 	wake_up_bit(bitlock, NFS_INO_FLUSHING);
 	if (err < 0)
 		goto out_err;
 	err = pgio.pg_error;
@ -1195,9 +1184,11 @@ nfs_key_timeout_notify(struct file *filp, struct inode *inode)
 /*
 * Test if the open context credential key is marked to expire soon.
 */
-bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx)
+bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode)
 {
-	return rpcauth_cred_key_to_expire(ctx->cred);
+	struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth;
 	return rpcauth_cred_key_to_expire(auth, ctx->cred);
 }
 /*
@ -1289,6 +1280,9 @@ int nfs_updatepage(struct file *file, struct page *page,
 	dprintk("NFS:       nfs_updatepage(%pD2 %d@%lld)\n",
 		file, count, (long long)(page_file_offset(page) + offset));
 	if (!count)
 		goto out;
 	if (nfs_can_extend_write(file, page, inode)) {
 		count = max(count + offset, nfs_page_length(page));
 		offset = 0;
@ -1299,7 +1293,7 @@ int nfs_updatepage(struct file *file, struct page *page,
 		nfs_set_pageerror(page);
 	else
 		__set_page_dirty_nobuffers(page);
-
+out:
 	dprintk("NFS:       nfs_updatepage returns %d (isize %lld)\n",
 			status, (long long)i_size_read(inode));
 	return status;
@ -1800,7 +1794,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
 		/* Okay, COMMIT succeeded, apparently. Check the verifier
 		 * returned by the server against all stored verfs. */
-		if (!memcmp(&req->wb_verf, &data->verf.verifier, sizeof(req->wb_verf))) {
+		if (!nfs_write_verifier_cmp(&req->wb_verf, &data->verf.verifier)) {
 			/* We have a match */
 			nfs_inode_remove_request(req);
 			dprintk(" OK\n");
@ -1923,6 +1917,24 @@ out_mark_dirty:
 }
 EXPORT_SYMBOL_GPL(nfs_write_inode);
 /*
 * Wrapper for filemap_write_and_wait_range()
 *
 * Needed for pNFS in order to ensure data becomes visible to the
 * client.
 */
 int nfs_filemap_write_and_wait_range(struct address_space *mapping,
 		loff_t lstart, loff_t lend)
 {
 	int ret;
 	ret = filemap_write_and_wait_range(mapping, lstart, lend);
 	if (ret == 0)
 		ret = pnfs_sync_inode(mapping->host, true);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nfs_filemap_write_and_wait_range);
 /*
 * flush the inode to disk.
 */
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@ -205,12 +205,12 @@ struct nfs_inode {
 #define NFS_INO_STALE		(1)		/* possible stale inode */
 #define NFS_INO_ACL_LRU_SET	(2)		/* Inode is on the LRU list */
 #define NFS_INO_INVALIDATING	(3)		/* inode is being invalidated */
 #define NFS_INO_FLUSHING	(4)		/* inode is flushing out data */
 #define NFS_INO_FSCACHE		(5)		/* inode can be cached by FS-Cache */
 #define NFS_INO_FSCACHE_LOCK	(6)		/* FS-Cache cookie management lock */
 #define NFS_INO_LAYOUTCOMMIT	(9)		/* layoutcommit required */
 #define NFS_INO_LAYOUTCOMMITTING (10)		/* layoutcommit inflight */
 #define NFS_INO_LAYOUTSTATS	(11)		/* layoutstats inflight */
 #define NFS_INO_ODIRECT		(12)		/* I/O setting is O_DIRECT */
 static inline struct nfs_inode *NFS_I(const struct inode *inode)
 {
@ -351,7 +351,6 @@ extern int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *ino
 extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
 extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
 extern int nfs_revalidate_mapping_rcu(struct inode *inode);
 extern int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping);
 extern int nfs_setattr(struct dentry *, struct iattr *);
 extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *);
 extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@ -1596,9 +1596,8 @@ struct nfs_rpc_ops {
 	int (*have_delegation)(struct inode *, fmode_t);
 	int (*return_delegation)(struct inode *);
 	struct nfs_client *(*alloc_client) (const struct nfs_client_initdata *);
-	struct nfs_client *
+	struct nfs_client *(*init_client) (struct nfs_client *,
-		(*init_client) (struct nfs_client *, const struct rpc_timeout *,
+				const struct nfs_client_initdata *);
 				const char *);
 	void	(*free_client) (struct nfs_client *);
 	struct nfs_server *(*create_server)(struct nfs_mount_info *, struct nfs_subversion *);
 	struct nfs_server *(*clone_server)(struct nfs_server *, struct nfs_fh *,
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@ -37,7 +37,6 @@ struct rpcsec_gss_info;
 /* auth_cred ac_flags bits */
 enum {
 	RPC_CRED_NO_CRKEY_TIMEOUT = 0, /* underlying cred has no key timeout */
 	RPC_CRED_KEY_EXPIRE_SOON = 1, /* underlying cred key will expire soon */
 	RPC_CRED_NOTIFY_TIMEOUT = 2,   /* nofity generic cred when underlying
 					key will expire soon */
@ -82,6 +81,9 @@ struct rpc_cred {
 #define RPCAUTH_CRED_MAGIC	0x0f4aa4f0
 /* rpc_auth au_flags */
 #define RPCAUTH_AUTH_NO_CRKEY_TIMEOUT	0x0001 /* underlying cred has no key timeout */
 /*
 * Client authentication handle
 */
@ -107,6 +109,9 @@ struct rpc_auth {
 	/* per-flavor data */
 };
 /* rpc_auth au_flags */
 #define RPCAUTH_AUTH_DATATOUCH	0x00000002
 struct rpc_auth_create_args {
 	rpc_authflavor_t pseudoflavor;
 	const char *target_name;
@ -196,7 +201,7 @@ void			rpcauth_destroy_credcache(struct rpc_auth *);
 void			rpcauth_clear_credcache(struct rpc_cred_cache *);
 int			rpcauth_key_timeout_notify(struct rpc_auth *,
 						struct rpc_cred *);
-bool			rpcauth_cred_key_to_expire(struct rpc_cred *);
+bool			rpcauth_cred_key_to_expire(struct rpc_auth *, struct rpc_cred *);
 char *			rpcauth_stringify_acceptor(struct rpc_cred *);
 static inline
--- a/include/linux/sunrpc/gss_api.h
+++ b/include/linux/sunrpc/gss_api.h
@ -73,6 +73,7 @@ u32 gss_delete_sec_context(
 rpc_authflavor_t gss_svc_to_pseudoflavor(struct gss_api_mech *, u32 qop,
 					u32 service);
 u32 gss_pseudoflavor_to_service(struct gss_api_mech *, u32 pseudoflavor);
 bool gss_pseudoflavor_to_datatouch(struct gss_api_mech *, u32 pseudoflavor);
 char *gss_service_to_auth_domain_name(struct gss_api_mech *, u32 service);
 struct pf_desc {
@ -81,6 +82,7 @@ struct pf_desc {
 	u32	service;
 	char	*name;
 	char	*auth_domain_name;
 	bool	datatouch;
 };
 /* Different mechanisms (e.g., krb5 or spkm3) may implement gss-api, and
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@ -230,6 +230,10 @@ void		rpc_wake_up_queued_task(struct rpc_wait_queue *,
 					struct rpc_task *);
 void		rpc_wake_up(struct rpc_wait_queue *);
 struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *);
 struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq,
 					struct rpc_wait_queue *,
 					bool (*)(struct rpc_task *, void *),
 					void *);
 struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *,
 					bool (*)(struct rpc_task *, void *),
 					void *);
@ -247,6 +251,7 @@ void		rpc_show_tasks(struct net *);
 int		rpc_init_mempool(void);
 void		rpc_destroy_mempool(void);
 extern struct workqueue_struct *rpciod_workqueue;
 extern struct workqueue_struct *xprtiod_workqueue;
 void		rpc_prepare_task(struct rpc_task *task);
 static inline int rpc_wait_for_completion_task(struct rpc_task *task)
--- a/include/linux/sunrpc/xprtsock.h
+++ b/include/linux/sunrpc/xprtsock.h
@ -80,6 +80,7 @@ struct sock_xprt {
 #define TCP_RPC_REPLY		(1UL << 6)
 #define XPRT_SOCK_CONNECTING	1U
 #define XPRT_SOCK_DATA_READY	(2)
 #endif /* __KERNEL__ */
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@ -51,9 +51,7 @@ static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp)
 	ret = kstrtoul(val, 0, &num);
 	if (ret == -EINVAL)
 		goto out_inval;
-	nbits = fls(num);
+	nbits = fls(num - 1);
 	if (num > (1U << nbits))
 		nbits++;
 	if (nbits > MAX_HASHTABLE_BITS || nbits < 2)
 		goto out_inval;
 	*(unsigned int *)kp->arg = nbits;
@ -359,8 +357,10 @@ rpcauth_key_timeout_notify(struct rpc_auth *auth, struct rpc_cred *cred)
 EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify);
 bool
-rpcauth_cred_key_to_expire(struct rpc_cred *cred)
+rpcauth_cred_key_to_expire(struct rpc_auth *auth, struct rpc_cred *cred)
 {
 	if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT)
 		return false;
 	if (!cred->cr_ops->crkey_to_expire)
 		return false;
 	return cred->cr_ops->crkey_to_expire(cred);
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@ -224,7 +224,7 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
 	/* Fast track for non crkey_timeout (no key) underlying credentials */
-	if (test_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags))
+	if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT)
 		return 0;
 	/* Fast track for the normal case */
@ -236,12 +236,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
 	if (IS_ERR(tcred))
 		return -EACCES;
 	if (!tcred->cr_ops->crkey_timeout) {
 		set_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags);
 		ret = 0;
 		goto out_put;
 	}
 	/* Test for the almost error case */
 	ret = tcred->cr_ops->crkey_timeout(tcred);
 	if (ret != 0) {
@ -257,7 +251,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
 		set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags);
 	}
 out_put:
 	put_rpccred(tcred);
 	return ret;
 }
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@ -1015,8 +1015,11 @@ gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
 	auth = &gss_auth->rpc_auth;
 	auth->au_cslack = GSS_CRED_SLACK >> 2;
 	auth->au_rslack = GSS_VERF_SLACK >> 2;
 	auth->au_flags = 0;
 	auth->au_ops = &authgss_ops;
 	auth->au_flavor = flavor;
 	if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor))
 		auth->au_flags |= RPCAUTH_AUTH_DATATOUCH;
 	atomic_set(&auth->au_count, 1);
 	kref_init(&gss_auth->kref);
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@ -745,12 +745,14 @@ static struct pf_desc gss_kerberos_pfs[] = {
 		.qop = GSS_C_QOP_DEFAULT,
 		.service = RPC_GSS_SVC_INTEGRITY,
 		.name = "krb5i",
 		.datatouch = true,
 	},
 	[2] = {
 		.pseudoflavor = RPC_AUTH_GSS_KRB5P,
 		.qop = GSS_C_QOP_DEFAULT,
 		.service = RPC_GSS_SVC_PRIVACY,
 		.name = "krb5p",
 		.datatouch = true,
 	},
 };
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@ -361,6 +361,18 @@ gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor)
 }
 EXPORT_SYMBOL(gss_pseudoflavor_to_service);
 bool
 gss_pseudoflavor_to_datatouch(struct gss_api_mech *gm, u32 pseudoflavor)
 {
 	int i;
 	for (i = 0; i < gm->gm_pf_num; i++) {
 		if (gm->gm_pfs[i].pseudoflavor == pseudoflavor)
 			return gm->gm_pfs[i].datatouch;
 	}
 	return false;
 }
 char *
 gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service)
 {
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@ -115,6 +115,7 @@ static
 struct rpc_auth null_auth = {
 	.au_cslack	= NUL_CALLSLACK,
 	.au_rslack	= NUL_REPLYSLACK,
 	.au_flags	= RPCAUTH_AUTH_NO_CRKEY_TIMEOUT,
 	.au_ops		= &authnull_ops,
 	.au_flavor	= RPC_AUTH_NULL,
 	.au_count	= ATOMIC_INIT(0),
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@ -228,6 +228,7 @@ static
 struct rpc_auth		unix_auth = {
 	.au_cslack	= UNX_CALLSLACK,
 	.au_rslack	= NUL_REPLYSLACK,
 	.au_flags	= RPCAUTH_AUTH_NO_CRKEY_TIMEOUT,
 	.au_ops		= &authunix_ops,
 	.au_flavor	= RPC_AUTH_UNIX,
 	.au_count	= ATOMIC_INIT(0),
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@ -2577,7 +2577,7 @@ static void rpc_cb_add_xprt_release(void *calldata)
 	kfree(data);
 }
-const static struct rpc_call_ops rpc_cb_add_xprt_call_ops = {
+static const struct rpc_call_ops rpc_cb_add_xprt_call_ops = {
 	.rpc_call_done = rpc_cb_add_xprt_done,
 	.rpc_release = rpc_cb_add_xprt_release,
 };
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@ -54,7 +54,8 @@ static struct rpc_wait_queue delay_queue;
 /*
 * rpciod-related stuff
 */
-struct workqueue_struct *rpciod_workqueue;
+struct workqueue_struct *rpciod_workqueue __read_mostly;
 struct workqueue_struct *xprtiod_workqueue __read_mostly;
 /*
 * Disable the timer for a given RPC task. Should be called with
@ -329,7 +330,8 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task);
 * lockless RPC_IS_QUEUED() test) before we've had a chance to test
 * the RPC_TASK_RUNNING flag.
 */
-static void rpc_make_runnable(struct rpc_task *task)
+static void rpc_make_runnable(struct workqueue_struct *wq,
 		struct rpc_task *task)
 {
 	bool need_wakeup = !rpc_test_and_set_running(task);
@ -338,7 +340,7 @@ static void rpc_make_runnable(struct rpc_task *task)
 		return;
 	if (RPC_IS_ASYNC(task)) {
 		INIT_WORK(&task->u.tk_work, rpc_async_schedule);
-		queue_work(rpciod_workqueue, &task->u.tk_work);
+		queue_work(wq, &task->u.tk_work);
 	} else
 		wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
 }
@ -407,13 +409,16 @@ void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task,
 EXPORT_SYMBOL_GPL(rpc_sleep_on_priority);
 /**
- * __rpc_do_wake_up_task - wake up a single rpc_task
+ * __rpc_do_wake_up_task_on_wq - wake up a single rpc_task
 * @wq: workqueue on which to run task
 * @queue: wait queue
 * @task: task to be woken up
 *
 * Caller must hold queue->lock, and have cleared the task queued flag.
 */
-static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task *task)
+static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq,
 		struct rpc_wait_queue *queue,
 		struct rpc_task *task)
 {
 	dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n",
 			task->tk_pid, jiffies);
@ -428,7 +433,7 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task
 	__rpc_remove_wait_queue(queue, task);
-	rpc_make_runnable(task);
+	rpc_make_runnable(wq, task);
 	dprintk("RPC:       __rpc_wake_up_task done\n");
 }
@ -436,15 +441,24 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task
 /*
 * Wake up a queued task while the queue lock is being held
 */
-static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
+static void rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq,
 		struct rpc_wait_queue *queue, struct rpc_task *task)
 {
 	if (RPC_IS_QUEUED(task)) {
 		smp_rmb();
 		if (task->tk_waitqueue == queue)
-			__rpc_do_wake_up_task(queue, task);
+			__rpc_do_wake_up_task_on_wq(wq, queue, task);
 	}
 }
 /*
 * Wake up a queued task while the queue lock is being held
 */
 static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
 {
 	rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task);
 }
 /*
 * Wake up a task on a specific queue
 */
@ -518,7 +532,8 @@ static struct rpc_task *__rpc_find_next_queued(struct rpc_wait_queue *queue)
 /*
 * Wake up the first task on the wait queue.
 */
-struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
+struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq,
 		struct rpc_wait_queue *queue,
 		bool (*func)(struct rpc_task *, void *), void *data)
 {
 	struct rpc_task	*task = NULL;
@ -529,7 +544,7 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
 	task = __rpc_find_next_queued(queue);
 	if (task != NULL) {
 		if (func(task, data))
-			rpc_wake_up_task_queue_locked(queue, task);
+			rpc_wake_up_task_on_wq_queue_locked(wq, queue, task);
 		else
 			task = NULL;
 	}
@ -537,6 +552,15 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
 	return task;
 }
 /*
 * Wake up the first task on the wait queue.
 */
 struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
 		bool (*func)(struct rpc_task *, void *), void *data)
 {
 	return rpc_wake_up_first_on_wq(rpciod_workqueue, queue, func, data);
 }
 EXPORT_SYMBOL_GPL(rpc_wake_up_first);
 static bool rpc_wake_up_next_func(struct rpc_task *task, void *data)
@ -814,7 +838,7 @@ void rpc_execute(struct rpc_task *task)
 	bool is_async = RPC_IS_ASYNC(task);
 	rpc_set_active(task);
-	rpc_make_runnable(task);
+	rpc_make_runnable(rpciod_workqueue, task);
 	if (!is_async)
 		__rpc_execute(task);
 }
@ -1071,10 +1095,22 @@ static int rpciod_start(void)
 	 * Create the rpciod thread and wait for it to start.
 	 */
 	dprintk("RPC:       creating workqueue rpciod\n");
-	/* Note: highpri because network receive is latency sensitive */
+	wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 0);
-	wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+	if (!wq)
 		goto out_failed;
 	rpciod_workqueue = wq;
-	return rpciod_workqueue != NULL;
+	/* Note: highpri because network receive is latency sensitive */
 	wq = alloc_workqueue("xprtiod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
 	if (!wq)
 		goto free_rpciod;
 	xprtiod_workqueue = wq;
 	return 1;
 free_rpciod:
 	wq = rpciod_workqueue;
 	rpciod_workqueue = NULL;
 	destroy_workqueue(wq);
 out_failed:
 	return 0;
 }
 static void rpciod_stop(void)
@ -1088,6 +1124,9 @@ static void rpciod_stop(void)
 	wq = rpciod_workqueue;
 	rpciod_workqueue = NULL;
 	destroy_workqueue(wq);
 	wq = xprtiod_workqueue;
 	xprtiod_workqueue = NULL;
 	destroy_workqueue(wq);
 }
 void
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@ -1188,11 +1188,17 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
 		*statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
 		/* Encode reply */
-		if (test_bit(RQ_DROPME, &rqstp->rq_flags)) {
+		if (*statp == rpc_drop_reply ||
 		    test_bit(RQ_DROPME, &rqstp->rq_flags)) {
 			if (procp->pc_release)
 				procp->pc_release(rqstp, NULL, rqstp->rq_resp);
 			goto dropit;
 		}
 		if (*statp == rpc_autherr_badcred) {
 			if (procp->pc_release)
 				procp->pc_release(rqstp, NULL, rqstp->rq_resp);
 			goto err_bad_auth;
 		}
 		if (*statp == rpc_success &&
 		    (xdr = procp->pc_encode) &&
 		    !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) {
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@ -220,7 +220,7 @@ static void xprt_clear_locked(struct rpc_xprt *xprt)
 		clear_bit(XPRT_LOCKED, &xprt->state);
 		smp_mb__after_atomic();
 	} else
-		queue_work(rpciod_workqueue, &xprt->task_cleanup);
+		queue_work(xprtiod_workqueue, &xprt->task_cleanup);
 }
 /*
@ -295,7 +295,8 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt)
 	if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
 		return;
-	if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_func, xprt))
+	if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending,
 				__xprt_lock_write_func, xprt))
 		return;
 	xprt_clear_locked(xprt);
 }
@ -324,7 +325,8 @@ static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt)
 		return;
 	if (RPCXPRT_CONGESTED(xprt))
 		goto out_unlock;
-	if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_cong_func, xprt))
+	if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending,
 				__xprt_lock_write_cong_func, xprt))
 		return;
 out_unlock:
 	xprt_clear_locked(xprt);
@ -645,7 +647,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt)
 	set_bit(XPRT_CLOSE_WAIT, &xprt->state);
 	/* Try to schedule an autoclose RPC call */
 	if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
-		queue_work(rpciod_workqueue, &xprt->task_cleanup);
+		queue_work(xprtiod_workqueue, &xprt->task_cleanup);
 	xprt_wake_pending_tasks(xprt, -EAGAIN);
 	spin_unlock_bh(&xprt->transport_lock);
 }
@ -672,7 +674,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie)
 	set_bit(XPRT_CLOSE_WAIT, &xprt->state);
 	/* Try to schedule an autoclose RPC call */
 	if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
-		queue_work(rpciod_workqueue, &xprt->task_cleanup);
+		queue_work(xprtiod_workqueue, &xprt->task_cleanup);
 	xprt_wake_pending_tasks(xprt, -EAGAIN);
 out:
 	spin_unlock_bh(&xprt->transport_lock);
@ -689,7 +691,7 @@ xprt_init_autodisconnect(unsigned long data)
 	if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
 		goto out_abort;
 	spin_unlock(&xprt->transport_lock);
-	queue_work(rpciod_workqueue, &xprt->task_cleanup);
+	queue_work(xprtiod_workqueue, &xprt->task_cleanup);
 	return;
 out_abort:
 	spin_unlock(&xprt->transport_lock);
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@ -271,14 +271,12 @@ struct rpc_xprt *xprt_iter_next_entry_multiple(struct rpc_xprt_iter *xpi,
 		xprt_switch_find_xprt_t find_next)
 {
 	struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch);
 	struct list_head *head;
 	if (xps == NULL)
 		return NULL;
-	head = &xps->xps_xprt_list;
+	return xprt_switch_set_next_cursor(&xps->xps_xprt_list,
-	if (xps->xps_nxprts < 2)
+			&xpi->xpi_cursor,
-		return xprt_switch_find_first_entry(head);
+			find_next);
 	return xprt_switch_set_next_cursor(head, &xpi->xpi_cursor, find_next);
 }
 static
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@ -1,7 +1,7 @@
 obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
 rpcrdma-y := transport.o rpc_rdma.o verbs.o \
-	fmr_ops.o frwr_ops.o physical_ops.o \
+	fmr_ops.o frwr_ops.o \
 	svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
 	svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
 	module.o
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@ -19,13 +19,6 @@
 * verb (fmr_op_unmap).
 */
 /* Transport recovery
 *
 * After a transport reconnect, fmr_op_map re-uses the MR already
 * allocated for the RPC, but generates a fresh rkey then maps the
 * MR again. This process is synchronous.
 */
 #include "xprt_rdma.h"
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@ -35,62 +28,132 @@
 /* Maximum scatter/gather per FMR */
 #define RPCRDMA_MAX_FMR_SGES	(64)
-static struct workqueue_struct *fmr_recovery_wq;
+/* Access mode of externally registered pages */
 enum {
 	RPCRDMA_FMR_ACCESS_FLAGS	= IB_ACCESS_REMOTE_WRITE |
 					  IB_ACCESS_REMOTE_READ,
 };
-#define FMR_RECOVERY_WQ_FLAGS		(WQ_UNBOUND)
+bool
-
+fmr_is_supported(struct rpcrdma_ia *ia)
 int
 fmr_alloc_recovery_wq(void)
 {
-	fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0);
+	if (!ia->ri_device->alloc_fmr) {
-	return !fmr_recovery_wq ? -ENOMEM : 0;
+		pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n",
 			ia->ri_device->name);
 		return false;
 	}
 	return true;
 }
-void
+static int
-fmr_destroy_recovery_wq(void)
+fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw)
 {
-	struct workqueue_struct *wq;
+	static struct ib_fmr_attr fmr_attr = {
 		.max_pages	= RPCRDMA_MAX_FMR_SGES,
 		.max_maps	= 1,
 		.page_shift	= PAGE_SHIFT
 	};
-	if (!fmr_recovery_wq)
+	mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
-		return;
+				       sizeof(u64), GFP_KERNEL);
 	if (!mw->fmr.fm_physaddrs)
 		goto out_free;
-	wq = fmr_recovery_wq;
+	mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
-	fmr_recovery_wq = NULL;
+			    sizeof(*mw->mw_sg), GFP_KERNEL);
-	destroy_workqueue(wq);
+	if (!mw->mw_sg)
 		goto out_free;
 	sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES);
 	mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
 				     &fmr_attr);
 	if (IS_ERR(mw->fmr.fm_mr))
 		goto out_fmr_err;
 	return 0;
 out_fmr_err:
 	dprintk("RPC:       %s: ib_alloc_fmr returned %ld\n", __func__,
 		PTR_ERR(mw->fmr.fm_mr));
 out_free:
 	kfree(mw->mw_sg);
 	kfree(mw->fmr.fm_physaddrs);
 	return -ENOMEM;
 }
 static int
 __fmr_unmap(struct rpcrdma_mw *mw)
 {
 	LIST_HEAD(l);
 	int rc;
-	list_add(&mw->fmr.fmr->list, &l);
+	list_add(&mw->fmr.fm_mr->list, &l);
-	return ib_unmap_fmr(&l);
+	rc = ib_unmap_fmr(&l);
 	list_del_init(&mw->fmr.fm_mr->list);
 	return rc;
 }
-/* Deferred reset of a single FMR. Generate a fresh rkey by
+static void
- * replacing the MR. There's no recovery if this fails.
+fmr_op_release_mr(struct rpcrdma_mw *r)
 {
 	LIST_HEAD(unmap_list);
 	int rc;
 	/* Ensure MW is not on any rl_registered list */
 	if (!list_empty(&r->mw_list))
 		list_del(&r->mw_list);
 	kfree(r->fmr.fm_physaddrs);
 	kfree(r->mw_sg);
 	/* In case this one was left mapped, try to unmap it
 	 * to prevent dealloc_fmr from failing with EBUSY
 	 */
 	rc = __fmr_unmap(r);
 	if (rc)
 		pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n",
 		       r, rc);
 	rc = ib_dealloc_fmr(r->fmr.fm_mr);
 	if (rc)
 		pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n",
 		       r, rc);
 	kfree(r);
 }
 /* Reset of a single FMR.
 */
 static void
-__fmr_recovery_worker(struct work_struct *work)
+fmr_op_recover_mr(struct rpcrdma_mw *mw)
 {
 	struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw,
 					    mw_work);
 	struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
 	int rc;
 	/* ORDER: invalidate first */
 	rc = __fmr_unmap(mw);
 	/* ORDER: then DMA unmap */
 	ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
 			mw->mw_sg, mw->mw_nents, mw->mw_dir);
 	if (rc)
 		goto out_release;
 	__fmr_unmap(mw);
 	rpcrdma_put_mw(r_xprt, mw);
 	r_xprt->rx_stats.mrs_recovered++;
 	return;
 }
-/* A broken MR was discovered in a context that can't sleep.
+out_release:
- * Defer recovery to the recovery worker.
+	pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw);
- */
+	r_xprt->rx_stats.mrs_orphaned++;
-static void
+
-__fmr_queue_recovery(struct rpcrdma_mw *mw)
+	spin_lock(&r_xprt->rx_buf.rb_mwlock);
-{
+	list_del(&mw->mw_all);
-	INIT_WORK(&mw->mw_work, __fmr_recovery_worker);
+	spin_unlock(&r_xprt->rx_buf.rb_mwlock);
-	queue_work(fmr_recovery_wq, &mw->mw_work);
+
 	fmr_op_release_mr(mw);
 }
 static int
@ -112,86 +175,21 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
 		     RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
 }
 static int
 fmr_op_init(struct rpcrdma_xprt *r_xprt)
 {
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 	int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
 	struct ib_fmr_attr fmr_attr = {
 		.max_pages	= RPCRDMA_MAX_FMR_SGES,
 		.max_maps	= 1,
 		.page_shift	= PAGE_SHIFT
 	};
 	struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
 	struct rpcrdma_mw *r;
 	int i, rc;
 	spin_lock_init(&buf->rb_mwlock);
 	INIT_LIST_HEAD(&buf->rb_mws);
 	INIT_LIST_HEAD(&buf->rb_all);
 	i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1);
 	i += 2;				/* head + tail */
 	i *= buf->rb_max_requests;	/* one set for each RPC slot */
 	dprintk("RPC:       %s: initalizing %d FMRs\n", __func__, i);
 	rc = -ENOMEM;
 	while (i--) {
 		r = kzalloc(sizeof(*r), GFP_KERNEL);
 		if (!r)
 			goto out;
 		r->fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES *
 					   sizeof(u64), GFP_KERNEL);
 		if (!r->fmr.physaddrs)
 			goto out_free;
 		r->fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
 		if (IS_ERR(r->fmr.fmr))
 			goto out_fmr_err;
 		r->mw_xprt = r_xprt;
 		list_add(&r->mw_list, &buf->rb_mws);
 		list_add(&r->mw_all, &buf->rb_all);
 	}
 	return 0;
 out_fmr_err:
 	rc = PTR_ERR(r->fmr.fmr);
 	dprintk("RPC:       %s: ib_alloc_fmr status %i\n", __func__, rc);
 	kfree(r->fmr.physaddrs);
 out_free:
 	kfree(r);
 out:
 	return rc;
 }
 /* Use the ib_map_phys_fmr() verb to register a memory region
 * for remote access via RDMA READ or RDMA WRITE.
 */
 static int
 fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
-	   int nsegs, bool writing)
+	   int nsegs, bool writing, struct rpcrdma_mw **out)
 {
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct ib_device *device = ia->ri_device;
 	enum dma_data_direction direction = rpcrdma_data_dir(writing);
 	struct rpcrdma_mr_seg *seg1 = seg;
 	int len, pageoff, i, rc;
 	struct rpcrdma_mw *mw;
 	u64 *dma_pages;
-	mw = seg1->rl_mw;
+	mw = rpcrdma_get_mw(r_xprt);
-	seg1->rl_mw = NULL;
+	if (!mw)
-	if (!mw) {
+		return -ENOBUFS;
 		mw = rpcrdma_get_mw(r_xprt);
 		if (!mw)
 			return -ENOMEM;
 	} else {
 		/* this is a retransmit; generate a fresh rkey */
 		rc = __fmr_unmap(mw);
 		if (rc)
 			return rc;
 	}
 	pageoff = offset_in_page(seg1->mr_offset);
 	seg1->mr_offset -= pageoff;	/* start of page */
@ -200,8 +198,14 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	if (nsegs > RPCRDMA_MAX_FMR_SGES)
 		nsegs = RPCRDMA_MAX_FMR_SGES;
 	for (i = 0; i < nsegs;) {
-		rpcrdma_map_one(device, seg, direction);
+		if (seg->mr_page)
-		mw->fmr.physaddrs[i] = seg->mr_dma;
+			sg_set_page(&mw->mw_sg[i],
 				    seg->mr_page,
 				    seg->mr_len,
 				    offset_in_page(seg->mr_offset));
 		else
 			sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
 				   seg->mr_len);
 		len += seg->mr_len;
 		++seg;
 		++i;
@ -210,49 +214,54 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
 			break;
 	}
 	mw->mw_nents = i;
 	mw->mw_dir = rpcrdma_data_dir(writing);
 	if (i == 0)
 		goto out_dmamap_err;
-	rc = ib_map_phys_fmr(mw->fmr.fmr, mw->fmr.physaddrs,
+	if (!ib_dma_map_sg(r_xprt->rx_ia.ri_device,
-			     i, seg1->mr_dma);
+			   mw->mw_sg, mw->mw_nents, mw->mw_dir))
 		goto out_dmamap_err;
 	for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++)
 		dma_pages[i] = sg_dma_address(&mw->mw_sg[i]);
 	rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents,
 			     dma_pages[0]);
 	if (rc)
 		goto out_maperr;
-	seg1->rl_mw = mw;
+	mw->mw_handle = mw->fmr.fm_mr->rkey;
-	seg1->mr_rkey = mw->fmr.fmr->rkey;
+	mw->mw_length = len;
-	seg1->mr_base = seg1->mr_dma + pageoff;
+	mw->mw_offset = dma_pages[0] + pageoff;
-	seg1->mr_nsegs = i;
+
-	seg1->mr_len = len;
+	*out = mw;
-	return i;
+	return mw->mw_nents;
 out_dmamap_err:
 	pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
 	       mw->mw_sg, mw->mw_nents);
 	rpcrdma_defer_mr_recovery(mw);
 	return -EIO;
 out_maperr:
-	dprintk("RPC:       %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
+	pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
-		__func__, len, (unsigned long long)seg1->mr_dma,
+	       len, (unsigned long long)dma_pages[0],
-		pageoff, i, rc);
+	       pageoff, mw->mw_nents, rc);
-	while (i--)
+	rpcrdma_defer_mr_recovery(mw);
-		rpcrdma_unmap_one(device, --seg);
+	return -EIO;
 	return rc;
 }
 static void
 __fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 {
 	struct ib_device *device = r_xprt->rx_ia.ri_device;
 	int nsegs = seg->mr_nsegs;
 	while (nsegs--)
 		rpcrdma_unmap_one(device, seg++);
 }
 /* Invalidate all memory regions that were registered for "req".
 *
 * Sleeps until it is safe for the host CPU to access the
 * previously mapped memory regions.
 *
 * Caller ensures that req->rl_registered is not empty.
 */
 static void
 fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
-	struct rpcrdma_mr_seg *seg;
+	struct rpcrdma_mw *mw, *tmp;
 	unsigned int i, nchunks;
 	struct rpcrdma_mw *mw;
 	LIST_HEAD(unmap_list);
 	int rc;
@ -261,90 +270,54 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 	/* ORDER: Invalidate all of the req's MRs first
 	 *
 	 * ib_unmap_fmr() is slow, so use a single call instead
-	 * of one call per mapped MR.
+	 * of one call per mapped FMR.
 	 */
-	for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+	list_for_each_entry(mw, &req->rl_registered, mw_list)
-		seg = &req->rl_segments[i];
+		list_add_tail(&mw->fmr.fm_mr->list, &unmap_list);
 		mw = seg->rl_mw;
 		list_add(&mw->fmr.fmr->list, &unmap_list);
 		i += seg->mr_nsegs;
 	}
 	rc = ib_unmap_fmr(&unmap_list);
 	if (rc)
-		pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc);
+		goto out_reset;
 	/* ORDER: Now DMA unmap all of the req's MRs, and return
 	 * them to the free MW list.
 	 */
-	for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+	list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
-		seg = &req->rl_segments[i];
+		list_del_init(&mw->mw_list);
-
+		list_del_init(&mw->fmr.fm_mr->list);
-		__fmr_dma_unmap(r_xprt, seg);
+		ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
-		rpcrdma_put_mw(r_xprt, seg->rl_mw);
+				mw->mw_sg, mw->mw_nents, mw->mw_dir);
-
+		rpcrdma_put_mw(r_xprt, mw);
 		i += seg->mr_nsegs;
 		seg->mr_nsegs = 0;
 		seg->rl_mw = NULL;
 	}
-	req->rl_nchunks = 0;
+	return;
 out_reset:
 	pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc);
 	list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
 		list_del_init(&mw->fmr.fm_mr->list);
 		fmr_op_recover_mr(mw);
 	}
 }
 /* Use a slow, safe mechanism to invalidate all memory regions
 * that were registered for "req".
 *
 * In the asynchronous case, DMA unmapping occurs first here
 * because the rpcrdma_mr_seg is released immediately after this
 * call. It's contents won't be available in __fmr_dma_unmap later.
 * FIXME.
 */
 static void
 fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 		  bool sync)
 {
 	struct rpcrdma_mr_seg *seg;
 	struct rpcrdma_mw *mw;
 	unsigned int i;
-	for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
+	while (!list_empty(&req->rl_registered)) {
-		seg = &req->rl_segments[i];
+		mw = list_first_entry(&req->rl_registered,
-		mw = seg->rl_mw;
+				      struct rpcrdma_mw, mw_list);
 		list_del_init(&mw->mw_list);
-		if (sync) {
+		if (sync)
-			/* ORDER */
+			fmr_op_recover_mr(mw);
-			__fmr_unmap(mw);
+		else
-			__fmr_dma_unmap(r_xprt, seg);
+			rpcrdma_defer_mr_recovery(mw);
 			rpcrdma_put_mw(r_xprt, mw);
 		} else {
 			__fmr_dma_unmap(r_xprt, seg);
 			__fmr_queue_recovery(mw);
 		}
 		i += seg->mr_nsegs;
 		seg->mr_nsegs = 0;
 		seg->rl_mw = NULL;
 	}
 }
 static void
 fmr_op_destroy(struct rpcrdma_buffer *buf)
 {
 	struct rpcrdma_mw *r;
 	int rc;
 	while (!list_empty(&buf->rb_all)) {
 		r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
 		list_del(&r->mw_all);
 		kfree(r->fmr.physaddrs);
 		rc = ib_dealloc_fmr(r->fmr.fmr);
 		if (rc)
 			dprintk("RPC:       %s: ib_dealloc_fmr failed %i\n",
 				__func__, rc);
 		kfree(r);
 	}
 }
@ -352,9 +325,10 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
 	.ro_map				= fmr_op_map,
 	.ro_unmap_sync			= fmr_op_unmap_sync,
 	.ro_unmap_safe			= fmr_op_unmap_safe,
 	.ro_recover_mr			= fmr_op_recover_mr,
 	.ro_open			= fmr_op_open,
 	.ro_maxpages			= fmr_op_maxpages,
-	.ro_init			= fmr_op_init,
+	.ro_init_mr			= fmr_op_init_mr,
-	.ro_destroy			= fmr_op_destroy,
+	.ro_release_mr			= fmr_op_release_mr,
 	.ro_displayname			= "fmr",
 };
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@ -73,29 +73,71 @@
 # define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
-static struct workqueue_struct *frwr_recovery_wq;
+bool
-
+frwr_is_supported(struct rpcrdma_ia *ia)
 #define FRWR_RECOVERY_WQ_FLAGS		(WQ_UNBOUND | WQ_MEM_RECLAIM)
 int
 frwr_alloc_recovery_wq(void)
 {
-	frwr_recovery_wq = alloc_workqueue("frwr_recovery",
+	struct ib_device_attr *attrs = &ia->ri_device->attrs;
-					   FRWR_RECOVERY_WQ_FLAGS, 0);
+
-	return !frwr_recovery_wq ? -ENOMEM : 0;
+	if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
 		goto out_not_supported;
 	if (attrs->max_fast_reg_page_list_len == 0)
 		goto out_not_supported;
 	return true;
 out_not_supported:
 	pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n",
 		ia->ri_device->name);
 	return false;
 }
-void
+static int
-frwr_destroy_recovery_wq(void)
+frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
 {
-	struct workqueue_struct *wq;
+	unsigned int depth = ia->ri_max_frmr_depth;
 	struct rpcrdma_frmr *f = &r->frmr;
 	int rc;
-	if (!frwr_recovery_wq)
+	f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, depth);
-		return;
+	if (IS_ERR(f->fr_mr))
 		goto out_mr_err;
-	wq = frwr_recovery_wq;
+	r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL);
-	frwr_recovery_wq = NULL;
+	if (!r->mw_sg)
-	destroy_workqueue(wq);
+		goto out_list_err;
 	sg_init_table(r->mw_sg, depth);
 	init_completion(&f->fr_linv_done);
 	return 0;
 out_mr_err:
 	rc = PTR_ERR(f->fr_mr);
 	dprintk("RPC:       %s: ib_alloc_mr status %i\n",
 		__func__, rc);
 	return rc;
 out_list_err:
 	rc = -ENOMEM;
 	dprintk("RPC:       %s: sg allocation failure\n",
 		__func__);
 	ib_dereg_mr(f->fr_mr);
 	return rc;
 }
 static void
 frwr_op_release_mr(struct rpcrdma_mw *r)
 {
 	int rc;
 	/* Ensure MW is not on any rl_registered list */
 	if (!list_empty(&r->mw_list))
 		list_del(&r->mw_list);
 	rc = ib_dereg_mr(r->frmr.fr_mr);
 	if (rc)
 		pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n",
 		       r, rc);
 	kfree(r->mw_sg);
 	kfree(r);
 }
 static int
@ -124,93 +166,37 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
 	return 0;
 }
-static void
+/* Reset of a single FRMR. Generate a fresh rkey by replacing the MR.
 __frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
 {
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct rpcrdma_frmr *f = &mw->frmr;
 	int rc;
 	rc = __frwr_reset_mr(ia, mw);
 	ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir);
 	if (rc)
 		return;
 	rpcrdma_put_mw(r_xprt, mw);
 }
 /* Deferred reset of a single FRMR. Generate a fresh rkey by
 * replacing the MR.
 *
 * There's no recovery if this fails. The FRMR is abandoned, but
 * remains in rb_all. It will be cleaned up when the transport is
 * destroyed.
 */
 static void
-__frwr_recovery_worker(struct work_struct *work)
+frwr_op_recover_mr(struct rpcrdma_mw *mw)
 {
-	struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
+	struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
-					    mw_work);
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	__frwr_reset_and_unmap(r->mw_xprt, r);
 	return;
 }
 /* A broken MR was discovered in a context that can't sleep.
 * Defer recovery to the recovery worker.
 */
 static void
 __frwr_queue_recovery(struct rpcrdma_mw *r)
 {
 	INIT_WORK(&r->mw_work, __frwr_recovery_worker);
 	queue_work(frwr_recovery_wq, &r->mw_work);
 }
 static int
 __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
 	    unsigned int depth)
 {
 	struct rpcrdma_frmr *f = &r->frmr;
 	int rc;
-	f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
+	rc = __frwr_reset_mr(ia, mw);
-	if (IS_ERR(f->fr_mr))
+	ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir);
 		goto out_mr_err;
 	f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL);
 	if (!f->fr_sg)
 		goto out_list_err;
 	sg_init_table(f->fr_sg, depth);
 	init_completion(&f->fr_linv_done);
 	return 0;
 out_mr_err:
 	rc = PTR_ERR(f->fr_mr);
 	dprintk("RPC:       %s: ib_alloc_mr status %i\n",
 		__func__, rc);
 	return rc;
 out_list_err:
 	rc = -ENOMEM;
 	dprintk("RPC:       %s: sg allocation failure\n",
 		__func__);
 	ib_dereg_mr(f->fr_mr);
 	return rc;
 }
 static void
 __frwr_release(struct rpcrdma_mw *r)
 {
 	int rc;
 	rc = ib_dereg_mr(r->frmr.fr_mr);
 	if (rc)
-		dprintk("RPC:       %s: ib_dereg_mr status %i\n",
+		goto out_release;
-			__func__, rc);
+
-	kfree(r->frmr.fr_sg);
+	rpcrdma_put_mw(r_xprt, mw);
 	r_xprt->rx_stats.mrs_recovered++;
 	return;
 out_release:
 	pr_err("rpcrdma: FRMR reset failed %d, %p release\n", rc, mw);
 	r_xprt->rx_stats.mrs_orphaned++;
 	spin_lock(&r_xprt->rx_buf.rb_mwlock);
 	list_del(&mw->mw_all);
 	spin_unlock(&r_xprt->rx_buf.rb_mwlock);
 	frwr_op_release_mr(mw);
 }
 static int
@ -346,57 +332,14 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
 	complete_all(&frmr->fr_linv_done);
 }
-static int
+/* Post a REG_MR Work Request to register a memory region
 frwr_op_init(struct rpcrdma_xprt *r_xprt)
 {
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 	struct ib_device *device = r_xprt->rx_ia.ri_device;
 	unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
 	struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
 	int i;
 	spin_lock_init(&buf->rb_mwlock);
 	INIT_LIST_HEAD(&buf->rb_mws);
 	INIT_LIST_HEAD(&buf->rb_all);
 	i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1);
 	i += 2;				/* head + tail */
 	i *= buf->rb_max_requests;	/* one set for each RPC slot */
 	dprintk("RPC:       %s: initalizing %d FRMRs\n", __func__, i);
 	while (i--) {
 		struct rpcrdma_mw *r;
 		int rc;
 		r = kzalloc(sizeof(*r), GFP_KERNEL);
 		if (!r)
 			return -ENOMEM;
 		rc = __frwr_init(r, pd, device, depth);
 		if (rc) {
 			kfree(r);
 			return rc;
 		}
 		r->mw_xprt = r_xprt;
 		list_add(&r->mw_list, &buf->rb_mws);
 		list_add(&r->mw_all, &buf->rb_all);
 	}
 	return 0;
 }
 /* Post a FAST_REG Work Request to register a memory region
 * for remote access via RDMA READ or RDMA WRITE.
 */
 static int
 frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
-	    int nsegs, bool writing)
+	    int nsegs, bool writing, struct rpcrdma_mw **out)
 {
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct ib_device *device = ia->ri_device;
 	enum dma_data_direction direction = rpcrdma_data_dir(writing);
 	struct rpcrdma_mr_seg *seg1 = seg;
 	struct rpcrdma_mw *mw;
 	struct rpcrdma_frmr *frmr;
 	struct ib_mr *mr;
@ -405,14 +348,13 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	int rc, i, n, dma_nents;
 	u8 key;
-	mw = seg1->rl_mw;
+	mw = NULL;
 	seg1->rl_mw = NULL;
 	do {
 		if (mw)
-			__frwr_queue_recovery(mw);
+			rpcrdma_defer_mr_recovery(mw);
 		mw = rpcrdma_get_mw(r_xprt);
 		if (!mw)
-			return -ENOMEM;
+			return -ENOBUFS;
 	} while (mw->frmr.fr_state != FRMR_IS_INVALID);
 	frmr = &mw->frmr;
 	frmr->fr_state = FRMR_IS_VALID;
@ -421,15 +363,14 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	if (nsegs > ia->ri_max_frmr_depth)
 		nsegs = ia->ri_max_frmr_depth;
 	for (i = 0; i < nsegs;) {
 		if (seg->mr_page)
-			sg_set_page(&frmr->fr_sg[i],
+			sg_set_page(&mw->mw_sg[i],
 				    seg->mr_page,
 				    seg->mr_len,
 				    offset_in_page(seg->mr_offset));
 		else
-			sg_set_buf(&frmr->fr_sg[i], seg->mr_offset,
+			sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
 				   seg->mr_len);
 		++seg;
@ -440,26 +381,22 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
 			break;
 	}
-	frmr->fr_nents = i;
+	mw->mw_nents = i;
-	frmr->fr_dir = direction;
+	mw->mw_dir = rpcrdma_data_dir(writing);
 	if (i == 0)
 		goto out_dmamap_err;
-	dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction);
+	dma_nents = ib_dma_map_sg(ia->ri_device,
-	if (!dma_nents) {
+				  mw->mw_sg, mw->mw_nents, mw->mw_dir);
-		pr_err("RPC:       %s: failed to dma map sg %p sg_nents %u\n",
+	if (!dma_nents)
-		       __func__, frmr->fr_sg, frmr->fr_nents);
+		goto out_dmamap_err;
 		return -ENOMEM;
 	}
-	n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, NULL, PAGE_SIZE);
+	n = ib_map_mr_sg(mr, mw->mw_sg, mw->mw_nents, NULL, PAGE_SIZE);
-	if (unlikely(n != frmr->fr_nents)) {
+	if (unlikely(n != mw->mw_nents))
-		pr_err("RPC:       %s: failed to map mr %p (%u/%u)\n",
+		goto out_mapmr_err;
 		       __func__, frmr->fr_mr, n, frmr->fr_nents);
 		rc = n < 0 ? n : -EINVAL;
 		goto out_senderr;
 	}
 	dprintk("RPC:       %s: Using frmr %p to map %u segments (%u bytes)\n",
-		__func__, mw, frmr->fr_nents, mr->length);
+		__func__, mw, mw->mw_nents, mr->length);
 	key = (u8)(mr->rkey & 0x000000FF);
 	ib_update_fast_reg_key(mr, ++key);
@ -481,24 +418,34 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	if (rc)
 		goto out_senderr;
-	seg1->rl_mw = mw;
+	mw->mw_handle = mr->rkey;
-	seg1->mr_rkey = mr->rkey;
+	mw->mw_length = mr->length;
-	seg1->mr_base = mr->iova;
+	mw->mw_offset = mr->iova;
 	seg1->mr_nsegs = frmr->fr_nents;
 	seg1->mr_len = mr->length;
-	return frmr->fr_nents;
+	*out = mw;
 	return mw->mw_nents;
 out_dmamap_err:
 	pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
 	       mw->mw_sg, mw->mw_nents);
 	rpcrdma_defer_mr_recovery(mw);
 	return -EIO;
 out_mapmr_err:
 	pr_err("rpcrdma: failed to map mr %p (%u/%u)\n",
 	       frmr->fr_mr, n, mw->mw_nents);
 	rpcrdma_defer_mr_recovery(mw);
 	return -EIO;
 out_senderr:
-	dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
+	pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc);
-	__frwr_queue_recovery(mw);
+	rpcrdma_defer_mr_recovery(mw);
-	return rc;
+	return -ENOTCONN;
 }
 static struct ib_send_wr *
-__frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
+__frwr_prepare_linv_wr(struct rpcrdma_mw *mw)
 {
 	struct rpcrdma_mw *mw = seg->rl_mw;
 	struct rpcrdma_frmr *f = &mw->frmr;
 	struct ib_send_wr *invalidate_wr;
@ -518,16 +465,16 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
 *
 * Sleeps until it is safe for the host CPU to access the
 * previously mapped memory regions.
 *
 * Caller ensures that req->rl_registered is not empty.
 */
 static void
 frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
 	struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr;
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-	struct rpcrdma_mr_seg *seg;
+	struct rpcrdma_mw *mw, *tmp;
 	unsigned int i, nchunks;
 	struct rpcrdma_frmr *f;
 	struct rpcrdma_mw *mw;
 	int rc;
 	dprintk("RPC:       %s: req %p\n", __func__, req);
@ -537,22 +484,18 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 	 * Chain the LOCAL_INV Work Requests and post them with
 	 * a single ib_post_send() call.
 	 */
 	f = NULL;
 	invalidate_wrs = pos = prev = NULL;
-	seg = NULL;
+	list_for_each_entry(mw, &req->rl_registered, mw_list) {
-	for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+		pos = __frwr_prepare_linv_wr(mw);
 		seg = &req->rl_segments[i];
 		pos = __frwr_prepare_linv_wr(seg);
 		if (!invalidate_wrs)
 			invalidate_wrs = pos;
 		else
 			prev->next = pos;
 		prev = pos;
-
+		f = &mw->frmr;
 		i += seg->mr_nsegs;
 	}
 	f = &seg->rl_mw->frmr;
 	/* Strong send queue ordering guarantees that when the
 	 * last WR in the chain completes, all WRs in the chain
@ -577,39 +520,27 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 	 * them to the free MW list.
 	 */
 unmap:
-	for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+	list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
-		seg = &req->rl_segments[i];
+		list_del_init(&mw->mw_list);
-		mw = seg->rl_mw;
+		ib_dma_unmap_sg(ia->ri_device,
-		seg->rl_mw = NULL;
+				mw->mw_sg, mw->mw_nents, mw->mw_dir);
 		ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents,
 				f->fr_dir);
 		rpcrdma_put_mw(r_xprt, mw);
 		i += seg->mr_nsegs;
 		seg->mr_nsegs = 0;
 	}
 	req->rl_nchunks = 0;
 	return;
 reset_mrs:
-	pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
+	pr_err("rpcrdma: FRMR invalidate ib_post_send returned %i\n", rc);
 	rdma_disconnect(ia->ri_id);
 	/* Find and reset the MRs in the LOCAL_INV WRs that did not
 	 * get posted. This is synchronous, and slow.
 	 */
-	for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+	list_for_each_entry(mw, &req->rl_registered, mw_list) {
 		seg = &req->rl_segments[i];
 		mw = seg->rl_mw;
 		f = &mw->frmr;
 		if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) {
 			__frwr_reset_mr(ia, mw);
 			bad_wr = bad_wr->next;
 		}
 		i += seg->mr_nsegs;
 	}
 	goto unmap;
 }
@ -621,38 +552,17 @@ static void
 frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 		   bool sync)
 {
 	struct rpcrdma_mr_seg *seg;
 	struct rpcrdma_mw *mw;
 	unsigned int i;
-	for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
+	while (!list_empty(&req->rl_registered)) {
-		seg = &req->rl_segments[i];
+		mw = list_first_entry(&req->rl_registered,
-		mw = seg->rl_mw;
+				      struct rpcrdma_mw, mw_list);
 		list_del_init(&mw->mw_list);
 		if (sync)
-			__frwr_reset_and_unmap(r_xprt, mw);
+			frwr_op_recover_mr(mw);
 		else
-			__frwr_queue_recovery(mw);
+			rpcrdma_defer_mr_recovery(mw);
 		i += seg->mr_nsegs;
 		seg->mr_nsegs = 0;
 		seg->rl_mw = NULL;
 	}
 }
 static void
 frwr_op_destroy(struct rpcrdma_buffer *buf)
 {
 	struct rpcrdma_mw *r;
 	/* Ensure stale MWs for "buf" are no longer in flight */
 	flush_workqueue(frwr_recovery_wq);
 	while (!list_empty(&buf->rb_all)) {
 		r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
 		list_del(&r->mw_all);
 		__frwr_release(r);
 		kfree(r);
 	}
 }
@ -660,9 +570,10 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
 	.ro_map				= frwr_op_map,
 	.ro_unmap_sync			= frwr_op_unmap_sync,
 	.ro_unmap_safe			= frwr_op_unmap_safe,
 	.ro_recover_mr			= frwr_op_recover_mr,
 	.ro_open			= frwr_op_open,
 	.ro_maxpages			= frwr_op_maxpages,
-	.ro_init			= frwr_op_init,
+	.ro_init_mr			= frwr_op_init_mr,
-	.ro_destroy			= frwr_op_destroy,
+	.ro_release_mr			= frwr_op_release_mr,
 	.ro_displayname			= "frwr",
 };
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@ -1,122 +0,0 @@
 /*
 * Copyright (c) 2015 Oracle.  All rights reserved.
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 */
 /* No-op chunk preparation. All client memory is pre-registered.
 * Sometimes referred to as ALLPHYSICAL mode.
 *
 * Physical registration is simple because all client memory is
 * pre-registered and never deregistered. This mode is good for
 * adapter bring up, but is considered not safe: the server is
 * trusted not to abuse its access to client memory not involved
 * in RDMA I/O.
 */
 #include "xprt_rdma.h"
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 static int
 physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
 		 struct rpcrdma_create_data_internal *cdata)
 {
 	struct ib_mr *mr;
 	/* Obtain an rkey to use for RPC data payloads.
 	 */
 	mr = ib_get_dma_mr(ia->ri_pd,
 			   IB_ACCESS_LOCAL_WRITE |
 			   IB_ACCESS_REMOTE_WRITE |
 			   IB_ACCESS_REMOTE_READ);
 	if (IS_ERR(mr)) {
 		pr_err("%s: ib_get_dma_mr for failed with %lX\n",
 		       __func__, PTR_ERR(mr));
 		return -ENOMEM;
 	}
 	ia->ri_dma_mr = mr;
 	rpcrdma_set_max_header_sizes(ia, cdata, min_t(unsigned int,
 						      RPCRDMA_MAX_DATA_SEGS,
 						      RPCRDMA_MAX_HDR_SEGS));
 	return 0;
 }
 /* PHYSICAL memory registration conveys one page per chunk segment.
 */
 static size_t
 physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
 {
 	return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
 		     RPCRDMA_MAX_HDR_SEGS);
 }
 static int
 physical_op_init(struct rpcrdma_xprt *r_xprt)
 {
 	return 0;
 }
 /* The client's physical memory is already exposed for
 * remote access via RDMA READ or RDMA WRITE.
 */
 static int
 physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 		int nsegs, bool writing)
 {
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing));
 	seg->mr_rkey = ia->ri_dma_mr->rkey;
 	seg->mr_base = seg->mr_dma;
 	return 1;
 }
 /* DMA unmap all memory regions that were mapped for "req".
 */
 static void
 physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
 	struct ib_device *device = r_xprt->rx_ia.ri_device;
 	unsigned int i;
 	for (i = 0; req->rl_nchunks; --req->rl_nchunks)
 		rpcrdma_unmap_one(device, &req->rl_segments[i++]);
 }
 /* Use a slow, safe mechanism to invalidate all memory regions
 * that were registered for "req".
 *
 * For physical memory registration, there is no good way to
 * fence a single MR that has been advertised to the server. The
 * client has already handed the server an R_key that cannot be
 * invalidated and is shared by all MRs on this connection.
 * Tearing down the PD might be the only safe choice, but it's
 * not clear that a freshly acquired DMA R_key would be different
 * than the one used by the PD that was just destroyed.
 * FIXME.
 */
 static void
 physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 		       bool sync)
 {
 	physical_op_unmap_sync(r_xprt, req);
 }
 static void
 physical_op_destroy(struct rpcrdma_buffer *buf)
 {
 }
 const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
 	.ro_map				= physical_op_map,
 	.ro_unmap_sync			= physical_op_unmap_sync,
 	.ro_unmap_safe			= physical_op_unmap_safe,
 	.ro_open			= physical_op_open,
 	.ro_maxpages			= physical_op_maxpages,
 	.ro_init			= physical_op_init,
 	.ro_destroy			= physical_op_destroy,
 	.ro_displayname			= "physical",
 };
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@ -196,8 +196,7 @@ rpcrdma_tail_pullup(struct xdr_buf *buf)
 * MR when they can.
 */
 static int
-rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
+rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n)
 		     int n, int nsegs)
 {
 	size_t page_offset;
 	u32 remaining;
@ -206,7 +205,7 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
 	base = vec->iov_base;
 	page_offset = offset_in_page(base);
 	remaining = vec->iov_len;
-	while (remaining && n < nsegs) {
+	while (remaining && n < RPCRDMA_MAX_SEGS) {
 		seg[n].mr_page = NULL;
 		seg[n].mr_offset = base;
 		seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
@ -230,34 +229,34 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
 static int
 rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
-	enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs)
+	enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg)
 {
-	int len, n = 0, p;
+	int len, n, p, page_base;
 	int page_base;
 	struct page **ppages;
 	n = 0;
 	if (pos == 0) {
-		n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n, nsegs);
+		n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n);
-		if (n == nsegs)
+		if (n == RPCRDMA_MAX_SEGS)
-			return -EIO;
+			goto out_overflow;
 	}
 	len = xdrbuf->page_len;
 	ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
 	page_base = xdrbuf->page_base & ~PAGE_MASK;
 	p = 0;
-	while (len && n < nsegs) {
+	while (len && n < RPCRDMA_MAX_SEGS) {
 		if (!ppages[p]) {
 			/* alloc the pagelist for receiving buffer */
 			ppages[p] = alloc_page(GFP_ATOMIC);
 			if (!ppages[p])
-				return -ENOMEM;
+				return -EAGAIN;
 		}
 		seg[n].mr_page = ppages[p];
 		seg[n].mr_offset = (void *)(unsigned long) page_base;
 		seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
 		if (seg[n].mr_len > PAGE_SIZE)
-			return -EIO;
+			goto out_overflow;
 		len -= seg[n].mr_len;
 		++n;
 		++p;
@ -265,8 +264,8 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
 	}
 	/* Message overflows the seg array */
-	if (len && n == nsegs)
+	if (len && n == RPCRDMA_MAX_SEGS)
-		return -EIO;
+		goto out_overflow;
 	/* When encoding the read list, the tail is always sent inline */
 	if (type == rpcrdma_readch)
@ -277,20 +276,24 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
 		 * xdr pad bytes, saving the server an RDMA operation. */
 		if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
 			return n;
-		n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n, nsegs);
+		n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n);
-		if (n == nsegs)
+		if (n == RPCRDMA_MAX_SEGS)
-			return -EIO;
+			goto out_overflow;
 	}
 	return n;
 out_overflow:
 	pr_err("rpcrdma: segment array overflow\n");
 	return -EIO;
 }
 static inline __be32 *
-xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg)
+xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw)
 {
-	*iptr++ = cpu_to_be32(seg->mr_rkey);
+	*iptr++ = cpu_to_be32(mw->mw_handle);
-	*iptr++ = cpu_to_be32(seg->mr_len);
+	*iptr++ = cpu_to_be32(mw->mw_length);
-	return xdr_encode_hyper(iptr, seg->mr_base);
+	return xdr_encode_hyper(iptr, mw->mw_offset);
 }
 /* XDR-encode the Read list. Supports encoding a list of read
@ -310,7 +313,8 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
 			 struct rpcrdma_req *req, struct rpc_rqst *rqst,
 			 __be32 *iptr, enum rpcrdma_chunktype rtype)
 {
-	struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+	struct rpcrdma_mr_seg *seg;
 	struct rpcrdma_mw *mw;
 	unsigned int pos;
 	int n, nsegs;
@ -322,15 +326,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
 	pos = rqst->rq_snd_buf.head[0].iov_len;
 	if (rtype == rpcrdma_areadch)
 		pos = 0;
-	nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg,
+	seg = req->rl_segments;
-				     RPCRDMA_MAX_SEGS - req->rl_nchunks);
+	nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg);
 	if (nsegs < 0)
 		return ERR_PTR(nsegs);
 	do {
-		n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false);
+		n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
-		if (n <= 0)
+						 false, &mw);
 		if (n < 0)
 			return ERR_PTR(n);
 		list_add(&mw->mw_list, &req->rl_registered);
 		*iptr++ = xdr_one;	/* item present */
@ -338,20 +344,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
 		 * have the same "position".
 		 */
 		*iptr++ = cpu_to_be32(pos);
-		iptr = xdr_encode_rdma_segment(iptr, seg);
+		iptr = xdr_encode_rdma_segment(iptr, mw);
-		dprintk("RPC: %5u %s: read segment pos %u "
+		dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n",
 			"%d@0x%016llx:0x%08x (%s)\n",
 			rqst->rq_task->tk_pid, __func__, pos,
-			seg->mr_len, (unsigned long long)seg->mr_base,
+			mw->mw_length, (unsigned long long)mw->mw_offset,
-			seg->mr_rkey, n < nsegs ? "more" : "last");
+			mw->mw_handle, n < nsegs ? "more" : "last");
 		r_xprt->rx_stats.read_chunk_count++;
 		req->rl_nchunks++;
 		seg += n;
 		nsegs -= n;
 	} while (nsegs);
 	req->rl_nextseg = seg;
 	/* Finish Read list */
 	*iptr++ = xdr_zero;	/* Next item not present */
@ -375,7 +378,8 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 			  struct rpc_rqst *rqst, __be32 *iptr,
 			  enum rpcrdma_chunktype wtype)
 {
-	struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+	struct rpcrdma_mr_seg *seg;
 	struct rpcrdma_mw *mw;
 	int n, nsegs, nchunks;
 	__be32 *segcount;
@ -384,10 +388,10 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 		return iptr;
 	}
 	seg = req->rl_segments;
 	nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
 				     rqst->rq_rcv_buf.head[0].iov_len,
-				     wtype, seg,
+				     wtype, seg);
 				     RPCRDMA_MAX_SEGS - req->rl_nchunks);
 	if (nsegs < 0)
 		return ERR_PTR(nsegs);
@ -396,26 +400,25 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 	nchunks = 0;
 	do {
-		n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
+		n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
-		if (n <= 0)
+						 true, &mw);
 		if (n < 0)
 			return ERR_PTR(n);
 		list_add(&mw->mw_list, &req->rl_registered);
-		iptr = xdr_encode_rdma_segment(iptr, seg);
+		iptr = xdr_encode_rdma_segment(iptr, mw);
-		dprintk("RPC: %5u %s: write segment "
+		dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n",
 			"%d@0x016%llx:0x%08x (%s)\n",
 			rqst->rq_task->tk_pid, __func__,
-			seg->mr_len, (unsigned long long)seg->mr_base,
+			mw->mw_length, (unsigned long long)mw->mw_offset,
-			seg->mr_rkey, n < nsegs ? "more" : "last");
+			mw->mw_handle, n < nsegs ? "more" : "last");
 		r_xprt->rx_stats.write_chunk_count++;
 		r_xprt->rx_stats.total_rdma_request += seg->mr_len;
 		req->rl_nchunks++;
 		nchunks++;
 		seg   += n;
 		nsegs -= n;
 	} while (nsegs);
 	req->rl_nextseg = seg;
 	/* Update count of segments in this Write chunk */
 	*segcount = cpu_to_be32(nchunks);
@ -442,7 +445,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
 			   struct rpcrdma_req *req, struct rpc_rqst *rqst,
 			   __be32 *iptr, enum rpcrdma_chunktype wtype)
 {
-	struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+	struct rpcrdma_mr_seg *seg;
 	struct rpcrdma_mw *mw;
 	int n, nsegs, nchunks;
 	__be32 *segcount;
@ -451,8 +455,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
 		return iptr;
 	}
-	nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
+	seg = req->rl_segments;
-				     RPCRDMA_MAX_SEGS - req->rl_nchunks);
+	nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg);
 	if (nsegs < 0)
 		return ERR_PTR(nsegs);
@ -461,26 +465,25 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
 	nchunks = 0;
 	do {
-		n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
+		n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
-		if (n <= 0)
+						 true, &mw);
 		if (n < 0)
 			return ERR_PTR(n);
 		list_add(&mw->mw_list, &req->rl_registered);
-		iptr = xdr_encode_rdma_segment(iptr, seg);
+		iptr = xdr_encode_rdma_segment(iptr, mw);
-		dprintk("RPC: %5u %s: reply segment "
+		dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n",
 			"%d@0x%016llx:0x%08x (%s)\n",
 			rqst->rq_task->tk_pid, __func__,
-			seg->mr_len, (unsigned long long)seg->mr_base,
+			mw->mw_length, (unsigned long long)mw->mw_offset,
-			seg->mr_rkey, n < nsegs ? "more" : "last");
+			mw->mw_handle, n < nsegs ? "more" : "last");
 		r_xprt->rx_stats.reply_chunk_count++;
 		r_xprt->rx_stats.total_rdma_request += seg->mr_len;
 		req->rl_nchunks++;
 		nchunks++;
 		seg   += n;
 		nsegs -= n;
 	} while (nsegs);
 	req->rl_nextseg = seg;
 	/* Update count of segments in the Reply chunk */
 	*segcount = cpu_to_be32(nchunks);
@ -567,6 +570,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
 	enum rpcrdma_chunktype rtype, wtype;
 	struct rpcrdma_msg *headerp;
 	bool ddp_allowed;
 	ssize_t hdrlen;
 	size_t rpclen;
 	__be32 *iptr;
@ -583,6 +587,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
 	headerp->rm_type = rdma_msg;
 	/* When the ULP employs a GSS flavor that guarantees integrity
 	 * or privacy, direct data placement of individual data items
 	 * is not allowed.
 	 */
 	ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags &
 						RPCAUTH_AUTH_DATATOUCH);
 	/*
 	 * Chunks needed for results?
 	 *
@ -594,7 +605,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	 */
 	if (rpcrdma_results_inline(r_xprt, rqst))
 		wtype = rpcrdma_noch;
-	else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+	else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ)
 		wtype = rpcrdma_writech;
 	else
 		wtype = rpcrdma_replych;
@ -617,7 +628,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 		rtype = rpcrdma_noch;
 		rpcrdma_inline_pullup(rqst);
 		rpclen = rqst->rq_svec[0].iov_len;
-	} else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
+	} else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
 		rtype = rpcrdma_readch;
 		rpclen = rqst->rq_svec[0].iov_len;
 		rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
@ -650,8 +661,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	 * send a Call message with a Position Zero Read chunk and a
 	 * regular Read chunk at the same time.
 	 */
 	req->rl_nchunks = 0;
 	req->rl_nextseg = req->rl_segments;
 	iptr = headerp->rm_body.rm_chunks;
 	iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
 	if (IS_ERR(iptr))
@ -690,10 +699,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 out_overflow:
 	pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
 		hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
-	/* Terminate this RPC. Chunks registered above will be
+	iptr = ERR_PTR(-EIO);
 	 * released by xprt_release -> xprt_rmda_free .
 	 */
 	return -EIO;
 out_unmap:
 	r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
@ -705,15 +711,13 @@ out_unmap:
 * RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
 */
 static int
-rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp)
+rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp)
 {
 	unsigned int i, total_len;
 	struct rpcrdma_write_chunk *cur_wchunk;
 	char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf);
 	i = be32_to_cpu(**iptrp);
 	if (i > max)
 		return -1;
 	cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
 	total_len = 0;
 	while (i--) {
@ -744,45 +748,66 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b
 	return total_len;
 }
-/*
+/**
- * Scatter inline received data back into provided iov's.
+ * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
 * @rqst: controlling RPC request
 * @srcp: points to RPC message payload in receive buffer
 * @copy_len: remaining length of receive buffer content
 * @pad: Write chunk pad bytes needed (zero for pure inline)
 *
 * The upper layer has set the maximum number of bytes it can
 * receive in each component of rq_rcv_buf. These values are set in
 * the head.iov_len, page_len, tail.iov_len, and buflen fields.
 *
 * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in
 * many cases this function simply updates iov_base pointers in
 * rq_rcv_buf to point directly to the received reply data, to
 * avoid copying reply data.
 *
 * Returns the count of bytes which had to be memcopied.
 */
-static void
+static unsigned long
 rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
 {
-	int i, npages, curlen, olen;
+	unsigned long fixup_copy_count;
 	int i, npages, curlen;
 	char *destp;
 	struct page **ppages;
 	int page_base;
-	curlen = rqst->rq_rcv_buf.head[0].iov_len;
+	/* The head iovec is redirected to the RPC reply message
-	if (curlen > copy_len) {	/* write chunk header fixup */
+	 * in the receive buffer, to avoid a memcopy.
-		curlen = copy_len;
+	 */
-		rqst->rq_rcv_buf.head[0].iov_len = curlen;
+	rqst->rq_rcv_buf.head[0].iov_base = srcp;
-	}
+	rqst->rq_private_buf.head[0].iov_base = srcp;
 	/* The contents of the receive buffer that follow
 	 * head.iov_len bytes are copied into the page list.
 	 */
 	curlen = rqst->rq_rcv_buf.head[0].iov_len;
 	if (curlen > copy_len)
 		curlen = copy_len;
 	dprintk("RPC:       %s: srcp 0x%p len %d hdrlen %d\n",
 		__func__, srcp, copy_len, curlen);
 	/* Shift pointer for first receive segment only */
 	rqst->rq_rcv_buf.head[0].iov_base = srcp;
 	srcp += curlen;
 	copy_len -= curlen;
 	olen = copy_len;
 	i = 0;
 	rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
 	page_base = rqst->rq_rcv_buf.page_base;
 	ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT);
 	page_base &= ~PAGE_MASK;
-
+	fixup_copy_count = 0;
 	if (copy_len && rqst->rq_rcv_buf.page_len) {
-		npages = PAGE_ALIGN(page_base +
+		int pagelist_len;
-			rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT;
+
-		for (; i < npages; i++) {
+		pagelist_len = rqst->rq_rcv_buf.page_len;
 		if (pagelist_len > copy_len)
 			pagelist_len = copy_len;
 		npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT;
 		for (i = 0; i < npages; i++) {
 			curlen = PAGE_SIZE - page_base;
-			if (curlen > copy_len)
+			if (curlen > pagelist_len)
-				curlen = copy_len;
+				curlen = pagelist_len;
 			dprintk("RPC:       %s: page %d"
 				" srcp 0x%p len %d curlen %d\n",
 				__func__, i, srcp, copy_len, curlen);
@ -792,39 +817,32 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
 			kunmap_atomic(destp);
 			srcp += curlen;
 			copy_len -= curlen;
-			if (copy_len == 0)
+			fixup_copy_count += curlen;
 			pagelist_len -= curlen;
 			if (!pagelist_len)
 				break;
 			page_base = 0;
 		}
 		/* Implicit padding for the last segment in a Write
 		 * chunk is inserted inline at the front of the tail
 		 * iovec. The upper layer ignores the content of
 		 * the pad. Simply ensure inline content in the tail
 		 * that follows the Write chunk is properly aligned.
 		 */
 		if (pad)
 			srcp -= pad;
 	}
-	if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
+	/* The tail iovec is redirected to the remaining data
-		curlen = copy_len;
+	 * in the receive buffer, to avoid a memcopy.
-		if (curlen > rqst->rq_rcv_buf.tail[0].iov_len)
+	 */
-			curlen = rqst->rq_rcv_buf.tail[0].iov_len;
+	if (copy_len || pad) {
-		if (rqst->rq_rcv_buf.tail[0].iov_base != srcp)
+		rqst->rq_rcv_buf.tail[0].iov_base = srcp;
-			memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen);
+		rqst->rq_private_buf.tail[0].iov_base = srcp;
 		dprintk("RPC:       %s: tail srcp 0x%p len %d curlen %d\n",
 			__func__, srcp, copy_len, curlen);
 		rqst->rq_rcv_buf.tail[0].iov_len = curlen;
 		copy_len -= curlen; ++i;
 	} else
 		rqst->rq_rcv_buf.tail[0].iov_len = 0;
 	if (pad) {
 		/* implicit padding on terminal chunk */
 		unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base;
 		while (pad--)
 			p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0;
 	}
-	if (copy_len)
+	return fixup_copy_count;
 		dprintk("RPC:       %s: %d bytes in"
 			" %d extra segments (%d lost)\n",
 			__func__, olen, i, copy_len);
 	/* TBD avoid a warning from call_decode() */
 	rqst->rq_private_buf = rqst->rq_rcv_buf;
 }
 void
@ -960,14 +978,13 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 		    (headerp->rm_body.rm_chunks[1] == xdr_zero &&
 		     headerp->rm_body.rm_chunks[2] != xdr_zero) ||
 		    (headerp->rm_body.rm_chunks[1] != xdr_zero &&
-		     req->rl_nchunks == 0))
+		     list_empty(&req->rl_registered)))
 			goto badheader;
 		if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
 			/* count any expected write chunks in read reply */
 			/* start at write chunk array count */
 			iptr = &headerp->rm_body.rm_chunks[2];
-			rdmalen = rpcrdma_count_chunks(rep,
+			rdmalen = rpcrdma_count_chunks(rep, 1, &iptr);
 						req->rl_nchunks, 1, &iptr);
 			/* check for validity, and no reply chunk after */
 			if (rdmalen < 0 || *iptr++ != xdr_zero)
 				goto badheader;
@ -988,8 +1005,10 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 			rep->rr_len -= RPCRDMA_HDRLEN_MIN;
 			status = rep->rr_len;
 		}
-		/* Fix up the rpc results for upper layer */
+
-		rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen);
+		r_xprt->rx_stats.fixup_copy_count +=
 			rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len,
 					     rdmalen);
 		break;
 	case rdma_nomsg:
@ -997,11 +1016,11 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 		if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
 		    headerp->rm_body.rm_chunks[1] != xdr_zero ||
 		    headerp->rm_body.rm_chunks[2] != xdr_one ||
-		    req->rl_nchunks == 0)
+		    list_empty(&req->rl_registered))
 			goto badheader;
 		iptr = (__be32 *)((unsigned char *)headerp +
 							RPCRDMA_HDRLEN_MIN);
-		rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr);
+		rdmalen = rpcrdma_count_chunks(rep, 0, &iptr);
 		if (rdmalen < 0)
 			goto badheader;
 		r_xprt->rx_stats.total_rdma_reply += rdmalen;
@ -1014,14 +1033,9 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 badheader:
 	default:
-		dprintk("%s: invalid rpcrdma reply header (type %d):"
+		dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n",
-				" chunks[012] == %d %d %d"
+			rqst->rq_task->tk_pid, __func__,
-				" expected chunks <= %d\n",
+			be32_to_cpu(headerp->rm_type));
 				__func__, be32_to_cpu(headerp->rm_type),
 				headerp->rm_body.rm_chunks[0],
 				headerp->rm_body.rm_chunks[1],
 				headerp->rm_body.rm_chunks[2],
 				req->rl_nchunks);
 		status = -EIO;
 		r_xprt->rx_stats.bad_reply_count++;
 		break;
@ -1035,7 +1049,7 @@ out:
 	 * control: waking the next RPC waits until this RPC has
 	 * relinquished all its Send Queue entries.
 	 */
-	if (req->rl_nchunks)
+	if (!list_empty(&req->rl_registered))
 		r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req);
 	spin_lock_bh(&xprt->transport_lock);
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@ -558,7 +558,6 @@ out_sendbuf:
 out_fail:
 	rpcrdma_buffer_put(req);
 	r_xprt->rx_stats.failed_marshal_count++;
 	return NULL;
 }
@ -590,8 +589,19 @@ xprt_rdma_free(void *buffer)
 	rpcrdma_buffer_put(req);
 }
-/*
+/**
 * xprt_rdma_send_request - marshal and send an RPC request
 * @task: RPC task with an RPC message in rq_snd_buf
 *
 * Return values:
 *        0:	The request has been sent
 * ENOTCONN:	Caller needs to invoke connect logic then call again
 *  ENOBUFS:	Call again later to send the request
 *      EIO:	A permanent error occurred. The request was not sent,
 *		and don't try it again
 *
 * send_request invokes the meat of RPC RDMA. It must do the following:
 *
 *  1.  Marshal the RPC request into an RPC RDMA request, which means
 *	putting a header in front of data, and creating IOVs for RDMA
 *	from those in the request.
@ -600,7 +610,6 @@ xprt_rdma_free(void *buffer)
 *	the request (rpcrdma_ep_post).
 *  4.  No partial sends are possible in the RPC-RDMA protocol (as in UDP).
 */
 static int
 xprt_rdma_send_request(struct rpc_task *task)
 {
@ -610,6 +619,9 @@ xprt_rdma_send_request(struct rpc_task *task)
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 	int rc = 0;
 	/* On retransmit, remove any previously registered chunks */
 	r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
 	rc = rpcrdma_marshal_req(rqst);
 	if (rc < 0)
 		goto failed_marshal;
@ -630,11 +642,12 @@ xprt_rdma_send_request(struct rpc_task *task)
 	return 0;
 failed_marshal:
 	r_xprt->rx_stats.failed_marshal_count++;
 	dprintk("RPC:       %s: rpcrdma_marshal_req failed, status %i\n",
 		__func__, rc);
 	if (rc == -EIO)
-		return -EIO;
+		r_xprt->rx_stats.failed_marshal_count++;
 	if (rc != -ENOTCONN)
 		return rc;
 drop_connection:
 	xprt_disconnect_done(xprt);
 	return -ENOTCONN;	/* implies disconnect */
@ -660,7 +673,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
 		   xprt->stat.bad_xids,
 		   xprt->stat.req_u,
 		   xprt->stat.bklog_u);
-	seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n",
+	seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu ",
 		   r_xprt->rx_stats.read_chunk_count,
 		   r_xprt->rx_stats.write_chunk_count,
 		   r_xprt->rx_stats.reply_chunk_count,
@ -672,6 +685,10 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
 		   r_xprt->rx_stats.failed_marshal_count,
 		   r_xprt->rx_stats.bad_reply_count,
 		   r_xprt->rx_stats.nomsg_call_count);
 	seq_printf(seq, "%lu %lu %lu\n",
 		   r_xprt->rx_stats.mrs_recovered,
 		   r_xprt->rx_stats.mrs_orphaned,
 		   r_xprt->rx_stats.mrs_allocated);
 }
 static int
@ -741,7 +758,6 @@ void xprt_rdma_cleanup(void)
 			__func__, rc);
 	rpcrdma_destroy_wq();
 	frwr_destroy_recovery_wq();
 	rc = xprt_unregister_transport(&xprt_rdma_bc);
 	if (rc)
@ -753,20 +769,13 @@ int xprt_rdma_init(void)
 {
 	int rc;
-	rc = frwr_alloc_recovery_wq();
+	rc = rpcrdma_alloc_wq();
 	if (rc)
 		return rc;
 	rc = rpcrdma_alloc_wq();
 	if (rc) {
 		frwr_destroy_recovery_wq();
 		return rc;
 	}
 	rc = xprt_register_transport(&xprt_rdma);
 	if (rc) {
 		rpcrdma_destroy_wq();
 		frwr_destroy_recovery_wq();
 		return rc;
 	}
@ -774,7 +783,6 @@ int xprt_rdma_init(void)
 	if (rc) {
 		xprt_unregister_transport(&xprt_rdma);
 		rpcrdma_destroy_wq();
 		frwr_destroy_recovery_wq();
 		return rc;
 	}
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@ -379,8 +379,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 	struct rpcrdma_ia *ia = &xprt->rx_ia;
 	int rc;
 	ia->ri_dma_mr = NULL;
 	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
 	if (IS_ERR(ia->ri_id)) {
 		rc = PTR_ERR(ia->ri_id);
@ -391,47 +389,29 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 	ia->ri_pd = ib_alloc_pd(ia->ri_device);
 	if (IS_ERR(ia->ri_pd)) {
 		rc = PTR_ERR(ia->ri_pd);
-		dprintk("RPC:       %s: ib_alloc_pd() failed %i\n",
+		pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
 			__func__, rc);
 		goto out2;
 	}
 	if (memreg == RPCRDMA_FRMR) {
 		if (!(ia->ri_device->attrs.device_cap_flags &
 				IB_DEVICE_MEM_MGT_EXTENSIONS) ||
 		    (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) {
 			dprintk("RPC:       %s: FRMR registration "
 				"not supported by HCA\n", __func__);
 			memreg = RPCRDMA_MTHCAFMR;
 		}
 	}
 	if (memreg == RPCRDMA_MTHCAFMR) {
 		if (!ia->ri_device->alloc_fmr) {
 			dprintk("RPC:       %s: MTHCAFMR registration "
 				"not supported by HCA\n", __func__);
 			rc = -EINVAL;
 			goto out3;
 		}
 	}
 	switch (memreg) {
 	case RPCRDMA_FRMR:
-		ia->ri_ops = &rpcrdma_frwr_memreg_ops;
+		if (frwr_is_supported(ia)) {
-		break;
+			ia->ri_ops = &rpcrdma_frwr_memreg_ops;
-	case RPCRDMA_ALLPHYSICAL:
+			break;
-		ia->ri_ops = &rpcrdma_physical_memreg_ops;
+		}
-		break;
+		/*FALLTHROUGH*/
 	case RPCRDMA_MTHCAFMR:
-		ia->ri_ops = &rpcrdma_fmr_memreg_ops;
+		if (fmr_is_supported(ia)) {
-		break;
+			ia->ri_ops = &rpcrdma_fmr_memreg_ops;
 			break;
 		}
 		/*FALLTHROUGH*/
 	default:
-		printk(KERN_ERR "RPC: Unsupported memory "
+		pr_err("rpcrdma: Unsupported memory registration mode: %d\n",
-				"registration mode: %d\n", memreg);
+		       memreg);
-		rc = -ENOMEM;
+		rc = -EINVAL;
 		goto out3;
 	}
 	dprintk("RPC:       %s: memory registration strategy is '%s'\n",
 		__func__, ia->ri_ops->ro_displayname);
 	return 0;
@ -585,8 +565,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 out2:
 	ib_free_cq(sendcq);
 out1:
 	if (ia->ri_dma_mr)
 		ib_dereg_mr(ia->ri_dma_mr);
 	return rc;
 }
@ -600,8 +578,6 @@ out1:
 void
 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 {
 	int rc;
 	dprintk("RPC:       %s: entering, connected is %d\n",
 		__func__, ep->rep_connected);
@ -615,12 +591,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 	ib_free_cq(ep->rep_attr.recv_cq);
 	ib_free_cq(ep->rep_attr.send_cq);
 	if (ia->ri_dma_mr) {
 		rc = ib_dereg_mr(ia->ri_dma_mr);
 		dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
 			__func__, rc);
 	}
 }
 /*
@ -777,6 +747,90 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 	ib_drain_qp(ia->ri_id->qp);
 }
 /*
 * rpcrdma_mr_recovery_worker - delayed-work handler that drains rb_stale_mrs
 * @work: work item embedded in a struct rpcrdma_buffer as
 *        rb_recovery_worker.work
 *
 * Takes MWs one at a time from the head of buf->rb_stale_mrs and hands
 * each to the ->ro_recover_mr method of the MW's transport. The loop
 * ends when the list is found empty.
 */
 static void
 rpcrdma_mr_recovery_worker(struct work_struct *work)
 {
 	struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
 						  rb_recovery_worker.work);
 	struct rpcrdma_mw *mw;
 	spin_lock(&buf->rb_recovery_lock);
 	while (!list_empty(&buf->rb_stale_mrs)) {
 		mw = list_first_entry(&buf->rb_stale_mrs,
 				      struct rpcrdma_mw, mw_list);
 		list_del_init(&mw->mw_list);
 		/* rb_recovery_lock is not held across ->ro_recover_mr */
 		spin_unlock(&buf->rb_recovery_lock);
 		dprintk("RPC:       %s: recovering MR %p\n", __func__, mw);
 		mw->mw_xprt->rx_ia.ri_ops->ro_recover_mr(mw);
 		/* Retake the lock before re-checking the list */
 		spin_lock(&buf->rb_recovery_lock);
 	}
 	spin_unlock(&buf->rb_recovery_lock);
 }
 /*
 * rpcrdma_defer_mr_recovery - queue an MW for recovery by the worker
 * @mw: MW to recover; its mw_xprt field selects the transport
 *
 * Adds @mw to the rb_stale_mrs list of its transport's buffer while
 * holding rb_recovery_lock, then schedules rb_recovery_worker with a
 * delay of zero.
 */
 void
 rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw)
 {
 	struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 	spin_lock(&buf->rb_recovery_lock);
 	list_add(&mw->mw_list, &buf->rb_stale_mrs);
 	spin_unlock(&buf->rb_recovery_lock);
 	schedule_delayed_work(&buf->rb_recovery_worker, 0);
 }
 /*
 * rpcrdma_create_mrs - allocate a batch of MWs for a transport
 * @r_xprt: transport whose rx_buf receives the new MWs
 *
 * Tries to create up to 32 MWs. Each is zero-allocated with GFP_KERNEL
 * and set up by the ->ro_init_mr method. The loop stops at the first
 * allocation or ->ro_init_mr failure, so fewer than 32 (possibly zero)
 * MWs may be created; no error is reported to the caller.
 *
 * The new MWs are collected on two local lists and spliced onto
 * buf->rb_mws and buf->rb_all under rb_mwlock, and the number created
 * is added to rx_stats.mrs_allocated.
 */
 static void
 rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt)
 {
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	unsigned int count;
 	LIST_HEAD(free);
 	LIST_HEAD(all);
 	for (count = 0; count < 32; count++) {
 		struct rpcrdma_mw *mw;
 		int rc;
 		mw = kzalloc(sizeof(*mw), GFP_KERNEL);
 		if (!mw)
 			break;
 		rc = ia->ri_ops->ro_init_mr(ia, mw);
 		if (rc) {
 			/* Initialization failed: discard this MW and stop */
 			kfree(mw);
 			break;
 		}
 		mw->mw_xprt = r_xprt;
 		list_add(&mw->mw_list, &free);
 		list_add(&mw->mw_all, &all);
 	}
 	/* On exit from the loop, count is the number of MWs created */
 	spin_lock(&buf->rb_mwlock);
 	list_splice(&free, &buf->rb_mws);
 	list_splice(&all, &buf->rb_all);
 	r_xprt->rx_stats.mrs_allocated += count;
 	spin_unlock(&buf->rb_mwlock);
 	dprintk("RPC:       %s: created %u MRs\n", __func__, count);
 }
 /*
 * rpcrdma_mr_refresh_worker - delayed-work handler that creates more MWs
 * @work: work item embedded in a struct rpcrdma_buffer as
 *        rb_refresh_worker.work
 *
 * Recovers the owning rpcrdma_xprt from the buffer (the buffer is the
 * rx_buf member of the transport) and calls rpcrdma_create_mrs() on it.
 */
 static void
 rpcrdma_mr_refresh_worker(struct work_struct *work)
 {
 	struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
 						  rb_refresh_worker.work);
 	struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
 						   rx_buf);
 	rpcrdma_create_mrs(r_xprt);
 }
 struct rpcrdma_req *
 rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
 {
@ -793,6 +847,7 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
 	spin_unlock(&buffer->rb_reqslock);
 	req->rl_cqe.done = rpcrdma_wc_send;
 	req->rl_buffer = &r_xprt->rx_buf;
 	INIT_LIST_HEAD(&req->rl_registered);
 	return req;
 }
@ -832,17 +887,23 @@ int
 rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
 {
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	int i, rc;
 	buf->rb_max_requests = r_xprt->rx_data.max_requests;
 	buf->rb_bc_srv_max_requests = 0;
 	spin_lock_init(&buf->rb_lock);
 	atomic_set(&buf->rb_credits, 1);
 	spin_lock_init(&buf->rb_mwlock);
 	spin_lock_init(&buf->rb_lock);
 	spin_lock_init(&buf->rb_recovery_lock);
 	INIT_LIST_HEAD(&buf->rb_mws);
 	INIT_LIST_HEAD(&buf->rb_all);
 	INIT_LIST_HEAD(&buf->rb_stale_mrs);
 	INIT_DELAYED_WORK(&buf->rb_refresh_worker,
 			  rpcrdma_mr_refresh_worker);
 	INIT_DELAYED_WORK(&buf->rb_recovery_worker,
 			  rpcrdma_mr_recovery_worker);
-	rc = ia->ri_ops->ro_init(r_xprt);
+	rpcrdma_create_mrs(r_xprt);
 	if (rc)
 		goto out;
 	INIT_LIST_HEAD(&buf->rb_send_bufs);
 	INIT_LIST_HEAD(&buf->rb_allreqs);
@ -862,7 +923,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
 	}
 	INIT_LIST_HEAD(&buf->rb_recv_bufs);
-	for (i = 0; i < buf->rb_max_requests + 2; i++) {
+	for (i = 0; i < buf->rb_max_requests; i++) {
 		struct rpcrdma_rep *rep;
 		rep = rpcrdma_create_rep(r_xprt);
@ -918,11 +979,39 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
 	kfree(req);
 }
 /*
 * rpcrdma_destroy_mrs - release every MW tracked on buf->rb_all
 * @buf: buffer whose MWs are to be released
 *
 * Unlinks each MW from rb_all while holding rb_mwlock, then drops the
 * lock to call the ->ro_release_mr method on it. Once the list is empty,
 * rx_stats.mrs_allocated is reset to zero.
 */
 static void
 rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf)
 {
 	struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
 						   rx_buf);
 	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
 	struct rpcrdma_mw *mw;
 	unsigned int count;
 	count = 0;
 	spin_lock(&buf->rb_mwlock);
 	while (!list_empty(&buf->rb_all)) {
 		mw = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
 		list_del(&mw->mw_all);
 		/* rb_mwlock is not held across ->ro_release_mr */
 		spin_unlock(&buf->rb_mwlock);
 		ia->ri_ops->ro_release_mr(mw);
 		count++;
 		spin_lock(&buf->rb_mwlock);
 	}
 	spin_unlock(&buf->rb_mwlock);
 	r_xprt->rx_stats.mrs_allocated = 0;
 	dprintk("RPC:       %s: released %u MRs\n", __func__, count);
 }
 void
 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 {
 	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
 	cancel_delayed_work_sync(&buf->rb_recovery_worker);
 	while (!list_empty(&buf->rb_recv_bufs)) {
 		struct rpcrdma_rep *rep;
@ -944,7 +1033,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 	}
 	spin_unlock(&buf->rb_reqslock);
-	ia->ri_ops->ro_destroy(buf);
+	rpcrdma_destroy_mrs(buf);
 }
 struct rpcrdma_mw *
@ -962,8 +1051,17 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
 	spin_unlock(&buf->rb_mwlock);
 	if (!mw)
-		pr_err("RPC:       %s: no MWs available\n", __func__);
+		goto out_nomws;
 	return mw;
 out_nomws:
 	dprintk("RPC:       %s: no MWs available\n", __func__);
 	schedule_delayed_work(&buf->rb_refresh_worker, 0);
 	/* Allow the reply handler and refresh worker to run */
 	cond_resched();
 	return NULL;
 }
 void
@ -978,8 +1076,6 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
 /*
 * Get a set of request/reply buffers.
 *
 * Reply buffer (if available) is attached to send buffer upon return.
 */
 struct rpcrdma_req *
 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
@ -998,13 +1094,13 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
 out_reqbuf:
 	spin_unlock(&buffers->rb_lock);
-	pr_warn("RPC:       %s: out of request buffers\n", __func__);
+	pr_warn("rpcrdma: out of request buffers (%p)\n", buffers);
 	return NULL;
 out_repbuf:
 	list_add(&req->rl_free, &buffers->rb_send_bufs);
 	spin_unlock(&buffers->rb_lock);
-	pr_warn("RPC:       %s: out of reply buffers\n", __func__);
+	pr_warn("rpcrdma: out of reply buffers (%p)\n", buffers);
-	req->rl_reply = NULL;
+	return NULL;
 	return req;
 }
 /*
@ -1060,14 +1156,6 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
 */
 /*
 * rpcrdma_mapping_error - emit a debug message describing a segment
 * @seg: segment whose mr_offset, mr_dma and mr_dmalen are printed
 *
 * Only logs via dprintk(); it does not modify @seg or return a status.
 */
 void
 rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
 {
 	dprintk("RPC:       map_one: offset %p iova %llx len %zu\n",
 		seg->mr_offset,
 		(unsigned long long)seg->mr_dma, seg->mr_dmalen);
 }
 /**
 * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
 * @ia: controlling rpcrdma_ia
@ -1150,7 +1238,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
 	if (rep) {
 		rc = rpcrdma_ep_post_recv(ia, ep, rep);
 		if (rc)
-			goto out;
+			return rc;
 		req->rl_reply = NULL;
 	}
@ -1175,10 +1263,12 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
 	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
 	if (rc)
-		dprintk("RPC:       %s: ib_post_send returned %i\n", __func__,
+		goto out_postsend_err;
-			rc);
+	return 0;
-out:
+
-	return rc;
+out_postsend_err:
 	pr_err("rpcrdma: RDMA Send ib_post_send returned %i\n", rc);
 	return -ENOTCONN;
 }
 /*
@ -1203,11 +1293,13 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
 				   DMA_BIDIRECTIONAL);
 	rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
 	if (rc)
-		dprintk("RPC:       %s: ib_post_recv returned %i\n", __func__,
+		goto out_postrecv;
-			rc);
+	return 0;
-	return rc;
+
 out_postrecv:
 	pr_err("rpcrdma: ib_post_recv returned %i\n", rc);
 	return -ENOTCONN;
 }
 /**
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@ -68,7 +68,6 @@ struct rpcrdma_ia {
 	struct ib_device	*ri_device;
 	struct rdma_cm_id 	*ri_id;
 	struct ib_pd		*ri_pd;
 	struct ib_mr		*ri_dma_mr;
 	struct completion	ri_done;
 	int			ri_async_rc;
 	unsigned int		ri_max_frmr_depth;
@ -172,23 +171,14 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
 *   o recv buffer (posted to provider)
 *   o ib_sge (also donated to provider)
 *   o status of reply (length, success or not)
- *   o bookkeeping state to get run by tasklet (list, etc)
+ *   o bookkeeping state to get run by reply handler (list, etc)
 *
- * These are allocated during initialization, per-transport instance;
+ * These are allocated during initialization, per-transport instance.
 * however, the tasklet execution list itself is global, as it should
 * always be pretty short.
 *
 * N of these are associated with a transport instance, and stored in
 * struct rpcrdma_buffer. N is the max number of outstanding requests.
 */
 #define RPCRDMA_MAX_DATA_SEGS	((1 * 1024 * 1024) / PAGE_SIZE)
 /* data segments + head/tail for Call + head/tail for Reply */
 #define RPCRDMA_MAX_SEGS 	(RPCRDMA_MAX_DATA_SEGS + 4)
 struct rpcrdma_buffer;
 struct rpcrdma_rep {
 	struct ib_cqe		rr_cqe;
 	unsigned int		rr_len;
@ -221,9 +211,6 @@ enum rpcrdma_frmr_state {
 };
 struct rpcrdma_frmr {
 	struct scatterlist		*fr_sg;
 	int				fr_nents;
 	enum dma_data_direction		fr_dir;
 	struct ib_mr			*fr_mr;
 	struct ib_cqe			fr_cqe;
 	enum rpcrdma_frmr_state		fr_state;
@ -235,18 +222,23 @@ struct rpcrdma_frmr {
 };
 struct rpcrdma_fmr {
-	struct ib_fmr		*fmr;
+	struct ib_fmr		*fm_mr;
-	u64			*physaddrs;
+	u64			*fm_physaddrs;
 };
 struct rpcrdma_mw {
 	struct list_head	mw_list;
 	struct scatterlist	*mw_sg;
 	int			mw_nents;
 	enum dma_data_direction	mw_dir;
 	union {
 		struct rpcrdma_fmr	fmr;
 		struct rpcrdma_frmr	frmr;
 	};
 	struct work_struct	mw_work;
 	struct rpcrdma_xprt	*mw_xprt;
-	struct list_head	mw_list;
+	u32			mw_handle;
 	u32			mw_length;
 	u64			mw_offset;
 	struct list_head	mw_all;
 };
@ -266,33 +258,30 @@ struct rpcrdma_mw {
 * of iovs for send operations. The reason is that the iovs passed to
 * ib_post_{send,recv} must not be modified until the work request
 * completes.
 *
 * NOTES:
 *   o RPCRDMA_MAX_SEGS is the max number of addressable chunk elements we
 *     marshal. The number needed varies depending on the iov lists that
 *     are passed to us, the memory registration mode we are in, and if
 *     physical addressing is used, the layout.
 */
 /* Maximum number of page-sized "segments" per chunk list to be
 * registered or invalidated. Must handle a Reply chunk:
 */
 enum {
 	RPCRDMA_MAX_IOV_SEGS	= 3,
 	RPCRDMA_MAX_DATA_SEGS	= ((1 * 1024 * 1024) / PAGE_SIZE) + 1,
 	RPCRDMA_MAX_SEGS	= RPCRDMA_MAX_DATA_SEGS +
 				  RPCRDMA_MAX_IOV_SEGS,
 };
 struct rpcrdma_mr_seg {		/* chunk descriptors */
 	struct rpcrdma_mw *rl_mw;	/* registered MR */
 	u64		mr_base;	/* registration result */
 	u32		mr_rkey;	/* registration result */
 	u32		mr_len;		/* length of chunk or segment */
 	int		mr_nsegs;	/* number of segments in chunk or 0 */
 	enum dma_data_direction	mr_dir;	/* segment mapping direction */
 	dma_addr_t	mr_dma;		/* segment mapping address */
 	size_t		mr_dmalen;	/* segment mapping length */
 	struct page	*mr_page;	/* owning page, if any */
 	char		*mr_offset;	/* kva if no page, else offset */
 };
 #define RPCRDMA_MAX_IOVS	(2)
 struct rpcrdma_buffer;
 struct rpcrdma_req {
 	struct list_head	rl_free;
 	unsigned int		rl_niovs;
 	unsigned int		rl_nchunks;
 	unsigned int		rl_connect_cookie;
 	struct rpc_task		*rl_task;
 	struct rpcrdma_buffer	*rl_buffer;
@ -300,12 +289,13 @@ struct rpcrdma_req {
 	struct ib_sge		rl_send_iov[RPCRDMA_MAX_IOVS];
 	struct rpcrdma_regbuf	*rl_rdmabuf;
 	struct rpcrdma_regbuf	*rl_sendbuf;
 	struct rpcrdma_mr_seg	rl_segments[RPCRDMA_MAX_SEGS];
 	struct rpcrdma_mr_seg	*rl_nextseg;
 	struct ib_cqe		rl_cqe;
 	struct list_head	rl_all;
 	bool			rl_backchannel;
 	struct list_head	rl_registered;	/* registered segments */
 	struct rpcrdma_mr_seg	rl_segments[RPCRDMA_MAX_SEGS];
 };
 static inline struct rpcrdma_req *
@ -341,6 +331,11 @@ struct rpcrdma_buffer {
 	struct list_head	rb_allreqs;
 	u32			rb_bc_max_requests;
 	spinlock_t		rb_recovery_lock; /* protect rb_stale_mrs */
 	struct list_head	rb_stale_mrs;
 	struct delayed_work	rb_recovery_worker;
 	struct delayed_work	rb_refresh_worker;
 };
 #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
@ -387,6 +382,9 @@ struct rpcrdma_stats {
 	unsigned long		bad_reply_count;
 	unsigned long		nomsg_call_count;
 	unsigned long		bcall_count;
 	unsigned long		mrs_recovered;
 	unsigned long		mrs_orphaned;
 	unsigned long		mrs_allocated;
 };
 /*
@ -395,23 +393,25 @@ struct rpcrdma_stats {
 struct rpcrdma_xprt;
 struct rpcrdma_memreg_ops {
 	int		(*ro_map)(struct rpcrdma_xprt *,
-				  struct rpcrdma_mr_seg *, int, bool);
+				  struct rpcrdma_mr_seg *, int, bool,
 				  struct rpcrdma_mw **);
 	void		(*ro_unmap_sync)(struct rpcrdma_xprt *,
 					 struct rpcrdma_req *);
 	void		(*ro_unmap_safe)(struct rpcrdma_xprt *,
 					 struct rpcrdma_req *, bool);
 	void		(*ro_recover_mr)(struct rpcrdma_mw *);
 	int		(*ro_open)(struct rpcrdma_ia *,
 				   struct rpcrdma_ep *,
 				   struct rpcrdma_create_data_internal *);
 	size_t		(*ro_maxpages)(struct rpcrdma_xprt *);
-	int		(*ro_init)(struct rpcrdma_xprt *);
+	int		(*ro_init_mr)(struct rpcrdma_ia *,
-	void		(*ro_destroy)(struct rpcrdma_buffer *);
+				      struct rpcrdma_mw *);
 	void		(*ro_release_mr)(struct rpcrdma_mw *);
 	const char	*ro_displayname;
 };
 extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
 extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops;
 extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops;
 /*
 * RPCRDMA transport -- encapsulates the structures above for
@ -446,6 +446,8 @@ extern int xprt_rdma_pad_optimize;
 */
 int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
 void rpcrdma_ia_close(struct rpcrdma_ia *);
 bool frwr_is_supported(struct rpcrdma_ia *);
 bool fmr_is_supported(struct rpcrdma_ia *);
 /*
 * Endpoint calls - xprtrdma/verbs.c
@ -477,6 +479,8 @@ void rpcrdma_buffer_put(struct rpcrdma_req *);
 void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
 void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
 void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *);
 struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
 					    size_t, gfp_t);
 void rpcrdma_free_regbuf(struct rpcrdma_ia *,
@ -484,9 +488,6 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *,
 int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
 int frwr_alloc_recovery_wq(void);
 void frwr_destroy_recovery_wq(void);
 int rpcrdma_alloc_wq(void);
 void rpcrdma_destroy_wq(void);
@ -494,45 +495,12 @@ void rpcrdma_destroy_wq(void);
 * Wrappers for chunk registration, shared by read/write chunk code.
 */
 void rpcrdma_mapping_error(struct rpcrdma_mr_seg *);
 /*
 * rpcrdma_data_dir - choose a DMA direction from a "writing" flag
 * @writing: selects the direction
 *
 * Returns DMA_FROM_DEVICE when @writing is true, DMA_TO_DEVICE otherwise.
 */
 static inline enum dma_data_direction
 rpcrdma_data_dir(bool writing)
 {
 	return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
 }
 /*
 * rpcrdma_map_one - DMA-map a single segment
 * @device: IB device to map for
 * @seg: segment to map; mr_dir, mr_dmalen and mr_dma are filled in
 * @direction: DMA direction, saved in seg->mr_dir
 *
 * Uses ib_dma_map_page() when seg->mr_page is set, with the in-page
 * offset taken from seg->mr_offset; otherwise seg->mr_offset is passed
 * to ib_dma_map_single() as the address to map.
 *
 * A mapping failure is only reported through rpcrdma_mapping_error();
 * this function returns void, so the caller gets no error status.
 */
 static inline void
 rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg,
 		enum dma_data_direction direction)
 {
 	/* Saved here so rpcrdma_unmap_one() can reuse the same values */
 	seg->mr_dir = direction;
 	seg->mr_dmalen = seg->mr_len;
 	if (seg->mr_page)
 		seg->mr_dma = ib_dma_map_page(device,
 				seg->mr_page, offset_in_page(seg->mr_offset),
 				seg->mr_dmalen, seg->mr_dir);
 	else
 		seg->mr_dma = ib_dma_map_single(device,
 				seg->mr_offset,
 				seg->mr_dmalen, seg->mr_dir);
 	if (ib_dma_mapping_error(device, seg->mr_dma))
 		rpcrdma_mapping_error(seg);
 }
 /*
 * rpcrdma_unmap_one - undo the DMA mapping of a single segment
 * @device: IB device the segment was mapped for
 * @seg: segment to unmap
 *
 * Selects the page or single unmap variant using the same seg->mr_page
 * test as rpcrdma_map_one(), and passes the mr_dma, mr_dmalen and mr_dir
 * values stored in @seg.
 */
 static inline void
 rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg)
 {
 	if (seg->mr_page)
 		ib_dma_unmap_page(device,
 				  seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
 	else
 		ib_dma_unmap_single(device,
 				    seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
 }
 /*
 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
 */
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@ -124,7 +124,7 @@ static struct ctl_table xs_tunables_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &xprt_min_resvport_limit,
-		.extra2		= &xprt_max_resvport_limit
+		.extra2		= &xprt_max_resvport
 	},
 	{
 		.procname	= "max_resvport",
@ -132,7 +132,7 @@ static struct ctl_table xs_tunables_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &xprt_min_resvport_limit,
+		.extra1		= &xprt_min_resvport,
 		.extra2		= &xprt_max_resvport_limit
 	},
 	{
@ -642,6 +642,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
 	struct xdr_buf *xdr = &req->rq_snd_buf;
 	bool zerocopy = true;
 	bool vm_wait = false;
 	int status;
 	int sent;
@ -677,15 +678,33 @@ static int xs_tcp_send_request(struct rpc_task *task)
 			return 0;
 		}
 		WARN_ON_ONCE(sent == 0 && status == 0);
 		if (status == -EAGAIN ) {
 			/*
 			 * Return EAGAIN if we're sure we're hitting the
 			 * socket send buffer limits.
 			 */
 			if (test_bit(SOCK_NOSPACE, &transport->sock->flags))
 				break;
 			/*
 			 * Did we hit a memory allocation failure?
 			 */
 			if (sent == 0) {
 				status = -ENOBUFS;
 				if (vm_wait)
 					break;
 				/* Retry, knowing now that we're below the
 				 * socket send buffer limit
 				 */
 				vm_wait = true;
 			}
 			continue;
 		}
 		if (status < 0)
 			break;
-		if (sent == 0) {
+		vm_wait = false;
 			status = -EAGAIN;
 			break;
 		}
 	}
 	if (status == -EAGAIN && sk_stream_is_writeable(transport->inet))
 		status = -ENOBUFS;
 	switch (status) {
 	case -ENOTSOCK:
@ -755,11 +774,19 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s
 	sk->sk_error_report = transport->old_error_report;
 }
 /*
 * xs_sock_reset_state_flags - clear socket state flags for a transport
 * @xprt: transport embedded in a struct sock_xprt
 *
 * Clears XPRT_SOCK_DATA_READY in the containing sock_xprt's sock_state.
 */
 static void xs_sock_reset_state_flags(struct rpc_xprt *xprt)
 {
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
 	clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
 }
 /*
 * xs_sock_reset_connection_flags - clear connection-related flags
 * @xprt: transport whose flags are reset
 *
 * Clears XPRT_CLOSE_WAIT and XPRT_CLOSING in xprt->state, then calls
 * xs_sock_reset_state_flags(). The clear_bit() calls are bracketed by
 * smp_mb__before_atomic() and smp_mb__after_atomic().
 */
 static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt)
 {
 	smp_mb__before_atomic();
 	clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
 	clear_bit(XPRT_CLOSING, &xprt->state);
 	xs_sock_reset_state_flags(xprt);
 	smp_mb__after_atomic();
 }
@ -962,10 +989,13 @@ static void xs_local_data_receive(struct sock_xprt *transport)
 		goto out;
 	for (;;) {
 		skb = skb_recv_datagram(sk, 0, 1, &err);
-		if (skb == NULL)
+		if (skb != NULL) {
 			xs_local_data_read_skb(&transport->xprt, sk, skb);
 			skb_free_datagram(sk, skb);
 			continue;
 		}
 		if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
 			break;
 		xs_local_data_read_skb(&transport->xprt, sk, skb);
 		skb_free_datagram(sk, skb);
 	}
 out:
 	mutex_unlock(&transport->recv_mutex);
@ -1043,10 +1073,13 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
 		goto out;
 	for (;;) {
 		skb = skb_recv_datagram(sk, 0, 1, &err);
-		if (skb == NULL)
+		if (skb != NULL) {
 			xs_udp_data_read_skb(&transport->xprt, sk, skb);
 			skb_free_datagram(sk, skb);
 			continue;
 		}
 		if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
 			break;
 		xs_udp_data_read_skb(&transport->xprt, sk, skb);
 		skb_free_datagram(sk, skb);
 	}
 out:
 	mutex_unlock(&transport->recv_mutex);
@ -1074,7 +1107,14 @@ static void xs_data_ready(struct sock *sk)
 	if (xprt != NULL) {
 		struct sock_xprt *transport = container_of(xprt,
 				struct sock_xprt, xprt);
-		queue_work(rpciod_workqueue, &transport->recv_worker);
+		transport->old_data_ready(sk);
 		/* Any data means we had a useful conversation, so
 		 * then we don't need to delay the next reconnect
 		 */
 		if (xprt->reestablish_timeout)
 			xprt->reestablish_timeout = 0;
 		if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
 			queue_work(xprtiod_workqueue, &transport->recv_worker);
 	}
 	read_unlock_bh(&sk->sk_callback_lock);
 }
@ -1474,10 +1514,15 @@ static void xs_tcp_data_receive(struct sock_xprt *transport)
 	for (;;) {
 		lock_sock(sk);
 		read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
-		release_sock(sk);
+		if (read <= 0) {
-		if (read <= 0)
+			clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
-			break;
+			release_sock(sk);
-		total += read;
+			if (!test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
 				break;
 		} else {
 			release_sock(sk);
 			total += read;
 		}
 		rd_desc.count = 65536;
 	}
 out:
@ -1492,34 +1537,6 @@ static void xs_tcp_data_receive_workfn(struct work_struct *work)
 	xs_tcp_data_receive(transport);
 }
 /**
 * xs_tcp_data_ready - "data ready" callback for TCP sockets
 * @sk: socket with data to read
 *
 */
 static void xs_tcp_data_ready(struct sock *sk)
 {
 	struct sock_xprt *transport;
 	struct rpc_xprt *xprt;
 	dprintk("RPC:       xs_tcp_data_ready...\n");
 	read_lock_bh(&sk->sk_callback_lock);
 	if (!(xprt = xprt_from_sock(sk)))
 		goto out;
 	transport = container_of(xprt, struct sock_xprt, xprt);
 	/* Any data means we had a useful conversation, so
 	 * the we don't need to delay the next reconnect
 	 */
 	if (xprt->reestablish_timeout)
 		xprt->reestablish_timeout = 0;
 	queue_work(rpciod_workqueue, &transport->recv_worker);
 out:
 	read_unlock_bh(&sk->sk_callback_lock);
 }
 /**
 * xs_tcp_state_change - callback to handle TCP socket state changes
 * @sk: socket whose state has changed
@ -1714,7 +1731,7 @@ static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task)
 static unsigned short xs_get_random_port(void)
 {
-	unsigned short range = xprt_max_resvport - xprt_min_resvport;
+	unsigned short range = xprt_max_resvport - xprt_min_resvport + 1;
 	unsigned short rand = (unsigned short) prandom_u32() % range;
 	return rand + xprt_min_resvport;
 }
@ -2241,7 +2258,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 		xs_save_old_callbacks(transport, sk);
 		sk->sk_user_data = xprt;
-		sk->sk_data_ready = xs_tcp_data_ready;
+		sk->sk_data_ready = xs_data_ready;
 		sk->sk_state_change = xs_tcp_state_change;
 		sk->sk_write_space = xs_tcp_write_space;
 		sock_set_flag(sk, SOCK_FASYNC);
@ -2380,7 +2397,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 		/* Start by resetting any existing state */
 		xs_reset_transport(transport);
-		queue_delayed_work(rpciod_workqueue,
+		queue_delayed_work(xprtiod_workqueue,
 				   &transport->connect_worker,
 				   xprt->reestablish_timeout);
 		xprt->reestablish_timeout <<= 1;
@ -2390,7 +2407,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 			xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO;
 	} else {
 		dprintk("RPC:       xs_connect scheduled xprt %p\n", xprt);
-		queue_delayed_work(rpciod_workqueue,
+		queue_delayed_work(xprtiod_workqueue,
 				   &transport->connect_worker, 0);
 	}
 }
@ -3153,8 +3170,12 @@ static int param_set_uint_minmax(const char *val,
 static int param_set_portnr(const char *val, const struct kernel_param *kp)
 {
-	return param_set_uint_minmax(val, kp,
+	if (kp->arg == &xprt_min_resvport)
 		return param_set_uint_minmax(val, kp,
 			RPC_MIN_RESVPORT,
 			xprt_max_resvport);
 	return param_set_uint_minmax(val, kp,
 			xprt_min_resvport,
 			RPC_MAX_RESVPORT);
 }