1
0
Fork 0

NFS client updates for Linux 4.8

Highlights include:
 
 Stable bugfixes:
  - nfs: don't create zero-length requests
  - Several LAYOUTGET bugfixes
 
 Features:
  - Several performance related features
    - More aggressive caching when we can rely on close-to-open cache
      consistency
    - Remove serialisation of O_DIRECT reads and writes
    - Optimise several code paths to not flush to disk unnecessarily. However
      allow for the idiosyncracies of pNFS for those layout types that need
      to issue a LAYOUTCOMMIT before the metadata can be updated on the server.
    - SUNRPC updates to the client data receive path
  - pNFS/SCSI support RH/Fedora dm-mpath device nodes
  - pNFS files/flexfiles can now use unprivileged ports when the generic NFS
    mount options allow it.
 
 Bugfixes:
  - Don't use RDMA direct data placement together with data integrity or
    privacy security flavours
  - Remove the RDMA ALLPHYSICAL memory registration mode as it has potential
    security holes.
  - Several layout recall fixes to improve NFSv4.1 protocol compliance.
  - Fix an Oops in the pNFS files and flexfiles connection setup to the DS
  - Allow retry of operations that used a returned delegation stateid
  - Don't mark the inode as revalidated if a LAYOUTCOMMIT is outstanding
  - Fix writeback races in nfs4_copy_range() and nfs42_proc_deallocate()
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v1
 
 iQIcBAABAgAGBQJXnSq8AAoJEGcL54qWCgDyn8cP/RCHLekUCq7Klh+NAnEsvuBi
 C7w9YpVHaC83/8Q0tR6LyFShSBJBWi/clWwO0IEomkNK/MuO77v4iyPujtEyqowK
 0+eWFh/e8CsTf7mNGoi0avrHAZDB3deSuOQeYbwnNWHmd7qKVkB6tHus8LQjk852
 eqwYmZ4kVr+eaCO6MttCCxJHf6datPnsbe0stiC9MpxmCzsdpZmFptfauidsFX+p
 0U1IHi/ABN6zIFoc4R0iXXbaDb8ErxGw32SWIb8cnnWwdlSD8I0+Jqxs4opp23LY
 lAm9E0vtDJ49bJBllYl0dUmizdhJ3+NefK4aqPh5H5h3Csub+MLIsuQv/+r2AOhH
 qLBi5kThpspPhGHZ40VDmfV825+csUPTc8WkDaNLvb4f4UGIPakK/KBrBtxiqn+P
 0etvYiWBuoBaqRVQpstawnyDdnBK0IMF/3LAULo+ozo7iTkpaZmOALYgPcBUYw2f
 d6pxZGeNN0GwWfjDmoUDGC07OpO/CSN5WouArgKsp5+VhjzPxjyaZLCnUhzHzXiM
 RV1oBytEs/iw2BLXX809noM9mqHYkdgSVmrZ9OvvDMslcLHaslpq6eaJKZSWqV2J
 fAws6rbcZdTFSnbAWr0OSxct6w6BijEjc3/uk+wWRtw9nkOhFqtlxI3y7k4odpW9
 wVcEmRNkxfA0LlYNXWuL
 =WNyE
 -----END PGP SIGNATURE-----

Merge tag 'nfs-for-4.8-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs

Pull NFS client updates from Trond Myklebust:
 "Highlights include:

  Stable bugfixes:
   - nfs: don't create zero-length requests

   - several LAYOUTGET bugfixes

  Features:
   - several performance related features

   - more aggressive caching when we can rely on close-to-open
     cache consistency

   - remove serialisation of O_DIRECT reads and writes

   - optimise several code paths to not flush to disk unnecessarily.

     However, allow for the idiosyncrasies of pNFS for those layout
     types that need to issue a LAYOUTCOMMIT before the metadata can
     be updated on the server.

   - SUNRPC updates to the client data receive path

   - pNFS/SCSI support RH/Fedora dm-mpath device nodes

   - pNFS files/flexfiles can now use unprivileged ports when
     the generic NFS mount options allow it.

  Bugfixes:
   - Don't use RDMA direct data placement together with data
     integrity or privacy security flavours

   - Remove the RDMA ALLPHYSICAL memory registration mode as
     it has potential security holes.

   - Several layout recall fixes to improve NFSv4.1 protocol
     compliance.

   - Fix an Oops in the pNFS files and flexfiles connection
     setup to the DS

   - Allow retry of operations that used a returned delegation
     stateid

   - Don't mark the inode as revalidated if a LAYOUTCOMMIT is
     outstanding

   - Fix writeback races in nfs4_copy_range() and
     nfs42_proc_deallocate()"

* tag 'nfs-for-4.8-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (104 commits)
  pNFS: Actively set attributes as invalid if LAYOUTCOMMIT is outstanding
  NFSv4: Clean up lookup of SECINFO_NO_NAME
  NFSv4.2: Fix warning "variable ‘stateids’ set but not used"
  NFSv4: Fix warning "no previous prototype for ‘nfs4_listxattr’"
  SUNRPC: Fix a compiler warning in fs/nfs/clnt.c
  pNFS: Remove redundant smp_mb() from pnfs_init_lseg()
  pNFS: Cleanup - do layout segment initialisation in one place
  pNFS: Remove redundant stateid invalidation
  pNFS: Remove redundant pnfs_mark_layout_returned_if_empty()
  pNFS: Clear the layout metadata if the server changed the layout stateid
  pNFS: Cleanup - don't open code pnfs_mark_layout_stateid_invalid()
  NFS: pnfs_mark_matching_lsegs_return() should match the layout sequence id
  pNFS: Do not set plh_return_seq for non-callback related layoutreturns
  pNFS: Ensure layoutreturn acts as a completion for layout callbacks
  pNFS: Fix CB_LAYOUTRECALL stateid verification
  pNFS: Always update the layout barrier seqid on LAYOUTGET
  pNFS: Always update the layout stateid if NFS_LAYOUT_INVALID_STID is set
  pNFS: Clear the layout return tracking on layout reinitialisation
  pNFS: LAYOUTRETURN should only update the stateid if the layout is valid
  nfs: don't create zero-length requests
  ...
steinar/wifi_calib_4_9_kernel
Linus Torvalds 2016-07-30 16:33:25 -07:00
commit 7f155c7026
55 changed files with 1784 additions and 1536 deletions

View File

@ -6,7 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o
CFLAGS_nfstrace.o += -I$(src) CFLAGS_nfstrace.o += -I$(src)
nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ nfs-y := client.o dir.o file.o getroot.o inode.o super.o \
direct.o pagelist.o read.o symlink.o unlink.o \ io.o direct.o pagelist.o read.o symlink.o unlink.o \
write.o namespace.o mount_clnt.o nfstrace.o write.o namespace.o mount_clnt.o nfstrace.o
nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
nfs-$(CONFIG_SYSCTL) += sysctl.o nfs-$(CONFIG_SYSCTL) += sysctl.o

View File

@ -65,8 +65,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
if (!p) if (!p)
return -EIO; return -EIO;
b->simple.nr_sigs = be32_to_cpup(p++); b->simple.nr_sigs = be32_to_cpup(p++);
if (!b->simple.nr_sigs) { if (!b->simple.nr_sigs || b->simple.nr_sigs > PNFS_BLOCK_MAX_UUIDS) {
dprintk("no signature\n"); dprintk("Bad signature count: %d\n", b->simple.nr_sigs);
return -EIO; return -EIO;
} }
@ -89,7 +89,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
memcpy(&b->simple.sigs[i].sig, p, memcpy(&b->simple.sigs[i].sig, p,
b->simple.sigs[i].sig_len); b->simple.sigs[i].sig_len);
b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len; b->simple.len += 8 + 4 + \
(XDR_QUADLEN(b->simple.sigs[i].sig_len) << 2);
} }
break; break;
case PNFS_BLOCK_VOLUME_SLICE: case PNFS_BLOCK_VOLUME_SLICE:
@ -104,7 +105,12 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
p = xdr_inline_decode(xdr, 4); p = xdr_inline_decode(xdr, 4);
if (!p) if (!p)
return -EIO; return -EIO;
b->concat.volumes_count = be32_to_cpup(p++); b->concat.volumes_count = be32_to_cpup(p++);
if (b->concat.volumes_count > PNFS_BLOCK_MAX_DEVICES) {
dprintk("Too many volumes: %d\n", b->concat.volumes_count);
return -EIO;
}
p = xdr_inline_decode(xdr, b->concat.volumes_count * 4); p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
if (!p) if (!p)
@ -116,8 +122,13 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
p = xdr_inline_decode(xdr, 8 + 4); p = xdr_inline_decode(xdr, 8 + 4);
if (!p) if (!p)
return -EIO; return -EIO;
p = xdr_decode_hyper(p, &b->stripe.chunk_size); p = xdr_decode_hyper(p, &b->stripe.chunk_size);
b->stripe.volumes_count = be32_to_cpup(p++); b->stripe.volumes_count = be32_to_cpup(p++);
if (b->stripe.volumes_count > PNFS_BLOCK_MAX_DEVICES) {
dprintk("Too many volumes: %d\n", b->stripe.volumes_count);
return -EIO;
}
p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4); p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
if (!p) if (!p)
@ -224,18 +235,20 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{ {
struct pnfs_block_volume *v = &volumes[idx]; struct pnfs_block_volume *v = &volumes[idx];
struct block_device *bdev;
dev_t dev; dev_t dev;
dev = bl_resolve_deviceid(server, v, gfp_mask); dev = bl_resolve_deviceid(server, v, gfp_mask);
if (!dev) if (!dev)
return -EIO; return -EIO;
d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL); bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
if (IS_ERR(d->bdev)) { if (IS_ERR(bdev)) {
printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev)); MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
return PTR_ERR(d->bdev); return PTR_ERR(bdev);
} }
d->bdev = bdev;
d->len = i_size_read(d->bdev->bd_inode); d->len = i_size_read(d->bdev->bd_inode);
@ -287,44 +300,71 @@ bl_validate_designator(struct pnfs_block_volume *v)
} }
} }
/*
 * Open the block device behind the generic udev by-id WWN link for volume @v.
 *
 * The path is built from the SCSI designator bytes, printed as hex after
 * "/dev/disk/by-id/wwn-0x". At least on Debian this link always resolves to
 * the dm-multipath device when one exists.
 *
 * Returns ERR_PTR(-ENOMEM) if the path string cannot be allocated; otherwise
 * returns the result of blkdev_get_by_path() unchanged, logging a warning
 * first when that result is an error pointer.
 */
static struct block_device *
bl_open_udev_path(struct pnfs_block_volume *v)
{
	const char *path = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%*phN",
				     v->scsi.designator_len,
				     v->scsi.designator);
	struct block_device *dev;

	if (path == NULL)
		return ERR_PTR(-ENOMEM);

	dev = blkdev_get_by_path(path, FMODE_READ | FMODE_WRITE, NULL);
	if (IS_ERR(dev))
		pr_warn("pNFS: failed to open device %s (%ld)\n",
			path, PTR_ERR(dev));

	/* The path string is only needed for the lookup and the warning. */
	kfree(path);
	return dev;
}
/*
 * Open the block device behind the RH/Fedora specific dm-mpath udev link for
 * volume @v. On those distributions the plain wwn- links only point to the
 * first discovered SCSI device, so the dm-uuid-mpath- link is tried instead.
 *
 * The path is "/dev/disk/by-id/dm-uuid-mpath-" followed by the designator
 * type as a decimal number and the designator bytes printed as hex.
 *
 * Returns ERR_PTR(-ENOMEM) if the path string cannot be allocated; otherwise
 * returns the result of blkdev_get_by_path() unchanged. No message is logged
 * on failure.
 */
static struct block_device *
bl_open_dm_mpath_udev_path(struct pnfs_block_volume *v)
{
	const char *path = kasprintf(GFP_KERNEL,
				     "/dev/disk/by-id/dm-uuid-mpath-%d%*phN",
				     v->scsi.designator_type,
				     v->scsi.designator_len,
				     v->scsi.designator);
	struct block_device *dev;

	if (path == NULL)
		return ERR_PTR(-ENOMEM);

	dev = blkdev_get_by_path(path, FMODE_READ | FMODE_WRITE, NULL);

	/* The path string is only needed for the lookup itself. */
	kfree(path);
	return dev;
}
static int static int
bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{ {
struct pnfs_block_volume *v = &volumes[idx]; struct pnfs_block_volume *v = &volumes[idx];
struct block_device *bdev;
const struct pr_ops *ops; const struct pr_ops *ops;
const char *devname;
int error; int error;
if (!bl_validate_designator(v)) if (!bl_validate_designator(v))
return -EINVAL; return -EINVAL;
switch (v->scsi.designator_len) { bdev = bl_open_dm_mpath_udev_path(v);
case 8: if (IS_ERR(bdev))
devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN", bdev = bl_open_udev_path(v);
v->scsi.designator); if (IS_ERR(bdev))
break; return PTR_ERR(bdev);
case 12: d->bdev = bdev;
devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN",
v->scsi.designator);
break;
case 16:
devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN",
v->scsi.designator);
break;
default:
return -EINVAL;
}
d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL);
if (IS_ERR(d->bdev)) {
pr_warn("pNFS: failed to open device %s (%ld)\n",
devname, PTR_ERR(d->bdev));
kfree(devname);
return PTR_ERR(d->bdev);
}
kfree(devname);
d->len = i_size_read(d->bdev->bd_inode); d->len = i_size_read(d->bdev->bd_inode);
d->map = bl_map_simple; d->map = bl_map_simple;
@ -352,7 +392,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
return 0; return 0;
out_blkdev_put: out_blkdev_put:
blkdev_put(d->bdev, FMODE_READ); blkdev_put(d->bdev, FMODE_READ | FMODE_WRITE);
return error; return error;
} }

View File

@ -121,6 +121,16 @@ ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
return be; return be;
} }
/*
 * Dispose of every extent queued on @head: drop the extent's reference on
 * its device id node, then free the extent itself.
 *
 * Entries are not unlinked from @head before being freed, so @head must not
 * be walked again after this call.
 */
static void __ext_put_deviceids(struct list_head *head)
{
	struct pnfs_block_extent *ext;
	struct pnfs_block_extent *next;

	/* The _safe iterator is required because each entry is freed in turn. */
	list_for_each_entry_safe(ext, next, head, be_list) {
		nfs4_put_deviceid_node(ext->be_device);
		kfree(ext);
	}
}
static void static void
__ext_tree_insert(struct rb_root *root, __ext_tree_insert(struct rb_root *root,
struct pnfs_block_extent *new, bool merge_ok) struct pnfs_block_extent *new, bool merge_ok)
@ -163,7 +173,8 @@ free_new:
} }
static int static int
__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end) __ext_tree_remove(struct rb_root *root,
sector_t start, sector_t end, struct list_head *tmp)
{ {
struct pnfs_block_extent *be; struct pnfs_block_extent *be;
sector_t len1 = 0, len2 = 0; sector_t len1 = 0, len2 = 0;
@ -223,8 +234,7 @@ __ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
struct pnfs_block_extent *next = ext_tree_next(be); struct pnfs_block_extent *next = ext_tree_next(be);
rb_erase(&be->be_node, root); rb_erase(&be->be_node, root);
nfs4_put_deviceid_node(be->be_device); list_add_tail(&be->be_list, tmp);
kfree(be);
be = next; be = next;
} }
@ -350,16 +360,18 @@ int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
sector_t start, sector_t end) sector_t start, sector_t end)
{ {
int err, err2; int err, err2;
LIST_HEAD(tmp);
spin_lock(&bl->bl_ext_lock); spin_lock(&bl->bl_ext_lock);
err = __ext_tree_remove(&bl->bl_ext_ro, start, end); err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
if (rw) { if (rw) {
err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end); err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp);
if (!err) if (!err)
err = err2; err = err2;
} }
spin_unlock(&bl->bl_ext_lock); spin_unlock(&bl->bl_ext_lock);
__ext_put_deviceids(&tmp);
return err; return err;
} }
@ -396,12 +408,13 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
sector_t end = start + len; sector_t end = start + len;
struct pnfs_block_extent *be; struct pnfs_block_extent *be;
int err = 0; int err = 0;
LIST_HEAD(tmp);
spin_lock(&bl->bl_ext_lock); spin_lock(&bl->bl_ext_lock);
/* /*
* First remove all COW extents or holes from written to range. * First remove all COW extents or holes from written to range.
*/ */
err = __ext_tree_remove(&bl->bl_ext_ro, start, end); err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
if (err) if (err)
goto out; goto out;
@ -459,6 +472,8 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
} }
out: out:
spin_unlock(&bl->bl_ext_lock); spin_unlock(&bl->bl_ext_lock);
__ext_put_deviceids(&tmp);
return err; return err;
} }

View File

@ -119,27 +119,30 @@ out:
* hashed by filehandle. * hashed by filehandle.
*/ */
static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
struct nfs_fh *fh, nfs4_stateid *stateid) struct nfs_fh *fh)
{ {
struct nfs_server *server; struct nfs_server *server;
struct nfs_inode *nfsi;
struct inode *ino; struct inode *ino;
struct pnfs_layout_hdr *lo; struct pnfs_layout_hdr *lo;
restart:
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
list_for_each_entry(lo, &server->layouts, plh_layouts) { list_for_each_entry(lo, &server->layouts, plh_layouts) {
if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid)) nfsi = NFS_I(lo->plh_inode);
if (nfs_compare_fh(fh, &nfsi->fh))
continue; continue;
if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh)) if (nfsi->layout != lo)
continue; continue;
ino = igrab(lo->plh_inode); ino = igrab(lo->plh_inode);
if (!ino) if (!ino)
break; break;
spin_lock(&ino->i_lock); spin_lock(&ino->i_lock);
/* Is this layout in the process of being freed? */ /* Is this layout in the process of being freed? */
if (NFS_I(ino)->layout != lo) { if (nfsi->layout != lo) {
spin_unlock(&ino->i_lock); spin_unlock(&ino->i_lock);
iput(ino); iput(ino);
break; goto restart;
} }
pnfs_get_layout_hdr(lo); pnfs_get_layout_hdr(lo);
spin_unlock(&ino->i_lock); spin_unlock(&ino->i_lock);
@ -151,13 +154,13 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
} }
static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
struct nfs_fh *fh, nfs4_stateid *stateid) struct nfs_fh *fh)
{ {
struct pnfs_layout_hdr *lo; struct pnfs_layout_hdr *lo;
spin_lock(&clp->cl_lock); spin_lock(&clp->cl_lock);
rcu_read_lock(); rcu_read_lock();
lo = get_layout_by_fh_locked(clp, fh, stateid); lo = get_layout_by_fh_locked(clp, fh);
rcu_read_unlock(); rcu_read_unlock();
spin_unlock(&clp->cl_lock); spin_unlock(&clp->cl_lock);
@ -167,17 +170,39 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
/* /*
* Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing) * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing)
*/ */
static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo, static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo,
const nfs4_stateid *new) const nfs4_stateid *new)
{ {
u32 oldseq, newseq; u32 oldseq, newseq;
oldseq = be32_to_cpu(lo->plh_stateid.seqid); /* Is the stateid still not initialised? */
newseq = be32_to_cpu(new->seqid); if (!pnfs_layout_is_valid(lo))
return NFS4ERR_DELAY;
/* Mismatched stateid? */
if (!nfs4_stateid_match_other(&lo->plh_stateid, new))
return NFS4ERR_BAD_STATEID;
newseq = be32_to_cpu(new->seqid);
/* Are we already in a layout recall situation? */
if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) &&
lo->plh_return_seq != 0) {
if (newseq < lo->plh_return_seq)
return NFS4ERR_OLD_STATEID;
if (newseq > lo->plh_return_seq)
return NFS4ERR_DELAY;
goto out;
}
/* Check that the stateid matches what we think it should be. */
oldseq = be32_to_cpu(lo->plh_stateid.seqid);
if (newseq > oldseq + 1) if (newseq > oldseq + 1)
return false; return NFS4ERR_DELAY;
return true; /* Crazy server! */
if (newseq <= oldseq)
return NFS4ERR_OLD_STATEID;
out:
return NFS_OK;
} }
static u32 initiate_file_draining(struct nfs_client *clp, static u32 initiate_file_draining(struct nfs_client *clp,
@ -188,7 +213,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
u32 rv = NFS4ERR_NOMATCHING_LAYOUT; u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
LIST_HEAD(free_me_list); LIST_HEAD(free_me_list);
lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid); lo = get_layout_by_fh(clp, &args->cbl_fh);
if (!lo) { if (!lo) {
trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL, trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL,
&args->cbl_stateid, -rv); &args->cbl_stateid, -rv);
@ -196,18 +221,15 @@ static u32 initiate_file_draining(struct nfs_client *clp,
} }
ino = lo->plh_inode; ino = lo->plh_inode;
spin_lock(&ino->i_lock);
if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) {
rv = NFS4ERR_DELAY;
goto unlock;
}
pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
spin_unlock(&ino->i_lock);
pnfs_layoutcommit_inode(ino, false); pnfs_layoutcommit_inode(ino, false);
spin_lock(&ino->i_lock); spin_lock(&ino->i_lock);
rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid);
if (rv != NFS_OK)
goto unlock;
pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
/* /*
* Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return) * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return)
*/ */
@ -223,11 +245,13 @@ static u32 initiate_file_draining(struct nfs_client *clp,
goto unlock; goto unlock;
} }
/* Embrace your forgetfulness! */
rv = NFS4ERR_NOMATCHING_LAYOUT;
if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
&args->cbl_range); &args->cbl_range);
} }
pnfs_mark_layout_returned_if_empty(lo);
unlock: unlock:
spin_unlock(&ino->i_lock); spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&free_me_list); pnfs_free_lseg_list(&free_me_list);

View File

@ -925,7 +925,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
if (hdr_arg.minorversion == 0) { if (hdr_arg.minorversion == 0) {
cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident); cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident);
if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))
return rpc_drop_reply; goto out_invalidcred;
} }
cps.minorversion = hdr_arg.minorversion; cps.minorversion = hdr_arg.minorversion;
@ -953,6 +953,10 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
nfs_put_client(cps.clp); nfs_put_client(cps.clp);
dprintk("%s: done, status = %u\n", __func__, ntohl(status)); dprintk("%s: done, status = %u\n", __func__, ntohl(status));
return rpc_success; return rpc_success;
out_invalidcred:
pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n");
return rpc_autherr_badcred;
} }
/* /*

View File

@ -367,8 +367,6 @@ nfs_found_client(const struct nfs_client_initdata *cl_init,
*/ */
struct nfs_client * struct nfs_client *
nfs_get_client(const struct nfs_client_initdata *cl_init, nfs_get_client(const struct nfs_client_initdata *cl_init,
const struct rpc_timeout *timeparms,
const char *ip_addr,
rpc_authflavor_t authflavour) rpc_authflavor_t authflavour)
{ {
struct nfs_client *clp, *new = NULL; struct nfs_client *clp, *new = NULL;
@ -399,7 +397,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
&nn->nfs_client_list); &nn->nfs_client_list);
spin_unlock(&nn->nfs_client_lock); spin_unlock(&nn->nfs_client_lock);
new->cl_flags = cl_init->init_flags; new->cl_flags = cl_init->init_flags;
return rpc_ops->init_client(new, timeparms, ip_addr); return rpc_ops->init_client(new, cl_init);
} }
spin_unlock(&nn->nfs_client_lock); spin_unlock(&nn->nfs_client_lock);
@ -470,7 +468,7 @@ EXPORT_SYMBOL_GPL(nfs_init_timeout_values);
* Create an RPC client handle * Create an RPC client handle
*/ */
int nfs_create_rpc_client(struct nfs_client *clp, int nfs_create_rpc_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms, const struct nfs_client_initdata *cl_init,
rpc_authflavor_t flavor) rpc_authflavor_t flavor)
{ {
struct rpc_clnt *clnt = NULL; struct rpc_clnt *clnt = NULL;
@ -479,8 +477,9 @@ int nfs_create_rpc_client(struct nfs_client *clp,
.protocol = clp->cl_proto, .protocol = clp->cl_proto,
.address = (struct sockaddr *)&clp->cl_addr, .address = (struct sockaddr *)&clp->cl_addr,
.addrsize = clp->cl_addrlen, .addrsize = clp->cl_addrlen,
.timeout = timeparms, .timeout = cl_init->timeparms,
.servername = clp->cl_hostname, .servername = clp->cl_hostname,
.nodename = cl_init->nodename,
.program = &nfs_program, .program = &nfs_program,
.version = clp->rpc_ops->version, .version = clp->rpc_ops->version,
.authflavor = flavor, .authflavor = flavor,
@ -591,14 +590,12 @@ EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient);
* nfs_init_client - Initialise an NFS2 or NFS3 client * nfs_init_client - Initialise an NFS2 or NFS3 client
* *
* @clp: nfs_client to initialise * @clp: nfs_client to initialise
* @timeparms: timeout parameters for underlying RPC transport * @cl_init: Initialisation parameters
* @ip_addr: IP presentation address (not used)
* *
* Returns pointer to an NFS client, or an ERR_PTR value. * Returns pointer to an NFS client, or an ERR_PTR value.
*/ */
struct nfs_client *nfs_init_client(struct nfs_client *clp, struct nfs_client *nfs_init_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms, const struct nfs_client_initdata *cl_init)
const char *ip_addr)
{ {
int error; int error;
@ -612,7 +609,7 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp,
* Create a client RPC handle for doing FSSTAT with UNIX auth only * Create a client RPC handle for doing FSSTAT with UNIX auth only
* - RFC 2623, sec 2.3.2 * - RFC 2623, sec 2.3.2
*/ */
error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
if (error < 0) if (error < 0)
goto error; goto error;
nfs_mark_client_ready(clp, NFS_CS_READY); nfs_mark_client_ready(clp, NFS_CS_READY);
@ -633,6 +630,7 @@ static int nfs_init_server(struct nfs_server *server,
const struct nfs_parsed_mount_data *data, const struct nfs_parsed_mount_data *data,
struct nfs_subversion *nfs_mod) struct nfs_subversion *nfs_mod)
{ {
struct rpc_timeout timeparms;
struct nfs_client_initdata cl_init = { struct nfs_client_initdata cl_init = {
.hostname = data->nfs_server.hostname, .hostname = data->nfs_server.hostname,
.addr = (const struct sockaddr *)&data->nfs_server.address, .addr = (const struct sockaddr *)&data->nfs_server.address,
@ -640,8 +638,8 @@ static int nfs_init_server(struct nfs_server *server,
.nfs_mod = nfs_mod, .nfs_mod = nfs_mod,
.proto = data->nfs_server.protocol, .proto = data->nfs_server.protocol,
.net = data->net, .net = data->net,
.timeparms = &timeparms,
}; };
struct rpc_timeout timeparms;
struct nfs_client *clp; struct nfs_client *clp;
int error; int error;
@ -653,7 +651,7 @@ static int nfs_init_server(struct nfs_server *server,
set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
/* Allocate or find a client reference we can use */ /* Allocate or find a client reference we can use */
clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX); clp = nfs_get_client(&cl_init, RPC_AUTH_UNIX);
if (IS_ERR(clp)) { if (IS_ERR(clp)) {
dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
return PTR_ERR(clp); return PTR_ERR(clp);

View File

@ -2252,21 +2252,37 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, st
return NULL; return NULL;
} }
static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res, bool may_block)
{ {
struct nfs_inode *nfsi = NFS_I(inode); struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_access_entry *cache; struct nfs_access_entry *cache;
int err = -ENOENT; bool retry = true;
int err;
spin_lock(&inode->i_lock); spin_lock(&inode->i_lock);
if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) for(;;) {
goto out_zap; if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
cache = nfs_access_search_rbtree(inode, cred); goto out_zap;
if (cache == NULL) cache = nfs_access_search_rbtree(inode, cred);
goto out; err = -ENOENT;
if (!nfs_have_delegated_attributes(inode) && if (cache == NULL)
!time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) goto out;
goto out_stale; /* Found an entry, is our attribute cache valid? */
if (!nfs_attribute_cache_expired(inode) &&
!(nfsi->cache_validity & NFS_INO_INVALID_ATTR))
break;
err = -ECHILD;
if (!may_block)
goto out;
if (!retry)
goto out_zap;
spin_unlock(&inode->i_lock);
err = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
if (err)
return err;
spin_lock(&inode->i_lock);
retry = false;
}
res->jiffies = cache->jiffies; res->jiffies = cache->jiffies;
res->cred = cache->cred; res->cred = cache->cred;
res->mask = cache->mask; res->mask = cache->mask;
@ -2275,12 +2291,6 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
out: out:
spin_unlock(&inode->i_lock); spin_unlock(&inode->i_lock);
return err; return err;
out_stale:
rb_erase(&cache->rb_node, &nfsi->access_cache);
list_del(&cache->lru);
spin_unlock(&inode->i_lock);
nfs_access_free_entry(cache);
return -ENOENT;
out_zap: out_zap:
spin_unlock(&inode->i_lock); spin_unlock(&inode->i_lock);
nfs_access_zap_cache(inode); nfs_access_zap_cache(inode);
@ -2307,13 +2317,12 @@ static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred,
cache = NULL; cache = NULL;
if (cache == NULL) if (cache == NULL)
goto out; goto out;
if (!nfs_have_delegated_attributes(inode) && err = nfs_revalidate_inode_rcu(NFS_SERVER(inode), inode);
!time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) if (err)
goto out; goto out;
res->jiffies = cache->jiffies; res->jiffies = cache->jiffies;
res->cred = cache->cred; res->cred = cache->cred;
res->mask = cache->mask; res->mask = cache->mask;
err = 0;
out: out:
rcu_read_unlock(); rcu_read_unlock();
return err; return err;
@ -2402,18 +2411,19 @@ EXPORT_SYMBOL_GPL(nfs_access_set_mask);
static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
{ {
struct nfs_access_entry cache; struct nfs_access_entry cache;
bool may_block = (mask & MAY_NOT_BLOCK) == 0;
int status; int status;
trace_nfs_access_enter(inode); trace_nfs_access_enter(inode);
status = nfs_access_get_cached_rcu(inode, cred, &cache); status = nfs_access_get_cached_rcu(inode, cred, &cache);
if (status != 0) if (status != 0)
status = nfs_access_get_cached(inode, cred, &cache); status = nfs_access_get_cached(inode, cred, &cache, may_block);
if (status == 0) if (status == 0)
goto out_cached; goto out_cached;
status = -ECHILD; status = -ECHILD;
if (mask & MAY_NOT_BLOCK) if (!may_block)
goto out; goto out;
/* Be clever: ask server to check for all possible rights */ /* Be clever: ask server to check for all possible rights */

View File

@ -196,6 +196,12 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
WARN_ON_ONCE(verfp->committed < 0); WARN_ON_ONCE(verfp->committed < 0);
} }
/*
 * Compare two write verifiers.
 *
 * Only the ->verifier member of each struct nfs_writeverf is examined; the
 * other members are ignored. The return value is whatever
 * nfs_write_verifier_cmp() yields for the two verifiers.
 */
static int nfs_direct_cmp_verf(const struct nfs_writeverf *v1,
			       const struct nfs_writeverf *v2)
{
	int diff;

	diff = nfs_write_verifier_cmp(&v1->verifier, &v2->verifier);
	return diff;
}
/* /*
* nfs_direct_cmp_hdr_verf - compare verifier for pgio header * nfs_direct_cmp_hdr_verf - compare verifier for pgio header
* @dreq - direct request possibly spanning multiple servers * @dreq - direct request possibly spanning multiple servers
@ -215,7 +221,7 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
nfs_direct_set_hdr_verf(dreq, hdr); nfs_direct_set_hdr_verf(dreq, hdr);
return 0; return 0;
} }
return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); return nfs_direct_cmp_verf(verfp, &hdr->verf);
} }
/* /*
@ -238,7 +244,7 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
if (verfp->committed < 0) if (verfp->committed < 0)
return 1; return 1;
return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); return nfs_direct_cmp_verf(verfp, &data->verf);
} }
/** /**
@ -366,22 +372,10 @@ out:
* Synchronous I/O uses a stack-allocated iocb. Thus we can't trust * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust
* the iocb is still valid here if this is a synchronous request. * the iocb is still valid here if this is a synchronous request.
*/ */
static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) static void nfs_direct_complete(struct nfs_direct_req *dreq)
{ {
struct inode *inode = dreq->inode; struct inode *inode = dreq->inode;
if (dreq->iocb && write) {
loff_t pos = dreq->iocb->ki_pos + dreq->count;
spin_lock(&inode->i_lock);
if (i_size_read(inode) < pos)
i_size_write(inode, pos);
spin_unlock(&inode->i_lock);
}
if (write)
nfs_zap_mapping(inode, inode->i_mapping);
inode_dio_end(inode); inode_dio_end(inode);
if (dreq->iocb) { if (dreq->iocb) {
@ -436,7 +430,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
} }
out_put: out_put:
if (put_dreq(dreq)) if (put_dreq(dreq))
nfs_direct_complete(dreq, false); nfs_direct_complete(dreq);
hdr->release(hdr); hdr->release(hdr);
} }
@ -542,7 +536,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
} }
if (put_dreq(dreq)) if (put_dreq(dreq))
nfs_direct_complete(dreq, false); nfs_direct_complete(dreq);
return 0; return 0;
} }
@ -583,17 +577,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
if (!count) if (!count)
goto out; goto out;
inode_lock(inode);
result = nfs_sync_mapping(mapping);
if (result)
goto out_unlock;
task_io_account_read(count); task_io_account_read(count);
result = -ENOMEM; result = -ENOMEM;
dreq = nfs_direct_req_alloc(); dreq = nfs_direct_req_alloc();
if (dreq == NULL) if (dreq == NULL)
goto out_unlock; goto out;
dreq->inode = inode; dreq->inode = inode;
dreq->bytes_left = dreq->max_count = count; dreq->bytes_left = dreq->max_count = count;
@ -608,10 +597,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
if (!is_sync_kiocb(iocb)) if (!is_sync_kiocb(iocb))
dreq->iocb = iocb; dreq->iocb = iocb;
nfs_start_io_direct(inode);
NFS_I(inode)->read_io += count; NFS_I(inode)->read_io += count;
result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
inode_unlock(inode); nfs_end_io_direct(inode);
if (!result) { if (!result) {
result = nfs_direct_wait(dreq); result = nfs_direct_wait(dreq);
@ -619,13 +610,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
iocb->ki_pos += result; iocb->ki_pos += result;
} }
nfs_direct_req_release(dreq);
return result;
out_release: out_release:
nfs_direct_req_release(dreq); nfs_direct_req_release(dreq);
out_unlock:
inode_unlock(inode);
out: out:
return result; return result;
} }
@ -657,6 +643,8 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
dreq->count = 0; dreq->count = 0;
dreq->verf.committed = NFS_INVALID_STABLE_HOW;
nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
for (i = 0; i < dreq->mirror_count; i++) for (i = 0; i < dreq->mirror_count; i++)
dreq->mirrors[i].count = 0; dreq->mirrors[i].count = 0;
get_dreq(dreq); get_dreq(dreq);
@ -775,7 +763,8 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
nfs_direct_write_reschedule(dreq); nfs_direct_write_reschedule(dreq);
break; break;
default: default:
nfs_direct_complete(dreq, true); nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
nfs_direct_complete(dreq);
} }
} }
@ -991,6 +980,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
{ {
ssize_t result = -EINVAL; ssize_t result = -EINVAL;
size_t count;
struct file *file = iocb->ki_filp; struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping; struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host; struct inode *inode = mapping->host;
@ -1001,34 +991,24 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
file, iov_iter_count(iter), (long long) iocb->ki_pos); file, iov_iter_count(iter), (long long) iocb->ki_pos);
nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, result = generic_write_checks(iocb, iter);
iov_iter_count(iter)); if (result <= 0)
return result;
count = result;
nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
pos = iocb->ki_pos; pos = iocb->ki_pos;
end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT; end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT;
inode_lock(inode); task_io_account_write(count);
result = nfs_sync_mapping(mapping);
if (result)
goto out_unlock;
if (mapping->nrpages) {
result = invalidate_inode_pages2_range(mapping,
pos >> PAGE_SHIFT, end);
if (result)
goto out_unlock;
}
task_io_account_write(iov_iter_count(iter));
result = -ENOMEM; result = -ENOMEM;
dreq = nfs_direct_req_alloc(); dreq = nfs_direct_req_alloc();
if (!dreq) if (!dreq)
goto out_unlock; goto out;
dreq->inode = inode; dreq->inode = inode;
dreq->bytes_left = dreq->max_count = iov_iter_count(iter); dreq->bytes_left = dreq->max_count = count;
dreq->io_start = pos; dreq->io_start = pos;
dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
l_ctx = nfs_get_lock_context(dreq->ctx); l_ctx = nfs_get_lock_context(dreq->ctx);
@ -1040,6 +1020,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
if (!is_sync_kiocb(iocb)) if (!is_sync_kiocb(iocb))
dreq->iocb = iocb; dreq->iocb = iocb;
nfs_start_io_direct(inode);
result = nfs_direct_write_schedule_iovec(dreq, iter, pos); result = nfs_direct_write_schedule_iovec(dreq, iter, pos);
if (mapping->nrpages) { if (mapping->nrpages) {
@ -1047,30 +1029,19 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
pos >> PAGE_SHIFT, end); pos >> PAGE_SHIFT, end);
} }
inode_unlock(inode); nfs_end_io_direct(inode);
if (!result) { if (!result) {
result = nfs_direct_wait(dreq); result = nfs_direct_wait(dreq);
if (result > 0) { if (result > 0) {
struct inode *inode = mapping->host;
iocb->ki_pos = pos + result; iocb->ki_pos = pos + result;
spin_lock(&inode->i_lock);
if (i_size_read(inode) < iocb->ki_pos)
i_size_write(inode, iocb->ki_pos);
spin_unlock(&inode->i_lock);
/* XXX: should check the generic_write_sync retval */ /* XXX: should check the generic_write_sync retval */
generic_write_sync(iocb, result); generic_write_sync(iocb, result);
} }
} }
nfs_direct_req_release(dreq);
return result;
out_release: out_release:
nfs_direct_req_release(dreq); nfs_direct_req_release(dreq);
out_unlock: out:
inode_unlock(inode);
return result; return result;
} }

View File

@ -170,12 +170,14 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
iocb->ki_filp, iocb->ki_filp,
iov_iter_count(to), (unsigned long) iocb->ki_pos); iov_iter_count(to), (unsigned long) iocb->ki_pos);
result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping); nfs_start_io_read(inode);
result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
if (!result) { if (!result) {
result = generic_file_read_iter(iocb, to); result = generic_file_read_iter(iocb, to);
if (result > 0) if (result > 0)
nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
} }
nfs_end_io_read(inode);
return result; return result;
} }
EXPORT_SYMBOL_GPL(nfs_file_read); EXPORT_SYMBOL_GPL(nfs_file_read);
@ -191,12 +193,14 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n",
filp, (unsigned long) count, (unsigned long long) *ppos); filp, (unsigned long) count, (unsigned long long) *ppos);
res = nfs_revalidate_mapping_protected(inode, filp->f_mapping); nfs_start_io_read(inode);
res = nfs_revalidate_mapping(inode, filp->f_mapping);
if (!res) { if (!res) {
res = generic_file_splice_read(filp, ppos, pipe, count, flags); res = generic_file_splice_read(filp, ppos, pipe, count, flags);
if (res > 0) if (res > 0)
nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res);
} }
nfs_end_io_read(inode);
return res; return res;
} }
EXPORT_SYMBOL_GPL(nfs_file_splice_read); EXPORT_SYMBOL_GPL(nfs_file_splice_read);
@ -272,16 +276,13 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
trace_nfs_fsync_enter(inode); trace_nfs_fsync_enter(inode);
inode_dio_wait(inode);
do { do {
ret = filemap_write_and_wait_range(inode->i_mapping, start, end); ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret != 0) if (ret != 0)
break; break;
inode_lock(inode);
ret = nfs_file_fsync_commit(file, start, end, datasync); ret = nfs_file_fsync_commit(file, start, end, datasync);
if (!ret) if (!ret)
ret = pnfs_sync_inode(inode, !!datasync); ret = pnfs_sync_inode(inode, !!datasync);
inode_unlock(inode);
/* /*
* If nfs_file_fsync_commit detected a server reboot, then * If nfs_file_fsync_commit detected a server reboot, then
* resend all dirty pages that might have been covered by * resend all dirty pages that might have been covered by
@ -359,19 +360,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
file, mapping->host->i_ino, len, (long long) pos); file, mapping->host->i_ino, len, (long long) pos);
start: start:
/*
* Prevent starvation issues if someone is doing a consistency
* sync-to-disk
*/
ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
nfs_wait_bit_killable, TASK_KILLABLE);
if (ret)
return ret;
/*
* Wait for O_DIRECT to complete
*/
inode_dio_wait(mapping->host);
page = grab_cache_page_write_begin(mapping, index, flags); page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) if (!page)
return -ENOMEM; return -ENOMEM;
@ -432,7 +420,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
return status; return status;
NFS_I(mapping->host)->write_io += copied; NFS_I(mapping->host)->write_io += copied;
if (nfs_ctx_key_to_expire(ctx)) { if (nfs_ctx_key_to_expire(ctx, mapping->host)) {
status = nfs_wb_all(mapping->host); status = nfs_wb_all(mapping->host);
if (status < 0) if (status < 0)
return status; return status;
@ -470,31 +458,8 @@ static void nfs_invalidate_page(struct page *page, unsigned int offset,
*/ */
static int nfs_release_page(struct page *page, gfp_t gfp) static int nfs_release_page(struct page *page, gfp_t gfp)
{ {
struct address_space *mapping = page->mapping;
dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
/* Always try to initiate a 'commit' if relevant, but only
* wait for it if the caller allows blocking. Even then,
* only wait 1 second and only if the 'bdi' is not congested.
* Waiting indefinitely can cause deadlocks when the NFS
* server is on this machine, when a new TCP connection is
* needed and in other rare cases. There is no particular
* need to wait extensively here. A short wait has the
* benefit that someone else can worry about the freezer.
*/
if (mapping) {
struct nfs_server *nfss = NFS_SERVER(mapping->host);
nfs_commit_inode(mapping->host, 0);
if (gfpflags_allow_blocking(gfp) &&
!bdi_write_congested(&nfss->backing_dev_info)) {
wait_on_page_bit_killable_timeout(page, PG_private,
HZ);
if (PagePrivate(page))
set_bdi_congested(&nfss->backing_dev_info,
BLK_RW_ASYNC);
}
}
/* If PagePrivate() is set, then the page is not freeable */ /* If PagePrivate() is set, then the page is not freeable */
if (PagePrivate(page)) if (PagePrivate(page))
return 0; return 0;
@ -604,6 +569,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
filp, filp->f_mapping->host->i_ino, filp, filp->f_mapping->host->i_ino,
(long long)page_offset(page)); (long long)page_offset(page));
sb_start_pagefault(inode->i_sb);
/* make sure the cache has finished storing the page */ /* make sure the cache has finished storing the page */
nfs_fscache_wait_on_page_write(NFS_I(inode), page); nfs_fscache_wait_on_page_write(NFS_I(inode), page);
@ -630,6 +597,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
out_unlock: out_unlock:
unlock_page(page); unlock_page(page);
out: out:
sb_end_pagefault(inode->i_sb);
return ret; return ret;
} }
@ -645,7 +613,7 @@ static int nfs_need_check_write(struct file *filp, struct inode *inode)
ctx = nfs_file_open_context(filp); ctx = nfs_file_open_context(filp);
if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) || if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) ||
nfs_ctx_key_to_expire(ctx)) nfs_ctx_key_to_expire(ctx, inode))
return 1; return 1;
return 0; return 0;
} }
@ -656,23 +624,17 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(file); struct inode *inode = file_inode(file);
unsigned long written = 0; unsigned long written = 0;
ssize_t result; ssize_t result;
size_t count = iov_iter_count(from);
result = nfs_key_timeout_notify(file, inode); result = nfs_key_timeout_notify(file, inode);
if (result) if (result)
return result; return result;
if (iocb->ki_flags & IOCB_DIRECT) { if (iocb->ki_flags & IOCB_DIRECT)
result = generic_write_checks(iocb, from);
if (result <= 0)
return result;
return nfs_file_direct_write(iocb, from); return nfs_file_direct_write(iocb, from);
}
dprintk("NFS: write(%pD2, %zu@%Ld)\n", dprintk("NFS: write(%pD2, %zu@%Ld)\n",
file, count, (long long) iocb->ki_pos); file, iov_iter_count(from), (long long) iocb->ki_pos);
result = -EBUSY;
if (IS_SWAPFILE(inode)) if (IS_SWAPFILE(inode))
goto out_swapfile; goto out_swapfile;
/* /*
@ -684,28 +646,33 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
goto out; goto out;
} }
result = count; nfs_start_io_write(inode);
if (!count) result = generic_write_checks(iocb, from);
if (result > 0) {
current->backing_dev_info = inode_to_bdi(inode);
result = generic_perform_write(file, from, iocb->ki_pos);
current->backing_dev_info = NULL;
}
nfs_end_io_write(inode);
if (result <= 0)
goto out; goto out;
result = generic_file_write_iter(iocb, from); written = generic_write_sync(iocb, result);
if (result > 0) iocb->ki_pos += written;
written = result;
/* Return error values */ /* Return error values */
if (result >= 0 && nfs_need_check_write(file, inode)) { if (nfs_need_check_write(file, inode)) {
int err = vfs_fsync(file, 0); int err = vfs_fsync(file, 0);
if (err < 0) if (err < 0)
result = err; result = err;
} }
if (result > 0) nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
out: out:
return result; return result;
out_swapfile: out_swapfile:
printk(KERN_INFO "NFS: attempt to write to active swap file!\n"); printk(KERN_INFO "NFS: attempt to write to active swap file!\n");
goto out; return -EBUSY;
} }
EXPORT_SYMBOL_GPL(nfs_file_write); EXPORT_SYMBOL_GPL(nfs_file_write);
@ -779,11 +746,6 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
return status; return status;
} }
static int
is_time_granular(struct timespec *ts) {
return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000));
}
static int static int
do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
{ {
@ -817,12 +779,8 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
* This makes locking act as a cache coherency point. * This makes locking act as a cache coherency point.
*/ */
nfs_sync_mapping(filp->f_mapping); nfs_sync_mapping(filp->f_mapping);
if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) { if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
if (is_time_granular(&NFS_SERVER(inode)->time_delta)) nfs_zap_mapping(inode, filp->f_mapping);
__nfs_revalidate_inode(NFS_SERVER(inode), inode);
else
nfs_zap_caches(inode);
}
out: out:
return status; return status;
} }

View File

@ -255,13 +255,16 @@ static int filelayout_read_done_cb(struct rpc_task *task,
static void static void
filelayout_set_layoutcommit(struct nfs_pgio_header *hdr) filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
{ {
loff_t end_offs = 0;
if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
hdr->res.verf->committed != NFS_DATA_SYNC) hdr->res.verf->committed == NFS_FILE_SYNC)
return; return;
if (hdr->res.verf->committed == NFS_DATA_SYNC)
end_offs = hdr->mds_offset + (loff_t)hdr->res.count;
pnfs_set_layoutcommit(hdr->inode, hdr->lseg, /* Note: if the write is unstable, don't set end_offs until commit */
hdr->mds_offset + hdr->res.count); pnfs_set_layoutcommit(hdr->inode, hdr->lseg, end_offs);
dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
} }
@ -354,6 +357,12 @@ static int filelayout_write_done_cb(struct rpc_task *task,
} }
filelayout_set_layoutcommit(hdr); filelayout_set_layoutcommit(hdr);
/* zero out the fattr */
hdr->fattr.valid = 0;
if (task->tk_status >= 0)
nfs_writeback_update_inode(hdr);
return 0; return 0;
} }
@ -375,8 +384,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
return -EAGAIN; return -EAGAIN;
} }
if (data->verf.committed == NFS_UNSTABLE) pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
return 0; return 0;
} }

View File

@ -1325,15 +1325,16 @@ ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg)
* we always send layoutcommit after DS writes. * we always send layoutcommit after DS writes.
*/ */
static void static void
ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) ff_layout_set_layoutcommit(struct inode *inode,
struct pnfs_layout_segment *lseg,
loff_t end_offset)
{ {
if (!ff_layout_need_layoutcommit(hdr->lseg)) if (!ff_layout_need_layoutcommit(lseg))
return; return;
pnfs_set_layoutcommit(hdr->inode, hdr->lseg, pnfs_set_layoutcommit(inode, lseg, end_offset);
hdr->mds_offset + hdr->res.count); dprintk("%s inode %lu pls_end_pos %llu\n", __func__, inode->i_ino,
dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, (unsigned long long) NFS_I(inode)->layout->plh_lwb);
(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
} }
static bool static bool
@ -1469,6 +1470,7 @@ static void ff_layout_read_release(void *data)
static int ff_layout_write_done_cb(struct rpc_task *task, static int ff_layout_write_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr) struct nfs_pgio_header *hdr)
{ {
loff_t end_offs = 0;
int err; int err;
trace_nfs4_pnfs_write(hdr, task->tk_status); trace_nfs4_pnfs_write(hdr, task->tk_status);
@ -1494,7 +1496,10 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
if (hdr->res.verf->committed == NFS_FILE_SYNC || if (hdr->res.verf->committed == NFS_FILE_SYNC ||
hdr->res.verf->committed == NFS_DATA_SYNC) hdr->res.verf->committed == NFS_DATA_SYNC)
ff_layout_set_layoutcommit(hdr); end_offs = hdr->mds_offset + (loff_t)hdr->res.count;
/* Note: if the write is unstable, don't set end_offs until commit */
ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, end_offs);
/* zero out fattr since we don't care DS attr at all */ /* zero out fattr since we don't care DS attr at all */
hdr->fattr.valid = 0; hdr->fattr.valid = 0;
@ -1530,9 +1535,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
return -EAGAIN; return -EAGAIN;
} }
if (data->verf.committed == NFS_UNSTABLE ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb);
&& ff_layout_need_layoutcommit(data->lseg))
pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
return 0; return 0;
} }

View File

@ -662,9 +662,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
trace_nfs_getattr_enter(inode); trace_nfs_getattr_enter(inode);
/* Flush out writes to the server in order to update c/mtime. */ /* Flush out writes to the server in order to update c/mtime. */
if (S_ISREG(inode->i_mode)) { if (S_ISREG(inode->i_mode)) {
inode_lock(inode); err = filemap_write_and_wait(inode->i_mapping);
err = nfs_sync_inode(inode);
inode_unlock(inode);
if (err) if (err)
goto out; goto out;
} }
@ -879,7 +877,10 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
struct nfs_inode *nfsi = NFS_I(inode); struct nfs_inode *nfsi = NFS_I(inode);
spin_lock(&inode->i_lock); spin_lock(&inode->i_lock);
list_add(&ctx->list, &nfsi->open_files); if (ctx->mode & FMODE_WRITE)
list_add(&ctx->list, &nfsi->open_files);
else
list_add_tail(&ctx->list, &nfsi->open_files);
spin_unlock(&inode->i_lock); spin_unlock(&inode->i_lock);
} }
EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
@ -972,6 +973,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
if (NFS_STALE(inode)) if (NFS_STALE(inode))
goto out; goto out;
/* pNFS: Attributes aren't updated until we layoutcommit */
if (S_ISREG(inode->i_mode)) {
status = pnfs_sync_inode(inode, false);
if (status)
goto out;
}
status = -ENOMEM; status = -ENOMEM;
fattr = nfs_alloc_fattr(); fattr = nfs_alloc_fattr();
if (fattr == NULL) if (fattr == NULL)
@ -1122,14 +1130,12 @@ out:
} }
/** /**
* __nfs_revalidate_mapping - Revalidate the pagecache * nfs_revalidate_mapping - Revalidate the pagecache
* @inode - pointer to host inode * @inode - pointer to host inode
* @mapping - pointer to mapping * @mapping - pointer to mapping
* @may_lock - take inode->i_mutex?
*/ */
static int __nfs_revalidate_mapping(struct inode *inode, int nfs_revalidate_mapping(struct inode *inode,
struct address_space *mapping, struct address_space *mapping)
bool may_lock)
{ {
struct nfs_inode *nfsi = NFS_I(inode); struct nfs_inode *nfsi = NFS_I(inode);
unsigned long *bitlock = &nfsi->flags; unsigned long *bitlock = &nfsi->flags;
@ -1178,12 +1184,7 @@ static int __nfs_revalidate_mapping(struct inode *inode,
nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
spin_unlock(&inode->i_lock); spin_unlock(&inode->i_lock);
trace_nfs_invalidate_mapping_enter(inode); trace_nfs_invalidate_mapping_enter(inode);
if (may_lock) { ret = nfs_invalidate_mapping(inode, mapping);
inode_lock(inode);
ret = nfs_invalidate_mapping(inode, mapping);
inode_unlock(inode);
} else
ret = nfs_invalidate_mapping(inode, mapping);
trace_nfs_invalidate_mapping_exit(inode, ret); trace_nfs_invalidate_mapping_exit(inode, ret);
clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
@ -1193,27 +1194,28 @@ out:
return ret; return ret;
} }
/** static bool nfs_file_has_writers(struct nfs_inode *nfsi)
* nfs_revalidate_mapping - Revalidate the pagecache
* @inode - pointer to host inode
* @mapping - pointer to mapping
*/
int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
{ {
return __nfs_revalidate_mapping(inode, mapping, false); struct inode *inode = &nfsi->vfs_inode;
assert_spin_locked(&inode->i_lock);
if (!S_ISREG(inode->i_mode))
return false;
if (list_empty(&nfsi->open_files))
return false;
/* Note: This relies on nfsi->open_files being ordered with writers
* being placed at the head of the list.
* See nfs_inode_attach_open_context()
*/
return (list_first_entry(&nfsi->open_files,
struct nfs_open_context,
list)->mode & FMODE_WRITE) == FMODE_WRITE;
} }
/** static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi)
* nfs_revalidate_mapping_protected - Revalidate the pagecache
* @inode - pointer to host inode
* @mapping - pointer to mapping
*
* Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex
* while invalidating the mapping.
*/
int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping)
{ {
return __nfs_revalidate_mapping(inode, mapping, true); return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi);
} }
static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
@ -1280,22 +1282,24 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
return -EIO; return -EIO;
if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && if (!nfs_file_has_buffered_writers(nfsi)) {
inode->i_version != fattr->change_attr) /* Verify a few of the more important attributes */
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode->i_version != fattr->change_attr)
invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE;
/* Verify a few of the more important attributes */ if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) invalid |= NFS_INO_INVALID_ATTR;
invalid |= NFS_INO_INVALID_ATTR;
if (fattr->valid & NFS_ATTR_FATTR_SIZE) { if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&inode->i_ctime, &fattr->ctime))
cur_size = i_size_read(inode); invalid |= NFS_INO_INVALID_ATTR;
new_isize = nfs_size_to_loff_t(fattr->size);
if (cur_size != new_isize) if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; cur_size = i_size_read(inode);
new_isize = nfs_size_to_loff_t(fattr->size);
if (cur_size != new_isize)
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
}
} }
if (nfsi->nrequests != 0)
invalid &= ~NFS_INO_REVAL_PAGECACHE;
/* Have any file permissions changed? */ /* Have any file permissions changed? */
if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
@ -1470,28 +1474,12 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
} }
/*
* Don't trust the change_attribute, mtime, ctime or size if
* a pnfs LAYOUTCOMMIT is outstanding
*/
static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode,
struct nfs_fattr *fattr)
{
if (pnfs_layoutcommit_outstanding(inode))
fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE |
NFS_ATTR_FATTR_MTIME |
NFS_ATTR_FATTR_CTIME |
NFS_ATTR_FATTR_SIZE);
}
static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
{ {
int ret; int ret;
trace_nfs_refresh_inode_enter(inode); trace_nfs_refresh_inode_enter(inode);
nfs_inode_attrs_handle_layoutcommit(inode, fattr);
if (nfs_inode_attrs_need_update(inode, fattr)) if (nfs_inode_attrs_need_update(inode, fattr))
ret = nfs_update_inode(inode, fattr); ret = nfs_update_inode(inode, fattr);
else else
@ -1527,7 +1515,7 @@ EXPORT_SYMBOL_GPL(nfs_refresh_inode);
static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
{ {
unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; unsigned long invalid = NFS_INO_INVALID_ATTR;
/* /*
* Don't revalidate the pagecache if we hold a delegation, but do * Don't revalidate the pagecache if we hold a delegation, but do
@ -1676,6 +1664,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
unsigned long invalid = 0; unsigned long invalid = 0;
unsigned long now = jiffies; unsigned long now = jiffies;
unsigned long save_cache_validity; unsigned long save_cache_validity;
bool have_writers = nfs_file_has_buffered_writers(nfsi);
bool cache_revalidated = true; bool cache_revalidated = true;
dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",
@ -1725,17 +1714,25 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/* Do atomic weak cache consistency updates */ /* Do atomic weak cache consistency updates */
invalid |= nfs_wcc_update_inode(inode, fattr); invalid |= nfs_wcc_update_inode(inode, fattr);
if (pnfs_layoutcommit_outstanding(inode)) {
nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATTR;
cache_revalidated = false;
}
/* More cache consistency checks */ /* More cache consistency checks */
if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
if (inode->i_version != fattr->change_attr) { if (inode->i_version != fattr->change_attr) {
dprintk("NFS: change_attr change on server for file %s/%ld\n", dprintk("NFS: change_attr change on server for file %s/%ld\n",
inode->i_sb->s_id, inode->i_ino); inode->i_sb->s_id, inode->i_ino);
invalid |= NFS_INO_INVALID_ATTR /* Could it be a race with writeback? */
| NFS_INO_INVALID_DATA if (!have_writers) {
| NFS_INO_INVALID_ACCESS invalid |= NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACL; | NFS_INO_INVALID_DATA
if (S_ISDIR(inode->i_mode)) | NFS_INO_INVALID_ACCESS
nfs_force_lookup_revalidate(inode); | NFS_INO_INVALID_ACL;
if (S_ISDIR(inode->i_mode))
nfs_force_lookup_revalidate(inode);
}
inode->i_version = fattr->change_attr; inode->i_version = fattr->change_attr;
} }
} else { } else {
@ -1768,9 +1765,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
if (new_isize != cur_isize) { if (new_isize != cur_isize) {
/* Do we perhaps have any outstanding writes, or has /* Do we perhaps have any outstanding writes, or has
* the file grown beyond our last write? */ * the file grown beyond our last write? */
if ((nfsi->nrequests == 0) || new_isize > cur_isize) { if (nfsi->nrequests == 0 || new_isize > cur_isize) {
i_size_write(inode, new_isize); i_size_write(inode, new_isize);
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; if (!have_writers)
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
} }
dprintk("NFS: isize change on server for file %s/%ld " dprintk("NFS: isize change on server for file %s/%ld "
"(%Ld to %Ld)\n", "(%Ld to %Ld)\n",

View File

@ -66,13 +66,16 @@ struct nfs_clone_mount {
struct nfs_client_initdata { struct nfs_client_initdata {
unsigned long init_flags; unsigned long init_flags;
const char *hostname; const char *hostname; /* Hostname of the server */
const struct sockaddr *addr; const struct sockaddr *addr; /* Address of the server */
const char *nodename; /* Hostname of the client */
const char *ip_addr; /* IP address of the client */
size_t addrlen; size_t addrlen;
struct nfs_subversion *nfs_mod; struct nfs_subversion *nfs_mod;
int proto; int proto;
u32 minorversion; u32 minorversion;
struct net *net; struct net *net;
const struct rpc_timeout *timeparms;
}; };
/* /*
@ -147,9 +150,8 @@ extern void nfs_umount(const struct nfs_mount_request *info);
extern const struct rpc_program nfs_program; extern const struct rpc_program nfs_program;
extern void nfs_clients_init(struct net *net); extern void nfs_clients_init(struct net *net);
extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *);
int nfs_create_rpc_client(struct nfs_client *, const struct rpc_timeout *, rpc_authflavor_t); int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t);
struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, struct nfs_client *nfs_get_client(const struct nfs_client_initdata *,
const struct rpc_timeout *, const char *,
rpc_authflavor_t); rpc_authflavor_t);
int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);
void nfs_server_insert_lists(struct nfs_server *); void nfs_server_insert_lists(struct nfs_server *);
@ -184,7 +186,7 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *,
rpc_authflavor_t); rpc_authflavor_t);
extern int nfs_wait_client_init_complete(const struct nfs_client *clp); extern int nfs_wait_client_init_complete(const struct nfs_client *clp);
extern void nfs_mark_client_ready(struct nfs_client *clp, int state); extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
const struct sockaddr *ds_addr, const struct sockaddr *ds_addr,
int ds_addrlen, int ds_proto, int ds_addrlen, int ds_proto,
unsigned int ds_timeo, unsigned int ds_timeo,
@ -193,7 +195,7 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
rpc_authflavor_t au_flavor); rpc_authflavor_t au_flavor);
extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
struct inode *); struct inode *);
extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
const struct sockaddr *ds_addr, int ds_addrlen, const struct sockaddr *ds_addr, int ds_addrlen,
int ds_proto, unsigned int ds_timeo, int ds_proto, unsigned int ds_timeo,
unsigned int ds_retrans, rpc_authflavor_t au_flavor); unsigned int ds_retrans, rpc_authflavor_t au_flavor);
@ -338,8 +340,7 @@ nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src)
/* proc.c */ /* proc.c */
void nfs_close_context(struct nfs_open_context *ctx, int is_sync); void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
extern struct nfs_client *nfs_init_client(struct nfs_client *clp, extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms, const struct nfs_client_initdata *);
const char *ip_addr);
/* dir.c */ /* dir.c */
extern void nfs_force_use_readdirplus(struct inode *dir); extern void nfs_force_use_readdirplus(struct inode *dir);
@ -411,6 +412,19 @@ extern void __exit unregister_nfs_fs(void);
extern bool nfs_sb_active(struct super_block *sb); extern bool nfs_sb_active(struct super_block *sb);
extern void nfs_sb_deactive(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb);
/* io.c */
extern void nfs_start_io_read(struct inode *inode);
extern void nfs_end_io_read(struct inode *inode);
extern void nfs_start_io_write(struct inode *inode);
extern void nfs_end_io_write(struct inode *inode);
extern void nfs_start_io_direct(struct inode *inode);
extern void nfs_end_io_direct(struct inode *inode);
/*
 * Report whether the NFS_INO_ODIRECT bit is clear in nfsi->flags,
 * i.e. the inode is not currently flagged for O_DIRECT I/O.
 */
static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi)
{
	return !test_bit(NFS_INO_ODIRECT, &nfsi->flags);
}
/* namespace.c */ /* namespace.c */
#define NFS_PATH_CANONICAL 1 #define NFS_PATH_CANONICAL 1
extern char *nfs_path(char **p, struct dentry *dentry, extern char *nfs_path(char **p, struct dentry *dentry,
@ -496,9 +510,29 @@ void nfs_init_cinfo(struct nfs_commit_info *cinfo,
struct inode *inode, struct inode *inode,
struct nfs_direct_req *dreq); struct nfs_direct_req *dreq);
int nfs_key_timeout_notify(struct file *filp, struct inode *inode); int nfs_key_timeout_notify(struct file *filp, struct inode *inode);
bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx); bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode);
void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio); void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio);
int nfs_filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend);
#ifdef CONFIG_NFS_V4_1
/*
 * Walk every bucket of @cinfo (cinfo->nbuckets entries) and reset its
 * direct_verf.committed field to NFS_INVALID_STABLE_HOW.
 */
static inline
void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
{
	int idx = 0;

	while (idx < cinfo->nbuckets) {
		cinfo->buckets[idx].direct_verf.committed = NFS_INVALID_STABLE_HOW;
		idx++;
	}
}
#else
/* Without CONFIG_NFS_V4_1 this helper is a no-op. */
static inline
void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
{
}
#endif
#ifdef CONFIG_MIGRATION #ifdef CONFIG_MIGRATION
extern int nfs_migrate_page(struct address_space *, extern int nfs_migrate_page(struct address_space *,
struct page *, struct page *, enum migrate_mode); struct page *, struct page *, enum migrate_mode);
@ -506,6 +540,13 @@ extern int nfs_migrate_page(struct address_space *,
#define nfs_migrate_page NULL #define nfs_migrate_page NULL
#endif #endif
/*
 * Compare the data of two write verifiers byte-wise using memcmp().
 * Returns 0 when v1->data and v2->data are identical, and a non-zero
 * value (with memcmp() ordering semantics) otherwise.
 */
static inline int
nfs_write_verifier_cmp(const struct nfs_write_verifier *v1,
const struct nfs_write_verifier *v2)
{
return memcmp(v1->data, v2->data, sizeof(v1->data));
}
/* unlink.c */ /* unlink.c */
extern struct rpc_task * extern struct rpc_task *
nfs_async_rename(struct inode *old_dir, struct inode *new_dir, nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
@ -521,8 +562,7 @@ extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
/* nfs4proc.c */ /* nfs4proc.c */
extern void __nfs4_read_done_cb(struct nfs_pgio_header *); extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms, const struct nfs_client_initdata *);
const char *ip_addr);
extern int nfs40_walk_client_list(struct nfs_client *clp, extern int nfs40_walk_client_list(struct nfs_client *clp,
struct nfs_client **result, struct nfs_client **result,
struct rpc_cred *cred); struct rpc_cred *cred);

147
fs/nfs/io.c 100644
View File

@ -0,0 +1,147 @@
/*
* Copyright (c) 2016 Trond Myklebust
*
* I/O and data path helper functionality.
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/rwsem.h>
#include <linux/fs.h>
#include <linux/nfs_fs.h>
#include "internal.h"
/*
 * If the inode is flagged NFS_INO_ODIRECT, clear the flag and then call
 * inode_dio_wait() on it. Does nothing when the flag is already clear.
 *
 * Call with exclusively locked inode->i_rwsem.
 */
static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode)
{
	if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags))
		return;

	clear_bit(NFS_INO_ODIRECT, &nfsi->flags);
	inode_dio_wait(inode);
}
/**
 * nfs_start_io_read - declare the file is being used for buffered reads
 * @inode: file inode
 *
 * Announce that a buffered read is about to begin and make sure that
 * direct I/O is blocked for its duration.
 *
 * On return, NFS_INO_ODIRECT is clear and the caller holds a shared lock
 * on inode->i_rwsem, which prevents the flag from being changed.
 * Buffered readers can therefore run in parallel (they only need the
 * shared lock), while direct I/O has to obtain the exclusive lock before
 * it can set NFS_INO_ODIRECT.
 *
 * Buffered writes and truncates take inode->i_rwsem for writing, so they
 * are serialised with respect to these reads.
 */
void
nfs_start_io_read(struct inode *inode)
{
	struct nfs_inode *nfsi = NFS_I(inode);

	/* Optimistic attempt: the shared lock suffices if the flag is clear. */
	down_read(&inode->i_rwsem);
	if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0) {
		/*
		 * Slow path: swap to the exclusive lock so the flag can be
		 * cleared, then downgrade back to a shared lock.
		 */
		up_read(&inode->i_rwsem);
		down_write(&inode->i_rwsem);
		nfs_block_o_direct(nfsi, inode);
		downgrade_write(&inode->i_rwsem);
	}
}
/**
 * nfs_end_io_read - declare that the buffered read operation is done
 * @inode: file inode
 *
 * Declare that a buffered read operation is done, and release the shared
 * lock on inode->i_rwsem (the lock that nfs_start_io_read() returns with).
 */
void
nfs_end_io_read(struct inode *inode)
{
up_read(&inode->i_rwsem);
}
/**
 * nfs_start_io_write - declare the file is being used for buffered writes
 * @inode: file inode
 *
 * Declare that a buffered write operation is about to start, and ensure
 * that we block all direct I/O.
 * On exit, the caller holds an exclusive lock on inode->i_rwsem; it is
 * released by nfs_end_io_write().
 */
void
nfs_start_io_write(struct inode *inode)
{
down_write(&inode->i_rwsem);
/* Clears NFS_INO_ODIRECT (if set) and waits via inode_dio_wait(). */
nfs_block_o_direct(NFS_I(inode), inode);
}
/**
 * nfs_end_io_write - declare that the buffered write operation is done
 * @inode: file inode
 *
 * Declare that a buffered write operation is done, and release the
 * exclusive lock on inode->i_rwsem taken by nfs_start_io_write().
 */
void
nfs_end_io_write(struct inode *inode)
{
up_write(&inode->i_rwsem);
}
/*
 * If the inode is not yet flagged NFS_INO_ODIRECT, set the flag and then
 * call nfs_wb_all() on the inode (its return value is not checked).
 * Does nothing when the flag is already set.
 *
 * Call with exclusively locked inode->i_rwsem.
 */
static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode)
{
	if (test_bit(NFS_INO_ODIRECT, &nfsi->flags))
		return;

	set_bit(NFS_INO_ODIRECT, &nfsi->flags);
	nfs_wb_all(inode);
}
/**
 * nfs_start_io_direct - declare the file is being used for direct i/o
 * @inode: file inode
 *
 * Announce that a direct I/O operation is about to begin and make sure
 * that buffered I/O is blocked for its duration.
 *
 * On return, NFS_INO_ODIRECT is set and the caller holds a shared lock on
 * inode->i_rwsem, which prevents the flag from being changed.
 * Direct I/O operations can therefore run in parallel (they only need the
 * shared lock), while buffered I/O has to obtain the exclusive lock
 * before it can clear NFS_INO_ODIRECT.
 *
 * Buffered writes and truncates take inode->i_rwsem for writing, so they
 * are serialised with respect to O_DIRECT.
 */
void
nfs_start_io_direct(struct inode *inode)
{
	struct nfs_inode *nfsi = NFS_I(inode);

	/* Optimistic attempt: the shared lock suffices if the flag is set. */
	down_read(&inode->i_rwsem);
	if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0) {
		/*
		 * Slow path: swap to the exclusive lock so the flag can be
		 * set, then downgrade back to a shared lock.
		 */
		up_read(&inode->i_rwsem);
		down_write(&inode->i_rwsem);
		nfs_block_buffered(nfsi, inode);
		downgrade_write(&inode->i_rwsem);
	}
}
/**
 * nfs_end_io_direct - declare that the direct i/o operation is done
 * @inode: file inode
 *
 * Declare that a direct I/O operation is done, and release the shared
 * lock on inode->i_rwsem (the lock that nfs_start_io_direct() returns with).
 */
void
nfs_end_io_direct(struct inode *inode)
{
up_read(&inode->i_rwsem);
}

View File

@ -76,19 +76,23 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source,
* low timeout interval so that if a connection is lost, we retry through * low timeout interval so that if a connection is lost, we retry through
* the MDS. * the MDS.
*/ */
struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
const struct sockaddr *ds_addr, int ds_addrlen, const struct sockaddr *ds_addr, int ds_addrlen,
int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
rpc_authflavor_t au_flavor) rpc_authflavor_t au_flavor)
{ {
struct rpc_timeout ds_timeout;
struct nfs_client *mds_clp = mds_srv->nfs_client;
struct nfs_client_initdata cl_init = { struct nfs_client_initdata cl_init = {
.addr = ds_addr, .addr = ds_addr,
.addrlen = ds_addrlen, .addrlen = ds_addrlen,
.nodename = mds_clp->cl_rpcclient->cl_nodename,
.ip_addr = mds_clp->cl_ipaddr,
.nfs_mod = &nfs_v3, .nfs_mod = &nfs_v3,
.proto = ds_proto, .proto = ds_proto,
.net = mds_clp->cl_net, .net = mds_clp->cl_net,
.timeparms = &ds_timeout,
}; };
struct rpc_timeout ds_timeout;
struct nfs_client *clp; struct nfs_client *clp;
char buf[INET6_ADDRSTRLEN + 1]; char buf[INET6_ADDRSTRLEN + 1];
@ -97,10 +101,12 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
cl_init.hostname = buf; cl_init.hostname = buf;
if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
/* Use the MDS nfs_client cl_ipaddr. */ /* Use the MDS nfs_client cl_ipaddr. */
nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, clp = nfs_get_client(&cl_init, au_flavor);
au_flavor);
return clp; return clp;
} }

View File

@ -113,15 +113,17 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE)) if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE))
return -EOPNOTSUPP; return -EOPNOTSUPP;
nfs_wb_all(inode);
inode_lock(inode); inode_lock(inode);
err = nfs_sync_inode(inode);
if (err)
goto out_unlock;
err = nfs42_proc_fallocate(&msg, filep, offset, len); err = nfs42_proc_fallocate(&msg, filep, offset, len);
if (err == 0) if (err == 0)
truncate_pagecache_range(inode, offset, (offset + len) -1); truncate_pagecache_range(inode, offset, (offset + len) -1);
if (err == -EOPNOTSUPP) if (err == -EOPNOTSUPP)
NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
out_unlock:
inode_unlock(inode); inode_unlock(inode);
return err; return err;
} }
@ -154,11 +156,20 @@ static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src,
if (status) if (status)
return status; return status;
status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping,
pos_src, pos_src + (loff_t)count - 1);
if (status)
return status;
status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context, status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context,
dst_lock, FMODE_WRITE); dst_lock, FMODE_WRITE);
if (status) if (status)
return status; return status;
status = nfs_sync_inode(dst_inode);
if (status)
return status;
status = nfs4_call_sync(server->client, server, &msg, status = nfs4_call_sync(server->client, server, &msg,
&args.seq_args, &res.seq_res, 0); &args.seq_args, &res.seq_res, 0);
if (status == -ENOTSUPP) if (status == -ENOTSUPP)
@ -258,7 +269,11 @@ static loff_t _nfs42_proc_llseek(struct file *filep,
if (status) if (status)
return status; return status;
nfs_wb_all(inode); status = nfs_filemap_write_and_wait_range(inode->i_mapping,
offset, LLONG_MAX);
if (status)
return status;
status = nfs4_call_sync(server->client, server, &msg, status = nfs4_call_sync(server->client, server, &msg,
&args.seq_args, &res.seq_res, 0); &args.seq_args, &res.seq_res, 0);
if (status == -ENOTSUPP) if (status == -ENOTSUPP)
@ -336,8 +351,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
* Mark the bad layout state as invalid, then retry * Mark the bad layout state as invalid, then retry
* with the current stateid. * with the current stateid.
*/ */
set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); pnfs_mark_layout_stateid_invalid(lo, &head);
pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0);
spin_unlock(&inode->i_lock); spin_unlock(&inode->i_lock);
pnfs_free_lseg_list(&head); pnfs_free_lseg_list(&head);
} else } else

View File

@ -330,13 +330,21 @@ static int decode_write_response(struct xdr_stream *xdr,
struct nfs42_write_res *res) struct nfs42_write_res *res)
{ {
__be32 *p; __be32 *p;
int stateids;
p = xdr_inline_decode(xdr, 4 + 8 + 4); p = xdr_inline_decode(xdr, 4 + 8 + 4);
if (unlikely(!p)) if (unlikely(!p))
goto out_overflow; goto out_overflow;
stateids = be32_to_cpup(p++); /*
* We never use asynchronous mode, so warn if a server returns
* a stateid.
*/
if (unlikely(*p != 0)) {
pr_err_once("%s: server has set unrequested "
"asynchronous mode\n", __func__);
return -EREMOTEIO;
}
p++;
p = xdr_decode_hyper(p, &res->count); p = xdr_decode_hyper(p, &res->count);
res->verifier.committed = be32_to_cpup(p); res->verifier.committed = be32_to_cpup(p);
return decode_verifier(xdr, &res->verifier.verifier); return decode_verifier(xdr, &res->verifier.verifier);

View File

@ -185,6 +185,7 @@ struct nfs4_state {
struct nfs4_exception { struct nfs4_exception {
struct nfs4_state *state; struct nfs4_state *state;
struct inode *inode; struct inode *inode;
nfs4_stateid *stateid;
long timeout; long timeout;
unsigned char delay : 1, unsigned char delay : 1,
recovering : 1, recovering : 1,

View File

@ -349,10 +349,10 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
* Returns pointer to an NFS client, or an ERR_PTR value. * Returns pointer to an NFS client, or an ERR_PTR value.
*/ */
struct nfs_client *nfs4_init_client(struct nfs_client *clp, struct nfs_client *nfs4_init_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms, const struct nfs_client_initdata *cl_init)
const char *ip_addr)
{ {
char buf[INET6_ADDRSTRLEN + 1]; char buf[INET6_ADDRSTRLEN + 1];
const char *ip_addr = cl_init->ip_addr;
struct nfs_client *old; struct nfs_client *old;
int error; int error;
@ -370,9 +370,9 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
__set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
__set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I);
if (error == -EINVAL) if (error == -EINVAL)
error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
if (error < 0) if (error < 0)
goto error; goto error;
@ -793,10 +793,12 @@ static int nfs4_set_client(struct nfs_server *server,
.hostname = hostname, .hostname = hostname,
.addr = addr, .addr = addr,
.addrlen = addrlen, .addrlen = addrlen,
.ip_addr = ip_addr,
.nfs_mod = &nfs_v4, .nfs_mod = &nfs_v4,
.proto = proto, .proto = proto,
.minorversion = minorversion, .minorversion = minorversion,
.net = net, .net = net,
.timeparms = timeparms,
}; };
struct nfs_client *clp; struct nfs_client *clp;
int error; int error;
@ -809,7 +811,7 @@ static int nfs4_set_client(struct nfs_server *server,
set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
/* Allocate or find a client reference we can use */ /* Allocate or find a client reference we can use */
clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour); clp = nfs_get_client(&cl_init, authflavour);
if (IS_ERR(clp)) { if (IS_ERR(clp)) {
error = PTR_ERR(clp); error = PTR_ERR(clp);
goto error; goto error;
@ -842,20 +844,24 @@ error:
* low timeout interval so that if a connection is lost, we retry through * low timeout interval so that if a connection is lost, we retry through
* the MDS. * the MDS.
*/ */
struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
const struct sockaddr *ds_addr, int ds_addrlen, const struct sockaddr *ds_addr, int ds_addrlen,
int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
u32 minor_version, rpc_authflavor_t au_flavor) u32 minor_version, rpc_authflavor_t au_flavor)
{ {
struct rpc_timeout ds_timeout;
struct nfs_client *mds_clp = mds_srv->nfs_client;
struct nfs_client_initdata cl_init = { struct nfs_client_initdata cl_init = {
.addr = ds_addr, .addr = ds_addr,
.addrlen = ds_addrlen, .addrlen = ds_addrlen,
.nodename = mds_clp->cl_rpcclient->cl_nodename,
.ip_addr = mds_clp->cl_ipaddr,
.nfs_mod = &nfs_v4, .nfs_mod = &nfs_v4,
.proto = ds_proto, .proto = ds_proto,
.minorversion = minor_version, .minorversion = minor_version,
.net = mds_clp->cl_net, .net = mds_clp->cl_net,
.timeparms = &ds_timeout,
}; };
struct rpc_timeout ds_timeout;
struct nfs_client *clp; struct nfs_client *clp;
char buf[INET6_ADDRSTRLEN + 1]; char buf[INET6_ADDRSTRLEN + 1];
@ -863,14 +869,16 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
cl_init.hostname = buf; cl_init.hostname = buf;
if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
/* /*
* Set an authflavor equal to the MDS value. Use the MDS nfs_client * Set an authflavor equal to the MDS value. Use the MDS nfs_client
* cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
* (section 13.1 RFC 5661). * (section 13.1 RFC 5661).
*/ */
nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, clp = nfs_get_client(&cl_init, au_flavor);
au_flavor);
dprintk("<-- %s %p\n", __func__, clp); dprintk("<-- %s %p\n", __func__, clp);
return clp; return clp;

View File

@ -66,7 +66,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
if (openflags & O_TRUNC) { if (openflags & O_TRUNC) {
attr.ia_valid |= ATTR_SIZE; attr.ia_valid |= ATTR_SIZE;
attr.ia_size = 0; attr.ia_size = 0;
nfs_sync_inode(inode); filemap_write_and_wait(inode->i_mapping);
} }
inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL); inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL);
@ -133,21 +133,9 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, struct file *file_out, loff_t pos_out,
size_t count, unsigned int flags) size_t count, unsigned int flags)
{ {
struct inode *in_inode = file_inode(file_in); if (file_inode(file_in) == file_inode(file_out))
struct inode *out_inode = file_inode(file_out);
int ret;
if (in_inode == out_inode)
return -EINVAL; return -EINVAL;
/* flush any pending writes */
ret = nfs_sync_inode(in_inode);
if (ret)
return ret;
ret = nfs_sync_inode(out_inode);
if (ret)
return ret;
return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
} }

View File

@ -363,6 +363,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
{ {
struct nfs_client *clp = server->nfs_client; struct nfs_client *clp = server->nfs_client;
struct nfs4_state *state = exception->state; struct nfs4_state *state = exception->state;
const nfs4_stateid *stateid = exception->stateid;
struct inode *inode = exception->inode; struct inode *inode = exception->inode;
int ret = errorcode; int ret = errorcode;
@ -376,9 +377,18 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID: case -NFS4ERR_BAD_STATEID:
if (inode && nfs_async_inode_return_delegation(inode, if (inode) {
NULL) == 0) int err;
goto wait_on_recovery;
err = nfs_async_inode_return_delegation(inode,
stateid);
if (err == 0)
goto wait_on_recovery;
if (stateid != NULL && stateid->type == NFS4_DELEGATION_STATEID_TYPE) {
exception->retry = 1;
break;
}
}
if (state == NULL) if (state == NULL)
break; break;
ret = nfs4_schedule_stateid_recovery(server, state); ret = nfs4_schedule_stateid_recovery(server, state);
@ -427,6 +437,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
case -NFS4ERR_DELAY: case -NFS4ERR_DELAY:
nfs_inc_server_stats(server, NFSIOS_DELAY); nfs_inc_server_stats(server, NFSIOS_DELAY);
case -NFS4ERR_GRACE: case -NFS4ERR_GRACE:
case -NFS4ERR_LAYOUTTRYLATER:
case -NFS4ERR_RECALLCONFLICT: case -NFS4ERR_RECALLCONFLICT:
exception->delay = 1; exception->delay = 1;
return 0; return 0;
@ -2669,10 +2680,61 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
return res; return res;
} }
static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, static int _nfs4_do_setattr(struct inode *inode,
struct nfs_fattr *fattr, struct iattr *sattr, struct nfs_setattrargs *arg,
struct nfs4_state *state, struct nfs4_label *ilabel, struct nfs_setattrres *res,
struct nfs4_label *olabel) struct rpc_cred *cred,
struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(inode);
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
.rpc_argp = arg,
.rpc_resp = res,
.rpc_cred = cred,
};
struct rpc_cred *delegation_cred = NULL;
unsigned long timestamp = jiffies;
fmode_t fmode;
bool truncate;
int status;
nfs_fattr_init(res->fattr);
/* Servers should only apply open mode checks for file size changes */
truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false;
fmode = truncate ? FMODE_WRITE : FMODE_READ;
if (nfs4_copy_delegation_stateid(inode, fmode, &arg->stateid, &delegation_cred)) {
/* Use that stateid */
} else if (truncate && state != NULL) {
struct nfs_lockowner lockowner = {
.l_owner = current->files,
.l_pid = current->tgid,
};
if (!nfs4_valid_open_stateid(state))
return -EBADF;
if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner,
&arg->stateid, &delegation_cred) == -EIO)
return -EBADF;
} else
nfs4_stateid_copy(&arg->stateid, &zero_stateid);
if (delegation_cred)
msg.rpc_cred = delegation_cred;
status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1);
put_rpccred(delegation_cred);
if (status == 0 && state != NULL)
renew_lease(server, timestamp);
trace_nfs4_setattr(inode, &arg->stateid, status);
return status;
}
static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
struct nfs4_state *state, struct nfs4_label *ilabel,
struct nfs4_label *olabel)
{ {
struct nfs_server *server = NFS_SERVER(inode); struct nfs_server *server = NFS_SERVER(inode);
struct nfs_setattrargs arg = { struct nfs_setattrargs arg = {
@ -2687,67 +2749,19 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
.label = olabel, .label = olabel,
.server = server, .server = server,
}; };
struct rpc_message msg = { struct nfs4_exception exception = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], .state = state,
.rpc_argp = &arg, .inode = inode,
.rpc_resp = &res, .stateid = &arg.stateid,
.rpc_cred = cred, };
}; int err;
struct rpc_cred *delegation_cred = NULL;
unsigned long timestamp = jiffies;
fmode_t fmode;
bool truncate;
int status;
arg.bitmask = nfs4_bitmask(server, ilabel); arg.bitmask = nfs4_bitmask(server, ilabel);
if (ilabel) if (ilabel)
arg.bitmask = nfs4_bitmask(server, olabel); arg.bitmask = nfs4_bitmask(server, olabel);
nfs_fattr_init(fattr);
/* Servers should only apply open mode checks for file size changes */
truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false;
fmode = truncate ? FMODE_WRITE : FMODE_READ;
if (nfs4_copy_delegation_stateid(inode, fmode, &arg.stateid, &delegation_cred)) {
/* Use that stateid */
} else if (truncate && state != NULL) {
struct nfs_lockowner lockowner = {
.l_owner = current->files,
.l_pid = current->tgid,
};
if (!nfs4_valid_open_stateid(state))
return -EBADF;
if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner,
&arg.stateid, &delegation_cred) == -EIO)
return -EBADF;
} else
nfs4_stateid_copy(&arg.stateid, &zero_stateid);
if (delegation_cred)
msg.rpc_cred = delegation_cred;
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
put_rpccred(delegation_cred);
if (status == 0 && state != NULL)
renew_lease(server, timestamp);
trace_nfs4_setattr(inode, &arg.stateid, status);
return status;
}
static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
struct nfs4_state *state, struct nfs4_label *ilabel,
struct nfs4_label *olabel)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs4_exception exception = {
.state = state,
.inode = inode,
};
int err;
do { do {
err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel); err = _nfs4_do_setattr(inode, &arg, &res, cred, state);
switch (err) { switch (err) {
case -NFS4ERR_OPENMODE: case -NFS4ERR_OPENMODE:
if (!(sattr->ia_valid & ATTR_SIZE)) { if (!(sattr->ia_valid & ATTR_SIZE)) {
@ -3267,13 +3281,6 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
return status; return status;
} }
static int nfs4_do_find_root_sec(struct nfs_server *server,
struct nfs_fh *fhandle, struct nfs_fsinfo *info)
{
int mv = server->nfs_client->cl_minorversion;
return nfs_v4_minor_ops[mv]->find_root_sec(server, fhandle, info);
}
/** /**
* nfs4_proc_get_rootfh - get file handle for server's pseudoroot * nfs4_proc_get_rootfh - get file handle for server's pseudoroot
* @server: initialized nfs_server handle * @server: initialized nfs_server handle
@ -3293,7 +3300,8 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
status = nfs4_lookup_root(server, fhandle, info); status = nfs4_lookup_root(server, fhandle, info);
if (auth_probe || status == NFS4ERR_WRONGSEC) if (auth_probe || status == NFS4ERR_WRONGSEC)
status = nfs4_do_find_root_sec(server, fhandle, info); status = server->nfs_client->cl_mvops->find_root_sec(server,
fhandle, info);
if (status == 0) if (status == 0)
status = nfs4_server_capabilities(server, fhandle); status = nfs4_server_capabilities(server, fhandle);
@ -4392,7 +4400,8 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
struct rpc_message *msg) struct rpc_message *msg)
{ {
hdr->timestamp = jiffies; hdr->timestamp = jiffies;
hdr->pgio_done_cb = nfs4_read_done_cb; if (!hdr->pgio_done_cb)
hdr->pgio_done_cb = nfs4_read_done_cb;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0); nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0);
} }
@ -7869,11 +7878,13 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
struct inode *inode = lgp->args.inode; struct inode *inode = lgp->args.inode;
struct nfs_server *server = NFS_SERVER(inode); struct nfs_server *server = NFS_SERVER(inode);
struct pnfs_layout_hdr *lo; struct pnfs_layout_hdr *lo;
int status = task->tk_status; int nfs4err = task->tk_status;
int err, status = 0;
LIST_HEAD(head);
dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status);
switch (status) { switch (nfs4err) {
case 0: case 0:
goto out; goto out;
@ -7905,45 +7916,42 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
status = -EOVERFLOW; status = -EOVERFLOW;
goto out; goto out;
} }
/* Fallthrough */ status = -EBUSY;
break;
case -NFS4ERR_RECALLCONFLICT: case -NFS4ERR_RECALLCONFLICT:
nfs4_handle_exception(server, -NFS4ERR_RECALLCONFLICT,
exception);
status = -ERECALLCONFLICT; status = -ERECALLCONFLICT;
goto out; break;
case -NFS4ERR_EXPIRED: case -NFS4ERR_EXPIRED:
case -NFS4ERR_BAD_STATEID: case -NFS4ERR_BAD_STATEID:
exception->timeout = 0; exception->timeout = 0;
spin_lock(&inode->i_lock); spin_lock(&inode->i_lock);
if (nfs4_stateid_match(&lgp->args.stateid, lo = NFS_I(inode)->layout;
/* If the open stateid was bad, then recover it. */
if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
nfs4_stateid_match_other(&lgp->args.stateid,
&lgp->args.ctx->state->stateid)) { &lgp->args.ctx->state->stateid)) {
spin_unlock(&inode->i_lock); spin_unlock(&inode->i_lock);
/* If the open stateid was bad, then recover it. */
exception->state = lgp->args.ctx->state; exception->state = lgp->args.ctx->state;
break; break;
} }
lo = NFS_I(inode)->layout;
if (lo && !test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) &&
nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) {
LIST_HEAD(head);
/* /*
* Mark the bad layout state as invalid, then retry * Mark the bad layout state as invalid, then retry
* with the current stateid. */
*/ pnfs_mark_layout_stateid_invalid(lo, &head);
set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); spin_unlock(&inode->i_lock);
pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0); pnfs_free_lseg_list(&head);
spin_unlock(&inode->i_lock); status = -EAGAIN;
pnfs_free_lseg_list(&head); goto out;
status = -EAGAIN;
goto out;
} else
spin_unlock(&inode->i_lock);
} }
status = nfs4_handle_exception(server, status, exception); err = nfs4_handle_exception(server, nfs4err, exception);
if (exception->retry) if (!status) {
status = -EAGAIN; if (exception->retry)
status = -EAGAIN;
else
status = err;
}
out: out:
dprintk("<-- %s\n", __func__); dprintk("<-- %s\n", __func__);
return status; return status;
@ -8129,8 +8137,7 @@ static void nfs4_layoutreturn_release(void *calldata)
spin_lock(&lo->plh_inode->i_lock); spin_lock(&lo->plh_inode->i_lock);
pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range, pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range,
be32_to_cpu(lrp->args.stateid.seqid)); be32_to_cpu(lrp->args.stateid.seqid));
pnfs_mark_layout_returned_if_empty(lo); if (lrp->res.lrs_present && pnfs_layout_is_valid(lo))
if (lrp->res.lrs_present)
pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
pnfs_clear_layoutreturn_waitbit(lo); pnfs_clear_layoutreturn_waitbit(lo);
spin_unlock(&lo->plh_inode->i_lock); spin_unlock(&lo->plh_inode->i_lock);
@ -8835,7 +8842,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
#endif #endif
}; };
ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
{ {
ssize_t error, error2; ssize_t error, error2;

View File

@ -1985,9 +1985,14 @@ encode_layoutcommit(struct xdr_stream *xdr,
p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */ p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */
*p = cpu_to_be32(0); /* reclaim */ *p = cpu_to_be32(0); /* reclaim */
encode_nfs4_stateid(xdr, &args->stateid); encode_nfs4_stateid(xdr, &args->stateid);
p = reserve_space(xdr, 20); if (args->lastbytewritten != U64_MAX) {
*p++ = cpu_to_be32(1); /* newoffset = TRUE */ p = reserve_space(xdr, 20);
p = xdr_encode_hyper(p, args->lastbytewritten); *p++ = cpu_to_be32(1); /* newoffset = TRUE */
p = xdr_encode_hyper(p, args->lastbytewritten);
} else {
p = reserve_space(xdr, 12);
*p++ = cpu_to_be32(0); /* newoffset = FALSE */
}
*p++ = cpu_to_be32(0); /* Never send time_modify_changed */ *p++ = cpu_to_be32(0); /* Never send time_modify_changed */
*p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */

View File

@ -37,7 +37,6 @@
{ 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \ { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \
{ 1 << NFS_INO_STALE, "STALE" }, \ { 1 << NFS_INO_STALE, "STALE" }, \
{ 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \
{ 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
{ 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
{ 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \ { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \
{ 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" }) { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" })

View File

@ -259,7 +259,7 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
* is required. * is required.
* Note that caller must hold inode->i_lock. * Note that caller must hold inode->i_lock.
*/ */
static int int
pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
struct list_head *lseg_list) struct list_head *lseg_list)
{ {
@ -334,14 +334,17 @@ pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
} }
static void static void
init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
const struct pnfs_layout_range *range,
const nfs4_stateid *stateid)
{ {
INIT_LIST_HEAD(&lseg->pls_list); INIT_LIST_HEAD(&lseg->pls_list);
INIT_LIST_HEAD(&lseg->pls_lc_list); INIT_LIST_HEAD(&lseg->pls_lc_list);
atomic_set(&lseg->pls_refcount, 1); atomic_set(&lseg->pls_refcount, 1);
smp_mb();
set_bit(NFS_LSEG_VALID, &lseg->pls_flags); set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
lseg->pls_layout = lo; lseg->pls_layout = lo;
lseg->pls_range = *range;
lseg->pls_seq = be32_to_cpu(stateid->seqid);
} }
static void pnfs_free_lseg(struct pnfs_layout_segment *lseg) static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
@ -486,15 +489,6 @@ pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
(end2 == NFS4_MAX_UINT64 || end2 > start1); (end2 == NFS4_MAX_UINT64 || end2 > start1);
} }
/*
 * Test whether a layout segment range is covered by a recall range.
 *
 * Returns true when both of the following hold:
 *  - the recall iomode is IOMODE_ANY, or it equals the segment's iomode;
 *  - pnfs_lseg_range_intersecting() reports that the two ranges intersect.
 */
static bool
should_free_lseg(const struct pnfs_layout_range *lseg_range,
const struct pnfs_layout_range *recall_range)
{
return (recall_range->iomode == IOMODE_ANY ||
lseg_range->iomode == recall_range->iomode) &&
pnfs_lseg_range_intersecting(lseg_range, recall_range);
}
static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
struct list_head *tmp_list) struct list_head *tmp_list)
{ {
@ -533,6 +527,27 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
return (s32)(s1 - s2) > 0; return (s32)(s1 - s2) > 0;
} }
/*
 * Test whether a layout segment range is covered by a recall range.
 *
 * Returns true when both of the following hold:
 *  - the recall iomode is IOMODE_ANY, or it equals the segment's iomode;
 *  - pnfs_lseg_range_intersecting() reports that the two ranges intersect.
 */
static bool
pnfs_should_free_range(const struct pnfs_layout_range *lseg_range,
const struct pnfs_layout_range *recall_range)
{
return (recall_range->iomode == IOMODE_ANY ||
lseg_range->iomode == recall_range->iomode) &&
pnfs_lseg_range_intersecting(lseg_range, recall_range);
}
/*
 * Decide whether @lseg is matched by a recall/return described by
 * @recall_range and @seq.
 *
 * @lseg:         layout segment being tested
 * @recall_range: range to match against, or NULL to match any range
 * @seq:          stateid sequence number to compare against, or 0 to skip
 *                the sequence check
 *
 * Returns false if @seq is non-zero and the segment's pls_seq is newer than
 * @seq according to pnfs_seqid_is_newer(). Otherwise returns true for a NULL
 * @recall_range, or the result of pnfs_should_free_range() on the segment's
 * pls_range.
 */
static bool
pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg,
const struct pnfs_layout_range *recall_range,
u32 seq)
{
/* A zero seq disables the sequence comparison entirely */
if (seq != 0 && pnfs_seqid_is_newer(lseg->pls_seq, seq))
return false;
/* No range supplied: everything that passed the seq check matches */
if (recall_range == NULL)
return true;
return pnfs_should_free_range(&lseg->pls_range, recall_range);
}
/** /**
* pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later
* @lo: layout header containing the lsegs * @lo: layout header containing the lsegs
@ -562,10 +577,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
if (list_empty(&lo->plh_segs)) if (list_empty(&lo->plh_segs))
return 0; return 0;
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
if (!recall_range || if (pnfs_match_lseg_recall(lseg, recall_range, seq)) {
should_free_lseg(&lseg->pls_range, recall_range)) {
if (seq && pnfs_seqid_is_newer(lseg->pls_seq, seq))
continue;
dprintk("%s: freeing lseg %p iomode %d seq %u" dprintk("%s: freeing lseg %p iomode %d seq %u"
"offset %llu length %llu\n", __func__, "offset %llu length %llu\n", __func__,
lseg, lseg->pls_range.iomode, lseg->pls_seq, lseg, lseg->pls_range.iomode, lseg->pls_seq,
@ -761,24 +773,25 @@ void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
bool update_barrier) bool update_barrier)
{ {
u32 oldseq, newseq, new_barrier; u32 oldseq, newseq, new_barrier = 0;
int empty = list_empty(&lo->plh_segs); bool invalid = !pnfs_layout_is_valid(lo);
oldseq = be32_to_cpu(lo->plh_stateid.seqid); oldseq = be32_to_cpu(lo->plh_stateid.seqid);
newseq = be32_to_cpu(new->seqid); newseq = be32_to_cpu(new->seqid);
if (empty || pnfs_seqid_is_newer(newseq, oldseq)) { if (invalid || pnfs_seqid_is_newer(newseq, oldseq)) {
nfs4_stateid_copy(&lo->plh_stateid, new); nfs4_stateid_copy(&lo->plh_stateid, new);
if (update_barrier) { /*
new_barrier = be32_to_cpu(new->seqid); * Because of wraparound, we want to keep the barrier
} else { * "close" to the current seqids.
/* Because of wraparound, we want to keep the barrier */
* "close" to the current seqids. new_barrier = newseq - atomic_read(&lo->plh_outstanding);
*/
new_barrier = newseq - atomic_read(&lo->plh_outstanding);
}
if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
lo->plh_barrier = new_barrier;
} }
if (update_barrier)
new_barrier = be32_to_cpu(new->seqid);
else if (new_barrier == 0)
return;
if (invalid || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
lo->plh_barrier = new_barrier;
} }
static bool static bool
@ -873,15 +886,37 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
} }
/*
 * Reset the pending layoutreturn bookkeeping stored in @lo: zero
 * plh_return_iomode and plh_return_seq, and clear the
 * NFS_LAYOUT_RETURN_REQUESTED bit in plh_flags.
 */
static void
pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
{
lo->plh_return_iomode = 0;
lo->plh_return_seq = 0;
clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
}
static bool static bool
pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
nfs4_stateid *stateid,
enum pnfs_iomode *iomode)
{ {
if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
return false; return false;
lo->plh_return_iomode = 0;
lo->plh_return_seq = 0;
pnfs_get_layout_hdr(lo); pnfs_get_layout_hdr(lo);
clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
if (stateid != NULL) {
nfs4_stateid_copy(stateid, &lo->plh_stateid);
if (lo->plh_return_seq != 0)
stateid->seqid = cpu_to_be32(lo->plh_return_seq);
}
if (iomode != NULL)
*iomode = lo->plh_return_iomode;
pnfs_clear_layoutreturn_info(lo);
return true;
}
if (stateid != NULL)
nfs4_stateid_copy(stateid, &lo->plh_stateid);
if (iomode != NULL)
*iomode = IOMODE_ANY;
return true; return true;
} }
@ -949,10 +984,7 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
enum pnfs_iomode iomode; enum pnfs_iomode iomode;
bool send; bool send;
nfs4_stateid_copy(&stateid, &lo->plh_stateid); send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
stateid.seqid = cpu_to_be32(lo->plh_return_seq);
iomode = lo->plh_return_iomode;
send = pnfs_prepare_layoutreturn(lo);
spin_unlock(&inode->i_lock); spin_unlock(&inode->i_lock);
if (send) { if (send) {
/* Send an async layoutreturn so we dont deadlock */ /* Send an async layoutreturn so we dont deadlock */
@ -989,7 +1021,6 @@ _pnfs_return_layout(struct inode *ino)
dprintk("NFS: %s no layout to return\n", __func__); dprintk("NFS: %s no layout to return\n", __func__);
goto out; goto out;
} }
nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid);
/* Reference matched in nfs4_layoutreturn_release */ /* Reference matched in nfs4_layoutreturn_release */
pnfs_get_layout_hdr(lo); pnfs_get_layout_hdr(lo);
empty = list_empty(&lo->plh_segs); empty = list_empty(&lo->plh_segs);
@ -1012,8 +1043,7 @@ _pnfs_return_layout(struct inode *ino)
goto out_put_layout_hdr; goto out_put_layout_hdr;
} }
set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); send = pnfs_prepare_layoutreturn(lo, &stateid, NULL);
send = pnfs_prepare_layoutreturn(lo);
spin_unlock(&ino->i_lock); spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&tmp_list); pnfs_free_lseg_list(&tmp_list);
if (send) if (send)
@ -1080,11 +1110,10 @@ bool pnfs_roc(struct inode *ino)
goto out_noroc; goto out_noroc;
} }
nfs4_stateid_copy(&stateid, &lo->plh_stateid);
/* always send layoutreturn if being marked so */ /* always send layoutreturn if being marked so */
if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED, if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
&lo->plh_flags)) layoutreturn = pnfs_prepare_layoutreturn(lo,
layoutreturn = pnfs_prepare_layoutreturn(lo); &stateid, NULL);
list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
/* If we are sending layoutreturn, invalidate all valid lsegs */ /* If we are sending layoutreturn, invalidate all valid lsegs */
@ -1132,7 +1161,6 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
spin_lock(&ino->i_lock); spin_lock(&ino->i_lock);
lo = NFS_I(ino)->layout; lo = NFS_I(ino)->layout;
pnfs_mark_layout_returned_if_empty(lo);
if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
lo->plh_barrier = barrier; lo->plh_barrier = barrier;
spin_unlock(&ino->i_lock); spin_unlock(&ino->i_lock);
@ -1505,7 +1533,7 @@ pnfs_update_layout(struct inode *ino,
struct pnfs_layout_segment *lseg = NULL; struct pnfs_layout_segment *lseg = NULL;
nfs4_stateid stateid; nfs4_stateid stateid;
long timeout = 0; long timeout = 0;
unsigned long giveup = jiffies + rpc_get_timeout(server->client); unsigned long giveup = jiffies + (clp->cl_lease_time << 1);
bool first; bool first;
if (!pnfs_enabled_sb(NFS_SERVER(ino))) { if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
@ -1645,33 +1673,44 @@ lookup_again:
lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags); lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags);
trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
atomic_dec(&lo->plh_outstanding);
if (IS_ERR(lseg)) { if (IS_ERR(lseg)) {
switch(PTR_ERR(lseg)) { switch(PTR_ERR(lseg)) {
case -ERECALLCONFLICT: case -EBUSY:
if (time_after(jiffies, giveup)) if (time_after(jiffies, giveup))
lseg = NULL; lseg = NULL;
break;
case -ERECALLCONFLICT:
/* Huh? We hold no layouts, how is there a recall? */
if (first) {
lseg = NULL;
break;
}
/* Destroy the existing layout and start over */
if (time_after(jiffies, giveup))
pnfs_destroy_layout(NFS_I(ino));
/* Fallthrough */ /* Fallthrough */
case -EAGAIN: case -EAGAIN:
pnfs_put_layout_hdr(lo); break;
if (first)
pnfs_clear_first_layoutget(lo);
if (lseg) {
trace_pnfs_update_layout(ino, pos, count,
iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
goto lookup_again;
}
/* Fallthrough */
default: default:
if (!nfs_error_is_fatal(PTR_ERR(lseg))) { if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
lseg = NULL; lseg = NULL;
} }
goto out_put_layout_hdr;
}
if (lseg) {
if (first)
pnfs_clear_first_layoutget(lo);
trace_pnfs_update_layout(ino, pos, count,
iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
pnfs_put_layout_hdr(lo);
goto lookup_again;
} }
} else { } else {
pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
} }
atomic_dec(&lo->plh_outstanding);
out_put_layout_hdr: out_put_layout_hdr:
if (first) if (first)
pnfs_clear_first_layoutget(lo); pnfs_clear_first_layoutget(lo);
@ -1735,9 +1774,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
return lseg; return lseg;
} }
init_lseg(lo, lseg); pnfs_init_lseg(lo, lseg, &res->range, &res->stateid);
lseg->pls_range = res->range;
lseg->pls_seq = be32_to_cpu(res->stateid.seqid);
spin_lock(&ino->i_lock); spin_lock(&ino->i_lock);
if (pnfs_layoutgets_blocked(lo)) { if (pnfs_layoutgets_blocked(lo)) {
@ -1758,16 +1795,19 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
* inode invalid, and don't bother validating the stateid * inode invalid, and don't bother validating the stateid
* sequence number. * sequence number.
*/ */
pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL, 0); pnfs_mark_layout_stateid_invalid(lo, &free_me);
nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
lo->plh_barrier = be32_to_cpu(res->stateid.seqid); lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
} }
clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
pnfs_get_lseg(lseg); pnfs_get_lseg(lseg);
pnfs_layout_insert_lseg(lo, lseg, &free_me); pnfs_layout_insert_lseg(lo, lseg, &free_me);
if (!pnfs_layout_is_valid(lo)) {
pnfs_clear_layoutreturn_info(lo);
clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
}
if (res->return_on_close) if (res->return_on_close)
set_bit(NFS_LSEG_ROC, &lseg->pls_flags); set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
@ -1787,14 +1827,14 @@ static void
pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
u32 seq) u32 seq)
{ {
if (lo->plh_return_iomode == iomode) if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode)
return;
if (lo->plh_return_iomode != 0)
iomode = IOMODE_ANY; iomode = IOMODE_ANY;
lo->plh_return_iomode = iomode; lo->plh_return_iomode = iomode;
set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq)) if (seq != 0) {
WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq);
lo->plh_return_seq = seq; lo->plh_return_seq = seq;
}
} }
/** /**
@ -1824,7 +1864,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
assert_spin_locked(&lo->plh_inode->i_lock); assert_spin_locked(&lo->plh_inode->i_lock);
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
if (should_free_lseg(&lseg->pls_range, return_range)) { if (pnfs_match_lseg_recall(lseg, return_range, seq)) {
dprintk("%s: marking lseg %p iomode %d " dprintk("%s: marking lseg %p iomode %d "
"offset %llu length %llu\n", __func__, "offset %llu length %llu\n", __func__,
lseg, lseg->pls_range.iomode, lseg, lseg->pls_range.iomode,
@ -1855,19 +1895,17 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
bool return_now = false; bool return_now = false;
spin_lock(&inode->i_lock); spin_lock(&inode->i_lock);
pnfs_set_plh_return_info(lo, range.iomode, lseg->pls_seq); pnfs_set_plh_return_info(lo, range.iomode, 0);
/* /*
* mark all matching lsegs so that we are sure to have no live * mark all matching lsegs so that we are sure to have no live
* segments at hand when sending layoutreturn. See pnfs_put_lseg() * segments at hand when sending layoutreturn. See pnfs_put_lseg()
* for how it works. * for how it works.
*/ */
if (!pnfs_mark_matching_lsegs_return(lo, &free_me, if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0)) {
&range, lseg->pls_seq)) {
nfs4_stateid stateid; nfs4_stateid stateid;
enum pnfs_iomode iomode = lo->plh_return_iomode; enum pnfs_iomode iomode;
nfs4_stateid_copy(&stateid, &lo->plh_stateid); return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
return_now = pnfs_prepare_layoutreturn(lo);
spin_unlock(&inode->i_lock); spin_unlock(&inode->i_lock);
if (return_now) if (return_now)
pnfs_send_layoutreturn(lo, &stateid, iomode, false); pnfs_send_layoutreturn(lo, &stateid, iomode, false);
@ -2382,7 +2420,10 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
nfs_fattr_init(&data->fattr); nfs_fattr_init(&data->fattr);
data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
data->res.fattr = &data->fattr; data->res.fattr = &data->fattr;
data->args.lastbytewritten = end_pos - 1; if (end_pos != 0)
data->args.lastbytewritten = end_pos - 1;
else
data->args.lastbytewritten = U64_MAX;
data->res.server = NFS_SERVER(inode); data->res.server = NFS_SERVER(inode);
if (ld->prepare_layoutcommit) { if (ld->prepare_layoutcommit) {

View File

@ -268,6 +268,8 @@ int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list, struct list_head *tmp_list,
const struct pnfs_layout_range *recall_range, const struct pnfs_layout_range *recall_range,
u32 seq); u32 seq);
int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
struct list_head *lseg_list);
bool pnfs_roc(struct inode *ino); bool pnfs_roc(struct inode *ino);
void pnfs_roc_release(struct inode *ino); void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
@ -375,6 +377,11 @@ static inline bool nfs_have_layout(struct inode *inode)
return NFS_I(inode)->layout != NULL; return NFS_I(inode)->layout != NULL;
} }
/*
 * Returns true when the NFS_LAYOUT_INVALID_STID bit is NOT set in
 * lo->plh_flags, i.e. the layout header has not been marked as carrying
 * an invalid stateid.
 */
static inline bool pnfs_layout_is_valid(const struct pnfs_layout_hdr *lo)
{
return test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) == 0;
}
static inline struct nfs4_deviceid_node * static inline struct nfs4_deviceid_node *
nfs4_get_deviceid(struct nfs4_deviceid_node *d) nfs4_get_deviceid(struct nfs4_deviceid_node *d)
{ {
@ -545,19 +552,6 @@ pnfs_calc_offset_length(u64 offset, u64 end)
return 1 + end - offset; return 1 + end - offset;
} }
/**
* pnfs_mark_layout_returned_if_empty - marks the layout as returned
* @lo: layout header
*
* If the segment list lo->plh_segs is empty, sets NFS_LAYOUT_INVALID_STID
* in lo->plh_flags; otherwise leaves the flags untouched.
*
* Note: Caller must hold inode->i_lock
*/
static inline void
pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo)
{
if (list_empty(&lo->plh_segs))
set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
}
static inline void static inline void
pnfs_copy_range(struct pnfs_layout_range *dst, pnfs_copy_range(struct pnfs_layout_range *dst,
const struct pnfs_layout_range *src) const struct pnfs_layout_range *src)
@ -628,6 +622,13 @@ pnfs_sync_inode(struct inode *inode, bool datasync)
return 0; return 0;
} }
/*
 * Stub: unconditionally reports that no layoutcommit is outstanding for
 * @inode (the argument is not examined).
 * NOTE(review): presumably the fallback used when pNFS support is compiled
 * out - confirm against the enclosing #ifdef, which is not visible here.
 */
static inline bool
pnfs_layoutcommit_outstanding(struct inode *inode)
{
return false;
}
static inline bool static inline bool
pnfs_roc(struct inode *ino) pnfs_roc(struct inode *ino)
{ {
@ -716,13 +717,6 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
return false; return false;
} }
/*
 * Stub: unconditionally reports that no layoutcommit is outstanding for
 * @inode (the argument is not examined).
 * NOTE(review): presumably the fallback used when pNFS support is compiled
 * out - confirm against the enclosing #ifdef, which is not visible here.
 */
static inline bool
pnfs_layoutcommit_outstanding(struct inode *inode)
{
return false;
}
static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
{ {
return NULL; return NULL;

View File

@ -595,7 +595,7 @@ static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
} }
static struct nfs_client *(*get_v3_ds_connect)( static struct nfs_client *(*get_v3_ds_connect)(
struct nfs_client *mds_clp, struct nfs_server *mds_srv,
const struct sockaddr *ds_addr, const struct sockaddr *ds_addr,
int ds_addrlen, int ds_addrlen,
int ds_proto, int ds_proto,
@ -654,7 +654,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
rpc_clnt_test_and_add_xprt, NULL); rpc_clnt_test_and_add_xprt, NULL);
} else } else
clp = get_v3_ds_connect(mds_srv->nfs_client, clp = get_v3_ds_connect(mds_srv,
(struct sockaddr *)&da->da_addr, (struct sockaddr *)&da->da_addr,
da->da_addrlen, IPPROTO_TCP, da->da_addrlen, IPPROTO_TCP,
timeo, retrans, au_flavor); timeo, retrans, au_flavor);
@ -690,7 +690,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
dprintk("%s: DS %s: trying address %s\n", dprintk("%s: DS %s: trying address %s\n",
__func__, ds->ds_remotestr, da->da_remotestr); __func__, ds->ds_remotestr, da->da_remotestr);
clp = nfs4_set_ds_client(mds_srv->nfs_client, clp = nfs4_set_ds_client(mds_srv,
(struct sockaddr *)&da->da_addr, (struct sockaddr *)&da->da_addr,
da->da_addrlen, IPPROTO_TCP, da->da_addrlen, IPPROTO_TCP,
timeo, retrans, minor_version, timeo, retrans, minor_version,
@ -940,6 +940,13 @@ EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
int int
pnfs_nfs_generic_sync(struct inode *inode, bool datasync) pnfs_nfs_generic_sync(struct inode *inode, bool datasync)
{ {
int ret;
if (!pnfs_layoutcommit_outstanding(inode))
return 0;
ret = nfs_commit_inode(inode, FLUSH_SYNC);
if (ret < 0)
return ret;
if (datasync) if (datasync)
return 0; return 0;
return pnfs_layoutcommit_inode(inode, true); return pnfs_layoutcommit_inode(inode, true);

View File

@ -1684,6 +1684,7 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
{ {
rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR; rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR;
unsigned int i; unsigned int i;
int use_auth_null = false;
/* /*
* If the sec= mount option is used, the specified flavor or AUTH_NULL * If the sec= mount option is used, the specified flavor or AUTH_NULL
@ -1691,14 +1692,21 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
* *
* AUTH_NULL has a special meaning when it's in the server list - it * AUTH_NULL has a special meaning when it's in the server list - it
* means that the server will ignore the rpc creds, so any flavor * means that the server will ignore the rpc creds, so any flavor
* can be used. * can be used but still use the sec= that was specified.
*/ */
for (i = 0; i < count; i++) { for (i = 0; i < count; i++) {
flavor = server_authlist[i]; flavor = server_authlist[i];
if (nfs_auth_info_match(&args->auth_info, flavor) || if (nfs_auth_info_match(&args->auth_info, flavor))
flavor == RPC_AUTH_NULL)
goto out; goto out;
if (flavor == RPC_AUTH_NULL)
use_auth_null = true;
}
if (use_auth_null) {
flavor = RPC_AUTH_NULL;
goto out;
} }
dfprintk(MOUNT, dfprintk(MOUNT,

View File

@ -625,7 +625,7 @@ static int nfs_writepage_locked(struct page *page,
int err; int err;
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), nfs_pageio_init_write(&pgio, inode, 0,
false, &nfs_async_write_completion_ops); false, &nfs_async_write_completion_ops);
err = nfs_do_writepage(page, wbc, &pgio, launder); err = nfs_do_writepage(page, wbc, &pgio, launder);
nfs_pageio_complete(&pgio); nfs_pageio_complete(&pgio);
@ -657,16 +657,9 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{ {
struct inode *inode = mapping->host; struct inode *inode = mapping->host;
unsigned long *bitlock = &NFS_I(inode)->flags;
struct nfs_pageio_descriptor pgio; struct nfs_pageio_descriptor pgio;
int err; int err;
/* Stop dirtying of new pages while we sync */
err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING,
nfs_wait_bit_killable, TASK_KILLABLE);
if (err)
goto out_err;
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false,
@ -674,10 +667,6 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
nfs_pageio_complete(&pgio); nfs_pageio_complete(&pgio);
clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
smp_mb__after_atomic();
wake_up_bit(bitlock, NFS_INO_FLUSHING);
if (err < 0) if (err < 0)
goto out_err; goto out_err;
err = pgio.pg_error; err = pgio.pg_error;
@ -1195,9 +1184,11 @@ nfs_key_timeout_notify(struct file *filp, struct inode *inode)
/* /*
* Test if the open context credential key is marked to expire soon. * Test if the open context credential key is marked to expire soon.
*/ */
bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx) bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode)
{ {
return rpcauth_cred_key_to_expire(ctx->cred); struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth;
return rpcauth_cred_key_to_expire(auth, ctx->cred);
} }
/* /*
@ -1289,6 +1280,9 @@ int nfs_updatepage(struct file *file, struct page *page,
dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n", dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n",
file, count, (long long)(page_file_offset(page) + offset)); file, count, (long long)(page_file_offset(page) + offset));
if (!count)
goto out;
if (nfs_can_extend_write(file, page, inode)) { if (nfs_can_extend_write(file, page, inode)) {
count = max(count + offset, nfs_page_length(page)); count = max(count + offset, nfs_page_length(page));
offset = 0; offset = 0;
@ -1299,7 +1293,7 @@ int nfs_updatepage(struct file *file, struct page *page,
nfs_set_pageerror(page); nfs_set_pageerror(page);
else else
__set_page_dirty_nobuffers(page); __set_page_dirty_nobuffers(page);
out:
dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
status, (long long)i_size_read(inode)); status, (long long)i_size_read(inode));
return status; return status;
@ -1800,7 +1794,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
/* Okay, COMMIT succeeded, apparently. Check the verifier /* Okay, COMMIT succeeded, apparently. Check the verifier
* returned by the server against all stored verfs. */ * returned by the server against all stored verfs. */
if (!memcmp(&req->wb_verf, &data->verf.verifier, sizeof(req->wb_verf))) { if (!nfs_write_verifier_cmp(&req->wb_verf, &data->verf.verifier)) {
/* We have a match */ /* We have a match */
nfs_inode_remove_request(req); nfs_inode_remove_request(req);
dprintk(" OK\n"); dprintk(" OK\n");
@ -1923,6 +1917,24 @@ out_mark_dirty:
} }
EXPORT_SYMBOL_GPL(nfs_write_inode); EXPORT_SYMBOL_GPL(nfs_write_inode);
/*
* Wrapper for filemap_write_and_wait_range()
*
* Needed for pNFS in order to ensure data becomes visible to the
* client.
*
* Calls filemap_write_and_wait_range() on @mapping for the range given by
* @lstart and @lend. Only if that returns 0 does it go on to call
* pnfs_sync_inode(mapping->host, true).
*
* Returns the first non-zero result of those two calls, or 0 when both
* return 0.
*/
int nfs_filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend)
{
int ret;
ret = filemap_write_and_wait_range(mapping, lstart, lend);
/* Skip the pNFS sync step if the writeback itself reported an error */
if (ret == 0)
ret = pnfs_sync_inode(mapping->host, true);
return ret;
}
EXPORT_SYMBOL_GPL(nfs_filemap_write_and_wait_range);
/* /*
* flush the inode to disk. * flush the inode to disk.
*/ */

View File

@ -205,12 +205,12 @@ struct nfs_inode {
#define NFS_INO_STALE (1) /* possible stale inode */ #define NFS_INO_STALE (1) /* possible stale inode */
#define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */
#define NFS_INO_INVALIDATING (3) /* inode is being invalidated */ #define NFS_INO_INVALIDATING (3) /* inode is being invalidated */
#define NFS_INO_FLUSHING (4) /* inode is flushing out data */
#define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */
#define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ #define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */
#define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ #define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */
#define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */ #define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */
#define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */ #define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */
#define NFS_INO_ODIRECT (12) /* I/O setting is O_DIRECT */
static inline struct nfs_inode *NFS_I(const struct inode *inode) static inline struct nfs_inode *NFS_I(const struct inode *inode)
{ {
@ -351,7 +351,6 @@ extern int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *ino
extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping); extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
extern int nfs_revalidate_mapping_rcu(struct inode *inode); extern int nfs_revalidate_mapping_rcu(struct inode *inode);
extern int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping);
extern int nfs_setattr(struct dentry *, struct iattr *); extern int nfs_setattr(struct dentry *, struct iattr *);
extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *); extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *);
extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,

View File

@ -1596,9 +1596,8 @@ struct nfs_rpc_ops {
int (*have_delegation)(struct inode *, fmode_t); int (*have_delegation)(struct inode *, fmode_t);
int (*return_delegation)(struct inode *); int (*return_delegation)(struct inode *);
struct nfs_client *(*alloc_client) (const struct nfs_client_initdata *); struct nfs_client *(*alloc_client) (const struct nfs_client_initdata *);
struct nfs_client * struct nfs_client *(*init_client) (struct nfs_client *,
(*init_client) (struct nfs_client *, const struct rpc_timeout *, const struct nfs_client_initdata *);
const char *);
void (*free_client) (struct nfs_client *); void (*free_client) (struct nfs_client *);
struct nfs_server *(*create_server)(struct nfs_mount_info *, struct nfs_subversion *); struct nfs_server *(*create_server)(struct nfs_mount_info *, struct nfs_subversion *);
struct nfs_server *(*clone_server)(struct nfs_server *, struct nfs_fh *, struct nfs_server *(*clone_server)(struct nfs_server *, struct nfs_fh *,

View File

@ -37,7 +37,6 @@ struct rpcsec_gss_info;
/* auth_cred ac_flags bits */ /* auth_cred ac_flags bits */
enum { enum {
RPC_CRED_NO_CRKEY_TIMEOUT = 0, /* underlying cred has no key timeout */
RPC_CRED_KEY_EXPIRE_SOON = 1, /* underlying cred key will expire soon */ RPC_CRED_KEY_EXPIRE_SOON = 1, /* underlying cred key will expire soon */
RPC_CRED_NOTIFY_TIMEOUT = 2, /* nofity generic cred when underlying RPC_CRED_NOTIFY_TIMEOUT = 2, /* nofity generic cred when underlying
key will expire soon */ key will expire soon */
@ -82,6 +81,9 @@ struct rpc_cred {
#define RPCAUTH_CRED_MAGIC 0x0f4aa4f0 #define RPCAUTH_CRED_MAGIC 0x0f4aa4f0
/* rpc_auth au_flags */
#define RPCAUTH_AUTH_NO_CRKEY_TIMEOUT 0x0001 /* underlying cred has no key timeout */
/* /*
* Client authentication handle * Client authentication handle
*/ */
@ -107,6 +109,9 @@ struct rpc_auth {
/* per-flavor data */ /* per-flavor data */
}; };
/* rpc_auth au_flags */
#define RPCAUTH_AUTH_DATATOUCH 0x00000002
struct rpc_auth_create_args { struct rpc_auth_create_args {
rpc_authflavor_t pseudoflavor; rpc_authflavor_t pseudoflavor;
const char *target_name; const char *target_name;
@ -196,7 +201,7 @@ void rpcauth_destroy_credcache(struct rpc_auth *);
void rpcauth_clear_credcache(struct rpc_cred_cache *); void rpcauth_clear_credcache(struct rpc_cred_cache *);
int rpcauth_key_timeout_notify(struct rpc_auth *, int rpcauth_key_timeout_notify(struct rpc_auth *,
struct rpc_cred *); struct rpc_cred *);
bool rpcauth_cred_key_to_expire(struct rpc_cred *); bool rpcauth_cred_key_to_expire(struct rpc_auth *, struct rpc_cred *);
char * rpcauth_stringify_acceptor(struct rpc_cred *); char * rpcauth_stringify_acceptor(struct rpc_cred *);
static inline static inline

View File

@ -73,6 +73,7 @@ u32 gss_delete_sec_context(
rpc_authflavor_t gss_svc_to_pseudoflavor(struct gss_api_mech *, u32 qop, rpc_authflavor_t gss_svc_to_pseudoflavor(struct gss_api_mech *, u32 qop,
u32 service); u32 service);
u32 gss_pseudoflavor_to_service(struct gss_api_mech *, u32 pseudoflavor); u32 gss_pseudoflavor_to_service(struct gss_api_mech *, u32 pseudoflavor);
bool gss_pseudoflavor_to_datatouch(struct gss_api_mech *, u32 pseudoflavor);
char *gss_service_to_auth_domain_name(struct gss_api_mech *, u32 service); char *gss_service_to_auth_domain_name(struct gss_api_mech *, u32 service);
struct pf_desc { struct pf_desc {
@ -81,6 +82,7 @@ struct pf_desc {
u32 service; u32 service;
char *name; char *name;
char *auth_domain_name; char *auth_domain_name;
bool datatouch;
}; };
/* Different mechanisms (e.g., krb5 or spkm3) may implement gss-api, and /* Different mechanisms (e.g., krb5 or spkm3) may implement gss-api, and

View File

@ -230,6 +230,10 @@ void rpc_wake_up_queued_task(struct rpc_wait_queue *,
struct rpc_task *); struct rpc_task *);
void rpc_wake_up(struct rpc_wait_queue *); void rpc_wake_up(struct rpc_wait_queue *);
struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *); struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *);
struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq,
struct rpc_wait_queue *,
bool (*)(struct rpc_task *, void *),
void *);
struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *, struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *,
bool (*)(struct rpc_task *, void *), bool (*)(struct rpc_task *, void *),
void *); void *);
@ -247,6 +251,7 @@ void rpc_show_tasks(struct net *);
int rpc_init_mempool(void); int rpc_init_mempool(void);
void rpc_destroy_mempool(void); void rpc_destroy_mempool(void);
extern struct workqueue_struct *rpciod_workqueue; extern struct workqueue_struct *rpciod_workqueue;
extern struct workqueue_struct *xprtiod_workqueue;
void rpc_prepare_task(struct rpc_task *task); void rpc_prepare_task(struct rpc_task *task);
static inline int rpc_wait_for_completion_task(struct rpc_task *task) static inline int rpc_wait_for_completion_task(struct rpc_task *task)

View File

@ -80,6 +80,7 @@ struct sock_xprt {
#define TCP_RPC_REPLY (1UL << 6) #define TCP_RPC_REPLY (1UL << 6)
#define XPRT_SOCK_CONNECTING 1U #define XPRT_SOCK_CONNECTING 1U
#define XPRT_SOCK_DATA_READY (2)
#endif /* __KERNEL__ */ #endif /* __KERNEL__ */

View File

@ -51,9 +51,7 @@ static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp)
ret = kstrtoul(val, 0, &num); ret = kstrtoul(val, 0, &num);
if (ret == -EINVAL) if (ret == -EINVAL)
goto out_inval; goto out_inval;
nbits = fls(num); nbits = fls(num - 1);
if (num > (1U << nbits))
nbits++;
if (nbits > MAX_HASHTABLE_BITS || nbits < 2) if (nbits > MAX_HASHTABLE_BITS || nbits < 2)
goto out_inval; goto out_inval;
*(unsigned int *)kp->arg = nbits; *(unsigned int *)kp->arg = nbits;
@ -359,8 +357,10 @@ rpcauth_key_timeout_notify(struct rpc_auth *auth, struct rpc_cred *cred)
EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify); EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify);
bool bool
rpcauth_cred_key_to_expire(struct rpc_cred *cred) rpcauth_cred_key_to_expire(struct rpc_auth *auth, struct rpc_cred *cred)
{ {
if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT)
return false;
if (!cred->cr_ops->crkey_to_expire) if (!cred->cr_ops->crkey_to_expire)
return false; return false;
return cred->cr_ops->crkey_to_expire(cred); return cred->cr_ops->crkey_to_expire(cred);

View File

@ -224,7 +224,7 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
/* Fast track for non crkey_timeout (no key) underlying credentials */ /* Fast track for non crkey_timeout (no key) underlying credentials */
if (test_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags)) if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT)
return 0; return 0;
/* Fast track for the normal case */ /* Fast track for the normal case */
@ -236,12 +236,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
if (IS_ERR(tcred)) if (IS_ERR(tcred))
return -EACCES; return -EACCES;
if (!tcred->cr_ops->crkey_timeout) {
set_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags);
ret = 0;
goto out_put;
}
/* Test for the almost error case */ /* Test for the almost error case */
ret = tcred->cr_ops->crkey_timeout(tcred); ret = tcred->cr_ops->crkey_timeout(tcred);
if (ret != 0) { if (ret != 0) {
@ -257,7 +251,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags); set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags);
} }
out_put:
put_rpccred(tcred); put_rpccred(tcred);
return ret; return ret;
} }

View File

@ -1015,8 +1015,11 @@ gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
auth = &gss_auth->rpc_auth; auth = &gss_auth->rpc_auth;
auth->au_cslack = GSS_CRED_SLACK >> 2; auth->au_cslack = GSS_CRED_SLACK >> 2;
auth->au_rslack = GSS_VERF_SLACK >> 2; auth->au_rslack = GSS_VERF_SLACK >> 2;
auth->au_flags = 0;
auth->au_ops = &authgss_ops; auth->au_ops = &authgss_ops;
auth->au_flavor = flavor; auth->au_flavor = flavor;
if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor))
auth->au_flags |= RPCAUTH_AUTH_DATATOUCH;
atomic_set(&auth->au_count, 1); atomic_set(&auth->au_count, 1);
kref_init(&gss_auth->kref); kref_init(&gss_auth->kref);

View File

@ -745,12 +745,14 @@ static struct pf_desc gss_kerberos_pfs[] = {
.qop = GSS_C_QOP_DEFAULT, .qop = GSS_C_QOP_DEFAULT,
.service = RPC_GSS_SVC_INTEGRITY, .service = RPC_GSS_SVC_INTEGRITY,
.name = "krb5i", .name = "krb5i",
.datatouch = true,
}, },
[2] = { [2] = {
.pseudoflavor = RPC_AUTH_GSS_KRB5P, .pseudoflavor = RPC_AUTH_GSS_KRB5P,
.qop = GSS_C_QOP_DEFAULT, .qop = GSS_C_QOP_DEFAULT,
.service = RPC_GSS_SVC_PRIVACY, .service = RPC_GSS_SVC_PRIVACY,
.name = "krb5p", .name = "krb5p",
.datatouch = true,
}, },
}; };

View File

@ -361,6 +361,18 @@ gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor)
} }
EXPORT_SYMBOL(gss_pseudoflavor_to_service); EXPORT_SYMBOL(gss_pseudoflavor_to_service);
bool
gss_pseudoflavor_to_datatouch(struct gss_api_mech *gm, u32 pseudoflavor)
{
int i;
for (i = 0; i < gm->gm_pf_num; i++) {
if (gm->gm_pfs[i].pseudoflavor == pseudoflavor)
return gm->gm_pfs[i].datatouch;
}
return false;
}
char * char *
gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service) gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service)
{ {

View File

@ -115,6 +115,7 @@ static
struct rpc_auth null_auth = { struct rpc_auth null_auth = {
.au_cslack = NUL_CALLSLACK, .au_cslack = NUL_CALLSLACK,
.au_rslack = NUL_REPLYSLACK, .au_rslack = NUL_REPLYSLACK,
.au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT,
.au_ops = &authnull_ops, .au_ops = &authnull_ops,
.au_flavor = RPC_AUTH_NULL, .au_flavor = RPC_AUTH_NULL,
.au_count = ATOMIC_INIT(0), .au_count = ATOMIC_INIT(0),

View File

@ -228,6 +228,7 @@ static
struct rpc_auth unix_auth = { struct rpc_auth unix_auth = {
.au_cslack = UNX_CALLSLACK, .au_cslack = UNX_CALLSLACK,
.au_rslack = NUL_REPLYSLACK, .au_rslack = NUL_REPLYSLACK,
.au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT,
.au_ops = &authunix_ops, .au_ops = &authunix_ops,
.au_flavor = RPC_AUTH_UNIX, .au_flavor = RPC_AUTH_UNIX,
.au_count = ATOMIC_INIT(0), .au_count = ATOMIC_INIT(0),

View File

@ -2577,7 +2577,7 @@ static void rpc_cb_add_xprt_release(void *calldata)
kfree(data); kfree(data);
} }
const static struct rpc_call_ops rpc_cb_add_xprt_call_ops = { static const struct rpc_call_ops rpc_cb_add_xprt_call_ops = {
.rpc_call_done = rpc_cb_add_xprt_done, .rpc_call_done = rpc_cb_add_xprt_done,
.rpc_release = rpc_cb_add_xprt_release, .rpc_release = rpc_cb_add_xprt_release,
}; };

View File

@ -54,7 +54,8 @@ static struct rpc_wait_queue delay_queue;
/* /*
* rpciod-related stuff * rpciod-related stuff
*/ */
struct workqueue_struct *rpciod_workqueue; struct workqueue_struct *rpciod_workqueue __read_mostly;
struct workqueue_struct *xprtiod_workqueue __read_mostly;
/* /*
* Disable the timer for a given RPC task. Should be called with * Disable the timer for a given RPC task. Should be called with
@ -329,7 +330,8 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task);
* lockless RPC_IS_QUEUED() test) before we've had a chance to test * lockless RPC_IS_QUEUED() test) before we've had a chance to test
* the RPC_TASK_RUNNING flag. * the RPC_TASK_RUNNING flag.
*/ */
static void rpc_make_runnable(struct rpc_task *task) static void rpc_make_runnable(struct workqueue_struct *wq,
struct rpc_task *task)
{ {
bool need_wakeup = !rpc_test_and_set_running(task); bool need_wakeup = !rpc_test_and_set_running(task);
@ -338,7 +340,7 @@ static void rpc_make_runnable(struct rpc_task *task)
return; return;
if (RPC_IS_ASYNC(task)) { if (RPC_IS_ASYNC(task)) {
INIT_WORK(&task->u.tk_work, rpc_async_schedule); INIT_WORK(&task->u.tk_work, rpc_async_schedule);
queue_work(rpciod_workqueue, &task->u.tk_work); queue_work(wq, &task->u.tk_work);
} else } else
wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED); wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
} }
@ -407,13 +409,16 @@ void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task,
EXPORT_SYMBOL_GPL(rpc_sleep_on_priority); EXPORT_SYMBOL_GPL(rpc_sleep_on_priority);
/** /**
* __rpc_do_wake_up_task - wake up a single rpc_task * __rpc_do_wake_up_task_on_wq - wake up a single rpc_task
* @wq: workqueue on which to run task
* @queue: wait queue * @queue: wait queue
* @task: task to be woken up * @task: task to be woken up
* *
* Caller must hold queue->lock, and have cleared the task queued flag. * Caller must hold queue->lock, and have cleared the task queued flag.
*/ */
static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task *task) static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq,
struct rpc_wait_queue *queue,
struct rpc_task *task)
{ {
dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n", dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n",
task->tk_pid, jiffies); task->tk_pid, jiffies);
@ -428,7 +433,7 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task
__rpc_remove_wait_queue(queue, task); __rpc_remove_wait_queue(queue, task);
rpc_make_runnable(task); rpc_make_runnable(wq, task);
dprintk("RPC: __rpc_wake_up_task done\n"); dprintk("RPC: __rpc_wake_up_task done\n");
} }
@ -436,15 +441,24 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task
/* /*
* Wake up a queued task while the queue lock is being held * Wake up a queued task while the queue lock is being held
*/ */
static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) static void rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq,
struct rpc_wait_queue *queue, struct rpc_task *task)
{ {
if (RPC_IS_QUEUED(task)) { if (RPC_IS_QUEUED(task)) {
smp_rmb(); smp_rmb();
if (task->tk_waitqueue == queue) if (task->tk_waitqueue == queue)
__rpc_do_wake_up_task(queue, task); __rpc_do_wake_up_task_on_wq(wq, queue, task);
} }
} }
/*
* Wake up a queued task while the queue lock is being held
*/
static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
{
rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task);
}
/* /*
* Wake up a task on a specific queue * Wake up a task on a specific queue
*/ */
@ -518,7 +532,8 @@ static struct rpc_task *__rpc_find_next_queued(struct rpc_wait_queue *queue)
/* /*
* Wake up the first task on the wait queue. * Wake up the first task on the wait queue.
*/ */
struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq,
struct rpc_wait_queue *queue,
bool (*func)(struct rpc_task *, void *), void *data) bool (*func)(struct rpc_task *, void *), void *data)
{ {
struct rpc_task *task = NULL; struct rpc_task *task = NULL;
@ -529,7 +544,7 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
task = __rpc_find_next_queued(queue); task = __rpc_find_next_queued(queue);
if (task != NULL) { if (task != NULL) {
if (func(task, data)) if (func(task, data))
rpc_wake_up_task_queue_locked(queue, task); rpc_wake_up_task_on_wq_queue_locked(wq, queue, task);
else else
task = NULL; task = NULL;
} }
@ -537,6 +552,15 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
return task; return task;
} }
/*
* Wake up the first task on the wait queue.
*/
struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
bool (*func)(struct rpc_task *, void *), void *data)
{
return rpc_wake_up_first_on_wq(rpciod_workqueue, queue, func, data);
}
EXPORT_SYMBOL_GPL(rpc_wake_up_first); EXPORT_SYMBOL_GPL(rpc_wake_up_first);
static bool rpc_wake_up_next_func(struct rpc_task *task, void *data) static bool rpc_wake_up_next_func(struct rpc_task *task, void *data)
@ -814,7 +838,7 @@ void rpc_execute(struct rpc_task *task)
bool is_async = RPC_IS_ASYNC(task); bool is_async = RPC_IS_ASYNC(task);
rpc_set_active(task); rpc_set_active(task);
rpc_make_runnable(task); rpc_make_runnable(rpciod_workqueue, task);
if (!is_async) if (!is_async)
__rpc_execute(task); __rpc_execute(task);
} }
@ -1071,10 +1095,22 @@ static int rpciod_start(void)
* Create the rpciod thread and wait for it to start. * Create the rpciod thread and wait for it to start.
*/ */
dprintk("RPC: creating workqueue rpciod\n"); dprintk("RPC: creating workqueue rpciod\n");
/* Note: highpri because network receive is latency sensitive */ wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 0);
wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); if (!wq)
goto out_failed;
rpciod_workqueue = wq; rpciod_workqueue = wq;
return rpciod_workqueue != NULL; /* Note: highpri because network receive is latency sensitive */
wq = alloc_workqueue("xprtiod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
if (!wq)
goto free_rpciod;
xprtiod_workqueue = wq;
return 1;
free_rpciod:
wq = rpciod_workqueue;
rpciod_workqueue = NULL;
destroy_workqueue(wq);
out_failed:
return 0;
} }
static void rpciod_stop(void) static void rpciod_stop(void)
@ -1088,6 +1124,9 @@ static void rpciod_stop(void)
wq = rpciod_workqueue; wq = rpciod_workqueue;
rpciod_workqueue = NULL; rpciod_workqueue = NULL;
destroy_workqueue(wq); destroy_workqueue(wq);
wq = xprtiod_workqueue;
xprtiod_workqueue = NULL;
destroy_workqueue(wq);
} }
void void

View File

@ -1188,11 +1188,17 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
*statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); *statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
/* Encode reply */ /* Encode reply */
if (test_bit(RQ_DROPME, &rqstp->rq_flags)) { if (*statp == rpc_drop_reply ||
test_bit(RQ_DROPME, &rqstp->rq_flags)) {
if (procp->pc_release) if (procp->pc_release)
procp->pc_release(rqstp, NULL, rqstp->rq_resp); procp->pc_release(rqstp, NULL, rqstp->rq_resp);
goto dropit; goto dropit;
} }
if (*statp == rpc_autherr_badcred) {
if (procp->pc_release)
procp->pc_release(rqstp, NULL, rqstp->rq_resp);
goto err_bad_auth;
}
if (*statp == rpc_success && if (*statp == rpc_success &&
(xdr = procp->pc_encode) && (xdr = procp->pc_encode) &&
!xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) { !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) {

View File

@ -220,7 +220,7 @@ static void xprt_clear_locked(struct rpc_xprt *xprt)
clear_bit(XPRT_LOCKED, &xprt->state); clear_bit(XPRT_LOCKED, &xprt->state);
smp_mb__after_atomic(); smp_mb__after_atomic();
} else } else
queue_work(rpciod_workqueue, &xprt->task_cleanup); queue_work(xprtiod_workqueue, &xprt->task_cleanup);
} }
/* /*
@ -295,7 +295,8 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt)
if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
return; return;
if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_func, xprt)) if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending,
__xprt_lock_write_func, xprt))
return; return;
xprt_clear_locked(xprt); xprt_clear_locked(xprt);
} }
@ -324,7 +325,8 @@ static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt)
return; return;
if (RPCXPRT_CONGESTED(xprt)) if (RPCXPRT_CONGESTED(xprt))
goto out_unlock; goto out_unlock;
if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_cong_func, xprt)) if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending,
__xprt_lock_write_cong_func, xprt))
return; return;
out_unlock: out_unlock:
xprt_clear_locked(xprt); xprt_clear_locked(xprt);
@ -645,7 +647,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt)
set_bit(XPRT_CLOSE_WAIT, &xprt->state); set_bit(XPRT_CLOSE_WAIT, &xprt->state);
/* Try to schedule an autoclose RPC call */ /* Try to schedule an autoclose RPC call */
if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
queue_work(rpciod_workqueue, &xprt->task_cleanup); queue_work(xprtiod_workqueue, &xprt->task_cleanup);
xprt_wake_pending_tasks(xprt, -EAGAIN); xprt_wake_pending_tasks(xprt, -EAGAIN);
spin_unlock_bh(&xprt->transport_lock); spin_unlock_bh(&xprt->transport_lock);
} }
@ -672,7 +674,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie)
set_bit(XPRT_CLOSE_WAIT, &xprt->state); set_bit(XPRT_CLOSE_WAIT, &xprt->state);
/* Try to schedule an autoclose RPC call */ /* Try to schedule an autoclose RPC call */
if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
queue_work(rpciod_workqueue, &xprt->task_cleanup); queue_work(xprtiod_workqueue, &xprt->task_cleanup);
xprt_wake_pending_tasks(xprt, -EAGAIN); xprt_wake_pending_tasks(xprt, -EAGAIN);
out: out:
spin_unlock_bh(&xprt->transport_lock); spin_unlock_bh(&xprt->transport_lock);
@ -689,7 +691,7 @@ xprt_init_autodisconnect(unsigned long data)
if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
goto out_abort; goto out_abort;
spin_unlock(&xprt->transport_lock); spin_unlock(&xprt->transport_lock);
queue_work(rpciod_workqueue, &xprt->task_cleanup); queue_work(xprtiod_workqueue, &xprt->task_cleanup);
return; return;
out_abort: out_abort:
spin_unlock(&xprt->transport_lock); spin_unlock(&xprt->transport_lock);

View File

@ -271,14 +271,12 @@ struct rpc_xprt *xprt_iter_next_entry_multiple(struct rpc_xprt_iter *xpi,
xprt_switch_find_xprt_t find_next) xprt_switch_find_xprt_t find_next)
{ {
struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch); struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch);
struct list_head *head;
if (xps == NULL) if (xps == NULL)
return NULL; return NULL;
head = &xps->xps_xprt_list; return xprt_switch_set_next_cursor(&xps->xps_xprt_list,
if (xps->xps_nxprts < 2) &xpi->xpi_cursor,
return xprt_switch_find_first_entry(head); find_next);
return xprt_switch_set_next_cursor(head, &xpi->xpi_cursor, find_next);
} }
static static

View File

@ -1,7 +1,7 @@
obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
rpcrdma-y := transport.o rpc_rdma.o verbs.o \ rpcrdma-y := transport.o rpc_rdma.o verbs.o \
fmr_ops.o frwr_ops.o physical_ops.o \ fmr_ops.o frwr_ops.o \
svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
module.o module.o

View File

@ -19,13 +19,6 @@
* verb (fmr_op_unmap). * verb (fmr_op_unmap).
*/ */
/* Transport recovery
*
* After a transport reconnect, fmr_op_map re-uses the MR already
* allocated for the RPC, but generates a fresh rkey then maps the
* MR again. This process is synchronous.
*/
#include "xprt_rdma.h" #include "xprt_rdma.h"
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@ -35,62 +28,132 @@
/* Maximum scatter/gather per FMR */ /* Maximum scatter/gather per FMR */
#define RPCRDMA_MAX_FMR_SGES (64) #define RPCRDMA_MAX_FMR_SGES (64)
static struct workqueue_struct *fmr_recovery_wq; /* Access mode of externally registered pages */
enum {
RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE |
IB_ACCESS_REMOTE_READ,
};
#define FMR_RECOVERY_WQ_FLAGS (WQ_UNBOUND) bool
fmr_is_supported(struct rpcrdma_ia *ia)
int
fmr_alloc_recovery_wq(void)
{ {
fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0); if (!ia->ri_device->alloc_fmr) {
return !fmr_recovery_wq ? -ENOMEM : 0; pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n",
ia->ri_device->name);
return false;
}
return true;
} }
void static int
fmr_destroy_recovery_wq(void) fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw)
{ {
struct workqueue_struct *wq; static struct ib_fmr_attr fmr_attr = {
.max_pages = RPCRDMA_MAX_FMR_SGES,
.max_maps = 1,
.page_shift = PAGE_SHIFT
};
if (!fmr_recovery_wq) mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
return; sizeof(u64), GFP_KERNEL);
if (!mw->fmr.fm_physaddrs)
goto out_free;
wq = fmr_recovery_wq; mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
fmr_recovery_wq = NULL; sizeof(*mw->mw_sg), GFP_KERNEL);
destroy_workqueue(wq); if (!mw->mw_sg)
goto out_free;
sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES);
mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
&fmr_attr);
if (IS_ERR(mw->fmr.fm_mr))
goto out_fmr_err;
return 0;
out_fmr_err:
dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__,
PTR_ERR(mw->fmr.fm_mr));
out_free:
kfree(mw->mw_sg);
kfree(mw->fmr.fm_physaddrs);
return -ENOMEM;
} }
static int static int
__fmr_unmap(struct rpcrdma_mw *mw) __fmr_unmap(struct rpcrdma_mw *mw)
{ {
LIST_HEAD(l); LIST_HEAD(l);
int rc;
list_add(&mw->fmr.fmr->list, &l); list_add(&mw->fmr.fm_mr->list, &l);
return ib_unmap_fmr(&l); rc = ib_unmap_fmr(&l);
list_del_init(&mw->fmr.fm_mr->list);
return rc;
} }
/* Deferred reset of a single FMR. Generate a fresh rkey by static void
* replacing the MR. There's no recovery if this fails. fmr_op_release_mr(struct rpcrdma_mw *r)
{
LIST_HEAD(unmap_list);
int rc;
/* Ensure MW is not on any rl_registered list */
if (!list_empty(&r->mw_list))
list_del(&r->mw_list);
kfree(r->fmr.fm_physaddrs);
kfree(r->mw_sg);
/* In case this one was left mapped, try to unmap it
* to prevent dealloc_fmr from failing with EBUSY
*/
rc = __fmr_unmap(r);
if (rc)
pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n",
r, rc);
rc = ib_dealloc_fmr(r->fmr.fm_mr);
if (rc)
pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n",
r, rc);
kfree(r);
}
/* Reset of a single FMR.
*/ */
static void static void
__fmr_recovery_worker(struct work_struct *work) fmr_op_recover_mr(struct rpcrdma_mw *mw)
{ {
struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw,
mw_work);
struct rpcrdma_xprt *r_xprt = mw->mw_xprt; struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
int rc;
/* ORDER: invalidate first */
rc = __fmr_unmap(mw);
/* ORDER: then DMA unmap */
ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
mw->mw_sg, mw->mw_nents, mw->mw_dir);
if (rc)
goto out_release;
__fmr_unmap(mw);
rpcrdma_put_mw(r_xprt, mw); rpcrdma_put_mw(r_xprt, mw);
r_xprt->rx_stats.mrs_recovered++;
return; return;
}
/* A broken MR was discovered in a context that can't sleep. out_release:
* Defer recovery to the recovery worker. pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw);
*/ r_xprt->rx_stats.mrs_orphaned++;
static void
__fmr_queue_recovery(struct rpcrdma_mw *mw) spin_lock(&r_xprt->rx_buf.rb_mwlock);
{ list_del(&mw->mw_all);
INIT_WORK(&mw->mw_work, __fmr_recovery_worker); spin_unlock(&r_xprt->rx_buf.rb_mwlock);
queue_work(fmr_recovery_wq, &mw->mw_work);
fmr_op_release_mr(mw);
} }
static int static int
@ -112,86 +175,21 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES); RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
} }
static int
fmr_op_init(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
struct ib_fmr_attr fmr_attr = {
.max_pages = RPCRDMA_MAX_FMR_SGES,
.max_maps = 1,
.page_shift = PAGE_SHIFT
};
struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
struct rpcrdma_mw *r;
int i, rc;
spin_lock_init(&buf->rb_mwlock);
INIT_LIST_HEAD(&buf->rb_mws);
INIT_LIST_HEAD(&buf->rb_all);
i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1);
i += 2; /* head + tail */
i *= buf->rb_max_requests; /* one set for each RPC slot */
dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i);
rc = -ENOMEM;
while (i--) {
r = kzalloc(sizeof(*r), GFP_KERNEL);
if (!r)
goto out;
r->fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES *
sizeof(u64), GFP_KERNEL);
if (!r->fmr.physaddrs)
goto out_free;
r->fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
if (IS_ERR(r->fmr.fmr))
goto out_fmr_err;
r->mw_xprt = r_xprt;
list_add(&r->mw_list, &buf->rb_mws);
list_add(&r->mw_all, &buf->rb_all);
}
return 0;
out_fmr_err:
rc = PTR_ERR(r->fmr.fmr);
dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc);
kfree(r->fmr.physaddrs);
out_free:
kfree(r);
out:
return rc;
}
/* Use the ib_map_phys_fmr() verb to register a memory region /* Use the ib_map_phys_fmr() verb to register a memory region
* for remote access via RDMA READ or RDMA WRITE. * for remote access via RDMA READ or RDMA WRITE.
*/ */
static int static int
fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
int nsegs, bool writing) int nsegs, bool writing, struct rpcrdma_mw **out)
{ {
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct ib_device *device = ia->ri_device;
enum dma_data_direction direction = rpcrdma_data_dir(writing);
struct rpcrdma_mr_seg *seg1 = seg; struct rpcrdma_mr_seg *seg1 = seg;
int len, pageoff, i, rc; int len, pageoff, i, rc;
struct rpcrdma_mw *mw; struct rpcrdma_mw *mw;
u64 *dma_pages;
mw = seg1->rl_mw; mw = rpcrdma_get_mw(r_xprt);
seg1->rl_mw = NULL; if (!mw)
if (!mw) { return -ENOBUFS;
mw = rpcrdma_get_mw(r_xprt);
if (!mw)
return -ENOMEM;
} else {
/* this is a retransmit; generate a fresh rkey */
rc = __fmr_unmap(mw);
if (rc)
return rc;
}
pageoff = offset_in_page(seg1->mr_offset); pageoff = offset_in_page(seg1->mr_offset);
seg1->mr_offset -= pageoff; /* start of page */ seg1->mr_offset -= pageoff; /* start of page */
@ -200,8 +198,14 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
if (nsegs > RPCRDMA_MAX_FMR_SGES) if (nsegs > RPCRDMA_MAX_FMR_SGES)
nsegs = RPCRDMA_MAX_FMR_SGES; nsegs = RPCRDMA_MAX_FMR_SGES;
for (i = 0; i < nsegs;) { for (i = 0; i < nsegs;) {
rpcrdma_map_one(device, seg, direction); if (seg->mr_page)
mw->fmr.physaddrs[i] = seg->mr_dma; sg_set_page(&mw->mw_sg[i],
seg->mr_page,
seg->mr_len,
offset_in_page(seg->mr_offset));
else
sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
seg->mr_len);
len += seg->mr_len; len += seg->mr_len;
++seg; ++seg;
++i; ++i;
@ -210,49 +214,54 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break; break;
} }
mw->mw_nents = i;
mw->mw_dir = rpcrdma_data_dir(writing);
if (i == 0)
goto out_dmamap_err;
rc = ib_map_phys_fmr(mw->fmr.fmr, mw->fmr.physaddrs, if (!ib_dma_map_sg(r_xprt->rx_ia.ri_device,
i, seg1->mr_dma); mw->mw_sg, mw->mw_nents, mw->mw_dir))
goto out_dmamap_err;
for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++)
dma_pages[i] = sg_dma_address(&mw->mw_sg[i]);
rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents,
dma_pages[0]);
if (rc) if (rc)
goto out_maperr; goto out_maperr;
seg1->rl_mw = mw; mw->mw_handle = mw->fmr.fm_mr->rkey;
seg1->mr_rkey = mw->fmr.fmr->rkey; mw->mw_length = len;
seg1->mr_base = seg1->mr_dma + pageoff; mw->mw_offset = dma_pages[0] + pageoff;
seg1->mr_nsegs = i;
seg1->mr_len = len; *out = mw;
return i; return mw->mw_nents;
out_dmamap_err:
pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
mw->mw_sg, mw->mw_nents);
rpcrdma_defer_mr_recovery(mw);
return -EIO;
out_maperr: out_maperr:
dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
__func__, len, (unsigned long long)seg1->mr_dma, len, (unsigned long long)dma_pages[0],
pageoff, i, rc); pageoff, mw->mw_nents, rc);
while (i--) rpcrdma_defer_mr_recovery(mw);
rpcrdma_unmap_one(device, --seg); return -EIO;
return rc;
}
static void
__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
{
struct ib_device *device = r_xprt->rx_ia.ri_device;
int nsegs = seg->mr_nsegs;
while (nsegs--)
rpcrdma_unmap_one(device, seg++);
} }
/* Invalidate all memory regions that were registered for "req". /* Invalidate all memory regions that were registered for "req".
* *
* Sleeps until it is safe for the host CPU to access the * Sleeps until it is safe for the host CPU to access the
* previously mapped memory regions. * previously mapped memory regions.
*
* Caller ensures that req->rl_registered is not empty.
*/ */
static void static void
fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{ {
struct rpcrdma_mr_seg *seg; struct rpcrdma_mw *mw, *tmp;
unsigned int i, nchunks;
struct rpcrdma_mw *mw;
LIST_HEAD(unmap_list); LIST_HEAD(unmap_list);
int rc; int rc;
@ -261,90 +270,54 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/* ORDER: Invalidate all of the req's MRs first /* ORDER: Invalidate all of the req's MRs first
* *
* ib_unmap_fmr() is slow, so use a single call instead * ib_unmap_fmr() is slow, so use a single call instead
* of one call per mapped MR. * of one call per mapped FMR.
*/ */
for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { list_for_each_entry(mw, &req->rl_registered, mw_list)
seg = &req->rl_segments[i]; list_add_tail(&mw->fmr.fm_mr->list, &unmap_list);
mw = seg->rl_mw;
list_add(&mw->fmr.fmr->list, &unmap_list);
i += seg->mr_nsegs;
}
rc = ib_unmap_fmr(&unmap_list); rc = ib_unmap_fmr(&unmap_list);
if (rc) if (rc)
pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc); goto out_reset;
/* ORDER: Now DMA unmap all of the req's MRs, and return /* ORDER: Now DMA unmap all of the req's MRs, and return
* them to the free MW list. * them to the free MW list.
*/ */
for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
seg = &req->rl_segments[i]; list_del_init(&mw->mw_list);
list_del_init(&mw->fmr.fm_mr->list);
__fmr_dma_unmap(r_xprt, seg); ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
rpcrdma_put_mw(r_xprt, seg->rl_mw); mw->mw_sg, mw->mw_nents, mw->mw_dir);
rpcrdma_put_mw(r_xprt, mw);
i += seg->mr_nsegs;
seg->mr_nsegs = 0;
seg->rl_mw = NULL;
} }
req->rl_nchunks = 0; return;
out_reset:
pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc);
list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
list_del_init(&mw->fmr.fm_mr->list);
fmr_op_recover_mr(mw);
}
} }
/* Use a slow, safe mechanism to invalidate all memory regions /* Use a slow, safe mechanism to invalidate all memory regions
* that were registered for "req". * that were registered for "req".
*
* In the asynchronous case, DMA unmapping occurs first here
* because the rpcrdma_mr_seg is released immediately after this
* call. It's contents won't be available in __fmr_dma_unmap later.
* FIXME.
*/ */
static void static void
fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
bool sync) bool sync)
{ {
struct rpcrdma_mr_seg *seg;
struct rpcrdma_mw *mw; struct rpcrdma_mw *mw;
unsigned int i;
for (i = 0; req->rl_nchunks; req->rl_nchunks--) { while (!list_empty(&req->rl_registered)) {
seg = &req->rl_segments[i]; mw = list_first_entry(&req->rl_registered,
mw = seg->rl_mw; struct rpcrdma_mw, mw_list);
list_del_init(&mw->mw_list);
if (sync) { if (sync)
/* ORDER */ fmr_op_recover_mr(mw);
__fmr_unmap(mw); else
__fmr_dma_unmap(r_xprt, seg); rpcrdma_defer_mr_recovery(mw);
rpcrdma_put_mw(r_xprt, mw);
} else {
__fmr_dma_unmap(r_xprt, seg);
__fmr_queue_recovery(mw);
}
i += seg->mr_nsegs;
seg->mr_nsegs = 0;
seg->rl_mw = NULL;
}
}
static void
fmr_op_destroy(struct rpcrdma_buffer *buf)
{
struct rpcrdma_mw *r;
int rc;
while (!list_empty(&buf->rb_all)) {
r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
list_del(&r->mw_all);
kfree(r->fmr.physaddrs);
rc = ib_dealloc_fmr(r->fmr.fmr);
if (rc)
dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
__func__, rc);
kfree(r);
} }
} }
@ -352,9 +325,10 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
.ro_map = fmr_op_map, .ro_map = fmr_op_map,
.ro_unmap_sync = fmr_op_unmap_sync, .ro_unmap_sync = fmr_op_unmap_sync,
.ro_unmap_safe = fmr_op_unmap_safe, .ro_unmap_safe = fmr_op_unmap_safe,
.ro_recover_mr = fmr_op_recover_mr,
.ro_open = fmr_op_open, .ro_open = fmr_op_open,
.ro_maxpages = fmr_op_maxpages, .ro_maxpages = fmr_op_maxpages,
.ro_init = fmr_op_init, .ro_init_mr = fmr_op_init_mr,
.ro_destroy = fmr_op_destroy, .ro_release_mr = fmr_op_release_mr,
.ro_displayname = "fmr", .ro_displayname = "fmr",
}; };

View File

@ -73,29 +73,71 @@
# define RPCDBG_FACILITY RPCDBG_TRANS # define RPCDBG_FACILITY RPCDBG_TRANS
#endif #endif
static struct workqueue_struct *frwr_recovery_wq; bool
frwr_is_supported(struct rpcrdma_ia *ia)
#define FRWR_RECOVERY_WQ_FLAGS (WQ_UNBOUND | WQ_MEM_RECLAIM)
int
frwr_alloc_recovery_wq(void)
{ {
frwr_recovery_wq = alloc_workqueue("frwr_recovery", struct ib_device_attr *attrs = &ia->ri_device->attrs;
FRWR_RECOVERY_WQ_FLAGS, 0);
return !frwr_recovery_wq ? -ENOMEM : 0; if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
goto out_not_supported;
if (attrs->max_fast_reg_page_list_len == 0)
goto out_not_supported;
return true;
out_not_supported:
pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n",
ia->ri_device->name);
return false;
} }
void static int
frwr_destroy_recovery_wq(void) frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
{ {
struct workqueue_struct *wq; unsigned int depth = ia->ri_max_frmr_depth;
struct rpcrdma_frmr *f = &r->frmr;
int rc;
if (!frwr_recovery_wq) f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, depth);
return; if (IS_ERR(f->fr_mr))
goto out_mr_err;
wq = frwr_recovery_wq; r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL);
frwr_recovery_wq = NULL; if (!r->mw_sg)
destroy_workqueue(wq); goto out_list_err;
sg_init_table(r->mw_sg, depth);
init_completion(&f->fr_linv_done);
return 0;
out_mr_err:
rc = PTR_ERR(f->fr_mr);
dprintk("RPC: %s: ib_alloc_mr status %i\n",
__func__, rc);
return rc;
out_list_err:
rc = -ENOMEM;
dprintk("RPC: %s: sg allocation failure\n",
__func__);
ib_dereg_mr(f->fr_mr);
return rc;
}
static void
frwr_op_release_mr(struct rpcrdma_mw *r)
{
int rc;
/* Ensure MW is not on any rl_registered list */
if (!list_empty(&r->mw_list))
list_del(&r->mw_list);
rc = ib_dereg_mr(r->frmr.fr_mr);
if (rc)
pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n",
r, rc);
kfree(r->mw_sg);
kfree(r);
} }
static int static int
@ -124,93 +166,37 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
return 0; return 0;
} }
static void /* Reset of a single FRMR. Generate a fresh rkey by replacing the MR.
__frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_frmr *f = &mw->frmr;
int rc;
rc = __frwr_reset_mr(ia, mw);
ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir);
if (rc)
return;
rpcrdma_put_mw(r_xprt, mw);
}
/* Deferred reset of a single FRMR. Generate a fresh rkey by
* replacing the MR.
* *
* There's no recovery if this fails. The FRMR is abandoned, but * There's no recovery if this fails. The FRMR is abandoned, but
* remains in rb_all. It will be cleaned up when the transport is * remains in rb_all. It will be cleaned up when the transport is
* destroyed. * destroyed.
*/ */
static void static void
__frwr_recovery_worker(struct work_struct *work) frwr_op_recover_mr(struct rpcrdma_mw *mw)
{ {
struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw, struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
mw_work); struct rpcrdma_ia *ia = &r_xprt->rx_ia;
__frwr_reset_and_unmap(r->mw_xprt, r);
return;
}
/* Hand a broken MR to the recovery workqueue.
 *
 * Used when the breakage is found in a context that is not
 * allowed to sleep: the MR's work item is (re)initialized to run
 * __frwr_recovery_worker and queued on frwr_recovery_wq.
 */
static void
__frwr_queue_recovery(struct rpcrdma_mw *r)
{
	struct work_struct *work = &r->mw_work;

	INIT_WORK(work, __frwr_recovery_worker);
	queue_work(frwr_recovery_wq, work);
}
static int
__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
unsigned int depth)
{
struct rpcrdma_frmr *f = &r->frmr;
int rc; int rc;
f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth); rc = __frwr_reset_mr(ia, mw);
if (IS_ERR(f->fr_mr)) ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir);
goto out_mr_err;
f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL);
if (!f->fr_sg)
goto out_list_err;
sg_init_table(f->fr_sg, depth);
init_completion(&f->fr_linv_done);
return 0;
out_mr_err:
rc = PTR_ERR(f->fr_mr);
dprintk("RPC: %s: ib_alloc_mr status %i\n",
__func__, rc);
return rc;
out_list_err:
rc = -ENOMEM;
dprintk("RPC: %s: sg allocation failure\n",
__func__);
ib_dereg_mr(f->fr_mr);
return rc;
}
static void
__frwr_release(struct rpcrdma_mw *r)
{
int rc;
rc = ib_dereg_mr(r->frmr.fr_mr);
if (rc) if (rc)
dprintk("RPC: %s: ib_dereg_mr status %i\n", goto out_release;
__func__, rc);
kfree(r->frmr.fr_sg); rpcrdma_put_mw(r_xprt, mw);
r_xprt->rx_stats.mrs_recovered++;
return;
out_release:
pr_err("rpcrdma: FRMR reset failed %d, %p release\n", rc, mw);
r_xprt->rx_stats.mrs_orphaned++;
spin_lock(&r_xprt->rx_buf.rb_mwlock);
list_del(&mw->mw_all);
spin_unlock(&r_xprt->rx_buf.rb_mwlock);
frwr_op_release_mr(mw);
} }
static int static int
@ -346,57 +332,14 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
complete_all(&frmr->fr_linv_done); complete_all(&frmr->fr_linv_done);
} }
static int /* Post a REG_MR Work Request to register a memory region
frwr_op_init(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct ib_device *device = r_xprt->rx_ia.ri_device;
unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
int i;
spin_lock_init(&buf->rb_mwlock);
INIT_LIST_HEAD(&buf->rb_mws);
INIT_LIST_HEAD(&buf->rb_all);
i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1);
i += 2; /* head + tail */
i *= buf->rb_max_requests; /* one set for each RPC slot */
dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i);
while (i--) {
struct rpcrdma_mw *r;
int rc;
r = kzalloc(sizeof(*r), GFP_KERNEL);
if (!r)
return -ENOMEM;
rc = __frwr_init(r, pd, device, depth);
if (rc) {
kfree(r);
return rc;
}
r->mw_xprt = r_xprt;
list_add(&r->mw_list, &buf->rb_mws);
list_add(&r->mw_all, &buf->rb_all);
}
return 0;
}
/* Post a FAST_REG Work Request to register a memory region
* for remote access via RDMA READ or RDMA WRITE. * for remote access via RDMA READ or RDMA WRITE.
*/ */
static int static int
frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
int nsegs, bool writing) int nsegs, bool writing, struct rpcrdma_mw **out)
{ {
struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct ib_device *device = ia->ri_device;
enum dma_data_direction direction = rpcrdma_data_dir(writing);
struct rpcrdma_mr_seg *seg1 = seg;
struct rpcrdma_mw *mw; struct rpcrdma_mw *mw;
struct rpcrdma_frmr *frmr; struct rpcrdma_frmr *frmr;
struct ib_mr *mr; struct ib_mr *mr;
@ -405,14 +348,13 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
int rc, i, n, dma_nents; int rc, i, n, dma_nents;
u8 key; u8 key;
mw = seg1->rl_mw; mw = NULL;
seg1->rl_mw = NULL;
do { do {
if (mw) if (mw)
__frwr_queue_recovery(mw); rpcrdma_defer_mr_recovery(mw);
mw = rpcrdma_get_mw(r_xprt); mw = rpcrdma_get_mw(r_xprt);
if (!mw) if (!mw)
return -ENOMEM; return -ENOBUFS;
} while (mw->frmr.fr_state != FRMR_IS_INVALID); } while (mw->frmr.fr_state != FRMR_IS_INVALID);
frmr = &mw->frmr; frmr = &mw->frmr;
frmr->fr_state = FRMR_IS_VALID; frmr->fr_state = FRMR_IS_VALID;
@ -421,15 +363,14 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
if (nsegs > ia->ri_max_frmr_depth) if (nsegs > ia->ri_max_frmr_depth)
nsegs = ia->ri_max_frmr_depth; nsegs = ia->ri_max_frmr_depth;
for (i = 0; i < nsegs;) { for (i = 0; i < nsegs;) {
if (seg->mr_page) if (seg->mr_page)
sg_set_page(&frmr->fr_sg[i], sg_set_page(&mw->mw_sg[i],
seg->mr_page, seg->mr_page,
seg->mr_len, seg->mr_len,
offset_in_page(seg->mr_offset)); offset_in_page(seg->mr_offset));
else else
sg_set_buf(&frmr->fr_sg[i], seg->mr_offset, sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
seg->mr_len); seg->mr_len);
++seg; ++seg;
@ -440,26 +381,22 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break; break;
} }
frmr->fr_nents = i; mw->mw_nents = i;
frmr->fr_dir = direction; mw->mw_dir = rpcrdma_data_dir(writing);
if (i == 0)
goto out_dmamap_err;
dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction); dma_nents = ib_dma_map_sg(ia->ri_device,
if (!dma_nents) { mw->mw_sg, mw->mw_nents, mw->mw_dir);
pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n", if (!dma_nents)
__func__, frmr->fr_sg, frmr->fr_nents); goto out_dmamap_err;
return -ENOMEM;
}
n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, NULL, PAGE_SIZE); n = ib_map_mr_sg(mr, mw->mw_sg, mw->mw_nents, NULL, PAGE_SIZE);
if (unlikely(n != frmr->fr_nents)) { if (unlikely(n != mw->mw_nents))
pr_err("RPC: %s: failed to map mr %p (%u/%u)\n", goto out_mapmr_err;
__func__, frmr->fr_mr, n, frmr->fr_nents);
rc = n < 0 ? n : -EINVAL;
goto out_senderr;
}
dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n",
__func__, mw, frmr->fr_nents, mr->length); __func__, mw, mw->mw_nents, mr->length);
key = (u8)(mr->rkey & 0x000000FF); key = (u8)(mr->rkey & 0x000000FF);
ib_update_fast_reg_key(mr, ++key); ib_update_fast_reg_key(mr, ++key);
@ -481,24 +418,34 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
if (rc) if (rc)
goto out_senderr; goto out_senderr;
seg1->rl_mw = mw; mw->mw_handle = mr->rkey;
seg1->mr_rkey = mr->rkey; mw->mw_length = mr->length;
seg1->mr_base = mr->iova; mw->mw_offset = mr->iova;
seg1->mr_nsegs = frmr->fr_nents;
seg1->mr_len = mr->length;
return frmr->fr_nents; *out = mw;
return mw->mw_nents;
out_dmamap_err:
pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
mw->mw_sg, mw->mw_nents);
rpcrdma_defer_mr_recovery(mw);
return -EIO;
out_mapmr_err:
pr_err("rpcrdma: failed to map mr %p (%u/%u)\n",
frmr->fr_mr, n, mw->mw_nents);
rpcrdma_defer_mr_recovery(mw);
return -EIO;
out_senderr: out_senderr:
dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc);
__frwr_queue_recovery(mw); rpcrdma_defer_mr_recovery(mw);
return rc; return -ENOTCONN;
} }
static struct ib_send_wr * static struct ib_send_wr *
__frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg) __frwr_prepare_linv_wr(struct rpcrdma_mw *mw)
{ {
struct rpcrdma_mw *mw = seg->rl_mw;
struct rpcrdma_frmr *f = &mw->frmr; struct rpcrdma_frmr *f = &mw->frmr;
struct ib_send_wr *invalidate_wr; struct ib_send_wr *invalidate_wr;
@ -518,16 +465,16 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
* *
* Sleeps until it is safe for the host CPU to access the * Sleeps until it is safe for the host CPU to access the
* previously mapped memory regions. * previously mapped memory regions.
*
* Caller ensures that req->rl_registered is not empty.
*/ */
static void static void
frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{ {
struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr; struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr;
struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_mr_seg *seg; struct rpcrdma_mw *mw, *tmp;
unsigned int i, nchunks;
struct rpcrdma_frmr *f; struct rpcrdma_frmr *f;
struct rpcrdma_mw *mw;
int rc; int rc;
dprintk("RPC: %s: req %p\n", __func__, req); dprintk("RPC: %s: req %p\n", __func__, req);
@ -537,22 +484,18 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* Chain the LOCAL_INV Work Requests and post them with * Chain the LOCAL_INV Work Requests and post them with
* a single ib_post_send() call. * a single ib_post_send() call.
*/ */
f = NULL;
invalidate_wrs = pos = prev = NULL; invalidate_wrs = pos = prev = NULL;
seg = NULL; list_for_each_entry(mw, &req->rl_registered, mw_list) {
for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { pos = __frwr_prepare_linv_wr(mw);
seg = &req->rl_segments[i];
pos = __frwr_prepare_linv_wr(seg);
if (!invalidate_wrs) if (!invalidate_wrs)
invalidate_wrs = pos; invalidate_wrs = pos;
else else
prev->next = pos; prev->next = pos;
prev = pos; prev = pos;
f = &mw->frmr;
i += seg->mr_nsegs;
} }
f = &seg->rl_mw->frmr;
/* Strong send queue ordering guarantees that when the /* Strong send queue ordering guarantees that when the
* last WR in the chain completes, all WRs in the chain * last WR in the chain completes, all WRs in the chain
@ -577,39 +520,27 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* them to the free MW list. * them to the free MW list.
*/ */
unmap: unmap:
for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
seg = &req->rl_segments[i]; list_del_init(&mw->mw_list);
mw = seg->rl_mw; ib_dma_unmap_sg(ia->ri_device,
seg->rl_mw = NULL; mw->mw_sg, mw->mw_nents, mw->mw_dir);
ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents,
f->fr_dir);
rpcrdma_put_mw(r_xprt, mw); rpcrdma_put_mw(r_xprt, mw);
i += seg->mr_nsegs;
seg->mr_nsegs = 0;
} }
req->rl_nchunks = 0;
return; return;
reset_mrs: reset_mrs:
pr_warn("%s: ib_post_send failed %i\n", __func__, rc); pr_err("rpcrdma: FRMR invalidate ib_post_send returned %i\n", rc);
rdma_disconnect(ia->ri_id);
/* Find and reset the MRs in the LOCAL_INV WRs that did not /* Find and reset the MRs in the LOCAL_INV WRs that did not
* get posted. This is synchronous, and slow. * get posted. This is synchronous, and slow.
*/ */
for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { list_for_each_entry(mw, &req->rl_registered, mw_list) {
seg = &req->rl_segments[i];
mw = seg->rl_mw;
f = &mw->frmr; f = &mw->frmr;
if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) { if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) {
__frwr_reset_mr(ia, mw); __frwr_reset_mr(ia, mw);
bad_wr = bad_wr->next; bad_wr = bad_wr->next;
} }
i += seg->mr_nsegs;
} }
goto unmap; goto unmap;
} }
@ -621,38 +552,17 @@ static void
frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
bool sync) bool sync)
{ {
struct rpcrdma_mr_seg *seg;
struct rpcrdma_mw *mw; struct rpcrdma_mw *mw;
unsigned int i;
for (i = 0; req->rl_nchunks; req->rl_nchunks--) { while (!list_empty(&req->rl_registered)) {
seg = &req->rl_segments[i]; mw = list_first_entry(&req->rl_registered,
mw = seg->rl_mw; struct rpcrdma_mw, mw_list);
list_del_init(&mw->mw_list);
if (sync) if (sync)
__frwr_reset_and_unmap(r_xprt, mw); frwr_op_recover_mr(mw);
else else
__frwr_queue_recovery(mw); rpcrdma_defer_mr_recovery(mw);
i += seg->mr_nsegs;
seg->mr_nsegs = 0;
seg->rl_mw = NULL;
}
}
static void
frwr_op_destroy(struct rpcrdma_buffer *buf)
{
struct rpcrdma_mw *r;
/* Ensure stale MWs for "buf" are no longer in flight */
flush_workqueue(frwr_recovery_wq);
while (!list_empty(&buf->rb_all)) {
r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
list_del(&r->mw_all);
__frwr_release(r);
kfree(r);
} }
} }
@ -660,9 +570,10 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
.ro_map = frwr_op_map, .ro_map = frwr_op_map,
.ro_unmap_sync = frwr_op_unmap_sync, .ro_unmap_sync = frwr_op_unmap_sync,
.ro_unmap_safe = frwr_op_unmap_safe, .ro_unmap_safe = frwr_op_unmap_safe,
.ro_recover_mr = frwr_op_recover_mr,
.ro_open = frwr_op_open, .ro_open = frwr_op_open,
.ro_maxpages = frwr_op_maxpages, .ro_maxpages = frwr_op_maxpages,
.ro_init = frwr_op_init, .ro_init_mr = frwr_op_init_mr,
.ro_destroy = frwr_op_destroy, .ro_release_mr = frwr_op_release_mr,
.ro_displayname = "frwr", .ro_displayname = "frwr",
}; };

View File

@ -1,122 +0,0 @@
/*
* Copyright (c) 2015 Oracle. All rights reserved.
* Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
*/
/* No-op chunk preparation. All client memory is pre-registered.
* Sometimes referred to as ALLPHYSICAL mode.
*
* Physical registration is simple because all client memory is
* pre-registered and never deregistered. This mode is good for
* adapter bring up, but is considered not safe: the server is
* trusted not to abuse its access to client memory not involved
* in RDMA I/O.
*/
#include "xprt_rdma.h"
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY RPCDBG_TRANS
#endif
/* Prepare an interface adapter for ALLPHYSICAL mode.
 *
 * Obtains one DMA MR on ia->ri_pd that grants local write, remote
 * write and remote read access, saves it in ia->ri_dma_mr, and then
 * sets the maximum header sizes based on the smaller of
 * RPCRDMA_MAX_DATA_SEGS and RPCRDMA_MAX_HDR_SEGS.
 *
 * Returns 0 on success, or -ENOMEM if the DMA MR could not be
 * obtained (the underlying error code is logged).
 */
static int
physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
		 struct rpcrdma_create_data_internal *cdata)
{
	struct ib_mr *mr;

	/* Obtain an rkey to use for RPC data payloads. */
	mr = ib_get_dma_mr(ia->ri_pd,
			   IB_ACCESS_LOCAL_WRITE |
			   IB_ACCESS_REMOTE_WRITE |
			   IB_ACCESS_REMOTE_READ);
	if (IS_ERR(mr)) {
		/* PTR_ERR() is a negative errno: print it as a signed
		 * decimal rather than as a sign-extended hex value.
		 */
		pr_err("%s: ib_get_dma_mr failed with %ld\n",
		       __func__, PTR_ERR(mr));
		return -ENOMEM;
	}
	ia->ri_dma_mr = mr;

	rpcrdma_set_max_header_sizes(ia, cdata, min_t(unsigned int,
						      RPCRDMA_MAX_DATA_SEGS,
						      RPCRDMA_MAX_HDR_SEGS));
	return 0;
}
/* Report the page limit for PHYSICAL registration.
 *
 * Each chunk segment conveys exactly one page in this mode, so the
 * limit is the smaller of RPCRDMA_MAX_DATA_SEGS and
 * RPCRDMA_MAX_HDR_SEGS.
 */
static size_t
physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
{
	unsigned int limit;

	limit = min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
		      RPCRDMA_MAX_HDR_SEGS);
	return limit;
}
/* ->ro_init method for PHYSICAL mode: performs no work and
 * always reports success.
 */
static int
physical_op_init(struct rpcrdma_xprt *r_xprt)
{
	return 0;
}
/* Map one segment for remote access via RDMA READ or RDMA WRITE.
 *
 * The client's physical memory is already exposed, so this only
 * DMA maps "seg" and fills in the pre-obtained DMA MR's rkey and
 * the segment's DMA address as the base. "nsegs" is not consulted.
 *
 * Always returns 1.
 */
static int
physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
		int nsegs, bool writing)
{
	struct rpcrdma_ia *adapter = &r_xprt->rx_ia;

	rpcrdma_map_one(adapter->ri_device, seg, rpcrdma_data_dir(writing));

	/* mr_dma is read only after the mapping call above */
	seg->mr_base = seg->mr_dma;
	seg->mr_rkey = adapter->ri_dma_mr->rkey;

	return 1;
}
/* DMA unmap all memory regions that were mapped for "req".
*/
static void
physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
struct ib_device *device = r_xprt->rx_ia.ri_device;
unsigned int i;
for (i = 0; req->rl_nchunks; --req->rl_nchunks)
rpcrdma_unmap_one(device, &req->rl_segments[i++]);
}
/* Invalidate, via a slow but safe mechanism, all memory regions
 * that were registered for "req".
 *
 * Physical memory registration offers no good way to fence a
 * single MR once it has been advertised to the server: the R_key
 * already handed out cannot be invalidated and is shared by every
 * MR on this connection. Tearing down the PD might be the only
 * safe choice, but it is not clear that a freshly acquired DMA
 * R_key would differ from the one used by the destroyed PD.
 * FIXME.
 *
 * For now this simply DMA unmaps the segments; "sync" is ignored.
 */
static void
physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
		       bool sync)
{
	physical_op_unmap_sync(r_xprt, req);
}
/* ->ro_destroy method for PHYSICAL mode: intentionally a no-op. */
static void
physical_op_destroy(struct rpcrdma_buffer *buf)
{
}
/* Method table for the "physical" memory registration mode. */
const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
	/* set-up and tear-down */
	.ro_open		= physical_op_open,
	.ro_init		= physical_op_init,
	.ro_destroy		= physical_op_destroy,
	.ro_maxpages		= physical_op_maxpages,

	/* per-request mapping and unmapping */
	.ro_map			= physical_op_map,
	.ro_unmap_sync		= physical_op_unmap_sync,
	.ro_unmap_safe		= physical_op_unmap_safe,

	.ro_displayname		= "physical",
};

View File

@ -196,8 +196,7 @@ rpcrdma_tail_pullup(struct xdr_buf *buf)
* MR when they can. * MR when they can.
*/ */
static int static int
rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n)
int n, int nsegs)
{ {
size_t page_offset; size_t page_offset;
u32 remaining; u32 remaining;
@ -206,7 +205,7 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
base = vec->iov_base; base = vec->iov_base;
page_offset = offset_in_page(base); page_offset = offset_in_page(base);
remaining = vec->iov_len; remaining = vec->iov_len;
while (remaining && n < nsegs) { while (remaining && n < RPCRDMA_MAX_SEGS) {
seg[n].mr_page = NULL; seg[n].mr_page = NULL;
seg[n].mr_offset = base; seg[n].mr_offset = base;
seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
@ -230,34 +229,34 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
static int static int
rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs) enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg)
{ {
int len, n = 0, p; int len, n, p, page_base;
int page_base;
struct page **ppages; struct page **ppages;
n = 0;
if (pos == 0) { if (pos == 0) {
n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n, nsegs); n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n);
if (n == nsegs) if (n == RPCRDMA_MAX_SEGS)
return -EIO; goto out_overflow;
} }
len = xdrbuf->page_len; len = xdrbuf->page_len;
ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
page_base = xdrbuf->page_base & ~PAGE_MASK; page_base = xdrbuf->page_base & ~PAGE_MASK;
p = 0; p = 0;
while (len && n < nsegs) { while (len && n < RPCRDMA_MAX_SEGS) {
if (!ppages[p]) { if (!ppages[p]) {
/* alloc the pagelist for receiving buffer */ /* alloc the pagelist for receiving buffer */
ppages[p] = alloc_page(GFP_ATOMIC); ppages[p] = alloc_page(GFP_ATOMIC);
if (!ppages[p]) if (!ppages[p])
return -ENOMEM; return -EAGAIN;
} }
seg[n].mr_page = ppages[p]; seg[n].mr_page = ppages[p];
seg[n].mr_offset = (void *)(unsigned long) page_base; seg[n].mr_offset = (void *)(unsigned long) page_base;
seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
if (seg[n].mr_len > PAGE_SIZE) if (seg[n].mr_len > PAGE_SIZE)
return -EIO; goto out_overflow;
len -= seg[n].mr_len; len -= seg[n].mr_len;
++n; ++n;
++p; ++p;
@ -265,8 +264,8 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
} }
/* Message overflows the seg array */ /* Message overflows the seg array */
if (len && n == nsegs) if (len && n == RPCRDMA_MAX_SEGS)
return -EIO; goto out_overflow;
/* When encoding the read list, the tail is always sent inline */ /* When encoding the read list, the tail is always sent inline */
if (type == rpcrdma_readch) if (type == rpcrdma_readch)
@ -277,20 +276,24 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
* xdr pad bytes, saving the server an RDMA operation. */ * xdr pad bytes, saving the server an RDMA operation. */
if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
return n; return n;
n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n, nsegs); n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n);
if (n == nsegs) if (n == RPCRDMA_MAX_SEGS)
return -EIO; goto out_overflow;
} }
return n; return n;
out_overflow:
pr_err("rpcrdma: segment array overflow\n");
return -EIO;
} }
static inline __be32 * static inline __be32 *
xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg) xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw)
{ {
*iptr++ = cpu_to_be32(seg->mr_rkey); *iptr++ = cpu_to_be32(mw->mw_handle);
*iptr++ = cpu_to_be32(seg->mr_len); *iptr++ = cpu_to_be32(mw->mw_length);
return xdr_encode_hyper(iptr, seg->mr_base); return xdr_encode_hyper(iptr, mw->mw_offset);
} }
/* XDR-encode the Read list. Supports encoding a list of read /* XDR-encode the Read list. Supports encoding a list of read
@ -310,7 +313,8 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_req *req, struct rpc_rqst *rqst, struct rpcrdma_req *req, struct rpc_rqst *rqst,
__be32 *iptr, enum rpcrdma_chunktype rtype) __be32 *iptr, enum rpcrdma_chunktype rtype)
{ {
struct rpcrdma_mr_seg *seg = req->rl_nextseg; struct rpcrdma_mr_seg *seg;
struct rpcrdma_mw *mw;
unsigned int pos; unsigned int pos;
int n, nsegs; int n, nsegs;
@ -322,15 +326,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
pos = rqst->rq_snd_buf.head[0].iov_len; pos = rqst->rq_snd_buf.head[0].iov_len;
if (rtype == rpcrdma_areadch) if (rtype == rpcrdma_areadch)
pos = 0; pos = 0;
nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, seg = req->rl_segments;
RPCRDMA_MAX_SEGS - req->rl_nchunks); nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg);
if (nsegs < 0) if (nsegs < 0)
return ERR_PTR(nsegs); return ERR_PTR(nsegs);
do { do {
n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false); n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
if (n <= 0) false, &mw);
if (n < 0)
return ERR_PTR(n); return ERR_PTR(n);
list_add(&mw->mw_list, &req->rl_registered);
*iptr++ = xdr_one; /* item present */ *iptr++ = xdr_one; /* item present */
@ -338,20 +344,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
* have the same "position". * have the same "position".
*/ */
*iptr++ = cpu_to_be32(pos); *iptr++ = cpu_to_be32(pos);
iptr = xdr_encode_rdma_segment(iptr, seg); iptr = xdr_encode_rdma_segment(iptr, mw);
dprintk("RPC: %5u %s: read segment pos %u " dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n",
"%d@0x%016llx:0x%08x (%s)\n",
rqst->rq_task->tk_pid, __func__, pos, rqst->rq_task->tk_pid, __func__, pos,
seg->mr_len, (unsigned long long)seg->mr_base, mw->mw_length, (unsigned long long)mw->mw_offset,
seg->mr_rkey, n < nsegs ? "more" : "last"); mw->mw_handle, n < nsegs ? "more" : "last");
r_xprt->rx_stats.read_chunk_count++; r_xprt->rx_stats.read_chunk_count++;
req->rl_nchunks++;
seg += n; seg += n;
nsegs -= n; nsegs -= n;
} while (nsegs); } while (nsegs);
req->rl_nextseg = seg;
/* Finish Read list */ /* Finish Read list */
*iptr++ = xdr_zero; /* Next item not present */ *iptr++ = xdr_zero; /* Next item not present */
@ -375,7 +378,8 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
struct rpc_rqst *rqst, __be32 *iptr, struct rpc_rqst *rqst, __be32 *iptr,
enum rpcrdma_chunktype wtype) enum rpcrdma_chunktype wtype)
{ {
struct rpcrdma_mr_seg *seg = req->rl_nextseg; struct rpcrdma_mr_seg *seg;
struct rpcrdma_mw *mw;
int n, nsegs, nchunks; int n, nsegs, nchunks;
__be32 *segcount; __be32 *segcount;
@ -384,10 +388,10 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
return iptr; return iptr;
} }
seg = req->rl_segments;
nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
rqst->rq_rcv_buf.head[0].iov_len, rqst->rq_rcv_buf.head[0].iov_len,
wtype, seg, wtype, seg);
RPCRDMA_MAX_SEGS - req->rl_nchunks);
if (nsegs < 0) if (nsegs < 0)
return ERR_PTR(nsegs); return ERR_PTR(nsegs);
@ -396,26 +400,25 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
nchunks = 0; nchunks = 0;
do { do {
n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true); n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
if (n <= 0) true, &mw);
if (n < 0)
return ERR_PTR(n); return ERR_PTR(n);
list_add(&mw->mw_list, &req->rl_registered);
iptr = xdr_encode_rdma_segment(iptr, seg); iptr = xdr_encode_rdma_segment(iptr, mw);
dprintk("RPC: %5u %s: write segment " dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n",
"%d@0x016%llx:0x%08x (%s)\n",
rqst->rq_task->tk_pid, __func__, rqst->rq_task->tk_pid, __func__,
seg->mr_len, (unsigned long long)seg->mr_base, mw->mw_length, (unsigned long long)mw->mw_offset,
seg->mr_rkey, n < nsegs ? "more" : "last"); mw->mw_handle, n < nsegs ? "more" : "last");
r_xprt->rx_stats.write_chunk_count++; r_xprt->rx_stats.write_chunk_count++;
r_xprt->rx_stats.total_rdma_request += seg->mr_len; r_xprt->rx_stats.total_rdma_request += seg->mr_len;
req->rl_nchunks++;
nchunks++; nchunks++;
seg += n; seg += n;
nsegs -= n; nsegs -= n;
} while (nsegs); } while (nsegs);
req->rl_nextseg = seg;
/* Update count of segments in this Write chunk */ /* Update count of segments in this Write chunk */
*segcount = cpu_to_be32(nchunks); *segcount = cpu_to_be32(nchunks);
@ -442,7 +445,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_req *req, struct rpc_rqst *rqst, struct rpcrdma_req *req, struct rpc_rqst *rqst,
__be32 *iptr, enum rpcrdma_chunktype wtype) __be32 *iptr, enum rpcrdma_chunktype wtype)
{ {
struct rpcrdma_mr_seg *seg = req->rl_nextseg; struct rpcrdma_mr_seg *seg;
struct rpcrdma_mw *mw;
int n, nsegs, nchunks; int n, nsegs, nchunks;
__be32 *segcount; __be32 *segcount;
@ -451,8 +455,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
return iptr; return iptr;
} }
nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg, seg = req->rl_segments;
RPCRDMA_MAX_SEGS - req->rl_nchunks); nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg);
if (nsegs < 0) if (nsegs < 0)
return ERR_PTR(nsegs); return ERR_PTR(nsegs);
@ -461,26 +465,25 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
nchunks = 0; nchunks = 0;
do { do {
n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true); n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
if (n <= 0) true, &mw);
if (n < 0)
return ERR_PTR(n); return ERR_PTR(n);
list_add(&mw->mw_list, &req->rl_registered);
iptr = xdr_encode_rdma_segment(iptr, seg); iptr = xdr_encode_rdma_segment(iptr, mw);
dprintk("RPC: %5u %s: reply segment " dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n",
"%d@0x%016llx:0x%08x (%s)\n",
rqst->rq_task->tk_pid, __func__, rqst->rq_task->tk_pid, __func__,
seg->mr_len, (unsigned long long)seg->mr_base, mw->mw_length, (unsigned long long)mw->mw_offset,
seg->mr_rkey, n < nsegs ? "more" : "last"); mw->mw_handle, n < nsegs ? "more" : "last");
r_xprt->rx_stats.reply_chunk_count++; r_xprt->rx_stats.reply_chunk_count++;
r_xprt->rx_stats.total_rdma_request += seg->mr_len; r_xprt->rx_stats.total_rdma_request += seg->mr_len;
req->rl_nchunks++;
nchunks++; nchunks++;
seg += n; seg += n;
nsegs -= n; nsegs -= n;
} while (nsegs); } while (nsegs);
req->rl_nextseg = seg;
/* Update count of segments in the Reply chunk */ /* Update count of segments in the Reply chunk */
*segcount = cpu_to_be32(nchunks); *segcount = cpu_to_be32(nchunks);
@ -567,6 +570,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
enum rpcrdma_chunktype rtype, wtype; enum rpcrdma_chunktype rtype, wtype;
struct rpcrdma_msg *headerp; struct rpcrdma_msg *headerp;
bool ddp_allowed;
ssize_t hdrlen; ssize_t hdrlen;
size_t rpclen; size_t rpclen;
__be32 *iptr; __be32 *iptr;
@ -583,6 +587,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
headerp->rm_type = rdma_msg; headerp->rm_type = rdma_msg;
/* When the ULP employs a GSS flavor that guarantees integrity
* or privacy, direct data placement of individual data items
* is not allowed.
*/
ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags &
RPCAUTH_AUTH_DATATOUCH);
/* /*
* Chunks needed for results? * Chunks needed for results?
* *
@ -594,7 +605,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
*/ */
if (rpcrdma_results_inline(r_xprt, rqst)) if (rpcrdma_results_inline(r_xprt, rqst))
wtype = rpcrdma_noch; wtype = rpcrdma_noch;
else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ)
wtype = rpcrdma_writech; wtype = rpcrdma_writech;
else else
wtype = rpcrdma_replych; wtype = rpcrdma_replych;
@ -617,7 +628,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
rtype = rpcrdma_noch; rtype = rpcrdma_noch;
rpcrdma_inline_pullup(rqst); rpcrdma_inline_pullup(rqst);
rpclen = rqst->rq_svec[0].iov_len; rpclen = rqst->rq_svec[0].iov_len;
} else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) { } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
rtype = rpcrdma_readch; rtype = rpcrdma_readch;
rpclen = rqst->rq_svec[0].iov_len; rpclen = rqst->rq_svec[0].iov_len;
rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
@ -650,8 +661,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* send a Call message with a Position Zero Read chunk and a * send a Call message with a Position Zero Read chunk and a
* regular Read chunk at the same time. * regular Read chunk at the same time.
*/ */
req->rl_nchunks = 0;
req->rl_nextseg = req->rl_segments;
iptr = headerp->rm_body.rm_chunks; iptr = headerp->rm_body.rm_chunks;
iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
if (IS_ERR(iptr)) if (IS_ERR(iptr))
@ -690,10 +699,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
out_overflow: out_overflow:
pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n", pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]); hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
/* Terminate this RPC. Chunks registered above will be iptr = ERR_PTR(-EIO);
* released by xprt_release -> xprt_rmda_free .
*/
return -EIO;
out_unmap: out_unmap:
r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
@ -705,15 +711,13 @@ out_unmap:
* RDMA'd by server. See map at rpcrdma_create_chunks()! :-) * RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
*/ */
static int static int
rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp) rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp)
{ {
unsigned int i, total_len; unsigned int i, total_len;
struct rpcrdma_write_chunk *cur_wchunk; struct rpcrdma_write_chunk *cur_wchunk;
char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf); char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf);
i = be32_to_cpu(**iptrp); i = be32_to_cpu(**iptrp);
if (i > max)
return -1;
cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
total_len = 0; total_len = 0;
while (i--) { while (i--) {
@ -744,45 +748,66 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b
return total_len; return total_len;
} }
/* /**
* Scatter inline received data back into provided iov's. * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
* @rqst: controlling RPC request
* @srcp: points to RPC message payload in receive buffer
* @copy_len: remaining length of receive buffer content
* @pad: Write chunk pad bytes needed (zero for pure inline)
*
* The upper layer has set the maximum number of bytes it can
* receive in each component of rq_rcv_buf. These values are set in
* the head.iov_len, page_len, tail.iov_len, and buflen fields.
*
* Unlike the TCP equivalent (xdr_partial_copy_from_skb), in
* many cases this function simply updates iov_base pointers in
* rq_rcv_buf to point directly to the received reply data, to
* avoid copying reply data.
*
* Returns the count of bytes which had to be memcopied.
*/ */
static void static unsigned long
rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
{ {
int i, npages, curlen, olen; unsigned long fixup_copy_count;
int i, npages, curlen;
char *destp; char *destp;
struct page **ppages; struct page **ppages;
int page_base; int page_base;
curlen = rqst->rq_rcv_buf.head[0].iov_len; /* The head iovec is redirected to the RPC reply message
if (curlen > copy_len) { /* write chunk header fixup */ * in the receive buffer, to avoid a memcopy.
curlen = copy_len; */
rqst->rq_rcv_buf.head[0].iov_len = curlen; rqst->rq_rcv_buf.head[0].iov_base = srcp;
} rqst->rq_private_buf.head[0].iov_base = srcp;
/* The contents of the receive buffer that follow
* head.iov_len bytes are copied into the page list.
*/
curlen = rqst->rq_rcv_buf.head[0].iov_len;
if (curlen > copy_len)
curlen = copy_len;
dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n", dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n",
__func__, srcp, copy_len, curlen); __func__, srcp, copy_len, curlen);
/* Shift pointer for first receive segment only */
rqst->rq_rcv_buf.head[0].iov_base = srcp;
srcp += curlen; srcp += curlen;
copy_len -= curlen; copy_len -= curlen;
olen = copy_len;
i = 0;
rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
page_base = rqst->rq_rcv_buf.page_base; page_base = rqst->rq_rcv_buf.page_base;
ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT); ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT);
page_base &= ~PAGE_MASK; page_base &= ~PAGE_MASK;
fixup_copy_count = 0;
if (copy_len && rqst->rq_rcv_buf.page_len) { if (copy_len && rqst->rq_rcv_buf.page_len) {
npages = PAGE_ALIGN(page_base + int pagelist_len;
rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT;
for (; i < npages; i++) { pagelist_len = rqst->rq_rcv_buf.page_len;
if (pagelist_len > copy_len)
pagelist_len = copy_len;
npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT;
for (i = 0; i < npages; i++) {
curlen = PAGE_SIZE - page_base; curlen = PAGE_SIZE - page_base;
if (curlen > copy_len) if (curlen > pagelist_len)
curlen = copy_len; curlen = pagelist_len;
dprintk("RPC: %s: page %d" dprintk("RPC: %s: page %d"
" srcp 0x%p len %d curlen %d\n", " srcp 0x%p len %d curlen %d\n",
__func__, i, srcp, copy_len, curlen); __func__, i, srcp, copy_len, curlen);
@ -792,39 +817,32 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
kunmap_atomic(destp); kunmap_atomic(destp);
srcp += curlen; srcp += curlen;
copy_len -= curlen; copy_len -= curlen;
if (copy_len == 0) fixup_copy_count += curlen;
pagelist_len -= curlen;
if (!pagelist_len)
break; break;
page_base = 0; page_base = 0;
} }
/* Implicit padding for the last segment in a Write
* chunk is inserted inline at the front of the tail
* iovec. The upper layer ignores the content of
* the pad. Simply ensure inline content in the tail
* that follows the Write chunk is properly aligned.
*/
if (pad)
srcp -= pad;
} }
if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) { /* The tail iovec is redirected to the remaining data
curlen = copy_len; * in the receive buffer, to avoid a memcopy.
if (curlen > rqst->rq_rcv_buf.tail[0].iov_len) */
curlen = rqst->rq_rcv_buf.tail[0].iov_len; if (copy_len || pad) {
if (rqst->rq_rcv_buf.tail[0].iov_base != srcp) rqst->rq_rcv_buf.tail[0].iov_base = srcp;
memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); rqst->rq_private_buf.tail[0].iov_base = srcp;
dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n",
__func__, srcp, copy_len, curlen);
rqst->rq_rcv_buf.tail[0].iov_len = curlen;
copy_len -= curlen; ++i;
} else
rqst->rq_rcv_buf.tail[0].iov_len = 0;
if (pad) {
/* implicit padding on terminal chunk */
unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base;
while (pad--)
p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0;
} }
if (copy_len) return fixup_copy_count;
dprintk("RPC: %s: %d bytes in"
" %d extra segments (%d lost)\n",
__func__, olen, i, copy_len);
/* TBD avoid a warning from call_decode() */
rqst->rq_private_buf = rqst->rq_rcv_buf;
} }
void void
@ -960,14 +978,13 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
(headerp->rm_body.rm_chunks[1] == xdr_zero && (headerp->rm_body.rm_chunks[1] == xdr_zero &&
headerp->rm_body.rm_chunks[2] != xdr_zero) || headerp->rm_body.rm_chunks[2] != xdr_zero) ||
(headerp->rm_body.rm_chunks[1] != xdr_zero && (headerp->rm_body.rm_chunks[1] != xdr_zero &&
req->rl_nchunks == 0)) list_empty(&req->rl_registered)))
goto badheader; goto badheader;
if (headerp->rm_body.rm_chunks[1] != xdr_zero) { if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
/* count any expected write chunks in read reply */ /* count any expected write chunks in read reply */
/* start at write chunk array count */ /* start at write chunk array count */
iptr = &headerp->rm_body.rm_chunks[2]; iptr = &headerp->rm_body.rm_chunks[2];
rdmalen = rpcrdma_count_chunks(rep, rdmalen = rpcrdma_count_chunks(rep, 1, &iptr);
req->rl_nchunks, 1, &iptr);
/* check for validity, and no reply chunk after */ /* check for validity, and no reply chunk after */
if (rdmalen < 0 || *iptr++ != xdr_zero) if (rdmalen < 0 || *iptr++ != xdr_zero)
goto badheader; goto badheader;
@ -988,8 +1005,10 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
rep->rr_len -= RPCRDMA_HDRLEN_MIN; rep->rr_len -= RPCRDMA_HDRLEN_MIN;
status = rep->rr_len; status = rep->rr_len;
} }
/* Fix up the rpc results for upper layer */
rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen); r_xprt->rx_stats.fixup_copy_count +=
rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len,
rdmalen);
break; break;
case rdma_nomsg: case rdma_nomsg:
@ -997,11 +1016,11 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
if (headerp->rm_body.rm_chunks[0] != xdr_zero || if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
headerp->rm_body.rm_chunks[1] != xdr_zero || headerp->rm_body.rm_chunks[1] != xdr_zero ||
headerp->rm_body.rm_chunks[2] != xdr_one || headerp->rm_body.rm_chunks[2] != xdr_one ||
req->rl_nchunks == 0) list_empty(&req->rl_registered))
goto badheader; goto badheader;
iptr = (__be32 *)((unsigned char *)headerp + iptr = (__be32 *)((unsigned char *)headerp +
RPCRDMA_HDRLEN_MIN); RPCRDMA_HDRLEN_MIN);
rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr); rdmalen = rpcrdma_count_chunks(rep, 0, &iptr);
if (rdmalen < 0) if (rdmalen < 0)
goto badheader; goto badheader;
r_xprt->rx_stats.total_rdma_reply += rdmalen; r_xprt->rx_stats.total_rdma_reply += rdmalen;
@ -1014,14 +1033,9 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
badheader: badheader:
default: default:
dprintk("%s: invalid rpcrdma reply header (type %d):" dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n",
" chunks[012] == %d %d %d" rqst->rq_task->tk_pid, __func__,
" expected chunks <= %d\n", be32_to_cpu(headerp->rm_type));
__func__, be32_to_cpu(headerp->rm_type),
headerp->rm_body.rm_chunks[0],
headerp->rm_body.rm_chunks[1],
headerp->rm_body.rm_chunks[2],
req->rl_nchunks);
status = -EIO; status = -EIO;
r_xprt->rx_stats.bad_reply_count++; r_xprt->rx_stats.bad_reply_count++;
break; break;
@ -1035,7 +1049,7 @@ out:
* control: waking the next RPC waits until this RPC has * control: waking the next RPC waits until this RPC has
* relinquished all its Send Queue entries. * relinquished all its Send Queue entries.
*/ */
if (req->rl_nchunks) if (!list_empty(&req->rl_registered))
r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req); r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req);
spin_lock_bh(&xprt->transport_lock); spin_lock_bh(&xprt->transport_lock);

View File

@ -558,7 +558,6 @@ out_sendbuf:
out_fail: out_fail:
rpcrdma_buffer_put(req); rpcrdma_buffer_put(req);
r_xprt->rx_stats.failed_marshal_count++;
return NULL; return NULL;
} }
@ -590,8 +589,19 @@ xprt_rdma_free(void *buffer)
rpcrdma_buffer_put(req); rpcrdma_buffer_put(req);
} }
/* /**
* xprt_rdma_send_request - marshal and send an RPC request
* @task: RPC task with an RPC message in rq_snd_buf
*
* Return values:
* 0: The request has been sent
* ENOTCONN: Caller needs to invoke connect logic then call again
* ENOBUFS: Call again later to send the request
* EIO: A permanent error occurred. The request was not sent,
* and don't try it again
*
* send_request invokes the meat of RPC RDMA. It must do the following: * send_request invokes the meat of RPC RDMA. It must do the following:
*
* 1. Marshal the RPC request into an RPC RDMA request, which means * 1. Marshal the RPC request into an RPC RDMA request, which means
* putting a header in front of data, and creating IOVs for RDMA * putting a header in front of data, and creating IOVs for RDMA
* from those in the request. * from those in the request.
@ -600,7 +610,6 @@ xprt_rdma_free(void *buffer)
* the request (rpcrdma_ep_post). * the request (rpcrdma_ep_post).
* 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP). * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP).
*/ */
static int static int
xprt_rdma_send_request(struct rpc_task *task) xprt_rdma_send_request(struct rpc_task *task)
{ {
@ -610,6 +619,9 @@ xprt_rdma_send_request(struct rpc_task *task)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
int rc = 0; int rc = 0;
/* On retransmit, remove any previously registered chunks */
r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
rc = rpcrdma_marshal_req(rqst); rc = rpcrdma_marshal_req(rqst);
if (rc < 0) if (rc < 0)
goto failed_marshal; goto failed_marshal;
@ -630,11 +642,12 @@ xprt_rdma_send_request(struct rpc_task *task)
return 0; return 0;
failed_marshal: failed_marshal:
r_xprt->rx_stats.failed_marshal_count++;
dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n", dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n",
__func__, rc); __func__, rc);
if (rc == -EIO) if (rc == -EIO)
return -EIO; r_xprt->rx_stats.failed_marshal_count++;
if (rc != -ENOTCONN)
return rc;
drop_connection: drop_connection:
xprt_disconnect_done(xprt); xprt_disconnect_done(xprt);
return -ENOTCONN; /* implies disconnect */ return -ENOTCONN; /* implies disconnect */
@ -660,7 +673,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
xprt->stat.bad_xids, xprt->stat.bad_xids,
xprt->stat.req_u, xprt->stat.req_u,
xprt->stat.bklog_u); xprt->stat.bklog_u);
seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n", seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu ",
r_xprt->rx_stats.read_chunk_count, r_xprt->rx_stats.read_chunk_count,
r_xprt->rx_stats.write_chunk_count, r_xprt->rx_stats.write_chunk_count,
r_xprt->rx_stats.reply_chunk_count, r_xprt->rx_stats.reply_chunk_count,
@ -672,6 +685,10 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
r_xprt->rx_stats.failed_marshal_count, r_xprt->rx_stats.failed_marshal_count,
r_xprt->rx_stats.bad_reply_count, r_xprt->rx_stats.bad_reply_count,
r_xprt->rx_stats.nomsg_call_count); r_xprt->rx_stats.nomsg_call_count);
seq_printf(seq, "%lu %lu %lu\n",
r_xprt->rx_stats.mrs_recovered,
r_xprt->rx_stats.mrs_orphaned,
r_xprt->rx_stats.mrs_allocated);
} }
static int static int
@ -741,7 +758,6 @@ void xprt_rdma_cleanup(void)
__func__, rc); __func__, rc);
rpcrdma_destroy_wq(); rpcrdma_destroy_wq();
frwr_destroy_recovery_wq();
rc = xprt_unregister_transport(&xprt_rdma_bc); rc = xprt_unregister_transport(&xprt_rdma_bc);
if (rc) if (rc)
@ -753,20 +769,13 @@ int xprt_rdma_init(void)
{ {
int rc; int rc;
rc = frwr_alloc_recovery_wq(); rc = rpcrdma_alloc_wq();
if (rc) if (rc)
return rc; return rc;
rc = rpcrdma_alloc_wq();
if (rc) {
frwr_destroy_recovery_wq();
return rc;
}
rc = xprt_register_transport(&xprt_rdma); rc = xprt_register_transport(&xprt_rdma);
if (rc) { if (rc) {
rpcrdma_destroy_wq(); rpcrdma_destroy_wq();
frwr_destroy_recovery_wq();
return rc; return rc;
} }
@ -774,7 +783,6 @@ int xprt_rdma_init(void)
if (rc) { if (rc) {
xprt_unregister_transport(&xprt_rdma); xprt_unregister_transport(&xprt_rdma);
rpcrdma_destroy_wq(); rpcrdma_destroy_wq();
frwr_destroy_recovery_wq();
return rc; return rc;
} }

View File

@ -379,8 +379,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
struct rpcrdma_ia *ia = &xprt->rx_ia; struct rpcrdma_ia *ia = &xprt->rx_ia;
int rc; int rc;
ia->ri_dma_mr = NULL;
ia->ri_id = rpcrdma_create_id(xprt, ia, addr); ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
if (IS_ERR(ia->ri_id)) { if (IS_ERR(ia->ri_id)) {
rc = PTR_ERR(ia->ri_id); rc = PTR_ERR(ia->ri_id);
@ -391,47 +389,29 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
ia->ri_pd = ib_alloc_pd(ia->ri_device); ia->ri_pd = ib_alloc_pd(ia->ri_device);
if (IS_ERR(ia->ri_pd)) { if (IS_ERR(ia->ri_pd)) {
rc = PTR_ERR(ia->ri_pd); rc = PTR_ERR(ia->ri_pd);
dprintk("RPC: %s: ib_alloc_pd() failed %i\n", pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
__func__, rc);
goto out2; goto out2;
} }
if (memreg == RPCRDMA_FRMR) {
if (!(ia->ri_device->attrs.device_cap_flags &
IB_DEVICE_MEM_MGT_EXTENSIONS) ||
(ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) {
dprintk("RPC: %s: FRMR registration "
"not supported by HCA\n", __func__);
memreg = RPCRDMA_MTHCAFMR;
}
}
if (memreg == RPCRDMA_MTHCAFMR) {
if (!ia->ri_device->alloc_fmr) {
dprintk("RPC: %s: MTHCAFMR registration "
"not supported by HCA\n", __func__);
rc = -EINVAL;
goto out3;
}
}
switch (memreg) { switch (memreg) {
case RPCRDMA_FRMR: case RPCRDMA_FRMR:
ia->ri_ops = &rpcrdma_frwr_memreg_ops; if (frwr_is_supported(ia)) {
break; ia->ri_ops = &rpcrdma_frwr_memreg_ops;
case RPCRDMA_ALLPHYSICAL: break;
ia->ri_ops = &rpcrdma_physical_memreg_ops; }
break; /*FALLTHROUGH*/
case RPCRDMA_MTHCAFMR: case RPCRDMA_MTHCAFMR:
ia->ri_ops = &rpcrdma_fmr_memreg_ops; if (fmr_is_supported(ia)) {
break; ia->ri_ops = &rpcrdma_fmr_memreg_ops;
break;
}
/*FALLTHROUGH*/
default: default:
printk(KERN_ERR "RPC: Unsupported memory " pr_err("rpcrdma: Unsupported memory registration mode: %d\n",
"registration mode: %d\n", memreg); memreg);
rc = -ENOMEM; rc = -EINVAL;
goto out3; goto out3;
} }
dprintk("RPC: %s: memory registration strategy is '%s'\n",
__func__, ia->ri_ops->ro_displayname);
return 0; return 0;
@ -585,8 +565,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
out2: out2:
ib_free_cq(sendcq); ib_free_cq(sendcq);
out1: out1:
if (ia->ri_dma_mr)
ib_dereg_mr(ia->ri_dma_mr);
return rc; return rc;
} }
@ -600,8 +578,6 @@ out1:
void void
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{ {
int rc;
dprintk("RPC: %s: entering, connected is %d\n", dprintk("RPC: %s: entering, connected is %d\n",
__func__, ep->rep_connected); __func__, ep->rep_connected);
@ -615,12 +591,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
ib_free_cq(ep->rep_attr.recv_cq); ib_free_cq(ep->rep_attr.recv_cq);
ib_free_cq(ep->rep_attr.send_cq); ib_free_cq(ep->rep_attr.send_cq);
if (ia->ri_dma_mr) {
rc = ib_dereg_mr(ia->ri_dma_mr);
dprintk("RPC: %s: ib_dereg_mr returned %i\n",
__func__, rc);
}
} }
/* /*
@ -777,6 +747,90 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
ib_drain_qp(ia->ri_id->qp); ib_drain_qp(ia->ri_id->qp);
} }
static void
rpcrdma_mr_recovery_worker(struct work_struct *work)
{
struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
rb_recovery_worker.work);
struct rpcrdma_mw *mw;
spin_lock(&buf->rb_recovery_lock);
while (!list_empty(&buf->rb_stale_mrs)) {
mw = list_first_entry(&buf->rb_stale_mrs,
struct rpcrdma_mw, mw_list);
list_del_init(&mw->mw_list);
spin_unlock(&buf->rb_recovery_lock);
dprintk("RPC: %s: recovering MR %p\n", __func__, mw);
mw->mw_xprt->rx_ia.ri_ops->ro_recover_mr(mw);
spin_lock(&buf->rb_recovery_lock);
}
spin_unlock(&buf->rb_recovery_lock);
}
void
rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw)
{
struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
spin_lock(&buf->rb_recovery_lock);
list_add(&mw->mw_list, &buf->rb_stale_mrs);
spin_unlock(&buf->rb_recovery_lock);
schedule_delayed_work(&buf->rb_recovery_worker, 0);
}
static void
rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
unsigned int count;
LIST_HEAD(free);
LIST_HEAD(all);
for (count = 0; count < 32; count++) {
struct rpcrdma_mw *mw;
int rc;
mw = kzalloc(sizeof(*mw), GFP_KERNEL);
if (!mw)
break;
rc = ia->ri_ops->ro_init_mr(ia, mw);
if (rc) {
kfree(mw);
break;
}
mw->mw_xprt = r_xprt;
list_add(&mw->mw_list, &free);
list_add(&mw->mw_all, &all);
}
spin_lock(&buf->rb_mwlock);
list_splice(&free, &buf->rb_mws);
list_splice(&all, &buf->rb_all);
r_xprt->rx_stats.mrs_allocated += count;
spin_unlock(&buf->rb_mwlock);
dprintk("RPC: %s: created %u MRs\n", __func__, count);
}
static void
rpcrdma_mr_refresh_worker(struct work_struct *work)
{
struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
rb_refresh_worker.work);
struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
rx_buf);
rpcrdma_create_mrs(r_xprt);
}
struct rpcrdma_req * struct rpcrdma_req *
rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
{ {
@ -793,6 +847,7 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
spin_unlock(&buffer->rb_reqslock); spin_unlock(&buffer->rb_reqslock);
req->rl_cqe.done = rpcrdma_wc_send; req->rl_cqe.done = rpcrdma_wc_send;
req->rl_buffer = &r_xprt->rx_buf; req->rl_buffer = &r_xprt->rx_buf;
INIT_LIST_HEAD(&req->rl_registered);
return req; return req;
} }
@ -832,17 +887,23 @@ int
rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
{ {
struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
int i, rc; int i, rc;
buf->rb_max_requests = r_xprt->rx_data.max_requests; buf->rb_max_requests = r_xprt->rx_data.max_requests;
buf->rb_bc_srv_max_requests = 0; buf->rb_bc_srv_max_requests = 0;
spin_lock_init(&buf->rb_lock);
atomic_set(&buf->rb_credits, 1); atomic_set(&buf->rb_credits, 1);
spin_lock_init(&buf->rb_mwlock);
spin_lock_init(&buf->rb_lock);
spin_lock_init(&buf->rb_recovery_lock);
INIT_LIST_HEAD(&buf->rb_mws);
INIT_LIST_HEAD(&buf->rb_all);
INIT_LIST_HEAD(&buf->rb_stale_mrs);
INIT_DELAYED_WORK(&buf->rb_refresh_worker,
rpcrdma_mr_refresh_worker);
INIT_DELAYED_WORK(&buf->rb_recovery_worker,
rpcrdma_mr_recovery_worker);
rc = ia->ri_ops->ro_init(r_xprt); rpcrdma_create_mrs(r_xprt);
if (rc)
goto out;
INIT_LIST_HEAD(&buf->rb_send_bufs); INIT_LIST_HEAD(&buf->rb_send_bufs);
INIT_LIST_HEAD(&buf->rb_allreqs); INIT_LIST_HEAD(&buf->rb_allreqs);
@ -862,7 +923,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
} }
INIT_LIST_HEAD(&buf->rb_recv_bufs); INIT_LIST_HEAD(&buf->rb_recv_bufs);
for (i = 0; i < buf->rb_max_requests + 2; i++) { for (i = 0; i < buf->rb_max_requests; i++) {
struct rpcrdma_rep *rep; struct rpcrdma_rep *rep;
rep = rpcrdma_create_rep(r_xprt); rep = rpcrdma_create_rep(r_xprt);
@ -918,11 +979,39 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
kfree(req); kfree(req);
} }
static void
rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf)
{
struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
rx_buf);
struct rpcrdma_ia *ia = rdmab_to_ia(buf);
struct rpcrdma_mw *mw;
unsigned int count;
count = 0;
spin_lock(&buf->rb_mwlock);
while (!list_empty(&buf->rb_all)) {
mw = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
list_del(&mw->mw_all);
spin_unlock(&buf->rb_mwlock);
ia->ri_ops->ro_release_mr(mw);
count++;
spin_lock(&buf->rb_mwlock);
}
spin_unlock(&buf->rb_mwlock);
r_xprt->rx_stats.mrs_allocated = 0;
dprintk("RPC: %s: released %u MRs\n", __func__, count);
}
void void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{ {
struct rpcrdma_ia *ia = rdmab_to_ia(buf); struct rpcrdma_ia *ia = rdmab_to_ia(buf);
cancel_delayed_work_sync(&buf->rb_recovery_worker);
while (!list_empty(&buf->rb_recv_bufs)) { while (!list_empty(&buf->rb_recv_bufs)) {
struct rpcrdma_rep *rep; struct rpcrdma_rep *rep;
@ -944,7 +1033,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
} }
spin_unlock(&buf->rb_reqslock); spin_unlock(&buf->rb_reqslock);
ia->ri_ops->ro_destroy(buf); rpcrdma_destroy_mrs(buf);
} }
struct rpcrdma_mw * struct rpcrdma_mw *
@ -962,8 +1051,17 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
spin_unlock(&buf->rb_mwlock); spin_unlock(&buf->rb_mwlock);
if (!mw) if (!mw)
pr_err("RPC: %s: no MWs available\n", __func__); goto out_nomws;
return mw; return mw;
out_nomws:
dprintk("RPC: %s: no MWs available\n", __func__);
schedule_delayed_work(&buf->rb_refresh_worker, 0);
/* Allow the reply handler and refresh worker to run */
cond_resched();
return NULL;
} }
void void
@ -978,8 +1076,6 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
/* /*
* Get a set of request/reply buffers. * Get a set of request/reply buffers.
*
* Reply buffer (if available) is attached to send buffer upon return.
*/ */
struct rpcrdma_req * struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
@ -998,13 +1094,13 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
out_reqbuf: out_reqbuf:
spin_unlock(&buffers->rb_lock); spin_unlock(&buffers->rb_lock);
pr_warn("RPC: %s: out of request buffers\n", __func__); pr_warn("rpcrdma: out of request buffers (%p)\n", buffers);
return NULL; return NULL;
out_repbuf: out_repbuf:
list_add(&req->rl_free, &buffers->rb_send_bufs);
spin_unlock(&buffers->rb_lock); spin_unlock(&buffers->rb_lock);
pr_warn("RPC: %s: out of reply buffers\n", __func__); pr_warn("rpcrdma: out of reply buffers (%p)\n", buffers);
req->rl_reply = NULL; return NULL;
return req;
} }
/* /*
@ -1060,14 +1156,6 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
* Wrappers for internal-use kmalloc memory registration, used by buffer code. * Wrappers for internal-use kmalloc memory registration, used by buffer code.
*/ */
void
rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
{
dprintk("RPC: map_one: offset %p iova %llx len %zu\n",
seg->mr_offset,
(unsigned long long)seg->mr_dma, seg->mr_dmalen);
}
/** /**
* rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
* @ia: controlling rpcrdma_ia * @ia: controlling rpcrdma_ia
@ -1150,7 +1238,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
if (rep) { if (rep) {
rc = rpcrdma_ep_post_recv(ia, ep, rep); rc = rpcrdma_ep_post_recv(ia, ep, rep);
if (rc) if (rc)
goto out; return rc;
req->rl_reply = NULL; req->rl_reply = NULL;
} }
@ -1175,10 +1263,12 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
if (rc) if (rc)
dprintk("RPC: %s: ib_post_send returned %i\n", __func__, goto out_postsend_err;
rc); return 0;
out:
return rc; out_postsend_err:
pr_err("rpcrdma: RDMA Send ib_post_send returned %i\n", rc);
return -ENOTCONN;
} }
/* /*
@ -1203,11 +1293,13 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
DMA_BIDIRECTIONAL); DMA_BIDIRECTIONAL);
rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
if (rc) if (rc)
dprintk("RPC: %s: ib_post_recv returned %i\n", __func__, goto out_postrecv;
rc); return 0;
return rc;
out_postrecv:
pr_err("rpcrdma: ib_post_recv returned %i\n", rc);
return -ENOTCONN;
} }
/** /**

View File

@ -68,7 +68,6 @@ struct rpcrdma_ia {
struct ib_device *ri_device; struct ib_device *ri_device;
struct rdma_cm_id *ri_id; struct rdma_cm_id *ri_id;
struct ib_pd *ri_pd; struct ib_pd *ri_pd;
struct ib_mr *ri_dma_mr;
struct completion ri_done; struct completion ri_done;
int ri_async_rc; int ri_async_rc;
unsigned int ri_max_frmr_depth; unsigned int ri_max_frmr_depth;
@ -172,23 +171,14 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
* o recv buffer (posted to provider) * o recv buffer (posted to provider)
* o ib_sge (also donated to provider) * o ib_sge (also donated to provider)
* o status of reply (length, success or not) * o status of reply (length, success or not)
* o bookkeeping state to get run by tasklet (list, etc) * o bookkeeping state to get run by reply handler (list, etc)
* *
* These are allocated during initialization, per-transport instance; * These are allocated during initialization, per-transport instance.
* however, the tasklet execution list itself is global, as it should
* always be pretty short.
* *
* N of these are associated with a transport instance, and stored in * N of these are associated with a transport instance, and stored in
* struct rpcrdma_buffer. N is the max number of outstanding requests. * struct rpcrdma_buffer. N is the max number of outstanding requests.
*/ */
#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE)
/* data segments + head/tail for Call + head/tail for Reply */
#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 4)
struct rpcrdma_buffer;
struct rpcrdma_rep { struct rpcrdma_rep {
struct ib_cqe rr_cqe; struct ib_cqe rr_cqe;
unsigned int rr_len; unsigned int rr_len;
@ -221,9 +211,6 @@ enum rpcrdma_frmr_state {
}; };
struct rpcrdma_frmr { struct rpcrdma_frmr {
struct scatterlist *fr_sg;
int fr_nents;
enum dma_data_direction fr_dir;
struct ib_mr *fr_mr; struct ib_mr *fr_mr;
struct ib_cqe fr_cqe; struct ib_cqe fr_cqe;
enum rpcrdma_frmr_state fr_state; enum rpcrdma_frmr_state fr_state;
@ -235,18 +222,23 @@ struct rpcrdma_frmr {
}; };
struct rpcrdma_fmr { struct rpcrdma_fmr {
struct ib_fmr *fmr; struct ib_fmr *fm_mr;
u64 *physaddrs; u64 *fm_physaddrs;
}; };
struct rpcrdma_mw { struct rpcrdma_mw {
struct list_head mw_list;
struct scatterlist *mw_sg;
int mw_nents;
enum dma_data_direction mw_dir;
union { union {
struct rpcrdma_fmr fmr; struct rpcrdma_fmr fmr;
struct rpcrdma_frmr frmr; struct rpcrdma_frmr frmr;
}; };
struct work_struct mw_work;
struct rpcrdma_xprt *mw_xprt; struct rpcrdma_xprt *mw_xprt;
struct list_head mw_list; u32 mw_handle;
u32 mw_length;
u64 mw_offset;
struct list_head mw_all; struct list_head mw_all;
}; };
@ -266,33 +258,30 @@ struct rpcrdma_mw {
* of iovs for send operations. The reason is that the iovs passed to * of iovs for send operations. The reason is that the iovs passed to
* ib_post_{send,recv} must not be modified until the work request * ib_post_{send,recv} must not be modified until the work request
* completes. * completes.
*
* NOTES:
* o RPCRDMA_MAX_SEGS is the max number of addressible chunk elements we
* marshal. The number needed varies depending on the iov lists that
* are passed to us, the memory registration mode we are in, and if
* physical addressing is used, the layout.
*/ */
/* Maximum number of page-sized "segments" per chunk list to be
* registered or invalidated. Must handle a Reply chunk:
*/
enum {
RPCRDMA_MAX_IOV_SEGS = 3,
RPCRDMA_MAX_DATA_SEGS = ((1 * 1024 * 1024) / PAGE_SIZE) + 1,
RPCRDMA_MAX_SEGS = RPCRDMA_MAX_DATA_SEGS +
RPCRDMA_MAX_IOV_SEGS,
};
struct rpcrdma_mr_seg { /* chunk descriptors */ struct rpcrdma_mr_seg { /* chunk descriptors */
struct rpcrdma_mw *rl_mw; /* registered MR */
u64 mr_base; /* registration result */
u32 mr_rkey; /* registration result */
u32 mr_len; /* length of chunk or segment */ u32 mr_len; /* length of chunk or segment */
int mr_nsegs; /* number of segments in chunk or 0 */
enum dma_data_direction mr_dir; /* segment mapping direction */
dma_addr_t mr_dma; /* segment mapping address */
size_t mr_dmalen; /* segment mapping length */
struct page *mr_page; /* owning page, if any */ struct page *mr_page; /* owning page, if any */
char *mr_offset; /* kva if no page, else offset */ char *mr_offset; /* kva if no page, else offset */
}; };
#define RPCRDMA_MAX_IOVS (2) #define RPCRDMA_MAX_IOVS (2)
struct rpcrdma_buffer;
struct rpcrdma_req { struct rpcrdma_req {
struct list_head rl_free; struct list_head rl_free;
unsigned int rl_niovs; unsigned int rl_niovs;
unsigned int rl_nchunks;
unsigned int rl_connect_cookie; unsigned int rl_connect_cookie;
struct rpc_task *rl_task; struct rpc_task *rl_task;
struct rpcrdma_buffer *rl_buffer; struct rpcrdma_buffer *rl_buffer;
@ -300,12 +289,13 @@ struct rpcrdma_req {
struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS];
struct rpcrdma_regbuf *rl_rdmabuf; struct rpcrdma_regbuf *rl_rdmabuf;
struct rpcrdma_regbuf *rl_sendbuf; struct rpcrdma_regbuf *rl_sendbuf;
struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
struct rpcrdma_mr_seg *rl_nextseg;
struct ib_cqe rl_cqe; struct ib_cqe rl_cqe;
struct list_head rl_all; struct list_head rl_all;
bool rl_backchannel; bool rl_backchannel;
struct list_head rl_registered; /* registered segments */
struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
}; };
static inline struct rpcrdma_req * static inline struct rpcrdma_req *
@ -341,6 +331,11 @@ struct rpcrdma_buffer {
struct list_head rb_allreqs; struct list_head rb_allreqs;
u32 rb_bc_max_requests; u32 rb_bc_max_requests;
spinlock_t rb_recovery_lock; /* protect rb_stale_mrs */
struct list_head rb_stale_mrs;
struct delayed_work rb_recovery_worker;
struct delayed_work rb_refresh_worker;
}; };
#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
@ -387,6 +382,9 @@ struct rpcrdma_stats {
unsigned long bad_reply_count; unsigned long bad_reply_count;
unsigned long nomsg_call_count; unsigned long nomsg_call_count;
unsigned long bcall_count; unsigned long bcall_count;
unsigned long mrs_recovered;
unsigned long mrs_orphaned;
unsigned long mrs_allocated;
}; };
/* /*
@ -395,23 +393,25 @@ struct rpcrdma_stats {
struct rpcrdma_xprt; struct rpcrdma_xprt;
struct rpcrdma_memreg_ops { struct rpcrdma_memreg_ops {
int (*ro_map)(struct rpcrdma_xprt *, int (*ro_map)(struct rpcrdma_xprt *,
struct rpcrdma_mr_seg *, int, bool); struct rpcrdma_mr_seg *, int, bool,
struct rpcrdma_mw **);
void (*ro_unmap_sync)(struct rpcrdma_xprt *, void (*ro_unmap_sync)(struct rpcrdma_xprt *,
struct rpcrdma_req *); struct rpcrdma_req *);
void (*ro_unmap_safe)(struct rpcrdma_xprt *, void (*ro_unmap_safe)(struct rpcrdma_xprt *,
struct rpcrdma_req *, bool); struct rpcrdma_req *, bool);
void (*ro_recover_mr)(struct rpcrdma_mw *);
int (*ro_open)(struct rpcrdma_ia *, int (*ro_open)(struct rpcrdma_ia *,
struct rpcrdma_ep *, struct rpcrdma_ep *,
struct rpcrdma_create_data_internal *); struct rpcrdma_create_data_internal *);
size_t (*ro_maxpages)(struct rpcrdma_xprt *); size_t (*ro_maxpages)(struct rpcrdma_xprt *);
int (*ro_init)(struct rpcrdma_xprt *); int (*ro_init_mr)(struct rpcrdma_ia *,
void (*ro_destroy)(struct rpcrdma_buffer *); struct rpcrdma_mw *);
void (*ro_release_mr)(struct rpcrdma_mw *);
const char *ro_displayname; const char *ro_displayname;
}; };
extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops; extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops;
extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops;
/* /*
* RPCRDMA transport -- encapsulates the structures above for * RPCRDMA transport -- encapsulates the structures above for
@ -446,6 +446,8 @@ extern int xprt_rdma_pad_optimize;
*/ */
int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int); int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
void rpcrdma_ia_close(struct rpcrdma_ia *); void rpcrdma_ia_close(struct rpcrdma_ia *);
bool frwr_is_supported(struct rpcrdma_ia *);
bool fmr_is_supported(struct rpcrdma_ia *);
/* /*
* Endpoint calls - xprtrdma/verbs.c * Endpoint calls - xprtrdma/verbs.c
@ -477,6 +479,8 @@ void rpcrdma_buffer_put(struct rpcrdma_req *);
void rpcrdma_recv_buffer_get(struct rpcrdma_req *); void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *);
struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
size_t, gfp_t); size_t, gfp_t);
void rpcrdma_free_regbuf(struct rpcrdma_ia *, void rpcrdma_free_regbuf(struct rpcrdma_ia *,
@ -484,9 +488,6 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *,
int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int); int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
int frwr_alloc_recovery_wq(void);
void frwr_destroy_recovery_wq(void);
int rpcrdma_alloc_wq(void); int rpcrdma_alloc_wq(void);
void rpcrdma_destroy_wq(void); void rpcrdma_destroy_wq(void);
@ -494,45 +495,12 @@ void rpcrdma_destroy_wq(void);
* Wrappers for chunk registration, shared by read/write chunk code. * Wrappers for chunk registration, shared by read/write chunk code.
*/ */
void rpcrdma_mapping_error(struct rpcrdma_mr_seg *);
static inline enum dma_data_direction static inline enum dma_data_direction
rpcrdma_data_dir(bool writing) rpcrdma_data_dir(bool writing)
{ {
return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
} }
/*
 * DMA-map one segment for the given transfer direction.
 *
 * Stores @direction and the segment length in @seg, then maps either the
 * owning page (at the in-page offset of mr_offset) or, when mr_page is not
 * set, the address held in mr_offset. The resulting handle is saved in
 * seg->mr_dma; if the device flags it as a mapping error, the segment is
 * handed to rpcrdma_mapping_error().
 */
static inline void
rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg,
		enum dma_data_direction direction)
{
	seg->mr_dir = direction;
	seg->mr_dmalen = seg->mr_len;

	if (!seg->mr_page) {
		seg->mr_dma = ib_dma_map_single(device, seg->mr_offset,
						seg->mr_dmalen, seg->mr_dir);
	} else {
		seg->mr_dma = ib_dma_map_page(device, seg->mr_page,
					      offset_in_page(seg->mr_offset),
					      seg->mr_dmalen, seg->mr_dir);
	}

	if (!ib_dma_mapping_error(device, seg->mr_dma))
		return;
	rpcrdma_mapping_error(seg);
}
/*
 * Release the DMA mapping recorded in @seg (mr_dma, mr_dmalen, mr_dir).
 *
 * Uses the page unmap call when the segment has an owning page and the
 * single-buffer unmap call otherwise.
 */
static inline void
rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg)
{
	if (!seg->mr_page) {
		ib_dma_unmap_single(device, seg->mr_dma,
				    seg->mr_dmalen, seg->mr_dir);
		return;
	}

	ib_dma_unmap_page(device, seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
}
/* /*
* RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
*/ */

View File

@ -124,7 +124,7 @@ static struct ctl_table xs_tunables_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec_minmax, .proc_handler = proc_dointvec_minmax,
.extra1 = &xprt_min_resvport_limit, .extra1 = &xprt_min_resvport_limit,
.extra2 = &xprt_max_resvport_limit .extra2 = &xprt_max_resvport
}, },
{ {
.procname = "max_resvport", .procname = "max_resvport",
@ -132,7 +132,7 @@ static struct ctl_table xs_tunables_table[] = {
.maxlen = sizeof(unsigned int), .maxlen = sizeof(unsigned int),
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec_minmax, .proc_handler = proc_dointvec_minmax,
.extra1 = &xprt_min_resvport_limit, .extra1 = &xprt_min_resvport,
.extra2 = &xprt_max_resvport_limit .extra2 = &xprt_max_resvport_limit
}, },
{ {
@ -642,6 +642,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
struct xdr_buf *xdr = &req->rq_snd_buf; struct xdr_buf *xdr = &req->rq_snd_buf;
bool zerocopy = true; bool zerocopy = true;
bool vm_wait = false;
int status; int status;
int sent; int sent;
@ -677,15 +678,33 @@ static int xs_tcp_send_request(struct rpc_task *task)
return 0; return 0;
} }
WARN_ON_ONCE(sent == 0 && status == 0);
if (status == -EAGAIN ) {
/*
* Return EAGAIN if we're sure we're hitting the
* socket send buffer limits.
*/
if (test_bit(SOCK_NOSPACE, &transport->sock->flags))
break;
/*
* Did we hit a memory allocation failure?
*/
if (sent == 0) {
status = -ENOBUFS;
if (vm_wait)
break;
/* Retry, knowing now that we're below the
* socket send buffer limit
*/
vm_wait = true;
}
continue;
}
if (status < 0) if (status < 0)
break; break;
if (sent == 0) { vm_wait = false;
status = -EAGAIN;
break;
}
} }
if (status == -EAGAIN && sk_stream_is_writeable(transport->inet))
status = -ENOBUFS;
switch (status) { switch (status) {
case -ENOTSOCK: case -ENOTSOCK:
@ -755,11 +774,19 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s
sk->sk_error_report = transport->old_error_report; sk->sk_error_report = transport->old_error_report;
} }
/*
 * Clear the XPRT_SOCK_DATA_READY bit in the socket transport's sock_state
 * word for the sock_xprt that embeds @xprt.
 */
static void xs_sock_reset_state_flags(struct rpc_xprt *xprt)
{
	struct sock_xprt *sx;

	sx = container_of(xprt, struct sock_xprt, xprt);
	clear_bit(XPRT_SOCK_DATA_READY, &sx->sock_state);
}
static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt)
{ {
smp_mb__before_atomic(); smp_mb__before_atomic();
clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
clear_bit(XPRT_CLOSING, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state);
xs_sock_reset_state_flags(xprt);
smp_mb__after_atomic(); smp_mb__after_atomic();
} }
@ -962,10 +989,13 @@ static void xs_local_data_receive(struct sock_xprt *transport)
goto out; goto out;
for (;;) { for (;;) {
skb = skb_recv_datagram(sk, 0, 1, &err); skb = skb_recv_datagram(sk, 0, 1, &err);
if (skb == NULL) if (skb != NULL) {
xs_local_data_read_skb(&transport->xprt, sk, skb);
skb_free_datagram(sk, skb);
continue;
}
if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
break; break;
xs_local_data_read_skb(&transport->xprt, sk, skb);
skb_free_datagram(sk, skb);
} }
out: out:
mutex_unlock(&transport->recv_mutex); mutex_unlock(&transport->recv_mutex);
@ -1043,10 +1073,13 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
goto out; goto out;
for (;;) { for (;;) {
skb = skb_recv_datagram(sk, 0, 1, &err); skb = skb_recv_datagram(sk, 0, 1, &err);
if (skb == NULL) if (skb != NULL) {
xs_udp_data_read_skb(&transport->xprt, sk, skb);
skb_free_datagram(sk, skb);
continue;
}
if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
break; break;
xs_udp_data_read_skb(&transport->xprt, sk, skb);
skb_free_datagram(sk, skb);
} }
out: out:
mutex_unlock(&transport->recv_mutex); mutex_unlock(&transport->recv_mutex);
@ -1074,7 +1107,14 @@ static void xs_data_ready(struct sock *sk)
if (xprt != NULL) { if (xprt != NULL) {
struct sock_xprt *transport = container_of(xprt, struct sock_xprt *transport = container_of(xprt,
struct sock_xprt, xprt); struct sock_xprt, xprt);
queue_work(rpciod_workqueue, &transport->recv_worker); transport->old_data_ready(sk);
/* Any data means we had a useful conversation, so
* then we don't need to delay the next reconnect
*/
if (xprt->reestablish_timeout)
xprt->reestablish_timeout = 0;
if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
queue_work(xprtiod_workqueue, &transport->recv_worker);
} }
read_unlock_bh(&sk->sk_callback_lock); read_unlock_bh(&sk->sk_callback_lock);
} }
@ -1474,10 +1514,15 @@ static void xs_tcp_data_receive(struct sock_xprt *transport)
for (;;) { for (;;) {
lock_sock(sk); lock_sock(sk);
read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
release_sock(sk); if (read <= 0) {
if (read <= 0) clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
break; release_sock(sk);
total += read; if (!test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
break;
} else {
release_sock(sk);
total += read;
}
rd_desc.count = 65536; rd_desc.count = 65536;
} }
out: out:
@ -1492,34 +1537,6 @@ static void xs_tcp_data_receive_workfn(struct work_struct *work)
xs_tcp_data_receive(transport); xs_tcp_data_receive(transport);
} }
/**
* xs_tcp_data_ready - "data ready" callback for TCP sockets
* @sk: socket with data to read
*
*/
static void xs_tcp_data_ready(struct sock *sk)
{
struct sock_xprt *transport;
struct rpc_xprt *xprt;
dprintk("RPC: xs_tcp_data_ready...\n");
read_lock_bh(&sk->sk_callback_lock);
if (!(xprt = xprt_from_sock(sk)))
goto out;
transport = container_of(xprt, struct sock_xprt, xprt);
/* Any data means we had a useful conversation, so
* the we don't need to delay the next reconnect
*/
if (xprt->reestablish_timeout)
xprt->reestablish_timeout = 0;
queue_work(rpciod_workqueue, &transport->recv_worker);
out:
read_unlock_bh(&sk->sk_callback_lock);
}
/** /**
* xs_tcp_state_change - callback to handle TCP socket state changes * xs_tcp_state_change - callback to handle TCP socket state changes
* @sk: socket whose state has changed * @sk: socket whose state has changed
@ -1714,7 +1731,7 @@ static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task)
static unsigned short xs_get_random_port(void) static unsigned short xs_get_random_port(void)
{ {
unsigned short range = xprt_max_resvport - xprt_min_resvport; unsigned short range = xprt_max_resvport - xprt_min_resvport + 1;
unsigned short rand = (unsigned short) prandom_u32() % range; unsigned short rand = (unsigned short) prandom_u32() % range;
return rand + xprt_min_resvport; return rand + xprt_min_resvport;
} }
@ -2241,7 +2258,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
xs_save_old_callbacks(transport, sk); xs_save_old_callbacks(transport, sk);
sk->sk_user_data = xprt; sk->sk_user_data = xprt;
sk->sk_data_ready = xs_tcp_data_ready; sk->sk_data_ready = xs_data_ready;
sk->sk_state_change = xs_tcp_state_change; sk->sk_state_change = xs_tcp_state_change;
sk->sk_write_space = xs_tcp_write_space; sk->sk_write_space = xs_tcp_write_space;
sock_set_flag(sk, SOCK_FASYNC); sock_set_flag(sk, SOCK_FASYNC);
@ -2380,7 +2397,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
/* Start by resetting any existing state */ /* Start by resetting any existing state */
xs_reset_transport(transport); xs_reset_transport(transport);
queue_delayed_work(rpciod_workqueue, queue_delayed_work(xprtiod_workqueue,
&transport->connect_worker, &transport->connect_worker,
xprt->reestablish_timeout); xprt->reestablish_timeout);
xprt->reestablish_timeout <<= 1; xprt->reestablish_timeout <<= 1;
@ -2390,7 +2407,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO; xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO;
} else { } else {
dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); dprintk("RPC: xs_connect scheduled xprt %p\n", xprt);
queue_delayed_work(rpciod_workqueue, queue_delayed_work(xprtiod_workqueue,
&transport->connect_worker, 0); &transport->connect_worker, 0);
} }
} }
@ -3153,8 +3170,12 @@ static int param_set_uint_minmax(const char *val,
static int param_set_portnr(const char *val, const struct kernel_param *kp) static int param_set_portnr(const char *val, const struct kernel_param *kp)
{ {
return param_set_uint_minmax(val, kp, if (kp->arg == &xprt_min_resvport)
return param_set_uint_minmax(val, kp,
RPC_MIN_RESVPORT, RPC_MIN_RESVPORT,
xprt_max_resvport);
return param_set_uint_minmax(val, kp,
xprt_min_resvport,
RPC_MAX_RESVPORT); RPC_MAX_RESVPORT);
} }