From ddeaa6379d50a530f9f57b3d12f7940e079b052c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 15 Oct 2016 13:59:03 -0700 Subject: [PATCH 01/64] sunrpc & nfs: Add and use dprintk_cont macros Allow line continuations to work properly with KERN_CONT. Signed-off-by: Joe Perches [Anna: Add fallback dprintk_cont() for when CONFIG_SUNRPC_DEBUG=n] Signed-off-by: Anna Schumaker --- fs/nfs/write.c | 6 ++-- include/linux/sunrpc/debug.h | 56 +++++++++++++++++++++++++----------- 2 files changed, 42 insertions(+), 20 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b00d53d13d47..ad4219a41f25 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1787,7 +1787,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) if (status < 0) { nfs_context_set_write_error(req->wb_context, status); nfs_inode_remove_request(req); - dprintk(", error = %d\n", status); + dprintk_cont(", error = %d\n", status); goto next; } @@ -1796,11 +1796,11 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) if (!nfs_write_verifier_cmp(&req->wb_verf, &data->verf.verifier)) { /* We have a match */ nfs_inode_remove_request(req); - dprintk(" OK\n"); + dprintk_cont(" OK\n"); goto next; } /* We have a mismatch. Write the page again */ - dprintk(" mismatch\n"); + dprintk_cont(" mismatch\n"); nfs_mark_request_dirty(req); set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags); next: diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index 59a7889e15db..8da0f37f3bdc 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -20,33 +20,55 @@ extern unsigned int nfsd_debug; extern unsigned int nlm_debug; #endif -#define dprintk(args...) dfprintk(FACILITY, ## args) -#define dprintk_rcu(args...) dfprintk_rcu(FACILITY, ## args) +#define dprintk(fmt, ...) \ + dfprintk(FACILITY, fmt, ##__VA_ARGS__) +#define dprintk_cont(fmt, ...) \ + dfprintk_cont(FACILITY, fmt, ##__VA_ARGS__) +#define dprintk_rcu(fmt, ...) \ + dfprintk_rcu(FACILITY, fmt, ##__VA_ARGS__) +#define dprintk_rcu_cont(fmt, ...) \ + dfprintk_rcu_cont(FACILITY, fmt, ##__VA_ARGS__) #undef ifdebug #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define ifdebug(fac) if (unlikely(rpc_debug & RPCDBG_##fac)) -# define dfprintk(fac, args...) \ - do { \ - ifdebug(fac) \ - printk(KERN_DEFAULT args); \ - } while (0) +# define dfprintk(fac, fmt, ...) \ +do { \ + ifdebug(fac) \ + printk(KERN_DEFAULT fmt, ##__VA_ARGS__); \ +} while (0) -# define dfprintk_rcu(fac, args...) \ - do { \ - ifdebug(fac) { \ - rcu_read_lock(); \ - printk(KERN_DEFAULT args); \ - rcu_read_unlock(); \ - } \ - } while (0) +# define dfprintk_cont(fac, fmt, ...) \ +do { \ + ifdebug(fac) \ + printk(KERN_CONT fmt, ##__VA_ARGS__); \ +} while (0) + +# define dfprintk_rcu(fac, fmt, ...) \ +do { \ + ifdebug(fac) { \ + rcu_read_lock(); \ + printk(KERN_DEFAULT fmt, ##__VA_ARGS__); \ + rcu_read_unlock(); \ + } \ +} while (0) + +# define dfprintk_rcu_cont(fac, fmt, ...) \ +do { \ + ifdebug(fac) { \ + rcu_read_lock(); \ + printk(KERN_CONT fmt, ##__VA_ARGS__); \ + rcu_read_unlock(); \ + } \ +} while (0) # define RPC_IFDEBUG(x) x #else # define ifdebug(fac) if (0) -# define dfprintk(fac, args...) do {} while (0) -# define dfprintk_rcu(fac, args...) do {} while (0) +# define dfprintk(fac, fmt, ...) do {} while (0) +# define dfprintk_cont(fac, fmt, ...) do {} while (0) +# define dfprintk_rcu(fac, fmt, ...) do {} while (0) # define RPC_IFDEBUG(x) #endif From 4c3ffd058c28bb1aac30a68d1f02de4a2077823e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 6 Jan 2017 14:19:58 +1100 Subject: [PATCH 02/64] SUNRPC: two small improvements to rpcauth shrinker. 1/ If we find an entry that is too young to be pruned, return SHRINK_STOP to ensure we don't get called again. This is more correct, and avoids wasting a little CPU time. Prior to 3.12, it can prevent drop_slab() from spinning indefinitely. 2/ Return a precise number from rpcauth_cache_shrink_count(), rather than rounding down to a multiple of 100 (of whatever sysctl_vfs_cache_pressure is). This ensures that when we "echo 3 > /proc/sys/vm/drop_caches", this cache is still purged, even if it has fewer than 100 entires. Neither of these are really important, they just make behaviour more predicatable, which can be helpful when debugging related issues. Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- net/sunrpc/auth.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 2bff63a73cf8..d8f17ea7932e 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -464,8 +464,10 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan) * Note that the cred_unused list must be time-ordered. */ if (time_in_range(cred->cr_expire, expired, jiffies) && - test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) + test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) { + freed = SHRINK_STOP; break; + } list_del_init(&cred->cr_lru); number_cred_unused--; @@ -520,7 +522,7 @@ static unsigned long rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - return (number_cred_unused / 100) * sysctl_vfs_cache_pressure; + return number_cred_unused * sysctl_vfs_cache_pressure / 100; } static void From 6f6e3c09c0dde7fbaf36f02ffa97bbb13fecd637 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 13 Jan 2017 11:04:27 +1100 Subject: [PATCH 03/64] NFS: tidy up nfs_show_mountd_netid This function is a bit clumsy, incorrectly producing ",mountproto=" if mountd_protocol is 0 and !showdefaults, and duplicating the code for reporting "auto". Tidy it up so that it only makes a single seq_printf() call, and more obviously does the right thing. Fixes: ee671b016fbf ("NFS: convert proto= option to use netids rather than a protoname") Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- fs/nfs/super.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 6bca17883b93..54e0f9f2dd94 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -531,39 +531,32 @@ static void nfs_show_mountd_netid(struct seq_file *m, struct nfs_server *nfss, int showdefaults) { struct sockaddr *sap = (struct sockaddr *) &nfss->mountd_address; + char *proto = NULL; - seq_printf(m, ",mountproto="); switch (sap->sa_family) { case AF_INET: switch (nfss->mountd_protocol) { case IPPROTO_UDP: - seq_printf(m, RPCBIND_NETID_UDP); + proto = RPCBIND_NETID_UDP; break; case IPPROTO_TCP: - seq_printf(m, RPCBIND_NETID_TCP); + proto = RPCBIND_NETID_TCP; break; - default: - if (showdefaults) - seq_printf(m, "auto"); } break; case AF_INET6: switch (nfss->mountd_protocol) { case IPPROTO_UDP: - seq_printf(m, RPCBIND_NETID_UDP6); + proto = RPCBIND_NETID_UDP6; break; case IPPROTO_TCP: - seq_printf(m, RPCBIND_NETID_TCP6); + proto = RPCBIND_NETID_TCP6; break; - default: - if (showdefaults) - seq_printf(m, "auto"); } break; - default: - if (showdefaults) - seq_printf(m, "auto"); } + if (proto || showdefaults) + seq_printf(m, ",mountproto=%s", proto ?: "auto"); } static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, From efc6f4aa742d7e8a67e44d11ab18b1c865880b4c Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Mon, 9 Jan 2017 15:14:33 -0500 Subject: [PATCH 04/64] NFS: Move nfs4_get_session() into nfs4_session.h This puts session related functions together in the same space. I only keep one version of this function, since this variable will always be NULL when using NFS v4.0. Signed-off-by: Anna Schumaker --- fs/nfs/nfs42proc.c | 1 + fs/nfs/nfs4_fs.h | 10 ---------- fs/nfs/nfs4session.h | 5 +++++ 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index d12ff9385f49..b752d85ad8c6 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -12,6 +12,7 @@ #include "nfs42.h" #include "iostat.h" #include "pnfs.h" +#include "nfs4session.h" #include "internal.h" #define NFSDBG_FACILITY NFSDBG_PROC diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 665165833660..04ffea093d56 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -273,11 +273,6 @@ extern int nfs4_set_rw_stateid(nfs4_stateid *stateid, fmode_t fmode); #if defined(CONFIG_NFS_V4_1) -static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) -{ - return server->nfs_client->cl_session; -} - extern int nfs41_setup_sequence(struct nfs4_session *session, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, struct rpc_task *task); @@ -357,11 +352,6 @@ nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, hdr->args.stable = NFS_FILE_SYNC; } #else /* CONFIG_NFS_v4_1 */ -static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) -{ - return NULL; -} - static inline bool is_ds_only_client(struct nfs_client *clp) { diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index dae385500005..22cb55015b2b 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -103,6 +103,11 @@ static inline bool nfs4_test_locked_slot(const struct nfs4_slot_table *tbl, return !!test_bit(slotid, tbl->used_slots); } +static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) +{ + return server->nfs_client->cl_session; +} + #if defined(CONFIG_NFS_V4_1) extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, u32 target_highest_slotid); From 172d9de15a0da9b6d52d2bae21a362689d82d35a Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Thu, 14 May 2015 10:56:10 -0400 Subject: [PATCH 05/64] NFS: Change nfs4_get_session() to take an nfs_client structure pNFS only has access to the nfs_client structure, and not the nfs_server, so we need to make this change so the function can be used by pNFS as well. Signed-off-by: Anna Schumaker --- fs/nfs/nfs42proc.c | 5 ++--- fs/nfs/nfs4proc.c | 10 +++++----- fs/nfs/nfs4session.h | 4 ++-- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index b752d85ad8c6..b03010eebafd 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -332,9 +332,8 @@ nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata) } nfs4_stateid_copy(&data->args.stateid, &lo->plh_stateid); spin_unlock(&inode->i_lock); - nfs41_setup_sequence(nfs4_get_session(server), &data->args.seq_args, - &data->res.seq_res, task); - + nfs41_setup_sequence(nfs4_get_session(server->nfs_client), + &data->args.seq_args, &data->res.seq_res, task); } static void diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0a0eaecf9676..46cb55a81dbf 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -952,7 +952,7 @@ static int nfs4_setup_sequence(const struct nfs_server *server, struct nfs4_sequence_res *res, struct rpc_task *task) { - struct nfs4_session *session = nfs4_get_session(server); + struct nfs4_session *session = nfs4_get_session(server->nfs_client); int ret = 0; if (!session) @@ -972,7 +972,7 @@ static int nfs4_setup_sequence(const struct nfs_server *server, static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) { struct nfs4_call_sync_data *data = calldata; - struct nfs4_session *session = nfs4_get_session(data->seq_server); + struct nfs4_session *session = nfs4_get_session(data->seq_server->nfs_client); dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); @@ -8397,7 +8397,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) { struct nfs4_layoutget *lgp = calldata; struct nfs_server *server = NFS_SERVER(lgp->args.inode); - struct nfs4_session *session = nfs4_get_session(server); + struct nfs4_session *session = nfs4_get_session(server->nfs_client); dprintk("--> %s\n", __func__); nfs41_setup_sequence(session, &lgp->args.seq_args, @@ -8794,7 +8794,7 @@ static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata) { struct nfs4_layoutcommit_data *data = calldata; struct nfs_server *server = NFS_SERVER(data->args.inode); - struct nfs4_session *session = nfs4_get_session(server); + struct nfs4_session *session = nfs4_get_session(server->nfs_client); nfs41_setup_sequence(session, &data->args.seq_args, @@ -9120,7 +9120,7 @@ struct nfs_free_stateid_data { static void nfs41_free_stateid_prepare(struct rpc_task *task, void *calldata) { struct nfs_free_stateid_data *data = calldata; - nfs41_setup_sequence(nfs4_get_session(data->server), + nfs41_setup_sequence(nfs4_get_session(data->server->nfs_client), &data->args.seq_args, &data->res.seq_res, task); diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 22cb55015b2b..f6378d95b1b5 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -103,9 +103,9 @@ static inline bool nfs4_test_locked_slot(const struct nfs4_slot_table *tbl, return !!test_bit(slotid, tbl->used_slots); } -static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) +static inline struct nfs4_session *nfs4_get_session(const struct nfs_client *clp) { - return server->nfs_client->cl_session; + return clp->cl_session; } #if defined(CONFIG_NFS_V4_1) From 42e1cca7e91e1a7502694cfd18857ed243f54174 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Mon, 9 Jan 2017 15:48:22 -0500 Subject: [PATCH 06/64] NFS: Change nfs4_setup_sequence() to take an nfs_client structure I want to have all callers use this function, rather than calling the NFS v4.0 and v4.1 versions directly. This includes pNFS, which only has access to the nfs_client structure in some places. Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 46cb55a81dbf..f62d58cb3014 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -947,16 +947,16 @@ out_sleep: } EXPORT_SYMBOL_GPL(nfs41_setup_sequence); -static int nfs4_setup_sequence(const struct nfs_server *server, +static int nfs4_setup_sequence(const struct nfs_client *client, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, struct rpc_task *task) { - struct nfs4_session *session = nfs4_get_session(server->nfs_client); + struct nfs4_session *session = nfs4_get_session(client); int ret = 0; if (!session) - return nfs40_setup_sequence(server->nfs_client->cl_slot_tbl, + return nfs40_setup_sequence(client->cl_slot_tbl, args, res, task); dprintk("--> %s clp %p session %p sr_slot %u\n", @@ -993,13 +993,12 @@ static const struct rpc_call_ops nfs41_call_sync_ops = { #else /* !CONFIG_NFS_V4_1 */ -static int nfs4_setup_sequence(const struct nfs_server *server, +static int nfs4_setup_sequence(const struct nfs_client *client, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, struct rpc_task *task) { - return nfs40_setup_sequence(server->nfs_client->cl_slot_tbl, - args, res, task); + return nfs40_setup_sequence(client->cl_slot_tbl, args, res, task); } static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) @@ -1025,7 +1024,7 @@ EXPORT_SYMBOL_GPL(nfs4_sequence_done); static void nfs40_call_sync_prepare(struct rpc_task *task, void *calldata) { struct nfs4_call_sync_data *data = calldata; - nfs4_setup_sequence(data->seq_server, + nfs4_setup_sequence(data->seq_server->nfs_client, data->seq_args, data->seq_res, task); } @@ -2172,7 +2171,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); } data->timestamp = jiffies; - if (nfs4_setup_sequence(data->o_arg.server, + if (nfs4_setup_sequence(data->o_arg.server->nfs_client, &data->o_arg.seq_args, &data->o_res.seq_res, task) != 0) @@ -3234,7 +3233,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) else if (calldata->arg.bitmask == NULL) calldata->res.fattr = NULL; calldata->timestamp = jiffies; - if (nfs4_setup_sequence(NFS_SERVER(inode), + if (nfs4_setup_sequence(NFS_SERVER(inode)->nfs_client, &calldata->arg.seq_args, &calldata->res.seq_res, task) != 0) @@ -4114,7 +4113,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) { - nfs4_setup_sequence(NFS_SB(data->dentry->d_sb), + nfs4_setup_sequence(NFS_SB(data->dentry->d_sb)->nfs_client, &data->args.seq_args, &data->res.seq_res, task); @@ -4148,7 +4147,7 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir) static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) { - nfs4_setup_sequence(NFS_SERVER(data->old_dir), + nfs4_setup_sequence(NFS_SERVER(data->old_dir)->nfs_client, &data->args.seq_args, &data->res.seq_res, task); @@ -4723,7 +4722,7 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr, static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_header *hdr) { - if (nfs4_setup_sequence(NFS_SERVER(hdr->inode), + if (nfs4_setup_sequence(NFS_SERVER(hdr->inode)->nfs_client, &hdr->args.seq_args, &hdr->res.seq_res, task)) @@ -4822,7 +4821,7 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr, static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) { - nfs4_setup_sequence(NFS_SERVER(data->inode), + nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client, &data->args.seq_args, &data->res.seq_res, task); @@ -5747,7 +5746,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) return; - nfs4_setup_sequence(d_data->res.server, + nfs4_setup_sequence(d_data->res.server->nfs_client, &d_data->args.seq_args, &d_data->res.seq_res, task); @@ -5989,7 +5988,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) goto out_no_action; } calldata->timestamp = jiffies; - if (nfs4_setup_sequence(calldata->server, + if (nfs4_setup_sequence(calldata->server->nfs_client, &calldata->arg.seq_args, &calldata->res.seq_res, task) != 0) @@ -6174,7 +6173,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) goto out_release_open_seqid; } data->timestamp = jiffies; - if (nfs4_setup_sequence(data->server, + if (nfs4_setup_sequence(data->server->nfs_client, &data->arg.seq_args, &data->res.seq_res, task) == 0) From 6de7e12f53a154d35d9aceae718b764ada23e430 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Mon, 9 Jan 2017 16:51:52 -0500 Subject: [PATCH 07/64] NFS: Use nfs4_setup_sequence() everywhere This does the right thing depending on if we have a session, rather than needing to handle this manually in multiple places. Signed-off-by: Anna Schumaker --- fs/nfs/filelayout/filelayout.c | 6 ++-- fs/nfs/flexfilelayout/flexfilelayout.c | 40 ++++++++------------------ fs/nfs/nfs42proc.c | 4 +-- fs/nfs/nfs4_fs.h | 5 +--- fs/nfs/nfs4proc.c | 30 +++++++++---------- 5 files changed, 33 insertions(+), 52 deletions(-) diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index a3fc48ba4931..7aff350f15b1 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -305,7 +305,7 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) } hdr->pgio_done_cb = filelayout_read_done_cb; - if (nfs41_setup_sequence(hdr->ds_clp->cl_session, + if (nfs4_setup_sequence(hdr->ds_clp, &hdr->args.seq_args, &hdr->res.seq_res, task)) @@ -403,7 +403,7 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data) rpc_exit(task, 0); return; } - if (nfs41_setup_sequence(hdr->ds_clp->cl_session, + if (nfs4_setup_sequence(hdr->ds_clp, &hdr->args.seq_args, &hdr->res.seq_res, task)) @@ -438,7 +438,7 @@ static void filelayout_commit_prepare(struct rpc_task *task, void *data) { struct nfs_commit_data *wdata = data; - nfs41_setup_sequence(wdata->ds_clp->cl_session, + nfs4_setup_sequence(wdata->ds_clp, &wdata->args.seq_args, &wdata->res.seq_res, task); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 0ca4af8cca5d..cc9064974104 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1384,30 +1384,14 @@ static void ff_layout_read_prepare_v3(struct rpc_task *task, void *data) rpc_call_start(task); } -static int ff_layout_setup_sequence(struct nfs_client *ds_clp, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - struct rpc_task *task) -{ - if (ds_clp->cl_session) - return nfs41_setup_sequence(ds_clp->cl_session, - args, - res, - task); - return nfs40_setup_sequence(ds_clp->cl_slot_tbl, - args, - res, - task); -} - static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; - if (ff_layout_setup_sequence(hdr->ds_clp, - &hdr->args.seq_args, - &hdr->res.seq_res, - task)) + if (nfs4_setup_sequence(hdr->ds_clp, + &hdr->args.seq_args, + &hdr->res.seq_res, + task)) return; if (ff_layout_read_prepare_common(task, hdr)) @@ -1578,10 +1562,10 @@ static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; - if (ff_layout_setup_sequence(hdr->ds_clp, - &hdr->args.seq_args, - &hdr->res.seq_res, - task)) + if (nfs4_setup_sequence(hdr->ds_clp, + &hdr->args.seq_args, + &hdr->res.seq_res, + task)) return; if (ff_layout_write_prepare_common(task, hdr)) @@ -1667,10 +1651,10 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data) { struct nfs_commit_data *wdata = data; - if (ff_layout_setup_sequence(wdata->ds_clp, - &wdata->args.seq_args, - &wdata->res.seq_res, - task)) + if (nfs4_setup_sequence(wdata->ds_clp, + &wdata->args.seq_args, + &wdata->res.seq_res, + task)) return; ff_layout_commit_prepare_common(task, data); } diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index b03010eebafd..98cf58341066 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -332,8 +332,8 @@ nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata) } nfs4_stateid_copy(&data->args.stateid, &lo->plh_stateid); spin_unlock(&inode->i_lock); - nfs41_setup_sequence(nfs4_get_session(server->nfs_client), - &data->args.seq_args, &data->res.seq_res, task); + nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, + &data->res.seq_res, task); } static void diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 04ffea093d56..af285cc27ccf 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -273,9 +273,6 @@ extern int nfs4_set_rw_stateid(nfs4_stateid *stateid, fmode_t fmode); #if defined(CONFIG_NFS_V4_1) -extern int nfs41_setup_sequence(struct nfs4_session *session, - struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, - struct rpc_task *task); extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *); extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *); extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *); @@ -456,7 +453,7 @@ extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); extern void nfs_release_seqid(struct nfs_seqid *seqid); extern void nfs_free_seqid(struct nfs_seqid *seqid); -extern int nfs40_setup_sequence(struct nfs4_slot_table *tbl, +extern int nfs4_setup_sequence(const struct nfs_client *client, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, struct rpc_task *task); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f62d58cb3014..4ced1c964b1e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -622,10 +622,10 @@ static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args) args->sa_privileged = 1; } -int nfs40_setup_sequence(struct nfs4_slot_table *tbl, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - struct rpc_task *task) +static int nfs40_setup_sequence(struct nfs4_slot_table *tbl, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct rpc_task *task) { struct nfs4_slot *slot; @@ -662,7 +662,6 @@ out_sleep: spin_unlock(&tbl->slot_tbl_lock); return -EAGAIN; } -EXPORT_SYMBOL_GPL(nfs40_setup_sequence); static void nfs40_sequence_free_slot(struct nfs4_sequence_res *res) { @@ -882,7 +881,7 @@ int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) } EXPORT_SYMBOL_GPL(nfs4_sequence_done); -int nfs41_setup_sequence(struct nfs4_session *session, +static int nfs41_setup_sequence(struct nfs4_session *session, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, struct rpc_task *task) @@ -945,12 +944,11 @@ out_sleep: spin_unlock(&tbl->slot_tbl_lock); return -EAGAIN; } -EXPORT_SYMBOL_GPL(nfs41_setup_sequence); -static int nfs4_setup_sequence(const struct nfs_client *client, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - struct rpc_task *task) +int nfs4_setup_sequence(const struct nfs_client *client, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct rpc_task *task) { struct nfs4_session *session = nfs4_get_session(client); int ret = 0; @@ -968,6 +966,7 @@ static int nfs4_setup_sequence(const struct nfs_client *client, dprintk("<-- %s status=%d\n", __func__, ret); return ret; } +EXPORT_SYMBOL_GPL(nfs4_setup_sequence); static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) { @@ -993,13 +992,14 @@ static const struct rpc_call_ops nfs41_call_sync_ops = { #else /* !CONFIG_NFS_V4_1 */ -static int nfs4_setup_sequence(const struct nfs_client *client, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - struct rpc_task *task) +int nfs4_setup_sequence(const struct nfs_client *client, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct rpc_task *task) { return nfs40_setup_sequence(client->cl_slot_tbl, args, res, task); } +EXPORT_SYMBOL_GPL(nfs4_setup_sequence); static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) { From 7981c8a659143173882dda43b3056a777faeeb7b Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 10 Jan 2017 11:39:53 -0500 Subject: [PATCH 08/64] NFS: Create a single nfs4_setup_sequence() function The inline ifdef lets us put everything in a single place, rather than having two (very similar) versions of this function. Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 76 ++++++++++++++++++----------------------------- 1 file changed, 29 insertions(+), 47 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4ced1c964b1e..e6f563c1689e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -945,37 +945,14 @@ out_sleep: return -EAGAIN; } -int nfs4_setup_sequence(const struct nfs_client *client, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - struct rpc_task *task) -{ - struct nfs4_session *session = nfs4_get_session(client); - int ret = 0; - - if (!session) - return nfs40_setup_sequence(client->cl_slot_tbl, - args, res, task); - - dprintk("--> %s clp %p session %p sr_slot %u\n", - __func__, session->clp, session, res->sr_slot ? - res->sr_slot->slot_nr : NFS4_NO_SLOT); - - ret = nfs41_setup_sequence(session, args, res, task); - - dprintk("<-- %s status=%d\n", __func__, ret); - return ret; -} -EXPORT_SYMBOL_GPL(nfs4_setup_sequence); - static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) { struct nfs4_call_sync_data *data = calldata; - struct nfs4_session *session = nfs4_get_session(data->seq_server->nfs_client); dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); - nfs41_setup_sequence(session, data->seq_args, data->seq_res, task); + nfs4_setup_sequence(data->seq_server->nfs_client, + data->seq_args, data->seq_res, task); } static void nfs41_call_sync_done(struct rpc_task *task, void *calldata) @@ -992,15 +969,6 @@ static const struct rpc_call_ops nfs41_call_sync_ops = { #else /* !CONFIG_NFS_V4_1 */ -int nfs4_setup_sequence(const struct nfs_client *client, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - struct rpc_task *task) -{ - return nfs40_setup_sequence(client->cl_slot_tbl, args, res, task); -} -EXPORT_SYMBOL_GPL(nfs4_setup_sequence); - static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) { return nfs40_sequence_done(task, res); @@ -1021,6 +989,22 @@ EXPORT_SYMBOL_GPL(nfs4_sequence_done); #endif /* !CONFIG_NFS_V4_1 */ +int nfs4_setup_sequence(const struct nfs_client *client, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct rpc_task *task) +{ +#if defined(CONFIG_NFS_V4_1) + struct nfs4_session *session = nfs4_get_session(client); + + if (session) + return nfs41_setup_sequence(session, args, res, task); +#endif /* CONFIG_NFS_V4_1 */ + return nfs40_setup_sequence(client->cl_slot_tbl, args, res, task); + +} +EXPORT_SYMBOL_GPL(nfs4_setup_sequence); + static void nfs40_call_sync_prepare(struct rpc_task *task, void *calldata) { struct nfs4_call_sync_data *data = calldata; @@ -2047,8 +2031,8 @@ static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata) { struct nfs4_opendata *data = calldata; - nfs40_setup_sequence(data->o_arg.server->nfs_client->cl_slot_tbl, - &data->c_arg.seq_args, &data->c_res.seq_res, task); + nfs4_setup_sequence(data->o_arg.server->nfs_client, + &data->c_arg.seq_args, &data->c_res.seq_res, task); } static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) @@ -6639,8 +6623,8 @@ static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata { struct nfs_release_lockowner_data *data = calldata; struct nfs_server *server = data->server; - nfs40_setup_sequence(server->nfs_client->cl_slot_tbl, - &data->args.seq_args, &data->res.seq_res, task); + nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, + &data->res.seq_res, task); data->args.lock_owner.clientid = server->nfs_client->cl_clientid; data->timestamp = jiffies; } @@ -7830,7 +7814,7 @@ static void nfs4_get_lease_time_prepare(struct rpc_task *task, dprintk("--> %s\n", __func__); /* just setup sequence, do not trigger session recovery since we're invoked within one */ - nfs41_setup_sequence(data->clp->cl_session, + nfs4_setup_sequence(data->clp, &data->args->la_seq_args, &data->res->lr_seq_res, task); @@ -8201,7 +8185,7 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data) args = task->tk_msg.rpc_argp; res = task->tk_msg.rpc_resp; - nfs41_setup_sequence(clp->cl_session, args, res, task); + nfs4_setup_sequence(clp, args, res, task); } static const struct rpc_call_ops nfs41_sequence_ops = { @@ -8289,7 +8273,7 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data) { struct nfs4_reclaim_complete_data *calldata = data; - nfs41_setup_sequence(calldata->clp->cl_session, + nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args, &calldata->res.seq_res, task); @@ -8396,10 +8380,9 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) { struct nfs4_layoutget *lgp = calldata; struct nfs_server *server = NFS_SERVER(lgp->args.inode); - struct nfs4_session *session = nfs4_get_session(server->nfs_client); dprintk("--> %s\n", __func__); - nfs41_setup_sequence(session, &lgp->args.seq_args, + nfs4_setup_sequence(server->nfs_client, &lgp->args.seq_args, &lgp->res.seq_res, task); dprintk("<-- %s\n", __func__); } @@ -8643,7 +8626,7 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) struct nfs4_layoutreturn *lrp = calldata; dprintk("--> %s\n", __func__); - nfs41_setup_sequence(lrp->clp->cl_session, + nfs4_setup_sequence(lrp->clp, &lrp->args.seq_args, &lrp->res.seq_res, task); @@ -8793,9 +8776,8 @@ static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata) { struct nfs4_layoutcommit_data *data = calldata; struct nfs_server *server = NFS_SERVER(data->args.inode); - struct nfs4_session *session = nfs4_get_session(server->nfs_client); - nfs41_setup_sequence(session, + nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, &data->res.seq_res, task); @@ -9119,7 +9101,7 @@ struct nfs_free_stateid_data { static void nfs41_free_stateid_prepare(struct rpc_task *task, void *calldata) { struct nfs_free_stateid_data *data = calldata; - nfs41_setup_sequence(nfs4_get_session(data->server->nfs_client), + nfs4_setup_sequence(data->server->nfs_client, &data->args.seq_args, &data->res.seq_res, task); From 9dd9107f330c5d371c62b460975a32d8bd5712b4 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 10 Jan 2017 12:01:46 -0500 Subject: [PATCH 09/64] NFS: Move slot-already-allocated check into nfs_setup_sequence() This puts the check in a single place, rather than needing to implement it twice for v4.0 and v4.1. Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e6f563c1689e..dcb2f73c7d4c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -629,10 +629,6 @@ static int nfs40_setup_sequence(struct nfs4_slot_table *tbl, { struct nfs4_slot *slot; - /* slot already allocated? */ - if (res->sr_slot != NULL) - goto out_start; - spin_lock(&tbl->slot_tbl_lock); if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) goto out_sleep; @@ -649,7 +645,6 @@ static int nfs40_setup_sequence(struct nfs4_slot_table *tbl, args->sa_slot = slot; res->sr_slot = slot; -out_start: rpc_call_start(task); return 0; @@ -890,10 +885,6 @@ static int nfs41_setup_sequence(struct nfs4_session *session, struct nfs4_slot_table *tbl; dprintk("--> %s\n", __func__); - /* slot already allocated? */ - if (res->sr_slot != NULL) - goto out_success; - tbl = &session->fc_slot_table; task->tk_timeout = 0; @@ -931,7 +922,6 @@ static int nfs41_setup_sequence(struct nfs4_session *session, */ res->sr_status = 1; trace_nfs4_setup_sequence(session, args); -out_success: rpc_call_start(task); return 0; out_sleep: @@ -996,12 +986,21 @@ int nfs4_setup_sequence(const struct nfs_client *client, { #if defined(CONFIG_NFS_V4_1) struct nfs4_session *session = nfs4_get_session(client); +#endif /* CONFIG_NFS_V4_1 */ + /* slot already allocated? */ + if (res->sr_slot != NULL) + goto out_start; + +#if defined(CONFIG_NFS_V4_1) if (session) return nfs41_setup_sequence(session, args, res, task); #endif /* CONFIG_NFS_V4_1 */ return nfs40_setup_sequence(client->cl_slot_tbl, args, res, task); +out_start: + rpc_call_start(task); + return 0; } EXPORT_SYMBOL_GPL(nfs4_setup_sequence); From 6994cdd798568a0ddb8e0a85e2af24dbe655c341 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 10 Jan 2017 16:13:27 -0500 Subject: [PATCH 10/64] NFS: Lock the slot table from a single place during setup sequence Rather than implementing this twice for NFS v4.0 and v4.1 Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index dcb2f73c7d4c..681cb5e77da2 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -629,7 +629,6 @@ static int nfs40_setup_sequence(struct nfs4_slot_table *tbl, { struct nfs4_slot *slot; - spin_lock(&tbl->slot_tbl_lock); if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) goto out_sleep; @@ -639,13 +638,10 @@ static int nfs40_setup_sequence(struct nfs4_slot_table *tbl, task->tk_timeout = HZ >> 2; goto out_sleep; } - spin_unlock(&tbl->slot_tbl_lock); slot->privileged = args->sa_privileged ? 1 : 0; args->sa_slot = slot; res->sr_slot = slot; - - rpc_call_start(task); return 0; out_sleep: @@ -654,7 +650,6 @@ out_sleep: NULL, RPC_PRIORITY_PRIVILEGED); else rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); - spin_unlock(&tbl->slot_tbl_lock); return -EAGAIN; } @@ -889,7 +884,6 @@ static int nfs41_setup_sequence(struct nfs4_session *session, task->tk_timeout = 0; - spin_lock(&tbl->slot_tbl_lock); if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state) && !args->sa_privileged) { /* The state manager will wait until the slot table is empty */ @@ -905,7 +899,6 @@ static int nfs41_setup_sequence(struct nfs4_session *session, dprintk("<-- %s: no free slots\n", __func__); goto out_sleep; } - spin_unlock(&tbl->slot_tbl_lock); slot->privileged = args->sa_privileged ? 1 : 0; args->sa_slot = slot; @@ -922,7 +915,6 @@ static int nfs41_setup_sequence(struct nfs4_session *session, */ res->sr_status = 1; trace_nfs4_setup_sequence(session, args); - rpc_call_start(task); return 0; out_sleep: /* Privileged tasks are queued with top priority */ @@ -931,7 +923,6 @@ out_sleep: NULL, RPC_PRIORITY_PRIVILEGED); else rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); - spin_unlock(&tbl->slot_tbl_lock); return -EAGAIN; } @@ -984,19 +975,27 @@ int nfs4_setup_sequence(const struct nfs_client *client, struct nfs4_sequence_res *res, struct rpc_task *task) { -#if defined(CONFIG_NFS_V4_1) struct nfs4_session *session = nfs4_get_session(client); -#endif /* CONFIG_NFS_V4_1 */ + struct nfs4_slot_table *tbl = session ? &session->fc_slot_table : + client->cl_slot_tbl; + int ret; /* slot already allocated? */ if (res->sr_slot != NULL) goto out_start; + spin_lock(&tbl->slot_tbl_lock); + #if defined(CONFIG_NFS_V4_1) if (session) - return nfs41_setup_sequence(session, args, res, task); + ret = nfs41_setup_sequence(session, args, res, task); + else #endif /* CONFIG_NFS_V4_1 */ - return nfs40_setup_sequence(client->cl_slot_tbl, args, res, task); + ret = nfs40_setup_sequence(client->cl_slot_tbl, args, res, task); + + spin_unlock(&tbl->slot_tbl_lock); + if (ret < 0) + return ret; out_start: rpc_call_start(task); From 0dcee8bb749e5de43b180ccfcb0c5005aa529c92 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 10 Jan 2017 16:29:54 -0500 Subject: [PATCH 11/64] NFS: Handle setup sequence task rescheduling in a single place Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 681cb5e77da2..9759d6f9e1a5 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -630,27 +630,19 @@ static int nfs40_setup_sequence(struct nfs4_slot_table *tbl, struct nfs4_slot *slot; if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) - goto out_sleep; + return -EAGAIN; slot = nfs4_alloc_slot(tbl); if (IS_ERR(slot)) { if (slot == ERR_PTR(-ENOMEM)) task->tk_timeout = HZ >> 2; - goto out_sleep; + return -EAGAIN; } slot->privileged = args->sa_privileged ? 1 : 0; args->sa_slot = slot; res->sr_slot = slot; return 0; - -out_sleep: - if (args->sa_privileged) - rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task, - NULL, RPC_PRIORITY_PRIVILEGED); - else - rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); - return -EAGAIN; } static void nfs40_sequence_free_slot(struct nfs4_sequence_res *res) @@ -888,7 +880,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session, !args->sa_privileged) { /* The state manager will wait until the slot table is empty */ dprintk("%s session is draining\n", __func__); - goto out_sleep; + return -EAGAIN; } slot = nfs4_alloc_slot(tbl); @@ -897,7 +889,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session, if (slot == ERR_PTR(-ENOMEM)) task->tk_timeout = HZ >> 2; dprintk("<-- %s: no free slots\n", __func__); - goto out_sleep; + return -EAGAIN; } slot->privileged = args->sa_privileged ? 1 : 0; @@ -916,14 +908,6 @@ static int nfs41_setup_sequence(struct nfs4_session *session, res->sr_status = 1; trace_nfs4_setup_sequence(session, args); return 0; -out_sleep: - /* Privileged tasks are queued with top priority */ - if (args->sa_privileged) - rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task, - NULL, RPC_PRIORITY_PRIVILEGED); - else - rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); - return -EAGAIN; } static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) @@ -993,13 +977,22 @@ int nfs4_setup_sequence(const struct nfs_client *client, #endif /* CONFIG_NFS_V4_1 */ ret = nfs40_setup_sequence(client->cl_slot_tbl, args, res, task); + if (ret == -EAGAIN) + goto out_sleep; spin_unlock(&tbl->slot_tbl_lock); - if (ret < 0) - return ret; out_start: rpc_call_start(task); return 0; + +out_sleep: + if (args->sa_privileged) + rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task, + NULL, RPC_PRIORITY_PRIVILEGED); + else + rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); + spin_unlock(&tbl->slot_tbl_lock); + return -EAGAIN; } EXPORT_SYMBOL_GPL(nfs4_setup_sequence); From 76ee03540f316948c3bc89fc76ded86c88e887a5 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 10 Jan 2017 16:49:31 -0500 Subject: [PATCH 12/64] NFS: Check if the slot table is draining from nfs4_setup_sequence() Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 9759d6f9e1a5..a74e1a4e3264 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -629,9 +629,6 @@ static int nfs40_setup_sequence(struct nfs4_slot_table *tbl, { struct nfs4_slot *slot; - if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) - return -EAGAIN; - slot = nfs4_alloc_slot(tbl); if (IS_ERR(slot)) { if (slot == ERR_PTR(-ENOMEM)) @@ -874,15 +871,6 @@ static int nfs41_setup_sequence(struct nfs4_session *session, dprintk("--> %s\n", __func__); tbl = &session->fc_slot_table; - task->tk_timeout = 0; - - if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state) && - !args->sa_privileged) { - /* The state manager will wait until the slot table is empty */ - dprintk("%s session is draining\n", __func__); - return -EAGAIN; - } - slot = nfs4_alloc_slot(tbl); if (IS_ERR(slot)) { /* If out of memory, try again in 1/4 second */ @@ -960,15 +948,22 @@ int nfs4_setup_sequence(const struct nfs_client *client, struct rpc_task *task) { struct nfs4_session *session = nfs4_get_session(client); - struct nfs4_slot_table *tbl = session ? &session->fc_slot_table : - client->cl_slot_tbl; + struct nfs4_slot_table *tbl = client->cl_slot_tbl; int ret; /* slot already allocated? */ if (res->sr_slot != NULL) goto out_start; + if (session) { + tbl = &session->fc_slot_table; + task->tk_timeout = 0; + } + spin_lock(&tbl->slot_tbl_lock); + /* The state manager will wait until the slot table is empty */ + if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) + goto out_sleep; #if defined(CONFIG_NFS_V4_1) if (session) From 3d35808b1de48e0c478668b3177fcd3360aae543 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 11 Jan 2017 10:54:04 -0500 Subject: [PATCH 13/64] NFS: Merge the remaining setup_sequence functions This creates a single place for all the work to happen, using the presence of a session to determine if extra values need to be set. Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 87 +++++++++++------------------------------------ 1 file changed, 20 insertions(+), 67 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a74e1a4e3264..f6a362ffcbd4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -622,26 +622,6 @@ static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args) args->sa_privileged = 1; } -static int nfs40_setup_sequence(struct nfs4_slot_table *tbl, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - struct rpc_task *task) -{ - struct nfs4_slot *slot; - - slot = nfs4_alloc_slot(tbl); - if (IS_ERR(slot)) { - if (slot == ERR_PTR(-ENOMEM)) - task->tk_timeout = HZ >> 2; - return -EAGAIN; - } - - slot->privileged = args->sa_privileged ? 1 : 0; - args->sa_slot = slot; - res->sr_slot = slot; - return 0; -} - static void nfs40_sequence_free_slot(struct nfs4_sequence_res *res) { struct nfs4_slot *slot = res->sr_slot; @@ -860,44 +840,6 @@ int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) } EXPORT_SYMBOL_GPL(nfs4_sequence_done); -static int nfs41_setup_sequence(struct nfs4_session *session, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - struct rpc_task *task) -{ - struct nfs4_slot *slot; - struct nfs4_slot_table *tbl; - - dprintk("--> %s\n", __func__); - tbl = &session->fc_slot_table; - - slot = nfs4_alloc_slot(tbl); - if (IS_ERR(slot)) { - /* If out of memory, try again in 1/4 second */ - if (slot == ERR_PTR(-ENOMEM)) - task->tk_timeout = HZ >> 2; - dprintk("<-- %s: no free slots\n", __func__); - return -EAGAIN; - } - - slot->privileged = args->sa_privileged ? 1 : 0; - args->sa_slot = slot; - - dprintk("<-- %s slotid=%u seqid=%u\n", __func__, - slot->slot_nr, slot->seq_nr); - - res->sr_slot = slot; - res->sr_timestamp = jiffies; - res->sr_status_flags = 0; - /* - * sr_status is only set in decode_sequence, and so will remain - * set to 1 if an rpc level failure occurs. - */ - res->sr_status = 1; - trace_nfs4_setup_sequence(session, args); - return 0; -} - static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) { struct nfs4_call_sync_data *data = calldata; @@ -949,7 +891,7 @@ int nfs4_setup_sequence(const struct nfs_client *client, { struct nfs4_session *session = nfs4_get_session(client); struct nfs4_slot_table *tbl = client->cl_slot_tbl; - int ret; + struct nfs4_slot *slot; /* slot already allocated? */ if (res->sr_slot != NULL) @@ -965,17 +907,28 @@ int nfs4_setup_sequence(const struct nfs_client *client, if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) goto out_sleep; -#if defined(CONFIG_NFS_V4_1) - if (session) - ret = nfs41_setup_sequence(session, args, res, task); - else -#endif /* CONFIG_NFS_V4_1 */ - ret = nfs40_setup_sequence(client->cl_slot_tbl, args, res, task); - - if (ret == -EAGAIN) + slot = nfs4_alloc_slot(tbl); + if (IS_ERR(slot)) { + /* Try again in 1/4 second */ + if (slot == ERR_PTR(-ENOMEM)) + task->tk_timeout = HZ >> 2; goto out_sleep; + } spin_unlock(&tbl->slot_tbl_lock); + slot->privileged = args->sa_privileged ? 1 : 0; + args->sa_slot = slot; + + res->sr_slot = slot; + if (session) { + res->sr_timestamp = jiffies; + res->sr_status_flags = 0; + res->sr_status = 1; +#ifdef CONFIG_NFS_V4_1 + trace_nfs4_setup_sequence(session, args); +#endif /* CONFIG_NFS_V4_1 */ + } + out_start: rpc_call_start(task); return 0; From ad05cc0f04341216923895c05e2c364ef34f1bb4 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 11 Jan 2017 13:37:06 -0500 Subject: [PATCH 14/64] NFS: Make trace_nfs4_setup_sequence() available to NFS v4.0 This tracepoint displays information about the slot that was chosen for the RPC, in addition to session information. This could be useful information for debugging, and we can set the session id hash to 0 to indicate that there is no session. Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 4 +-- fs/nfs/nfs4session.h | 2 ++ fs/nfs/nfs4trace.h | 64 ++++++++++++++++++++++---------------------- 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f6a362ffcbd4..ae03f976fbd3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -924,11 +924,9 @@ int nfs4_setup_sequence(const struct nfs_client *client, res->sr_timestamp = jiffies; res->sr_status_flags = 0; res->sr_status = 1; -#ifdef CONFIG_NFS_V4_1 - trace_nfs4_setup_sequence(session, args); -#endif /* CONFIG_NFS_V4_1 */ } + trace_nfs4_setup_sequence(session, args); out_start: rpc_call_start(task); return 0; diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index f6378d95b1b5..dfae4880eacb 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -175,6 +175,8 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp) return 0; } +#define nfs_session_id_hash(session) (0) + #endif /* defined(CONFIG_NFS_V4_1) */ #endif /* IS_ENABLED(CONFIG_NFS_V4) */ #endif /* __LINUX_FS_NFS_NFS4SESSION_H */ diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index cfb8f7ce5cf6..845d0eadefc9 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -241,38 +241,6 @@ DEFINE_NFS4_CLIENTID_EVENT(nfs4_bind_conn_to_session); DEFINE_NFS4_CLIENTID_EVENT(nfs4_sequence); DEFINE_NFS4_CLIENTID_EVENT(nfs4_reclaim_complete); -TRACE_EVENT(nfs4_setup_sequence, - TP_PROTO( - const struct nfs4_session *session, - const struct nfs4_sequence_args *args - ), - TP_ARGS(session, args), - - TP_STRUCT__entry( - __field(unsigned int, session) - __field(unsigned int, slot_nr) - __field(unsigned int, seq_nr) - __field(unsigned int, highest_used_slotid) - ), - - TP_fast_assign( - const struct nfs4_slot *sa_slot = args->sa_slot; - __entry->session = nfs_session_id_hash(&session->sess_id); - __entry->slot_nr = sa_slot->slot_nr; - __entry->seq_nr = sa_slot->seq_nr; - __entry->highest_used_slotid = - sa_slot->table->highest_used_slotid; - ), - TP_printk( - "session=0x%08x slot_nr=%u seq_nr=%u " - "highest_used_slotid=%u", - __entry->session, - __entry->slot_nr, - __entry->seq_nr, - __entry->highest_used_slotid - ) -); - #define show_nfs4_sequence_status_flags(status) \ __print_flags((unsigned long)status, "|", \ { SEQ4_STATUS_CB_PATH_DOWN, "CB_PATH_DOWN" }, \ @@ -382,6 +350,38 @@ TRACE_EVENT(nfs4_cb_sequence, ); #endif /* CONFIG_NFS_V4_1 */ +TRACE_EVENT(nfs4_setup_sequence, + TP_PROTO( + const struct nfs4_session *session, + const struct nfs4_sequence_args *args + ), + TP_ARGS(session, args), + + TP_STRUCT__entry( + __field(unsigned int, session) + __field(unsigned int, slot_nr) + __field(unsigned int, seq_nr) + __field(unsigned int, highest_used_slotid) + ), + + TP_fast_assign( + const struct nfs4_slot *sa_slot = args->sa_slot; + __entry->session = session ? nfs_session_id_hash(&session->sess_id) : 0; + __entry->slot_nr = sa_slot->slot_nr; + __entry->seq_nr = sa_slot->seq_nr; + __entry->highest_used_slotid = + sa_slot->table->highest_used_slotid; + ), + TP_printk( + "session=0x%08x slot_nr=%u seq_nr=%u " + "highest_used_slotid=%u", + __entry->session, + __entry->slot_nr, + __entry->seq_nr, + __entry->highest_used_slotid + ) +); + DECLARE_EVENT_CLASS(nfs4_open_event, TP_PROTO( const struct nfs_open_context *ctx, From d9b67e1e499b056a83d2db6046d74652cf836998 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 11 Jan 2017 15:04:25 -0500 Subject: [PATCH 15/64] NFS: Fix inconsistent indentation in nfs4proc.c Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 56 +++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ae03f976fbd3..328eda9e6db1 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2209,15 +2209,15 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) data->is_recover = 1; } task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return PTR_ERR(task); - status = nfs4_wait_for_completion_rpc_task(task); - if (status != 0) { - data->cancelled = 1; - smp_wmb(); - } else - status = data->rpc_status; - rpc_put_task(task); + if (IS_ERR(task)) + return PTR_ERR(task); + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) { + data->cancelled = 1; + smp_wmb(); + } else + status = data->rpc_status; + rpc_put_task(task); return status; } @@ -2226,7 +2226,7 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data) { struct inode *dir = d_inode(data->dir); struct nfs_openres *o_res = &data->o_res; - int status; + int status; status = nfs4_run_open_task(data, 1); if (status != 0 || !data->rpc_done) @@ -2860,12 +2860,12 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_open_context *ctx) { struct nfs_server *server = NFS_SERVER(inode); - struct rpc_message msg = { + struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], .rpc_argp = arg, .rpc_resp = res, .rpc_cred = cred, - }; + }; struct rpc_cred *delegation_cred = NULL; unsigned long timestamp = jiffies; fmode_t fmode; @@ -2913,18 +2913,18 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, { struct nfs_server *server = NFS_SERVER(inode); struct nfs4_state *state = ctx ? ctx->state : NULL; - struct nfs_setattrargs arg = { - .fh = NFS_FH(inode), - .iap = sattr, + struct nfs_setattrargs arg = { + .fh = NFS_FH(inode), + .iap = sattr, .server = server, .bitmask = server->attr_bitmask, .label = ilabel, - }; - struct nfs_setattrres res = { + }; + struct nfs_setattrres res = { .fattr = fattr, .label = olabel, .server = server, - }; + }; struct nfs4_exception exception = { .state = state, .inode = inode, @@ -3038,7 +3038,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) } } - /* hmm. we are done with the inode, and in the process of freeing + /* hmm. we are done with the inode, and in the process of freeing * the state_owner. we keep this around to process errors */ switch (task->tk_status) { @@ -4895,8 +4895,8 @@ static int buf_to_pages_noslab(const void *buf, size_t buflen, if (newpage == NULL) goto unwind; memcpy(page_address(newpage), buf, len); - buf += len; - buflen -= len; + buf += len; + buflen -= len; *pages++ = newpage; rc++; } while (buflen != 0); @@ -5219,8 +5219,8 @@ static int _nfs4_do_set_security_label(struct inode *inode, struct nfs_server *server = NFS_SERVER(inode); const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; struct nfs_setattrargs arg = { - .fh = NFS_FH(inode), - .iap = &sattr, + .fh = NFS_FH(inode), + .iap = &sattr, .server = server, .bitmask = bitmask, .label = ilabel, @@ -5231,9 +5231,9 @@ static int _nfs4_do_set_security_label(struct inode *inode, .server = server, }; struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], - .rpc_argp = &arg, - .rpc_resp = &res, + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], + .rpc_argp = &arg, + .rpc_resp = &res, }; int status; @@ -5779,8 +5779,8 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKT], - .rpc_argp = &arg, - .rpc_resp = &res, + .rpc_argp = &arg, + .rpc_resp = &res, .rpc_cred = state->owner->so_cred, }; struct nfs4_lock_state *lsp; From eeea5361634eeef9a09b7b5ae449b41623c23886 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 11 Jan 2017 16:01:21 -0500 Subject: [PATCH 16/64] NFS: Clean up _nfs4_is_integrity_protected() We can cut out the if statement and return the results of the comparison directly. Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 328eda9e6db1..3f5d4fd4e95a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -577,12 +577,7 @@ nfs4_async_handle_error(struct rpc_task *task, struct nfs_server *server, static bool _nfs4_is_integrity_protected(struct nfs_client *clp) { rpc_authflavor_t flavor = clp->cl_rpcclient->cl_auth->au_flavor; - - if (flavor == RPC_AUTH_GSS_KRB5I || - flavor == RPC_AUTH_GSS_KRB5P) - return true; - - return false; + return (flavor == RPC_AUTH_GSS_KRB5I) || (flavor == RPC_AUTH_GSS_KRB5P); } static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp) From 820bf85ce249ec5b4da9f6aaefa3f5491b23b587 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 11 Jan 2017 15:01:43 -0500 Subject: [PATCH 17/64] NFS: Remove nfs4_wait_for_completion_rpc_task() Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3f5d4fd4e95a..b577e172c95b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1245,14 +1245,6 @@ static void nfs4_opendata_put(struct nfs4_opendata *p) kref_put(&p->kref, nfs4_opendata_free); } -static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task) -{ - int ret; - - ret = rpc_wait_for_completion_task(task); - return ret; -} - static bool nfs4_mode_match_open_stateid(struct nfs4_state *state, fmode_t fmode) { @@ -2039,7 +2031,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); - status = nfs4_wait_for_completion_rpc_task(task); + status = rpc_wait_for_completion_task(task); if (status != 0) { data->cancelled = 1; smp_wmb(); @@ -2206,7 +2198,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); - status = nfs4_wait_for_completion_rpc_task(task); + status = rpc_wait_for_completion_task(task); if (status != 0) { data->cancelled = 1; smp_wmb(); @@ -5732,7 +5724,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co return PTR_ERR(task); if (!issync) goto out; - status = nfs4_wait_for_completion_rpc_task(task); + status = rpc_wait_for_completion_task(task); if (status != 0) goto out; status = data->rpc_status; @@ -6002,7 +5994,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * status = PTR_ERR(task); if (IS_ERR(task)) goto out; - status = nfs4_wait_for_completion_rpc_task(task); + status = rpc_wait_for_completion_task(task); rpc_put_task(task); out: request->fl_flags = fl_flags; @@ -6229,7 +6221,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); - ret = nfs4_wait_for_completion_rpc_task(task); + ret = rpc_wait_for_completion_task(task); if (ret == 0) { ret = data->rpc_status; if (ret) @@ -8297,7 +8289,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, status = PTR_ERR(task); goto out; } - status = nfs4_wait_for_completion_rpc_task(task); + status = rpc_wait_for_completion_task(task); if (status == 0) status = task->tk_status; rpc_put_task(task); @@ -8529,7 +8521,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags) task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return ERR_CAST(task); - status = nfs4_wait_for_completion_rpc_task(task); + status = rpc_wait_for_completion_task(task); if (status == 0) { status = nfs4_layoutget_handle_exception(task, lgp, &exception); *timeout = exception.timeout; From 37a8484aef09638b9eead7070cb0ee27adb74ceb Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 11 Jan 2017 16:08:35 -0500 Subject: [PATCH 18/64] NFS: Return errors directly in _nfs4_opendata_reclaim_to_nfs4_state() There is no need for a goto just to return an error code without any cleanup. Returning the error directly helps to clean up the code. Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b577e172c95b..3e892b05e62b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1639,17 +1639,15 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) int ret; if (!data->rpc_done) { - if (data->rpc_status) { - ret = data->rpc_status; - goto err; - } + if (data->rpc_status) + return ERR_PTR(data->rpc_status); /* cached opens have already been processed */ goto update; } ret = nfs_refresh_inode(inode, &data->f_attr); if (ret) - goto err; + return ERR_PTR(ret); if (data->o_res.delegation_type != 0) nfs4_opendata_check_deleg(data, state); @@ -1659,9 +1657,6 @@ update: atomic_inc(&state->count); return state; -err: - return ERR_PTR(ret); - } static struct nfs4_state * From d7e9825848cede4cf7589b63029b76428ae76958 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 11 Jan 2017 16:13:29 -0500 Subject: [PATCH 19/64] NFS: Remove an extra if in _nfs4_recover_proc_open() It's simpler just to return the status unconditionally Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3e892b05e62b..031b867aaefa 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2216,11 +2216,8 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data) nfs_fattr_map_and_free_names(NFS_SERVER(dir), &data->f_attr); - if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { + if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) status = _nfs4_proc_open_confirm(data); - if (status != 0) - return status; - } return status; } From 334f87dd11b7a0ed8f3d3b223fb5cede201e210a Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 11 Jan 2017 16:17:17 -0500 Subject: [PATCH 20/64] NFS: Remove nfs4_recover_expired_lease() This function doesn't add much, since all it does is access the server's nfs_client variable. Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 031b867aaefa..baf968990475 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2311,11 +2311,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) return 0; } -static int nfs4_recover_expired_lease(struct nfs_server *server) -{ - return nfs4_client_recover_expired_lease(server->nfs_client); -} - /* * OPEN_EXPIRED: * reclaim state on the server after a network partition. @@ -2693,7 +2688,7 @@ static int _nfs4_do_open(struct inode *dir, dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); goto out_err; } - status = nfs4_recover_expired_lease(server); + status = nfs4_client_recover_expired_lease(server->nfs_client); if (status != 0) goto err_put_state_owner; if (d_really_is_positive(dentry)) From 9df1336ca42c16622f1cae5dff246a0c9c644514 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 11 Jan 2017 16:30:08 -0500 Subject: [PATCH 21/64] NFS: Remove unnecessary goto in nfs4_lookup_root_sec() Once again, it's easier and cleaner just to return the error directly. Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index baf968990475..6c40944cb83d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3416,16 +3416,11 @@ static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandl .pseudoflavor = flavor, }; struct rpc_auth *auth; - int ret; auth = rpcauth_create(&auth_args, server->client); - if (IS_ERR(auth)) { - ret = -EACCES; - goto out; - } - ret = nfs4_lookup_root(server, fhandle, info); -out: - return ret; + if (IS_ERR(auth)) + return -EACCES; + return nfs4_lookup_root(server, fhandle, info); } /* From 81b68de493a3c60517ef0435d2635b531ed590b9 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 11 Jan 2017 16:41:34 -0500 Subject: [PATCH 22/64] NFS: No need to set and return status in nfs41_lock_expired() Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6c40944cb83d..81390e97c8d5 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6282,8 +6282,7 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) || test_bit(NFS_LOCK_LOST, &lsp->ls_flags)) return 0; - status = nfs4_lock_expired(state, request); - return status; + return nfs4_lock_expired(state, request); } #endif From 49ad0145c3a81612482ecc064ce6ab908ae3bb1a Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 11 Jan 2017 16:51:59 -0500 Subject: [PATCH 23/64] NFS: Clean up nfs41_same_server_scope() The function is cleaner this way, since we can use the result of memcmp() directly Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 81390e97c8d5..bf014a2deefd 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7120,11 +7120,9 @@ static bool nfs41_same_server_scope(struct nfs41_server_scope *a, struct nfs41_server_scope *b) { - if (a->server_scope_sz == b->server_scope_sz && - memcmp(a->server_scope, b->server_scope, a->server_scope_sz) == 0) - return true; - - return false; + if (a->server_scope_sz != b->server_scope_sz) + return false; + return memcmp(a->server_scope, b->server_scope, a->server_scope_sz) == 0; } static void From 045c551947a8c6e787437d696262b562520ff2f0 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 11 Jan 2017 16:59:48 -0500 Subject: [PATCH 24/64] NFS: Return the comparison result directly in nfs41_match_stateid() Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index bf014a2deefd..aeaa6678b071 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -9116,10 +9116,8 @@ static bool nfs41_match_stateid(const nfs4_stateid *s1, if (s1->seqid == s2->seqid) return true; - if (s1->seqid == 0 || s2->seqid == 0) - return true; - return false; + return s1->seqid == 0 || s2->seqid == 0; } #endif /* CONFIG_NFS_V4_1 */ From 2e54b9b1b0ace6997aa4f9811af0811b2d8160f2 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 12 Jan 2017 15:38:06 +0000 Subject: [PATCH 25/64] pNFS/flexfiles: Make local symbol layoutreturn_ops static Fixes the following sparse warning: fs/nfs/flexfilelayout/flexfilelayout.c:2114:34: warning: symbol 'layoutreturn_ops' was not declared. Should it be static? Signed-off-by: Wei Yongjun Signed-off-by: Anna Schumaker --- fs/nfs/flexfilelayout/flexfilelayout.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index cc9064974104..6104696325be 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -2076,7 +2076,7 @@ ff_layout_free_layoutreturn(struct nfs4_xdr_opaque_data *args) kfree(ff_args); } -const struct nfs4_xdr_opaque_ops layoutreturn_ops = { +static const struct nfs4_xdr_opaque_ops layoutreturn_ops = { .encode = ff_layout_encode_layoutreturn, .free = ff_layout_free_layoutreturn, }; From 68e33bd6bbb79819e5cb7bce26559191b144c465 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 12 Jan 2017 15:39:18 +0000 Subject: [PATCH 26/64] NFSv4: Fix warning for using 0 as NULL Fixes the following sparse warning: fs/nfs/nfs4state.c:862:60: warning: Using plain integer as NULL pointer Signed-off-by: Wei Yongjun Signed-off-by: Anna Schumaker --- fs/nfs/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index daeb94e3acd4..8156bad6b441 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -868,7 +868,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_ for(;;) { spin_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, owner, 0); + lsp = __nfs4_find_lock_state(state, owner, NULL); if (lsp != NULL) break; if (new != NULL) { From 920b4530fb80430ff30ef83efe21ba1fa5623731 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 1 Feb 2017 00:00:07 -0500 Subject: [PATCH 27/64] NFS: nfs_rename() handle -ERESTARTSYS dentry left behind An interrupted rename will leave the old dentry behind if the rename succeeds. Fix this by moving the final local work of the rename to rpc_call_done so that the results of the RENAME can always be handled, even if the original process has already returned with -ERESTARTSYS. Signed-off-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index fad81041f5ab..fb499a3f21b5 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2002,6 +2002,29 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) } EXPORT_SYMBOL_GPL(nfs_link); +static void +nfs_complete_rename(struct rpc_task *task, struct nfs_renamedata *data) +{ + struct dentry *old_dentry = data->old_dentry; + struct dentry *new_dentry = data->new_dentry; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + + nfs_mark_for_revalidate(old_inode); + + switch (task->tk_status) { + case 0: + if (new_inode != NULL) + nfs_drop_nlink(new_inode); + d_move(old_dentry, new_dentry); + nfs_set_verifier(new_dentry, + nfs_save_change_attribute(data->new_dir)); + break; + case -ENOENT: + nfs_dentry_handle_enoent(old_dentry); + } +} + /* * RENAME * FIXME: Some nfsds, like the Linux user space nfsd, may generate a @@ -2084,7 +2107,8 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_inode != NULL) NFS_PROTO(new_inode)->return_delegation(new_inode); - task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); + task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, + nfs_complete_rename); if (IS_ERR(task)) { error = PTR_ERR(task); goto out; @@ -2094,21 +2118,11 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error == 0) error = task->tk_status; rpc_put_task(task); - nfs_mark_for_revalidate(old_inode); out: if (rehash) d_rehash(rehash); trace_nfs_rename_exit(old_dir, old_dentry, new_dir, new_dentry, error); - if (!error) { - if (new_inode != NULL) - nfs_drop_nlink(new_inode); - d_move(old_dentry, new_dentry); - nfs_set_verifier(new_dentry, - nfs_save_change_attribute(new_dir)); - } else if (error == -ENOENT) - nfs_dentry_handle_enoent(old_dentry); - /* new dentry created? */ if (dentry) dput(dentry); From 600424e3d91be7c6c8c38b95db713983a6307efa Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 4 Jan 2017 04:22:33 +1000 Subject: [PATCH 28/64] nfs: no PG_private waiters remain, remove waker Since commit 4f52b6bb ("NFS: Don't call COMMIT in ->releasepage()"), no tasks wait on PagePrivate, so the wake introduced in commit 95905446 ("NFS: avoid deadlocks with loop-back mounted NFS filesystems.") can be removed. Signed-off-by: Nicholas Piggin Signed-off-by: Anna Schumaker --- fs/nfs/write.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ad4219a41f25..e75b056f46f4 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -728,8 +728,6 @@ static void nfs_inode_remove_request(struct nfs_page *req) if (likely(head->wb_page && !PageSwapCache(head->wb_page))) { set_page_private(head->wb_page, 0); ClearPagePrivate(head->wb_page); - smp_mb__after_atomic(); - wake_up_page(head->wb_page, PG_private); clear_bit(PG_MAPPED, &head->wb_flags); } nfsi->nrequests--; From 2864486bd0fdd14431058650c91fcb9fba605d43 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Tue, 7 Feb 2017 21:46:39 +0800 Subject: [PATCH 29/64] sunrpc: error out if register_shrinker fail register_shrinker may return error when register fail, error out. Signed-off-by: Kinglong Mee Signed-off-by: Anna Schumaker --- net/sunrpc/auth.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index d8f17ea7932e..8b344d0eb6ed 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -878,8 +878,12 @@ int __init rpcauth_init_module(void) err = rpc_init_generic_auth(); if (err < 0) goto out2; - register_shrinker(&rpc_cred_shrinker); + err = register_shrinker(&rpc_cred_shrinker); + if (err < 0) + goto out3; return 0; +out3: + rpc_destroy_generic_auth(); out2: rpc_destroy_authunix(); out1: From 863d7d9c2e0cbbde6cd15f87c4431dd806f2d917 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Tue, 7 Feb 2017 21:47:16 +0800 Subject: [PATCH 30/64] sunrpc/nfs: cleanup procfs/pipefs entry in cache_detail Record flush/channel/content entries is useless, remove them. Signed-off-by: Kinglong Mee Signed-off-by: Anna Schumaker --- fs/nfs/cache_lib.c | 3 +-- include/linux/sunrpc/cache.h | 15 +++-------- net/sunrpc/cache.c | 49 +++++++++++++----------------------- 3 files changed, 21 insertions(+), 46 deletions(-) diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c index 6de15709d024..2ae676f93e6b 100644 --- a/fs/nfs/cache_lib.c +++ b/fs/nfs/cache_lib.c @@ -141,8 +141,7 @@ int nfs_cache_register_net(struct net *net, struct cache_detail *cd) void nfs_cache_unregister_sb(struct super_block *sb, struct cache_detail *cd) { - if (cd->u.pipefs.dir) - sunrpc_cache_unregister_pipefs(cd); + sunrpc_cache_unregister_pipefs(cd); } void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd) diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 62a60eeacb0a..bb5c9c80f12e 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -63,15 +63,6 @@ struct cache_head { #define CACHE_NEW_EXPIRY 120 /* keep new things pending confirmation for 120 seconds */ -struct cache_detail_procfs { - struct proc_dir_entry *proc_ent; - struct proc_dir_entry *flush_ent, *channel_ent, *content_ent; -}; - -struct cache_detail_pipefs { - struct dentry *dir; -}; - struct cache_detail { struct module * owner; int hash_size; @@ -123,9 +114,9 @@ struct cache_detail { time_t last_warn; /* when we last warned about no readers */ union { - struct cache_detail_procfs procfs; - struct cache_detail_pipefs pipefs; - } u; + struct proc_dir_entry *procfs; + struct dentry *pipefs; + }; struct net *net; }; diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 8147e8d56eb2..688ef8cbac62 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1600,21 +1600,12 @@ static const struct file_operations cache_flush_operations_procfs = { .llseek = no_llseek, }; -static void remove_cache_proc_entries(struct cache_detail *cd, struct net *net) +static void remove_cache_proc_entries(struct cache_detail *cd) { - struct sunrpc_net *sn; - - if (cd->u.procfs.proc_ent == NULL) - return; - if (cd->u.procfs.flush_ent) - remove_proc_entry("flush", cd->u.procfs.proc_ent); - if (cd->u.procfs.channel_ent) - remove_proc_entry("channel", cd->u.procfs.proc_ent); - if (cd->u.procfs.content_ent) - remove_proc_entry("content", cd->u.procfs.proc_ent); - cd->u.procfs.proc_ent = NULL; - sn = net_generic(net, sunrpc_net_id); - remove_proc_entry(cd->name, sn->proc_net_rpc); + if (cd->procfs) { + proc_remove(cd->procfs); + cd->procfs = NULL; + } } #ifdef CONFIG_PROC_FS @@ -1624,38 +1615,30 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) struct sunrpc_net *sn; sn = net_generic(net, sunrpc_net_id); - cd->u.procfs.proc_ent = proc_mkdir(cd->name, sn->proc_net_rpc); - if (cd->u.procfs.proc_ent == NULL) + cd->procfs = proc_mkdir(cd->name, sn->proc_net_rpc); + if (cd->procfs == NULL) goto out_nomem; - cd->u.procfs.channel_ent = NULL; - cd->u.procfs.content_ent = NULL; p = proc_create_data("flush", S_IFREG|S_IRUSR|S_IWUSR, - cd->u.procfs.proc_ent, - &cache_flush_operations_procfs, cd); - cd->u.procfs.flush_ent = p; + cd->procfs, &cache_flush_operations_procfs, cd); if (p == NULL) goto out_nomem; if (cd->cache_request || cd->cache_parse) { p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR, - cd->u.procfs.proc_ent, - &cache_file_operations_procfs, cd); - cd->u.procfs.channel_ent = p; + cd->procfs, &cache_file_operations_procfs, cd); if (p == NULL) goto out_nomem; } if (cd->cache_show) { p = proc_create_data("content", S_IFREG|S_IRUSR, - cd->u.procfs.proc_ent, - &content_file_operations_procfs, cd); - cd->u.procfs.content_ent = p; + cd->procfs, &content_file_operations_procfs, cd); if (p == NULL) goto out_nomem; } return 0; out_nomem: - remove_cache_proc_entries(cd, net); + remove_cache_proc_entries(cd); return -ENOMEM; } #else /* CONFIG_PROC_FS */ @@ -1684,7 +1667,7 @@ EXPORT_SYMBOL_GPL(cache_register_net); void cache_unregister_net(struct cache_detail *cd, struct net *net) { - remove_cache_proc_entries(cd, net); + remove_cache_proc_entries(cd); sunrpc_destroy_cache_detail(cd); } EXPORT_SYMBOL_GPL(cache_unregister_net); @@ -1843,15 +1826,17 @@ int sunrpc_cache_register_pipefs(struct dentry *parent, struct dentry *dir = rpc_create_cache_dir(parent, name, umode, cd); if (IS_ERR(dir)) return PTR_ERR(dir); - cd->u.pipefs.dir = dir; + cd->pipefs = dir; return 0; } EXPORT_SYMBOL_GPL(sunrpc_cache_register_pipefs); void sunrpc_cache_unregister_pipefs(struct cache_detail *cd) { - rpc_remove_cache_dir(cd->u.pipefs.dir); - cd->u.pipefs.dir = NULL; + if (cd->pipefs) { + rpc_remove_cache_dir(cd->pipefs); + cd->pipefs = NULL; + } } EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs); From 5786461bd8ea81433d81297215ee814a9a33ce7a Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Tue, 7 Feb 2017 21:48:11 +0800 Subject: [PATCH 31/64] sunrpc: rename NFS_NGROUPS to UNX_NGROUPS for auth unix NFS_NGROUPS has been move to sunrpc, rename to UNX_NGROUPS. Signed-off-by: Kinglong Mee Signed-off-by: Anna Schumaker --- include/linux/sunrpc/auth.h | 1 + net/sunrpc/auth_unix.c | 18 ++++++++---------- net/sunrpc/svcauth_unix.c | 4 ++-- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index b1bc62ba20a2..39c85fb1f0ed 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -32,6 +32,7 @@ */ #define UNX_MAXNODENAME __NEW_UTS_LEN #define UNX_CALLSLACK (21 + XDR_QUADLEN(UNX_MAXNODENAME)) +#define UNX_NGROUPS 16 struct rpcsec_gss_info; diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 306fc0f54596..82337e1ec9cd 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -14,12 +14,10 @@ #include #include -#define NFS_NGROUPS 16 - struct unx_cred { struct rpc_cred uc_base; kgid_t uc_gid; - kgid_t uc_gids[NFS_NGROUPS]; + kgid_t uc_gids[UNX_NGROUPS]; }; #define uc_uid uc_base.cr_uid @@ -82,13 +80,13 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t if (acred->group_info != NULL) groups = acred->group_info->ngroups; - if (groups > NFS_NGROUPS) - groups = NFS_NGROUPS; + if (groups > UNX_NGROUPS) + groups = UNX_NGROUPS; cred->uc_gid = acred->gid; for (i = 0; i < groups; i++) cred->uc_gids[i] = acred->group_info->gid[i]; - if (i < NFS_NGROUPS) + if (i < UNX_NGROUPS) cred->uc_gids[i] = INVALID_GID; return &cred->uc_base; @@ -132,12 +130,12 @@ unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags) if (acred->group_info != NULL) groups = acred->group_info->ngroups; - if (groups > NFS_NGROUPS) - groups = NFS_NGROUPS; + if (groups > UNX_NGROUPS) + groups = UNX_NGROUPS; for (i = 0; i < groups ; i++) if (!gid_eq(cred->uc_gids[i], acred->group_info->gid[i])) return 0; - if (groups < NFS_NGROUPS && gid_valid(cred->uc_gids[groups])) + if (groups < UNX_NGROUPS && gid_valid(cred->uc_gids[groups])) return 0; return 1; } @@ -166,7 +164,7 @@ unx_marshal(struct rpc_task *task, __be32 *p) *p++ = htonl((u32) from_kuid(&init_user_ns, cred->uc_uid)); *p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gid)); hold = p++; - for (i = 0; i < 16 && gid_valid(cred->uc_gids[i]); i++) + for (i = 0; i < UNX_NGROUPS && gid_valid(cred->uc_gids[i]); i++) *p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gids[i])); *hold = htonl(p - hold - 1); /* gid array length */ *base = htonl((p - base - 1) << 2); /* cred length */ diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 64af4f034de6..f81eaa8e0888 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -403,7 +403,7 @@ svcauth_unix_info_release(struct svc_xprt *xpt) /**************************************************************************** * auth.unix.gid cache * simple cache to map a UID to a list of GIDs - * because AUTH_UNIX aka AUTH_SYS has a max of 16 + * because AUTH_UNIX aka AUTH_SYS has a max of UNX_NGROUPS */ #define GID_HASHBITS 8 #define GID_HASHMAX (1<cr_uid = make_kuid(&init_user_ns, svc_getnl(argv)); /* uid */ cred->cr_gid = make_kgid(&init_user_ns, svc_getnl(argv)); /* gid */ slen = svc_getnl(argv); /* gids length */ - if (slen > 16 || (len -= (slen + 2)*4) < 0) + if (slen > UNX_NGROUPS || (len -= (slen + 2)*4) < 0) goto badcred; cred->cr_group_info = groups_alloc(slen); if (cred->cr_group_info == NULL) From af4926e5619f8a33463f7202f27ba091893ed2ad Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Tue, 7 Feb 2017 21:48:49 +0800 Subject: [PATCH 32/64] sunrpc: remove dead codes of cr_magic in rpc_cred Don't found any place using the cr_magic. Signed-off-by: Kinglong Mee Signed-off-by: Anna Schumaker --- include/linux/sunrpc/auth.h | 5 ----- net/sunrpc/auth.c | 3 --- net/sunrpc/auth_null.c | 3 --- 3 files changed, 11 deletions(-) diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 39c85fb1f0ed..8fd3504946ad 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -64,9 +64,6 @@ struct rpc_cred { struct rcu_head cr_rcu; struct rpc_auth * cr_auth; const struct rpc_credops *cr_ops; -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - unsigned long cr_magic; /* 0x0f4aa4f0 */ -#endif unsigned long cr_expire; /* when to gc */ unsigned long cr_flags; /* various flags */ atomic_t cr_count; /* ref count */ @@ -80,8 +77,6 @@ struct rpc_cred { #define RPCAUTH_CRED_HASHED 2 #define RPCAUTH_CRED_NEGATIVE 3 -#define RPCAUTH_CRED_MAGIC 0x0f4aa4f0 - /* rpc_auth au_flags */ #define RPCAUTH_AUTH_NO_CRKEY_TIMEOUT 0x0001 /* underlying cred has no key timeout */ diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 8b344d0eb6ed..a1ee933e3029 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -648,9 +648,6 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred, cred->cr_auth = auth; cred->cr_ops = ops; cred->cr_expire = jiffies; -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - cred->cr_magic = RPCAUTH_CRED_MAGIC; -#endif cred->cr_uid = acred->uid; } EXPORT_SYMBOL_GPL(rpcauth_init_cred); diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index 4d17376b2acb..5f3d527dff65 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -139,7 +139,4 @@ struct rpc_cred null_cred = { .cr_ops = &null_credops, .cr_count = ATOMIC_INIT(1), .cr_flags = 1UL << RPCAUTH_CRED_UPTODATE, -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - .cr_magic = RPCAUTH_CRED_MAGIC, -#endif }; From 6489a8f41370bcc53dac7107e298179da8c6cc05 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Tue, 7 Feb 2017 21:49:17 +0800 Subject: [PATCH 33/64] sunrpc: update the comments of sunrpc proc path Signed-off-by: Kinglong Mee Signed-off-by: Anna Schumaker --- net/sunrpc/cache.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 688ef8cbac62..9e8561d1714d 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -717,7 +717,7 @@ void cache_clean_deferred(void *owner) /* * communicate with user-space * - * We have a magic /proc file - /proc/sunrpc//channel. + * We have a magic /proc file - /proc/net/rpc//channel. * On read, you get a full request, or block. * On write, an update request is processed. * Poll works if anything to read, and always allows write. @@ -1272,7 +1272,7 @@ EXPORT_SYMBOL_GPL(qword_get); /* - * support /proc/sunrpc/cache/$CACHENAME/content + * support /proc/net/rpc/$CACHENAME/content * as a seqfile. * We call ->cache_show passing NULL for the item to * get a header, then pass each real item in the cache From 3f373e81b1e8d26a90523cd12385cbce588f3f18 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Tue, 7 Feb 2017 21:49:57 +0800 Subject: [PATCH 34/64] sunrpc: record rpc client pointer in seq->private directly pos in rpc_clnt_iter is useless, drop it and record clnt in seq_private. Signed-off-by: Kinglong Mee Signed-off-by: Anna Schumaker --- net/sunrpc/debugfs.c | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index e7b4d93566df..c8fd0b6c1618 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -16,11 +16,6 @@ static struct dentry *rpc_xprt_dir; unsigned int rpc_inject_disconnect; -struct rpc_clnt_iter { - struct rpc_clnt *clnt; - loff_t pos; -}; - static int tasks_show(struct seq_file *f, void *v) { @@ -47,12 +42,10 @@ static void * tasks_start(struct seq_file *f, loff_t *ppos) __acquires(&clnt->cl_lock) { - struct rpc_clnt_iter *iter = f->private; + struct rpc_clnt *clnt = f->private; loff_t pos = *ppos; - struct rpc_clnt *clnt = iter->clnt; struct rpc_task *task; - iter->pos = pos + 1; spin_lock(&clnt->cl_lock); list_for_each_entry(task, &clnt->cl_tasks, tk_task) if (pos-- == 0) @@ -63,12 +56,10 @@ tasks_start(struct seq_file *f, loff_t *ppos) static void * tasks_next(struct seq_file *f, void *v, loff_t *pos) { - struct rpc_clnt_iter *iter = f->private; - struct rpc_clnt *clnt = iter->clnt; + struct rpc_clnt *clnt = f->private; struct rpc_task *task = v; struct list_head *next = task->tk_task.next; - ++iter->pos; ++*pos; /* If there's another task on list, return it */ @@ -81,9 +72,7 @@ static void tasks_stop(struct seq_file *f, void *v) __releases(&clnt->cl_lock) { - struct rpc_clnt_iter *iter = f->private; - struct rpc_clnt *clnt = iter->clnt; - + struct rpc_clnt *clnt = f->private; spin_unlock(&clnt->cl_lock); } @@ -96,17 +85,13 @@ static const struct seq_operations tasks_seq_operations = { static int tasks_open(struct inode *inode, struct file *filp) { - int ret = seq_open_private(filp, &tasks_seq_operations, - sizeof(struct rpc_clnt_iter)); - + int ret = seq_open(filp, &tasks_seq_operations); if (!ret) { struct seq_file *seq = filp->private_data; - struct rpc_clnt_iter *iter = seq->private; + struct rpc_clnt *clnt = seq->private = inode->i_private; - iter->clnt = inode->i_private; - - if (!atomic_inc_not_zero(&iter->clnt->cl_count)) { - seq_release_private(inode, filp); + if (!atomic_inc_not_zero(&clnt->cl_count)) { + seq_release(inode, filp); ret = -EINVAL; } } @@ -118,10 +103,10 @@ static int tasks_release(struct inode *inode, struct file *filp) { struct seq_file *seq = filp->private_data; - struct rpc_clnt_iter *iter = seq->private; + struct rpc_clnt *clnt = seq->private; - rpc_release_client(iter->clnt); - return seq_release_private(inode, filp); + rpc_release_client(clnt); + return seq_release(inode, filp); } static const struct file_operations tasks_fops = { From 8ccc869169d0c59d288b20ec2b060317f87e08b4 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Tue, 7 Feb 2017 21:50:32 +0800 Subject: [PATCH 35/64] sunrpc: use simple_read_from_buffer for reading cache flush Signed-off-by: Kinglong Mee Signed-off-by: Anna Schumaker --- net/sunrpc/cache.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 9e8561d1714d..2f06f510b570 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1427,20 +1427,11 @@ static ssize_t read_flush(struct file *file, char __user *buf, struct cache_detail *cd) { char tbuf[22]; - unsigned long p = *ppos; size_t len; - snprintf(tbuf, sizeof(tbuf), "%lu\n", convert_to_wallclock(cd->flush_time)); - len = strlen(tbuf); - if (p >= len) - return 0; - len -= p; - if (len > count) - len = count; - if (copy_to_user(buf, (void*)(tbuf+p), len)) - return -EFAULT; - *ppos += len; - return len; + len = snprintf(tbuf, sizeof(tbuf), "%lu\n", + convert_to_wallclock(cd->flush_time)); + return simple_read_from_buffer(buf, count, ppos, tbuf, len); } static ssize_t write_flush(struct file *file, const char __user *buf, From a974deee477af89411e0f80456bfb344ac433c98 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 8 Feb 2017 11:29:46 -0500 Subject: [PATCH 36/64] NFSv4: Fix memory and state leak in _nfs4_open_and_get_state If we exit because the file access check failed, we currently leak the struct nfs4_state. We need to attach it to the open context before returning. Fixes: 3efb9722475e ("NFSv4: Refactor _nfs4_open_and_get_state..") Cc: stable@vger.kernel.org # 3.10+ Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index aeaa6678b071..45271e5994f0 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2624,6 +2624,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, ret = PTR_ERR(state); if (IS_ERR(state)) goto out; + ctx->state = state; if (server->caps & NFS_CAP_POSIX_LOCK) set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK) @@ -2649,7 +2650,6 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, if (ret != 0) goto out; - ctx->state = state; if (d_inode(dentry) == state->inode) { nfs_inode_attach_open_context(ctx); if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) From d23bb113952db88b03a71c9533e5a40e444e18d3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 8 Feb 2017 11:17:52 -0500 Subject: [PATCH 37/64] SUNRPC: Remove unused function rpc_get_timeout() Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/sunrpc/clnt.h | 1 - net/sunrpc/clnt.c | 15 --------------- 2 files changed, 16 deletions(-) diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 333ad11b3dd9..33f216edb434 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -182,7 +182,6 @@ int rpc_protocol(struct rpc_clnt *); struct net * rpc_net_ns(struct rpc_clnt *); size_t rpc_max_payload(struct rpc_clnt *); size_t rpc_max_bc_payload(struct rpc_clnt *); -unsigned long rpc_get_timeout(struct rpc_clnt *clnt); void rpc_force_rebind(struct rpc_clnt *); size_t rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t); const char *rpc_peeraddr2str(struct rpc_clnt *, enum rpc_display_format_t); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 1dc9f3bac099..2838a1fab460 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1452,21 +1452,6 @@ size_t rpc_max_bc_payload(struct rpc_clnt *clnt) } EXPORT_SYMBOL_GPL(rpc_max_bc_payload); -/** - * rpc_get_timeout - Get timeout for transport in units of HZ - * @clnt: RPC client to query - */ -unsigned long rpc_get_timeout(struct rpc_clnt *clnt) -{ - unsigned long ret; - - rcu_read_lock(); - ret = rcu_dereference(clnt->cl_xprt)->timeout->to_initval; - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(rpc_get_timeout); - /** * rpc_force_rebind - force transport to check that remote port is unchanged * @clnt: client to rebind From 8d1b8c62e0805af7df900ef121389778d2126997 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 8 Feb 2017 11:17:53 -0500 Subject: [PATCH 38/64] SUNRPC: Refactor TCP socket timeout code into a helper function Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/xprtsock.c | 45 +++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index af392d9b9cec..c8ac649a51cb 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2235,6 +2235,31 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt) xs_reset_transport(transport); } +static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, + struct socket *sock) +{ + unsigned int keepidle = DIV_ROUND_UP(xprt->timeout->to_initval, HZ); + unsigned int keepcnt = xprt->timeout->to_retries + 1; + unsigned int opt_on = 1; + unsigned int timeo; + + /* TCP Keepalive options */ + kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&opt_on, sizeof(opt_on)); + kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, + (char *)&keepidle, sizeof(keepidle)); + kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, + (char *)&keepidle, sizeof(keepidle)); + kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, + (char *)&keepcnt, sizeof(keepcnt)); + + /* TCP user timeout (see RFC5482) */ + timeo = jiffies_to_msecs(xprt->timeout->to_initval) * + (xprt->timeout->to_retries + 1); + kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, + (char *)&timeo, sizeof(timeo)); +} + static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); @@ -2242,22 +2267,8 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) if (!transport->inet) { struct sock *sk = sock->sk; - unsigned int keepidle = xprt->timeout->to_initval / HZ; - unsigned int keepcnt = xprt->timeout->to_retries + 1; - unsigned int opt_on = 1; - unsigned int timeo; unsigned int addr_pref = IPV6_PREFER_SRC_PUBLIC; - /* TCP Keepalive options */ - kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, - (char *)&opt_on, sizeof(opt_on)); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, - (char *)&keepidle, sizeof(keepidle)); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, - (char *)&keepidle, sizeof(keepidle)); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, - (char *)&keepcnt, sizeof(keepcnt)); - /* Avoid temporary address, they are bad for long-lived * connections such as NFS mounts. * RFC4941, section 3.6 suggests that: @@ -2268,11 +2279,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) kernel_setsockopt(sock, SOL_IPV6, IPV6_ADDR_PREFERENCES, (char *)&addr_pref, sizeof(addr_pref)); - /* TCP user timeout (see RFC5482) */ - timeo = jiffies_to_msecs(xprt->timeout->to_initval) * - (xprt->timeout->to_retries + 1); - kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, - (char *)&timeo, sizeof(timeo)); + xs_tcp_set_socket_timeouts(xprt, sock); write_lock_bh(&sk->sk_callback_lock); From 7196dbb02ea05835b9ee56910ee82cb55422c7f1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 8 Feb 2017 11:17:54 -0500 Subject: [PATCH 39/64] SUNRPC: Allow changing of the TCP timeout parameters on the fly When the NFSv4 server tells us the lease period, we usually want to adjust down the timeout parameters on the TCP connection to ensure that we don't miss lease renewals due to a faulty connection. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprt.h | 4 +++ include/linux/sunrpc/xprtsock.h | 3 ++ net/sunrpc/clnt.c | 30 ++++++++++++++----- net/sunrpc/xprtsock.c | 51 ++++++++++++++++++++++++++++++--- 4 files changed, 77 insertions(+), 11 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index a5da60b24d83..eab1c749e192 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -137,6 +137,9 @@ struct rpc_xprt_ops { void (*release_request)(struct rpc_task *task); void (*close)(struct rpc_xprt *xprt); void (*destroy)(struct rpc_xprt *xprt); + void (*set_connect_timeout)(struct rpc_xprt *xprt, + unsigned long connect_timeout, + unsigned long reconnect_timeout); void (*print_stats)(struct rpc_xprt *xprt, struct seq_file *seq); int (*enable_swap)(struct rpc_xprt *xprt); void (*disable_swap)(struct rpc_xprt *xprt); @@ -221,6 +224,7 @@ struct rpc_xprt { struct timer_list timer; unsigned long last_used, idle_timeout, + connect_timeout, max_reconnect_timeout; /* diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h index bef3fb0abb8f..c9959d7e3579 100644 --- a/include/linux/sunrpc/xprtsock.h +++ b/include/linux/sunrpc/xprtsock.h @@ -55,6 +55,8 @@ struct sock_xprt { size_t rcvsize, sndsize; + struct rpc_timeout tcp_timeout; + /* * Saved socket callback addresses */ @@ -81,6 +83,7 @@ struct sock_xprt { #define XPRT_SOCK_CONNECTING 1U #define XPRT_SOCK_DATA_READY (2) +#define XPRT_SOCK_UPD_TIMEOUT (3) #endif /* __KERNEL__ */ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 2838a1fab460..b5bc0c589f6a 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2684,6 +2684,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, { struct rpc_xprt_switch *xps; struct rpc_xprt *xprt; + unsigned long connect_timeout; unsigned long reconnect_timeout; unsigned char resvport; int ret = 0; @@ -2696,6 +2697,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, return -EAGAIN; } resvport = xprt->resvport; + connect_timeout = xprt->connect_timeout; reconnect_timeout = xprt->max_reconnect_timeout; rcu_read_unlock(); @@ -2705,7 +2707,10 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, goto out_put_switch; } xprt->resvport = resvport; - xprt->max_reconnect_timeout = reconnect_timeout; + if (xprt->ops->set_connect_timeout != NULL) + xprt->ops->set_connect_timeout(xprt, + connect_timeout, + reconnect_timeout); rpc_xprt_switch_set_roundrobin(xps); if (setup) { @@ -2722,24 +2727,35 @@ out_put_switch: } EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt); +struct connect_timeout_data { + unsigned long connect_timeout; + unsigned long reconnect_timeout; +}; + static int -rpc_xprt_cap_max_reconnect_timeout(struct rpc_clnt *clnt, +rpc_xprt_set_connect_timeout(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *data) { - unsigned long timeout = *((unsigned long *)data); + struct connect_timeout_data *timeo = data; - if (timeout < xprt->max_reconnect_timeout) - xprt->max_reconnect_timeout = timeout; + if (xprt->ops->set_connect_timeout) + xprt->ops->set_connect_timeout(xprt, + timeo->connect_timeout, + timeo->reconnect_timeout); return 0; } void rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo) { + struct connect_timeout_data timeout = { + .connect_timeout = timeo, + .reconnect_timeout = timeo, + }; rpc_clnt_iterate_for_each_xprt(clnt, - rpc_xprt_cap_max_reconnect_timeout, - &timeo); + rpc_xprt_set_connect_timeout, + &timeout); } EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c8ac649a51cb..810e9b59be16 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -52,6 +52,8 @@ #include "sunrpc.h" static void xs_close(struct rpc_xprt *xprt); +static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, + struct socket *sock); /* * xprtsock tunables @@ -666,6 +668,9 @@ static int xs_tcp_send_request(struct rpc_task *task) if (task->tk_flags & RPC_TASK_SENT) zerocopy = false; + if (test_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state)) + xs_tcp_set_socket_timeouts(xprt, transport->sock); + /* Continue transmitting the packet/record. We must be careful * to cope with writespace callbacks arriving _after_ we have * called sendmsg(). */ @@ -2238,11 +2243,20 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt) static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct socket *sock) { - unsigned int keepidle = DIV_ROUND_UP(xprt->timeout->to_initval, HZ); - unsigned int keepcnt = xprt->timeout->to_retries + 1; + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + unsigned int keepidle; + unsigned int keepcnt; unsigned int opt_on = 1; unsigned int timeo; + spin_lock_bh(&xprt->transport_lock); + keepidle = DIV_ROUND_UP(xprt->timeout->to_initval, HZ); + keepcnt = xprt->timeout->to_retries + 1; + timeo = jiffies_to_msecs(xprt->timeout->to_initval) * + (xprt->timeout->to_retries + 1); + clear_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state); + spin_unlock_bh(&xprt->transport_lock); + /* TCP Keepalive options */ kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&opt_on, sizeof(opt_on)); @@ -2254,12 +2268,38 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, (char *)&keepcnt, sizeof(keepcnt)); /* TCP user timeout (see RFC5482) */ - timeo = jiffies_to_msecs(xprt->timeout->to_initval) * - (xprt->timeout->to_retries + 1); kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, (char *)&timeo, sizeof(timeo)); } +static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt, + unsigned long connect_timeout, + unsigned long reconnect_timeout) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + struct rpc_timeout to; + unsigned long initval; + + spin_lock_bh(&xprt->transport_lock); + if (reconnect_timeout < xprt->max_reconnect_timeout) + xprt->max_reconnect_timeout = reconnect_timeout; + if (connect_timeout < xprt->connect_timeout) { + memcpy(&to, xprt->timeout, sizeof(to)); + initval = DIV_ROUND_UP(connect_timeout, to.to_retries + 1); + /* Arbitrary lower limit */ + if (initval < XS_TCP_INIT_REEST_TO << 1) + initval = XS_TCP_INIT_REEST_TO << 1; + to.to_initval = initval; + to.to_maxval = initval; + memcpy(&transport->tcp_timeout, &to, + sizeof(transport->tcp_timeout)); + xprt->timeout = &transport->tcp_timeout; + xprt->connect_timeout = connect_timeout; + } + set_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state); + spin_unlock_bh(&xprt->transport_lock); +} + static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); @@ -2728,6 +2768,7 @@ static struct rpc_xprt_ops xs_tcp_ops = { .set_retrans_timeout = xprt_set_retrans_timeout_def, .close = xs_tcp_shutdown, .destroy = xs_destroy, + .set_connect_timeout = xs_tcp_set_connect_timeout, .print_stats = xs_tcp_print_stats, .enable_swap = xs_enable_swap, .disable_swap = xs_disable_swap, @@ -3014,6 +3055,8 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) xprt->timeout = &xs_tcp_default_timeout; xprt->max_reconnect_timeout = xprt->timeout->to_maxval; + xprt->connect_timeout = xprt->timeout->to_initval * + (xprt->timeout->to_retries + 1); INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn); INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket); From 26ae102f2cfd0215daa57dc790aa3bfe534403a9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 8 Feb 2017 11:17:55 -0500 Subject: [PATCH 40/64] NFSv4: Set the connection timeout to match the lease period Set the timeout for TCP connections to be 1 lease period to ensure that we don't lose our lease due to a faulty TCP connection. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4renewd.c | 2 +- include/linux/sunrpc/clnt.h | 5 +++-- net/sunrpc/clnt.c | 10 ++++++---- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index 82e77198d17e..1f8c2ae43a8d 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -153,7 +153,7 @@ void nfs4_set_lease_period(struct nfs_client *clp, spin_unlock(&clp->cl_lock); /* Cap maximum reconnect timeout at 1/2 lease period */ - rpc_cap_max_reconnect_timeout(clp->cl_rpcclient, lease >> 1); + rpc_set_connect_timeout(clp->cl_rpcclient, lease, lease >> 1); } /* diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 33f216edb434..6095ecba0dde 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -201,8 +201,9 @@ int rpc_clnt_add_xprt(struct rpc_clnt *, struct xprt_create *, struct rpc_xprt *, void *), void *data); -void rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, - unsigned long timeo); +void rpc_set_connect_timeout(struct rpc_clnt *clnt, + unsigned long connect_timeout, + unsigned long reconnect_timeout); int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *, struct rpc_xprt_switch *, diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index b5bc0c589f6a..52da3ce54bb5 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2747,17 +2747,19 @@ rpc_xprt_set_connect_timeout(struct rpc_clnt *clnt, } void -rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo) +rpc_set_connect_timeout(struct rpc_clnt *clnt, + unsigned long connect_timeout, + unsigned long reconnect_timeout) { struct connect_timeout_data timeout = { - .connect_timeout = timeo, - .reconnect_timeout = timeo, + .connect_timeout = connect_timeout, + .reconnect_timeout = reconnect_timeout, }; rpc_clnt_iterate_for_each_xprt(clnt, rpc_xprt_set_connect_timeout, &timeout); } -EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout); +EXPORT_SYMBOL_GPL(rpc_set_connect_timeout); void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt) { From 24abdf1be15c478e2821d6fc903a4a4440beff02 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 8 Feb 2017 16:59:46 -0500 Subject: [PATCH 41/64] xprtrdma: Fix Read chunk padding When pad optimization is disabled, rpcrdma_convert_iovs still does not add explicit XDR round-up padding to a Read chunk. Commit 677eb17e94ed ("xprtrdma: Fix XDR tail buffer marshalling") incorrectly short-circuited the test for whether round-up padding is needed that appears later in rpcrdma_convert_iovs. However, if this is indeed a regular Read chunk (and not a Position-Zero Read chunk), the tail iovec _always_ contains the chunk's padding, and never anything else. So, it's easy to just skip the tail when padding optimization is enabled, and add the tail in a subsequent Read chunk segment, if disabled. Fixes: 677eb17e94ed ("xprtrdma: Fix XDR tail buffer marshalling") Cc: stable@vger.kernel.org # v4.9+ Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index c52e0f2ffe52..a524d3c2e8ac 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -226,8 +226,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, if (len && n == RPCRDMA_MAX_SEGS) goto out_overflow; - /* When encoding the read list, the tail is always sent inline */ - if (type == rpcrdma_readch) + /* When encoding a Read chunk, the tail iovec contains an + * XDR pad and may be omitted. + */ + if (type == rpcrdma_readch && xprt_rdma_pad_optimize) return n; /* When encoding the Write list, some servers need to see an extra @@ -238,10 +240,6 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, return n; if (xdrbuf->tail[0].iov_len) { - /* the rpcrdma protocol allows us to omit any trailing - * xdr pad bytes, saving the server an RDMA operation. */ - if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) - return n; n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n); if (n == RPCRDMA_MAX_SEGS) goto out_overflow; From b5f0afbea4f2ea52c613ac2b06cb6de2ea18cb6d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 8 Feb 2017 16:59:54 -0500 Subject: [PATCH 42/64] xprtrdma: Per-connection pad optimization Pad optimization is changed by echoing into /proc/sys/sunrpc/rdma_pad_optimize. This is a global setting, affecting all RPC-over-RDMA connections to all servers. The marshaling code picks up that value and uses it for decisions about how to construct each RPC-over-RDMA frame. Having it change suddenly in mid-operation can result in unexpected failures. And some servers a client mounts might need chunk round-up, while others don't. So instead, copy the pad_optimize setting into each connection's rpcrdma_ia when the transport is created, and use the copy, which can't change during the life of the connection, instead. This also removes a hack: rpcrdma_convert_iovs was using the remote-invalidation-expected flag to predict when it could leave out Write chunk padding. This is because the Linux server handles implicit XDR padding on Write chunks correctly, and only Linux servers can set the connection's remote-invalidation-expected flag. It's more sensible to use the pad optimization setting instead. Fixes: 677eb17e94ed ("xprtrdma: Fix XDR tail buffer marshalling") Cc: stable@vger.kernel.org # v4.9+ Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 28 ++++++++++++++-------------- net/sunrpc/xprtrdma/verbs.c | 1 + net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index a524d3c2e8ac..c634f0f3f9ce 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -186,9 +186,9 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n) */ static int -rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, - enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, - bool reminv_expected) +rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, + unsigned int pos, enum rpcrdma_chunktype type, + struct rpcrdma_mr_seg *seg) { int len, n, p, page_base; struct page **ppages; @@ -229,14 +229,15 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, /* When encoding a Read chunk, the tail iovec contains an * XDR pad and may be omitted. */ - if (type == rpcrdma_readch && xprt_rdma_pad_optimize) + if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup) return n; - /* When encoding the Write list, some servers need to see an extra - * segment for odd-length Write chunks. The upper layer provides - * space in the tail iovec for this purpose. + /* When encoding a Write chunk, some servers need to see an + * extra segment for non-XDR-aligned Write chunks. The upper + * layer provides space in the tail iovec that may be used + * for this purpose. */ - if (type == rpcrdma_writech && reminv_expected) + if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup) return n; if (xdrbuf->tail[0].iov_len) { @@ -291,7 +292,8 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, if (rtype == rpcrdma_areadch) pos = 0; seg = req->rl_segments; - nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, false); + nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos, + rtype, seg); if (nsegs < 0) return ERR_PTR(nsegs); @@ -353,10 +355,9 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, } seg = req->rl_segments; - nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, + nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, rqst->rq_rcv_buf.head[0].iov_len, - wtype, seg, - r_xprt->rx_ia.ri_reminv_expected); + wtype, seg); if (nsegs < 0) return ERR_PTR(nsegs); @@ -421,8 +422,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, } seg = req->rl_segments; - nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg, - r_xprt->rx_ia.ri_reminv_expected); + nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); if (nsegs < 0) return ERR_PTR(nsegs); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 11d07748f699..2a6a367a2dac 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -208,6 +208,7 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, /* Default settings for RPC-over-RDMA Version One */ r_xprt->rx_ia.ri_reminv_expected = false; + r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize; rsize = RPCRDMA_V1_DEF_INLINE_SIZE; wsize = RPCRDMA_V1_DEF_INLINE_SIZE; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index e35efd4ac1e4..c13715431419 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -75,6 +75,7 @@ struct rpcrdma_ia { unsigned int ri_max_inline_write; unsigned int ri_max_inline_read; bool ri_reminv_expected; + bool ri_implicit_roundup; enum ib_mr_type ri_mrtype; struct ib_qp_attr ri_qp_attr; struct ib_qp_init_attr ri_qp_init_attr; From c95a3c6b88658bcb8f77f85f31a0b9d9036e8016 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 8 Feb 2017 17:00:02 -0500 Subject: [PATCH 43/64] xprtrdma: Disable pad optimization by default Commit d5440e27d3e5 ("xprtrdma: Enable pad optimization") made the Linux client omit XDR round-up padding in normal Read and Write chunks so that the client doesn't have to register and invalidate 3-byte memory regions that contain no real data. Unfortunately, my cheery 2014 assessment that this optimization "is supported now by both Linux and Solaris servers" was premature. We've found bugs in Solaris in this area since commit d5440e27d3e5 ("xprtrdma: Enable pad optimization") was merged (SYMLINK is the main offender). So for maximum interoperability, I'm disabling this optimization again. If a CM private message is exchanged when connecting, the client recognizes that the server is Linux, and enables the optimization for that connection. Until now the Solaris server bugs did not impact common operations, and were thus largely benign. Soon, less capable devices on Linux NFS/RDMA clients will make use of Read chunks more often, and these Solaris bugs will prevent interoperation in more cases. Fixes: 677eb17e94ed ("xprtrdma: Fix XDR tail buffer marshalling") Cc: stable@vger.kernel.org # v4.9+ Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 2 +- net/sunrpc/xprtrdma/verbs.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 534c178d2a7e..699058169cfc 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -67,7 +67,7 @@ unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_inline_write_padding; static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; - int xprt_rdma_pad_optimize = 1; + int xprt_rdma_pad_optimize = 0; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 2a6a367a2dac..23f4da419a64 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -216,6 +216,7 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, pmsg->cp_magic == rpcrdma_cmp_magic && pmsg->cp_version == RPCRDMA_CMP_VERSION) { r_xprt->rx_ia.ri_reminv_expected = true; + r_xprt->rx_ia.ri_implicit_roundup = true; rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size); wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); } From 16f906d66cd76fb9895cbc628f447532a7ac1faa Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 8 Feb 2017 17:00:10 -0500 Subject: [PATCH 44/64] xprtrdma: Reduce required number of send SGEs The MAX_SEND_SGES check introduced in commit 655fec6987be ("xprtrdma: Use gathered Send for large inline messages") fails for devices that have a small max_sge. Instead of checking for a large fixed maximum number of SGEs, check for a minimum small number. RPC-over-RDMA will switch to using a Read chunk if an xdr_buf has more pages than can fit in the device's max_sge limit. This is considerably better than failing all together to mount the server. This fix supports devices that have as few as three send SGEs available. Reported-by: Selvin Xavier Reported-by: Devesh Sharma Reported-by: Honggang Li Reported-by: Ram Amrani Fixes: 655fec6987be ("xprtrdma: Use gathered Send for large ...") Cc: stable@vger.kernel.org # v4.9+ Tested-by: Honggang Li Tested-by: Ram Amrani Tested-by: Steve Wise Reviewed-by: Parav Pandit Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 26 +++++++++++++++++++++++--- net/sunrpc/xprtrdma/verbs.c | 13 +++++++------ net/sunrpc/xprtrdma/xprt_rdma.h | 2 ++ 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index c634f0f3f9ce..d88988365cd2 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -125,14 +125,34 @@ void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt) /* The client can send a request inline as long as the RPCRDMA header * plus the RPC call fit under the transport's inline limit. If the * combined call message size exceeds that limit, the client must use - * the read chunk list for this operation. + * a Read chunk for this operation. + * + * A Read chunk is also required if sending the RPC call inline would + * exceed this device's max_sge limit. */ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct xdr_buf *xdr = &rqst->rq_snd_buf; + unsigned int count, remaining, offset; - return rqst->rq_snd_buf.len <= ia->ri_max_inline_write; + if (xdr->len > r_xprt->rx_ia.ri_max_inline_write) + return false; + + if (xdr->page_len) { + remaining = xdr->page_len; + offset = xdr->page_base & ~PAGE_MASK; + count = 0; + while (remaining) { + remaining -= min_t(unsigned int, + PAGE_SIZE - offset, remaining); + offset = 0; + if (++count > r_xprt->rx_ia.ri_max_send_sges) + return false; + } + } + + return true; } /* The client can't know how large the actual reply will be. Thus it diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 23f4da419a64..61d16c39e92c 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -488,18 +488,19 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia) */ int rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, - struct rpcrdma_create_data_internal *cdata) + struct rpcrdma_create_data_internal *cdata) { struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private; + unsigned int max_qp_wr, max_sge; struct ib_cq *sendcq, *recvcq; - unsigned int max_qp_wr; int rc; - if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_SEND_SGES) { - dprintk("RPC: %s: insufficient sge's available\n", - __func__); + max_sge = min(ia->ri_device->attrs.max_sge, RPCRDMA_MAX_SEND_SGES); + if (max_sge < RPCRDMA_MIN_SEND_SGES) { + pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge); return -ENOMEM; } + ia->ri_max_send_sges = max_sge - RPCRDMA_MIN_SEND_SGES; if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) { dprintk("RPC: %s: insufficient wqe's available\n", @@ -524,7 +525,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.cap.max_recv_wr = cdata->max_requests; ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */ - ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_SEND_SGES; + ep->rep_attr.cap.max_send_sge = max_sge; ep->rep_attr.cap.max_recv_sge = 1; ep->rep_attr.cap.max_inline_data = 0; ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index c13715431419..3d7e9c9bad1f 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -74,6 +74,7 @@ struct rpcrdma_ia { unsigned int ri_max_frmr_depth; unsigned int ri_max_inline_write; unsigned int ri_max_inline_read; + unsigned int ri_max_send_sges; bool ri_reminv_expected; bool ri_implicit_roundup; enum ib_mr_type ri_mrtype; @@ -311,6 +312,7 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ * - xdr_buf tail iovec */ enum { + RPCRDMA_MIN_SEND_SGES = 3, RPCRDMA_MAX_SEND_PAGES = PAGE_SIZE + RPCRDMA_MAX_INLINE - 1, RPCRDMA_MAX_PAGE_SGES = (RPCRDMA_MAX_SEND_PAGES >> PAGE_SHIFT) + 1, RPCRDMA_MAX_SEND_SGES = 1 + 1 + RPCRDMA_MAX_PAGE_SGES + 1, From c6f5b47f9fdeef12c0896e5af4bb3416c97d91c4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 8 Feb 2017 17:00:18 -0500 Subject: [PATCH 45/64] xprtrdma: Shrink send SGEs array We no longer need to accommodate an xdr_buf whose pages start at an offset and cross extra page boundaries. If there are more partial or whole pages to send than there are available SGEs, the marshaling logic is now smart enough to use a Read chunk instead of failing. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/xprt_rdma.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 3d7e9c9bad1f..852dd0a750a5 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -305,16 +305,19 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ char *mr_offset; /* kva if no page, else offset */ }; -/* Reserve enough Send SGEs to send a maximum size inline request: +/* The Send SGE array is provisioned to send a maximum size + * inline request: * - RPC-over-RDMA header * - xdr_buf head iovec - * - RPCRDMA_MAX_INLINE bytes, possibly unaligned, in pages + * - RPCRDMA_MAX_INLINE bytes, in pages * - xdr_buf tail iovec + * + * The actual number of array elements consumed by each RPC + * depends on the device's max_sge limit. */ enum { RPCRDMA_MIN_SEND_SGES = 3, - RPCRDMA_MAX_SEND_PAGES = PAGE_SIZE + RPCRDMA_MAX_INLINE - 1, - RPCRDMA_MAX_PAGE_SGES = (RPCRDMA_MAX_SEND_PAGES >> PAGE_SHIFT) + 1, + RPCRDMA_MAX_PAGE_SGES = RPCRDMA_MAX_INLINE >> PAGE_SHIFT, RPCRDMA_MAX_SEND_SGES = 1 + 1 + RPCRDMA_MAX_PAGE_SGES + 1, }; From 18c0fb31a034023e5cb2d3c9c1320d5d47d91afe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 8 Feb 2017 17:00:27 -0500 Subject: [PATCH 46/64] xprtrdma: Properly recover FRWRs with in-flight FASTREG WRs Sriharsha (sriharsha.basavapatna@broadcom.com) reports an occasional double DMA unmap of an FRWR MR when a connection is lost. I see one way this can happen. When a request requires more than one segment or chunk, rpcrdma_marshal_req loops, invoking ->frwr_op_map for each segment (MR) in each chunk. Each call posts a FASTREG Work Request to register one MR. Now suppose that the transport connection is lost part-way through marshaling this request. As part of recovering and resetting that req, rpcrdma_marshal_req invokes ->frwr_op_unmap_safe, which hands all the req's registered FRWRs to the MR recovery thread. But note: FRWR registration is asynchronous. So it's possible that some of these "already registered" FRWRs are fully registered, and some are still waiting for their FASTREG WR to complete. When the connection is lost, the "already registered" frmrs are marked FRMR_IS_VALID, and the "still waiting" WRs flush. Then frwr_wc_fastreg marks these frmrs FRMR_FLUSHED_FR. But thanks to ->frwr_op_unmap_safe, the MR recovery thread is doing an unreg / alloc_mr, a DMA unmap, and marking each of these frwrs FRMR_IS_INVALID, at the same time frwr_wc_fastreg might be running. - If the recovery thread runs last, then the frmr is marked FRMR_IS_INVALID, and life continues. - If frwr_wc_fastreg runs last, the frmr is marked FRMR_FLUSHED_FR, but the recovery thread has already DMA unmapped that MR. When ->frwr_op_map later re-uses this frmr, it sees it is not marked FRMR_IS_INVALID, and tries to recover it before using it, resulting in a second DMA unmap of the same MR. The fix is to guarantee in-flight FASTREG WRs have flushed before MR recovery runs on those FRWRs. Thus we depend on ro_unmap_safe (called from xprt_rdma_send_request on retransmit, or from xprt_rdma_free) to clean up old registrations as needed. Reported-by: Sriharsha Basavapatna Signed-off-by: Chuck Lever Tested-by: Sriharsha Basavapatna Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 14 ++++++++------ net/sunrpc/xprtrdma/transport.c | 4 ---- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index d88988365cd2..72b3ca0253a0 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -759,13 +759,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) iptr = headerp->rm_body.rm_chunks; iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); if (IS_ERR(iptr)) - goto out_unmap; + goto out_err; iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype); if (IS_ERR(iptr)) - goto out_unmap; + goto out_err; iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype); if (IS_ERR(iptr)) - goto out_unmap; + goto out_err; hdrlen = (unsigned char *)iptr - (unsigned char *)headerp; dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n", @@ -776,12 +776,14 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen, &rqst->rq_snd_buf, rtype)) { iptr = ERR_PTR(-EIO); - goto out_unmap; + goto out_err; } return 0; -out_unmap: - r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); +out_err: + pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n", + PTR_ERR(iptr)); + r_xprt->rx_stats.failed_marshal_count++; return PTR_ERR(iptr); } diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 699058169cfc..c717f5410776 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -709,10 +709,6 @@ xprt_rdma_send_request(struct rpc_task *task) return 0; failed_marshal: - dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n", - __func__, rc); - if (rc == -EIO) - r_xprt->rx_stats.failed_marshal_count++; if (rc != -ENOTCONN) return rc; drop_connection: From 0a90487bf7182c74830616b91bd33f68f8c6e18b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 8 Feb 2017 17:00:35 -0500 Subject: [PATCH 47/64] xprtrdma: Handle stale connection rejection A server rejects a connection attempt with STALE_CONNECTION when a client attempts to connect to a working remote service, but uses a QPN and GUID that corresponds to an old connection that was abandoned. This might occur after a client crashes and restarts. Fix rpcrdma_conn_upcall() to distinguish between a normal rejection and rejection of stale connection parameters. As an additional clean-up, remove the code that retries the connection attempt with different ORD/IRD values. Code audit of other ULP initiators shows no similar special case handling of initiator_depth or responder_resources. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 66 ++++++++++++------------------------- 1 file changed, 21 insertions(+), 45 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 61d16c39e92c..d1ee33fa8055 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -54,6 +54,7 @@ #include #include #include /* try_module_get()/module_put() */ +#include #include "xprt_rdma.h" @@ -279,7 +280,14 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) connstate = -ENETDOWN; goto connected; case RDMA_CM_EVENT_REJECTED: +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) + pr_info("rpcrdma: connection to %pIS:%u on %s rejected: %s\n", + sap, rpc_get_port(sap), ia->ri_device->name, + rdma_reject_msg(id, event->status)); +#endif connstate = -ECONNREFUSED; + if (event->status == IB_CM_REJ_STALE_CONN) + connstate = -EAGAIN; goto connected; case RDMA_CM_EVENT_DISCONNECTED: connstate = -ECONNABORTED; @@ -643,20 +651,21 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) int rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { + struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, + rx_ia); struct rdma_cm_id *id, *old; + struct sockaddr *sap; + unsigned int extras; int rc = 0; - int retry_count = 0; if (ep->rep_connected != 0) { - struct rpcrdma_xprt *xprt; retry: dprintk("RPC: %s: reconnecting...\n", __func__); rpcrdma_ep_disconnect(ep, ia); - xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); - id = rpcrdma_create_id(xprt, ia, - (struct sockaddr *)&xprt->rx_data.addr); + sap = (struct sockaddr *)&r_xprt->rx_data.addr; + id = rpcrdma_create_id(r_xprt, ia, sap); if (IS_ERR(id)) { rc = -EHOSTUNREACH; goto out; @@ -711,51 +720,18 @@ retry: } wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); - - /* - * Check state. A non-peer reject indicates no listener - * (ECONNREFUSED), which may be a transient state. All - * others indicate a transport condition which has already - * undergone a best-effort. - */ - if (ep->rep_connected == -ECONNREFUSED && - ++retry_count <= RDMA_CONNECT_RETRY_MAX) { - dprintk("RPC: %s: non-peer_reject, retry\n", __func__); - goto retry; - } if (ep->rep_connected <= 0) { - /* Sometimes, the only way to reliably connect to remote - * CMs is to use same nonzero values for ORD and IRD. */ - if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 && - (ep->rep_remote_cma.responder_resources == 0 || - ep->rep_remote_cma.initiator_depth != - ep->rep_remote_cma.responder_resources)) { - if (ep->rep_remote_cma.responder_resources == 0) - ep->rep_remote_cma.responder_resources = 1; - ep->rep_remote_cma.initiator_depth = - ep->rep_remote_cma.responder_resources; + if (ep->rep_connected == -EAGAIN) goto retry; - } rc = ep->rep_connected; - } else { - struct rpcrdma_xprt *r_xprt; - unsigned int extras; - - dprintk("RPC: %s: connected\n", __func__); - - r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); - extras = r_xprt->rx_buf.rb_bc_srv_max_requests; - - if (extras) { - rc = rpcrdma_ep_post_extra_recv(r_xprt, extras); - if (rc) { - pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n", - __func__, rc); - rc = 0; - } - } + goto out; } + dprintk("RPC: %s: connected\n", __func__); + extras = r_xprt->rx_buf.rb_bc_srv_max_requests; + if (extras) + rpcrdma_ep_post_extra_recv(r_xprt, extras); + out: if (rc) ep->rep_connected = rc; From 9a5c63e9c4056de8a73555131e6f698ddb0b9e0d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 8 Feb 2017 17:00:43 -0500 Subject: [PATCH 48/64] xprtrdma: Refactor management of mw_list field Clean up some duplicate code. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/fmr_ops.c | 5 +---- net/sunrpc/xprtrdma/frwr_ops.c | 11 ++++------- net/sunrpc/xprtrdma/rpc_rdma.c | 6 +++--- net/sunrpc/xprtrdma/verbs.c | 15 +++++---------- net/sunrpc/xprtrdma/xprt_rdma.h | 16 ++++++++++++++++ 5 files changed, 29 insertions(+), 24 deletions(-) diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 1ebb09e1ac4f..59e64025ed96 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -310,10 +310,7 @@ fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, struct rpcrdma_mw *mw; while (!list_empty(&req->rl_registered)) { - mw = list_first_entry(&req->rl_registered, - struct rpcrdma_mw, mw_list); - list_del_init(&mw->mw_list); - + mw = rpcrdma_pop_mw(&req->rl_registered); if (sync) fmr_op_recover_mr(mw); else diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 47bed5333c7f..f81dd93176c0 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -466,8 +466,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) struct ib_send_wr *first, **prev, *last, *bad_wr; struct rpcrdma_rep *rep = req->rl_reply; struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_mw *mw, *tmp; struct rpcrdma_frmr *f; + struct rpcrdma_mw *mw; int count, rc; dprintk("RPC: %s: req %p\n", __func__, req); @@ -534,10 +534,10 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * them to the free MW list. */ unmap: - list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { + while (!list_empty(&req->rl_registered)) { + mw = rpcrdma_pop_mw(&req->rl_registered); dprintk("RPC: %s: DMA unmapping frmr %p\n", __func__, &mw->frmr); - list_del_init(&mw->mw_list); ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); rpcrdma_put_mw(r_xprt, mw); @@ -571,10 +571,7 @@ frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, struct rpcrdma_mw *mw; while (!list_empty(&req->rl_registered)) { - mw = list_first_entry(&req->rl_registered, - struct rpcrdma_mw, mw_list); - list_del_init(&mw->mw_list); - + mw = rpcrdma_pop_mw(&req->rl_registered); if (sync) frwr_op_recover_mr(mw); else diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 72b3ca0253a0..a044be2d6ad7 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -322,7 +322,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, false, &mw); if (n < 0) return ERR_PTR(n); - list_add(&mw->mw_list, &req->rl_registered); + rpcrdma_push_mw(mw, &req->rl_registered); *iptr++ = xdr_one; /* item present */ @@ -390,7 +390,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, true, &mw); if (n < 0) return ERR_PTR(n); - list_add(&mw->mw_list, &req->rl_registered); + rpcrdma_push_mw(mw, &req->rl_registered); iptr = xdr_encode_rdma_segment(iptr, mw); @@ -455,7 +455,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, true, &mw); if (n < 0) return ERR_PTR(n); - list_add(&mw->mw_list, &req->rl_registered); + rpcrdma_push_mw(mw, &req->rl_registered); iptr = xdr_encode_rdma_segment(iptr, mw); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index d1ee33fa8055..81cd31acf690 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -776,9 +776,7 @@ rpcrdma_mr_recovery_worker(struct work_struct *work) spin_lock(&buf->rb_recovery_lock); while (!list_empty(&buf->rb_stale_mrs)) { - mw = list_first_entry(&buf->rb_stale_mrs, - struct rpcrdma_mw, mw_list); - list_del_init(&mw->mw_list); + mw = rpcrdma_pop_mw(&buf->rb_stale_mrs); spin_unlock(&buf->rb_recovery_lock); dprintk("RPC: %s: recovering MR %p\n", __func__, mw); @@ -796,7 +794,7 @@ rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw) struct rpcrdma_buffer *buf = &r_xprt->rx_buf; spin_lock(&buf->rb_recovery_lock); - list_add(&mw->mw_list, &buf->rb_stale_mrs); + rpcrdma_push_mw(mw, &buf->rb_stale_mrs); spin_unlock(&buf->rb_recovery_lock); schedule_delayed_work(&buf->rb_recovery_worker, 0); @@ -1072,11 +1070,8 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt) struct rpcrdma_mw *mw = NULL; spin_lock(&buf->rb_mwlock); - if (!list_empty(&buf->rb_mws)) { - mw = list_first_entry(&buf->rb_mws, - struct rpcrdma_mw, mw_list); - list_del_init(&mw->mw_list); - } + if (!list_empty(&buf->rb_mws)) + mw = rpcrdma_pop_mw(&buf->rb_mws); spin_unlock(&buf->rb_mwlock); if (!mw) @@ -1099,7 +1094,7 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) struct rpcrdma_buffer *buf = &r_xprt->rx_buf; spin_lock(&buf->rb_mwlock); - list_add_tail(&mw->mw_list, &buf->rb_mws); + rpcrdma_push_mw(mw, &buf->rb_mws); spin_unlock(&buf->rb_mwlock); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 852dd0a750a5..171a35116de9 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -354,6 +354,22 @@ rpcr_to_rdmar(struct rpc_rqst *rqst) return rqst->rq_xprtdata; } +static inline void +rpcrdma_push_mw(struct rpcrdma_mw *mw, struct list_head *list) +{ + list_add_tail(&mw->mw_list, list); +} + +static inline struct rpcrdma_mw * +rpcrdma_pop_mw(struct list_head *list) +{ + struct rpcrdma_mw *mw; + + mw = list_first_entry(list, struct rpcrdma_mw, mw_list); + list_del(&mw->mw_list); + return mw; +} + /* * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for * inline requests/replies, and client/server credits. From b977b644ccf821ab1269582f7efe1d0d85faa1f6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 8 Feb 2017 17:00:51 -0500 Subject: [PATCH 49/64] sunrpc: Allow xprt->ops->timer method to sleep The transport lock is needed to protect the xprt_adjust_cwnd() call in xs_udp_timer, but it is not necessary for accessing the rq_reply_bytes_recvd or tk_status fields. It is correct to sublimate the lock into UDP's xs_udp_timer method, where it is required. The ->timer method has to take the transport lock if needed, but it can now sleep safely, or even call back into the RPC scheduler. This is more a clean-up than a fix, but the "issue" was introduced by my transport switch patches back in 2005. Fixes: 46c0ee8bc4ad ("RPC: separate xprt_timer implementations") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprt.c | 2 -- net/sunrpc/xprtsock.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 9a6be030ca7d..b530a2852ba8 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -897,13 +897,11 @@ static void xprt_timer(struct rpc_task *task) return; dprintk("RPC: %5u xprt_timer\n", task->tk_pid); - spin_lock_bh(&xprt->transport_lock); if (!req->rq_reply_bytes_recvd) { if (xprt->ops->timer) xprt->ops->timer(xprt, task); } else task->tk_status = 0; - spin_unlock_bh(&xprt->transport_lock); } /** diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 810e9b59be16..18b4e7ff8879 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1739,7 +1739,9 @@ static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t */ static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task) { + spin_lock_bh(&xprt->transport_lock); xprt_adjust_cwnd(xprt, task, -ETIMEDOUT); + spin_unlock_bh(&xprt->transport_lock); } static unsigned short xs_get_random_port(void) From 251af29c320d86071664f02c76f0d063a19fefdf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 11 Feb 2017 10:37:38 -0500 Subject: [PATCH 50/64] nlm: Ensure callback code also checks that the files match It is not sufficient to just check that the lock pids match when granting a callback, we also need to ensure that we're granting the callback on the right file. Reported-by: Pankaj Singh Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/lockd/lockd.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index c15373894a42..b37dee3acaba 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -355,7 +355,8 @@ static inline int nlm_privileged_requester(const struct svc_rqst *rqstp) static inline int nlm_compare_locks(const struct file_lock *fl1, const struct file_lock *fl2) { - return fl1->fl_pid == fl2->fl_pid + return file_inode(fl1->fl_file) == file_inode(fl2->fl_file) + && fl1->fl_pid == fl2->fl_pid && fl1->fl_owner == fl2->fl_owner && fl1->fl_start == fl2->fl_start && fl1->fl_end == fl2->fl_end From 9761a2469dc287c6d75ca148f4fc483becbcad88 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sun, 19 Feb 2017 00:34:59 +0300 Subject: [PATCH 51/64] sunrpc: silence uninitialized variable warning kstrtouint() can return a couple different error codes so the check for "ret == -EINVAL" is wrong and static analysis tools correctly complain that we can use "num" without initializing it. It's not super harmful because we check the bounds. But it's also easy enough to fix. Signed-off-by: Dan Carpenter Signed-off-by: Anna Schumaker --- net/sunrpc/xprtsock.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 18b4e7ff8879..5cbabf2c75b2 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -3261,7 +3261,9 @@ static int param_set_uint_minmax(const char *val, if (!val) return -EINVAL; ret = kstrtouint(val, 0, &num); - if (ret == -EINVAL || num < min || num > max) + if (ret) + return ret; + if (num < min || num > max) return -EINVAL; *((unsigned int *)kp->arg) = num; return 0; From 0ae060ca2bdfe11181094699b0f76437ec210b17 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 19 Feb 2017 16:08:25 -0500 Subject: [PATCH 52/64] SUNRPC: Add generic helpers for xdr_stream encode/decode Add some generic helpers for encoding/decoding opaque structures and basic u32/u64. Signed-off-by: Trond Myklebust Reviewed-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xdr.h | 177 +++++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 56c48c884a24..dc7e51a769ac 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -242,6 +242,183 @@ extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); +/** + * xdr_align_size - Calculate padded size of an object + * @n: Size of an object being XDR encoded (in bytes) + * + * Return value: + * Size (in bytes) of the object including xdr padding + */ +static inline size_t +xdr_align_size(size_t n) +{ + const size_t mask = sizeof(__u32) - 1; + + return (n + mask) & ~mask; +} + +/** + * xdr_stream_encode_u32 - Encode a 32-bit integer + * @xdr: pointer to xdr_stream + * @n: integer to encode + * + * Return values: + * On success, returns length in bytes of XDR buffer consumed + * %-EMSGSIZE on XDR buffer overflow + */ +static inline ssize_t +xdr_stream_encode_u32(struct xdr_stream *xdr, __u32 n) +{ + const size_t len = sizeof(n); + __be32 *p = xdr_reserve_space(xdr, len); + + if (unlikely(!p)) + return -EMSGSIZE; + *p = cpu_to_be32(n); + return len; +} + +/** + * xdr_stream_encode_u64 - Encode a 64-bit integer + * @xdr: pointer to xdr_stream + * @n: 64-bit integer to encode + * + * Return values: + * On success, returns length in bytes of XDR buffer consumed + * %-EMSGSIZE on XDR buffer overflow + */ +static inline ssize_t +xdr_stream_encode_u64(struct xdr_stream *xdr, __u64 n) +{ + const size_t len = sizeof(n); + __be32 *p = xdr_reserve_space(xdr, len); + + if (unlikely(!p)) + return -EMSGSIZE; + xdr_encode_hyper(p, n); + return len; +} + +/** + * xdr_stream_encode_opaque_fixed - Encode fixed length opaque xdr data + * @xdr: pointer to xdr_stream + * @ptr: pointer to opaque data object + * @len: size of object pointed to by @ptr + * + * Return values: + * On success, returns length in bytes of XDR buffer consumed + * %-EMSGSIZE on XDR buffer overflow + */ +static inline ssize_t +xdr_stream_encode_opaque_fixed(struct xdr_stream *xdr, const void *ptr, size_t len) +{ + __be32 *p = xdr_reserve_space(xdr, len); + + if (unlikely(!p)) + return -EMSGSIZE; + xdr_encode_opaque_fixed(p, ptr, len); + return xdr_align_size(len); +} + +/** + * xdr_stream_encode_opaque - Encode variable length opaque xdr data + * @xdr: pointer to xdr_stream + * @ptr: pointer to opaque data object + * @len: size of object pointed to by @ptr + * + * Return values: + * On success, returns length in bytes of XDR buffer consumed + * %-EMSGSIZE on XDR buffer overflow + */ +static inline ssize_t +xdr_stream_encode_opaque(struct xdr_stream *xdr, const void *ptr, size_t len) +{ + size_t count = sizeof(__u32) + xdr_align_size(len); + __be32 *p = xdr_reserve_space(xdr, count); + + if (unlikely(!p)) + return -EMSGSIZE; + xdr_encode_opaque(p, ptr, len); + return count; +} + +/** + * xdr_stream_decode_u32 - Decode a 32-bit integer + * @xdr: pointer to xdr_stream + * @ptr: location to store integer + * + * Return values: + * %0 on success + * %-EBADMSG on XDR buffer overflow + */ +static inline ssize_t +xdr_stream_decode_u32(struct xdr_stream *xdr, __u32 *ptr) +{ + const size_t count = sizeof(*ptr); + __be32 *p = xdr_inline_decode(xdr, count); + + if (unlikely(!p)) + return -EBADMSG; + *ptr = be32_to_cpup(p); + return 0; +} + +/** + * xdr_stream_decode_opaque_fixed - Decode fixed length opaque xdr data + * @xdr: pointer to xdr_stream + * @ptr: location to store data + * @len: size of buffer pointed to by @ptr + * + * Return values: + * On success, returns size of object stored in @ptr + * %-EBADMSG on XDR buffer overflow + */ +static inline ssize_t +xdr_stream_decode_opaque_fixed(struct xdr_stream *xdr, void *ptr, size_t len) +{ + __be32 *p = xdr_inline_decode(xdr, len); + + if (unlikely(!p)) + return -EBADMSG; + xdr_decode_opaque_fixed(p, ptr, len); + return len; +} + +/** + * xdr_stream_decode_opaque_inline - Decode variable length opaque xdr data + * @xdr: pointer to xdr_stream + * @ptr: location to store pointer to opaque data + * @maxlen: maximum acceptable object size + * + * Note: the pointer stored in @ptr cannot be assumed valid after the XDR + * buffer has been destroyed, or even after calling xdr_inline_decode() + * on @xdr. It is therefore expected that the object it points to should + * be processed immediately. + * + * Return values: + * On success, returns size of object stored in *@ptr + * %-EBADMSG on XDR buffer overflow + * %-EMSGSIZE if the size of the object would exceed @maxlen + */ +static inline ssize_t +xdr_stream_decode_opaque_inline(struct xdr_stream *xdr, void **ptr, size_t maxlen) +{ + __be32 *p; + __u32 len; + + *ptr = NULL; + if (unlikely(xdr_stream_decode_u32(xdr, &len) < 0)) + return -EBADMSG; + if (len != 0) { + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + return -EBADMSG; + if (unlikely(len > maxlen)) + return -EMSGSIZE; + *ptr = p; + } + return len; +} #endif /* __KERNEL__ */ #endif /* _SUNRPC_XDR_H_ */ From ab6e9aaf16cfdfca630f9745fd6d453818df7f64 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 19 Feb 2017 16:08:26 -0500 Subject: [PATCH 53/64] NFSv4: Replace ad-hoc xdr encode/decode helpers with xdr_stream_* generics Signed-off-by: Trond Myklebust Reviewed-by: Chuck Lever Signed-off-by: Anna Schumaker --- fs/nfs/callback_xdr.c | 8 ++----- fs/nfs/flexfilelayout/flexfilelayout.c | 5 +--- fs/nfs/nfs4xdr.c | 33 +++++++------------------- 3 files changed, 12 insertions(+), 34 deletions(-) diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index eb094c6011d8..e732a65db546 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -582,12 +582,8 @@ out: static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) { - __be32 *p; - - p = xdr_reserve_space(xdr, 4 + len); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); - xdr_encode_opaque(p, str, len); + if (unlikely(xdr_stream_encode_opaque(xdr, str, len) < 0)) + return cpu_to_be32(NFS4ERR_RESOURCE); return 0; } diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 6104696325be..7b5f37f5ddf1 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1949,10 +1949,7 @@ static int ff_layout_encode_ioerr(struct xdr_stream *xdr, static void encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len) { - __be32 *p; - - p = xdr_reserve_space(xdr, len); - xdr_encode_opaque_fixed(p, buf, len); + WARN_ON_ONCE(xdr_stream_encode_opaque_fixed(xdr, buf, len) < 0); } static void diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index e9255cb453e6..5f0a6fcd0030 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -924,34 +924,22 @@ static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes) static void encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len) { - __be32 *p; - - p = xdr_reserve_space(xdr, len); - xdr_encode_opaque_fixed(p, buf, len); + WARN_ON_ONCE(xdr_stream_encode_opaque_fixed(xdr, buf, len) < 0); } static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) { - __be32 *p; - - p = reserve_space(xdr, 4 + len); - xdr_encode_opaque(p, str, len); + WARN_ON_ONCE(xdr_stream_encode_opaque(xdr, str, len) < 0); } static void encode_uint32(struct xdr_stream *xdr, u32 n) { - __be32 *p; - - p = reserve_space(xdr, 4); - *p = cpu_to_be32(n); + WARN_ON_ONCE(xdr_stream_encode_u32(xdr, n) < 0); } static void encode_uint64(struct xdr_stream *xdr, u64 n) { - __be32 *p; - - p = reserve_space(xdr, 8); - xdr_encode_hyper(p, n); + WARN_ON_ONCE(xdr_stream_encode_u64(xdr, n) < 0); } static void encode_nfs4_seqid(struct xdr_stream *xdr, @@ -4294,15 +4282,12 @@ out_overflow: static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len) { - __be32 *p; - - p = xdr_inline_decode(xdr, len); - if (likely(p)) { - memcpy(buf, p, len); - return 0; + ssize_t ret = xdr_stream_decode_opaque_fixed(xdr, buf, len); + if (unlikely(ret < 0)) { + print_overflow_msg(__func__, xdr); + return -EIO; } - print_overflow_msg(__func__, xdr); - return -EIO; + return 0; } static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) From 6da59ce2fd2047fd9cb141479f20d5f84614e84f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 19 Feb 2017 16:08:27 -0500 Subject: [PATCH 54/64] NFSv4: Replace the open coded decode_opaque_inline() with the new generic Also ensure that we always check that the size of the decoded object matches the expectation that it must be smaller than NFS4_OPAQUE_LIMIT. This should be true for all the current users of decode_opaque_inline(), including decode_ace(), decode_pathname(), decode_attr_fs_locations() and decode_exchange_id(). Note that this allows us to get rid of a number of existing checks in decode_exchange_id(), Signed-off-by: Trond Myklebust Reviewed-by: Chuck Lever Signed-off-by: Anna Schumaker --- fs/nfs/nfs4xdr.c | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 5f0a6fcd0030..d10cc2a62dff 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3050,20 +3050,15 @@ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) { - __be32 *p; - - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - *len = be32_to_cpup(p); - p = xdr_inline_decode(xdr, *len); - if (unlikely(!p)) - goto out_overflow; - *string = (char *)p; + ssize_t ret = xdr_stream_decode_opaque_inline(xdr, (void **)string, + NFS4_OPAQUE_LIMIT); + if (unlikely(ret < 0)) { + if (ret == -EBADMSG) + print_overflow_msg(__func__, xdr); + return -EIO; + } + *len = ret; return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) @@ -5645,8 +5640,6 @@ static int decode_exchange_id(struct xdr_stream *xdr, status = decode_opaque_inline(xdr, &dummy, &dummy_str); if (unlikely(status)) return status; - if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) - return -EIO; memcpy(res->server_owner->major_id, dummy_str, dummy); res->server_owner->major_id_sz = dummy; @@ -5654,8 +5647,6 @@ static int decode_exchange_id(struct xdr_stream *xdr, status = decode_opaque_inline(xdr, &dummy, &dummy_str); if (unlikely(status)) return status; - if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) - return -EIO; memcpy(res->server_scope->server_scope, dummy_str, dummy); res->server_scope->server_scope_sz = dummy; @@ -5670,16 +5661,12 @@ static int decode_exchange_id(struct xdr_stream *xdr, status = decode_opaque_inline(xdr, &dummy, &dummy_str); if (unlikely(status)) return status; - if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) - return -EIO; memcpy(res->impl_id->domain, dummy_str, dummy); /* nii_name */ status = decode_opaque_inline(xdr, &dummy, &dummy_str); if (unlikely(status)) return status; - if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) - return -EIO; memcpy(res->impl_id->name, dummy_str, dummy); /* nii_date */ From c065eeea3ba03232cb8aaf9d3dc7ad0a9fcb267b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 19 Feb 2017 16:08:28 -0500 Subject: [PATCH 55/64] NFSv4: Replace callback string decode function with a generic Signed-off-by: Trond Myklebust Reviewed-by: Chuck Lever Signed-off-by: Anna Schumaker --- fs/nfs/callback_xdr.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index e732a65db546..2ade5cb52b8e 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -83,23 +83,15 @@ static __be32 *read_buf(struct xdr_stream *xdr, size_t nbytes) return p; } -static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len, const char **str) +static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len, + const char **str, size_t maxlen) { - __be32 *p; - - p = read_buf(xdr, 4); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); - *len = ntohl(*p); - - if (*len != 0) { - p = read_buf(xdr, *len); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); - *str = (const char *)p; - } else - *str = NULL; + ssize_t err; + err = xdr_stream_decode_opaque_inline(xdr, (void **)str, maxlen); + if (err < 0) + return cpu_to_be32(NFS4ERR_RESOURCE); + *len = err; return 0; } @@ -162,15 +154,9 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound __be32 *p; __be32 status; - status = decode_string(xdr, &hdr->taglen, &hdr->tag); + status = decode_string(xdr, &hdr->taglen, &hdr->tag, CB_OP_TAGLEN_MAXSZ); if (unlikely(status != 0)) return status; - /* We do not like overly long tags! */ - if (hdr->taglen > CB_OP_TAGLEN_MAXSZ) { - printk("NFS: NFSv4 CALLBACK %s: client sent tag of length %u\n", - __func__, hdr->taglen); - return htonl(NFS4ERR_RESOURCE); - } p = read_buf(xdr, 12); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); From 5a1f6d9e9b803003271b40b67786ff46fa4eda01 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 19 Feb 2017 16:08:29 -0500 Subject: [PATCH 56/64] NFSv4: Fix the underestimation of delegation XDR space reservation Account for the "space_limit" field in struct open_write_delegation4. Fixes: 2cebf82883f4 ("NFSv4: Fix the underestimate of NFSv4 open request size") Signed-off-by: Trond Myklebust Reviewed-by: Chuck Lever Signed-off-by: Anna Schumaker --- fs/nfs/nfs4xdr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index d10cc2a62dff..26808fbaacb0 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -169,8 +169,10 @@ static int nfs4_stat_to_errno(int); open_owner_id_maxsz + \ encode_opentype_maxsz + \ encode_claim_null_maxsz) +#define decode_space_limit_maxsz (3) #define decode_ace_maxsz (3 + nfs4_owner_maxsz) #define decode_delegation_maxsz (1 + decode_stateid_maxsz + 1 + \ + decode_space_limit_maxsz + \ decode_ace_maxsz) #define decode_change_info_maxsz (5) #define decode_open_maxsz (op_decode_hdr_maxsz + \ From 1bbe60ff49becd7554e9b32b5e8e6fcee4b33db2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 19 Feb 2017 16:08:30 -0500 Subject: [PATCH 57/64] NFSv4: Remove bogus "struct nfs_client" argument from decode_ace() We shouldn't need to force callers to carry an unused argument. Signed-off-by: Trond Myklebust Reviewed-by: Chuck Lever Signed-off-by: Anna Schumaker --- fs/nfs/nfs4xdr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 26808fbaacb0..ec0d76712e43 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3127,7 +3127,7 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) } /* Dummy routine */ -static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp) +static int decode_ace(struct xdr_stream *xdr, void *ace) { __be32 *p; unsigned int strlen; @@ -5075,7 +5075,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr, if (decode_space_limit(xdr, &res->pagemod_limit) < 0) return -EIO; } - return decode_ace(xdr, NULL, res->server->nfs_client); + return decode_ace(xdr, NULL); out_overflow: print_overflow_msg(__func__, xdr); return -EIO; From 5c741d4f2215371ae17e3dbdf3d46da850662e13 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 19 Feb 2017 16:08:31 -0500 Subject: [PATCH 58/64] SUNRPC: Add a helper function xdr_stream_decode_string_dup() Create a helper function that decodes a xdr string object, allocates a memory buffer and then store it as a NUL terminated string. Signed-off-by: Trond Myklebust Reviewed-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xdr.h | 2 ++ net/sunrpc/xdr.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index dc7e51a769ac..054c8cde18f3 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -242,6 +242,8 @@ extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); +ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, + size_t maxlen, gfp_t gfp_flags); /** * xdr_align_size - Calculate padded size of an object * @n: Size of an object being XDR encoded (in bytes) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 7f1071e103ca..1f7082144e01 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1518,3 +1518,37 @@ out: } EXPORT_SYMBOL_GPL(xdr_process_buf); +/** + * xdr_stream_decode_string_dup - Decode and duplicate variable length string + * @xdr: pointer to xdr_stream + * @str: location to store pointer to string + * @maxlen: maximum acceptable string length + * @gfp_flags: GFP mask to use + * + * Return values: + * On success, returns length of NUL-terminated string stored in *@ptr + * %-EBADMSG on XDR buffer overflow + * %-EMSGSIZE if the size of the string would exceed @maxlen + * %-ENOMEM on memory allocation failure + */ +ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, + size_t maxlen, gfp_t gfp_flags) +{ + void *p; + ssize_t ret; + + ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen); + if (ret > 0) { + char *s = kmalloc(ret + 1, gfp_flags); + if (s != NULL) { + memcpy(s, p, ret); + s[ret] = '\0'; + *str = s; + return strlen(s); + } + ret = -ENOMEM; + } + *str = NULL; + return ret; +} +EXPORT_SYMBOL_GPL(xdr_stream_decode_string_dup); From 686a816ab6bedb99a892f3171f6c9ccecabff180 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 19 Feb 2017 16:08:32 -0500 Subject: [PATCH 59/64] NFSv4: Clean up owner/group attribute decode Signed-off-by: Trond Myklebust Reviewed-by: Chuck Lever Signed-off-by: Anna Schumaker --- fs/nfs/nfs4xdr.c | 117 +++++++++++++++++++++++------------------------ 1 file changed, 57 insertions(+), 60 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index ec0d76712e43..3268a2393512 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3875,45 +3875,50 @@ out_overflow: return -EIO; } +static ssize_t decode_nfs4_string(struct xdr_stream *xdr, + struct nfs4_string *name, gfp_t gfp_flags) +{ + ssize_t ret; + + ret = xdr_stream_decode_string_dup(xdr, &name->data, + XDR_MAX_NETOBJ, gfp_flags); + name->len = 0; + if (ret > 0) + name->len = ret; + return ret; +} + static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, const struct nfs_server *server, kuid_t *uid, struct nfs4_string *owner_name) { - uint32_t len; - __be32 *p; - int ret = 0; + ssize_t len; + char *p; *uid = make_kuid(&init_user_ns, -2); if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) return -EIO; - if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) { - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - len = be32_to_cpup(p); - p = xdr_inline_decode(xdr, len); - if (unlikely(!p)) - goto out_overflow; - if (owner_name != NULL) { - owner_name->data = kmemdup(p, len, GFP_NOWAIT); - if (owner_name->data != NULL) { - owner_name->len = len; - ret = NFS_ATTR_FATTR_OWNER_NAME; - } - } else if (len < XDR_MAX_NETOBJ) { - if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0) - ret = NFS_ATTR_FATTR_OWNER; - else - dprintk("%s: nfs_map_name_to_uid failed!\n", - __func__); - } else - dprintk("%s: name too long (%u)!\n", - __func__, len); - bitmap[1] &= ~FATTR4_WORD1_OWNER; + if (!(bitmap[1] & FATTR4_WORD1_OWNER)) + return 0; + bitmap[1] &= ~FATTR4_WORD1_OWNER; + + if (owner_name != NULL) { + len = decode_nfs4_string(xdr, owner_name, GFP_NOWAIT); + if (len <= 0) + goto out; + dprintk("%s: name=%s\n", __func__, owner_name->data); + return NFS_ATTR_FATTR_OWNER_NAME; + } else { + len = xdr_stream_decode_opaque_inline(xdr, (void **)&p, + XDR_MAX_NETOBJ); + if (len <= 0 || nfs_map_name_to_uid(server, p, len, uid) != 0) + goto out; + dprintk("%s: uid=%d\n", __func__, (int)from_kuid(&init_user_ns, *uid)); + return NFS_ATTR_FATTR_OWNER; } - dprintk("%s: uid=%d\n", __func__, (int)from_kuid(&init_user_ns, *uid)); - return ret; -out_overflow: +out: + if (len != -EBADMSG) + return 0; print_overflow_msg(__func__, xdr); return -EIO; } @@ -3922,41 +3927,33 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, const struct nfs_server *server, kgid_t *gid, struct nfs4_string *group_name) { - uint32_t len; - __be32 *p; - int ret = 0; + ssize_t len; + char *p; *gid = make_kgid(&init_user_ns, -2); if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) return -EIO; - if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) { - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - len = be32_to_cpup(p); - p = xdr_inline_decode(xdr, len); - if (unlikely(!p)) - goto out_overflow; - if (group_name != NULL) { - group_name->data = kmemdup(p, len, GFP_NOWAIT); - if (group_name->data != NULL) { - group_name->len = len; - ret = NFS_ATTR_FATTR_GROUP_NAME; - } - } else if (len < XDR_MAX_NETOBJ) { - if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0) - ret = NFS_ATTR_FATTR_GROUP; - else - dprintk("%s: nfs_map_group_to_gid failed!\n", - __func__); - } else - dprintk("%s: name too long (%u)!\n", - __func__, len); - bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; + if (!(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) + return 0; + bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; + + if (group_name != NULL) { + len = decode_nfs4_string(xdr, group_name, GFP_NOWAIT); + if (len <= 0) + goto out; + dprintk("%s: name=%s\n", __func__, group_name->data); + return NFS_ATTR_FATTR_OWNER_NAME; + } else { + len = xdr_stream_decode_opaque_inline(xdr, (void **)&p, + XDR_MAX_NETOBJ); + if (len <= 0 || nfs_map_group_to_gid(server, p, len, gid) != 0) + goto out; + dprintk("%s: gid=%d\n", __func__, (int)from_kgid(&init_user_ns, *gid)); + return NFS_ATTR_FATTR_GROUP; } - dprintk("%s: gid=%d\n", __func__, (int)from_kgid(&init_user_ns, *gid)); - return ret; -out_overflow: +out: + if (len != -EBADMSG) + return 0; print_overflow_msg(__func__, xdr); return -EIO; } From df3ab232e462bce20710596d697ade6b72497694 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Feb 2017 19:49:09 -0500 Subject: [PATCH 60/64] pNFS/flexfiles: If the layout is invalid, it must be updated before retrying If we see that our pNFS READ/WRITE/COMMIT operation failed, but we also see that our layout segment is no longer valid, then we need to get a new layout segment before retrying. Fixes: 90816d1ddacf ("NFSv4.1/flexfiles: Don't mark the entire deviceid...") Cc: stable@vger.kernel.org # v4.2+ Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/flexfilelayout/flexfilelayout.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 7b5f37f5ddf1..78f9a3081127 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1053,9 +1053,6 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, struct nfs_client *mds_client = mds_server->nfs_client; struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table; - if (task->tk_status >= 0) - return 0; - switch (task->tk_status) { /* MDS state errors */ case -NFS4ERR_DELEG_REVOKED: @@ -1157,9 +1154,6 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, { struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); - if (task->tk_status >= 0) - return 0; - switch (task->tk_status) { /* File access problems. Don't mark the device as unavailable */ case -EACCES: @@ -1195,6 +1189,13 @@ static int ff_layout_async_handle_error(struct rpc_task *task, { int vers = clp->cl_nfs_mod->rpc_vers->number; + if (task->tk_status >= 0) + return 0; + + /* Handle the case of an invalid layout segment */ + if (!pnfs_is_valid_lseg(lseg)) + return -NFS4ERR_RESET_TO_PNFS; + switch (vers) { case 3: return ff_layout_async_handle_error_v3(task, lseg, idx); From 9d8cacbf5636657d2cd0dda17438a56d806d3224 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Feb 2017 18:42:32 -0500 Subject: [PATCH 61/64] NFSv4: Fix reboot recovery in copy offload Copy offload code needs to be hooked into the code for handling NFS4ERR_BAD_STATEID by ensuring that we set the "stateid" field in struct nfs4_exception. Reported-by: Olga Kornievskaia Fixes: 2e72448b07dc3 ("NFS: Add COPY nfs operation") Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org # v4.7+ Signed-off-by: Anna Schumaker --- fs/nfs/nfs42proc.c | 63 +++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 98cf58341066..1e486c73ec94 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -129,30 +129,26 @@ out_unlock: return err; } -static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src, +static ssize_t _nfs42_proc_copy(struct file *src, struct nfs_lock_context *src_lock, - struct file *dst, loff_t pos_dst, + struct file *dst, struct nfs_lock_context *dst_lock, - size_t count) + struct nfs42_copy_args *args, + struct nfs42_copy_res *res) { - struct nfs42_copy_args args = { - .src_fh = NFS_FH(file_inode(src)), - .src_pos = pos_src, - .dst_fh = NFS_FH(file_inode(dst)), - .dst_pos = pos_dst, - .count = count, - }; - struct nfs42_copy_res res; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY], - .rpc_argp = &args, - .rpc_resp = &res, + .rpc_argp = args, + .rpc_resp = res, }; struct inode *dst_inode = file_inode(dst); struct nfs_server *server = NFS_SERVER(dst_inode); + loff_t pos_src = args->src_pos; + loff_t pos_dst = args->dst_pos; + size_t count = args->count; int status; - status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context, + status = nfs4_set_rw_stateid(&args->src_stateid, src_lock->open_context, src_lock, FMODE_READ); if (status) return status; @@ -162,7 +158,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src, if (status) return status; - status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context, + status = nfs4_set_rw_stateid(&args->dst_stateid, dst_lock->open_context, dst_lock, FMODE_WRITE); if (status) return status; @@ -172,22 +168,22 @@ static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src, return status; status = nfs4_call_sync(server->client, server, &msg, - &args.seq_args, &res.seq_res, 0); + &args->seq_args, &res->seq_res, 0); if (status == -ENOTSUPP) server->caps &= ~NFS_CAP_COPY; if (status) return status; - if (res.write_res.verifier.committed != NFS_FILE_SYNC) { - status = nfs_commit_file(dst, &res.write_res.verifier.verifier); + if (res->write_res.verifier.committed != NFS_FILE_SYNC) { + status = nfs_commit_file(dst, &res->write_res.verifier.verifier); if (status) return status; } truncate_pagecache_range(dst_inode, pos_dst, - pos_dst + res.write_res.count); + pos_dst + res->write_res.count); - return res.write_res.count; + return res->write_res.count; } ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, @@ -197,8 +193,22 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, struct nfs_server *server = NFS_SERVER(file_inode(dst)); struct nfs_lock_context *src_lock; struct nfs_lock_context *dst_lock; - struct nfs4_exception src_exception = { }; - struct nfs4_exception dst_exception = { }; + struct nfs42_copy_args args = { + .src_fh = NFS_FH(file_inode(src)), + .src_pos = pos_src, + .dst_fh = NFS_FH(file_inode(dst)), + .dst_pos = pos_dst, + .count = count, + }; + struct nfs42_copy_res res; + struct nfs4_exception src_exception = { + .inode = file_inode(src), + .stateid = &args.src_stateid, + }; + struct nfs4_exception dst_exception = { + .inode = file_inode(dst), + .stateid = &args.dst_stateid, + }; ssize_t err, err2; if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY)) @@ -208,7 +218,6 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, if (IS_ERR(src_lock)) return PTR_ERR(src_lock); - src_exception.inode = file_inode(src); src_exception.state = src_lock->open_context->state; dst_lock = nfs_get_lock_context(nfs_file_open_context(dst)); @@ -217,15 +226,17 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, goto out_put_src_lock; } - dst_exception.inode = file_inode(dst); dst_exception.state = dst_lock->open_context->state; do { inode_lock(file_inode(dst)); - err = _nfs42_proc_copy(src, pos_src, src_lock, - dst, pos_dst, dst_lock, count); + err = _nfs42_proc_copy(src, src_lock, + dst, dst_lock, + &args, &res); inode_unlock(file_inode(dst)); + if (err >= 0) + break; if (err == -ENOTSUPP) { err = -EOPNOTSUPP; break; From a5e14c9376871ee74fe93fdcb380c5d54eaa2d43 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Feb 2017 19:50:18 -0500 Subject: [PATCH 62/64] Revert "NFSv4.1: Handle NFS4ERR_BADSESSION/NFS4ERR_DEADSESSION replies to OP_SEQUENCE" This reverts commit 2cf10cdd486c362f983abdce00dc1127e8ab8c59. The patch has been seen to cause excessive looping. Reported-by: Olga Kornievskaia Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org # 4.10+ Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 45271e5994f0..92b2805ffb82 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -768,10 +768,6 @@ static int nfs41_sequence_process(struct rpc_task *task, case -NFS4ERR_SEQ_FALSE_RETRY: ++slot->seq_nr; goto retry_nowait; - case -NFS4ERR_DEADSESSION: - case -NFS4ERR_BADSESSION: - nfs4_schedule_session_recovery(session, res->sr_status); - goto retry_nowait; default: /* Just update the slot sequence no. */ slot->seq_done = 1; From 6682c14bbe505a8b912c57faf544f866777ee48d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 23 Feb 2017 14:53:39 -0500 Subject: [PATCH 63/64] NFSv4: fix getacl head length estimation Bitmap and attrlen follow immediately after the op reply header. This was an oversight from commit bf118a342f. Consequences of this are just minor efficiency (extra calls to xdr_shrink_bufhead). Fixes: bf118a342f10 "NFSv4: include bitmap in nfsv4 get acl data" Reviewed-by: Kinglong Mee Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields Signed-off-by: Anna Schumaker --- fs/nfs/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 3268a2393512..f0369e362753 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2514,7 +2514,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); - replen = hdr.replen + op_decode_hdr_maxsz + 1; + replen = hdr.replen + op_decode_hdr_maxsz; encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr); xdr_inline_pages(&req->rq_rcv_buf, replen << 2, From ed92d8c137b7794c2c2aa14479298b9885967607 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 23 Feb 2017 14:54:21 -0500 Subject: [PATCH 64/64] NFSv4: fix getacl ERANGE for some ACL buffer sizes We're not taking into account that the space needed for the (variable length) attr bitmap, with the result that we'd sometimes get a spurious ERANGE when the ACL data got close to the end of a page. Just add in an extra page to make sure. Signed-off-by: Weston Andros Adamson Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 92b2805ffb82..1b183686c6d4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4954,7 +4954,7 @@ out: */ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) { - struct page *pages[NFS4ACL_MAXPAGES] = {NULL, }; + struct page *pages[NFS4ACL_MAXPAGES + 1] = {NULL, }; struct nfs_getaclargs args = { .fh = NFS_FH(inode), .acl_pages = pages, @@ -4968,13 +4968,9 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu .rpc_argp = &args, .rpc_resp = &res, }; - unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE); + unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE) + 1; int ret = -ENOMEM, i; - /* As long as we're doing a round trip to the server anyway, - * let's be prepared for a page of acl data. */ - if (npages == 0) - npages = 1; if (npages > ARRAY_SIZE(pages)) return -ERANGE;